////////////////////////////////////////////////////////////////////////////////////////
//
// ****************************
// *** MC-GPU , version 1.5 ***
// ****************************
//
/**
* \mainpage MC-GPU v1.5_DBT
*
* \code
*
* Andreu Badal, PhD (Andreu.Badal-Soler{at}fda.hhs.gov)
*
* Division of Imaging and Applied Mathematics
* Office of Science and Engineering Laboratories
* Center for Devices and Radiological Health
* U.S. Food and Drug Administration
*
* Code release date: 2012/12/12
*
*
*
* \endcode
*
*
*
* \b MC-GPU [1-4] is a Monte Carlo simulation code that can generate synthetic radiographic
* images and computed tomography (CT) scans of realistic models of the human anatomy using the
* computational power of commodity Graphics Processing Unit (GPU) cards.
* The code implements a massively multi-threaded Monte Carlo simulation algorithm
* for the transport of x rays in a voxelized geometry. The x ray interaction models and material
* properties have been adapted from \b PENELOPE \b 2006 [5].
*
*
* \b MC-GPU was developed using the \b CUDA programming model from \b NVIDIA [6] to achieve
* maximum performance on NVIDIA GPUs. The code can also be compiled with a standard C compiler
 * to be executed on a regular CPU.
* In a typical medical imaging simulation, the use of GPU computing with MC-GPU has been shown
 * to provide a speed-up of between 20 and 40 times compared to the execution on a single CPU core.
*
* The MC-GPU code has been described in different scientific publications [1-4].
* The main reference of this work, which the users should cite, is the following [1]:
* \code
* Andreu Badal and Aldo Badano, "Accelerating Monte Carlo simulations of
* photon transport in a voxelized geometry using a massively parallel
* Graphics Processing Unit", Medical Physics 36, pp. 4878–4880 (2009)
* \endcode
* The main developer of MC-GPU is \b Andreu \b Badal, working at the U.S. \b Food \b and
* \b Drug \b Administration (Center for Devices and Radiological Health, Office of Science
* and Engineering Laboratories, Division of Imaging and Applied Mathematics).
* The source code of MC-GPU is free and open software in the public domain, as explained
* in the Disclaimer section below.
* The source code of the original MC-GPU and its auxiliary files were distributed at the website: http://code.google.com/.
* New versions will be available at https://github.com/DIDSR/.
*
*
* This documentation has been automatically generated by \b Doxygen parsing the comments in
* the MC-GPU source code.
 * This code is still in development; please report to the author any issue or bug
 * that you may encounter. Feel free to suggest improvements to the code too!
*
*
*
* \section sec_changes List of modifications in different versions of the code
*
 * \subsection sec_changes_v15b Version 1.5b (2017/06/28)
*
* - Added electronic noise and Swank factor for detected charges output
* - Anti-scatter grid based on Day and Dance, Phys Med Biol 28, pp. 1429-1433 (1983)
* - Binary tree-based geometry model to reduce memory use: voxel dose tally can't be used with binary tree!
*
 * \subsection sec_changes_v14 Version 1.4 (2016/02/02)
*
* - Fixed density for each material: disabling the option to use a different density for each voxel (to reduce memory).
* - Enabled input of voxelized geometry file in binary format (single byte per voxel).
* - Selenium detector model with thickness, attenuation, fluorescence tracking.
* - User-defined rotation axis for the tomography scan (using Rodrigues rotation formula).
* - Enabled the simulation of tomosynthesis scans: half cone source, emission angle offset, etc.
* - Enable translation of the voxelized geometry and of the image sensor within the detector plane.
*
* \subsection sec_changes_v13 Version 1.3 (release date: 2012/12/12)
*
* - Code upgraded to CUDA 5.0 (not compatible with previous versions of CUDA!).
 * - Removed the limit on the number of projection images that can be simulated per CT scan (source and
 * detector parameters are now stored in global memory and transferred to shared memory at run time
 * to avoid using the limited constant memory).
* - New material dose tally implemented to estimate the dose deposited in each material independently
* of the voxel dose tally (the voxel dose tally measures the dose in each material adding the energy
* deposited in each voxel of that material within the defined voxelized region-of-interest).
* - Interaction loop re-organized to maximize performance (virtual interactions simulated before real ones).
* - Improvements and small corrections in the source sampling and tally routines.
* - Allow input of material and voxel geometry files compressed with gzip (zlib library now required for compilation).
*
* \subsection sec_changes_v12 Version 1.2 (release date: 2011/10/25)
*
* - Implemented a voxel dose tally.
* - Polyenergetic source model.
* - MPI support for simulating individual projections.
* - Simulation by time limit.
* - Improved flexibility of the CT trajectories, helical scans.
*
*
*
* \section sec_disc Disclaimer
*
* This software and documentation (the "Software") were developed at the Food and
* Drug Administration (FDA) by employees of the Federal Government in the course
* of their official duties. Pursuant to Title 17, Section 105 of the United States
* Code, this work is not subject to copyright protection and is in the public
* domain. Permission is hereby granted, free of charge, to any person obtaining a
* copy of the Software, to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish, distribute,
* sublicense, or sell copies of the Software or derivatives, and to permit persons
* to whom the Software is furnished to do so. FDA assumes no responsibility
* whatsoever for use by other parties of the Software, its source code,
* documentation or compiled executables, and makes no guarantees, expressed or
* implied, about its quality, reliability, or any other characteristic. Further,
* use of this code in no way implies endorsement by the FDA or confers any
* advantage in regulatory decisions. Although this software can be redistributed
* and/or modified freely, we ask that any derivative works bear some notice that
* they are derived from it, and any modified versions bear some notice that they
* have been modified.
*
*
*
* \section sec_Intro Code features
*
 * In this section we provide a brief description of the features of the MC-GPU code. A
 * more complete description of the code can be found in our published articles.
 * Important information regarding the operation of the code is provided as comments in the
 * input files of the sample simulations provided with the MC-GPU package.
 * Detailed information on each function of the code can be found in the complete Doxygen
 * documentation of the source code.
*
 * The basic operation of the code consists of adapting the simulation input file
* to describe the location and characteristics of the x ray source, define the CT trajectory
* (if any), list the materials to be used in the simulation, define the geometry of
* the x ray detector and, finally, specify the voxelized object file to be
* used as the simulation material universe.
* In the first line of the input file, the user can fix the total number of x rays that have
* to be simulated (> 1e5 histories) or the total simulation time (maximum 1e5 seconds).
*
*
 * The coordinate system of the simulated world is determined by the input voxelized geometry.
 * The origin of coordinates is assumed to be located at the lower-back corner of the voxelized
 * volume, and the coordinate axes are aligned with the edges of the voxelized volume.
 * This means that the lower-back corner of the first voxel is at the origin and the
 * following voxels are located along the positive X, Y and Z axes (first octant).
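 *
 * As an illustration of this convention, the following minimal sketch maps a position
 * (in cm) to the linear index of the voxel that contains it; the function name and the
 * use of the CUDA float3/int3 types are placeholders for this example only:
 * \code
 *    //  Convert a position inside the bounding box into a linear voxel index
 *    //  (the X index increments first, then Y, then Z):
 *    long long int voxel_index(float3 pos, float3 inv_voxel_size, int3 num_voxels)
 *    {
 *      int ix = (int)(pos.x * inv_voxel_size.x);   // Origin at the lower-back corner
 *      int iy = (int)(pos.y * inv_voxel_size.y);
 *      int iz = (int)(pos.z * inv_voxel_size.z);
 *      return ix + (long long int)num_voxels.x * (iy + (long long int)num_voxels.y * iz);
 *    }
 * \endcode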
*
*
* To simulate the atomic interactions, MC-GPU uses a database of material properties based on the
* database from PENELOPE. A PENELOPE 2006 material file can be converted into an MC-GPU material
* file using the auxiliary utility "MC-GPU_create_material_data.f" provided with the MC-GPU
* package. Pre-defined material files for a set of materials typically used in medical imaging
* simulations are already provided in the folder "MC-GPU_material_files".
*
*
* The code includes two tally options: an \b image \b tally that creates projection x-ray images,
* and a radiation \b dose \b tally that estimates the dose deposited inside the patient model.
 * MC-GPU does not currently simulate the transport of electrons, and therefore the dose
 * deposition tally (rigorously speaking, a KERMA tally) will not be accurate at high energies,
 * near material interfaces, or for small voxels.
* In the image tally the images are formed by counting the energy that enters a user-defined 2D
* grid of pixels, which is a simple approximation to a noise-free flat-panel detector with
* 100% detection efficiency. The pixel values have units of eV/cm^2.
* Four different images are reported at the end of the simulation, corresponding
* to the signal produced by x rays that did not interact between the source and the detector
* (non-scattered), x rays that suffered a single Compton (inelastic) interaction, a single
* Rayleigh (elastic) interaction, and multi-scattered x rays.
* The dose tally counts the energy deposited by each x ray track inside each voxel of the
* geometry, within a user-defined volumetric region-of-interest (ROI). The average dose deposited
* inside each voxel and in each material (and the associated statistical uncertainties) are reported
* at the end of the simulation.
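 *
 * For reference, the tallied energy (reported in eV) can be converted into absorbed dose
 * (in Gy) by dividing by the voxel or material mass; a minimal sketch with hypothetical
 * variable names:
 * \code
 *    //  Dose [Gy = J/kg] from the tallied deposited energy [eV] and the mass [g]:
 *    double dose_Gy(double Edep_eV, double mass_g)
 *    {
 *      return Edep_eV * 1.602176e-19 / (mass_g * 1.0e-3);   // 1 eV = 1.602176e-19 J; 1 g = 1e-3 kg
 *    }
 * \endcode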
*
*
* MC-GPU can simulate a single projection image or a full CT scan.
 * The CT is simulated by generating many projection images around the static
 * voxelized geometry. Currently, the code is limited to performing a simple
* CT trajectory rotating around the Z axis. The user can specify the angular shift and longitudinal
* translation (pitch) of the source between each projection and also the distance between the
* source and the axis of rotation (the axis is assumed to be parallel to the Z axis).
 * At present, the code does not simulate some relevant components of a CT scanner such as the
* anti-scatter grid, a bow-tie filter or a curved detector (flat-panel detector only).
*
*
* The x ray source is defined as a point source emitting x rays with an energy randomly sampled
* from the user-provided energy spectrum. The polyenergetic spectrum is efficiently sampled
* using the Walker aliasing algorithm. The emitted cone beam is computationally
* collimated to produce a rectangular field on the detector plane, within the azimuthal and
* polar angles specified by the user.
* The detector plane is automatically located at the specified distance right in front of the
* source focal spot, with the collimated cone beam pointing towards the geometric center of the detector.
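 *
 * As a minimal sketch of the constant-time Walker aliasing method (the actual cutoff and
 * alias tables are built during initialization from the input spectrum; the array names
 * used here are hypothetical):
 * \code
 *    //  Sample an energy bin using the precomputed alias tables.
 *    //  'rand01' is a uniform random value in [0,1).
 *    int sample_energy_bin(const float *cutoff, const int *alias, int num_bins, float rand01)
 *    {
 *      float u = rand01 * num_bins;     // Uniform value in [0, num_bins)
 *      int   i = (int)u;                // Candidate bin
 *      return ((u - (float)i) < cutoff[i]) ? i : alias[i];   // Keep the bin or jump to its alias
 *    }
 * \endcode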
*
*
* In order to optimize the particle tracking algorithm (ray-tracing) and minimize
* the accesses to the slow GPU main memory, the photon trajectories across the voxels
* are computed using the Woodcock tracking algorithm.
* With this technique the photons perceive the geometry as a uniform medium
* composed of the material of the most attenuating voxel.
* In this way, the voxel boundaries do not have to be explicitly calculated and
* multiple voxels can be crossed in a single step.
* To keep the simulation unbiased, some of the interactions are considered
* "virtual" (i.e., do not change the photon energy or direction of movement),
* depending on the x ray energy and the actual material at the interaction site.
* In typical medical imaging simulations where the most attenuating material is cortical bone,
 * the Woodcock tracking algorithm gives a speed-up of almost one order of magnitude compared
 * to computing the voxel boundaries at every step. However, if the geometry includes a high
* density voxel, such as a metallic implant, the performance of the code can be severely
* reduced because a large fraction of the sampled interactions will be virtual.
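 *
 * A minimal sketch of the Woodcock (delta) tracking loop; 'rand01()' and 'imfp_at()' are
 * hypothetical helpers returning a uniform random value in (0,1] and the local inverse mean
 * free path at a position, and 'imfp_max' is the majorant inverse MFP of the most
 * attenuating voxel:
 * \code
 *    //  Fly the photon to its next real interaction site, ignoring voxel boundaries:
 *    float3 next_real_interaction(float3 pos, float3 dir, float imfp_max)
 *    {
 *      do
 *      {
 *        float s = -logf(rand01()) / imfp_max;      // Free flight sampled with the majorant
 *        pos.x += s*dir.x;  pos.y += s*dir.y;  pos.z += s*dir.z;
 *      }
 *      while (rand01()*imfp_max >= imfp_at(pos));   // Virtual interaction: keep flying unchanged
 *      return pos;                                  // Real interaction, accepted with probability local/majorant
 *    }
 * \endcode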
*
*
* The random number generator used in PENELOPE [5], RANECU, is also used in the GPU
* program. To ensure that the simulated tracks are not correlated, each thread initializes
* the generator to a unique position in the random sequence, far enough from the
* other threads, using the algorithm implemented in the seedsMLCG code [7].
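 *
 * For reference, a minimal C sketch of the RANECU algorithm (L'Ecuyer's combined
 * multiplicative linear congruential generator), with the two integer seeds kept
 * per thread:
 * \code
 *    float ranecu(int *seed1, int *seed2)
 *    {
 *      int i1 = *seed1/53668;
 *      *seed1 = 40014*(*seed1 - i1*53668) - i1*12211;    // First MLCG, modulus 2147483563
 *      if (*seed1 < 0) *seed1 += 2147483563;
 *      int i2 = *seed2/52774;
 *      *seed2 = 40692*(*seed2 - i2*52774) - i2*3791;     // Second MLCG, modulus 2147483399
 *      if (*seed2 < 0) *seed2 += 2147483399;
 *      int iz = *seed1 - *seed2;                         // Combine the two generators
 *      if (iz < 1) iz += 2147483562;
 *      return iz*4.65661305739e-10f;                     // Scale by 1/2147483563: uniform in (0,1)
 *    }
 * \endcode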
*
*
* In a typical simulation, several thousand threads are launched simultaneously in
* the GPU, each one of them simulating a batch of several x ray tracks.
* If the code is compiled with MPI support (see below), multiple GPUs can be used in parallel.
* The code will perform a short speed test to estimate the relative speed of each GPU used
* in the simulation and then distribute the number of particles among the available GPUs correspondingly.
 * If the user specifies a time limit for the simulation, all the GPUs will simulate in parallel
 * for the allowed time. Since the code is already optimized to scale well across
* thousands of GPU threads, it scales almost linearly with the number of GPUs in most
* situations, with only a few seconds of overhead in the initialization of the multiple GPUs
* and in the reduction of the final results.
*
*
*
*
* \section sec_output Code output
*
* At the end of the simulation the code reports the tallied 3D dose distribution and the
 * final simulated images in RAW binary form, as 32-bit float values. The image data is provided
* as a collection of five consecutive images corresponding to: total image (scatter+primaries),
* primary particles, Compton, Rayleigh and multi-scatter.
 * The dose data is reported as two RAW files containing, respectively, the mean dose and twice
 * the standard deviation of the dose in each voxel of the geometry within the input ROI.
* The average dose deposited in each material of the geometry is also reported to the standard output.
* Organ doses can be obtained by post-processing the output dose file, knowing which voxel
* corresponds to each organ.
* The pixel and voxel dose data values are stored with the X coordinate incrementing first, the Y
* coordinate incrementing second, and the Z coordinate incrementing last.
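 *
 * A minimal sketch of how the RAW image file could be read back (the file name and pixel
 * counts below are hypothetical; the actual values are listed in the ASCII header):
 * \code
 *    #include <stdio.h>
 *    #include <stdlib.h>
 *    int main(void)
 *    {
 *      const size_t nx = 3000, ny = 1500, num_img = 5;   // Pixels per image; 5 images in the file
 *      float *img = (float*) malloc(num_img*nx*ny*sizeof(float));
 *      FILE  *f   = fopen("mc-gpu_image.raw", "rb");
 *      if (f!=NULL && img!=NULL && num_img*nx*ny == fread(img, sizeof(float), num_img*nx*ny, f))
 *      {
 *        // Images 0 to 4: total, primary, Compton, Rayleigh, multi-scatter.
 *        // Pixel (ix,iy) of image k is at index k*nx*ny + iy*nx + ix (X increments first):
 *        size_t ix = 100, iy = 200;
 *        printf("Primary signal at pixel (%zu,%zu) = %f eV/cm^2\n", ix, iy, img[1*nx*ny + iy*nx + ix]);
 *      }
 *      if (f!=NULL) fclose(f);
 *      free(img);
 *      return 0;
 *    }
 * \endcode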
*
* The program also reports the simulated images and the dose at the Z plane at the level of the x ray
* source as ASCII text files. The ASCII output can be readily visualized with the GNUPLOT scripts
* distributed with MC-GPU. The header section at the beginning of these text files provides the
* information required to easily read the RAW binary files with IMAGEJ, OCTAVE or other programs.
*
*
*
* \section sec_compilation Code compilation and execution
*
 * MC-GPU has been developed and tested only on the Linux operating system.
* A Makefile script is provided to compile the MC-GPU code in Linux.
* The CUDA libraries and the GNU GCC compiler must be previously installed.
* The Makefile may have to be edited to modify the library path.
 * The code requires the zlib library ("zlib.h") to be able to open gzipped input files.
*
*
* MC-GPU uses CUDA to access NVIDIA GPUs but all the actual computations are coded
* in standard C.
 * The code can only be executed on NVIDIA GPUs (the CPU-only execution mode of the initial
 * versions, selected by not defining -DUSING_CUDA, is no longer available).
 * When the pre-processor variable "USING_MPI" is defined (i.e., compiling with
 * "-DUSING_MPI"), Message Passing Interface (MPI) library calls are used to share information
 * between multiple CPU threads in different computers.
* Each MPI thread gets a unique id in the CPU and addresses a unique GPU.
* At the end of the simulation the images and doses tallied by the different GPUs are
 * reduced to form a single output file equivalent to a sequential simulation of the same
* number of particles.
*
 * The code can be easily compiled by executing the command "make" or running the provided
 * "./make.sh" script.
 * Alternatively, the code can be compiled from the command line with a command like this
 * (example using CUDA and MPI, the openMPI library in this case):
* \code
* nvcc -DUSING_MPI MC-GPU_v1.3.cu -o MC-GPU_v1.3.x -O3
* -use_fast_math -L/usr/lib/ -I. -I/usr/local/cuda/include
* -I/usr/local/cuda/samples/common/inc -I/usr/local/cuda/samples/shared/inc/
* -I/usr/include/openmpi -lmpi -lz --ptxas-options=-v
* -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_30,code=sm_30
* \endcode
*
* The same source code can also be compiled for a regular CPU using:
* \code
* gcc -x c -O3 MC-GPU_v1.3.cu -o MC-GPU_v1.3_CPU.x -I./ -lm -lz
* \endcode
*
* To run a simulation (and keep the information reported to the standard
* output in an external file) the compiled code can be executed as:
* \code
* ./MC-GPU_v1.3.x MC-GPU_v1.3.in | tee MC-GPU_v1.3.out
* \endcode
*
 * All simulations can be executed in the same way using the code compiled for the CPU
* or the GPU (however, the number of histories should be reduced for the CPU to finish
* the simulation in a reasonable time).
* To run the simulation in parallel with MPI in multiple GPUs (or CPU cores) in the
* current computer the user can execute:
* \code
* mpirun -n 4 ./MC-GPU_v1.3.x MC-GPU_v1.3.in
* \endcode
*
* To use GPUs in different computers, the user must make sure all computers can access the simulation
* files and that the libraries are correctly set up in all nodes.
* To execute a simulation (with verbose MPI information being reported):
* \code
* mpirun --tag-output -v -x LD_LIBRARY_PATH -hostfile myhostfile.txt -n 8
* /fullPath/MC-GPU_v1.3.x /fullPath/MC-GPU_v1.3.in | tee MC-GPU_v1.3.out
* \endcode
*
 * The host file ('myhostfile.txt' in the example above) lists the IP addresses and number of
 * computing slots (GPUs) of the computers collaborating in the simulation. This file is not
 * necessary when using multiple GPUs in a single workstation. When using multiple computers,
 * the simulation files should be located on a shared drive to make sure every node can access
 * the input data.
 * The different workstations must have different host names in order to be differentiated by
 * the MPI threads. The multiple threads communicate with each other to make sure they don't
 * use the same GPU in the same workstation.
*
*
*
* \section sec_issues Known issues
*
* In extremely long simulations, it is theoretically possible to cause an overflow of the counters
 * estimating the mean and standard deviation of the material or voxel doses. If this happens, the
 * results will be incorrect, and even negative or NaN values may be reported.
*
*
*
*
* \section sec_ref References
*
* -# Badano A, Graff CG, Badal A, et al. Evaluation of Digital Breast Tomosynthesis as Replacement of Full-Field Digital Mammography Using an In Silico Imaging Trial. JAMA Netw Open 1(7), e185474 (2018)
* -# A. Badal and A. Badano, Accelerating Monte Carlo simulations of photon transport in a voxelized geometry using a massively parallel Graphics Processing Unit, Med. Phys. 36, p. 4878-4880 (2009)
 * -# A. Badal and A. Badano, Monte Carlo Simulation of X-Ray Imaging Using a Graphics Processing Unit, IEEE NSC-MIC Conference Record, HP3-1, p. 4081-4084 (2009)
* -# A. Badal, I. Kyprianou, D. Sharma and A. Badano, Fast cardiac CT simulation using a Graphics Processing Unit-accelerated Monte Carlo code, Proc. SPIE Medical Imaging Conference 7622, p. 762231 (2010)
* -# A. Badal and A. Badano, Fast Simulation of Radiographic Images Using a Monte Carlo X-Ray Transport Algorithm Implemented in CUDA, Chapter 50 of GPU Computing Gems (Emerald Edition), p. 813-830, editor Wen-mei W. Hwu, publisher Morgan Kaufmann (Elsevier), Burlington MA, 2010
* -# F. Salvat, J. M. Fernandez-Varea and J. Sempau, PENELOPE – A code system for Monte Carlo simulation of electron and photon transport, NEA-OECD, Issy-les-Moulineaux, available at www.nea.fr/html/dbprog/peneloperef.html (2006)
* -# NVIDIA Corporation, NVIDIA CUDA(TM) Programming Guide, Technical Report available at www.nvidia.com/cuda (2011)
* -# A. Badal and J. Sempau, A package of Linux scripts for the parallelization of Monte Carlo simulations, Comput. Phys. Commun. 175 (6), p. 440-450 (2006)
*
*
*
* @file MC-GPU_v1.5b.cu
* @author Andreu Badal (Andreu.Badal-Soler{at}fda.hhs.gov)
* @date 2018/01/01
* -- MC-GPU v.1.5_DBT: 2017/06/28
* -- MC-GPU v.1.4_DBT: 2016/02/02
* -- MC-GPU v.1.3: 2012/12/12
* -- MC-GPU v.1.2: 2011/10/25
* -- MC-GPU v.1.1: 2010/06/25
* -- MC-GPU v.1.0: 2009/03/17
*/
////////////////////////////////////////////////////////////////////////////////////////
// *** Include header file with the structures and functions declarations:
#include <MC-GPU_v1.5b.h>
// *** Include functions to read phantom from binary form:
#include "load_voxels_binary_VICTRE_v1.5b.c"
// *** Include the computing kernel:
#include <MC-GPU_kernel_v1.5b.cu>
////////////////////////////////////////////////////////////////////////////////
//! Main program of MC-GPU: initialize the simulation environment, launch the GPU
//! kernels that perform the x ray transport and report the final results.
//! This function reads the description of the simulation from an external file
//! given in the command line. This input file defines the number of particles to
//! simulate, the characteristics of the x-ray source and the detector, the number
//! and spacing of the projections (if simulating a CT), the location of the
//! material files containing the interaction mean free paths, and the location
//! of the voxelized geometry file.
//!
//! @author Andreu Badal
//!
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
// -- Start time counter:
time_t current_time = time(NULL); // Get current time (in seconds)
clock_t clock_start, clock_end, clock_start_beginning; // (requires standard header <time.h>)
clock_start = clock(); // Get current clock counter
clock_start_beginning = clock_start;
#ifdef USING_MPI
// -- Using MPI to access multiple GPUs to simulate the x-ray projection image:
int myID = -88, numprocs = -99, return_reduce = -1;
MPI_Init(&argc, &argv); // Init MPI and get the current thread ID
MPI_Comm_rank(MPI_COMM_WORLD, &myID);
MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
char MPI_processor_name[81];
int resultlen = -1;
MPI_Get_processor_name(MPI_processor_name, &resultlen);
char* char_time = ctime(&current_time); char_time[19] = '\0'; // The time is located between characters 11 and 19.
printf(" >> MPI run (myId=%d, numprocs=%d) on processor \"%s\" (time: %s) <<\n", myID, numprocs, MPI_processor_name, &char_time[11]);
fflush(stdout); // Clear the screen output buffer
MPI_Barrier(MPI_COMM_WORLD); // Synchronize MPI threads
MAIN_THREAD printf(" -- Time spent initializing the MPI world (MPI_Barrier): %.3f s\n", ((double)(clock()-clock_start))/CLOCKS_PER_SEC);
#else
int myID = 0, numprocs = 1; // Only one CPU thread used when MPI is not activated (multiple projections will be simulated sequentially).
#endif
MAIN_THREAD
{
printf("\n\n *****************************************************************************\n");
printf( " *** MC-GPU, version 1.5b_DBT (https://github.com/DIDSR/VICTRE_MCGPU) ***\n");
printf( " *** (http://code.google.com/p/mcgpu/) ***\n");
printf( " *** ***\n");
printf( " *** A. Badal and A. Badano, \"Accelerating Monte Carlo simulations of *** \n");
printf( " *** photon transport in a voxelized geometry using a massively parallel *** \n");
printf( " *** Graphics Processing Unit\", Medical Physics 36, pp. 4878–4880 (2009) ***\n");
printf( " *** ***\n");
printf( " *** Andreu Badal (Andreu.Badal-Soler{at}fda.hhs.gov) ***\n");
printf( " *****************************************************************************\n\n");
printf("****** Code execution started on: %s\n\n", ctime(¤t_time));
fflush(stdout);
}
// The "MAIN_THREAD" macro prints the messages just once when using MPI threads (it has no effect if MPI is not used): MAIN_THREAD == "if(0==myID)"
MAIN_THREAD printf ("\n *** CUDA SIMULATION IN THE GPU ***\n");
MAIN_THREAD printf("\n -- INITIALIZATION phase:\n");
MAIN_THREAD fflush(stdout); // Clear the screen output buffer for the main thread
///////////////////////////////////////////////////////////////////////////////////////////////////
// *** Declare the arrays and structures that will contain the simulation data:
struct voxel_struct voxel_data; // Define the geometric constants of the voxel file
struct detector_struct detector_data[MAX_NUM_PROJECTIONS+1]; // Define an x ray detector (for each projection)
struct source_struct source_data[MAX_NUM_PROJECTIONS+1]; // Define the particles source (for each projection)
struct source_energy_struct source_energy_data; // Define the source energy spectrum
struct linear_interp mfp_table_data; // Constant data for the linear interpolation
struct compton_struct compton_table; // Structure containing Compton sampling data (to be copied to CONSTANT memory)
struct rayleigh_struct rayleigh_table; // Structure containing Rayleigh sampling data (to be copied to CONSTANT memory)
// float2 *voxel_mat_dens = NULL; //!!FixedDensity_DBT!! Density is now fixed for each material, not adapted for each voxel.
int *voxel_mat_dens = NULL; //!!bitree!! v1.5 --> using an integer value to be able to store both the material number (positive) or pointers to the binary tree branches (negative)
long long int voxel_mat_dens_bytes = 0; // Size (in bytes) of the voxels array (using unsigned int to allocate up to 4.2GBytes)
char *bitree = NULL; // Array storing the binary tree structures for each non-uniform low resolution voxel
unsigned int bitree_bytes = 0; // Size (in bytes) of the bitree array
int *voxel_geometry_LowRes = NULL; // Array to temporarily store the low resolution version of the geometry when the binary tree is created
unsigned int voxel_geometry_LowRes_bytes = 0;
float density_max[MAX_MATERIALS];
float density_nominal[MAX_MATERIALS];
unsigned long long int *image = NULL; // Pointer where image array will be allocated
int image_bytes = -1; // Size of the image array
int mfp_table_bytes = -1, mfp_Woodcock_table_bytes = -1; // Size of the table arrays
float2 *mfp_Woodcock_table = NULL; // Linear interpolation data for the Woodcock mean free path [cm]
float3 *mfp_table_a = NULL, *mfp_table_b = NULL; // Linear interpolation data for 3 different interactions:
// (1) inverse total mean free path (divided by density, cm^2/g)
// (2) inverse Compton mean free path (divided by density, cm^2/g)
// (3) inverse Rayleigh mean free path (divided by density, cm^2/g)
short int dose_ROI_x_min, dose_ROI_x_max, dose_ROI_y_min, dose_ROI_y_max, dose_ROI_z_min, dose_ROI_z_max; // Coordinates of the dose region of interest (ROI)
ulonglong2 *voxels_Edep = NULL; // Pointer where the voxel energy deposition array will be allocated
int voxels_Edep_bytes = 0; // Size of the voxel Edep array
ulonglong2 materials_dose[MAX_MATERIALS]; // Array for tally_materials_dose. !!tally_materials_dose!!
int kk;
for(kk=0;kk<MAX_MATERIALS;kk++)
{
materials_dose[kk].x = 0; // Initializing data !!tally_materials_dose!!
materials_dose[kk].y = 0;
density_nominal[kk] =-1.0f;
}
clock_t clock_kernel; // Using only CPU timers after CUDA 5.0
double time_elapsed_MC_loop = 0.0, time_total_MC_simulation = 0.0, time_total_MC_init_report = 0.0;
unsigned long long int total_histories;
int histories_per_thread, seed_input, num_threads_per_block, gpu_id, num_projections;
int flag_material_dose=-2;
bool flag_simulateMammoAfterDBT=false, flag_detectorFixed=false; // !!DBTv1.4!!;
double SRotAxisD=-1.0, translation_helical=0.0;
char file_name_voxels[250], file_name_materials[MAX_MATERIALS][250], file_name_output[250], file_dose_output[250], file_name_espc[250];
// *** Read the input file given in the command line and return the significant data:
read_input(argc, argv, myID, &total_histories, &seed_input, &gpu_id, &num_threads_per_block, &histories_per_thread, detector_data, &image, &image_bytes, source_data, &source_energy_data, &voxel_data, file_name_voxels, file_name_materials, file_name_output, file_name_espc, &num_projections, &voxels_Edep, &voxels_Edep_bytes, file_dose_output, &dose_ROI_x_min, &dose_ROI_x_max, &dose_ROI_y_min, &dose_ROI_y_max, &dose_ROI_z_min, &dose_ROI_z_max, &SRotAxisD, &translation_helical, &flag_material_dose, &flag_simulateMammoAfterDBT, &flag_detectorFixed);
// *** Read the energy spectrum and initialize its sampling with the Walker aliasing method:
MAIN_THREAD printf(" -- Reading the energy spectrum and initializing the Walker aliasing sampling algorithm.\n");
float mean_energy_spectrum = 0.0f;
init_energy_spectrum(file_name_espc, &source_energy_data, &mean_energy_spectrum);
// *** Output some of the data read to make sure everything was correctly read:
MAIN_THREAD
{
printf("\n -- Data read from the input file:\n");
if (total_histories < (unsigned long long int)(100000))
printf(" simulation time = %lld s\n", total_histories);
else
printf(" x-ray tracks to simulate = %lld\n", total_histories);
printf(" initial random seed = %d\n", seed_input);
double phi0 = ((double)source_data[0].D_phi)*RAD2DEG;
double theta0 = 2.0*(90.0 - acos(((double)source_data[0].cos_theta_low))*RAD2DEG);
if (source_data[0].flag_halfConeX)
{
theta0 = 0.5*theta0;
printf(" NOTE: sampling only upper half of collimated cone beam, with beam offset to edge of the image (eg, mammo).\n"); // !!DBT!! !!HalfBeam!! !!DBTv1.4!!
}
printf(" azimuthal (phi), polar apertures = %.6f , %.6f degrees\n", phi0, theta0);
printf(" (max_height_at_y1cm = %f , max_width_at_y1cm = %f)\n", source_data[0].max_height_at_y1cm, source_data[0].max_width_at_y1cm); // !!DBTv1.4!!
printf(" source direction = (%f, %f, %f)\n", source_data[0].direction.x, source_data[0].direction.y, source_data[0].direction.z);
printf(" focal spot position = (%f, %f, %f)\n", source_data[0].position.x, source_data[0].position.y, source_data[0].position.z);
printf(" focal spot Gaussian blur FWHM = %f (3D Gaussian dist. cropped at 2*sigma)\n", source_data[0].focal_spot_FWHM); // !!DBTv1.4!!
if (num_projections!=1 && flag_simulateMammoAfterDBT==true)
printf(" focal spot rotation blur = %f degrees (disabled for the first single projection at 0 deg)\n", source_data[0].rotation_blur*RAD2DEG); // !!DBTv1.5!!
else
printf(" focal spot rotation blur = %f degrees\n", source_data[0].rotation_blur*RAD2DEG); // !!DBTv1.5!!
printf(" source-detector distance = %f cm\n", detector_data[0].sdd);
printf(" detector center position = (%f, %f, %f)\n", detector_data[0].center.x, detector_data[0].center.y, detector_data[0].center.z);
printf(" image offset from beam at center = (%f, %f)\n", detector_data[0].offset.x, detector_data[0].offset.y); // !!DBTv1.4!!
printf(" detector layer thickness = %f cm (=%.2f micron)\n", detector_data[0].scintillator_thickness, 1.0e4f*detector_data[0].scintillator_thickness); // !!DBTv1.4!!
printf(" detector material average MFP = %f cm\n", detector_data[0].scintillator_MFP);
printf(" detector material K-edge energy = %f eV\n", detector_data[0].kedge_energy);
printf(" fluorescence energy and yield = %f eV , %f\n", detector_data[0].fluorescence_energy, detector_data[0].fluorescence_yield);
printf(" MFP at fluorescence energy = %f cm\n", detector_data[0].fluorescence_MFP);
if (detector_data[0].gain_W>0.001f)
{
printf(" detector gain and Swank factor = %f eV/detected_charge, %f (%f relative std_dev)\n", detector_data[0].gain_W, 1.0f/(1.0f+detector_data[0].Swank_rel_std*detector_data[0].Swank_rel_std), detector_data[0].Swank_rel_std); // Swank_factor = mean^2/(mean^2 + std_dev^2) --> (std_dev/mean) = sqrt(1/Swank_factor - 1)
printf(" electronic noise per pixel = %f electrons\n", detector_data[0].electronic_noise);
}
printf(" detector cover thickness = %f cm\n", detector_data[0].cover_thickness); // !!DBTv1.5!!
printf(" cover average mean free path = %f cm\n", detector_data[0].cover_MFP);
if (detector_data[0].grid_freq > 0.0f)
{
printf(" Antiscatter grid ratio = %f\n", fabsf(detector_data[0].grid_ratio)); // !!DBTv1.5!!
printf(" Antiscatter grid frequency = %f lines per cm\n", detector_data[0].grid_freq);
printf(" Antiscatter grid strip thickness = %f cm (=%.2f micron)\n", detector_data[0].grid_strip_thickness, 1.0e4*detector_data[0].grid_strip_thickness);
float h = fabsf(detector_data[0].grid_ratio)*(1.0f/detector_data[0].grid_freq - detector_data[0].grid_strip_thickness); // Height of the grid, according to input grid ratio, freq, and strip thickness
printf(" Computed antiscatter grid height = %f cm (=%.2f micron)\n", h, 1.0e4*h);
printf(" strips average mean free path = %f cm\n", 1.0f/detector_data[0].grid_strip_mu);
printf(" interspace average mean free path = %f cm\n", 1.0f/detector_data[0].grid_interspace_mu);
if (detector_data[0].grid_ratio<0.0f)
printf(" Antiscatter grid orientation = 0 --> 1D collimated grid with strips perpendicular to lateral direction (mammo style)\n");
else
printf(" Antiscatter grid orientation = 1 --> 1D collimated grid with strips parallel to lateral direction (DBT style)\n");
}
else
printf("\n Antiscatter grid: DISABLED!\n\n"); // !!DBTv1.5!!
printf(" number of pixels image = %dx%d = %d\n", detector_data[0].num_pixels.x, detector_data[0].num_pixels.y, detector_data[0].total_num_pixels);
printf(" pixel size = %.5fx%.5f cm\n", 1.0f/detector_data[0].inv_pixel_size_X, 1.0f/detector_data[0].inv_pixel_size_Z);
printf(" detector size = %.5fx%.5f cm\n", detector_data[0].width_X, detector_data[0].height_Z);
printf(" number of projections = %d\n", num_projections);
if (num_projections!=1 || source_data[0].rotation_blur>0.000001f) // Report data if blur is used because the rotation is around the source-rotation axis
{
printf(" source-rotation axis-distance = %lf cm\n", SRotAxisD);
printf(" angle between projections = %lf\n", source_data[0].angle_per_projection*RAD2DEG);
printf(" initial angle offset = %lf\n", source_data[0].angle_offset*RAD2DEG); // !!DBTv1.4!!
printf(" rotation point = (%f, %f, %f)\n", source_data[0].rotation_point.x, source_data[0].rotation_point.y, source_data[0].rotation_point.z); // !!DBTv1.4!!
printf(" axis of rotation = (%f, %f, %f)\n", source_data[0].axis_of_rotation.x, source_data[0].axis_of_rotation.y, source_data[0].axis_of_rotation.z); // !!DBTv1.4!!
printf(" translation between proj = %lf\n", translation_helical);
}
printf(" output image file = %s\n", file_name_output);
printf(" input voxel file = %s\n", file_name_voxels);
printf(" voxel geometry offset = (%f, %f, %f) cm\n", voxel_data.offset.x, voxel_data.offset.y, voxel_data.offset.z); // !!DBTv1.4!!
printf(" size coarse voxels for binary trees = %d x %d x %d\n", (int)voxel_data.num_voxels_coarse.x, (int)voxel_data.num_voxels_coarse.y, (int)voxel_data.num_voxels_coarse.z); // !!bitree!! v1.5b
if (dose_ROI_x_max>-1)
{
printf(" output dose file = %s\n", file_dose_output);
printf(" input region of interest dose = X[%d,%d], Y[%d,%d], Z[%d,%d]\n", dose_ROI_x_min+1, dose_ROI_x_max+1, dose_ROI_y_min+1, dose_ROI_y_max+1, dose_ROI_z_min+1, dose_ROI_z_max+1); // Show ROI with index=1 for the first voxel instead of 0.
}
printf("\n energy spectrum file = %s\n", file_name_espc);
printf( " number of energy bins read = %d\n", source_energy_data.num_bins_espc);
printf( " minimum, maximum energies = %.3f, %.3f keV\n", 0.001f*source_energy_data.espc[0], 0.001f*source_energy_data.espc[source_energy_data.num_bins_espc]);
printf( " mean energy spectrum = %.3f keV\n\n", 0.001f*mean_energy_spectrum);
fflush(stdout);
}
// *** Set the detectors and sources for the CT trajectory (if needed, ie, for more than one projection):
if (num_projections != 1)
{
set_CT_trajectory(myID, num_projections, source_data, detector_data, translation_helical, flag_detectorFixed);
}
fflush(stdout);
double mass_materials[MAX_MATERIALS];
// !!bitree!! If the binary tree is used, read the geometry only with the main thread, and then broadcast the new data:
// if the tree is not used, every thread reads the input geometry at the same time.
if (0==myID || (voxel_data.num_voxels_coarse.x)==0)
{
// *** Read the voxel data and allocate the density map matrix. Return the maximum density:
if (voxel_data.num_voxels.x<1)
{
// -- Read ASCII format geometry: geometric parameters will be read from the header file !!DBTv1.4!!
load_voxels(myID, file_name_voxels, density_max, &voxel_data, &voxel_mat_dens, &voxel_mat_dens_bytes, &dose_ROI_x_max, &dose_ROI_y_max, &dose_ROI_z_max);
}
else
{
// -- Read binary RAW format geometry: geometric parameters given in input file !!DBTv1.4!!
load_voxels_binary_VICTRE(myID, file_name_voxels, density_max, &voxel_data, &voxel_mat_dens, &voxel_mat_dens_bytes, &dose_ROI_x_max, &dose_ROI_y_max, &dose_ROI_z_max); //!!DBT!! // !!DBTv1.4!!
}
// -- Pre-compute the total mass of each material present in the voxel phantom (to be used in "report_materials_dose"):
double voxel_volume = 1.0 / ( ((double)voxel_data.inv_voxel_size.x) * ((double)voxel_data.inv_voxel_size.y) * ((double)voxel_data.inv_voxel_size.z) );
for(kk=0; kk<MAX_MATERIALS; kk++)
mass_materials[kk] = 0.0;
long long int llk;
for(llk=0; llk<((long long int)voxel_data.num_voxels.x*(long long int)voxel_data.num_voxels.y*(long long int)voxel_data.num_voxels.z); llk++) // For each voxel in the geometry
{
mass_materials[((int)voxel_mat_dens[llk])] += ((double)density_LUT[((int)voxel_mat_dens[llk])])*voxel_volume; // Add material mass = density_LUT*volume (first material==0)
}
// ** Create the low resolution version of the phantom and the binary tree structures, if requested in the input file and dose dep tally disabled: //!!bitree!! v1.5b
if ((voxel_data.num_voxels_coarse.x)!=0)
{
if (dose_ROI_x_max>0)
{
MAIN_THREAD printf("\n\n !!ERROR!! Sorry, the voxel dose deposition tally cannot be used when the binary tree is active. Please, disable the binary tree.\n\n");
exit(-1);
}
MAIN_THREAD printf("\n !!bitree!! Creating a binary tree structure to minimize memory use.\n"); // !!bitree!! v1.5b
create_bitree(myID, &voxel_data, voxel_mat_dens, &bitree, &bitree_bytes, &voxel_geometry_LowRes, &voxel_geometry_LowRes_bytes); //!!bitree!! v1.5b
MAIN_THREAD printf(" >> RAM memory allocation: original voxelized geometry = %f MBytes; low resolution voxelized geometry = %f MBytes;\n", voxel_mat_dens_bytes/(1024.f*1024.f), voxel_geometry_LowRes_bytes/(1024.f*1024.f));
MAIN_THREAD printf(" binary tree = %f MBytes; image vector = %f MBytes; data structures = %f Mbytes\n", bitree_bytes/(1024.f*1024.f), image_bytes/(1024.f*1024.f), (sizeof(struct voxel_struct)+sizeof(struct source_struct)+sizeof(struct detector_struct)+sizeof(struct linear_interp)+2*mfp_table_bytes+sizeof(struct rayleigh_struct)+sizeof(struct compton_struct))/(1024.f*1024.f));
MAIN_THREAD printf(" (reduction in memory use with bitree: [low res voxels + binary tree]-[high res voxels] = %f MBytes = %.3f%%)\n", (voxel_geometry_LowRes_bytes+bitree_bytes-voxel_mat_dens_bytes)/(1024.f*1024.f), 100.f*(voxel_geometry_LowRes_bytes+bitree_bytes-voxel_mat_dens_bytes)/voxel_mat_dens_bytes);
// -- Replace the high resolution version of the geometry by the low resolution version: !!DeBuG!! voxel dose tally can't be used now!!
free(voxel_mat_dens); //!!bitree!! v1.5b
voxel_mat_dens = voxel_geometry_LowRes; //!!bitree!! v1.5b
voxel_mat_dens_bytes = voxel_geometry_LowRes_bytes; //!!bitree!! v1.5b
}
else
{
MAIN_THREAD printf("\n !!bitree!! Binary tree structure disabled: standard voxelized geometry in use.\n\n"); // !!bitree!! v1.5b
MAIN_THREAD printf(" >> RAM memory allocation: voxelized geometry = %f MBytes; image vector = %f MBytes; data structures = %f Mbytes\n", voxel_mat_dens_bytes/(1024.f*1024.f), image_bytes/(1024.f*1024.f), (sizeof(struct voxel_struct)+sizeof(struct source_struct)+sizeof(struct detector_struct)+sizeof(struct linear_interp)+2*mfp_table_bytes+sizeof(struct rayleigh_struct)+sizeof(struct compton_struct))/(1024.f*1024.f));
}
}
fflush(stdout);
// !!bitree!! If the binary tree is used, broadcast the tree data and all auxiliary data from main to every other thread:
#ifdef USING_MPI
if (numprocs>1 && (voxel_data.num_voxels_coarse.x)!=0)
{
MPI_Barrier(MPI_COMM_WORLD); // Synchronize MPI threads
// Send all the geometric data that has been read or changed by the root node:
MPI_Bcast(&voxel_data.num_voxels.x, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&voxel_data.num_voxels.y, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&voxel_data.num_voxels.z, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&voxel_data.size_bbox.x, 1, MPI_FLOAT, 0, MPI_COMM_WORLD); MPI_Bcast(&voxel_data.size_bbox.y, 1, MPI_FLOAT, 0, MPI_COMM_WORLD); MPI_Bcast(&voxel_data.size_bbox.z, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);
voxel_data.voxel_size_HiRes.x = voxel_data.voxel_size.x; voxel_data.voxel_size_HiRes.y = voxel_data.voxel_size.y; voxel_data.voxel_size_HiRes.z = voxel_data.voxel_size.z; // Save the original high resolution voxel size
MPI_Bcast(&voxel_data.voxel_size.x, 1, MPI_FLOAT, 0, MPI_COMM_WORLD); MPI_Bcast(&voxel_data.voxel_size.y, 1, MPI_FLOAT, 0, MPI_COMM_WORLD); MPI_Bcast(&voxel_data.voxel_size.z, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);
voxel_data.inv_voxel_size.x = 1.0/voxel_data.voxel_size.x; voxel_data.inv_voxel_size.y = 1.0/voxel_data.voxel_size.y; voxel_data.inv_voxel_size.z = 1.0/voxel_data.voxel_size.z;
MPI_Bcast(density_max, MAX_MATERIALS, MPI_FLOAT, 0, MPI_COMM_WORLD);
MPI_Bcast(mass_materials, MAX_MATERIALS, MPI_DOUBLE, 0, MPI_COMM_WORLD);
// Allocate memory (except root) and transmit voxel+binary tree links data:
MPI_Bcast(&voxel_geometry_LowRes_bytes, 1, MPI_UNSIGNED, 0, MPI_COMM_WORLD);
MPI_Bcast(&bitree_bytes, 1, MPI_UNSIGNED, 0, MPI_COMM_WORLD);
if(0!=myID)
{
voxel_mat_dens_bytes = voxel_geometry_LowRes_bytes;
voxel_mat_dens = (int*) malloc(voxel_mat_dens_bytes); // Allocate voxels (low resolution)
bitree = (char*) malloc(bitree_bytes); // Allocate binary tree elements
}
MPI_Bcast(voxel_mat_dens, ((unsigned long long int)voxel_data.num_voxels.x)*(voxel_data.num_voxels.y*voxel_data.num_voxels.z), MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(bitree, bitree_bytes, MPI_CHAR, 0, MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
}
#endif
// *** Read the material mean free paths and set the interaction table in a "linear_interp" structure:
load_material(myID, file_name_materials, density_max, density_nominal, &mfp_table_data, &mfp_Woodcock_table, &mfp_Woodcock_table_bytes, &mfp_table_a, &mfp_table_b, &mfp_table_bytes, &rayleigh_table, &compton_table);
// -- Check that the input material tables and the x-ray source are consistent:
if ( (source_energy_data.espc[0] < mfp_table_data.e0) || (source_energy_data.espc[source_energy_data.num_bins_espc] > (mfp_table_data.e0 + (mfp_table_data.num_values-1)/mfp_table_data.ide)) )
{
MAIN_THREAD
{
printf("\n\n\n !!ERROR!! The input x-ray source energy spectrum minimum (%.3f eV) and maximum (%.3f eV) energy values\n", source_energy_data.espc[0], source_energy_data.espc[source_energy_data.num_bins_espc]);
printf( " are outside the tabulated energy interval for the material properties tables (from %.3f to %.3f eV)!!\n", mfp_table_data.e0, (mfp_table_data.e0+(mfp_table_data.num_values-1)/mfp_table_data.ide));
printf( " Please, modify the input energy spectra to fit the tabulated limits or create new tables.\n\n");
}
#ifdef USING_MPI
MPI_Finalize();
#endif
exit(-1);
}
// *** Initialize the GPU using the NVIDIA CUDA libraries:
// -- Declare the pointers to the device global memory, when using the GPU:
// float2 *voxel_mat_dens_device = NULL,
// *mfp_Woodcock_table_device = NULL;
// char *voxel_mat_dens_device = NULL; //!!FixedDensity_DBT!! Allocate material vector as char
int *voxel_mat_dens_device = NULL; //!!bitree!! v1.5 --> using an integer value to be able to store both the material number (positive) or pointers to the binary tree branches (negative)
char *bitree_device = NULL; //!!bitree!! v1.5b
float2 *mfp_Woodcock_table_device = NULL; //!!FixedDensity_DBT!!
float3 *mfp_table_a_device = NULL,
*mfp_table_b_device = NULL;
unsigned long long int *image_device = NULL;
struct rayleigh_struct *rayleigh_table_device = NULL;
struct compton_struct *compton_table_device = NULL;
ulonglong2 *voxels_Edep_device = NULL;
struct detector_struct *detector_data_device = NULL;
struct source_struct *source_data_device = NULL;
ulonglong2 *materials_dose_device = NULL; // !!tally_materials_dose!!
int* seed_input_device = NULL; // Store latest random seed used in GPU in global memory to continue random sequence in consecutive projections. !!DBTv1.4!!
// -- Sets the CUDA enabled GPU that will be used in the simulation, and allocate and copies the simulation data in the GPU global and constant memories.
init_CUDA_device(&gpu_id, myID, numprocs, &voxel_data, source_data, &source_energy_data, detector_data, &mfp_table_data, /*Variables GPU constant memory*/
voxel_mat_dens, &voxel_mat_dens_device, voxel_mat_dens_bytes, /*Variables GPU global memory*/
bitree, &bitree_device, bitree_bytes, //!!bitree!! v1.5b
image, &image_device, image_bytes,
mfp_Woodcock_table, &mfp_Woodcock_table_device, mfp_Woodcock_table_bytes,
mfp_table_a, mfp_table_b, &mfp_table_a_device, &mfp_table_b_device, mfp_table_bytes,
&rayleigh_table, &rayleigh_table_device,
&compton_table, &compton_table_device, &detector_data_device, &source_data_device,
voxels_Edep, &voxels_Edep_device, voxels_Edep_bytes, &dose_ROI_x_min, &dose_ROI_x_max, &dose_ROI_y_min, &dose_ROI_y_max, &dose_ROI_z_min, &dose_ROI_z_max,
materials_dose, &materials_dose_device, flag_material_dose, &seed_input_device, &seed_input, (num_projections+1));
// !!DBTv1.4!! Allocate space for one extra projection (num_projections+1) for case flag_simulateMammoAfterDBT==true !!DBTv1.4!!
// -- Constant data already moved to the GPU: clean up unnecessary RAM memory
free(mfp_Woodcock_table);
free(mfp_table_a);
free(mfp_table_b);
if (0!=myID) // Keep the geometry data for the MPI root because the voxel densities are still needed to compute the final doses
free(voxel_mat_dens);
MAIN_THREAD
{
current_time=time(NULL);
printf("\n -- INITIALIZATION finished: elapsed time = %.3f s. \n\n", ((double)(clock()-clock_start))/CLOCKS_PER_SEC);
}
#ifdef USING_MPI
fflush(stdout);
MPI_Barrier(MPI_COMM_WORLD); // Synchronize MPI threads before starting the MC phase.
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////
MAIN_THREAD
{
current_time=time(NULL);
printf("\n\n -- MONTE CARLO LOOP phase. Time: %s\n\n", ctime(¤t_time));
fflush(stdout);
}
// -- A number of histories smaller than 24 hours in seconds (3600*24=86400) means that the user wants to simulate for the input number of seconds in each GPU, not a fixed number of histories:
unsigned long long int total_histories_INPUT = total_histories; // Save the original input values to be re-used for multiple projections
int doing_speed_test = -1, simulating_by_time = 0; // 0==false
if (total_histories < (unsigned long long int)(95000))
simulating_by_time = 1; // 1=true
int num_blocks_speed_test = 0;
unsigned long long int histories_speed_test = (unsigned long long int)0, total_histories_speed_test = (unsigned long long int)0;
float node_speed = -1.0f, total_speed = 1.0f;
double current_angle = -999;
int num_p; // == current projection number
// *************************************************************************************
// *** CT simulation loop (including speed test, if simulating by time or multi-GPU) ***
// *************************************************************************************
int num_projections_loop = num_projections;
if (num_projections>1)
{
num_projections_loop++; // Add an extra projection because [0] always corresponds to the 0 deg projection (first tomographic image starts at [1]) !!DBTv1.4!!
}
for (num_p=0; num_p<num_projections_loop; num_p++)
{
// --Re-load the num histories for each new projection !!DBTv1.4!!
total_histories = total_histories_INPUT;
// --Skip the initial 0 deg projection if we are simulating a tomographic scan and we don't want a separate projection with the full dose: !!DBTv1.4!!
if (0==num_p && num_projections>1 && flag_simulateMammoAfterDBT==false)
continue;
if (flag_simulateMammoAfterDBT && 0==num_p)
{
// -- !!DBT!! Simulate the first 0 deg projection (mammo) with almost as many histories (SCALE_MAMMO_DBT factor) as the whole tomographic scan to follow: !!DBTv1.4!!
total_histories = total_histories_INPUT * num_projections * SCALE_MAMMO_DBT; //!!DBTv1.4!! Scaling dose a factor SCALE_MAMMO_DBT (eg, factor 2/3 for 1 mGy mammo for 1.5 mGy DBT)
MAIN_THREAD
{
printf("\n\n !!DBT!! Simulating first a 0 degree projection with %.4f times the number of histories as the complete scan with %d projections = %lld histories\n", SCALE_MAMMO_DBT, num_projections, total_histories);
printf( " Afterwards, simulate the tomo acquisition starting at most negative angle and ending at most positive angle.\n"); // !!DBT!! !!DBTv1.4!!
printf( " If defined, motion blur is disabled and anti-scatter grid enabled only for the single projection.\n\n");
}
}
else if (flag_simulateMammoAfterDBT && 1==num_p)
MAIN_THREAD printf("\n\n !!DBT!! After the first full simulation (eg, mammo), simulate a DBT acquisition (starting at neg angle) with the input number histories per projections.\n\n");
if (0==num_p)
current_angle = 0.0;
else
current_angle = source_data[0].angle_offset + (num_p-1) * source_data[0].angle_per_projection;
MAIN_THREAD
if (num_projections!=1)
if (flag_simulateMammoAfterDBT && 0==num_p)
printf("\n\n\n\n << Simulating a 0 degree projection (mammography) with %d * %f as many histories as each tomographic projection >>\n\n", num_projections, SCALE_MAMMO_DBT);
else
printf("\n\n\n\n << Simulating tomographic projection %d of %d >> Angle: %lf degrees.\n\n", num_p, num_projections, current_angle*RAD2DEG);
clock_start = clock(); // Start the CPU clock
// *** Simulate in the GPUs the input amount of time or amount of particles:
// -- Estimate GPU speed to use a total simulation time or multiple GPUs:
if ( simulating_by_time==0 && // Simulating a fixed number of particles, not a fixed time (so performing the speed test only once)
node_speed>0.0f && // Speed test already performed for a previous projection in this simulation (node_speed and total_speed variables set)
numprocs>1) // Using multiple GPUs (ie, multiple MPI threads)
{
// -- Simulating successive projections after the first one with a fixed number of particles, with multiple MPI threads: re-use the speed test results from the first projection image:
total_histories = (unsigned long long int)(0.5 + ((double)total_histories) * (((double)node_speed)/total_speed));
doing_speed_test = 0; // No speed test for this projection.
}
else if ( simulating_by_time==1 || numprocs>1)
{
// -- Simulating with a time limit OR multiple MPI threads for the first time (num_p==0): run a speed test to calculate the speed of the current GPU and distribute the number of particles to the multiple GPUs or estimate the total number of particles required to run the input amount of time:
// Note that this ELSE IF block will be skipped if we are using a single MPI thread and a fixed number of particles.
doing_speed_test = 1; // Remember that we are performing the speed test to make sure we add the test histories to the total before the tally reports.
if (node_speed<0.0f) // Speed test not performed before (first projection being simulated): set num_blocks_speed_test and histories_speed_test.
{
num_blocks_speed_test = guestimate_GPU_performance(gpu_id); // Guestimating a good number of blocks to estimate the speed of different generations of GPUs. Slower GPUs will simulate fewer particles and hopefully the fastest GPUs will not have to wait much.
}
histories_speed_test = (unsigned long long int)(num_blocks_speed_test*num_threads_per_block)*(unsigned long long int)(histories_per_thread);
dim3 blocks_speed_test(num_blocks_speed_test, 1);
dim3 threads_speed_test(num_threads_per_block, 1);
#ifdef USING_MPI
// -- Init the current random number generator seed to avoid overlapping sequences with other MPI threads:
if (simulating_by_time == 1)
// Simulating by time: set an arbitrary huge number of particles to skip.
update_seed_PRNG((myID + num_p*numprocs), (unsigned long long int)(123456789012), &seed_input); // Set the random number seed far from any other MPI thread (myID) and away from the seeds used in the previous projections (num_p*numprocs).
else
// Simulating by histories (default):
update_seed_PRNG(myID, total_histories_INPUT*num_projections, &seed_input); // Init the random seed for each MPI thread as far away from the previous thread as if all "total_histories*num_projections" histories were simulated by each thread --> guarantees that each thread has an uncorrelated sequence of random values (at least for the first seed of RANECU). !!DBTv1.4!!
checkCudaErrors(cudaMemcpy(seed_input_device, &seed_input, sizeof(int), cudaMemcpyHostToDevice)); // Upload initial seed value to GPU memory. !!DBTv1.4!!
printf(" ==> CUDA (MPI process #%d in \"%s\"): estimate GPU speed executing %d blocks of %d threads, %d histories per thread: %lld histories in total (random seed=%d).\n", myID, MPI_processor_name, num_blocks_speed_test, num_threads_per_block, histories_per_thread, histories_speed_test, seed_input);
#else
printf(" ==> CUDA: Estimating the GPU speed executing %d blocks of %d threads, %d histories per thread: %lld histories in total.\n", num_blocks_speed_test, num_threads_per_block, histories_per_thread, histories_speed_test);
#endif
fflush(stdout);
clock_kernel = clock();
// -- Launch Monte Carlo simulation kernel for the speed test:
track_particles<<<blocks_speed_test,threads_speed_test>>>(histories_per_thread, (short int)num_p, seed_input_device, image_device, voxels_Edep_device, voxel_mat_dens_device, bitree_device, mfp_Woodcock_table_device, mfp_table_a_device, mfp_table_b_device, rayleigh_table_device, compton_table_device, detector_data_device, source_data_device, materials_dose_device);
#ifdef USING_MPI
// Find out the total number of histories simulated in the speed test by all the GPUs. Note that this MPI call will be executed in parallel with the GPU kernel because it is located before the cudaDeviceSynchronize command!
return_reduce = MPI_Allreduce(&histories_speed_test, &total_histories_speed_test, 1, MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);
if (MPI_SUCCESS != return_reduce)
printf("\n\n !!ERROR!! Error reducing (MPI_Allreduce) the total number of histories in the speed test test??? return_reduce = %d for thread %d\n\n\n", return_reduce, myID);
else
#else
total_histories_speed_test = histories_speed_test;
#endif
fflush(stdout);
cudaDeviceSynchronize(); // Force the runtime to wait until GPU kernel has completed
getLastCudaError("\n\n !!Kernel execution failed while simulating particle tracks!! "); // Check if the CUDA function returned any error
float speed_test_time = float(clock()-clock_kernel)/CLOCKS_PER_SEC;
node_speed = (float) (((double)histories_speed_test)/speed_test_time);
#ifdef USING_MPI
printf(" (MPI process #%d): Estimated GPU speed = %lld hist / %.4f s = %.3f hist/s\n", myID, histories_speed_test, speed_test_time, node_speed);
#else
printf(" Estimated GPU speed = %lld hist / %.3f s = %.3f hist/s\n", histories_speed_test, speed_test_time, node_speed);
#endif
// !!DBTv1.4!! No need to update the seed in the main program bc each GPU continues its series.
// // -- Init random number generator seed to avoid repeating the random numbers used in the speed test:
// update_seed_PRNG(1, histories_speed_test, &seed_input);
if (simulating_by_time==1)
{
// -- Set number of histories for each GPU when simulating by time:
if (total_histories > speed_test_time)
total_histories = (total_histories - speed_test_time)*node_speed; // Calculate the total number of remaining histories by "GPU speed" * "remaining time"
else
total_histories = 1; // Enough particles simulated already, simulate just one more history (block) and report (kernel call would fail if total_histories <= 0).
}
else
{
#ifdef USING_MPI
// -- Simulating a fix number of histories divided between all GPUs (execution time variable):
// Compute the fraction of the total speed that accounts for the current MPI thread:
return_reduce = MPI_Allreduce(&node_speed, &total_speed, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); // Sum all the times and send result to all processes
if (MPI_SUCCESS != return_reduce)
printf("\n\n !!ERROR!! Error reducing (MPI_Allreduce) the speed test results??? return_reduce = %d for thread %d\n\n\n", return_reduce, myID);
else
MAIN_THREAD
{
printf("\n -- Total speed for all GPUs (MPI_Allreduce) = %.3f hist/s; total histories simulated in the speed test (MPI_Allreduce) = %lld.\n", total_speed, total_histories_speed_test);
printf(" The main thread will simulate %.2f%% of the x rays in the simulation.\n", 100.0f*node_speed/total_speed);
}
#else
total_speed = node_speed;
#endif
// - Divide the remaining histories among the MPI threads (GPUs) according to their fraction of the total speed (rounding up).
if (total_histories_speed_test < total_histories)
total_histories = (unsigned long long int)(0.5 + ((double)(total_histories-total_histories_speed_test)) * ((double)(node_speed/total_speed)));
else
total_histories = numprocs; // Enough particles simulated already, simulate just one more history (block) and report (kernel call would fail if total_histories <= 0).
}
} // [Done with case of simulating projections by time or first projection by number of particles]
// else ==> if using only 1 GPU and a fixed number of histories the whole speed test is skipped. The random seed will be different for each projection because it is updated after calling the kernel below.
// fflush(stdout);