////////////////////////////////////////////////////////////////////////////////////////
//
// ****************************
// *** MC-GPU , version 1.5 ***
// ****************************
//
/**
* \mainpage MC-GPU v1.5_DBT
*
* \code
*
* Andreu Badal, PhD (Andreu.Badal-Soler{at}fda.hhs.gov)
*
* Division of Imaging and Applied Mathematics
* Office of Science and Engineering Laboratories
* Center for Devices and Radiological Health
* U.S. Food and Drug Administration
*
* Code release date: 2012/12/12
*
*
*
* \endcode
*
*
*
* \b MC-GPU [1-4] is a Monte Carlo simulation code that can generate synthetic radiographic
* images and computed tomography (CT) scans of realistic models of the human anatomy using the
* computational power of commodity Graphics Processing Unit (GPU) cards.
* The code implements a massively multi-threaded Monte Carlo simulation algorithm
* for the transport of x rays in a voxelized geometry. The x ray interaction models and material
* properties have been adapted from \b PENELOPE \b 2006 [5].
*
*
* \b MC-GPU was developed using the \b CUDA programming model from \b NVIDIA [6] to achieve
* maximum performance on NVIDIA GPUs. The code can also be compiled with a standard C compiler
 * to be executed on a regular CPU.
* In a typical medical imaging simulation, the use of GPU computing with MC-GPU has been shown
 * to provide a speed-up of between 20 and 40 times compared to the execution on a single CPU core.
*
* The MC-GPU code has been described in different scientific publications [1-4].
* The main reference of this work, which the users should cite, is the following [1]:
* \code
* Andreu Badal and Aldo Badano, "Accelerating Monte Carlo simulations of
* photon transport in a voxelized geometry using a massively parallel
* Graphics Processing Unit", Medical Physics 36, pp. 4878–4880 (2009)
* \endcode
* The main developer of MC-GPU is \b Andreu \b Badal, working at the U.S. \b Food \b and
* \b Drug \b Administration (Center for Devices and Radiological Health, Office of Science
* and Engineering Laboratories, Division of Imaging and Applied Mathematics).
* The source code of MC-GPU is free and open software in the public domain, as explained
* in the Disclaimer section below.
* The source code of the original MC-GPU and its auxiliary files were distributed at the website: http://code.google.com/.
* New versions will be available at https://github.com/DIDSR/.
*
*
* This documentation has been automatically generated by \b Doxygen parsing the comments in
* the MC-GPU source code.
 * This code is still in development; please report to the author any issue or bug
 * that you may encounter. Feel free to suggest improvements to the code too!
*
*
*
* \section sec_changes List of modifications in different versions of the code
*
 * \subsection sec_changes_v15b Version 1.5b (2017/06/28)
*
* - Added electronic noise and Swank factor for detected charges output
* - Anti-scatter grid based on Day and Dance, Phys Med Biol 28, pp. 1429-1433 (1983)
* - Binary tree-based geometry model to reduce memory use: voxel dose tally can't be used with binary tree!
*
 * \subsection sec_changes_v14 Version 1.4 (2016/02/02)
*
* - Fixed density for each material: disabling the option to use a different density for each voxel (to reduce memory).
* - Enabled input of voxelized geometry file in binary format (single byte per voxel).
* - Selenium detector model with thickness, attenuation, fluorescence tracking.
* - User-defined rotation axis for the tomography scan (using Rodrigues rotation formula).
* - Enabled the simulation of tomosynthesis scans: half cone source, emission angle offset, etc.
* - Enable translation of the voxelized geometry and of the image sensor within the detector plane.
*
* \subsection sec_changes_v13 Version 1.3 (release date: 2012/12/12)
*
* - Code upgraded to CUDA 5.0 (not compatible with previous versions of CUDA!).
 * - Removed the limit on the number of projection images that can be simulated per CT scan (source and
 * detector parameters are now stored in global memory and transferred to shared memory at run time
 * to avoid using the limited constant memory).
* - New material dose tally implemented to estimate the dose deposited in each material independently
* of the voxel dose tally (the voxel dose tally measures the dose in each material adding the energy
* deposited in each voxel of that material within the defined voxelized region-of-interest).
* - Interaction loop re-organized to maximize performance (virtual interactions simulated before real ones).
* - Improvements and small corrections in the source sampling and tally routines.
* - Allow input of material and voxel geometry files compressed with gzip (zlib library now required for compilation).
*
* \subsection sec_changes_v12 Version 1.2 (release date: 2011/10/25)
*
* - Implemented a voxel dose tally.
* - Polyenergetic source model.
* - MPI support for simulating individual projections.
* - Simulation by time limit.
* - Improved flexibility of the CT trajectories, helical scans.
*
*
*
* \section sec_disc Disclaimer
*
* This software and documentation (the "Software") were developed at the Food and
* Drug Administration (FDA) by employees of the Federal Government in the course
* of their official duties. Pursuant to Title 17, Section 105 of the United States
* Code, this work is not subject to copyright protection and is in the public
* domain. Permission is hereby granted, free of charge, to any person obtaining a
* copy of the Software, to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish, distribute,
* sublicense, or sell copies of the Software or derivatives, and to permit persons
* to whom the Software is furnished to do so. FDA assumes no responsibility
* whatsoever for use by other parties of the Software, its source code,
* documentation or compiled executables, and makes no guarantees, expressed or
* implied, about its quality, reliability, or any other characteristic. Further,
* use of this code in no way implies endorsement by the FDA or confers any
* advantage in regulatory decisions. Although this software can be redistributed
* and/or modified freely, we ask that any derivative works bear some notice that
* they are derived from it, and any modified versions bear some notice that they
* have been modified.
*
*
*
* \section sec_Intro Code features
*
 * In this section we provide a brief description of the features of the MC-GPU code. A
 * more complete description of the code can be found in our published articles.
 * Important information regarding the operation of the code is provided as comments in the
 * input files of the sample simulations provided with the MC-GPU package.
 * Detailed information on each function of the code can be found in the complete Doxygen
 * documentation of the source code.
*
 * The basic operation of the code consists of adapting the simulation input file
* to describe the location and characteristics of the x ray source, define the CT trajectory
* (if any), list the materials to be used in the simulation, define the geometry of
* the x ray detector and, finally, specify the voxelized object file to be
* used as the simulation material universe.
* In the first line of the input file, the user can fix the total number of x rays that have
* to be simulated (> 1e5 histories) or the total simulation time (maximum 1e5 seconds).
*
*
 * The coordinate system of the simulated world is determined by the input voxelized geometry.
 * The origin of coordinates is assumed to be located at the lower-back corner of the voxelized
 * volume, and the coordinate axes are aligned with the edges of the voxelized volume.
 * This means that the lower-back corner of the first voxel is at the origin and the
 * following voxels are located along the positive X, Y and Z axes (first octant).
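 *
 * As an illustration of this convention, the following minimal sketch maps a position
 * (in cm) to the linear index of the voxel that contains it; the function name and the
 * use of the CUDA float3/int3 types are placeholders for this example only:
 * \code
 *    //  Convert a position inside the bounding box into a linear voxel index
 *    //  (the X index increments first, then Y, then Z):
 *    long long int voxel_index(float3 pos, float3 inv_voxel_size, int3 num_voxels)
 *    {
 *      int ix = (int)(pos.x * inv_voxel_size.x);   // Origin at the lower-back corner
 *      int iy = (int)(pos.y * inv_voxel_size.y);
 *      int iz = (int)(pos.z * inv_voxel_size.z);
 *      return ix + (long long int)num_voxels.x * (iy + (long long int)num_voxels.y * iz);
 *    }
 * \endcode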
*
*
* To simulate the atomic interactions, MC-GPU uses a database of material properties based on the
* database from PENELOPE. A PENELOPE 2006 material file can be converted into an MC-GPU material
* file using the auxiliary utility "MC-GPU_create_material_data.f" provided with the MC-GPU
* package. Pre-defined material files for a set of materials typically used in medical imaging
* simulations are already provided in the folder "MC-GPU_material_files".
*
*
* The code includes two tally options: an \b image \b tally that creates projection x-ray images,
* and a radiation \b dose \b tally that estimates the dose deposited inside the patient model.
 * MC-GPU does not currently simulate the transport of electrons, and therefore the dose
 * deposition tally (rigorously speaking, a KERMA tally) will not be accurate at high energies,
 * near material interfaces, or for small voxels.
* In the image tally the images are formed by counting the energy that enters a user-defined 2D
* grid of pixels, which is a simple approximation to a noise-free flat-panel detector with
* 100% detection efficiency. The pixel values have units of eV/cm^2.
* Four different images are reported at the end of the simulation, corresponding
* to the signal produced by x rays that did not interact between the source and the detector
* (non-scattered), x rays that suffered a single Compton (inelastic) interaction, a single
* Rayleigh (elastic) interaction, and multi-scattered x rays.
* The dose tally counts the energy deposited by each x ray track inside each voxel of the
* geometry, within a user-defined volumetric region-of-interest (ROI). The average dose deposited
* inside each voxel and in each material (and the associated statistical uncertainties) are reported
* at the end of the simulation.
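 *
 * For reference, the tallied energy (reported in eV) can be converted into absorbed dose
 * (in Gy) by dividing by the voxel or material mass; a minimal sketch with hypothetical
 * variable names:
 * \code
 *    //  Dose [Gy = J/kg] from the tallied deposited energy [eV] and the mass [g]:
 *    double dose_Gy(double Edep_eV, double mass_g)
 *    {
 *      return Edep_eV * 1.602176e-19 / (mass_g * 1.0e-3);   // 1 eV = 1.602176e-19 J; 1 g = 1e-3 kg
 *    }
 * \endcode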
*
*
* MC-GPU can simulate a single projection image or a full CT scan.
 * The CT is simulated by generating many projection images around the static
 * voxelized geometry. Currently, the code is limited to performing a simple
* CT trajectory rotating around the Z axis. The user can specify the angular shift and longitudinal
* translation (pitch) of the source between each projection and also the distance between the
* source and the axis of rotation (the axis is assumed to be parallel to the Z axis).
 * At present, the code does not simulate some relevant components of a CT scanner such as the
* anti-scatter grid, a bow-tie filter or a curved detector (flat-panel detector only).
*
*
* The x ray source is defined as a point source emitting x rays with an energy randomly sampled
* from the user-provided energy spectrum. The polyenergetic spectrum is efficiently sampled
* using the Walker aliasing algorithm. The emitted cone beam is computationally
* collimated to produce a rectangular field on the detector plane, within the azimuthal and
* polar angles specified by the user.
* The detector plane is automatically located at the specified distance right in front of the
* source focal spot, with the collimated cone beam pointing towards the geometric center of the detector.
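 *
 * As a minimal sketch of the constant-time Walker aliasing method (the actual cutoff and
 * alias tables are built during initialization from the input spectrum; the array names
 * used here are hypothetical):
 * \code
 *    //  Sample an energy bin using the precomputed alias tables.
 *    //  'rand01' is a uniform random value in [0,1).
 *    int sample_energy_bin(const float *cutoff, const int *alias, int num_bins, float rand01)
 *    {
 *      float u = rand01 * num_bins;     // Uniform value in [0, num_bins)
 *      int   i = (int)u;                // Candidate bin
 *      return ((u - (float)i) < cutoff[i]) ? i : alias[i];   // Keep the bin or jump to its alias
 *    }
 * \endcode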
*
*
* In order to optimize the particle tracking algorithm (ray-tracing) and minimize
* the accesses to the slow GPU main memory, the photon trajectories across the voxels
* are computed using the Woodcock tracking algorithm.
* With this technique the photons perceive the geometry as a uniform medium
* composed of the material of the most attenuating voxel.
* In this way, the voxel boundaries do not have to be explicitly calculated and
* multiple voxels can be crossed in a single step.
* To keep the simulation unbiased, some of the interactions are considered
* "virtual" (i.e., do not change the photon energy or direction of movement),
* depending on the x ray energy and the actual material at the interaction site.
* In typical medical imaging simulations where the most attenuating material is cortical bone,
 * the Woodcock tracking algorithm gives a speed-up of almost one order of magnitude compared
 * to computing the voxel boundaries at every step. However, if the geometry includes a high
* density voxel, such as a metallic implant, the performance of the code can be severely
* reduced because a large fraction of the sampled interactions will be virtual.
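 *
 * A minimal sketch of the Woodcock (delta) tracking loop; 'rand01()' and 'imfp_at()' are
 * hypothetical helpers returning a uniform random value in (0,1] and the local inverse mean
 * free path at a position, and 'imfp_max' is the majorant inverse MFP of the most
 * attenuating voxel:
 * \code
 *    //  Fly the photon to its next real interaction site, ignoring voxel boundaries:
 *    float3 next_real_interaction(float3 pos, float3 dir, float imfp_max)
 *    {
 *      do
 *      {
 *        float s = -logf(rand01()) / imfp_max;      // Free flight sampled with the majorant
 *        pos.x += s*dir.x;  pos.y += s*dir.y;  pos.z += s*dir.z;
 *      }
 *      while (rand01()*imfp_max >= imfp_at(pos));   // Virtual interaction: keep flying unchanged
 *      return pos;                                  // Real interaction, accepted with probability local/majorant
 *    }
 * \endcode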
*
*
* The random number generator used in PENELOPE [5], RANECU, is also used in the GPU
* program. To ensure that the simulated tracks are not correlated, each thread initializes
* the generator to a unique position in the random sequence, far enough from the
* other threads, using the algorithm implemented in the seedsMLCG code [7].
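 *
 * For reference, a minimal C sketch of the RANECU algorithm (L'Ecuyer's combined
 * multiplicative linear congruential generator), with the two integer seeds kept
 * per thread:
 * \code
 *    float ranecu(int *seed1, int *seed2)
 *    {
 *      int i1 = *seed1/53668;
 *      *seed1 = 40014*(*seed1 - i1*53668) - i1*12211;    // First MLCG, modulus 2147483563
 *      if (*seed1 < 0) *seed1 += 2147483563;
 *      int i2 = *seed2/52774;
 *      *seed2 = 40692*(*seed2 - i2*52774) - i2*3791;     // Second MLCG, modulus 2147483399
 *      if (*seed2 < 0) *seed2 += 2147483399;
 *      int iz = *seed1 - *seed2;                         // Combine the two generators
 *      if (iz < 1) iz += 2147483562;
 *      return iz*4.65661305739e-10f;                     // Scale by 1/2147483563: uniform in (0,1)
 *    }
 * \endcode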
*
*
* In a typical simulation, several thousand threads are launched simultaneously in
* the GPU, each one of them simulating a batch of several x ray tracks.
* If the code is compiled with MPI support (see below), multiple GPUs can be used in parallel.
* The code will perform a short speed test to estimate the relative speed of each GPU used
* in the simulation and then distribute the number of particles among the available GPUs correspondingly.
 * If the user specifies a time limit for the simulation, all the GPUs will simulate in parallel
 * for the allowed time. Since the code is already optimized to scale well across
* thousands of GPU threads, it scales almost linearly with the number of GPUs in most
* situations, with only a few seconds of overhead in the initialization of the multiple GPUs
* and in the reduction of the final results.
*
*
*
*
* \section sec_output Code output
*
* At the end of the simulation the code reports the tallied 3D dose distribution and the
 * final simulated images in RAW binary form, as 32-bit float values. The image data is provided
* as a collection of five consecutive images corresponding to: total image (scatter+primaries),
* primary particles, Compton, Rayleigh and multi-scatter.
 * The dose data is reported as two RAW files containing, respectively, the mean dose and twice
 * the standard deviation of the dose in each voxel of the geometry within the input ROI.
* The average dose deposited in each material of the geometry is also reported to the standard output.
* Organ doses can be obtained by post-processing the output dose file, knowing which voxel
* corresponds to each organ.
* The pixel and voxel dose data values are stored with the X coordinate incrementing first, the Y
* coordinate incrementing second, and the Z coordinate incrementing last.
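 *
 * A minimal sketch of how the RAW image file could be read back (the file name and pixel
 * counts below are hypothetical; the actual values are listed in the ASCII header):
 * \code
 *    #include <stdio.h>
 *    #include <stdlib.h>
 *    int main(void)
 *    {
 *      const size_t nx = 3000, ny = 1500, num_img = 5;   // Pixels per image; 5 images in the file
 *      float *img = (float*) malloc(num_img*nx*ny*sizeof(float));
 *      FILE  *f   = fopen("mc-gpu_image.raw", "rb");
 *      if (f!=NULL && img!=NULL && num_img*nx*ny == fread(img, sizeof(float), num_img*nx*ny, f))
 *      {
 *        // Images 0 to 4: total, primary, Compton, Rayleigh, multi-scatter.
 *        // Pixel (ix,iy) of image k is at index k*nx*ny + iy*nx + ix (X increments first):
 *        size_t ix = 100, iy = 200;
 *        printf("Primary signal at pixel (%zu,%zu) = %f eV/cm^2\n", ix, iy, img[1*nx*ny + iy*nx + ix]);
 *      }
 *      if (f!=NULL) fclose(f);
 *      free(img);
 *      return 0;
 *    }
 * \endcode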
*
* The program also reports the simulated images and the dose at the Z plane at the level of the x ray
* source as ASCII text files. The ASCII output can be readily visualized with the GNUPLOT scripts
* distributed with MC-GPU. The header section at the beginning of these text files provides the
* information required to easily read the RAW binary files with IMAGEJ, OCTAVE or other programs.
*
*
*
* \section sec_compilation Code compilation and execution
*
 * MC-GPU has been developed and tested only on the Linux operating system.
* A Makefile script is provided to compile the MC-GPU code in Linux.
* The CUDA libraries and the GNU GCC compiler must be previously installed.
* The Makefile may have to be edited to modify the library path.
 * The code requires the zlib library ("zlib.h") to be able to open gzipped input files.
*
*
* MC-GPU uses CUDA to access NVIDIA GPUs but all the actual computations are coded
* in standard C.
 * The code can only be executed on NVIDIA GPUs (the CPU-only execution mode of the initial
 * versions, selected by not defining -DUSING_CUDA, is no longer available).
 * When the pre-processor variable "USING_MPI" is defined (i.e., compiling with
 * "-DUSING_MPI"), Message Passing Interface (MPI) library calls are used to share information
 * between multiple CPU threads in different computers.
* Each MPI thread gets a unique id in the CPU and addresses a unique GPU.
* At the end of the simulation the images and doses tallied by the different GPUs are
 * reduced to form a single output file equivalent to a sequential simulation of the same
* number of particles.
*
 * The code can be easily compiled by executing the command "make" or running the provided
 * "./make.sh" script.
 * Alternatively, the code can be compiled from the command line with a command like this
 * (example using CUDA and MPI, the openMPI library in this case):
* \code
* nvcc -DUSING_MPI MC-GPU_v1.3.cu -o MC-GPU_v1.3.x -O3
* -use_fast_math -L/usr/lib/ -I. -I/usr/local/cuda/include
* -I/usr/local/cuda/samples/common/inc -I/usr/local/cuda/samples/shared/inc/
* -I/usr/include/openmpi -lmpi -lz --ptxas-options=-v
* -gencode=arch=compute_20,code=sm_20 -gencode=arch=compute_30,code=sm_30
* \endcode
*
* The same source code can also be compiled for a regular CPU using:
* \code
* gcc -x c -O3 MC-GPU_v1.3.cu -o MC-GPU_v1.3_CPU.x -I./ -lm -lz
* \endcode
*
* To run a simulation (and keep the information reported to the standard
* output in an external file) the compiled code can be executed as:
* \code
* ./MC-GPU_v1.3.x MC-GPU_v1.3.in | tee MC-GPU_v1.3.out
* \endcode
*
 * All simulations can be executed in the same way using the code compiled for the CPU
* or the GPU (however, the number of histories should be reduced for the CPU to finish
* the simulation in a reasonable time).
* To run the simulation in parallel with MPI in multiple GPUs (or CPU cores) in the
* current computer the user can execute:
* \code
* mpirun -n 4 ./MC-GPU_v1.3.x MC-GPU_v1.3.in
* \endcode
*
* To use GPUs in different computers, the user must make sure all computers can access the simulation
* files and that the libraries are correctly set up in all nodes.
* To execute a simulation (with verbose MPI information being reported):
* \code
* mpirun --tag-output -v -x LD_LIBRARY_PATH -hostfile myhostfile.txt -n 8
* /fullPath/MC-GPU_v1.3.x /fullPath/MC-GPU_v1.3.in | tee MC-GPU_v1.3.out
* \endcode
*
 * The host file ('myhostfile.txt' in the example above) lists the IP addresses and number of
 * computing slots (GPUs) of the computers collaborating in the simulation. This file is not
 * necessary when using multiple GPUs in a single workstation. When using multiple computers,
 * the simulation files should be located on a shared drive to make sure every node can access
 * the input data.
 * The different workstations must have different host names in order to be differentiated by
 * the MPI threads. The multiple threads communicate with each other to make sure they don't
 * use the same GPU in the same workstation.
*
*
*
* \section sec_issues Known issues
*
* In extremely long simulations, it is theoretically possible to cause an overflow of the counters
 * estimating the mean and standard deviation of the material or voxel doses. If this happens, the
 * results will be incorrect, and even negative or NaN values may be reported.
*
*
*
*
* \section sec_ref References
*
* -# Badano A, Graff CG, Badal A, et al. Evaluation of Digital Breast Tomosynthesis as Replacement of Full-Field Digital Mammography Using an In Silico Imaging Trial. JAMA Netw Open 1(7), e185474 (2018)
* -# A. Badal and A. Badano, Accelerating Monte Carlo simulations of photon transport in a voxelized geometry using a massively parallel Graphics Processing Unit, Med. Phys. 36, p. 4878-4880 (2009)
 * -# A. Badal and A. Badano, Monte Carlo Simulation of X-Ray Imaging Using a Graphics Processing Unit, IEEE NSC-MIC Conference Record, HP3-1, p. 4081-4084 (2009)
* -# A. Badal, I. Kyprianou, D. Sharma and A. Badano, Fast cardiac CT simulation using a Graphics Processing Unit-accelerated Monte Carlo code, Proc. SPIE Medical Imaging Conference 7622, p. 762231 (2010)
* -# A. Badal and A. Badano, Fast Simulation of Radiographic Images Using a Monte Carlo X-Ray Transport Algorithm Implemented in CUDA, Chapter 50 of GPU Computing Gems (Emerald Edition), p. 813-830, editor Wen-mei W. Hwu, publisher Morgan Kaufmann (Elsevier), Burlington MA, 2010
* -# F. Salvat, J. M. Fernandez-Varea and J. Sempau, PENELOPE – A code system for Monte Carlo simulation of electron and photon transport, NEA-OECD, Issy-les-Moulineaux, available at www.nea.fr/html/dbprog/peneloperef.html (2006)
* -# NVIDIA Corporation, NVIDIA CUDA(TM) Programming Guide, Technical Report available at www.nvidia.com/cuda (2011)
* -# A. Badal and J. Sempau, A package of Linux scripts for the parallelization of Monte Carlo simulations, Comput. Phys. Commun. 175 (6), p. 440-450 (2006)
*
*
*
* @file MC-GPU_v1.5b.cu
* @author Andreu Badal (Andreu.Badal-Soler{at}fda.hhs.gov)
* @date 2018/01/01
* -- MC-GPU v.1.5_DBT: 2017/06/28
* -- MC-GPU v.1.4_DBT: 2016/02/02
* -- MC-GPU v.1.3: 2012/12/12
* -- MC-GPU v.1.2: 2011/10/25
* -- MC-GPU v.1.1: 2010/06/25
* -- MC-GPU v.1.0: 2009/03/17
*/
////////////////////////////////////////////////////////////////////////////////////////
// *** Include header file with the structures and functions declarations:
#include <MC-GPU_v1.5b.h>
// *** Include functions to read phantom from binary form:
#include "load_voxels_binary_VICTRE_v1.5b.c"
// *** Include the computing kernel:
#include <MC-GPU_kernel_v1.5b.cu>
////////////////////////////////////////////////////////////////////////////////
//! Main program of MC-GPU: initialize the simulation environment, launch the GPU
//! kernels that perform the x ray transport and report the final results.
//! This function reads the description of the simulation from an external file
//! given in the command line. This input file defines the number of particles to
//! simulate, the characteristics of the x-ray source and the detector, the number
//! and spacing of the projections (if simulating a CT), the location of the
//! material files containing the interaction mean free paths, and the location
//! of the voxelized geometry file.
//!
//! @author Andreu Badal
//!
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
// -- Start time counter:
time_t current_time = time(NULL); // Get current time (in seconds)
clock_t clock_start, clock_end, clock_start_beginning; // (requires standard header <time.h>)
clock_start = clock(); // Get current clock counter
clock_start_beginning = clock_start;
#ifdef USING_MPI
// -- Using MPI to access multiple GPUs to simulate the x-ray projection image:
int myID = -88, numprocs = -99, return_reduce = -1;
MPI_Init(&argc, &argv); // Init MPI and get the current thread ID
MPI_Comm_rank(MPI_COMM_WORLD, &myID);
MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
char MPI_processor_name[81];
int resultlen = -1;
MPI_Get_processor_name(MPI_processor_name, &resultlen);
char* char_time = ctime(&current_time); char_time[19] = '\0'; // The time is located between characters 11 and 19.
printf(" >> MPI run (myId=%d, numprocs=%d) on processor \"%s\" (time: %s) <<\n", myID, numprocs, MPI_processor_name, &char_time[11]);
fflush(stdout); // Clear the screen output buffer
MPI_Barrier(MPI_COMM_WORLD); // Synchronize MPI threads
MAIN_THREAD printf(" -- Time spent initializing the MPI world (MPI_Barrier): %.3f s\n", ((double)(clock()-clock_start))/CLOCKS_PER_SEC);
#else
int myID = 0, numprocs = 1; // Only one CPU thread used when MPI is not activated (multiple projections will be simulated sequentially).
#endif
MAIN_THREAD
{
printf("\n\n *****************************************************************************\n");
printf( " *** MC-GPU, version 1.5b_DBT (https://github.com/DIDSR/VICTRE_MCGPU) ***\n");
printf( " *** (http://code.google.com/p/mcgpu/) ***\n");
printf( " *** ***\n");
printf( " *** A. Badal and A. Badano, \"Accelerating Monte Carlo simulations of *** \n");
printf( " *** photon transport in a voxelized geometry using a massively parallel *** \n");
printf( " *** Graphics Processing Unit\", Medical Physics 36, pp. 4878–4880 (2009) ***\n");
printf( " *** ***\n");
printf( " *** Andreu Badal (Andreu.Badal-Soler{at}fda.hhs.gov) ***\n");
printf( " *****************************************************************************\n\n");
printf("****** Code execution started on: %s\n\n", ctime(¤t_time));
fflush(stdout);
}
// The "MAIN_THREAD" macro prints the messages just once when using MPI threads (it has no effect if MPI is not used): MAIN_THREAD == "if(0==myID)"
MAIN_THREAD printf ("\n *** CUDA SIMULATION IN THE GPU ***\n");
MAIN_THREAD printf("\n -- INITIALIZATION phase:\n");
MAIN_THREAD fflush(stdout); // Clear the screen output buffer for the main thread
///////////////////////////////////////////////////////////////////////////////////////////////////
// *** Declare the arrays and structures that will contain the simulation data:
struct voxel_struct voxel_data; // Define the geometric constants of the voxel file
struct detector_struct detector_data[MAX_NUM_PROJECTIONS+1]; // Define an x ray detector (for each projection)
struct source_struct source_data[MAX_NUM_PROJECTIONS+1]; // Define the particles source (for each projection)
struct source_energy_struct source_energy_data; // Define the source energy spectrum
struct linear_interp mfp_table_data; // Constant data for the linear interpolation
struct compton_struct compton_table; // Structure containing Compton sampling data (to be copied to CONSTANT memory)
struct rayleigh_struct rayleigh_table; // Structure containing Rayleigh sampling data (to be copied to CONSTANT memory)
// float2 *voxel_mat_dens = NULL; //!!FixedDensity_DBT!! Density is now fixed for each material, not adapted for each voxel.
int *voxel_mat_dens = NULL; //!!bitree!! v1.5 --> using an integer value to be able to store both the material number (positive) or pointers to the binary tree branches (negative)
long long int voxel_mat_dens_bytes = 0; // Size (in bytes) of the voxels array (using unsigned int to allocate up to 4.2GBytes)
char *bitree = NULL; // Array storing the binary tree structures for each non-uniform low resolution voxel
unsigned int bitree_bytes = 0; // Size (in bytes) of the bitree array
int *voxel_geometry_LowRes = NULL; // Array to temporarily store the low resolution version of the geometry when the binary tree is created
unsigned int voxel_geometry_LowRes_bytes = 0;
float density_max[MAX_MATERIALS];
float density_nominal[MAX_MATERIALS];
unsigned long long int *image = NULL; // Pointer where image array will be allocated
int image_bytes = -1; // Size of the image array
int mfp_table_bytes = -1, mfp_Woodcock_table_bytes = -1; // Size of the table arrays
float2 *mfp_Woodcock_table = NULL; // Linear interpolation data for the Woodcock mean free path [cm]
float3 *mfp_table_a = NULL, *mfp_table_b = NULL; // Linear interpolation data for 3 different interactions:
// (1) inverse total mean free path (divided by density, cm^2/g)
// (2) inverse Compton mean free path (divided by density, cm^2/g)
// (3) inverse Rayleigh mean free path (divided by density, cm^2/g)
short int dose_ROI_x_min, dose_ROI_x_max, dose_ROI_y_min, dose_ROI_y_max, dose_ROI_z_min, dose_ROI_z_max; // Coordinates of the dose region of interest (ROI)
ulonglong2 *voxels_Edep = NULL; // Pointer where the voxel energy deposition array will be allocated
int voxels_Edep_bytes = 0; // Size of the voxel Edep array
ulonglong2 materials_dose[MAX_MATERIALS]; // Array for tally_materials_dose. !!tally_materials_dose!!
int kk;
for(kk=0;kk<MAX_MATERIALS;kk++)
{
materials_dose[kk].x = 0; // Initializing data !!tally_materials_dose!!
materials_dose[kk].y = 0;
density_nominal[kk] =-1.0f;
}
clock_t clock_kernel; // Using only CPU timers after CUDA 5.0
double time_elapsed_MC_loop = 0.0, time_total_MC_simulation = 0.0, time_total_MC_init_report = 0.0;
unsigned long long int total_histories;
int histories_per_thread, seed_input, num_threads_per_block, gpu_id, num_projections;
int flag_material_dose=-2;
bool flag_simulateMammoAfterDBT=false, flag_detectorFixed=false; // !!DBTv1.4!!;
double SRotAxisD=-1.0, translation_helical=0.0;
char file_name_voxels[250], file_name_materials[MAX_MATERIALS][250], file_name_output[250], file_dose_output[250], file_name_espc[250];
// *** Read the input file given in the command line and return the significant data:
read_input(argc, argv, myID, &total_histories, &seed_input, &gpu_id, &num_threads_per_block, &histories_per_thread, detector_data, &image, &image_bytes, source_data, &source_energy_data, &voxel_data, file_name_voxels, file_name_materials, file_name_output, file_name_espc, &num_projections, &voxels_Edep, &voxels_Edep_bytes, file_dose_output, &dose_ROI_x_min, &dose_ROI_x_max, &dose_ROI_y_min, &dose_ROI_y_max, &dose_ROI_z_min, &dose_ROI_z_max, &SRotAxisD, &translation_helical, &flag_material_dose, &flag_simulateMammoAfterDBT, &flag_detectorFixed);
// *** Read the energy spectrum and initialize its sampling with the Walker aliasing method:
MAIN_THREAD printf(" -- Reading the energy spectrum and initializing the Walker aliasing sampling algorithm.\n");
float mean_energy_spectrum = 0.0f;
init_energy_spectrum(file_name_espc, &source_energy_data, &mean_energy_spectrum);
// *** Output some of the data read to make sure everything was correctly read:
MAIN_THREAD
{
printf("\n -- Data read from the input file:\n");
if (total_histories < (unsigned long long int)(100000))
printf(" simulation time = %lld s\n", total_histories);
else
printf(" x-ray tracks to simulate = %lld\n", total_histories);
printf(" initial random seed = %d\n", seed_input);
double phi0 = ((double)source_data[0].D_phi)*RAD2DEG;
double theta0 = 2.0*(90.0 - acos(((double)source_data[0].cos_theta_low))*RAD2DEG);
if (source_data[0].flag_halfConeX)
{
theta0 = 0.5*theta0;
printf(" NOTE: sampling only upper half of collimated cone beam, with beam offset to edge of the image (eg, mammo).\n"); // !!DBT!! !!HalfBeam!! !!DBTv1.4!!
}
printf(" azimuthal (phi), polar apertures = %.6f , %.6f degrees\n", phi0, theta0);
printf(" (max_height_at_y1cm = %f , max_width_at_y1cm = %f)\n", source_data[0].max_height_at_y1cm, source_data[0].max_width_at_y1cm); // !!DBTv1.4!!
printf(" source direction = (%f, %f, %f)\n", source_data[0].direction.x, source_data[0].direction.y, source_data[0].direction.z);
printf(" focal spot position = (%f, %f, %f)\n", source_data[0].position.x, source_data[0].position.y, source_data[0].position.z);
printf(" focal spot Gaussian blur FWHM = %f (3D Gaussian dist. cropped at 2*sigma)\n", source_data[0].focal_spot_FWHM); // !!DBTv1.4!!
if (num_projections!=1 && flag_simulateMammoAfterDBT==true)
printf(" focal spot rotation blur = %f degrees (disabled for the first single projection at 0 deg)\n", source_data[0].rotation_blur*RAD2DEG); // !!DBTv1.5!!
else
printf(" focal spot rotation blur = %f degrees\n", source_data[0].rotation_blur*RAD2DEG); // !!DBTv1.5!!
printf(" source-detector distance = %f cm\n", detector_data[0].sdd);
printf(" detector center position = (%f, %f, %f)\n", detector_data[0].center.x, detector_data[0].center.y, detector_data[0].center.z);
printf(" image offset from beam at center = (%f, %f)\n", detector_data[0].offset.x, detector_data[0].offset.y); // !!DBTv1.4!!
printf(" detector layer thickness = %f cm (=%.2f micron)\n", detector_data[0].scintillator_thickness, 1.0e4f*detector_data[0].scintillator_thickness); // !!DBTv1.4!!
printf(" detector material average MFP = %f cm\n", detector_data[0].scintillator_MFP);
printf(" detector material K-edge energy = %f eV\n", detector_data[0].kedge_energy);
printf(" fluorescence energy and yield = %f eV , %f\n", detector_data[0].fluorescence_energy, detector_data[0].fluorescence_yield);
printf(" MFP at fluorescence energy = %f cm\n", detector_data[0].fluorescence_MFP);
if (detector_data[0].gain_W>0.001f)
{
printf(" detector gain and Swank factor = %f eV/detected_charge, %f (%f relative std_dev)\n", detector_data[0].gain_W, 1.0f/(1.0f+detector_data[0].Swank_rel_std*detector_data[0].Swank_rel_std), detector_data[0].Swank_rel_std); // Swank_factor = mean^2/(mean^2 + std_dev^2) --> (std_dev/mean) = sqrt(1/Swank_factor - 1)
printf(" electronic noise per pixel = %f electrons\n", detector_data[0].electronic_noise);
}
printf(" detector cover thickness = %f cm\n", detector_data[0].cover_thickness); // !!DBTv1.5!!
printf(" cover average mean free path = %f cm\n", detector_data[0].cover_MFP);
if (detector_data[0].grid_freq > 0.0f)
{
printf(" Antiscatter grid ratio = %f\n", fabsf(detector_data[0].grid_ratio)); // !!DBTv1.5!!
printf(" Antiscatter grid frequency = %f lines per cm\n", detector_data[0].grid_freq);
printf(" Antiscatter grid strip thickness = %f cm (=%.2f micron)\n", detector_data[0].grid_strip_thickness, 1.0e4*detector_data[0].grid_strip_thickness);
float h = fabsf(detector_data[0].grid_ratio)*(1.0f/detector_data[0].grid_freq - detector_data[0].grid_strip_thickness); // Height of the grid, according to input grid ratio, freq, and strip thickness
printf(" Computed antiscatter grid height = %f cm (=%.2f micron)\n", h, 1.0e4*h);
printf(" strips average mean free path = %f cm\n", 1.0f/detector_data[0].grid_strip_mu);
printf(" interspace average mean free path = %f cm\n", 1.0f/detector_data[0].grid_interspace_mu);
if (detector_data[0].grid_ratio<0.0f)
printf(" Antiscatter grid orientation = 0 --> 1D collimated grid with strips perpendicular to lateral direction (mammo style)\n");
else
printf(" Antiscatter grid orientation = 1 --> 1D collimated grid with strips parallel to lateral direction (DBT style)\n");
}
else
printf("\n Antiscatter grid: DISABLED!\n\n"); // !!DBTv1.5!!
printf(" number of pixels image = %dx%d = %d\n", detector_data[0].num_pixels.x, detector_data[0].num_pixels.y, detector_data[0].total_num_pixels);
printf(" pixel size = %.5fx%.5f cm\n", 1.0f/detector_data[0].inv_pixel_size_X, 1.0f/detector_data[0].inv_pixel_size_Z);
printf(" detector size = %.5fx%.5f cm\n", detector_data[0].width_X, detector_data[0].height_Z);
printf(" number of projections = %d\n", num_projections);
if (num_projections!=1 || source_data[0].rotation_blur>0.000001f) // Report data if blur is used because the rotation is around the source-rotation axis
{
printf(" source-rotation axis-distance = %lf cm\n", SRotAxisD);
printf(" angle between projections = %lf\n", source_data[0].angle_per_projection*RAD2DEG);
printf(" initial angle offset = %lf\n", source_data[0].angle_offset*RAD2DEG); // !!DBTv1.4!!
printf(" rotation point = (%f, %f, %f)\n", source_data[0].rotation_point.x, source_data[0].rotation_point.y, source_data[0].rotation_point.z); // !!DBTv1.4!!
printf(" axis of rotation = (%f, %f, %f)\n", source_data[0].axis_of_rotation.x, source_data[0].axis_of_rotation.y, source_data[0].axis_of_rotation.z); // !!DBTv1.4!!
printf(" translation between proj = %lf\n", translation_helical);
}
printf(" output image file = %s\n", file_name_output);
printf(" input voxel file = %s\n", file_name_voxels);
printf(" voxel geometry offset = (%f, %f, %f) cm\n", voxel_data.offset.x, voxel_data.offset.y, voxel_data.offset.z); // !!DBTv1.4!!
printf(" size coarse voxels for binary trees = %d x %d x %d\n", (int)voxel_data.num_voxels_coarse.x, (int)voxel_data.num_voxels_coarse.y, (int)voxel_data.num_voxels_coarse.z); // !!bitree!! v1.5b
if (dose_ROI_x_max>-1)
{
printf(" output dose file = %s\n", file_dose_output);
printf(" input region of interest dose = X[%d,%d], Y[%d,%d], Z[%d,%d]\n", dose_ROI_x_min+1, dose_ROI_x_max+1, dose_ROI_y_min+1, dose_ROI_y_max+1, dose_ROI_z_min+1, dose_ROI_z_max+1); // Show ROI with index=1 for the first voxel instead of 0.
}
printf("\n energy spectrum file = %s\n", file_name_espc);
printf( " number of energy bins read = %d\n", source_energy_data.num_bins_espc);
printf( " minimum, maximum energies = %.3f, %.3f keV\n", 0.001f*source_energy_data.espc[0], 0.001f*source_energy_data.espc[source_energy_data.num_bins_espc]);
printf( " mean energy spectrum = %.3f keV\n\n", 0.001f*mean_energy_spectrum);
fflush(stdout);
}
// *** Set the detectors and sources for the CT trajectory (if needed, ie, for more than one projection):
if (num_projections != 1)
{
set_CT_trajectory(myID, num_projections, source_data, detector_data, translation_helical, flag_detectorFixed);
}
fflush(stdout);
double mass_materials[MAX_MATERIALS];
// !!bitree!! If the binary tree is used, read the geometry only with the main thread, and then broadcast the new data:
// if the tree is not used, every thread reads the input geometry at the same time.
if (0==myID || (voxel_data.num_voxels_coarse.x)==0)
{
// *** Read the voxel data and allocate the density map matrix. Return the maximum density:
if (voxel_data.num_voxels.x<1)
{
// -- Read ASCII format geometry: geometric parameters will be read from the header file !!DBTv1.4!!
load_voxels(myID, file_name_voxels, density_max, &voxel_data, &voxel_mat_dens, &voxel_mat_dens_bytes, &dose_ROI_x_max, &dose_ROI_y_max, &dose_ROI_z_max);
}
else
{
// -- Read binary RAW format geometry: geometric parameters given in input file !!DBTv1.4!!
load_voxels_binary_VICTRE(myID, file_name_voxels, density_max, &voxel_data, &voxel_mat_dens, &voxel_mat_dens_bytes, &dose_ROI_x_max, &dose_ROI_y_max, &dose_ROI_z_max); //!!DBT!! // !!DBTv1.4!!
}
// -- Pre-compute the total mass of each material present in the voxel phantom (to be used in "report_materials_dose"):
double voxel_volume = 1.0 / ( ((double)voxel_data.inv_voxel_size.x) * ((double)voxel_data.inv_voxel_size.y) * ((double)voxel_data.inv_voxel_size.z) );
for(kk=0; kk<MAX_MATERIALS; kk++)
mass_materials[kk] = 0.0;
long long int llk;
for(llk=0; llk<((long long int)voxel_data.num_voxels.x*(long long int)voxel_data.num_voxels.y*(long long int)voxel_data.num_voxels.z); llk++) // For each voxel in the geometry
{
mass_materials[((int)voxel_mat_dens[llk])] += ((double)density_LUT[((int)voxel_mat_dens[llk])])*voxel_volume; // Add material mass = density_LUT*volume (first material==0)
}
// ** Create the low resolution version of the phantom and the binary tree structures, if requested in the input file and dose dep tally disabled: //!!bitree!! v1.5b
if ((voxel_data.num_voxels_coarse.x)!=0)
{
if (dose_ROI_x_max>0)
{
MAIN_THREAD printf("\n\n !!ERROR!! Sorry, the voxel dose deposition tally cannot be used when the binary tree is active. Please, disable the binary tree.\n\n");
exit(-1);
}
MAIN_THREAD printf("\n !!bitree!! Creating a binary tree structure to minimize memory use.\n"); // !!bitree!! v1.5b
create_bitree(myID, &voxel_data, voxel_mat_dens, &bitree, &bitree_bytes, &voxel_geometry_LowRes, &voxel_geometry_LowRes_bytes); //!!bitree!! v1.5b
MAIN_THREAD printf(" >> RAM memory allocation: original voxelized geometry = %f MBytes; low resolution voxelized geometry = %f MBytes;\n", voxel_mat_dens_bytes/(1024.f*1024.f), voxel_geometry_LowRes_bytes/(1024.f*1024.f));
MAIN_THREAD printf(" binary tree = %f MBytes; image vector = %f MBytes; data structures = %f Mbytes\n", bitree_bytes/(1024.f*1024.f), image_bytes/(1024.f*1024.f), (sizeof(struct voxel_struct)+sizeof(struct source_struct)+sizeof(struct detector_struct)+sizeof(struct linear_interp)+2*mfp_table_bytes+sizeof(struct rayleigh_struct)+sizeof(struct compton_struct))/(1024.f*1024.f));
MAIN_THREAD printf(" (reduction in memory use with bitree: [low res voxels + binary tree]-[high res voxels] = %f MBytes = %.3f%%)\n", (voxel_geometry_LowRes_bytes+bitree_bytes-voxel_mat_dens_bytes)/(1024.f*1024.f), 100.f*(voxel_geometry_LowRes_bytes+bitree_bytes-voxel_mat_dens_bytes)/voxel_mat_dens_bytes);
// -- Replace the high resolution version of the geometry by the low resolution version: !!DeBuG!! voxel dose tally can't be used now!!
free(voxel_mat_dens); //!!bitree!! v1.5b
voxel_mat_dens = voxel_geometry_LowRes; //!!bitree!! v1.5b
voxel_mat_dens_bytes = voxel_geometry_LowRes_bytes; //!!bitree!! v1.5b
}
else
{
MAIN_THREAD printf("\n !!bitree!! Binary tree structure disabled: standard voxelized geometry in use.\n\n"); // !!bitree!! v1.5b
MAIN_THREAD printf(" >> RAM memory allocation: voxelized geometry = %f MBytes; image vector = %f MBytes; data structures = %f Mbytes\n", voxel_mat_dens_bytes/(1024.f*1024.f), image_bytes/(1024.f*1024.f), (sizeof(struct voxel_struct)+sizeof(struct source_struct)+sizeof(struct detector_struct)+sizeof(struct linear_interp)+2*mfp_table_bytes+sizeof(struct rayleigh_struct)+sizeof(struct compton_struct))/(1024.f*1024.f));
}
}
fflush(stdout);
// !!bitree!! If the binary tree is used, broadcast the tree data and all auxiliary data from main to every other thread:
#ifdef USING_MPI
if (numprocs>1 && (voxel_data.num_voxels_coarse.x)!=0)
{
MPI_Barrier(MPI_COMM_WORLD); // Synchronize MPI threads
// Send all the geometric data that has been read or changed by the root node:
MPI_Bcast(&voxel_data.num_voxels.x, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&voxel_data.num_voxels.y, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast(&voxel_data.num_voxels.z, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&voxel_data.size_bbox.x, 1, MPI_FLOAT, 0, MPI_COMM_WORLD); MPI_Bcast(&voxel_data.size_bbox.y, 1, MPI_FLOAT, 0, MPI_COMM_WORLD); MPI_Bcast(&voxel_data.size_bbox.z, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);
voxel_data.voxel_size_HiRes.x = voxel_data.voxel_size.x; voxel_data.voxel_size_HiRes.y = voxel_data.voxel_size.y; voxel_data.voxel_size_HiRes.z = voxel_data.voxel_size.z; // Save the original high resolution voxel size
MPI_Bcast(&voxel_data.voxel_size.x, 1, MPI_FLOAT, 0, MPI_COMM_WORLD); MPI_Bcast(&voxel_data.voxel_size.y, 1, MPI_FLOAT, 0, MPI_COMM_WORLD); MPI_Bcast(&voxel_data.voxel_size.z, 1, MPI_FLOAT, 0, MPI_COMM_WORLD);
voxel_data.inv_voxel_size.x = 1.0/voxel_data.voxel_size.x; voxel_data.inv_voxel_size.y = 1.0/voxel_data.voxel_size.y; voxel_data.inv_voxel_size.z = 1.0/voxel_data.voxel_size.z;
MPI_Bcast(density_max, MAX_MATERIALS, MPI_FLOAT, 0, MPI_COMM_WORLD);
MPI_Bcast(mass_materials, MAX_MATERIALS, MPI_DOUBLE, 0, MPI_COMM_WORLD);
// Allocate memory (except root) and transmit voxel+binary tree links data:
MPI_Bcast(&voxel_geometry_LowRes_bytes, 1, MPI_UNSIGNED, 0, MPI_COMM_WORLD);
MPI_Bcast(&bitree_bytes, 1, MPI_UNSIGNED, 0, MPI_COMM_WORLD);
if(0!=myID)
{
voxel_mat_dens_bytes = voxel_geometry_LowRes_bytes;
voxel_mat_dens = (int*) malloc(voxel_mat_dens_bytes); // Allocate voxels (low resolution)
bitree = (char*) malloc(bitree_bytes); // Allocate binary tree elements
}
MPI_Bcast(voxel_mat_dens, ((unsigned long long int)voxel_data.num_voxels.x)*(voxel_data.num_voxels.y*voxel_data.num_voxels.z), MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(bitree, bitree_bytes, MPI_CHAR, 0, MPI_COMM_WORLD);
MPI_Barrier(MPI_COMM_WORLD);
}
#endif
// *** Read the material mean free paths and set the interaction table in a "linear_interp" structure:
load_material(myID, file_name_materials, density_max, density_nominal, &mfp_table_data, &mfp_Woodcock_table, &mfp_Woodcock_table_bytes, &mfp_table_a, &mfp_table_b, &mfp_table_bytes, &rayleigh_table, &compton_table);
// -- Check that the input material tables and the x-ray source are consistent:
if ( (source_energy_data.espc[0] < mfp_table_data.e0) || (source_energy_data.espc[source_energy_data.num_bins_espc] > (mfp_table_data.e0 + (mfp_table_data.num_values-1)/mfp_table_data.ide)) )
{
MAIN_THREAD
{
printf("\n\n\n !!ERROR!! The input x-ray source energy spectrum minimum (%.3f eV) and maximum (%.3f eV) energy values\n", source_energy_data.espc[0], source_energy_data.espc[source_energy_data.num_bins_espc]);
printf( " are outside the tabulated energy interval for the material properties tables (from %.3f to %.3f eV)!!\n", mfp_table_data.e0, (mfp_table_data.e0+(mfp_table_data.num_values-1)/mfp_table_data.ide));
printf( " Please, modify the input energy spectra to fit the tabulated limits or create new tables.\n\n");
}
#ifdef USING_MPI
MPI_Finalize();
#endif
exit(-1);
}
// *** Initialize the GPU using the NVIDIA CUDA libraries:
// -- Declare the pointers to the device global memory, when using the GPU:
// float2 *voxel_mat_dens_device = NULL,
// *mfp_Woodcock_table_device = NULL;
// char *voxel_mat_dens_device = NULL; //!!FixedDensity_DBT!! Allocate material vector as char
int *voxel_mat_dens_device = NULL; //!!bitree!! v1.5 --> using an integer value to be able to store both the material number (positive) or pointers to the binary tree branches (negative)
char *bitree_device = NULL; //!!bitree!! v1.5b
float2 *mfp_Woodcock_table_device = NULL; //!!FixedDensity_DBT!!
float3 *mfp_table_a_device = NULL,
*mfp_table_b_device = NULL;
unsigned long long int *image_device = NULL;
struct rayleigh_struct *rayleigh_table_device = NULL;
struct compton_struct *compton_table_device = NULL;
ulonglong2 *voxels_Edep_device = NULL;
struct detector_struct *detector_data_device = NULL;
struct source_struct *source_data_device = NULL;
ulonglong2 *materials_dose_device = NULL; // !!tally_materials_dose!!
int* seed_input_device = NULL; // Store latest random seed used in GPU in global memory to continue random sequence in consecutive projections. !!DBTv1.4!!
// -- Sets the CUDA enabled GPU that will be used in the simulation, and allocate and copies the simulation data in the GPU global and constant memories.
init_CUDA_device(&gpu_id, myID, numprocs, &voxel_data, source_data, &source_energy_data, detector_data, &mfp_table_data, /*Variables GPU constant memory*/
voxel_mat_dens, &voxel_mat_dens_device, voxel_mat_dens_bytes, /*Variables GPU global memory*/
bitree, &bitree_device, bitree_bytes, //!!bitree!! v1.5b
image, &image_device, image_bytes,
mfp_Woodcock_table, &mfp_Woodcock_table_device, mfp_Woodcock_table_bytes,
mfp_table_a, mfp_table_b, &mfp_table_a_device, &mfp_table_b_device, mfp_table_bytes,
&rayleigh_table, &rayleigh_table_device,
&compton_table, &compton_table_device, &detector_data_device, &source_data_device,
voxels_Edep, &voxels_Edep_device, voxels_Edep_bytes, &dose_ROI_x_min, &dose_ROI_x_max, &dose_ROI_y_min, &dose_ROI_y_max, &dose_ROI_z_min, &dose_ROI_z_max,
materials_dose, &materials_dose_device, flag_material_dose, &seed_input_device, &seed_input, (num_projections+1));
// !!DBTv1.4!! Allocate space for one extra projection (num_projections+1) for case flag_simulateMammoAfterDBT==true !!DBTv1.4!!
// -- Constant data already moved to the GPU: clean up unnecessary RAM memory
free(mfp_Woodcock_table);
free(mfp_table_a);
free(mfp_table_b);
if (0!=myID) // Keep the geometry data for the MPI root because the voxel densities are still needed to compute the final doses
free(voxel_mat_dens);
MAIN_THREAD
{
current_time=time(NULL);
printf("\n -- INITIALIZATION finished: elapsed time = %.3f s. \n\n", ((double)(clock()-clock_start))/CLOCKS_PER_SEC);
}
#ifdef USING_MPI
fflush(stdout);
MPI_Barrier(MPI_COMM_WORLD); // Synchronize MPI threads before starting the MC phase.
#endif
///////////////////////////////////////////////////////////////////////////////////////////////////
MAIN_THREAD
{
current_time=time(NULL);
printf("\n\n -- MONTE CARLO LOOP phase. Time: %s\n\n", ctime(¤t_time));
fflush(stdout);
}
// -- A number of histories smaller than 24 hours in seconds (3600*24=86400) means that the user wants to simulate for the input number of seconds in each GPU, not a fixed number of histories:
unsigned long long int total_histories_INPUT = total_histories; // Save the original input values to be re-used for multiple projections
int doing_speed_test = -1, simulating_by_time = 0; // 0==false
if (total_histories < (unsigned long long int)(95000))
simulating_by_time = 1; // 1=true
int num_blocks_speed_test = 0;
unsigned long long int histories_speed_test = (unsigned long long int)0, total_histories_speed_test = (unsigned long long int)0;
float node_speed = -1.0f, total_speed = 1.0f;
double current_angle = -999;
int num_p; // == current projection number
// *************************************************************************************
// *** CT simulation loop (including speed test, if simulating by time or multi-GPU) ***
// *************************************************************************************
int num_projections_loop = num_projections;
if (num_projections>1)
{
num_projections_loop++; // Add an extra projection because [0] always corresponds to the 0 deg projection (first tomographic image starts at [1]) !!DBTv1.4!!
}
for (num_p=0; num_p<num_projections_loop; num_p++)
{
// --Re-load the num histories for each new projection !!DBTv1.4!!
total_histories = total_histories_INPUT;
// --Skip the initial 0 deg projection if we are simulating a tomographic scan and we don't want a separate projection with the full dose: !!DBTv1.4!!
if (0==num_p && num_projections>1 && flag_simulateMammoAfterDBT==false)
continue;
if (flag_simulateMammoAfterDBT && 0==num_p)
{
// -- !!DBT!! Simulate the first 0 deg projection (mammo) with almost as many histories (SCALE_MAMMO_DBT factor) as the whole tomographic scan to follow: !!DBTv1.4!!
total_histories = total_histories_INPUT * num_projections * SCALE_MAMMO_DBT; //!!DBTv1.4!! Scaling dose a factor SCALE_MAMMO_DBT (eg, factor 2/3 for 1 mGy mammo for 1.5 mGy DBT)
MAIN_THREAD
{
printf("\n\n !!DBT!! Simulating first a 0 degree projection with %.4f times the number of histories as the complete scan with %d projections = %lld histories\n", SCALE_MAMMO_DBT, num_projections, total_histories);
printf( " Afterwards, simulate the tomo acquisition starting at most negative angle and ending at most positive angle.\n"); // !!DBT!! !!DBTv1.4!!
printf( " If defined, motion blur is disabled and anti-scatter grid enabled only for the single projection.\n\n");
}
}
else if (flag_simulateMammoAfterDBT && 1==num_p)
MAIN_THREAD printf("\n\n !!DBT!! After the first full simulation (eg, mammo), simulate a DBT acquisition (starting at neg angle) with the input number histories per projections.\n\n");
if (0==num_p)
current_angle = 0.0;
else
current_angle = source_data[0].angle_offset + (num_p-1) * source_data[0].angle_per_projection;
MAIN_THREAD
if (num_projections!=1)
if (flag_simulateMammoAfterDBT && 0==num_p)
printf("\n\n\n\n << Simulating a 0 degree projection (mammography) with %d * %f as many histories as each tomographic projection >>\n\n", num_projections, SCALE_MAMMO_DBT);
else
printf("\n\n\n\n << Simulating tomographic projection %d of %d >> Angle: %lf degrees.\n\n", num_p, num_projections, current_angle*RAD2DEG);
clock_start = clock(); // Start the CPU clock
// *** Simulate in the GPUs the input amount of time or amount of particles:
// -- Estimate GPU speed to use a total simulation time or multiple GPUs:
if ( simulating_by_time==0 && // Simulating a fixed number of particles, not a fixed time (so performing the speed test only once)
node_speed>0.0f && // Speed test already performed for a previous projection in this simulation (node_speed and total_speed variables set)
numprocs>1) // Using multiple GPUs (ie, multiple MPI threads)
{
// -- Simulating successive projections after the first one with a fixed number of particles, with multiple MPI threads: re-use the speed test results from the first projection image:
total_histories = (unsigned long long int)(0.5 + ((double)total_histories) * (((double)node_speed)/total_speed));
doing_speed_test = 0; // No speed test for this projection.
}
else if ( simulating_by_time==1 || numprocs>1)
{
// -- Simulating with a time limit OR multiple MPI threads for the first time (num_p==0): run a speed test to calculate the speed of the current GPU and distribute the number of particles to the multiple GPUs or estimate the total number of particles required to run the input amount of time:
// Note that this ELSE IF block will be skipped if we are using a single MPI thread and a fixed number of particles.
doing_speed_test = 1; // Remember that we are performing the speed test to make sure we add the test histories to the total before the tally reports.
if (node_speed<0.0f) // Speed test not performed before (first projection being simulated): set num_blocks_speed_test and histories_speed_test.
{
num_blocks_speed_test = guestimate_GPU_performance(gpu_id); // Guestimating a good number of blocks to estimate the speed of different generations of GPUs. Slower GPUs will simulate fewer particles and hopefully the fastest GPUs will not have to wait much.
}
histories_speed_test = (unsigned long long int)(num_blocks_speed_test*num_threads_per_block)*(unsigned long long int)(histories_per_thread);
dim3 blocks_speed_test(num_blocks_speed_test, 1);
dim3 threads_speed_test(num_threads_per_block, 1);
#ifdef USING_MPI
// -- Init the current random number generator seed to avoid overlapping sequences with other MPI threads:
if (simulating_by_time == 1)
// Simulating by time: set an arbitrary huge number of particles to skip.
update_seed_PRNG((myID + num_p*numprocs), (unsigned long long int)(123456789012), &seed_input); // Set the random number seed far from any other MPI thread (myID) and away from the seeds used in the previous projections (num_p*numprocs).
else
// Simulating by histories (default):
update_seed_PRNG(myID, total_histories_INPUT*num_projections, &seed_input); // Init the random seed for each MPI thread as far away from the previous thread as if all "total_histories*num_projections" histories were simulated by each thread --> guarantees that each thread has an uncorrelated sequence of random values (at least for the first seed of RANECU). !!DBTv1.4!!
checkCudaErrors(cudaMemcpy(seed_input_device, &seed_input, sizeof(int), cudaMemcpyHostToDevice)); // Upload initial seed value to GPU memory. !!DBTv1.4!!
printf(" ==> CUDA (MPI process #%d in \"%s\"): estimate GPU speed executing %d blocks of %d threads, %d histories per thread: %lld histories in total (random seed=%d).\n", myID, MPI_processor_name, num_blocks_speed_test, num_threads_per_block, histories_per_thread, histories_speed_test, seed_input);
#else
printf(" ==> CUDA: Estimating the GPU speed executing %d blocks of %d threads, %d histories per thread: %lld histories in total.\n", num_blocks_speed_test, num_threads_per_block, histories_per_thread, histories_speed_test);
#endif
fflush(stdout);
clock_kernel = clock();
// -- Launch Monte Carlo simulation kernel for the speed test:
track_particles<<<blocks_speed_test,threads_speed_test>>>(histories_per_thread, (short int)num_p, seed_input_device, image_device, voxels_Edep_device, voxel_mat_dens_device, bitree_device, mfp_Woodcock_table_device, mfp_table_a_device, mfp_table_b_device, rayleigh_table_device, compton_table_device, detector_data_device, source_data_device, materials_dose_device);
#ifdef USING_MPI
// Find out the total number of histories simulated in the speed test by all the GPUs. Note that this MPI call will be executed in parallel with the GPU kernel because it is located before the cudaDeviceSynchronize command!
return_reduce = MPI_Allreduce(&histories_speed_test, &total_histories_speed_test, 1, MPI_UNSIGNED_LONG, MPI_SUM, MPI_COMM_WORLD);
if (MPI_SUCCESS != return_reduce)
printf("\n\n !!ERROR!! Error reducing (MPI_Allreduce) the total number of histories in the speed test test??? return_reduce = %d for thread %d\n\n\n", return_reduce, myID);
else
#else
total_histories_speed_test = histories_speed_test;
#endif
fflush(stdout);
cudaDeviceSynchronize(); // Force the runtime to wait until GPU kernel has completed
getLastCudaError("\n\n !!Kernel execution failed while simulating particle tracks!! "); // Check if the CUDA function returned any error
float speed_test_time = float(clock()-clock_kernel)/CLOCKS_PER_SEC;
node_speed = (float) (((double)histories_speed_test)/speed_test_time);
#ifdef USING_MPI
printf(" (MPI process #%d): Estimated GPU speed = %lld hist / %.4f s = %.3f hist/s\n", myID, histories_speed_test, speed_test_time, node_speed);
#else
printf(" Estimated GPU speed = %lld hist / %.3f s = %.3f hist/s\n", histories_speed_test, speed_test_time, node_speed);
#endif
// !!DBTv1.4!! No need to update the seed in the main program bc each GPU continues its series.
// // -- Init random number generator seed to avoid repeating the random numbers used in the speed test:
// update_seed_PRNG(1, histories_speed_test, &seed_input);
if (simulating_by_time==1)
{
// -- Set number of histories for each GPU when simulating by time:
if (total_histories > speed_test_time)
total_histories = (total_histories - speed_test_time)*node_speed; // Calculate the total number of remaining histories by "GPU speed" * "remaining time"
else
total_histories = 1; // Enough particles simulated already, simulate just one more history (block) and report (kernel call would fail if total_histories <= 0).
}
else
{
#ifdef USING_MPI
// -- Simulating a fix number of histories divided between all GPUs (execution time variable):
// Compute the fraction of the total speed that accounts for the current MPI thread:
return_reduce = MPI_Allreduce(&node_speed, &total_speed, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); // Sum all the times and send result to all processes
if (MPI_SUCCESS != return_reduce)
printf("\n\n !!ERROR!! Error reducing (MPI_Allreduce) the speed test results??? return_reduce = %d for thread %d\n\n\n", return_reduce, myID);
else
MAIN_THREAD
{
printf("\n -- Total speed for all GPUs (MPI_Allreduce) = %.3f hist/s; total histories simulated in the speed test (MPI_Allreduce) = %lld.\n", total_speed, total_histories_speed_test);
printf(" The main thread will simulate %.2f%% of the x rays in the simulation.\n", 100.0f*node_speed/total_speed);
}
#else
total_speed = node_speed;
#endif
// - Divide the remaining histories among the MPI threads (GPUs) according to their fraction of the total speed (rounding up).
if (total_histories_speed_test < total_histories)
total_histories = (unsigned long long int)(0.5 + ((double)(total_histories-total_histories_speed_test)) * ((double)(node_speed/total_speed)));
else
total_histories = numprocs; // Enough particles simulated already, simulate just one more history (block) and report (kernel call would fail if total_histories <= 0).
}
} // [Done with case of simulating projections by time or first projection by number of particles]
// else ==> if using only 1 GPU and a fixed number of histories the whole speed test is skipped. The random seed will be different for each projection because it is updated after calling the kernel below.
// fflush(stdout);