[cpp] Convert MatQuad to template

tobanteAudio · Sep 17, 2024 · 8bdc805 · 8bdc805
1 parent 46f7b2f
commit 8bdc805
Show file tree

Hide file tree

Showing 4 changed files with 96 additions and 94 deletions.
diff --git a/src/cpp/main_3d/engine_cpu.cpp b/src/cpp/main_3d/engine_cpu.cpp
@@ -21,52 +21,51 @@ namespace {
 
 // function that does freq-dep RLC boundaries.  See 2016 ISMRA paper and
 // accompanying webpage (slightly improved here)
+template<typename Float>
 double process_bnl_pts_fd(
-    Real* u0b,
-    Real const* u2b,
-    Real const* ssaf_bnl,
+    Float* u0b,
+    Float const* u2b,
+    Float const* ssaf_bnl,
     int8_t const* mat_bnl,
     int64_t Nbl,
     int8_t* Mb,
-    Real lo2,
-    Real* vh1,
-    Real* gh1,
-    MatQuad const* mat_quads,
-    Real const* mat_beta
+    Float lo2,
+    Float* vh1,
+    Float* gh1,
+    MatQuad<Float> const* mat_quads,
+    Float const* mat_beta
 ) {
   auto const start = omp_get_wtime();
 #pragma omp parallel for schedule(static)
   for (int64_t nb = 0; nb < Nbl; nb++) {
-    Real _1   = 1.0;
-    Real _2   = 2.0;
+    Float _1  = 1.0;
+    Float _2  = 2.0;
     int32_t k = mat_bnl[nb];
 
-    Real lo2Kbg = lo2 * ssaf_bnl[nb] * mat_beta[k];
-    Real fac    = _2 * lo2 * ssaf_bnl[nb] / (_1 + lo2Kbg);
+    Float lo2Kbg = lo2 * ssaf_bnl[nb] * mat_beta[k];
+    Float fac    = _2 * lo2 * ssaf_bnl[nb] / (_1 + lo2Kbg);
 
-    Real u0bint = u0b[nb];
-    Real u2bint = u2b[nb];
+    Float u0bint = u0b[nb];
+    Float u2bint = u2b[nb];
 
     u0bint = (u0bint + lo2Kbg * u2bint) / (_1 + lo2Kbg);
 
-    Real vh1nb[MMb];
+    Float vh1nb[MMb];
     for (int8_t m = 0; m < Mb[k]; m++) {
-      int64_t nbm = nb * MMb + m;
-      int32_t mbk = k * MMb + m;
-      MatQuad const* tm;
-      tm       = &(mat_quads[mbk]);
-      vh1nb[m] = vh1[nbm];
+      int64_t nbm              = nb * MMb + m;
+      int32_t mbk              = k * MMb + m;
+      MatQuad<Float> const* tm = &(mat_quads[mbk]);
+      vh1nb[m]                 = vh1[nbm];
       u0bint -= fac * (_2 * (tm->bDh) * vh1nb[m] - (tm->bFh) * gh1[nbm]);
     }
 
-    Real du = u0bint - u2bint;
+    Float du = u0bint - u2bint;
 
     for (int8_t m = 0; m < Mb[k]; m++) {
-      int64_t nbm = nb * MMb + m;
-      int32_t mbk = k * MMb + m;
-      MatQuad const* tm;
-      tm          = &(mat_quads[mbk]);
-      Real vh0nbm = (tm->b) * du + (tm->bd) * vh1nb[m] - _2 * (tm->bFh) * gh1[nbm];
+      int64_t nbm              = nb * MMb + m;
+      int32_t mbk              = k * MMb + m;
+      MatQuad<Float> const* tm = &(mat_quads[mbk]);
+      Float vh0nbm             = (tm->b) * du + (tm->bd) * vh1nb[m] - _2 * (tm->bFh) * gh1[nbm];
       gh1[nbm] += (vh0nbm + vh1nb[m]) / _2;
       vh1[nbm] = vh0nbm;
     }
@@ -93,21 +92,21 @@ auto run(Simulation3D& sd) -> double {
   int8_t* Mb   = sd.Mb;
 
   // keep local copies of pointers (style choice)
-  int64_t* bn_ixyz   = sd.bn_ixyz;
-  int64_t* bnl_ixyz  = sd.bnl_ixyz;
-  int64_t* bna_ixyz  = sd.bna_ixyz;
-  int64_t* in_ixyz   = sd.in_ixyz;
-  int64_t* out_ixyz  = sd.out_ixyz;
-  uint16_t* adj_bn   = sd.adj_bn;
-  uint8_t* bn_mask   = sd.bn_mask;
-  int8_t* mat_bnl    = sd.mat_bnl;
-  int8_t* Q_bna      = sd.Q_bna;
-  double* in_sigs    = sd.in_sigs;
-  double* u_out      = sd.u_out;
-  int8_t fcc_flag    = sd.fcc_flag;
-  Real* ssaf_bnl     = sd.ssaf_bnl;
-  Real* mat_beta     = sd.mat_beta;
-  MatQuad* mat_quads = sd.mat_quads;
+  int64_t* bn_ixyz         = sd.bn_ixyz;
+  int64_t* bnl_ixyz        = sd.bnl_ixyz;
+  int64_t* bna_ixyz        = sd.bna_ixyz;
+  int64_t* in_ixyz         = sd.in_ixyz;
+  int64_t* out_ixyz        = sd.out_ixyz;
+  uint16_t* adj_bn         = sd.adj_bn;
+  uint8_t* bn_mask         = sd.bn_mask;
+  int8_t* mat_bnl          = sd.mat_bnl;
+  int8_t* Q_bna            = sd.Q_bna;
+  double* in_sigs          = sd.in_sigs;
+  double* u_out            = sd.u_out;
+  int8_t fcc_flag          = sd.fcc_flag;
+  Real* ssaf_bnl           = sd.ssaf_bnl;
+  Real* mat_beta           = sd.mat_beta;
+  MatQuad<Real>* mat_quads = sd.mat_quads;
 
   // allocate memory
   auto u0_buf   = std::vector<Real>(static_cast<size_t>(Npts));

diff --git a/src/cpp/main_3d/engine_gpu.cu b/src/cpp/main_3d/engine_gpu.cu
@@ -87,7 +87,7 @@ struct DeviceData { // for or on gpu (arrays all on GPU)
   int8_t* mat_bnl;
   int8_t* K_bn;
   Real* mat_beta;
-  pffdtd::MatQuad* mat_quads;
+  pffdtd::MatQuad<Real>* mat_quads;
   Real* u0;
   Real* u1;
   Real* u0b;
@@ -159,7 +159,7 @@ __global__ void KernelBoundaryFD(
     Real const* ssaf_bnl,
     int8_t const* mat_bnl,
     Real const* __restrict__ mat_beta,
-    pffdtd::MatQuad const* __restrict__ mat_quads
+    pffdtd::MatQuad<Real> const* __restrict__ mat_quads
 );
 __global__ void AddIn(Real* u0, Real sample);
 __global__ void CopyToGridKernel(Real* u, Real const* buffer, int64_t const* locs, int64_t N);
@@ -363,7 +363,7 @@ __global__ void KernelBoundaryFD(
     Real const* ssaf_bnl,
     int8_t const* mat_bnl,
     Real const* __restrict__ mat_beta,
-    pffdtd::MatQuad const* __restrict__ mat_quads
+    pffdtd::MatQuad<Real> const* __restrict__ mat_quads
 ) {
   int64_t nb = blockIdx.x * cuBb + threadIdx.x;
   if (nb < cuNbl) {
@@ -384,7 +384,7 @@ __global__ void KernelBoundaryFD(
     for (int8_t m = 0; m < cuMb[k]; m++) { // faster on average than MMb
       int64_t nbm = m * cuNbl + nb;
       int32_t mbk = k * MMb + m;
-      pffdtd::MatQuad const* tm;
+      pffdtd::MatQuad<Real> const* tm;
       tm        = &(mat_quads[mbk]);
       vh1int[m] = vh1[nbm];
       gh1int[m] = gh1[nbm];
@@ -396,7 +396,7 @@ __global__ void KernelBoundaryFD(
     for (int8_t m = 0; m < cuMb[k]; m++) { // faster on average than MMb
       int64_t nbm = m * cuNbl + nb;
       int32_t mbk = k * MMb + m;
-      pffdtd::MatQuad const* tm;
+      pffdtd::MatQuad<Real> const* tm;
       tm        = &(mat_quads[mbk]);
       Real vh0m = (tm->b) * du + (tm->bd) * vh1int[m] - _2 * (tm->bFh) * gh1int[m];
       gh1[nbm]  = gh1int[m] + (vh0m + vh1int[m]) / _2;
@@ -879,10 +879,13 @@ double run(pffdtd::Simulation3D const& sim) {
     gpuErrchk(cudaMalloc(&(gd->mat_beta), (size_t)sim.Nm * sizeof(Real)));
     gpuErrchk(cudaMemcpy(gd->mat_beta, sim.mat_beta, (size_t)sim.Nm * sizeof(Real), cudaMemcpyHostToDevice));
 
-    gpuErrchk(cudaMalloc(&(gd->mat_quads), (size_t)sim.Nm * MMb * sizeof(pffdtd::MatQuad)));
-    gpuErrchk(
-        cudaMemcpy(gd->mat_quads, sim.mat_quads, (size_t)sim.Nm * MMb * sizeof(pffdtd::MatQuad), cudaMemcpyHostToDevice)
-    );
+    gpuErrchk(cudaMalloc(&(gd->mat_quads), (size_t)sim.Nm * MMb * sizeof(pffdtd::MatQuad<Real>)));
+    gpuErrchk(cudaMemcpy(
+        gd->mat_quads,
+        sim.mat_quads,
+        (size_t)sim.Nm * MMb * sizeof(pffdtd::MatQuad<Real>),
+        cudaMemcpyHostToDevice
+    ));
 
     gpuErrchk(cudaMalloc(&(gd->bn_mask), (size_t)(ghd->Nbm * sizeof(uint8_t))));
     gpuErrchk(cudaMemcpy(gd->bn_mask, ghd->bn_mask, (size_t)ghd->Nbm * sizeof(uint8_t), cudaMemcpyHostToDevice));

diff --git a/src/cpp/pffdtd/simulation_3d.cpp b/src/cpp/pffdtd/simulation_3d.cpp
@@ -67,7 +67,7 @@ namespace pffdtd {
   int8_t NN;
   int8_t* Mb;
   int8_t Nm;
-  MatQuad* mat_quads;
+  MatQuad<Real>* mat_quads;
   Real* mat_beta; // one per material
 
   double Ts;
@@ -334,8 +334,7 @@ namespace pffdtd {
   //////////////////
   // DEF (RLC) datasets
   //////////////////
-  allocate_zeros((void**)&mat_quads,
-                 static_cast<unsigned long>(Nm * MMb) * sizeof(MatQuad)); // initalises to zero
+  allocate_zeros((void**)&mat_quads, static_cast<unsigned long>(Nm * MMb) * sizeof(MatQuad<Real>));
   allocate_zeros((void**)&mat_beta, Nm * sizeof(Real));
   for (int8_t i = 0; i < Nm; i++) {
     double* DEF; // for one material

diff --git a/src/cpp/pffdtd/simulation_3d.hpp b/src/cpp/pffdtd/simulation_3d.hpp
@@ -20,52 +20,53 @@
 namespace pffdtd {
 
 // see python code and 2016 ISMRA paper
+template<typename Float>
 struct MatQuad {
-  Real b;   // b
-  Real bd;  // b*d
-  Real bDh; // b*D-hat
-  Real bFh; // b*F-hat
+  Float b;   // b
+  Float bd;  // b*d
+  Float bDh; // b*D-hat
+  Float bFh; // b*F-hat
 };
 
 // main sim data, on host
 struct Simulation3D {
-  int64_t* bn_ixyz;          // boundary node indices
-  int64_t* bnl_ixyz;         // lossy boundary node indices
-  int64_t* bna_ixyz;         // absorbing boundary node indices
-  int8_t* Q_bna;             // integer for ABCs (wall 1,edge 2,corner 3)
-  int64_t* in_ixyz;          // input points
-  int64_t* out_ixyz;         // output points
-  int64_t* out_reorder;      // ordering for outputs point for final print/save
-  uint16_t* adj_bn;          // nearest-neighbour adjancencies for all boundary nodes
-  Real* ssaf_bnl;            // surface area corrections (with extra volume scaling)
-  uint8_t* bn_mask;          // bit mask for bounday nodes
-  int8_t* mat_bnl;           // material indices for lossy boundary nodes
-  int8_t* K_bn;              // number of adjacent neighbours, boundary nodesa
-  double* in_sigs;           // input signals
-  double* u_out;             // for output signals
-  int64_t Ns;                // number of input grid points
-  int64_t Nr;                // number of output grid points
-  int64_t Nt;                // number of samples simulation
-  int64_t Npts;              // number of Cartesian grid points
-  int64_t Nx;                // x-dim (non-continguous)
-  int64_t Ny;                // y-dim
-  int64_t Nz;                // z-dim (continguous)
-  int64_t Nb;                // number of boundary nodes
-  int64_t Nbl;               // number of lossy boundary nodes
-  int64_t Nba;               // number of ABC nodes
-  double l;                  // Courant number (CFL)
-  double l2;                 // CFL number squared
-  int8_t fcc_flag;           // boolean for FCC
-  int8_t NN;                 // integer, neareast neighbours
-  int8_t Nm;                 // number of materials used
-  int8_t* Mb;                // number of branches per material
-  struct MatQuad* mat_quads; // RLC coefficients (essentially)
-  Real* mat_beta;            // part of FD-boundaries one per material
-  double infac;              // rescaling of input (for numerical reason)
-  Real sl2;                  // scaled l2 (for single precision)
-  Real lo2;                  // 0.5*l
-  Real a2;                   // update stencil coefficient
-  Real a1;                   // update stencil coefficient
+  int64_t* bn_ixyz;         // boundary node indices
+  int64_t* bnl_ixyz;        // lossy boundary node indices
+  int64_t* bna_ixyz;        // absorbing boundary node indices
+  int8_t* Q_bna;            // integer for ABCs (wall 1,edge 2,corner 3)
+  int64_t* in_ixyz;         // input points
+  int64_t* out_ixyz;        // output points
+  int64_t* out_reorder;     // ordering for outputs point for final print/save
+  uint16_t* adj_bn;         // nearest-neighbour adjancencies for all boundary nodes
+  Real* ssaf_bnl;           // surface area corrections (with extra volume scaling)
+  uint8_t* bn_mask;         // bit mask for bounday nodes
+  int8_t* mat_bnl;          // material indices for lossy boundary nodes
+  int8_t* K_bn;             // number of adjacent neighbours, boundary nodesa
+  double* in_sigs;          // input signals
+  double* u_out;            // for output signals
+  int64_t Ns;               // number of input grid points
+  int64_t Nr;               // number of output grid points
+  int64_t Nt;               // number of samples simulation
+  int64_t Npts;             // number of Cartesian grid points
+  int64_t Nx;               // x-dim (non-continguous)
+  int64_t Ny;               // y-dim
+  int64_t Nz;               // z-dim (continguous)
+  int64_t Nb;               // number of boundary nodes
+  int64_t Nbl;              // number of lossy boundary nodes
+  int64_t Nba;              // number of ABC nodes
+  double l;                 // Courant number (CFL)
+  double l2;                // CFL number squared
+  int8_t fcc_flag;          // boolean for FCC
+  int8_t NN;                // integer, neareast neighbours
+  int8_t Nm;                // number of materials used
+  int8_t* Mb;               // number of branches per material
+  MatQuad<Real>* mat_quads; // RLC coefficients (essentially)
+  Real* mat_beta;           // part of FD-boundaries one per material
+  double infac;             // rescaling of input (for numerical reason)
+  Real sl2;                 // scaled l2 (for single precision)
+  Real lo2;                 // 0.5*l
+  Real a2;                  // update stencil coefficient
+  Real a1;                  // update stencil coefficient
 };
 
 [[nodiscard]] auto loadSimulation3D(std::filesystem::path const& simDir) -> Simulation3D;