Skip to content

Commit

Permalink
[cpp] Convert MatQuad to template
Browse files Browse the repository at this point in the history
  • Loading branch information
tobiashienzsch committed Sep 17, 2024
1 parent 46f7b2f commit 8bdc805
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 94 deletions.
81 changes: 40 additions & 41 deletions src/cpp/main_3d/engine_cpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,52 +21,51 @@ namespace {

// function that does freq-dep RLC boundaries. See 2016 ISMRA paper and
// accompanying webpage (slightly improved here)
template<typename Float>
double process_bnl_pts_fd(
Real* u0b,
Real const* u2b,
Real const* ssaf_bnl,
Float* u0b,
Float const* u2b,
Float const* ssaf_bnl,
int8_t const* mat_bnl,
int64_t Nbl,
int8_t* Mb,
Real lo2,
Real* vh1,
Real* gh1,
MatQuad const* mat_quads,
Real const* mat_beta
Float lo2,
Float* vh1,
Float* gh1,
MatQuad<Float> const* mat_quads,
Float const* mat_beta
) {
auto const start = omp_get_wtime();
#pragma omp parallel for schedule(static)
for (int64_t nb = 0; nb < Nbl; nb++) {
Real _1 = 1.0;
Real _2 = 2.0;
Float _1 = 1.0;
Float _2 = 2.0;
int32_t k = mat_bnl[nb];

Real lo2Kbg = lo2 * ssaf_bnl[nb] * mat_beta[k];
Real fac = _2 * lo2 * ssaf_bnl[nb] / (_1 + lo2Kbg);
Float lo2Kbg = lo2 * ssaf_bnl[nb] * mat_beta[k];
Float fac = _2 * lo2 * ssaf_bnl[nb] / (_1 + lo2Kbg);

Real u0bint = u0b[nb];
Real u2bint = u2b[nb];
Float u0bint = u0b[nb];
Float u2bint = u2b[nb];

u0bint = (u0bint + lo2Kbg * u2bint) / (_1 + lo2Kbg);

Real vh1nb[MMb];
Float vh1nb[MMb];
for (int8_t m = 0; m < Mb[k]; m++) {
int64_t nbm = nb * MMb + m;
int32_t mbk = k * MMb + m;
MatQuad const* tm;
tm = &(mat_quads[mbk]);
vh1nb[m] = vh1[nbm];
int64_t nbm = nb * MMb + m;
int32_t mbk = k * MMb + m;
MatQuad<Float> const* tm = &(mat_quads[mbk]);
vh1nb[m] = vh1[nbm];
u0bint -= fac * (_2 * (tm->bDh) * vh1nb[m] - (tm->bFh) * gh1[nbm]);
}

Real du = u0bint - u2bint;
Float du = u0bint - u2bint;

for (int8_t m = 0; m < Mb[k]; m++) {
int64_t nbm = nb * MMb + m;
int32_t mbk = k * MMb + m;
MatQuad const* tm;
tm = &(mat_quads[mbk]);
Real vh0nbm = (tm->b) * du + (tm->bd) * vh1nb[m] - _2 * (tm->bFh) * gh1[nbm];
int64_t nbm = nb * MMb + m;
int32_t mbk = k * MMb + m;
MatQuad<Float> const* tm = &(mat_quads[mbk]);
Float vh0nbm = (tm->b) * du + (tm->bd) * vh1nb[m] - _2 * (tm->bFh) * gh1[nbm];
gh1[nbm] += (vh0nbm + vh1nb[m]) / _2;
vh1[nbm] = vh0nbm;
}
Expand All @@ -93,21 +92,21 @@ auto run(Simulation3D& sd) -> double {
int8_t* Mb = sd.Mb;

// keep local copies of pointers (style choice)
int64_t* bn_ixyz = sd.bn_ixyz;
int64_t* bnl_ixyz = sd.bnl_ixyz;
int64_t* bna_ixyz = sd.bna_ixyz;
int64_t* in_ixyz = sd.in_ixyz;
int64_t* out_ixyz = sd.out_ixyz;
uint16_t* adj_bn = sd.adj_bn;
uint8_t* bn_mask = sd.bn_mask;
int8_t* mat_bnl = sd.mat_bnl;
int8_t* Q_bna = sd.Q_bna;
double* in_sigs = sd.in_sigs;
double* u_out = sd.u_out;
int8_t fcc_flag = sd.fcc_flag;
Real* ssaf_bnl = sd.ssaf_bnl;
Real* mat_beta = sd.mat_beta;
MatQuad* mat_quads = sd.mat_quads;
int64_t* bn_ixyz = sd.bn_ixyz;
int64_t* bnl_ixyz = sd.bnl_ixyz;
int64_t* bna_ixyz = sd.bna_ixyz;
int64_t* in_ixyz = sd.in_ixyz;
int64_t* out_ixyz = sd.out_ixyz;
uint16_t* adj_bn = sd.adj_bn;
uint8_t* bn_mask = sd.bn_mask;
int8_t* mat_bnl = sd.mat_bnl;
int8_t* Q_bna = sd.Q_bna;
double* in_sigs = sd.in_sigs;
double* u_out = sd.u_out;
int8_t fcc_flag = sd.fcc_flag;
Real* ssaf_bnl = sd.ssaf_bnl;
Real* mat_beta = sd.mat_beta;
MatQuad<Real>* mat_quads = sd.mat_quads;

// allocate memory
auto u0_buf = std::vector<Real>(static_cast<size_t>(Npts));
Expand Down
21 changes: 12 additions & 9 deletions src/cpp/main_3d/engine_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ struct DeviceData { // for or on gpu (arrays all on GPU)
int8_t* mat_bnl;
int8_t* K_bn;
Real* mat_beta;
pffdtd::MatQuad* mat_quads;
pffdtd::MatQuad<Real>* mat_quads;
Real* u0;
Real* u1;
Real* u0b;
Expand Down Expand Up @@ -159,7 +159,7 @@ __global__ void KernelBoundaryFD(
Real const* ssaf_bnl,
int8_t const* mat_bnl,
Real const* __restrict__ mat_beta,
pffdtd::MatQuad const* __restrict__ mat_quads
pffdtd::MatQuad<Real> const* __restrict__ mat_quads
);
__global__ void AddIn(Real* u0, Real sample);
__global__ void CopyToGridKernel(Real* u, Real const* buffer, int64_t const* locs, int64_t N);
Expand Down Expand Up @@ -363,7 +363,7 @@ __global__ void KernelBoundaryFD(
Real const* ssaf_bnl,
int8_t const* mat_bnl,
Real const* __restrict__ mat_beta,
pffdtd::MatQuad const* __restrict__ mat_quads
pffdtd::MatQuad<Real> const* __restrict__ mat_quads
) {
int64_t nb = blockIdx.x * cuBb + threadIdx.x;
if (nb < cuNbl) {
Expand All @@ -384,7 +384,7 @@ __global__ void KernelBoundaryFD(
for (int8_t m = 0; m < cuMb[k]; m++) { // faster on average than MMb
int64_t nbm = m * cuNbl + nb;
int32_t mbk = k * MMb + m;
pffdtd::MatQuad const* tm;
pffdtd::MatQuad<Real> const* tm;
tm = &(mat_quads[mbk]);
vh1int[m] = vh1[nbm];
gh1int[m] = gh1[nbm];
Expand All @@ -396,7 +396,7 @@ __global__ void KernelBoundaryFD(
for (int8_t m = 0; m < cuMb[k]; m++) { // faster on average than MMb
int64_t nbm = m * cuNbl + nb;
int32_t mbk = k * MMb + m;
pffdtd::MatQuad const* tm;
pffdtd::MatQuad<Real> const* tm;
tm = &(mat_quads[mbk]);
Real vh0m = (tm->b) * du + (tm->bd) * vh1int[m] - _2 * (tm->bFh) * gh1int[m];
gh1[nbm] = gh1int[m] + (vh0m + vh1int[m]) / _2;
Expand Down Expand Up @@ -879,10 +879,13 @@ double run(pffdtd::Simulation3D const& sim) {
gpuErrchk(cudaMalloc(&(gd->mat_beta), (size_t)sim.Nm * sizeof(Real)));
gpuErrchk(cudaMemcpy(gd->mat_beta, sim.mat_beta, (size_t)sim.Nm * sizeof(Real), cudaMemcpyHostToDevice));

gpuErrchk(cudaMalloc(&(gd->mat_quads), (size_t)sim.Nm * MMb * sizeof(pffdtd::MatQuad)));
gpuErrchk(
cudaMemcpy(gd->mat_quads, sim.mat_quads, (size_t)sim.Nm * MMb * sizeof(pffdtd::MatQuad), cudaMemcpyHostToDevice)
);
gpuErrchk(cudaMalloc(&(gd->mat_quads), (size_t)sim.Nm * MMb * sizeof(pffdtd::MatQuad<Real>)));
gpuErrchk(cudaMemcpy(
gd->mat_quads,
sim.mat_quads,
(size_t)sim.Nm * MMb * sizeof(pffdtd::MatQuad<Real>),
cudaMemcpyHostToDevice
));

gpuErrchk(cudaMalloc(&(gd->bn_mask), (size_t)(ghd->Nbm * sizeof(uint8_t))));
gpuErrchk(cudaMemcpy(gd->bn_mask, ghd->bn_mask, (size_t)ghd->Nbm * sizeof(uint8_t), cudaMemcpyHostToDevice));
Expand Down
5 changes: 2 additions & 3 deletions src/cpp/pffdtd/simulation_3d.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ namespace pffdtd {
int8_t NN;
int8_t* Mb;
int8_t Nm;
MatQuad* mat_quads;
MatQuad<Real>* mat_quads;
Real* mat_beta; // one per material

double Ts;
Expand Down Expand Up @@ -334,8 +334,7 @@ namespace pffdtd {
//////////////////
// DEF (RLC) datasets
//////////////////
allocate_zeros((void**)&mat_quads,
static_cast<unsigned long>(Nm * MMb) * sizeof(MatQuad)); // initalises to zero
allocate_zeros((void**)&mat_quads, static_cast<unsigned long>(Nm * MMb) * sizeof(MatQuad<Real>));
allocate_zeros((void**)&mat_beta, Nm * sizeof(Real));
for (int8_t i = 0; i < Nm; i++) {
double* DEF; // for one material
Expand Down
83 changes: 42 additions & 41 deletions src/cpp/pffdtd/simulation_3d.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,52 +20,53 @@
namespace pffdtd {

// see python code and 2016 ISMRA paper
template<typename Float>
struct MatQuad {
Real b; // b
Real bd; // b*d
Real bDh; // b*D-hat
Real bFh; // b*F-hat
Float b; // b
Float bd; // b*d
Float bDh; // b*D-hat
Float bFh; // b*F-hat
};

// main sim data, on host
struct Simulation3D {
int64_t* bn_ixyz; // boundary node indices
int64_t* bnl_ixyz; // lossy boundary node indices
int64_t* bna_ixyz; // absorbing boundary node indices
int8_t* Q_bna; // integer for ABCs (wall 1,edge 2,corner 3)
int64_t* in_ixyz; // input points
int64_t* out_ixyz; // output points
int64_t* out_reorder; // ordering for outputs point for final print/save
uint16_t* adj_bn; // nearest-neighbour adjancencies for all boundary nodes
Real* ssaf_bnl; // surface area corrections (with extra volume scaling)
uint8_t* bn_mask; // bit mask for bounday nodes
int8_t* mat_bnl; // material indices for lossy boundary nodes
int8_t* K_bn; // number of adjacent neighbours, boundary nodesa
double* in_sigs; // input signals
double* u_out; // for output signals
int64_t Ns; // number of input grid points
int64_t Nr; // number of output grid points
int64_t Nt; // number of samples simulation
int64_t Npts; // number of Cartesian grid points
int64_t Nx; // x-dim (non-continguous)
int64_t Ny; // y-dim
int64_t Nz; // z-dim (continguous)
int64_t Nb; // number of boundary nodes
int64_t Nbl; // number of lossy boundary nodes
int64_t Nba; // number of ABC nodes
double l; // Courant number (CFL)
double l2; // CFL number squared
int8_t fcc_flag; // boolean for FCC
int8_t NN; // integer, neareast neighbours
int8_t Nm; // number of materials used
int8_t* Mb; // number of branches per material
struct MatQuad* mat_quads; // RLC coefficients (essentially)
Real* mat_beta; // part of FD-boundaries one per material
double infac; // rescaling of input (for numerical reason)
Real sl2; // scaled l2 (for single precision)
Real lo2; // 0.5*l
Real a2; // update stencil coefficient
Real a1; // update stencil coefficient
int64_t* bn_ixyz; // boundary node indices
int64_t* bnl_ixyz; // lossy boundary node indices
int64_t* bna_ixyz; // absorbing boundary node indices
int8_t* Q_bna; // integer for ABCs (wall 1,edge 2,corner 3)
int64_t* in_ixyz; // input points
int64_t* out_ixyz; // output points
int64_t* out_reorder; // ordering for outputs point for final print/save
uint16_t* adj_bn; // nearest-neighbour adjancencies for all boundary nodes
Real* ssaf_bnl; // surface area corrections (with extra volume scaling)
uint8_t* bn_mask; // bit mask for bounday nodes
int8_t* mat_bnl; // material indices for lossy boundary nodes
int8_t* K_bn; // number of adjacent neighbours, boundary nodesa
double* in_sigs; // input signals
double* u_out; // for output signals
int64_t Ns; // number of input grid points
int64_t Nr; // number of output grid points
int64_t Nt; // number of samples simulation
int64_t Npts; // number of Cartesian grid points
int64_t Nx; // x-dim (non-continguous)
int64_t Ny; // y-dim
int64_t Nz; // z-dim (continguous)
int64_t Nb; // number of boundary nodes
int64_t Nbl; // number of lossy boundary nodes
int64_t Nba; // number of ABC nodes
double l; // Courant number (CFL)
double l2; // CFL number squared
int8_t fcc_flag; // boolean for FCC
int8_t NN; // integer, neareast neighbours
int8_t Nm; // number of materials used
int8_t* Mb; // number of branches per material
MatQuad<Real>* mat_quads; // RLC coefficients (essentially)
Real* mat_beta; // part of FD-boundaries one per material
double infac; // rescaling of input (for numerical reason)
Real sl2; // scaled l2 (for single precision)
Real lo2; // 0.5*l
Real a2; // update stencil coefficient
Real a1; // update stencil coefficient
};

[[nodiscard]] auto loadSimulation3D(std::filesystem::path const& simDir) -> Simulation3D;
Expand Down

0 comments on commit 8bdc805

Please sign in to comment.