Skip to content

Commit

Permalink
[cpp] Start SYCL sim3d engine
Browse files Browse the repository at this point in the history
  • Loading branch information
tobiashienzsch committed Sep 22, 2024
1 parent a5378ee commit 6a67d30
Show file tree
Hide file tree
Showing 7 changed files with 435 additions and 36 deletions.
2 changes: 1 addition & 1 deletion run_3d.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ set -e

root_dir="$(cd "$(dirname "$0")" && pwd)"
pffdtd_engine="$root_dir/build/src/cpp/pffdtd-engine"
pffdtd_engine="$root_dir/cmake-build-cuda/src/cpp/pffdtd-engine"
pffdtd_engine="$root_dir/cmake-build-sycl/src/cpp/pffdtd-engine"

sim_name="ProStudio"
sim_setup="${sim_name}.py"
Expand Down
6 changes: 4 additions & 2 deletions src/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ target_sources(pffdtd
pffdtd/simulation_2d.hpp
pffdtd/simulation_3d.cpp
pffdtd/simulation_3d.hpp
pffdtd/sycl.hpp
pffdtd/time.hpp
pffdtd/utility.hpp
)
Expand All @@ -45,11 +44,14 @@ endif()

if(PFFDTD_HAS_SYCL)
target_sources(pffdtd PRIVATE pffdtd/engine_2d_sycl.cpp pffdtd/engine_2d_sycl.hpp)
add_sycl_to_target(TARGET pffdtd SOURCES pffdtd/engine_2d_sycl.cpp)
target_sources(pffdtd PRIVATE pffdtd/engine_3d_sycl.cpp pffdtd/engine_3d_sycl.hpp)
target_sources(pffdtd PRIVATE pffdtd/sycl.cpp pffdtd/sycl.hpp)
add_sycl_to_target(TARGET pffdtd SOURCES pffdtd/engine_2d_sycl.cpp pffdtd/engine_3d_sycl.cpp pffdtd/sycl.cpp)
target_compile_definitions(pffdtd PUBLIC PFFDTD_HAS_SYCL=1)

if(PFFDTD_ENABLE_SYCL_ONEAPI)
set_source_files_properties(pffdtd/engine_2d_sycl.cpp PROPERTIES COMPILE_FLAGS "-fsycl-targets=nvptx64-nvidia-cuda")
set_source_files_properties(pffdtd/engine_3d_sycl.cpp PROPERTIES COMPILE_FLAGS "-fsycl-targets=nvptx64-nvidia-cuda")
target_link_libraries(pffdtd PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda")
endif()
endif()
Expand Down
3 changes: 3 additions & 0 deletions src/cpp/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#if defined(PFFDTD_HAS_SYCL)
#include "pffdtd/engine_2d_sycl.hpp"
#include "pffdtd/engine_3d_sycl.hpp"
#endif

#include <CLI/CLI.hpp>
Expand Down Expand Up @@ -100,6 +101,8 @@ auto main(int argc, char** argv) -> int {

#if defined(PFFDTD_HAS_CUDA)
pffdtd::Engine3DCUDA{}(sim);
#elif defined(PFFDTD_HAS_SYCL)
pffdtd::Engine3DSYCL{}(sim);
#else
pffdtd::Engine3DCPU{}(sim);
#endif
Expand Down
327 changes: 327 additions & 0 deletions src/cpp/pffdtd/engine_3d_sycl.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,327 @@
// SPDX-License-Identifier: MIT
// SPDX-FileCopyrightText: 2024 Tobias Hienzsch
#include "engine_3d_sycl.hpp"

#include "pffdtd/assert.hpp"
#include "pffdtd/progress.hpp"
#include "pffdtd/sycl.hpp"
#include "pffdtd/time.hpp"

#include <fmt/format.h>

namespace pffdtd {

auto Engine3DSYCL::operator()(Simulation3D& sim) const -> double {
PFFDTD_ASSERT(sim.fcc_flag == 0);

auto queue = sycl::queue{sycl::property::queue::enable_profiling{}};
auto device = queue.get_device();
summary(device);

auto const Nx = sim.Nx;
auto const Ny = sim.Ny;
auto const Nz = sim.Nz;
auto const NzNy = Nz * Ny;
auto const Npts = sim.Npts;
auto const Nb = sim.Nb;
auto const Nbl = sim.Nbl;
auto const Nba = sim.Nba;
auto const Nm = sim.Nm;
auto const Nr = sim.Nr;
auto const Ns = sim.Ns;
auto const Nt = sim.Nt;

auto const lo2 = static_cast<Real>(sim.lo2);
auto const sl2 = static_cast<Real>(sim.sl2);
auto const l = static_cast<Real>(sim.l);
auto const a1 = static_cast<Real>(sim.a1);
auto const a2 = static_cast<Real>(sim.a2);

auto Q_bna_buf = sycl::buffer{sim.Q_bna, sycl::range(size_t(Nba))};
auto bn_mask_buf = sycl::buffer{sim.bn_mask, sycl::range(size_t(Npts))};
auto adj_bn_buf = sycl::buffer{sim.adj_bn, sycl::range(size_t(Nb))};
auto bn_ixyz_buf = sycl::buffer{sim.bn_ixyz, sycl::range(size_t(Nb))};
auto bnl_ixyz_buf = sycl::buffer{sim.bnl_ixyz, sycl::range(size_t(Nb))};
auto bna_ixyz_buf = sycl::buffer{sim.bna_ixyz, sycl::range(size_t(Nba))};
auto in_ixyz_buf = sycl::buffer{sim.in_ixyz, sycl::range(size_t(Ns))};
auto out_ixyz_buf = sycl::buffer{sim.out_ixyz, sycl::range(size_t(Nr))};
auto in_sigs_buf = sycl::buffer{sim.in_sigs, sycl::range(size_t(Ns * Nt))};
auto mat_beta_buf = sycl::buffer{sim.mat_beta, sycl::range(size_t(Nm))};
auto mat_bnl_buf = sycl::buffer{sim.mat_bnl, sycl::range(size_t(Nbl))};
auto mat_quads_buf = sycl::buffer{sim.mat_quads, sycl::range(size_t(Nm * MMb))};
auto Mb_buf = sycl::buffer{sim.Mb, sycl::range(size_t(Nm))};
auto ssaf_bnl_buf = sycl::buffer{sim.ssaf_bnl, sycl::range(size_t(Nbl))};

auto u0_buf = sycl::buffer<Real>(size_t(Npts));
auto u1_buf = sycl::buffer<Real>(size_t(Npts));
auto u0b_buf = sycl::buffer<Real>(size_t(Nbl));
auto u1b_buf = sycl::buffer<Real>(size_t(Nbl));
auto u2b_buf = sycl::buffer<Real>(size_t(Nbl));
auto u2ba_buf = sycl::buffer<Real>(size_t(Nba));
auto vh1_buf = sycl::buffer<Real>(size_t(Nbl * MMb));
auto gh1_buf = sycl::buffer<Real>(size_t(Nbl * MMb));
auto u_out_buf = sycl::buffer<Real>(size_t(Nr * Nt));

auto elapsedAir = Seconds(0.0);
auto elapsedBoundary = Seconds(0.0);
auto const start = getTime();

for (int64_t n = 0; n < Nt; n++) {
auto const sampleStart = getTime();

// Rigid update
auto boundaryStartEvent = queue.submit([&](sycl::handler& cgh) {
auto u0 = sycl::accessor{u0_buf, cgh, sycl::write_only};
auto u1 = sycl::accessor{u1_buf, cgh, sycl::read_only};
auto bn_ixyz = sycl::accessor{bn_ixyz_buf, cgh, sycl::read_only};
auto adj_bn = sycl::accessor{adj_bn_buf, cgh, sycl::read_only};
cgh.parallel_for<struct RigidUpdateCart>(Nb, [=](sycl::id<1> id) {
auto const nb = id[0];
auto const ii = bn_ixyz[nb];
auto const adj = adj_bn[nb];
auto const Kint = sycl::popcount(adj);

auto const _2 = static_cast<Real>(2.0);
auto const K = static_cast<Real>(Kint);
auto const b2 = static_cast<Real>(a2);
auto const b1 = (_2 - sl2 * K);

auto partial = b1 * u1[ii] - u0[ii];
partial += b2 * get_bit_as<Real>(adj, 0) * u1[ii + NzNy];
partial += b2 * get_bit_as<Real>(adj, 1) * u1[ii - NzNy];
partial += b2 * get_bit_as<Real>(adj, 2) * u1[ii + Nz];
partial += b2 * get_bit_as<Real>(adj, 3) * u1[ii - Nz];
partial += b2 * get_bit_as<Real>(adj, 4) * u1[ii + 1];
partial += b2 * get_bit_as<Real>(adj, 5) * u1[ii - 1];
u0[ii] = partial;
});
});

// Copy to boundary buffer
queue.submit([&](sycl::handler& cgh) {
auto u0 = sycl::accessor{u0_buf, cgh, sycl::read_only};
auto u0b = sycl::accessor{u0b_buf, cgh, sycl::write_only};
auto bnl_ixyz = sycl::accessor{bnl_ixyz_buf, cgh, sycl::read_only};
cgh.parallel_for<struct CopyToBoundary>(Nbl, [=](sycl::id<1> id) {
auto nb = id[0];
u0b[nb] = u0[bnl_ixyz[nb]];
});
});

// Apply boundary loss
queue.submit([&](sycl::handler& cgh) {
auto mat_beta = sycl::accessor{mat_beta_buf, cgh, sycl::read_only};
auto mat_bnl = sycl::accessor{mat_bnl_buf, cgh, sycl::read_only};
auto ssaf_bnl = sycl::accessor{ssaf_bnl_buf, cgh, sycl::read_only};
auto u2b = sycl::accessor{u2b_buf, cgh, sycl::read_only};
auto Mb = sycl::accessor{Mb_buf, cgh, sycl::read_only};
auto mat_quads = sycl::accessor{mat_quads_buf, cgh, sycl::read_only};
auto u0b = sycl::accessor{u0b_buf, cgh, sycl::read_write};
auto gh1 = sycl::accessor{gh1_buf, cgh, sycl::read_write};
auto vh1 = sycl::accessor{vh1_buf, cgh, sycl::read_write};
cgh.parallel_for<struct ApplyBoundaryLoss>(Nbl, [=](sycl::id<1> id) {
auto nb = id[0];
Real _1 = 1.0;
Real _2 = 2.0;
int32_t const k = mat_bnl[nb];

Real lo2Kbg = lo2 * ssaf_bnl[nb] * mat_beta[k];
Real fac = _2 * lo2 * ssaf_bnl[nb] / (_1 + lo2Kbg);

Real u0bint = u0b[nb];
Real u2bint = u2b[nb];

u0bint = (u0bint + lo2Kbg * u2bint) / (_1 + lo2Kbg);

Real vh1nb[MMb];
for (int8_t m = 0; m < Mb[k]; m++) {
int64_t const nbm = nb * MMb + m;
int32_t const mbk = k * MMb + m;
auto const* tm = &(mat_quads[mbk]);
vh1nb[m] = vh1[nbm];
u0bint -= fac * (_2 * (tm->bDh) * vh1nb[m] - (tm->bFh) * gh1[nbm]);
}

Real du = u0bint - u2bint;

for (int8_t m = 0; m < Mb[k]; m++) {
int64_t const nbm = nb * MMb + m;
int32_t const mbk = k * MMb + m;
auto const* tm = &(mat_quads[mbk]);
Real vh0nbm = (tm->b) * du + (tm->bd) * vh1nb[m] - _2 * (tm->bFh) * gh1[nbm];
gh1[nbm] += (vh0nbm + vh1nb[m]) / _2;
vh1[nbm] = vh0nbm;
}

u0b[nb] = u0bint;
});
});

// Copy from boundary buffer
auto boundaryEndEvent = queue.submit([&](sycl::handler& cgh) {
auto u0 = sycl::accessor{u0_buf, cgh, sycl::write_only};
auto u0b = sycl::accessor{u0b_buf, cgh, sycl::read_only};
auto bnl_ixyz = sycl::accessor{bnl_ixyz_buf, cgh, sycl::read_only};
cgh.parallel_for<struct CopyFromBoundary>(Nbl, [=](sycl::id<1> id) {
auto nb = id[0];
u0[bnl_ixyz[nb]] = u0b[nb];
});
});

// Copy last state ABCs
auto airStartEvent = queue.submit([&](sycl::handler& cgh) {
auto u2ba = sycl::accessor{u2ba_buf, cgh, sycl::write_only};
auto u0 = sycl::accessor{u0_buf, cgh, sycl::read_only};
auto bna_ixyz = sycl::accessor{bna_ixyz_buf, cgh, sycl::read_only};

cgh.parallel_for<struct CopyABCs>(Nba, [=](sycl::id<1> id) {
auto nb = id[0];
u2ba[nb] = u0[bna_ixyz[nb]];
});
});

// Flip halo XY
queue.submit([&](sycl::handler& cgh) {
auto u1 = sycl::accessor{u1_buf, cgh, sycl::read_write};
cgh.parallel_for<struct FlipHaloXY>(sycl::range<2>(Nx, Ny), [=](sycl::id<2> id) {
auto const i = id[0] * NzNy + id[1] * Nz;
u1[i + 0] = u1[i + 2];
u1[i + Nz - 1] = u1[i + Nz - 3];
});
});

// Flip halo XZ
queue.submit([&](sycl::handler& cgh) {
auto u1 = sycl::accessor{u1_buf, cgh, sycl::read_write};
cgh.parallel_for<struct FlipHaloXZ>(sycl::range<2>(Nx, Nz), [=](sycl::id<2> id) {
auto const ix = id[0];
auto const iz = id[1];

u1[ix * NzNy + 0 * Nz + iz] = u1[ix * NzNy + 2 * Nz + iz];
u1[ix * NzNy + (Ny - 1) * Nz + iz] = u1[ix * NzNy + (Ny - 3) * Nz + iz];
});
});

// Flip halo YZ
queue.submit([&](sycl::handler& cgh) {
auto u1 = sycl::accessor{u1_buf, cgh, sycl::read_write};
cgh.parallel_for<struct FlipHaloYZ>(sycl::range<2>(Ny, Nz), [=](sycl::id<2> id) {
auto const iy = id[0];
auto const iz = id[1];

u1[0 * NzNy + iy * Nz + iz] = u1[2 * NzNy + iy * Nz + iz];
u1[(Nx - 1) * NzNy + iy * Nz + iz] = u1[(Nx - 3) * NzNy + iy * Nz + iz];
});
});

// Add sources
queue.submit([&](sycl::handler& cgh) {
auto u0 = sycl::accessor{u0_buf, cgh, sycl::read_write};
auto in_sigs = sycl::accessor{in_sigs_buf, cgh, sycl::read_only};
auto in_ixyz = sycl::accessor{in_ixyz_buf, cgh, sycl::read_only};
cgh.parallel_for<struct AddSources>(Ns, [=](sycl::id<1> id) {
auto const ns = id[0];
auto const ii = in_ixyz[ns];
u0[ii] += in_sigs[ns * Nt + n];
});
});

// Air update
queue.submit([&](sycl::handler& cgh) {
auto u0 = sycl::accessor{u0_buf, cgh, sycl::write_only};
auto u1 = sycl::accessor{u1_buf, cgh, sycl::read_only};
auto bn_mask = sycl::accessor{bn_mask_buf, cgh, sycl::read_only};
cgh.parallel_for<struct AirUpdateCart>(sycl::range<3>(Nx - 2, Ny - 2, Nz - 2), [=](sycl::id<3> id) {
auto const ix = id[0] + 1;
auto const iy = id[1] + 1;
auto const iz = id[2] + 1;
auto const ii = ix * NzNy + iy * Nz + iz;

if ((GET_BIT(bn_mask[ii >> 3], ii % 8)) != 0) {
return;
}

auto partial = a1 * u1[ii] - u0[ii];
partial += a2 * u1[ii + NzNy];
partial += a2 * u1[ii - NzNy];
partial += a2 * u1[ii + Nz];
partial += a2 * u1[ii - Nz];
partial += a2 * u1[ii + 1];
partial += a2 * u1[ii - 1];
u0[ii] = partial;
});
});

// ABC Loss
auto airEndEvent = queue.submit([&](sycl::handler& cgh) {
auto u0 = sycl::accessor{u0_buf, cgh, sycl::read_write};
auto u2ba = sycl::accessor{u2ba_buf, cgh, sycl::read_only};
auto Q_bna = sycl::accessor{Q_bna_buf, cgh, sycl::read_only};
auto bna_ixyz = sycl::accessor{bna_ixyz_buf, cgh, sycl::read_only};
cgh.parallel_for<struct LossABC>(Nba, [=](sycl::id<1> id) {
auto const nb = id[0];
auto const lQ = l * Q_bna[nb];
auto const ib = bna_ixyz[nb];
u0[ib] = (u0[ib] + lQ * u2ba[nb]) / (Real(1) + lQ);
});
});

// Read output
queue.submit([&](sycl::handler& cgh) {
auto u1 = sycl::accessor{u1_buf, cgh, sycl::read_only};
auto out_ixyz = sycl::accessor{out_ixyz_buf, cgh, sycl::read_only};
auto u_out = sycl::accessor{u_out_buf, cgh, sycl::write_only};
cgh.parallel_for<struct ReadOutput>(Nr, [=](sycl::id<1> id) {
auto const nr = id[0];
auto const ii = out_ixyz[nr];
u_out[nr * Nt + n] = static_cast<double>(u1[ii]);
});
});

queue.wait_and_throw();

std::swap(u0_buf, u1_buf);

auto tmp = u2b_buf;
u2b_buf = u1b_buf;
u1b_buf = u0b_buf;
u0b_buf = tmp;

auto const now = getTime();

auto const elapsed = Seconds(now - start);
auto const elapsedSample = Seconds(now - sampleStart);

auto const elapsedAirSample = elapsedTime(airStartEvent, airEndEvent);
elapsedAir += elapsedAirSample;

auto const elapsedBoundarySample = elapsedTime(boundaryStartEvent, boundaryEndEvent);
elapsedBoundary += elapsedBoundarySample;

print(ProgressReport{
.n = n,
.Nt = Nt,
.Npts = Npts,
.Nb = Nb,
.elapsed = elapsed.count(),
.elapsedSample = elapsedSample.count(),
.elapsedAir = elapsedAir.count(),
.elapsedSampleAir = elapsedAirSample.count(),
.elapsedBoundary = elapsedBoundary.count(),
.elapsedSampleBoundary = elapsedBoundarySample.count(),
.numWorkers = 1,
});
}

// Copy output to host
auto host = sycl::host_accessor{u_out_buf, sycl::read_only};
for (auto i{0UL}; i < static_cast<size_t>(Nr * Nt); ++i) {
sim.u_out[i] = host[i];
}

fmt::println("");

return 0.0;
}

} // namespace pffdtd
17 changes: 17 additions & 0 deletions src/cpp/pffdtd/engine_3d_sycl.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// SPDX-License-Identifier: MIT
// SPDX-FileCopyrightText: 2024 Tobias Hienzsch
#pragma once

#include "pffdtd/simulation_3d.hpp"

#if not defined(PFFDTD_HAS_SYCL)
#error "SYCL must be enabled in CMake via PFFDTD_ENABLE_SYCL_ACPP or PFFDTD_ENABLE_SYCL_ONEAPI"
#endif

namespace pffdtd {

struct Engine3DSYCL {
auto operator()(Simulation3D& sim) const -> double;
};

} // namespace pffdtd
Loading

0 comments on commit 6a67d30

Please sign in to comment.