From 487b7259ef1946bd777f2f9fac454d5328433aa9 Mon Sep 17 00:00:00 2001 From: Tobias Hienzsch Date: Sun, 29 Sep 2024 21:47:17 +0200 Subject: [PATCH] [cpp] Make precision a runtime parameter for sim3d --- run_2d.sh | 2 +- run_3d.sh | 4 +- src/cpp/main.cpp | 20 +--- src/cpp/pffdtd/engine_cpu_3d.cpp | 36 ++++-- src/cpp/pffdtd/engine_cpu_3d.hpp | 4 +- src/cpp/pffdtd/engine_cuda_3d.cu | 37 +++--- src/cpp/pffdtd/engine_cuda_3d.hpp | 3 +- src/cpp/pffdtd/engine_metal_2d.mm | 2 +- src/cpp/pffdtd/engine_metal_3d.hpp | 2 +- src/cpp/pffdtd/engine_metal_3d.mm | 7 +- src/cpp/pffdtd/engine_sycl_3d.cpp | 34 ++++-- src/cpp/pffdtd/engine_sycl_3d.hpp | 3 +- src/cpp/pffdtd/simulation_3d.cpp | 186 +++++++++++++++++++++-------- src/cpp/pffdtd/simulation_3d.hpp | 147 +++++++---------------- src/cpp/pffdtd/utility.hpp | 25 ++++ 15 files changed, 294 insertions(+), 218 deletions(-) diff --git a/run_2d.sh b/run_2d.sh index 1f72b23..6139f8e 100755 --- a/run_2d.sh +++ b/run_2d.sh @@ -28,7 +28,7 @@ cd "$model_dir" python "$sim_setup" # Run sim -DPCPP_CPU_PLACES=cores DPCPP_CPU_CU_AFFINITY=spread DPCPP_CPU_NUM_CUS=$jobs OMP_NUM_THREADS=$jobs "$engine_exe" sim2d -p 32 -s "$sim_dir" -e sycl +DPCPP_CPU_PLACES=cores DPCPP_CPU_CU_AFFINITY=spread DPCPP_CPU_NUM_CUS=$jobs OMP_NUM_THREADS=$jobs "$engine_exe" sim2d -p 64 -s "$sim_dir" -e sycl # pffdtd sim2d run --sim_dir "$sim_dir" --video # Post-process diff --git a/run_3d.sh b/run_3d.sh index 36e9ab0..dbefe8f 100755 --- a/run_3d.sh +++ b/run_3d.sh @@ -7,7 +7,7 @@ set -e root_dir="$(cd "$(dirname "$0")" && pwd)" pffdtd_engine="$root_dir/build/src/cpp/pffdtd-engine" -# pffdtd_engine="$root_dir/cmake-build-cuda/src/cpp/pffdtd-engine" +pffdtd_engine="$root_dir/cmake-build-cuda/src/cpp/pffdtd-engine" sim_name="Modes" sim_setup="${sim_name}.py" @@ -28,7 +28,7 @@ cd "$model_dir" pffdtd sim3d setup "$sim_setup" # Run sim -$pffdtd_engine sim3d -e cpu -p "32" -s "$sim_dir" +$pffdtd_engine sim3d -e cuda -p "64" -s "$sim_dir" # pffdtd sim3d engine --sim_dir="$sim_dir" --plot --draw_backend="mayavi" --json_model="${model_dir}/model.json" # Post-process diff --git a/src/cpp/main.cpp b/src/cpp/main.cpp index 7dac46a..a6b8d25 100644 --- a/src/cpp/main.cpp +++ b/src/cpp/main.cpp @@ -69,18 +69,15 @@ namespace { return engines; } -template [[nodiscard]] auto getEngines3D() { using namespace pffdtd; - auto engines = std::map const&)>>{}; + auto engines = std::map>{}; engines["cpu"] = EngineCPU3D{}; #if defined(PFFDTD_HAS_CUDA) engines["cuda"] = EngineCUDA3D{}; #endif #if defined(PFFDTD_HAS_METAL) - if constexpr (std::same_as) { - engines["metal"] = EngineMETAL3D{}; - } + engines["metal"] = EngineMETAL3D{}; #endif #if defined(PFFDTD_HAS_SYCL) engines["sycl"] = EngineSYCL3D{}; @@ -111,18 +108,17 @@ struct Arguments { Sim3D sim3d; }; -template auto run3D(Arguments::Sim3D const& args) { using namespace pffdtd; fmt::println("Running: {} on {} with precision {}", args.simDir, args.engine, toString(args.precision)); - auto const engines = getEngines3D(); + auto const engines = getEngines3D(); auto const& engine = engines.at(args.engine); auto const simDir = std::filesystem::path{args.simDir}; auto const start = getTime(); - auto sim = loadSimulation3D(simDir); + auto sim = loadSimulation3D(simDir, args.precision); scaleInput(sim); engine(sim); rescaleOutput(sim); @@ -177,13 +173,7 @@ auto main(int argc, char** argv) -> int { } if (*sim3d) { - if (args.sim3d.precision == pffdtd::Precision::Float) { - run3D(args.sim3d); - } else if (args.sim3d.precision == pffdtd::Precision::Double) { - run3D(args.sim3d); - } else { - pffdtd::raisef("invalid precision '{}'", toString(args.sim3d.precision)); - } + run3D(args.sim3d); } if (*test) { diff --git a/src/cpp/pffdtd/engine_cpu_3d.cpp b/src/cpp/pffdtd/engine_cpu_3d.cpp index bf510e7..df71a7c 100644 --- a/src/cpp/pffdtd/engine_cpu_3d.cpp +++ b/src/cpp/pffdtd/engine_cpu_3d.cpp @@ -4,6 +4,8 @@ #include "engine_cpu_3d.hpp" +#include "pffdtd/double.hpp" +#include "pffdtd/exception.hpp" #include "pffdtd/progress.hpp" #include "pffdtd/time.hpp" #include "pffdtd/utility.hpp" @@ -79,7 +81,11 @@ auto process_bnl_fd( } template -auto run(Simulation3D const& sim) -> void { +auto run(Simulation3D const& sim) -> void { + auto const ssaf_bnl_real = convertTo(sim.ssaf_bnl); + auto const mat_beta_real = convertTo(sim.mat_beta); + auto const mat_quads_real = convertTo(sim.mat_quads); + // keep local ints, scalars int64_t const Ns = sim.Ns; int64_t const Nr = sim.Nr; @@ -105,9 +111,9 @@ auto run(Simulation3D const& sim) -> void { int8_t const* mat_bnl = sim.mat_bnl.data(); int8_t const* Q_bna = sim.Q_bna.data(); double const* in_sigs = sim.in_sigs.data(); - Real const* ssaf_bnl = sim.ssaf_bnl.data(); - Real const* mat_beta = sim.mat_beta.data(); - MatQuad const* mat_quads = sim.mat_quads.data(); + Real const* ssaf_bnl = ssaf_bnl_real.data(); + Real const* mat_beta = mat_beta_real.data(); + MatQuad const* mat_quads = mat_quads_real.data(); double* u_out = sim.u_out.get(); // allocate memory @@ -130,11 +136,11 @@ auto run(Simulation3D const& sim) -> void { auto* gh1 = gh1_buf.data(); // sim coefficients - auto const lo2 = sim.lo2; - auto const sl2 = sim.sl2; - auto const l = sim.l; - auto const a1 = sim.a1; - auto const a2 = sim.a2; + auto const lo2 = static_cast(sim.lo2); + auto const sl2 = static_cast(sim.sl2); + auto const l = static_cast(sim.l); + auto const a1 = static_cast(sim.a1); + auto const a2 = static_cast(sim.a2); // can control outside with OMP_NUM_THREADS env variable int const numWorkers = omp_get_max_threads(); @@ -373,8 +379,14 @@ auto run(Simulation3D const& sim) -> void { } // namespace -auto EngineCPU3D::operator()(Simulation3D const& sim) const -> void { run(sim); } - -auto EngineCPU3D::operator()(Simulation3D const& sim) const -> void { run(sim); } +auto EngineCPU3D::operator()(Simulation3D const& sim) const -> void { + switch (sim.precision) { + case Precision::Float: return run(sim); + case Precision::Double: return run(sim); + case Precision::DoubleFloat: return run>(sim); + case Precision::DoubleDouble: return run>(sim); + default: raisef("invalid precision {}", static_cast(sim.precision)); + } +} } // namespace pffdtd diff --git a/src/cpp/pffdtd/engine_cpu_3d.hpp b/src/cpp/pffdtd/engine_cpu_3d.hpp index fe7a920..fd981c6 100644 --- a/src/cpp/pffdtd/engine_cpu_3d.hpp +++ b/src/cpp/pffdtd/engine_cpu_3d.hpp @@ -4,13 +4,13 @@ #pragma once +#include "pffdtd/precision.hpp" #include "pffdtd/simulation_3d.hpp" namespace pffdtd { struct EngineCPU3D { - auto operator()(Simulation3D const& sim) const -> void; - auto operator()(Simulation3D const& sim) const -> void; + auto operator()(Simulation3D const& sim) const -> void; }; } // namespace pffdtd diff --git a/src/cpp/pffdtd/engine_cuda_3d.cu b/src/cpp/pffdtd/engine_cuda_3d.cu index 1fbd453..74fec61 100644 --- a/src/cpp/pffdtd/engine_cuda_3d.cu +++ b/src/cpp/pffdtd/engine_cuda_3d.cu @@ -543,8 +543,7 @@ auto print_gpu_details(int i) -> uint64_t { } // input indices need to be sorted for multi-device allocation -template -void checkSorted(Simulation3D const& sim) { +void checkSorted(Simulation3D const& sim) { int64_t const* bn_ixyz = sim.bn_ixyz.data(); int64_t const* bnl_ixyz = sim.bnl_ixyz.data(); int64_t const* bna_ixyz = sim.bna_ixyz.data(); @@ -574,7 +573,7 @@ void checkSorted(Simulation3D const& sim) { // counts for splitting data across GPUs template -void splitData(Simulation3D const& sim, std::span> ghds) { +void splitData(Simulation3D const& sim, std::span> ghds) { auto const Nx = sim.Nx; auto const Ny = sim.Ny; auto const Nz = sim.Nz; @@ -724,7 +723,7 @@ void splitData(Simulation3D const& sim, std::span> ghds) { } template -auto run(Simulation3D const& sim) -> void { // NOLINT(readability-function-cognitive-complexity) +auto run(Simulation3D const& sim) -> void { // NOLINT(readability-function-cognitive-complexity) // if you want to test synchronous, env variable for that auto const* s = std::getenv("CUDA_LAUNCH_BLOCKING"); // NOLINT(concurrency-mt-unsafe) if (s != nullptr) { @@ -745,6 +744,10 @@ auto run(Simulation3D const& sim) -> void { // NOLINT(readability-function auto ghds = std::vector>(static_cast(ngpus)); auto gds = std::vector>(static_cast(ngpus)); + auto const ssaf_bnl_real = convertTo(sim.ssaf_bnl); + auto const mat_beta_real = convertTo(sim.mat_beta); + auto const mat_quads_real = convertTo(sim.mat_quads); + if (ngpus > 1) { checkSorted(sim); // needs to be sorted for multi-GPU } @@ -756,11 +759,11 @@ auto run(Simulation3D const& sim) -> void { // NOLINT(readability-function gds[gid].totalmembytes = print_gpu_details(gid); } - Real lo2 = sim.lo2; - Real a1 = sim.a1; - Real a2 = sim.a2; - Real l = sim.l; - Real sl2 = sim.sl2; + auto lo2 = static_cast(sim.lo2); + auto a1 = static_cast(sim.a1); + auto a2 = static_cast(sim.a2); + auto l = static_cast(sim.l); + auto sl2 = static_cast(sim.sl2); // timing stuff auto elapsed = std::chrono::nanoseconds{0}; @@ -829,7 +832,7 @@ auto run(Simulation3D const& sim) -> void { // NOLINT(readability-function // aliased pointers (to memory already allocated) host.in_sigs = sim.in_sigs.data() + Ns_read * sim.Nt; - host.ssaf_bnl = sim.ssaf_bnl.data() + Nbl_read; + host.ssaf_bnl = ssaf_bnl_real.data() + Nbl_read; host.adj_bn = sim.adj_bn.data() + Nb_read; host.mat_bnl = sim.mat_bnl.data() + Nbl_read; host.K_bn = sim.K_bn.data() + Nb_read; @@ -942,12 +945,12 @@ auto run(Simulation3D const& sim) -> void { // NOLINT(readability-function gpuErrchk(cudaMemcpy(gpu.mat_bnl, host.mat_bnl, (size_t)host.Nbl * sizeof(int8_t), cudaMemcpyHostToDevice)); gpuErrchk(cudaMalloc(&(gpu.mat_beta), (size_t)sim.Nm * sizeof(Real))); - gpuErrchk(cudaMemcpy(gpu.mat_beta, sim.mat_beta.data(), (size_t)sim.Nm * sizeof(Real), cudaMemcpyHostToDevice)); + gpuErrchk(cudaMemcpy(gpu.mat_beta, mat_beta_real.data(), (size_t)sim.Nm * sizeof(Real), cudaMemcpyHostToDevice)); gpuErrchk(cudaMalloc(&(gpu.mat_quads), (size_t)sim.Nm * MMb * sizeof(MatQuad))); gpuErrchk(cudaMemcpy( gpu.mat_quads, - sim.mat_quads.data(), + mat_quads_real.data(), (size_t)sim.Nm * MMb * sizeof(MatQuad), cudaMemcpyHostToDevice )); @@ -1407,8 +1410,12 @@ auto run(Simulation3D const& sim) -> void { // NOLINT(readability-function } // namespace -auto EngineCUDA3D::operator()(Simulation3D const& sim) const -> void { run(sim); } - -auto EngineCUDA3D::operator()(Simulation3D const& sim) const -> void { run(sim); } +auto EngineCUDA3D::operator()(Simulation3D const& sim) const -> void { + switch (sim.precision) { + case Precision::Float: return run(sim); + case Precision::Double: return run(sim); + default: throw std::invalid_argument("invalid precision " + std::to_string(static_cast(sim.precision))); + } +} } // namespace pffdtd diff --git a/src/cpp/pffdtd/engine_cuda_3d.hpp b/src/cpp/pffdtd/engine_cuda_3d.hpp index 8571d12..35fa6b8 100644 --- a/src/cpp/pffdtd/engine_cuda_3d.hpp +++ b/src/cpp/pffdtd/engine_cuda_3d.hpp @@ -13,8 +13,7 @@ namespace pffdtd { struct EngineCUDA3D { - auto operator()(Simulation3D const& sim) const -> void; - auto operator()(Simulation3D const& sim) const -> void; + auto operator()(Simulation3D const& sim) const -> void; }; } // namespace pffdtd diff --git a/src/cpp/pffdtd/engine_metal_2d.mm b/src/cpp/pffdtd/engine_metal_2d.mm index 59efa56..e7e9ae8 100644 --- a/src/cpp/pffdtd/engine_metal_2d.mm +++ b/src/cpp/pffdtd/engine_metal_2d.mm @@ -19,7 +19,7 @@ auto toFloat(std::vector const& buf) { auto buf32 = std::vector(buf.size()); - std::ranges::transform(buf32, buf32.begin(), [](auto v) { return static_cast(v); }); + std::ranges::transform(buf, buf32.begin(), [](auto v) { return static_cast(v); }); return buf32; } diff --git a/src/cpp/pffdtd/engine_metal_3d.hpp b/src/cpp/pffdtd/engine_metal_3d.hpp index c3f72b8..3c1bcaa 100644 --- a/src/cpp/pffdtd/engine_metal_3d.hpp +++ b/src/cpp/pffdtd/engine_metal_3d.hpp @@ -11,7 +11,7 @@ namespace pffdtd { struct EngineMETAL3D { - auto operator()(Simulation3D const& sim) const -> void; + auto operator()(Simulation3D const& sim) const -> void; }; } // namespace pffdtd diff --git a/src/cpp/pffdtd/engine_metal_3d.mm b/src/cpp/pffdtd/engine_metal_3d.mm index 47f2578..7a0e157 100644 --- a/src/cpp/pffdtd/engine_metal_3d.mm +++ b/src/cpp/pffdtd/engine_metal_3d.mm @@ -17,7 +17,7 @@ namespace { template -auto run(Simulation3D const& sim) { +auto run(Simulation3D const& sim) { @autoreleasepool { // Device @@ -286,6 +286,9 @@ auto run(Simulation3D const& sim) { } // namespace -auto EngineMETAL3D::operator()(Simulation3D const& sim) const -> void { run(sim); } +auto EngineMETAL3D::operator()(Simulation3D const& sim) const -> void { + PFFDTD_ASSERT(precision == Precision::Float); + run(sim); +} } // namespace pffdtd diff --git a/src/cpp/pffdtd/engine_sycl_3d.cpp b/src/cpp/pffdtd/engine_sycl_3d.cpp index bb7bb3a..5d0d2fd 100644 --- a/src/cpp/pffdtd/engine_sycl_3d.cpp +++ b/src/cpp/pffdtd/engine_sycl_3d.cpp @@ -3,6 +3,9 @@ #include "engine_sycl_3d.hpp" #include "pffdtd/assert.hpp" +#include "pffdtd/double.hpp" +#include "pffdtd/exception.hpp" +#include "pffdtd/precision.hpp" #include "pffdtd/progress.hpp" #include "pffdtd/sycl.hpp" #include "pffdtd/time.hpp" @@ -39,7 +42,7 @@ template struct ReadOutput; template -auto run(Simulation3D const& sim) -> void { +auto run(Simulation3D const& sim) -> void { PFFDTD_ASSERT(sim.grid == Grid::CART); auto queue = sycl::queue{sycl::property::queue::enable_profiling{}}; @@ -64,6 +67,10 @@ auto run(Simulation3D const& sim) -> void { auto const a1 = static_cast(sim.a1); auto const a2 = static_cast(sim.a2); + auto const ssaf_bnl_real = convertTo(sim.ssaf_bnl); + auto const mat_beta_real = convertTo(sim.mat_beta); + auto const mat_quads_real = convertTo(sim.mat_quads); + auto Q_bna_buf = sycl::buffer{sim.Q_bna}; auto bn_mask_buf = sycl::buffer{sim.bn_mask}; auto adj_bn_buf = sycl::buffer{sim.adj_bn}; @@ -73,11 +80,11 @@ auto run(Simulation3D const& sim) -> void { auto in_ixyz_buf = sycl::buffer{sim.in_ixyz}; auto out_ixyz_buf = sycl::buffer{sim.out_ixyz}; auto in_sigs_buf = sycl::buffer{sim.in_sigs}; - auto mat_beta_buf = sycl::buffer{sim.mat_beta}; + auto mat_beta_buf = sycl::buffer{mat_beta_real}; auto mat_bnl_buf = sycl::buffer{sim.mat_bnl}; - auto mat_quads_buf = sycl::buffer{sim.mat_quads}; + auto mat_quads_buf = sycl::buffer{mat_quads_real}; auto Mb_buf = sycl::buffer{sim.Mb}; - auto ssaf_bnl_buf = sycl::buffer{sim.ssaf_bnl}; + auto ssaf_bnl_buf = sycl::buffer{ssaf_bnl_real}; auto u0_buf = sycl::buffer(static_cast(Npts)); auto u1_buf = sycl::buffer(static_cast(Npts)); @@ -343,14 +350,27 @@ auto run(Simulation3D const& sim) -> void { // Copy output to host auto host = sycl::host_accessor{u_out_buf, sycl::read_only}; for (auto i{0UL}; i < static_cast(Nr * Nt); ++i) { - sim.u_out[i] = host[i]; + sim.u_out[i] = static_cast(host[i]); } } } // namespace -auto EngineSYCL3D::operator()(Simulation3D const& sim) const -> void { run(sim); } +auto EngineSYCL3D::operator()(Simulation3D const& sim) const -> void { + switch (sim.precision) { +#if defined(__APPLE__) or defined(__clang__) + case Precision::Half: return run<_Float16>(sim); + case Precision::DoubleHalf: return run>(sim); +#endif + + case Precision::Float: return run(sim); + case Precision::DoubleFloat: return run>(sim); + + case Precision::Double: return run(sim); + case Precision::DoubleDouble: return run>(sim); -auto EngineSYCL3D::operator()(Simulation3D const& sim) const -> void { run(sim); } + default: raisef("invalid precision {}", static_cast(sim.precision)); + } +} } // namespace pffdtd diff --git a/src/cpp/pffdtd/engine_sycl_3d.hpp b/src/cpp/pffdtd/engine_sycl_3d.hpp index ca45bd4..c5d3e8b 100644 --- a/src/cpp/pffdtd/engine_sycl_3d.hpp +++ b/src/cpp/pffdtd/engine_sycl_3d.hpp @@ -11,8 +11,7 @@ namespace pffdtd { struct EngineSYCL3D { - auto operator()(Simulation3D const& sim) const -> void; - auto operator()(Simulation3D const& sim) const -> void; + auto operator()(Simulation3D const& sim) const -> void; }; } // namespace pffdtd diff --git a/src/cpp/pffdtd/simulation_3d.cpp b/src/cpp/pffdtd/simulation_3d.cpp index 120b142..dd75776 100644 --- a/src/cpp/pffdtd/simulation_3d.cpp +++ b/src/cpp/pffdtd/simulation_3d.cpp @@ -3,6 +3,8 @@ #include "simulation_3d.hpp" +#include "pffdtd/double.hpp" +#include "pffdtd/exception.hpp" #include "pffdtd/hdf.hpp" #include @@ -23,6 +25,61 @@ constexpr auto EPS = 0.0; template<> constexpr auto EPS = 1.19209289e-07; +auto getEpsilon(Precision precision) { + switch (precision) { + case Precision::Half: return EPS; + case Precision::Float: return EPS; + case Precision::Double: return EPS; + + case Precision::DoubleHalf: return EPS; + case Precision::DoubleFloat: return EPS; + case Precision::DoubleDouble: return EPS; + + default: break; + } + + raisef("invalid precision = {}", int(precision)); +} + +template +static constexpr auto min_exponent = std::numeric_limits::min_exponent; + +template +static constexpr auto max_exponent = std::numeric_limits::max_exponent; + +template +static constexpr auto min_exponent> = min_exponent; + +template +static constexpr auto max_exponent> = max_exponent; + +#if defined(__APPLE__) or defined(__clang__) +template<> +static constexpr auto min_exponent<_Float16> = -13; + +template<> +static constexpr auto max_exponent<_Float16> = 16; +#endif + +auto getMinMaxExponent(Precision precision) { + switch (precision) { + case Precision::Float: return std::pair{min_exponent, max_exponent}; + case Precision::DoubleFloat: return std::pair{min_exponent, max_exponent}; + + case Precision::Double: return std::pair{min_exponent, max_exponent}; + case Precision::DoubleDouble: return std::pair{min_exponent, max_exponent}; + +#if defined(__APPLE__) or defined(__clang__) + case Precision::Half: return std::pair{min_exponent<_Float16>, max_exponent<_Float16>}; + case Precision::DoubleHalf: return std::pair{min_exponent<_Float16>, max_exponent<_Float16>}; +#endif + + default: break; + } + + raisef("invalid precision = {}", int(precision)); +} + // sort and return indices void sort_keys(int64_t* val_arr, int64_t* key_arr, int64_t N) { // for sorting int64 arrays and returning keys @@ -81,9 +138,7 @@ void check_inside_grid(int64_t const* idx, int64_t N, int64_t Nx, int64_t Ny, in // load the sim data from Python-written HDF5 files // NOLINTBEGIN(clang-analyzer-cplusplus.NewDeleteLeaks) -template -[[nodiscard]] auto loadSimulation3D_impl(std::filesystem::path const& simDir) -> Simulation3D { - +[[nodiscard]] auto loadSimulation3D_impl(std::filesystem::path const& simDir, Precision precision) -> Simulation3D { int expected_ndims = 0; hsize_t dims[2] = {}; @@ -120,22 +175,23 @@ template } // calculate some update coefficients - double const lfac = isFCC(grid) ? 0.25 : 1.0; // laplacian factor - double const dsl2 = (1.0 + EPS)*lfac * l2; // scale for stability (EPS in fdtd_common.hpp) - double const da1 = (2.0 - dsl2 * NN); // scaling for stability in single + double const eps = getEpsilon(precision); + double const lfac = isFCC(grid) ? 0.25 : 1.0; // laplacian factor + double const dsl2 = (1.0 + eps) * lfac * l2; // scale for stability (EPS in fdtd_common.hpp) + double const da1 = (2.0 - dsl2 * NN); // scaling for stability in single double const da2 = lfac * l2; - auto const a1 = static_cast(da1); - auto const a2 = static_cast(da2); - auto const sl2 = static_cast(dsl2); - auto const lo2 = static_cast(0.5 * l); + auto const a1 = static_cast(da1); + auto const a2 = static_cast(da2); + auto const sl2 = static_cast(dsl2); + auto const lo2 = static_cast(0.5 * l); fmt::println("a2 (double): {:.16g}", da2); - fmt::println("a2 (Real): {:.16g}", a2); + fmt::println("a2 (double): {:.16g}", a2); fmt::println("a1 (double): {:.16g}", da1); - fmt::println("a1 (Real): {:.16g}", a1); + fmt::println("a1 (double): {:.16g}", a1); fmt::println("sl2 (double): {:.16g}", dsl2); - fmt::println("sl2 (Real): {:.16g}", sl2); + fmt::println("sl2 (double): {:.16g}", sl2); fmt::println("l2={:.16g}", l2); fmt::println("NN={}", NN); @@ -185,12 +241,12 @@ template auto saf_bn = vox_out.read>("saf_bn"); PFFDTD_ASSERT(std::cmp_equal(saf_bn.size(), Nb)); - auto ssaf_bn = std::vector(size_t(Nb)); + auto ssaf_bn = std::vector(size_t(Nb)); for (int64_t i = 0; i < Nb; i++) { if (isFCC(grid)) { - ssaf_bn[i] = (Real)(0.5 / std::numbers::sqrt2) * saf_bn[i]; // rescale for S*h/V and cast + ssaf_bn[i] = (double)(0.5 / std::numbers::sqrt2) * saf_bn[i]; // rescale for S*h/V and cast } else { - ssaf_bn[i] = (Real)saf_bn[i]; // just cast + ssaf_bn[i] = (double)saf_bn[i]; // just cast } } @@ -233,7 +289,7 @@ template PFFDTD_ASSERT(std::cmp_equal(in_sigs.size(), Ns * Nt)); // not recommended to run single without differentiating input - if (sizeof(Real) == 4) { + if (sizeof(double) == 4) { PFFDTD_ASSERT(diff); } @@ -259,8 +315,8 @@ template ////////////////// // DEF (RLC) datasets ////////////////// - auto mat_beta = std::vector(size_t(Nm)); - auto mat_quads = std::vector>(static_cast(Nm) * size_t(MMb)); + auto mat_beta = std::vector(size_t(Nm)); + auto mat_quads = std::vector>(static_cast(Nm) * size_t(MMb)); for (int8_t i = 0; i < Nm; i++) { auto DEF = materials.read>(fmt::format("mat_{:02d}_DEF", i).c_str()); PFFDTD_ASSERT(Mb[i] <= MMb); @@ -287,11 +343,11 @@ template PFFDTD_ASSERT(not std::isnan(bd)); int32_t const mij = (int32_t)MMb * i + j; - mat_quads[mij].b = (Real)b; - mat_quads[mij].bd = (Real)bd; - mat_quads[mij].bDh = (Real)bDh; - mat_quads[mij].bFh = (Real)bFh; - mat_beta[i] += (Real)b; + mat_quads[mij].b = (double)b; + mat_quads[mij].bd = (double)bd; + mat_quads[mij].bDh = (double)bDh; + mat_quads[mij].bFh = (double)bFh; + mat_beta[i] += (double)b; } } @@ -393,7 +449,7 @@ template fmt::println("Nbl = {}", Nbl); auto mat_bnl = std::vector(static_cast(Nbl)); auto bnl_ixyz = std::vector(static_cast(Nbl)); - auto ssaf_bnl = std::vector(static_cast(Nbl)); + auto ssaf_bnl = std::vector(static_cast(Nbl)); { int64_t j = 0; for (int64_t i = 0; i < Nb; i++) { @@ -463,7 +519,7 @@ template } } - return Simulation3D{ + return Simulation3D{ .bn_ixyz = std::move(bn_ixyz), .bnl_ixyz = std::move(bnl_ixyz), .bna_ixyz = std::move(bna_ixyz), @@ -501,13 +557,61 @@ template .lo2 = lo2, .a2 = a2, .a1 = a1, + .precision = precision, }; } // NOLINTEND(clang-analyzer-cplusplus.NewDeleteLeaks) -template -void writeOutputs_impl(Simulation3D const& sim, std::filesystem::path const& simDir) { +} // namespace + +[[nodiscard]] auto loadSimulation3D(std::filesystem::path const& simDir, Precision precision) -> Simulation3D { + return loadSimulation3D_impl(simDir, precision); +} + +// scale input to be in middle of floating-point range +void scaleInput(Simulation3D& sim) { + auto* in_sigs = sim.in_sigs.data(); + int64_t const Nt = sim.Nt; + int64_t const Ns = sim.Ns; + + // normalise input signals (and save gain) + double max_in = 0.0; + for (int64_t n = 0; n < Nt; n++) { + for (int64_t ns = 0; ns < Ns; ns++) { + max_in = std::max(max_in, fabs(in_sigs[ns * Nt + n])); + } + } + + auto const [min_exp, max_exp] = std::pair{getMinMaxExponent(sim.precision)}; + + auto const aexp = 0.5; // normalise to middle power of two + auto const pow2 = static_cast(std::round(aexp * max_exp + (1 - aexp) * min_exp)); + auto const norm1 = std::pow(2.0, pow2); + auto const inv_infac = norm1 / max_in; + auto const infac = 1.0 / inv_infac; + + std::printf( + "max_in = %.16e, pow2 = %d, norm1 = %.16e, inv_infac = %.16e, infac = " + "%.16e\n", + max_in, + pow2, + norm1, + inv_infac, + infac + ); + + // normalise input data + for (int64_t ns = 0; ns < Ns; ns++) { + for (int64_t n = 0; n < Nt; n++) { + in_sigs[ns * Nt + n] *= inv_infac; + } + } + + sim.infac = infac; +} + +void writeOutputs(Simulation3D const& sim, std::filesystem::path const& simDir) { auto const* out_reorder = sim.out_reorder.data(); auto Nt = static_cast(sim.Nt); auto Nr = static_cast(sim.Nr); @@ -525,9 +629,7 @@ void writeOutputs_impl(Simulation3D const& sim, std::filesystem::path cons std::puts("wrote output dataset"); } -// print last samples of simulation (for correctness checking..) -template -void printLastSample_impl(Simulation3D const& sim) { +void printLastSample(Simulation3D const& sim) { int64_t const Nt = sim.Nt; int64_t const Nr = sim.Nr; double* u_out = sim.u_out.get(); @@ -542,26 +644,4 @@ void printLastSample_impl(Simulation3D const& sim) { } } -} // namespace - -[[nodiscard]] auto loadSimulation3D_float(std::filesystem::path const& simDir) -> Simulation3D { - return loadSimulation3D_impl(simDir); -} - -[[nodiscard]] auto loadSimulation3D_double(std::filesystem::path const& simDir) -> Simulation3D { - return loadSimulation3D_impl(simDir); -} - -void printLastSample(Simulation3D const& sim) { printLastSample_impl(sim); } - -void printLastSample(Simulation3D const& sim) { printLastSample_impl(sim); } - -void writeOutputs(Simulation3D const& sim, std::filesystem::path const& simDir) { - writeOutputs_impl(sim, simDir); -} - -void writeOutputs(Simulation3D const& sim, std::filesystem::path const& simDir) { - writeOutputs_impl(sim, simDir); -} - } // namespace pffdtd diff --git a/src/cpp/pffdtd/simulation_3d.hpp b/src/cpp/pffdtd/simulation_3d.hpp index 99b9d27..069aa9b 100644 --- a/src/cpp/pffdtd/simulation_3d.hpp +++ b/src/cpp/pffdtd/simulation_3d.hpp @@ -6,6 +6,7 @@ #include "pffdtd/assert.hpp" #include "pffdtd/mat_quad.hpp" +#include "pffdtd/precision.hpp" #include "pffdtd/utility.hpp" #include @@ -29,113 +30,53 @@ enum struct Grid : int8_t { [[nodiscard]] constexpr auto isFCC(Grid grid) noexcept -> bool { return grid != Grid::CART; } // main sim data, on host -template struct Simulation3D { - std::vector bn_ixyz; // boundary node indices - std::vector bnl_ixyz; // lossy boundary node indices - std::vector bna_ixyz; // absorbing boundary node indices - std::vector Q_bna; // integer for ABCs (wall 1,edge 2,corner 3) - std::vector in_ixyz; // input points - std::vector out_ixyz; // output points - std::vector out_reorder; // ordering for outputs point for final print/save - std::vector adj_bn; // nearest-neighbour adjancencies for all boundary nodes - std::vector ssaf_bnl; // surface area corrections (with extra volume scaling) - std::vector bn_mask; // bit mask for bounday nodes - std::vector mat_bnl; // material indices for lossy boundary nodes - std::vector K_bn; // number of adjacent neighbours, boundary nodesa - std::vector in_sigs; // input signals - std::unique_ptr u_out; // for output signals - int64_t Ns; // number of input grid points - int64_t Nr; // number of output grid points - int64_t Nt; // number of samples simulation - int64_t Npts; // number of Cartesian grid points - int64_t Nx; // x-dim (non-continguous) - int64_t Ny; // y-dim - int64_t Nz; // z-dim (continguous) - int64_t Nb; // number of boundary nodes - int64_t Nbl; // number of lossy boundary nodes - int64_t Nba; // number of ABC nodes - double l; // Courant number (CFL) - double l2; // CFL number squared - Grid grid; // Grid type - int8_t NN; // integer, neareast neighbours - int8_t Nm; // number of materials used - std::vector Mb; // number of branches per material - std::vector> mat_quads; // RLC coefficients (essentially) - std::vector mat_beta; // part of FD-boundaries one per material - double infac; // rescaling of input (for numerical reason) - Real sl2; // scaled l2 (for single precision) - Real lo2; // 0.5*l - Real a2; // update stencil coefficient - Real a1; // update stencil coefficient + std::vector bn_ixyz; // boundary node indices + std::vector bnl_ixyz; // lossy boundary node indices + std::vector bna_ixyz; // absorbing boundary node indices + std::vector Q_bna; // integer for ABCs (wall 1,edge 2,corner 3) + std::vector in_ixyz; // input points + std::vector out_ixyz; // output points + std::vector out_reorder; // ordering for outputs point for final print/save + std::vector adj_bn; // nearest-neighbour adjancencies for all boundary nodes + std::vector ssaf_bnl; // surface area corrections (with extra volume scaling) + std::vector bn_mask; // bit mask for bounday nodes + std::vector mat_bnl; // material indices for lossy boundary nodes + std::vector K_bn; // number of adjacent neighbours, boundary nodesa + std::vector in_sigs; // input signals + std::unique_ptr u_out; // for output signals + int64_t Ns; // number of input grid points + int64_t Nr; // number of output grid points + int64_t Nt; // number of samples simulation + int64_t Npts; // number of Cartesian grid points + int64_t Nx; // x-dim (non-continguous) + int64_t Ny; // y-dim + int64_t Nz; // z-dim (continguous) + int64_t Nb; // number of boundary nodes + int64_t Nbl; // number of lossy boundary nodes + int64_t Nba; // number of ABC nodes + double l; // Courant number (CFL) + double l2; // CFL number squared + Grid grid; // Grid type + int8_t NN; // integer, neareast neighbours + int8_t Nm; // number of materials used + std::vector Mb; // number of branches per material + std::vector> mat_quads; // RLC coefficients (essentially) + std::vector mat_beta; // part of FD-boundaries one per material + double infac; // rescaling of input (for numerical reason) + double sl2; // scaled l2 (for single precision) + double lo2; // 0.5*l + double a2; // update stencil coefficient + double a1; // update stencil coefficient + Precision precision; // Runtime floating-point precision }; -[[nodiscard]] auto loadSimulation3D_float(std::filesystem::path const& simDir) -> Simulation3D; -[[nodiscard]] auto loadSimulation3D_double(std::filesystem::path const& simDir) -> Simulation3D; +[[nodiscard]] auto loadSimulation3D(std::filesystem::path const& simDir, Precision precision) -> Simulation3D; +auto scaleInput(Simulation3D& sim) -> void; +auto printLastSample(Simulation3D const& sim) -> void; +auto writeOutputs(Simulation3D const& sim, std::filesystem::path const& simDir) -> void; -auto printLastSample(Simulation3D const& sim) -> void; -auto printLastSample(Simulation3D const& sim) -> void; - -auto writeOutputs(Simulation3D const& sim, std::filesystem::path const& simDir) -> void; -auto writeOutputs(Simulation3D const& sim, std::filesystem::path const& simDir) -> void; - -template -[[nodiscard]] auto loadSimulation3D(std::filesystem::path const& simDir) -> Simulation3D { - if constexpr (std::is_same_v) { - return loadSimulation3D_float(simDir); - } else if constexpr (std::is_same_v) { - return loadSimulation3D_double(simDir); - } else { - static_assert(always_false); - } -} - -// scale input to be in middle of floating-point range -template -void scaleInput(Simulation3D& sim) { - auto* in_sigs = sim.in_sigs.data(); - int64_t const Nt = sim.Nt; - int64_t const Ns = sim.Ns; - - // normalise input signals (and save gain) - double max_in = 0.0; - for (int64_t n = 0; n < Nt; n++) { - for (int64_t ns = 0; ns < Ns; ns++) { - max_in = std::max(max_in, fabs(in_sigs[ns * Nt + n])); - } - } - - constexpr auto min_exp = static_cast(std::numeric_limits::min_exponent); - constexpr auto max_exp = static_cast(std::numeric_limits::max_exponent); - - auto const aexp = 0.5; // normalise to middle power of two - auto const pow2 = static_cast(std::round(aexp * max_exp + (1 - aexp) * min_exp)); - auto const norm1 = std::pow(2.0, pow2); - auto const inv_infac = norm1 / max_in; - auto const infac = 1.0 / inv_infac; - - std::printf( - "max_in = %.16e, pow2 = %d, norm1 = %.16e, inv_infac = %.16e, infac = " - "%.16e\n", - max_in, - pow2, - norm1, - inv_infac, - infac - ); - - // normalise input data - for (int64_t ns = 0; ns < Ns; ns++) { - for (int64_t n = 0; n < Nt; n++) { - in_sigs[ns * Nt + n] *= inv_infac; - } - } - - sim.infac = infac; -} - -template -void rescaleOutput(Simulation3D& sim) { +inline void rescaleOutput(Simulation3D& sim) { int64_t const Nt = sim.Nt; int64_t const Nr = sim.Nr; double infac = sim.infac; diff --git a/src/cpp/pffdtd/utility.hpp b/src/cpp/pffdtd/utility.hpp index 4de282a..64f8936 100644 --- a/src/cpp/pffdtd/utility.hpp +++ b/src/cpp/pffdtd/utility.hpp @@ -3,12 +3,16 @@ // Misc function definitions not specific to FDTD simulation #pragma once +#include "pffdtd/mat_quad.hpp" + +#include #include #include #include #include #include #include +#include #ifndef DIV_CEIL #define DIV_CEIL(x, y) (((x) + (y) - 1) / (y)) // this works for x≥0 and y>0 @@ -31,4 +35,25 @@ template return static_cast(GET_BIT(word, pos)); } +template +auto convertTo(std::vector const& in) { + auto out = std::vector(in.size()); + std::ranges::transform(in, out.begin(), [](auto v) { return static_cast(v); }); + return out; +} + +template +auto convertTo(std::vector> const& in) { + auto out = std::vector>(in.size()); + std::ranges::transform(in, out.begin(), [](MatQuad mq) { + return MatQuad{ + .b = static_cast(mq.b), + .bd = static_cast(mq.bd), + .bDh = static_cast(mq.bDh), + .bFh = static_cast(mq.bFh), + }; + }); + return out; +} + } // namespace pffdtd