From 487b7259ef1946bd777f2f9fac454d5328433aa9 Mon Sep 17 00:00:00 2001
From: Tobias Hienzsch <post@tobias-hienzsch.de>
Date: Sun, 29 Sep 2024 21:47:17 +0200
Subject: [PATCH] [cpp] Make precision a runtime parameter for sim3d

---
 run_2d.sh                          |   2 +-
 run_3d.sh                          |   4 +-
 src/cpp/main.cpp                   |  20 +---
 src/cpp/pffdtd/engine_cpu_3d.cpp   |  36 ++++--
 src/cpp/pffdtd/engine_cpu_3d.hpp   |   4 +-
 src/cpp/pffdtd/engine_cuda_3d.cu   |  37 +++---
 src/cpp/pffdtd/engine_cuda_3d.hpp  |   3 +-
 src/cpp/pffdtd/engine_metal_2d.mm  |   2 +-
 src/cpp/pffdtd/engine_metal_3d.hpp |   2 +-
 src/cpp/pffdtd/engine_metal_3d.mm  |   7 +-
 src/cpp/pffdtd/engine_sycl_3d.cpp  |  34 ++++--
 src/cpp/pffdtd/engine_sycl_3d.hpp  |   3 +-
 src/cpp/pffdtd/simulation_3d.cpp   | 186 +++++++++++++++++++++--------
 src/cpp/pffdtd/simulation_3d.hpp   | 147 +++++++----------------
 src/cpp/pffdtd/utility.hpp         |  25 ++++
 15 files changed, 294 insertions(+), 218 deletions(-)
diff --git a/run_2d.sh b/run_2d.sh
index 1f72b23..6139f8e 100755
--- a/run_2d.sh
+++ b/run_2d.sh
@@ -28,7 +28,7 @@ cd "$model_dir"
 python "$sim_setup"
 
 # Run sim
-DPCPP_CPU_PLACES=cores DPCPP_CPU_CU_AFFINITY=spread DPCPP_CPU_NUM_CUS=$jobs OMP_NUM_THREADS=$jobs "$engine_exe" sim2d -p 32 -s "$sim_dir" -e sycl
+DPCPP_CPU_PLACES=cores DPCPP_CPU_CU_AFFINITY=spread DPCPP_CPU_NUM_CUS=$jobs OMP_NUM_THREADS=$jobs "$engine_exe" sim2d -p 64 -s "$sim_dir" -e sycl
 # pffdtd sim2d run --sim_dir "$sim_dir" --video
 
 # Post-process
diff --git a/run_3d.sh b/run_3d.sh
index 36e9ab0..dbefe8f 100755
--- a/run_3d.sh
+++ b/run_3d.sh
@@ -7,7 +7,7 @@ set -e
 
 root_dir="$(cd "$(dirname "$0")" && pwd)"
 pffdtd_engine="$root_dir/build/src/cpp/pffdtd-engine"
-# pffdtd_engine="$root_dir/cmake-build-cuda/src/cpp/pffdtd-engine"
+pffdtd_engine="$root_dir/cmake-build-cuda/src/cpp/pffdtd-engine"
 
 sim_name="Modes"
 sim_setup="${sim_name}.py"
@@ -28,7 +28,7 @@ cd "$model_dir"
 pffdtd sim3d setup "$sim_setup"
 
 # Run sim
-$pffdtd_engine sim3d -e cpu -p "32" -s "$sim_dir"
+$pffdtd_engine sim3d -e cuda -p "64" -s "$sim_dir"
 # pffdtd sim3d engine --sim_dir="$sim_dir" --plot --draw_backend="mayavi" --json_model="${model_dir}/model.json"
 
 # Post-process
diff --git a/src/cpp/main.cpp b/src/cpp/main.cpp
index 7dac46a..a6b8d25 100644
--- a/src/cpp/main.cpp
+++ b/src/cpp/main.cpp
@@ -69,18 +69,15 @@ namespace {
   return engines;
 }
 
-template<typename Real>
 [[nodiscard]] auto getEngines3D() {
   using namespace pffdtd;
-  auto engines   = std::map<std::string, std::function<void(Simulation3D<Real> const&)>>{};
+  auto engines   = std::map<std::string, std::function<void(Simulation3D const&)>>{};
   engines["cpu"] = EngineCPU3D{};
 #if defined(PFFDTD_HAS_CUDA)
   engines["cuda"] = EngineCUDA3D{};
 #endif
 #if defined(PFFDTD_HAS_METAL)
-  if constexpr (std::same_as<Real, float>) {
-    engines["metal"] = EngineMETAL3D{};
-  }
+  engines["metal"] = EngineMETAL3D{};
 #endif
 #if defined(PFFDTD_HAS_SYCL)
   engines["sycl"] = EngineSYCL3D{};
@@ -111,18 +108,17 @@ struct Arguments {
   Sim3D sim3d;
 };
 
-template<typename Real>
 auto run3D(Arguments::Sim3D const& args) {
   using namespace pffdtd;
   fmt::println("Running: {} on {} with precision {}", args.simDir, args.engine, toString(args.precision));
 
-  auto const engines = getEngines3D<Real>();
+  auto const engines = getEngines3D();
   auto const& engine = engines.at(args.engine);
 
   auto const simDir = std::filesystem::path{args.simDir};
   auto const start  = getTime();
 
-  auto sim = loadSimulation3D<Real>(simDir);
+  auto sim = loadSimulation3D(simDir, args.precision);
   scaleInput(sim);
   engine(sim);
   rescaleOutput(sim);
@@ -177,13 +173,7 @@ auto main(int argc, char** argv) -> int {
   }
 
   if (*sim3d) {
-    if (args.sim3d.precision == pffdtd::Precision::Float) {
-      run3D<float>(args.sim3d);
-    } else if (args.sim3d.precision == pffdtd::Precision::Double) {
-      run3D<double>(args.sim3d);
-    } else {
-      pffdtd::raisef<std::invalid_argument>("invalid precision '{}'", toString(args.sim3d.precision));
-    }
+    run3D(args.sim3d);
   }
 
   if (*test) {
diff --git a/src/cpp/pffdtd/engine_cpu_3d.cpp b/src/cpp/pffdtd/engine_cpu_3d.cpp
index bf510e7..df71a7c 100644
--- a/src/cpp/pffdtd/engine_cpu_3d.cpp
+++ b/src/cpp/pffdtd/engine_cpu_3d.cpp
@@ -4,6 +4,8 @@
 
 #include "engine_cpu_3d.hpp"
 
+#include "pffdtd/double.hpp"
+#include "pffdtd/exception.hpp"
 #include "pffdtd/progress.hpp"
 #include "pffdtd/time.hpp"
 #include "pffdtd/utility.hpp"
@@ -79,7 +81,11 @@ auto process_bnl_fd(
 }
 
 template<typename Real>
-auto run(Simulation3D<Real> const& sim) -> void {
+auto run(Simulation3D const& sim) -> void {
+  auto const ssaf_bnl_real  = convertTo<Real>(sim.ssaf_bnl);
+  auto const mat_beta_real  = convertTo<Real>(sim.mat_beta);
+  auto const mat_quads_real = convertTo<Real>(sim.mat_quads);
+
   // keep local ints, scalars
   int64_t const Ns   = sim.Ns;
   int64_t const Nr   = sim.Nr;
@@ -105,9 +111,9 @@ auto run(Simulation3D<Real> const& sim) -> void {
   int8_t const* mat_bnl          = sim.mat_bnl.data();
   int8_t const* Q_bna            = sim.Q_bna.data();
   double const* in_sigs          = sim.in_sigs.data();
-  Real const* ssaf_bnl           = sim.ssaf_bnl.data();
-  Real const* mat_beta           = sim.mat_beta.data();
-  MatQuad<Real> const* mat_quads = sim.mat_quads.data();
+  Real const* ssaf_bnl           = ssaf_bnl_real.data();
+  Real const* mat_beta           = mat_beta_real.data();
+  MatQuad<Real> const* mat_quads = mat_quads_real.data();
   double* u_out                  = sim.u_out.get();
 
   // allocate memory
@@ -130,11 +136,11 @@ auto run(Simulation3D<Real> const& sim) -> void {
   auto* gh1  = gh1_buf.data();
 
   // sim coefficients
-  auto const lo2 = sim.lo2;
-  auto const sl2 = sim.sl2;
-  auto const l   = sim.l;
-  auto const a1  = sim.a1;
-  auto const a2  = sim.a2;
+  auto const lo2 = static_cast<Real>(sim.lo2);
+  auto const sl2 = static_cast<Real>(sim.sl2);
+  auto const l   = static_cast<Real>(sim.l);
+  auto const a1  = static_cast<Real>(sim.a1);
+  auto const a2  = static_cast<Real>(sim.a2);
 
   // can control outside with OMP_NUM_THREADS env variable
   int const numWorkers = omp_get_max_threads();
@@ -373,8 +379,14 @@ auto run(Simulation3D<Real> const& sim) -> void {
 
 } // namespace
 
-auto EngineCPU3D::operator()(Simulation3D<float> const& sim) const -> void { run(sim); }
-
-auto EngineCPU3D::operator()(Simulation3D<double> const& sim) const -> void { run(sim); }
+auto EngineCPU3D::operator()(Simulation3D const& sim) const -> void {
+  switch (sim.precision) {
+    case Precision::Float: return run<float>(sim);
+    case Precision::Double: return run<double>(sim);
+    case Precision::DoubleFloat: return run<Double<float>>(sim);
+    case Precision::DoubleDouble: return run<Double<double>>(sim);
+    default: raisef<std::invalid_argument>("invalid precision {}", static_cast<int>(sim.precision));
+  }
+}
 
 } // namespace pffdtd
diff --git a/src/cpp/pffdtd/engine_cpu_3d.hpp b/src/cpp/pffdtd/engine_cpu_3d.hpp
index fe7a920..fd981c6 100644
--- a/src/cpp/pffdtd/engine_cpu_3d.hpp
+++ b/src/cpp/pffdtd/engine_cpu_3d.hpp
@@ -4,13 +4,13 @@
 
 #pragma once
 
+#include "pffdtd/precision.hpp"
 #include "pffdtd/simulation_3d.hpp"
 
 namespace pffdtd {
 
 struct EngineCPU3D {
-  auto operator()(Simulation3D<float> const& sim) const -> void;
-  auto operator()(Simulation3D<double> const& sim) const -> void;
+  auto operator()(Simulation3D const& sim) const -> void;
 };
 
 } // namespace pffdtd
diff --git a/src/cpp/pffdtd/engine_cuda_3d.cu b/src/cpp/pffdtd/engine_cuda_3d.cu
index 1fbd453..74fec61 100644
--- a/src/cpp/pffdtd/engine_cuda_3d.cu
+++ b/src/cpp/pffdtd/engine_cuda_3d.cu
@@ -543,8 +543,7 @@ auto print_gpu_details(int i) -> uint64_t {
 }
 
 // input indices need to be sorted for multi-device allocation
-template<typename Real>
-void checkSorted(Simulation3D<Real> const& sim) {
+void checkSorted(Simulation3D const& sim) {
   int64_t const* bn_ixyz  = sim.bn_ixyz.data();
   int64_t const* bnl_ixyz = sim.bnl_ixyz.data();
   int64_t const* bna_ixyz = sim.bna_ixyz.data();
@@ -574,7 +573,7 @@ void checkSorted(Simulation3D<Real> const& sim) {
 
 // counts for splitting data across GPUs
 template<typename Real>
-void splitData(Simulation3D<Real> const& sim, std::span<HostData<Real>> ghds) {
+void splitData(Simulation3D const& sim, std::span<HostData<Real>> ghds) {
   auto const Nx    = sim.Nx;
   auto const Ny    = sim.Ny;
   auto const Nz    = sim.Nz;
@@ -724,7 +723,7 @@ void splitData(Simulation3D<Real> const& sim, std::span<HostData<Real>> ghds) {
 }
 
 template<typename Real>
-auto run(Simulation3D<Real> const& sim) -> void { // NOLINT(readability-function-cognitive-complexity)
+auto run(Simulation3D const& sim) -> void { // NOLINT(readability-function-cognitive-complexity)
   // if you want to test synchronous, env variable for that
   auto const* s = std::getenv("CUDA_LAUNCH_BLOCKING"); // NOLINT(concurrency-mt-unsafe)
   if (s != nullptr) {
@@ -745,6 +744,10 @@ auto run(Simulation3D<Real> const& sim) -> void { // NOLINT(readability-function
   auto ghds = std::vector<HostData<Real>>(static_cast<size_t>(ngpus));
   auto gds  = std::vector<DeviceData<Real>>(static_cast<size_t>(ngpus));
 
+  auto const ssaf_bnl_real  = convertTo<Real>(sim.ssaf_bnl);
+  auto const mat_beta_real  = convertTo<Real>(sim.mat_beta);
+  auto const mat_quads_real = convertTo<Real>(sim.mat_quads);
+
   if (ngpus > 1) {
     checkSorted(sim); // needs to be sorted for multi-GPU
   }
@@ -756,11 +759,11 @@ auto run(Simulation3D<Real> const& sim) -> void { // NOLINT(readability-function
     gds[gid].totalmembytes = print_gpu_details(gid);
   }
 
-  Real lo2 = sim.lo2;
-  Real a1  = sim.a1;
-  Real a2  = sim.a2;
-  Real l   = sim.l;
-  Real sl2 = sim.sl2;
+  auto lo2 = static_cast<Real>(sim.lo2);
+  auto a1  = static_cast<Real>(sim.a1);
+  auto a2  = static_cast<Real>(sim.a2);
+  auto l   = static_cast<Real>(sim.l);
+  auto sl2 = static_cast<Real>(sim.sl2);
 
   // timing stuff
   auto elapsed               = std::chrono::nanoseconds{0};
@@ -829,7 +832,7 @@ auto run(Simulation3D<Real> const& sim) -> void { // NOLINT(readability-function
 
     // aliased pointers (to memory already allocated)
     host.in_sigs   = sim.in_sigs.data() + Ns_read * sim.Nt;
-    host.ssaf_bnl  = sim.ssaf_bnl.data() + Nbl_read;
+    host.ssaf_bnl  = ssaf_bnl_real.data() + Nbl_read;
     host.adj_bn    = sim.adj_bn.data() + Nb_read;
     host.mat_bnl   = sim.mat_bnl.data() + Nbl_read;
     host.K_bn      = sim.K_bn.data() + Nb_read;
@@ -942,12 +945,12 @@ auto run(Simulation3D<Real> const& sim) -> void { // NOLINT(readability-function
     gpuErrchk(cudaMemcpy(gpu.mat_bnl, host.mat_bnl, (size_t)host.Nbl * sizeof(int8_t), cudaMemcpyHostToDevice));
 
     gpuErrchk(cudaMalloc(&(gpu.mat_beta), (size_t)sim.Nm * sizeof(Real)));
-    gpuErrchk(cudaMemcpy(gpu.mat_beta, sim.mat_beta.data(), (size_t)sim.Nm * sizeof(Real), cudaMemcpyHostToDevice));
+    gpuErrchk(cudaMemcpy(gpu.mat_beta, mat_beta_real.data(), (size_t)sim.Nm * sizeof(Real), cudaMemcpyHostToDevice));
 
     gpuErrchk(cudaMalloc(&(gpu.mat_quads), (size_t)sim.Nm * MMb * sizeof(MatQuad<Real>)));
     gpuErrchk(cudaMemcpy(
         gpu.mat_quads,
-        sim.mat_quads.data(),
+        mat_quads_real.data(),
         (size_t)sim.Nm * MMb * sizeof(MatQuad<Real>),
         cudaMemcpyHostToDevice
     ));
@@ -1407,8 +1410,12 @@ auto run(Simulation3D<Real> const& sim) -> void { // NOLINT(readability-function
 
 } // namespace
 
-auto EngineCUDA3D::operator()(Simulation3D<float> const& sim) const -> void { run(sim); }
-
-auto EngineCUDA3D::operator()(Simulation3D<double> const& sim) const -> void { run(sim); }
+auto EngineCUDA3D::operator()(Simulation3D const& sim) const -> void {
+  switch (sim.precision) {
+    case Precision::Float: return run<float>(sim);
+    case Precision::Double: return run<double>(sim);
+    default: throw std::invalid_argument("invalid precision " + std::to_string(static_cast<int>(sim.precision)));
+  }
+}
 
 } // namespace pffdtd
diff --git a/src/cpp/pffdtd/engine_cuda_3d.hpp b/src/cpp/pffdtd/engine_cuda_3d.hpp
index 8571d12..35fa6b8 100644
--- a/src/cpp/pffdtd/engine_cuda_3d.hpp
+++ b/src/cpp/pffdtd/engine_cuda_3d.hpp
@@ -13,8 +13,7 @@
 namespace pffdtd {
 
 struct EngineCUDA3D {
-  auto operator()(Simulation3D<float> const& sim) const -> void;
-  auto operator()(Simulation3D<double> const& sim) const -> void;
+  auto operator()(Simulation3D const& sim) const -> void;
 };
 
 } // namespace pffdtd
diff --git a/src/cpp/pffdtd/engine_metal_2d.mm b/src/cpp/pffdtd/engine_metal_2d.mm
index 59efa56..e7e9ae8 100644
--- a/src/cpp/pffdtd/engine_metal_2d.mm
+++ b/src/cpp/pffdtd/engine_metal_2d.mm
@@ -19,7 +19,7 @@
 
 auto toFloat(std::vector<double> const& buf) {
   auto buf32 = std::vector<float>(buf.size());
-  std::ranges::transform(buf32, buf32.begin(), [](auto v) { return static_cast<float>(v); });
+  std::ranges::transform(buf, buf32.begin(), [](auto v) { return static_cast<float>(v); });
   return buf32;
 }
 
diff --git a/src/cpp/pffdtd/engine_metal_3d.hpp b/src/cpp/pffdtd/engine_metal_3d.hpp
index c3f72b8..3c1bcaa 100644
--- a/src/cpp/pffdtd/engine_metal_3d.hpp
+++ b/src/cpp/pffdtd/engine_metal_3d.hpp
@@ -11,7 +11,7 @@
 namespace pffdtd {
 
 struct EngineMETAL3D {
-  auto operator()(Simulation3D<float> const& sim) const -> void;
+  auto operator()(Simulation3D const& sim) const -> void;
 };
 
 } // namespace pffdtd
diff --git a/src/cpp/pffdtd/engine_metal_3d.mm b/src/cpp/pffdtd/engine_metal_3d.mm
index 47f2578..7a0e157 100644
--- a/src/cpp/pffdtd/engine_metal_3d.mm
+++ b/src/cpp/pffdtd/engine_metal_3d.mm
@@ -17,7 +17,7 @@
 namespace {
 
 template<typename Real>
-auto run(Simulation3D<Real> const& sim) {
+auto run(Simulation3D const& sim) {
   @autoreleasepool {
 
     // Device
@@ -286,6 +286,9 @@ auto run(Simulation3D<Real> const& sim) {
 
 } // namespace
 
-auto EngineMETAL3D::operator()(Simulation3D<float> const& sim) const -> void { run(sim); }
+auto EngineMETAL3D::operator()(Simulation3D const& sim) const -> void {
+  PFFDTD_ASSERT(precision == Precision::Float);
+  run<float>(sim);
+}
 
 } // namespace pffdtd
diff --git a/src/cpp/pffdtd/engine_sycl_3d.cpp b/src/cpp/pffdtd/engine_sycl_3d.cpp
index bb7bb3a..5d0d2fd 100644
--- a/src/cpp/pffdtd/engine_sycl_3d.cpp
+++ b/src/cpp/pffdtd/engine_sycl_3d.cpp
@@ -3,6 +3,9 @@
 #include "engine_sycl_3d.hpp"
 
 #include "pffdtd/assert.hpp"
+#include "pffdtd/double.hpp"
+#include "pffdtd/exception.hpp"
+#include "pffdtd/precision.hpp"
 #include "pffdtd/progress.hpp"
 #include "pffdtd/sycl.hpp"
 #include "pffdtd/time.hpp"
@@ -39,7 +42,7 @@ template<typename T>
 struct ReadOutput;
 
 template<typename Real>
-auto run(Simulation3D<Real> const& sim) -> void {
+auto run(Simulation3D const& sim) -> void {
   PFFDTD_ASSERT(sim.grid == Grid::CART);
 
   auto queue  = sycl::queue{sycl::property::queue::enable_profiling{}};
@@ -64,6 +67,10 @@ auto run(Simulation3D<Real> const& sim) -> void {
   auto const a1  = static_cast<Real>(sim.a1);
   auto const a2  = static_cast<Real>(sim.a2);
 
+  auto const ssaf_bnl_real  = convertTo<Real>(sim.ssaf_bnl);
+  auto const mat_beta_real  = convertTo<Real>(sim.mat_beta);
+  auto const mat_quads_real = convertTo<Real>(sim.mat_quads);
+
   auto Q_bna_buf     = sycl::buffer{sim.Q_bna};
   auto bn_mask_buf   = sycl::buffer{sim.bn_mask};
   auto adj_bn_buf    = sycl::buffer{sim.adj_bn};
@@ -73,11 +80,11 @@ auto run(Simulation3D<Real> const& sim) -> void {
   auto in_ixyz_buf   = sycl::buffer{sim.in_ixyz};
   auto out_ixyz_buf  = sycl::buffer{sim.out_ixyz};
   auto in_sigs_buf   = sycl::buffer{sim.in_sigs};
-  auto mat_beta_buf  = sycl::buffer{sim.mat_beta};
+  auto mat_beta_buf  = sycl::buffer{mat_beta_real};
   auto mat_bnl_buf   = sycl::buffer{sim.mat_bnl};
-  auto mat_quads_buf = sycl::buffer{sim.mat_quads};
+  auto mat_quads_buf = sycl::buffer{mat_quads_real};
   auto Mb_buf        = sycl::buffer{sim.Mb};
-  auto ssaf_bnl_buf  = sycl::buffer{sim.ssaf_bnl};
+  auto ssaf_bnl_buf  = sycl::buffer{ssaf_bnl_real};
 
   auto u0_buf    = sycl::buffer<Real>(static_cast<size_t>(Npts));
   auto u1_buf    = sycl::buffer<Real>(static_cast<size_t>(Npts));
@@ -343,14 +350,27 @@ auto run(Simulation3D<Real> const& sim) -> void {
   // Copy output to host
   auto host = sycl::host_accessor{u_out_buf, sycl::read_only};
   for (auto i{0UL}; i < static_cast<size_t>(Nr * Nt); ++i) {
-    sim.u_out[i] = host[i];
+    sim.u_out[i] = static_cast<double>(host[i]);
   }
 }
 
 } // namespace
 
-auto EngineSYCL3D::operator()(Simulation3D<float> const& sim) const -> void { run(sim); }
+auto EngineSYCL3D::operator()(Simulation3D const& sim) const -> void {
+  switch (sim.precision) {
+#if defined(__APPLE__) or defined(__clang__)
+    case Precision::Half: return run<_Float16>(sim);
+    case Precision::DoubleHalf: return run<Double<_Float16>>(sim);
+#endif
+
+    case Precision::Float: return run<float>(sim);
+    case Precision::DoubleFloat: return run<Double<float>>(sim);
+
+    case Precision::Double: return run<double>(sim);
+    case Precision::DoubleDouble: return run<Double<double>>(sim);
 
-auto EngineSYCL3D::operator()(Simulation3D<double> const& sim) const -> void { run(sim); }
+    default: raisef<std::invalid_argument>("invalid precision {}", static_cast<int>(sim.precision));
+  }
+}
 
 } // namespace pffdtd
diff --git a/src/cpp/pffdtd/engine_sycl_3d.hpp b/src/cpp/pffdtd/engine_sycl_3d.hpp
index ca45bd4..c5d3e8b 100644
--- a/src/cpp/pffdtd/engine_sycl_3d.hpp
+++ b/src/cpp/pffdtd/engine_sycl_3d.hpp
@@ -11,8 +11,7 @@
 namespace pffdtd {
 
 struct EngineSYCL3D {
-  auto operator()(Simulation3D<float> const& sim) const -> void;
-  auto operator()(Simulation3D<double> const& sim) const -> void;
+  auto operator()(Simulation3D const& sim) const -> void;
 };
 
 } // namespace pffdtd
diff --git a/src/cpp/pffdtd/simulation_3d.cpp b/src/cpp/pffdtd/simulation_3d.cpp
index 120b142..dd75776 100644
--- a/src/cpp/pffdtd/simulation_3d.cpp
+++ b/src/cpp/pffdtd/simulation_3d.cpp
@@ -3,6 +3,8 @@
 
 #include "simulation_3d.hpp"
 
+#include "pffdtd/double.hpp"
+#include "pffdtd/exception.hpp"
 #include "pffdtd/hdf.hpp"
 
 #include <fmt/format.h>
@@ -23,6 +25,61 @@ constexpr auto EPS = 0.0;
 template<>
 constexpr auto EPS<float> = 1.19209289e-07;
 
+auto getEpsilon(Precision precision) {
+  switch (precision) {
+    case Precision::Half: return EPS<float>;
+    case Precision::Float: return EPS<float>;
+    case Precision::Double: return EPS<double>;
+
+    case Precision::DoubleHalf: return EPS<float>;
+    case Precision::DoubleFloat: return EPS<float>;
+    case Precision::DoubleDouble: return EPS<double>;
+
+    default: break;
+  }
+
+  raisef<std::invalid_argument>("invalid precision = {}", int(precision));
+}
+
+template<typename Real>
+static constexpr auto min_exponent = std::numeric_limits<Real>::min_exponent;
+
+template<typename Real>
+static constexpr auto max_exponent = std::numeric_limits<Real>::max_exponent;
+
+template<typename Real>
+static constexpr auto min_exponent<Double<Real>> = min_exponent<Real>;
+
+template<typename Real>
+static constexpr auto max_exponent<Double<Real>> = max_exponent<Real>;
+
+#if defined(__APPLE__) or defined(__clang__)
+template<>
+static constexpr auto min_exponent<_Float16> = -13;
+
+template<>
+static constexpr auto max_exponent<_Float16> = 16;
+#endif
+
+auto getMinMaxExponent(Precision precision) {
+  switch (precision) {
+    case Precision::Float: return std::pair{min_exponent<float>, max_exponent<float>};
+    case Precision::DoubleFloat: return std::pair{min_exponent<float>, max_exponent<float>};
+
+    case Precision::Double: return std::pair{min_exponent<double>, max_exponent<double>};
+    case Precision::DoubleDouble: return std::pair{min_exponent<double>, max_exponent<double>};
+
+#if defined(__APPLE__) or defined(__clang__)
+    case Precision::Half: return std::pair{min_exponent<_Float16>, max_exponent<_Float16>};
+    case Precision::DoubleHalf: return std::pair{min_exponent<_Float16>, max_exponent<_Float16>};
+#endif
+
+    default: break;
+  }
+
+  raisef<std::invalid_argument>("invalid precision = {}", int(precision));
+}
+
 // sort and return indices
 void sort_keys(int64_t* val_arr, int64_t* key_arr, int64_t N) {
   // for sorting int64 arrays and returning keys
@@ -81,9 +138,7 @@ void check_inside_grid(int64_t const* idx, int64_t N, int64_t Nx, int64_t Ny, in
 
 // load the sim data from Python-written HDF5 files
 // NOLINTBEGIN(clang-analyzer-cplusplus.NewDeleteLeaks)
-template<typename Real>
-[[nodiscard]] auto loadSimulation3D_impl(std::filesystem::path const& simDir) -> Simulation3D<Real> {
-
+[[nodiscard]] auto loadSimulation3D_impl(std::filesystem::path const& simDir, Precision precision) -> Simulation3D {
   int expected_ndims = 0;
   hsize_t dims[2]    = {};
 
@@ -120,22 +175,23 @@ template<typename Real>
   }
 
   // calculate some update coefficients
-  double const lfac = isFCC(grid) ? 0.25 : 1.0;    // laplacian factor
-  double const dsl2 = (1.0 + EPS<Real>)*lfac * l2; // scale for stability (EPS in fdtd_common.hpp)
-  double const da1  = (2.0 - dsl2 * NN);           // scaling for stability in single
+  double const eps  = getEpsilon(precision);
+  double const lfac = isFCC(grid) ? 0.25 : 1.0; // laplacian factor
+  double const dsl2 = (1.0 + eps) * lfac * l2;  // scale for stability (EPS in fdtd_common.hpp)
+  double const da1  = (2.0 - dsl2 * NN);        // scaling for stability in single
   double const da2  = lfac * l2;
 
-  auto const a1  = static_cast<Real>(da1);
-  auto const a2  = static_cast<Real>(da2);
-  auto const sl2 = static_cast<Real>(dsl2);
-  auto const lo2 = static_cast<Real>(0.5 * l);
+  auto const a1  = static_cast<double>(da1);
+  auto const a2  = static_cast<double>(da2);
+  auto const sl2 = static_cast<double>(dsl2);
+  auto const lo2 = static_cast<double>(0.5 * l);
 
   fmt::println("a2 (double): {:.16g}", da2);
-  fmt::println("a2 (Real): {:.16g}", a2);
+  fmt::println("a2 (double): {:.16g}", a2);
   fmt::println("a1 (double): {:.16g}", da1);
-  fmt::println("a1 (Real): {:.16g}", a1);
+  fmt::println("a1 (double): {:.16g}", a1);
   fmt::println("sl2 (double): {:.16g}", dsl2);
-  fmt::println("sl2 (Real): {:.16g}", sl2);
+  fmt::println("sl2 (double): {:.16g}", sl2);
 
   fmt::println("l2={:.16g}", l2);
   fmt::println("NN={}", NN);
@@ -185,12 +241,12 @@ template<typename Real>
   auto saf_bn = vox_out.read<std::vector<double>>("saf_bn");
   PFFDTD_ASSERT(std::cmp_equal(saf_bn.size(), Nb));
 
-  auto ssaf_bn = std::vector<Real>(size_t(Nb));
+  auto ssaf_bn = std::vector<double>(size_t(Nb));
   for (int64_t i = 0; i < Nb; i++) {
     if (isFCC(grid)) {
-      ssaf_bn[i] = (Real)(0.5 / std::numbers::sqrt2) * saf_bn[i]; // rescale for S*h/V and cast
+      ssaf_bn[i] = (double)(0.5 / std::numbers::sqrt2) * saf_bn[i]; // rescale for S*h/V and cast
     } else {
-      ssaf_bn[i] = (Real)saf_bn[i]; // just cast
+      ssaf_bn[i] = (double)saf_bn[i]; // just cast
     }
   }
 
@@ -233,7 +289,7 @@ template<typename Real>
   PFFDTD_ASSERT(std::cmp_equal(in_sigs.size(), Ns * Nt));
 
   // not recommended to run single without differentiating input
-  if (sizeof(Real) == 4) {
+  if (sizeof(double) == 4) {
     PFFDTD_ASSERT(diff);
   }
 
@@ -259,8 +315,8 @@ template<typename Real>
   //////////////////
   // DEF (RLC) datasets
   //////////////////
-  auto mat_beta  = std::vector<Real>(size_t(Nm));
-  auto mat_quads = std::vector<MatQuad<Real>>(static_cast<unsigned long>(Nm) * size_t(MMb));
+  auto mat_beta  = std::vector<double>(size_t(Nm));
+  auto mat_quads = std::vector<MatQuad<double>>(static_cast<unsigned long>(Nm) * size_t(MMb));
   for (int8_t i = 0; i < Nm; i++) {
     auto DEF = materials.read<std::vector<double>>(fmt::format("mat_{:02d}_DEF", i).c_str());
     PFFDTD_ASSERT(Mb[i] <= MMb);
@@ -287,11 +343,11 @@ template<typename Real>
       PFFDTD_ASSERT(not std::isnan(bd));
 
       int32_t const mij  = (int32_t)MMb * i + j;
-      mat_quads[mij].b   = (Real)b;
-      mat_quads[mij].bd  = (Real)bd;
-      mat_quads[mij].bDh = (Real)bDh;
-      mat_quads[mij].bFh = (Real)bFh;
-      mat_beta[i] += (Real)b;
+      mat_quads[mij].b   = (double)b;
+      mat_quads[mij].bd  = (double)bd;
+      mat_quads[mij].bDh = (double)bDh;
+      mat_quads[mij].bFh = (double)bFh;
+      mat_beta[i] += (double)b;
     }
   }
 
@@ -393,7 +449,7 @@ template<typename Real>
   fmt::println("Nbl = {}", Nbl);
   auto mat_bnl  = std::vector<int8_t>(static_cast<size_t>(Nbl));
   auto bnl_ixyz = std::vector<int64_t>(static_cast<size_t>(Nbl));
-  auto ssaf_bnl = std::vector<Real>(static_cast<size_t>(Nbl));
+  auto ssaf_bnl = std::vector<double>(static_cast<size_t>(Nbl));
   {
     int64_t j = 0;
     for (int64_t i = 0; i < Nb; i++) {
@@ -463,7 +519,7 @@ template<typename Real>
     }
   }
 
-  return Simulation3D<Real>{
+  return Simulation3D{
       .bn_ixyz     = std::move(bn_ixyz),
       .bnl_ixyz    = std::move(bnl_ixyz),
       .bna_ixyz    = std::move(bna_ixyz),
@@ -501,13 +557,61 @@ template<typename Real>
       .lo2         = lo2,
       .a2          = a2,
       .a1          = a1,
+      .precision   = precision,
   };
 }
 
 // NOLINTEND(clang-analyzer-cplusplus.NewDeleteLeaks)
 
-template<typename Real>
-void writeOutputs_impl(Simulation3D<Real> const& sim, std::filesystem::path const& simDir) {
+} // namespace
+
+[[nodiscard]] auto loadSimulation3D(std::filesystem::path const& simDir, Precision precision) -> Simulation3D {
+  return loadSimulation3D_impl(simDir, precision);
+}
+
+// scale input to be in middle of floating-point range
+void scaleInput(Simulation3D& sim) {
+  auto* in_sigs    = sim.in_sigs.data();
+  int64_t const Nt = sim.Nt;
+  int64_t const Ns = sim.Ns;
+
+  // normalise input signals (and save gain)
+  double max_in = 0.0;
+  for (int64_t n = 0; n < Nt; n++) {
+    for (int64_t ns = 0; ns < Ns; ns++) {
+      max_in = std::max(max_in, fabs(in_sigs[ns * Nt + n]));
+    }
+  }
+
+  auto const [min_exp, max_exp] = std::pair<double, double>{getMinMaxExponent(sim.precision)};
+
+  auto const aexp      = 0.5; // normalise to middle power of two
+  auto const pow2      = static_cast<int32_t>(std::round(aexp * max_exp + (1 - aexp) * min_exp));
+  auto const norm1     = std::pow(2.0, pow2);
+  auto const inv_infac = norm1 / max_in;
+  auto const infac     = 1.0 / inv_infac;
+
+  std::printf(
+      "max_in = %.16e, pow2 = %d, norm1 = %.16e, inv_infac = %.16e, infac = "
+      "%.16e\n",
+      max_in,
+      pow2,
+      norm1,
+      inv_infac,
+      infac
+  );
+
+  // normalise input data
+  for (int64_t ns = 0; ns < Ns; ns++) {
+    for (int64_t n = 0; n < Nt; n++) {
+      in_sigs[ns * Nt + n] *= inv_infac;
+    }
+  }
+
+  sim.infac = infac;
+}
+
+void writeOutputs(Simulation3D const& sim, std::filesystem::path const& simDir) {
   auto const* out_reorder = sim.out_reorder.data();
   auto Nt                 = static_cast<size_t>(sim.Nt);
   auto Nr                 = static_cast<size_t>(sim.Nr);
@@ -525,9 +629,7 @@ void writeOutputs_impl(Simulation3D<Real> const& sim, std::filesystem::path cons
   std::puts("wrote output dataset");
 }
 
-// print last samples of simulation (for correctness checking..)
-template<typename Real>
-void printLastSample_impl(Simulation3D<Real> const& sim) {
+void printLastSample(Simulation3D const& sim) {
   int64_t const Nt           = sim.Nt;
   int64_t const Nr           = sim.Nr;
   double* u_out              = sim.u_out.get();
@@ -542,26 +644,4 @@ void printLastSample_impl(Simulation3D<Real> const& sim) {
   }
 }
 
-} // namespace
-
-[[nodiscard]] auto loadSimulation3D_float(std::filesystem::path const& simDir) -> Simulation3D<float> {
-  return loadSimulation3D_impl<float>(simDir);
-}
-
-[[nodiscard]] auto loadSimulation3D_double(std::filesystem::path const& simDir) -> Simulation3D<double> {
-  return loadSimulation3D_impl<double>(simDir);
-}
-
-void printLastSample(Simulation3D<float> const& sim) { printLastSample_impl(sim); }
-
-void printLastSample(Simulation3D<double> const& sim) { printLastSample_impl(sim); }
-
-void writeOutputs(Simulation3D<float> const& sim, std::filesystem::path const& simDir) {
-  writeOutputs_impl(sim, simDir);
-}
-
-void writeOutputs(Simulation3D<double> const& sim, std::filesystem::path const& simDir) {
-  writeOutputs_impl(sim, simDir);
-}
-
 } // namespace pffdtd
diff --git a/src/cpp/pffdtd/simulation_3d.hpp b/src/cpp/pffdtd/simulation_3d.hpp
index 99b9d27..069aa9b 100644
--- a/src/cpp/pffdtd/simulation_3d.hpp
+++ b/src/cpp/pffdtd/simulation_3d.hpp
@@ -6,6 +6,7 @@
 
 #include "pffdtd/assert.hpp"
 #include "pffdtd/mat_quad.hpp"
+#include "pffdtd/precision.hpp"
 #include "pffdtd/utility.hpp"
 
 #include <algorithm>
@@ -29,113 +30,53 @@ enum struct Grid : int8_t {
 [[nodiscard]] constexpr auto isFCC(Grid grid) noexcept -> bool { return grid != Grid::CART; }
 
 // main sim data, on host
-template<typename Real>
 struct Simulation3D {
-  std::vector<int64_t> bn_ixyz;         // boundary node indices
-  std::vector<int64_t> bnl_ixyz;        // lossy boundary node indices
-  std::vector<int64_t> bna_ixyz;        // absorbing boundary node indices
-  std::vector<int8_t> Q_bna;            // integer for ABCs (wall 1,edge 2,corner 3)
-  std::vector<int64_t> in_ixyz;         // input points
-  std::vector<int64_t> out_ixyz;        // output points
-  std::vector<int64_t> out_reorder;     // ordering for outputs point for final print/save
-  std::vector<uint16_t> adj_bn;         // nearest-neighbour adjancencies for all boundary nodes
-  std::vector<Real> ssaf_bnl;           // surface area corrections (with extra volume scaling)
-  std::vector<uint8_t> bn_mask;         // bit mask for bounday nodes
-  std::vector<int8_t> mat_bnl;          // material indices for lossy boundary nodes
-  std::vector<int8_t> K_bn;             // number of adjacent neighbours, boundary nodesa
-  std::vector<double> in_sigs;          // input signals
-  std::unique_ptr<double[]> u_out;      // for output signals
-  int64_t Ns;                           // number of input grid points
-  int64_t Nr;                           // number of output grid points
-  int64_t Nt;                           // number of samples simulation
-  int64_t Npts;                         // number of Cartesian grid points
-  int64_t Nx;                           // x-dim (non-continguous)
-  int64_t Ny;                           // y-dim
-  int64_t Nz;                           // z-dim (continguous)
-  int64_t Nb;                           // number of boundary nodes
-  int64_t Nbl;                          // number of lossy boundary nodes
-  int64_t Nba;                          // number of ABC nodes
-  double l;                             // Courant number (CFL)
-  double l2;                            // CFL number squared
-  Grid grid;                            // Grid type
-  int8_t NN;                            // integer, neareast neighbours
-  int8_t Nm;                            // number of materials used
-  std::vector<int8_t> Mb;               // number of branches per material
-  std::vector<MatQuad<Real>> mat_quads; // RLC coefficients (essentially)
-  std::vector<Real> mat_beta;           // part of FD-boundaries one per material
-  double infac;                         // rescaling of input (for numerical reason)
-  Real sl2;                             // scaled l2 (for single precision)
-  Real lo2;                             // 0.5*l
-  Real a2;                              // update stencil coefficient
-  Real a1;                              // update stencil coefficient
+  std::vector<int64_t> bn_ixyz;           // boundary node indices
+  std::vector<int64_t> bnl_ixyz;          // lossy boundary node indices
+  std::vector<int64_t> bna_ixyz;          // absorbing boundary node indices
+  std::vector<int8_t> Q_bna;              // integer for ABCs (wall 1,edge 2,corner 3)
+  std::vector<int64_t> in_ixyz;           // input points
+  std::vector<int64_t> out_ixyz;          // output points
+  std::vector<int64_t> out_reorder;       // ordering for outputs point for final print/save
+  std::vector<uint16_t> adj_bn;           // nearest-neighbour adjancencies for all boundary nodes
+  std::vector<double> ssaf_bnl;           // surface area corrections (with extra volume scaling)
+  std::vector<uint8_t> bn_mask;           // bit mask for bounday nodes
+  std::vector<int8_t> mat_bnl;            // material indices for lossy boundary nodes
+  std::vector<int8_t> K_bn;               // number of adjacent neighbours, boundary nodesa
+  std::vector<double> in_sigs;            // input signals
+  std::unique_ptr<double[]> u_out;        // for output signals
+  int64_t Ns;                             // number of input grid points
+  int64_t Nr;                             // number of output grid points
+  int64_t Nt;                             // number of samples simulation
+  int64_t Npts;                           // number of Cartesian grid points
+  int64_t Nx;                             // x-dim (non-continguous)
+  int64_t Ny;                             // y-dim
+  int64_t Nz;                             // z-dim (continguous)
+  int64_t Nb;                             // number of boundary nodes
+  int64_t Nbl;                            // number of lossy boundary nodes
+  int64_t Nba;                            // number of ABC nodes
+  double l;                               // Courant number (CFL)
+  double l2;                              // CFL number squared
+  Grid grid;                              // Grid type
+  int8_t NN;                              // integer, neareast neighbours
+  int8_t Nm;                              // number of materials used
+  std::vector<int8_t> Mb;                 // number of branches per material
+  std::vector<MatQuad<double>> mat_quads; // RLC coefficients (essentially)
+  std::vector<double> mat_beta;           // part of FD-boundaries one per material
+  double infac;                           // rescaling of input (for numerical reason)
+  double sl2;                             // scaled l2 (for single precision)
+  double lo2;                             // 0.5*l
+  double a2;                              // update stencil coefficient
+  double a1;                              // update stencil coefficient
+  Precision precision;                    // Runtime floating-point precision
 };
 
-[[nodiscard]] auto loadSimulation3D_float(std::filesystem::path const& simDir) -> Simulation3D<float>;
-[[nodiscard]] auto loadSimulation3D_double(std::filesystem::path const& simDir) -> Simulation3D<double>;
+[[nodiscard]] auto loadSimulation3D(std::filesystem::path const& simDir, Precision precision) -> Simulation3D;
+auto scaleInput(Simulation3D& sim) -> void;
+auto printLastSample(Simulation3D const& sim) -> void;
+auto writeOutputs(Simulation3D const& sim, std::filesystem::path const& simDir) -> void;
 
-auto printLastSample(Simulation3D<float> const& sim) -> void;
-auto printLastSample(Simulation3D<double> const& sim) -> void;
-
-auto writeOutputs(Simulation3D<float> const& sim, std::filesystem::path const& simDir) -> void;
-auto writeOutputs(Simulation3D<double> const& sim, std::filesystem::path const& simDir) -> void;
-
-template<typename Real>
-[[nodiscard]] auto loadSimulation3D(std::filesystem::path const& simDir) -> Simulation3D<Real> {
-  if constexpr (std::is_same_v<Real, float>) {
-    return loadSimulation3D_float(simDir);
-  } else if constexpr (std::is_same_v<Real, double>) {
-    return loadSimulation3D_double(simDir);
-  } else {
-    static_assert(always_false<Real>);
-  }
-}
-
-// scale input to be in middle of floating-point range
-template<typename Real>
-void scaleInput(Simulation3D<Real>& sim) {
-  auto* in_sigs    = sim.in_sigs.data();
-  int64_t const Nt = sim.Nt;
-  int64_t const Ns = sim.Ns;
-
-  // normalise input signals (and save gain)
-  double max_in = 0.0;
-  for (int64_t n = 0; n < Nt; n++) {
-    for (int64_t ns = 0; ns < Ns; ns++) {
-      max_in = std::max(max_in, fabs(in_sigs[ns * Nt + n]));
-    }
-  }
-
-  constexpr auto min_exp = static_cast<double>(std::numeric_limits<Real>::min_exponent);
-  constexpr auto max_exp = static_cast<double>(std::numeric_limits<Real>::max_exponent);
-
-  auto const aexp      = 0.5; // normalise to middle power of two
-  auto const pow2      = static_cast<int32_t>(std::round(aexp * max_exp + (1 - aexp) * min_exp));
-  auto const norm1     = std::pow(2.0, pow2);
-  auto const inv_infac = norm1 / max_in;
-  auto const infac     = 1.0 / inv_infac;
-
-  std::printf(
-      "max_in = %.16e, pow2 = %d, norm1 = %.16e, inv_infac = %.16e, infac = "
-      "%.16e\n",
-      max_in,
-      pow2,
-      norm1,
-      inv_infac,
-      infac
-  );
-
-  // normalise input data
-  for (int64_t ns = 0; ns < Ns; ns++) {
-    for (int64_t n = 0; n < Nt; n++) {
-      in_sigs[ns * Nt + n] *= inv_infac;
-    }
-  }
-
-  sim.infac = infac;
-}
-
-template<typename Real>
-void rescaleOutput(Simulation3D<Real>& sim) {
+inline void rescaleOutput(Simulation3D& sim) {
   int64_t const Nt = sim.Nt;
   int64_t const Nr = sim.Nr;
   double infac     = sim.infac;
diff --git a/src/cpp/pffdtd/utility.hpp b/src/cpp/pffdtd/utility.hpp
index 4de282a..64f8936 100644
--- a/src/cpp/pffdtd/utility.hpp
+++ b/src/cpp/pffdtd/utility.hpp
@@ -3,12 +3,16 @@
 // Misc function definitions not specific to FDTD simulation
 #pragma once
 
+#include "pffdtd/mat_quad.hpp"
+
+#include <algorithm>
 #include <concepts>
 #include <cstddef>
 #include <cstdint>
 #include <cstdlib>
 #include <cstring>
 #include <memory>
+#include <vector>
 
 #ifndef DIV_CEIL
   #define DIV_CEIL(x, y) (((x) + (y) - 1) / (y)) // this works for x≥0 and y>0
@@ -31,4 +35,25 @@ template<typename T>
   return static_cast<T>(GET_BIT(word, pos));
 }
 
+template<typename T>
+auto convertTo(std::vector<double> const& in) {
+  auto out = std::vector<T>(in.size());
+  std::ranges::transform(in, out.begin(), [](auto v) { return static_cast<T>(v); });
+  return out;
+}
+
+template<typename T>
+auto convertTo(std::vector<MatQuad<double>> const& in) {
+  auto out = std::vector<MatQuad<T>>(in.size());
+  std::ranges::transform(in, out.begin(), [](MatQuad<double> mq) {
+    return MatQuad<T>{
+        .b   = static_cast<T>(mq.b),
+        .bd  = static_cast<T>(mq.bd),
+        .bDh = static_cast<T>(mq.bDh),
+        .bFh = static_cast<T>(mq.bFh),
+    };
+  });
+  return out;
+}
+
 } // namespace pffdtd