diff --git a/src/cpp/pffdtd/engine_cpu_3d.cpp b/src/cpp/pffdtd/engine_cpu_3d.cpp
index f65923e..47a95e7 100644
--- a/src/cpp/pffdtd/engine_cpu_3d.cpp
+++ b/src/cpp/pffdtd/engine_cpu_3d.cpp
@@ -91,24 +91,24 @@ auto run(Simulation3D<Real> const& sd) -> void {
   int64_t const Nb   = sd.Nb;
   int64_t const Nbl  = sd.Nbl;
   int64_t const Nba  = sd.Nba;
-  int8_t* Mb         = sd.Mb;
 
   // keep local copies of pointers (style choice)
-  int64_t* bn_ixyz         = sd.bn_ixyz;
-  int64_t* bnl_ixyz        = sd.bnl_ixyz;
-  int64_t* bna_ixyz        = sd.bna_ixyz;
-  int64_t* in_ixyz         = sd.in_ixyz;
-  int64_t* out_ixyz        = sd.out_ixyz;
-  uint16_t* adj_bn         = sd.adj_bn;
-  uint8_t* bn_mask         = sd.bn_mask;
-  int8_t* mat_bnl          = sd.mat_bnl;
-  int8_t* Q_bna            = sd.Q_bna;
-  double* in_sigs          = sd.in_sigs;
-  double* u_out            = sd.u_out;
-  int8_t const fcc_flag    = sd.fcc_flag;
-  Real* ssaf_bnl           = sd.ssaf_bnl;
-  Real* mat_beta           = sd.mat_beta;
-  MatQuad<Real>* mat_quads = sd.mat_quads;
+  int8_t const* Mb               = sd.Mb.data();
+  int64_t const* bn_ixyz         = sd.bn_ixyz.data();
+  int64_t const* bnl_ixyz        = sd.bnl_ixyz.data();
+  int64_t const* bna_ixyz        = sd.bna_ixyz.data();
+  int64_t const* in_ixyz         = sd.in_ixyz.data();
+  int64_t const* out_ixyz        = sd.out_ixyz.data();
+  uint16_t const* adj_bn         = sd.adj_bn.data();
+  uint8_t const* bn_mask         = sd.bn_mask.data();
+  int8_t const* mat_bnl          = sd.mat_bnl.data();
+  int8_t* Q_bna                  = sd.Q_bna;
+  double const* in_sigs          = sd.in_sigs.data();
+  int8_t const fcc_flag          = sd.fcc_flag;
+  Real const* ssaf_bnl           = sd.ssaf_bnl.data();
+  Real const* mat_beta           = sd.mat_beta.data();
+  MatQuad<Real> const* mat_quads = sd.mat_quads.data();
+  double* u_out                  = sd.u_out.get();
 
   // allocate memory
   auto u0_buf   = std::vector<Real>(static_cast<size_t>(Npts));
diff --git a/src/cpp/pffdtd/engine_cuda_3d.cu b/src/cpp/pffdtd/engine_cuda_3d.cu
index 0e8d5e6..1eefbbb 100644
--- a/src/cpp/pffdtd/engine_cuda_3d.cu
+++ b/src/cpp/pffdtd/engine_cuda_3d.cu
@@ -106,21 +106,21 @@ __constant__ int8_t cuMb[MNm]; // to store Mb per mat
 // this is data on host, sometimes copied and recomputed for copy to GPU devices
 // (indices), sometimes just aliased pointers (scalar arrays)
 template<typename Real>
-struct HostData {      // arrays on host (for copy), mirrors gpu local data
-  double* in_sigs{};   // aliased
-  Real* u_out_buf{};   // aliased
-  double* u_out{};     // aliased
-  Real* ssaf_bnl{};    // aliased
-  int64_t* in_ixyz{};  // recomputed
-  int64_t* out_ixyz{}; // recomputed
-  int64_t* bn_ixyz{};  // recomputed
-  int64_t* bnl_ixyz{}; // recomputed
-  int64_t* bna_ixyz{}; // recomputed
-  int8_t* Q_bna{};     // aliased
-  uint16_t* adj_bn{};  // aliased
-  int8_t* mat_bnl{};   // aliased
-  uint8_t* bn_mask{};  // recomputed
-  int8_t* K_bn{};      // aliased
+struct HostData {           // arrays on host (for copy), mirrors gpu local data
+  double const* in_sigs{};  // aliased
+  Real* u_out_buf{};        // aliased
+  double* u_out{};          // aliased
+  Real const* ssaf_bnl{};   // aliased
+  int64_t* in_ixyz{};       // recomputed
+  int64_t* out_ixyz{};      // recomputed
+  int64_t* bn_ixyz{};       // recomputed
+  int64_t* bnl_ixyz{};      // recomputed
+  int64_t* bna_ixyz{};      // recomputed
+  int8_t* Q_bna{};          // aliased
+  uint16_t const* adj_bn{}; // aliased
+  int8_t const* mat_bnl{};  // aliased
+  uint8_t* bn_mask{};       // recomputed
+  int8_t const* K_bn{};     // aliased
   int64_t Ns{};
   int64_t Nr{};
   int64_t Npts{};
@@ -546,16 +546,16 @@ auto print_gpu_details(int i) -> uint64_t {
 // input indices need to be sorted for multi-device allocation
 template<typename Real>
 void checkSorted(Simulation3D<Real> const& sim) {
-  int64_t* bn_ixyz  = sim.bn_ixyz;
-  int64_t* bnl_ixyz = sim.bnl_ixyz;
-  int64_t* bna_ixyz = sim.bna_ixyz;
-  int64_t* in_ixyz  = sim.in_ixyz;
-  int64_t* out_ixyz = sim.out_ixyz;
-  int64_t const Nb  = sim.Nb;
-  int64_t const Nbl = sim.Nbl;
-  int64_t const Nba = sim.Nba;
-  int64_t const Ns  = sim.Ns;
-  int64_t const Nr  = sim.Nr;
+  int64_t const* bn_ixyz  = sim.bn_ixyz.data();
+  int64_t const* bnl_ixyz = sim.bnl_ixyz.data();
+  int64_t const* bna_ixyz = sim.bna_ixyz.data();
+  int64_t const* in_ixyz  = sim.in_ixyz.data();
+  int64_t const* out_ixyz = sim.out_ixyz.data();
+  int64_t const Nb        = sim.Nb;
+  int64_t const Nbl       = sim.Nbl;
+  int64_t const Nba       = sim.Nba;
+  int64_t const Ns        = sim.Ns;
+  int64_t const Nr        = sim.Nr;
   for (int64_t i = 1; i < Nb; i++) {
     PFFDTD_ASSERT(bn_ixyz[i] > bn_ixyz[i - 1]); // check save_gpu_folder
   }
@@ -624,8 +624,8 @@ void splitData(Simulation3D<Real> const& sim, std::span<HostData<Real>> ghds) {
   }
 
   // bn_ixyz - Nb
-  int64_t* bn_ixyz = sim.bn_ixyz;
-  int64_t const Nb = sim.Nb;
+  int64_t const* bn_ixyz = sim.bn_ixyz.data();
+  int64_t const Nb       = sim.Nb;
   {
     int gid = 0;
     for (int64_t i = 0; i < Nb; i++) {
@@ -644,8 +644,8 @@ void splitData(Simulation3D<Real> const& sim, std::span<HostData<Real>> ghds) {
   PFFDTD_ASSERT(Nb_check == Nb);
 
   // bnl_ixyz - Nbl
-  int64_t* bnl_ixyz = sim.bnl_ixyz;
-  int64_t const Nbl = sim.Nbl;
+  int64_t const* bnl_ixyz = sim.bnl_ixyz.data();
+  int64_t const Nbl       = sim.Nbl;
   {
     int gid = 0;
     for (int64_t i = 0; i < Nbl; i++) {
@@ -664,8 +664,8 @@ void splitData(Simulation3D<Real> const& sim, std::span<HostData<Real>> ghds) {
   PFFDTD_ASSERT(Nbl_check == Nbl);
 
   // bna_ixyz - Nba
-  int64_t* bna_ixyz = sim.bna_ixyz;
-  int64_t const Nba = sim.Nba;
+  int64_t const* bna_ixyz = sim.bna_ixyz.data();
+  int64_t const Nba       = sim.Nba;
   {
     int gid = 0;
     for (int64_t i = 0; i < Nba; i++) {
@@ -684,8 +684,8 @@ void splitData(Simulation3D<Real> const& sim, std::span<HostData<Real>> ghds) {
   PFFDTD_ASSERT(Nba_check == Nba);
 
   // in_ixyz - Ns
-  int64_t* in_ixyz = sim.in_ixyz;
-  int64_t const Ns = sim.Ns;
+  int64_t const* in_ixyz = sim.in_ixyz.data();
+  int64_t const Ns       = sim.Ns;
   {
     int gid = 0;
     for (int64_t i = 0; i < Ns; i++) {
@@ -704,8 +704,8 @@ void splitData(Simulation3D<Real> const& sim, std::span<HostData<Real>> ghds) {
   PFFDTD_ASSERT(Ns_check == Ns);
 
   // out_ixyz - Nr
-  int64_t* out_ixyz = sim.out_ixyz;
-  int64_t const Nr  = sim.Nr;
+  int64_t const* out_ixyz = sim.out_ixyz.data();
+  int64_t const Nr        = sim.Nr;
   {
     int gid = 0;
     for (int64_t i = 0; i < Nr; i++) {
@@ -829,13 +829,13 @@ auto run(Simulation3D<Real> const& sim) -> void {
     std::printf("Nx=%ld Ns=%ld Nr=%ld Nb=%ld, Npts=%ld\n", host.Nx, host.Ns, host.Nr, host.Nb, host.Npts);
 
     // aliased pointers (to memory already allocated)
-    host.in_sigs   = sim.in_sigs + Ns_read * sim.Nt;
-    host.ssaf_bnl  = sim.ssaf_bnl + Nbl_read;
-    host.adj_bn    = sim.adj_bn + Nb_read;
-    host.mat_bnl   = sim.mat_bnl + Nbl_read;
-    host.K_bn      = sim.K_bn + Nb_read;
+    host.in_sigs   = sim.in_sigs.data() + Ns_read * sim.Nt;
+    host.ssaf_bnl  = sim.ssaf_bnl.data() + Nbl_read;
+    host.adj_bn    = sim.adj_bn.data() + Nb_read;
+    host.mat_bnl   = sim.mat_bnl.data() + Nbl_read;
+    host.K_bn      = sim.K_bn.data() + Nb_read;
     host.Q_bna     = sim.Q_bna + Nba_read;
-    host.u_out     = sim.u_out + Nr_read * sim.Nt;
+    host.u_out     = sim.u_out.get() + Nr_read * sim.Nt;
     host.u_out_buf = u_out_buf + Nr_read;
 
     // recalculate indices, these are associated host versions to copy over to devices
@@ -941,12 +941,15 @@ auto run(Simulation3D<Real> const& sim) -> void {
     gpuErrchk(cudaMemcpy(gpu.mat_bnl, host.mat_bnl, (size_t)host.Nbl * sizeof(int8_t), cudaMemcpyHostToDevice));
 
     gpuErrchk(cudaMalloc(&(gpu.mat_beta), (size_t)sim.Nm * sizeof(Real)));
-    gpuErrchk(cudaMemcpy(gpu.mat_beta, sim.mat_beta, (size_t)sim.Nm * sizeof(Real), cudaMemcpyHostToDevice));
+    gpuErrchk(cudaMemcpy(gpu.mat_beta, sim.mat_beta.data(), (size_t)sim.Nm * sizeof(Real), cudaMemcpyHostToDevice));
 
     gpuErrchk(cudaMalloc(&(gpu.mat_quads), (size_t)sim.Nm * MMb * sizeof(MatQuad<Real>)));
-    gpuErrchk(
-        cudaMemcpy(gpu.mat_quads, sim.mat_quads, (size_t)sim.Nm * MMb * sizeof(MatQuad<Real>), cudaMemcpyHostToDevice)
-    );
+    gpuErrchk(cudaMemcpy(
+        gpu.mat_quads,
+        sim.mat_quads.data(),
+        (size_t)sim.Nm * MMb * sizeof(MatQuad<Real>),
+        cudaMemcpyHostToDevice
+    ));
 
     gpuErrchk(cudaMalloc(&(gpu.bn_mask), (size_t)(host.Nbm * sizeof(uint8_t))));
     gpuErrchk(cudaMemcpy(gpu.bn_mask, host.bn_mask, (size_t)host.Nbm * sizeof(uint8_t), cudaMemcpyHostToDevice));
@@ -969,15 +972,15 @@ auto run(Simulation3D<Real> const& sim) -> void {
     std::printf("\n");
 
     // swapping x and z here (CUDA has first dim contiguous)
+    // same for all devices
     gpuErrchk(cudaMemcpyToSymbol(cuNx, &(sim.Nz), sizeof(int64_t)));
     gpuErrchk(cudaMemcpyToSymbol(cuNy, &(sim.Ny), sizeof(int64_t)));
     gpuErrchk(cudaMemcpyToSymbol(cuNz, &(host.Nxh), sizeof(int64_t)));
     gpuErrchk(cudaMemcpyToSymbol(cuNb, &(host.Nb), sizeof(int64_t)));
     gpuErrchk(cudaMemcpyToSymbol(cuNbl, &(host.Nbl), sizeof(int64_t)));
     gpuErrchk(cudaMemcpyToSymbol(cuNba, &(host.Nba), sizeof(int64_t)));
-    gpuErrchk(cudaMemcpyToSymbol(cuMb, sim.Mb, sim.Nm * sizeof(int8_t)));
-    gpuErrchk(cudaMemcpyToSymbol(cuNxNy, &Nzy,
-                                 sizeof(int64_t))); // same for all devices
+    gpuErrchk(cudaMemcpyToSymbol(cuMb, sim.Mb.data(), sim.Nm * sizeof(int8_t)));
+    gpuErrchk(cudaMemcpyToSymbol(cuNxNy, &Nzy, sizeof(int64_t)));
 
     if constexpr (std::is_same_v<Real, float>) {
       gpuErrchk(cudaMemcpyToSymbol(c1_f32, &a1, sizeof(float)));
diff --git a/src/cpp/pffdtd/engine_sycl_3d.cpp b/src/cpp/pffdtd/engine_sycl_3d.cpp
index 4509fcf..8d0563d 100644
--- a/src/cpp/pffdtd/engine_sycl_3d.cpp
+++ b/src/cpp/pffdtd/engine_sycl_3d.cpp
@@ -54,7 +54,6 @@ auto run(Simulation3D<Real> const& sim) -> void {
   auto const Nb   = sim.Nb;
   auto const Nbl  = sim.Nbl;
   auto const Nba  = sim.Nba;
-  auto const Nm   = sim.Nm;
   auto const Nr   = sim.Nr;
   auto const Ns   = sim.Ns;
   auto const Nt   = sim.Nt;
@@ -66,29 +65,29 @@ auto run(Simulation3D<Real> const& sim) -> void {
   auto const a2  = static_cast<Real>(sim.a2);
 
   auto Q_bna_buf     = sycl::buffer{sim.Q_bna, sycl::range(size_t(Nba))};
-  auto bn_mask_buf   = sycl::buffer{sim.bn_mask, sycl::range(size_t(Npts))};
-  auto adj_bn_buf    = sycl::buffer{sim.adj_bn, sycl::range(size_t(Nb))};
-  auto bn_ixyz_buf   = sycl::buffer{sim.bn_ixyz, sycl::range(size_t(Nb))};
-  auto bnl_ixyz_buf  = sycl::buffer{sim.bnl_ixyz, sycl::range(size_t(Nb))};
-  auto bna_ixyz_buf  = sycl::buffer{sim.bna_ixyz, sycl::range(size_t(Nba))};
-  auto in_ixyz_buf   = sycl::buffer{sim.in_ixyz, sycl::range(size_t(Ns))};
-  auto out_ixyz_buf  = sycl::buffer{sim.out_ixyz, sycl::range(size_t(Nr))};
-  auto in_sigs_buf   = sycl::buffer{sim.in_sigs, sycl::range(size_t(Ns * Nt))};
-  auto mat_beta_buf  = sycl::buffer{sim.mat_beta, sycl::range(size_t(Nm))};
-  auto mat_bnl_buf   = sycl::buffer{sim.mat_bnl, sycl::range(size_t(Nbl))};
-  auto mat_quads_buf = sycl::buffer{sim.mat_quads, sycl::range(size_t(Nm * MMb))};
-  auto Mb_buf        = sycl::buffer{sim.Mb, sycl::range(size_t(Nm))};
-  auto ssaf_bnl_buf  = sycl::buffer{sim.ssaf_bnl, sycl::range(size_t(Nbl))};
-
-  auto u0_buf    = sycl::buffer<Real>(size_t(Npts));
-  auto u1_buf    = sycl::buffer<Real>(size_t(Npts));
-  auto u0b_buf   = sycl::buffer<Real>(size_t(Nbl));
-  auto u1b_buf   = sycl::buffer<Real>(size_t(Nbl));
-  auto u2b_buf   = sycl::buffer<Real>(size_t(Nbl));
-  auto u2ba_buf  = sycl::buffer<Real>(size_t(Nba));
-  auto vh1_buf   = sycl::buffer<Real>(size_t(Nbl * MMb));
-  auto gh1_buf   = sycl::buffer<Real>(size_t(Nbl * MMb));
-  auto u_out_buf = sycl::buffer<Real>(size_t(Nr * Nt));
+  auto bn_mask_buf   = sycl::buffer{sim.bn_mask};
+  auto adj_bn_buf    = sycl::buffer{sim.adj_bn};
+  auto bn_ixyz_buf   = sycl::buffer{sim.bn_ixyz};
+  auto bnl_ixyz_buf  = sycl::buffer{sim.bnl_ixyz};
+  auto bna_ixyz_buf  = sycl::buffer{sim.bna_ixyz};
+  auto in_ixyz_buf   = sycl::buffer{sim.in_ixyz};
+  auto out_ixyz_buf  = sycl::buffer{sim.out_ixyz};
+  auto in_sigs_buf   = sycl::buffer{sim.in_sigs};
+  auto mat_beta_buf  = sycl::buffer{sim.mat_beta};
+  auto mat_bnl_buf   = sycl::buffer{sim.mat_bnl};
+  auto mat_quads_buf = sycl::buffer{sim.mat_quads};
+  auto Mb_buf        = sycl::buffer{sim.Mb};
+  auto ssaf_bnl_buf  = sycl::buffer{sim.ssaf_bnl};
+
+  auto u0_buf    = sycl::buffer<Real>(static_cast<size_t>(Npts));
+  auto u1_buf    = sycl::buffer<Real>(static_cast<size_t>(Npts));
+  auto u0b_buf   = sycl::buffer<Real>(static_cast<size_t>(Nbl));
+  auto u1b_buf   = sycl::buffer<Real>(static_cast<size_t>(Nbl));
+  auto u2b_buf   = sycl::buffer<Real>(static_cast<size_t>(Nbl));
+  auto u2ba_buf  = sycl::buffer<Real>(static_cast<size_t>(Nba));
+  auto vh1_buf   = sycl::buffer<Real>(static_cast<size_t>(Nbl * MMb));
+  auto gh1_buf   = sycl::buffer<Real>(static_cast<size_t>(Nbl * MMb));
+  auto u_out_buf = sycl::buffer<Real>(static_cast<size_t>(Nr * Nt));
 
   auto elapsedAir      = std::chrono::nanoseconds{0};
   auto elapsedBoundary = std::chrono::nanoseconds{0};
diff --git a/src/cpp/pffdtd/hdf.hpp b/src/cpp/pffdtd/hdf.hpp
index c34ea6b..3d10052 100644
--- a/src/cpp/pffdtd/hdf.hpp
+++ b/src/cpp/pffdtd/hdf.hpp
@@ -79,35 +79,41 @@ struct H5FReader {
     auto set   = H5Dopen(_handle, dataset, H5P_DEFAULT);
     auto space = H5Dget_space(set);
 
-    auto ndims = 1;
+    // auto ndims = 1;
+    // PFFDTD_ASSERT(H5Sget_simple_extent_ndims(space) == ndims);
+
+    auto ndims = H5Sget_simple_extent_ndims(space);
     auto dims  = std::array<hsize_t, 3>{};
-    PFFDTD_ASSERT(H5Sget_simple_extent_ndims(space) == ndims);
     H5Sget_simple_extent_dims(space, dims.data(), nullptr);
 
     auto size = ndims == 1 ? dims[0] : dims[0] * dims[1];
 
-    if constexpr (std::is_same_v<T, uint8_t>) {
+    if constexpr (std::is_same_v<T, int64_t>) {
+      auto type = H5T_NATIVE_INT64;
+      auto buf  = std::vector<T>(size);
+      auto err  = H5Dread(set, type, H5S_ALL, H5S_ALL, H5P_DEFAULT, buf.data());
+      checkErrorAndCloseDataset(dataset, set, err);
+      return buf;
+    } else if constexpr (std::is_same_v<T, uint8_t>) {
       auto type = H5T_NATIVE_UINT8;
       auto buf  = std::vector<T>(size);
       auto err  = H5Dread(set, type, H5S_ALL, H5S_ALL, H5P_DEFAULT, buf.data());
       checkErrorAndCloseDataset(dataset, set, err);
       return buf;
-    }
-
-    if constexpr (std::is_same_v<T, int64_t>) {
-      auto type = H5T_NATIVE_INT64;
+    } else if constexpr (std::is_same_v<T, int8_t>) {
+      auto type = H5T_NATIVE_INT8;
       auto buf  = std::vector<T>(size);
       auto err  = H5Dread(set, type, H5S_ALL, H5S_ALL, H5P_DEFAULT, buf.data());
       checkErrorAndCloseDataset(dataset, set, err);
       return buf;
-    }
-
-    if constexpr (std::is_same_v<T, double>) {
+    } else if constexpr (std::is_same_v<T, double>) {
       auto type = H5T_NATIVE_DOUBLE;
       auto buf  = std::vector<T>(size);
       auto err  = H5Dread(set, type, H5S_ALL, H5S_ALL, H5P_DEFAULT, buf.data());
       checkErrorAndCloseDataset(dataset, set, err);
       return buf;
+    } else {
+      static_assert(always_false<T>);
     }
   }
 
diff --git a/src/cpp/pffdtd/simulation_3d.cpp b/src/cpp/pffdtd/simulation_3d.cpp
index c52ed0b..12d8828 100644
--- a/src/cpp/pffdtd/simulation_3d.cpp
+++ b/src/cpp/pffdtd/simulation_3d.cpp
@@ -70,7 +70,7 @@ void ind2sub3d(int64_t idx, int64_t Nx, int64_t Ny, int64_t Nz, int64_t* ix, int
 }
 
 // double check some index inside grid
-void check_inside_grid(int64_t* idx, int64_t N, int64_t Nx, int64_t Ny, int64_t Nz) {
+void check_inside_grid(int64_t const* idx, int64_t N, int64_t Nx, int64_t Ny, int64_t Nz) {
   for (int64_t i = 0; i < N; i++) {
     int64_t iz = 0;
     int64_t iy = 0;
@@ -169,9 +169,8 @@ template<typename Real>
   //////////////////
   // bn_ixyz dataset
   //////////////////
-  expected_ndims   = 1;
-  int64_t* bn_ixyz = read<int64_t>(vox_out, "bn_ixyz", expected_ndims, dims);
-  PFFDTD_ASSERT(static_cast<int64_t>(dims[0]) == Nb);
+  auto bn_ixyz = vox_out.read<std::vector<int64_t>>("bn_ixyz");
+  PFFDTD_ASSERT(std::cmp_equal(bn_ixyz.size(), Nb));
 
   //////////////////
   // adj_bn dataset
@@ -184,9 +183,8 @@ template<typename Real>
   //////////////////
   // mat_bn dataset
   //////////////////
-  expected_ndims = 1;
-  int8_t* mat_bn = read<int8_t>(vox_out, "mat_bn", expected_ndims, dims);
-  PFFDTD_ASSERT(static_cast<int64_t>(dims[0]) == Nb);
+  auto mat_bn = vox_out.read<std::vector<int8_t>>("mat_bn");
+  PFFDTD_ASSERT(std::cmp_equal(mat_bn.size(), Nb));
 
   //////////////////
   // saf_bn dataset
@@ -230,28 +228,23 @@ template<typename Real>
   //////////////////
   // in_ixyz dataset
   //////////////////
-  expected_ndims   = 1;
-  int64_t* in_ixyz = read<int64_t>(signals, "in_ixyz", expected_ndims, dims);
-  PFFDTD_ASSERT(static_cast<int64_t>(dims[0]) == Ns);
+  auto in_ixyz = signals.read<std::vector<int64_t>>("in_ixyz");
+  PFFDTD_ASSERT(std::cmp_equal(in_ixyz.size(), Ns));
 
   //////////////////
   // out_ixyz dataset
   //////////////////
-  expected_ndims    = 1;
-  int64_t* out_ixyz = read<int64_t>(signals, "out_ixyz", expected_ndims, dims);
-  PFFDTD_ASSERT(static_cast<int64_t>(dims[0]) == Nr);
+  auto out_ixyz = signals.read<std::vector<int64_t>>("out_ixyz");
+  PFFDTD_ASSERT(std::cmp_equal(out_ixyz.size(), Nr));
 
-  expected_ndims       = 1;
-  int64_t* out_reorder = read<int64_t>(signals, "out_reorder", expected_ndims, dims);
-  PFFDTD_ASSERT(static_cast<int64_t>(dims[0]) == Nr);
+  auto out_reorder = signals.read<std::vector<int64_t>>("out_reorder");
+  PFFDTD_ASSERT(std::cmp_equal(out_reorder.size(), Nr));
 
   //////////////////
   // in_sigs dataset
   //////////////////
-  expected_ndims  = 2;
-  double* in_sigs = read<double>(signals, "in_sigs", expected_ndims, dims);
-  PFFDTD_ASSERT(static_cast<int64_t>(dims[0]) == Ns);
-  PFFDTD_ASSERT(static_cast<int64_t>(dims[1]) == Nt);
+  auto in_sigs = signals.read<std::vector<double>>("in_sigs");
+  PFFDTD_ASSERT(std::cmp_equal(in_sigs.size(), Ns * Nt));
 
   // not recommended to run single without differentiating input
   if (sizeof(Real) == 4) {
@@ -275,8 +268,8 @@ template<typename Real>
   fmt::println("Nm={}", Nm);
   PFFDTD_ASSERT(Nm <= MNm);
 
-  expected_ndims = 1;
-  int8_t* Mb     = read<int8_t>(materials, "Mb", expected_ndims, dims);
+  auto Mb = materials.read<std::vector<int8_t>>("Mb");
+  PFFDTD_ASSERT(std::cmp_equal(Mb.size(), Nm));
 
   for (int8_t i = 0; i < Nm; i++) {
     fmt::println("Mb[{}]={}", i, Mb[i]);
@@ -285,8 +278,8 @@ template<typename Real>
   //////////////////
   // DEF (RLC) datasets
   //////////////////
-  auto* mat_beta  = allocate_zeros<Real>(Nm);
-  auto* mat_quads = allocate_zeros<MatQuad<Real>>(static_cast<unsigned long>(Nm) * MMb);
+  auto mat_beta  = std::vector<Real>(size_t(Nm));
+  auto mat_quads = std::vector<MatQuad<Real>>(static_cast<unsigned long>(Nm) * size_t(MMb));
   for (int8_t i = 0; i < Nm; i++) {
     expected_ndims = 2;
     auto* DEF      = read<double>(materials, fmt::format("mat_{:02d}_DEF", i).c_str(), expected_ndims, dims);
@@ -331,7 +324,7 @@ template<typename Real>
   //////////////////
   // check bn_ixyz
   //////////////////
-  check_inside_grid(bn_ixyz, Nb, Nx, Ny, Nz);
+  check_inside_grid(bn_ixyz.data(), Nb, Nx, Ny, Nz);
   fmt::println("bn_ixyz checked");
 
   //////////////////
@@ -355,7 +348,7 @@ template<typename Real>
   //////////////////
   // bit-pack and check adj_bn
   //////////////////
-  auto* adj_bn = allocate_zeros<uint16_t>(Nb);
+  auto adj_bn = std::vector<uint16_t>(size_t(Nb));
 
   for (int64_t i = 0; i < Nb; i++) {
     for (int8_t j = 0; j < NN; j++) {
@@ -375,8 +368,7 @@ template<typename Real>
   //////////////////
   // calculate K_bn from adj_bn
   //////////////////
-  auto* K_bn = allocate_zeros<int8_t>(Nb);
-
+  auto K_bn = std::vector<int8_t>(size_t(Nb));
   for (int64_t nb = 0; nb < Nb; nb++) {
     K_bn[nb] = 0;
     for (uint8_t nn = 0; nn < NN; nn++) {
@@ -390,14 +382,14 @@ template<typename Real>
   //////////////////
   // make compressed bit-mask
   int64_t const Nbm = (Npts - 1) / 8 + 1;
-  auto* bn_mask     = allocate_zeros<uint8_t>(Nbm); // one bit per
+  auto bn_mask      = std::vector<uint8_t>(size_t(Nbm)); // one bit per
   for (int64_t i = 0; i < Nb; i++) {
     int64_t const ii = bn_ixyz[i];
     SET_BIT(bn_mask[ii >> 3], ii % 8);
   }
 
   // create bn_mask_raw to double check
-  bool* bn_mask_raw = allocate_zeros<bool>(Npts);
+  auto bn_mask_raw = std::vector<bool>(size_t(Npts));
 
   for (int64_t i = 0; i < Nb; i++) {
     int64_t const ii = bn_ixyz[i];
@@ -415,7 +407,6 @@ template<typename Real>
     }
   }
   fmt::println("bn_mask double checked");
-  delete[] bn_mask_raw;
 
   // count Nbl
   int64_t Nbl = 0;
@@ -423,9 +414,9 @@ template<typename Real>
     Nbl += static_cast<int64_t>(mat_bn[i] >= 0);
   }
   fmt::println("Nbl = {}", Nbl);
-  auto* mat_bnl  = allocate_zeros<int8_t>(Nbl);
-  auto* bnl_ixyz = allocate_zeros<int64_t>(Nbl);
-  auto* ssaf_bnl = allocate_zeros<Real>(Nbl);
+  auto mat_bnl  = std::vector<int8_t>(static_cast<size_t>(Nbl));
+  auto bnl_ixyz = std::vector<int64_t>(static_cast<size_t>(Nbl));
+  auto ssaf_bnl = std::vector<Real>(static_cast<size_t>(Nbl));
   {
     int64_t j = 0;
     for (int64_t i = 0; i < Nb; i++) {
@@ -438,7 +429,6 @@ template<typename Real>
     }
     PFFDTD_ASSERT(j == Nbl);
   }
-  delete[] mat_bn;
   delete[] ssaf_bn;
 
   fmt::println("separated non-rigid bn");
@@ -450,8 +440,8 @@ template<typename Real>
     Nba /= 2;
   }
 
-  auto* bna_ixyz = allocate_zeros<int64_t>(Nba);
-  auto* Q_bna    = allocate_zeros<int8_t>(Nba);
+  auto bna_ixyz = std::vector<int64_t>(size_t(Nba));
+  auto* Q_bna   = allocate_zeros<int8_t>(Nba);
   {
     int64_t ii = 0;
     for (int64_t ix = 1; ix < Nx - 1; ix++) {
@@ -482,7 +472,7 @@ template<typename Real>
     fmt::println("ABC nodes");
     if (fcc_flag == 2) { // need to sort bna_ixyz
       auto* bna_sort_keys = allocate_zeros<int64_t>(Nba);
-      sort_keys(bna_ixyz, bna_sort_keys, Nba);
+      sort_keys(bna_ixyz.data(), bna_sort_keys, Nba);
 
       // now sort corresponding Q_bna array
       int8_t* Q_bna_unsorted = nullptr;
@@ -501,20 +491,20 @@ template<typename Real>
   }
 
   return Simulation3D<Real>{
-      .bn_ixyz     = bn_ixyz,
-      .bnl_ixyz    = bnl_ixyz,
-      .bna_ixyz    = bna_ixyz,
+      .bn_ixyz     = std::move(bn_ixyz),
+      .bnl_ixyz    = std::move(bnl_ixyz),
+      .bna_ixyz    = std::move(bna_ixyz),
       .Q_bna       = Q_bna,
-      .in_ixyz     = in_ixyz,
-      .out_ixyz    = out_ixyz,
-      .out_reorder = out_reorder,
-      .adj_bn      = adj_bn,
-      .ssaf_bnl    = ssaf_bnl,
-      .bn_mask     = bn_mask,
-      .mat_bnl     = mat_bnl,
-      .K_bn        = K_bn,
-      .in_sigs     = in_sigs,
-      .u_out       = allocate_zeros<double>(Nr * Nt),
+      .in_ixyz     = std::move(in_ixyz),
+      .out_ixyz    = std::move(out_ixyz),
+      .out_reorder = std::move(out_reorder),
+      .adj_bn      = std::move(adj_bn),
+      .ssaf_bnl    = std::move(ssaf_bnl),
+      .bn_mask     = std::move(bn_mask),
+      .mat_bnl     = std::move(mat_bnl),
+      .K_bn        = std::move(K_bn),
+      .in_sigs     = std::move(in_sigs),
+      .u_out       = std::unique_ptr<double[]>{allocate_zeros<double>(Nr * Nt)},
       .Ns          = Ns,
       .Nr          = Nr,
       .Nt          = Nt,
@@ -530,9 +520,9 @@ template<typename Real>
       .fcc_flag    = fcc_flag,
       .NN          = NN,
       .Nm          = Nm,
-      .Mb          = Mb,
-      .mat_quads   = mat_quads,
-      .mat_beta    = mat_beta,
+      .Mb          = std::move(Mb),
+      .mat_quads   = std::move(mat_quads),
+      .mat_beta    = std::move(mat_beta),
       .infac       = 1.0,
       .sl2         = sl2,
       .lo2         = lo2,
@@ -543,10 +533,10 @@ template<typename Real>
 
 template<typename Real>
 void writeOutputs_impl(Simulation3D<Real> const& sim, std::filesystem::path const& simDir) {
-  auto Nt           = static_cast<size_t>(sim.Nt);
-  auto Nr           = static_cast<size_t>(sim.Nr);
-  auto* out_reorder = sim.out_reorder;
-  auto u_out        = stdex::mdarray<double, stdex::dextents<size_t, 2>>(Nr, Nt);
+  auto const* out_reorder = sim.out_reorder.data();
+  auto Nt                 = static_cast<size_t>(sim.Nt);
+  auto Nr                 = static_cast<size_t>(sim.Nr);
+  auto u_out              = stdex::mdarray<double, stdex::dextents<size_t, 2>>(Nr, Nt);
 
   // write outputs in correct order
   for (auto nr = size_t{0}; nr < Nr; ++nr) {
@@ -563,10 +553,10 @@ void writeOutputs_impl(Simulation3D<Real> const& sim, std::filesystem::path cons
 // print last samples of simulation (for correctness checking..)
 template<typename Real>
 void printLastSample_impl(Simulation3D<Real> const& sim) {
-  int64_t const Nt     = sim.Nt;
-  int64_t const Nr     = sim.Nr;
-  double* u_out        = sim.u_out;
-  int64_t* out_reorder = sim.out_reorder;
+  int64_t const Nt           = sim.Nt;
+  int64_t const Nr           = sim.Nr;
+  double* u_out              = sim.u_out.get();
+  int64_t const* out_reorder = sim.out_reorder.data();
   // print last samples
   fmt::println("RAW OUTPUTS");
   for (int64_t nr = 0; nr < Nr; nr++) {
diff --git a/src/cpp/pffdtd/simulation_3d.hpp b/src/cpp/pffdtd/simulation_3d.hpp
index ee4bcc8..b58577e 100644
--- a/src/cpp/pffdtd/simulation_3d.hpp
+++ b/src/cpp/pffdtd/simulation_3d.hpp
@@ -15,6 +15,7 @@
 #include <cstring>
 #include <filesystem>
 #include <limits>
+#include <vector>
 
 namespace pffdtd {
 
@@ -33,43 +34,43 @@ struct MatQuad {
 // main sim data, on host
 template<typename Real>
 struct Simulation3D {
-  int64_t* bn_ixyz;         // boundary node indices
-  int64_t* bnl_ixyz;        // lossy boundary node indices
-  int64_t* bna_ixyz;        // absorbing boundary node indices
-  int8_t* Q_bna;            // integer for ABCs (wall 1,edge 2,corner 3)
-  int64_t* in_ixyz;         // input points
-  int64_t* out_ixyz;        // output points
-  int64_t* out_reorder;     // ordering for outputs point for final print/save
-  uint16_t* adj_bn;         // nearest-neighbour adjancencies for all boundary nodes
-  Real* ssaf_bnl;           // surface area corrections (with extra volume scaling)
-  uint8_t* bn_mask;         // bit mask for bounday nodes
-  int8_t* mat_bnl;          // material indices for lossy boundary nodes
-  int8_t* K_bn;             // number of adjacent neighbours, boundary nodesa
-  double* in_sigs;          // input signals
-  double* u_out;            // for output signals
-  int64_t Ns;               // number of input grid points
-  int64_t Nr;               // number of output grid points
-  int64_t Nt;               // number of samples simulation
-  int64_t Npts;             // number of Cartesian grid points
-  int64_t Nx;               // x-dim (non-continguous)
-  int64_t Ny;               // y-dim
-  int64_t Nz;               // z-dim (continguous)
-  int64_t Nb;               // number of boundary nodes
-  int64_t Nbl;              // number of lossy boundary nodes
-  int64_t Nba;              // number of ABC nodes
-  double l;                 // Courant number (CFL)
-  double l2;                // CFL number squared
-  int8_t fcc_flag;          // boolean for FCC
-  int8_t NN;                // integer, neareast neighbours
-  int8_t Nm;                // number of materials used
-  int8_t* Mb;               // number of branches per material
-  MatQuad<Real>* mat_quads; // RLC coefficients (essentially)
-  Real* mat_beta;           // part of FD-boundaries one per material
-  double infac;             // rescaling of input (for numerical reason)
-  Real sl2;                 // scaled l2 (for single precision)
-  Real lo2;                 // 0.5*l
-  Real a2;                  // update stencil coefficient
-  Real a1;                  // update stencil coefficient
+  std::vector<int64_t> bn_ixyz;         // boundary node indices
+  std::vector<int64_t> bnl_ixyz;        // lossy boundary node indices
+  std::vector<int64_t> bna_ixyz;        // absorbing boundary node indices
+  int8_t* Q_bna;                        // integer for ABCs (wall 1,edge 2,corner 3)
+  std::vector<int64_t> in_ixyz;         // input points
+  std::vector<int64_t> out_ixyz;        // output points
+  std::vector<int64_t> out_reorder;     // ordering for outputs point for final print/save
+  std::vector<uint16_t> adj_bn;         // nearest-neighbour adjancencies for all boundary nodes
+  std::vector<Real> ssaf_bnl;           // surface area corrections (with extra volume scaling)
+  std::vector<uint8_t> bn_mask;         // bit mask for bounday nodes
+  std::vector<int8_t> mat_bnl;          // material indices for lossy boundary nodes
+  std::vector<int8_t> K_bn;             // number of adjacent neighbours, boundary nodesa
+  std::vector<double> in_sigs;          // input signals
+  std::unique_ptr<double[]> u_out;      // for output signals
+  int64_t Ns;                           // number of input grid points
+  int64_t Nr;                           // number of output grid points
+  int64_t Nt;                           // number of samples simulation
+  int64_t Npts;                         // number of Cartesian grid points
+  int64_t Nx;                           // x-dim (non-continguous)
+  int64_t Ny;                           // y-dim
+  int64_t Nz;                           // z-dim (continguous)
+  int64_t Nb;                           // number of boundary nodes
+  int64_t Nbl;                          // number of lossy boundary nodes
+  int64_t Nba;                          // number of ABC nodes
+  double l;                             // Courant number (CFL)
+  double l2;                            // CFL number squared
+  int8_t fcc_flag;                      // boolean for FCC
+  int8_t NN;                            // integer, neareast neighbours
+  int8_t Nm;                            // number of materials used
+  std::vector<int8_t> Mb;               // number of branches per material
+  std::vector<MatQuad<Real>> mat_quads; // RLC coefficients (essentially)
+  std::vector<Real> mat_beta;           // part of FD-boundaries one per material
+  double infac;                         // rescaling of input (for numerical reason)
+  Real sl2;                             // scaled l2 (for single precision)
+  Real lo2;                             // 0.5*l
+  Real a2;                              // update stencil coefficient
+  Real a1;                              // update stencil coefficient
 };
 
 [[nodiscard]] auto loadSimulation3D_float(std::filesystem::path const& simDir) -> Simulation3D<float>;
@@ -95,7 +96,7 @@ template<typename Real>
 // scale input to be in middle of floating-point range
 template<typename Real>
 void scaleInput(Simulation3D<Real>& sim) {
-  double* in_sigs  = sim.in_sigs;
+  auto* in_sigs    = sim.in_sigs.data();
   int64_t const Nt = sim.Nt;
   int64_t const Ns = sim.Ns;
 
@@ -133,8 +134,7 @@ void scaleInput(Simulation3D<Real>& sim) {
     }
   }
 
-  sim.infac   = infac;
-  sim.in_sigs = in_sigs;
+  sim.infac = infac;
 }
 
 template<typename Real>
@@ -142,30 +142,14 @@ void rescaleOutput(Simulation3D<Real>& sim) {
   int64_t const Nt = sim.Nt;
   int64_t const Nr = sim.Nr;
   double infac     = sim.infac;
-  double* u_out    = sim.u_out;
+  double* u_out    = sim.u_out.get();
 
   std::transform(u_out, u_out + Nr * Nt, u_out, [infac](auto sample) { return sample * infac; });
 }
 
 template<typename Real>
 void freeSimulation3D(Simulation3D<Real> const& sim) {
-  delete[] sim.bn_ixyz;
-  delete[] sim.bnl_ixyz;
-  delete[] sim.bna_ixyz;
   delete[] sim.Q_bna;
-  delete[] sim.adj_bn;
-  delete[] sim.mat_bnl;
-  delete[] sim.bn_mask;
-  delete[] sim.ssaf_bnl;
-  delete[] sim.K_bn;
-  delete[] sim.in_ixyz;
-  delete[] sim.out_ixyz;
-  delete[] sim.out_reorder;
-  delete[] sim.in_sigs;
-  delete[] sim.u_out;
-  delete[] sim.Mb;
-  delete[] sim.mat_beta;
-  delete[] sim.mat_quads;
 }
 
 } // namespace pffdtd