diff --git a/src/cpp/CMakeLists.txt b/src/cpp/CMakeLists.txt index b9fd1a9..fb307b4 100644 --- a/src/cpp/CMakeLists.txt +++ b/src/cpp/CMakeLists.txt @@ -58,6 +58,7 @@ endif() if(PFFDTD_HAS_METAL) target_compile_definitions(pffdtd PUBLIC PFFDTD_HAS_METAL=1) + target_sources(pffdtd PRIVATE pffdtd/engine_metal_2d.hpp pffdtd/engine_metal_2d.mm) target_sources(pffdtd PRIVATE pffdtd/engine_metal_3d.hpp pffdtd/engine_metal_3d.mm) set(SHADER_FILES "${CMAKE_CURRENT_SOURCE_DIR}/pffdtd/engine_metal.metal") diff --git a/src/cpp/main.cpp b/src/cpp/main.cpp index c5e5614..d30ec8d 100644 --- a/src/cpp/main.cpp +++ b/src/cpp/main.cpp @@ -16,6 +16,7 @@ #endif #if defined(PFFDTD_HAS_METAL) + #include "pffdtd/engine_metal_2d.hpp" #include "pffdtd/engine_metal_3d.hpp" #endif @@ -37,6 +38,9 @@ namespace { using Callback = std::function>(Simulation2D const&)>; auto engines = std::map{}; engines["native"] = pffdtd::EngineCPU2D{}; +#if defined(PFFDTD_HAS_METAL) + engines["metal"] = pffdtd::EngineMETAL2D{}; +#endif #if defined(PFFDTD_HAS_SYCL) engines["sycl"] = pffdtd::EngineSYCL2D{}; #endif diff --git a/src/cpp/pffdtd/engine_metal.hpp b/src/cpp/pffdtd/engine_metal.hpp index 3db67f3..1e50028 100644 --- a/src/cpp/pffdtd/engine_metal.hpp +++ b/src/cpp/pffdtd/engine_metal.hpp @@ -11,13 +11,20 @@ namespace pffdtd { template struct Constants2D { + long Nx; long Ny; + long Nt; + long in_ixy; Real lossFactor; }; template struct Constants3D { - long n; + Real l; + Real lo2; + Real sl2; + Real a1; + Real a2; long Nx; long Ny; long Nz; @@ -28,11 +35,6 @@ struct Constants3D { long Ns; long Nr; long Nt; - Real l; - Real lo2; - Real sl2; - Real a1; - Real a2; }; template diff --git a/src/cpp/pffdtd/engine_metal.metal b/src/cpp/pffdtd/engine_metal.metal index 21d4d28..fd70bdd 100644 --- a/src/cpp/pffdtd/engine_metal.metal +++ b/src/cpp/pffdtd/engine_metal.metal @@ -13,7 +13,7 @@ namespace sim2d { device float const* u1 [[buffer(1)]], device float const* u2 [[buffer(2)]], device uint8_t const* in_mask [[buffer(3)]], - constant Constants2D const& constants [[buffer(4)]], + constant Constants2D& constants [[buffer(4)]], uint2 id [[thread_position_in_grid]] ) { int64_t const x = id.x + 1; @@ -39,7 +39,7 @@ namespace sim2d { device float const* u2 [[buffer(2)]], device int64_t const* bn_ixy [[buffer(3)]], device int64_t const* adj_bn [[buffer(4)]], - constant Constants2D const& constants [[buffer(5)]], + constant Constants2D& constants [[buffer(5)]], uint id [[thread_position_in_grid]] ) { int64_t const ib = bn_ixy[id]; @@ -62,7 +62,7 @@ namespace sim2d { device float const* u2 [[buffer(1)]], device int64_t const* bn_ixy [[buffer(2)]], device int64_t const* adj_bn [[buffer(3)]], - constant Constants2D const& constants [[buffer(4)]], + constant Constants2D& constants [[buffer(4)]], uint id [[thread_position_in_grid]] ) { int64_t const ib = bn_ixy[id]; @@ -76,6 +76,27 @@ namespace sim2d { u0[ib] = (current + lossFactor * K4 * prev) / (1.0 + lossFactor * K4); } +[[kernel]] void addSource( + device float* u0 [[buffer(0)]], + device float const* src_sig [[buffer(1)]], + constant Constants2D& constants [[buffer(2)]], + constant int64_t& timestep [[buffer(3)]], + uint id [[thread_position_in_grid]] +) { + u0[constants.in_ixy] += src_sig[timestep]; +} + +[[kernel]] void readOutput( + device float* out [[buffer(0)]], + device float const* u0 [[buffer(1)]], + device int64_t const* out_ixy [[buffer(2)]], + constant Constants2D& constants [[buffer(3)]], + constant int64_t& timestep [[buffer(4)]], + uint id [[thread_position_in_grid]] +) { + out[id * constants.Nt + timestep] = u0[out_ixy[id]]; +} + } // namespace sim2d namespace sim3d { @@ -84,33 +105,32 @@ namespace sim3d { device float* u0 [[buffer(0)]], device float const* u1 [[buffer(1)]], device uint8_t const* bn_mask [[buffer(2)]], - constant Constants3D const& constants [[buffer(3)]], + constant Constants3D& constants [[buffer(3)]], uint3 id [[thread_position_in_grid]] ) { + float const a1 = constants.a1; + float const a2 = constants.a2; + int64_t const Nz = constants.Nz; int64_t const NzNy = constants.NzNy; - float const a1 = constants.a1; - float const a2 = constants.a2; - int64_t const ix = id.x + 1; - int64_t const iy = id.y + 1; - int64_t const iz = id.z + 1; - int64_t const ii = ix * NzNy + iy * Nz + iz; + int64_t const x = id.x + 1; + int64_t const y = id.y + 1; + int64_t const z = id.z + 1; + int64_t const i = x * NzNy + y * Nz + z; - if (get_bit(bn_mask[ii >> 3], ii % 8) != 0) { + if (get_bit(bn_mask[i / 8], i % 8) != 0) { return; } - float partial = a1 * u1[ii] - u0[ii]; - partial += a2 * u1[ii + NzNy]; - partial += a2 * u1[ii - NzNy]; - partial += a2 * u1[ii + Nz]; - partial += a2 * u1[ii - Nz]; - partial += a2 * u1[ii + 1]; - partial += a2 * u1[ii - 1]; - u0[ii] = partial; - - // u0[ii] = a1 + a2; + float partial = a1 * u1[i] - u0[i]; + partial += a2 * u1[i + NzNy]; + partial += a2 * u1[i - NzNy]; + partial += a2 * u1[i + Nz]; + partial += a2 * u1[i - Nz]; + partial += a2 * u1[i + 1]; + partial += a2 * u1[i - 1]; + u0[i] = partial; } [[kernel]] void rigidUpdateCart( @@ -118,12 +138,12 @@ namespace sim3d { device float const* u1 [[buffer(1)]], device int64_t const* bn_ixyz [[buffer(2)]], device uint16_t const* adj_bn [[buffer(3)]], - constant Constants3D const& constants [[buffer(4)]], + constant Constants3D& constants [[buffer(4)]], uint id [[thread_position_in_grid]] ) { + int64_t const ii = bn_ixyz[id]; auto const adj = adj_bn[id]; auto const Kint = metal::popcount(adj); - int64_t const ii = bn_ixyz[id]; float const _2 = 2.0; float const K = Kint; @@ -150,7 +170,7 @@ namespace sim3d { device float const* mat_beta [[buffer(6)]], device MatQuad const* mat_quads [[buffer(7)]], device uint8_t const* Mb [[buffer(8)]], - constant Constants3D const& constants [[buffer(9)]], + constant Constants3D& constants [[buffer(9)]], uint id [[thread_position_in_grid]] ) { auto nb = static_cast(id); @@ -194,7 +214,7 @@ namespace sim3d { device float const* u2ba [[buffer(1)]], device float const* Q_bna [[buffer(2)]], device int64_t const* bna_ixyz [[buffer(3)]], - constant Constants3D const& constants [[buffer(4)]], + constant Constants3D& constants [[buffer(4)]], uint id [[thread_position_in_grid]] ) { auto const lQ = constants.l * Q_bna[id]; @@ -204,7 +224,7 @@ namespace sim3d { [[kernel]] void flipHaloXY( device float* u1 [[buffer(0)]], - constant Constants3D const& constants [[buffer(1)]], + constant Constants3D& constants [[buffer(1)]], uint2 id [[thread_position_in_grid]] ) { auto const Nz = constants.Nz; @@ -220,7 +240,7 @@ namespace sim3d { [[kernel]] void flipHaloXZ( device float* u1 [[buffer(0)]], - constant Constants3D const& constants [[buffer(1)]], + constant Constants3D& constants [[buffer(1)]], uint2 id [[thread_position_in_grid]] ) { auto const Ny = constants.Ny; @@ -236,7 +256,7 @@ namespace sim3d { [[kernel]] void flipHaloYZ( device float* u1 [[buffer(0)]], - constant Constants3D const& constants [[buffer(1)]], + constant Constants3D& constants [[buffer(1)]], uint2 id [[thread_position_in_grid]] ) { auto const Nx = constants.Nx; @@ -272,33 +292,24 @@ namespace sim3d { device float* u0 [[buffer(0)]], device float const* in_sigs [[buffer(1)]], device int64_t const* in_ixyz [[buffer(2)]], - constant Constants3D const& constants [[buffer(3)]], + constant Constants3D& constants [[buffer(3)]], + constant int64_t& timestep [[buffer(4)]], uint id [[thread_position_in_grid]] ) { - auto const ii = in_ixyz[id]; - u0[ii] += in_sigs[id * constants.Nt + constants.n]; + u0[in_ixyz[id]] += in_sigs[id * constants.Nt + timestep]; } [[kernel]] void readOutput( device float* u_out [[buffer(0)]], device float const* u1 [[buffer(1)]], device int64_t const* out_ixyz [[buffer(2)]], - constant Constants3D const& constants [[buffer(3)]], + constant Constants3D& constants [[buffer(3)]], + constant int64_t& timestep [[buffer(4)]], uint id [[thread_position_in_grid]] ) { - auto const ii = out_ixyz[id]; - u_out[id * constants.Nt + constants.n] = u1[ii]; + u_out[id * constants.Nt + timestep] = u1[out_ixyz[id]]; } } // namespace sim3d -[[kernel]] void -foo(device float const* in [[buffer(0)]], device float* out [[buffer(1)]], uint id [[thread_position_in_grid]]) { - out[id] = in[id] + 42.0; -} - -[[kernel]] void -bar(device float const* in [[buffer(0)]], device float* out [[buffer(1)]], uint id [[thread_position_in_grid]]) { - out[id] = in[id] - 1.0; -} } // namespace pffdtd diff --git a/src/cpp/pffdtd/engine_metal_2d.hpp b/src/cpp/pffdtd/engine_metal_2d.hpp new file mode 100644 index 0000000..c8dc800 --- /dev/null +++ b/src/cpp/pffdtd/engine_metal_2d.hpp @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: MIT +// SPDX-FileCopyrightText: 2024 Tobias Hienzsch + +#pragma once + +#if not defined(PFFDTD_HAS_METAL) + #error "METAL must be enabled in CMake via PFFDTD_ENABLE_METAL" +#endif + +#include "pffdtd/mdspan.hpp" +#include "pffdtd/simulation_2d.hpp" + +namespace pffdtd { + +struct EngineMETAL2D { + [[nodiscard]] auto operator()(Simulation2D const& sim) const -> stdex::mdarray>; +}; + +} // namespace pffdtd diff --git a/src/cpp/pffdtd/engine_metal_2d.mm b/src/cpp/pffdtd/engine_metal_2d.mm new file mode 100644 index 0000000..037e396 --- /dev/null +++ b/src/cpp/pffdtd/engine_metal_2d.mm @@ -0,0 +1,199 @@ +// SPDX-License-Identifier: MIT +// SPDX-FileCopyrightText: 2024 Tobias Hienzsch + +#include "engine_metal_2d.hpp" + +#include "pffdtd/engine_metal.hpp" +#include "pffdtd/metal.hpp" +#include "pffdtd/progress.hpp" +#include "pffdtd/time.hpp" + +#include + +#include +#include + +namespace pffdtd { + +namespace { + +auto toFloat(std::vector const& buf) { + auto buf32 = std::vector(buf.size()); + std::ranges::transform(buf32, buf32.begin(), [](auto v) { return static_cast(v); }); + return buf32; +} + +auto run(Simulation2D const& sim) { + @autoreleasepool { + summary(sim); + + auto const Nx = sim.Nx; + auto const Ny = sim.Ny; + auto const Npts = Nx * Ny; + auto const Nt = sim.Nt; + auto const Nb = static_cast(sim.adj_bn.size()); + auto const inx = sim.inx; + auto const iny = sim.iny; + auto const Nr = static_cast(sim.out_ixy.size()); + auto const loss_factor = sim.loss_factor; + auto const src_sig_f32 = toFloat(sim.src_sig); + + auto const c = Constants2D{ + .Nx = Nx, + .Ny = Ny, + .Nt = Nt, + .in_ixy = inx * Ny + iny, + .lossFactor = static_cast(loss_factor), + }; + + // Device + NSArray* devices = MTLCopyAllDevices(); + id device = devices[0]; + PFFDTD_ASSERT(device != nil); + summary(device); + + // Library + id library = [device newDefaultLibrary]; + PFFDTD_ASSERT(library != nil); + + id airUpdateKernel = makeComputePipeline(library, "pffdtd::sim2d::airUpdate"); + id rigidUpdateKernel = makeComputePipeline(library, "pffdtd::sim2d::boundaryRigid"); + id rigidLossKernel = makeComputePipeline(library, "pffdtd::sim2d::boundaryLoss"); + id addSourceKernel = makeComputePipeline(library, "pffdtd::sim2d::addSource"); + id readOutputKernel = makeComputePipeline(library, "pffdtd::sim2d::readOutput"); + + // Buffer + id in_mask = makeBuffer(device, sim.in_mask, MTLResourceStorageModeShared); + id bn_ixy = makeBuffer(device, sim.bn_ixy, MTLResourceStorageModeShared); + id adj_bn = makeBuffer(device, sim.adj_bn, MTLResourceStorageModeShared); + id out_ixy = makeBuffer(device, sim.out_ixy, MTLResourceStorageModeShared); + id src_sig = makeBuffer(device, src_sig_f32, MTLResourceStorageModeShared); + id constants = [device newBufferWithBytes:&c + length:sizeof(Constants2D) + options:MTLResourceStorageModeShared]; + PFFDTD_ASSERT(constants != nil); + + id u0 = makeEmptyBuffer(device, Npts, MTLResourceStorageModeShared); + id u1 = makeEmptyBuffer(device, Npts, MTLResourceStorageModeShared); + id u2 = makeEmptyBuffer(device, Npts, MTLResourceStorageModeShared); + id out = makeEmptyBuffer(device, Npts, MTLResourceStorageModeShared); + + // Queue + id commandQueue = [device newCommandQueue]; + PFFDTD_ASSERT(commandQueue != nil); + + auto const start = getTime(); + + for (int64_t n{0}; n < Nt; ++n) { + auto const sampleStart = getTime(); + + id commandBuffer = [commandQueue commandBuffer]; + + id timestep = [device newBufferWithBytes:&n + length:sizeof(int64_t) + options:MTLResourceStorageModeShared]; + PFFDTD_ASSERT(timestep != nil); + + // Air Update + id airUpdate = [commandBuffer computeCommandEncoder]; + [airUpdate setComputePipelineState:airUpdateKernel]; + [airUpdate setBuffer:u0 offset:0 atIndex:0]; + [airUpdate setBuffer:u1 offset:0 atIndex:1]; + [airUpdate setBuffer:u2 offset:0 atIndex:2]; + [airUpdate setBuffer:in_mask offset:0 atIndex:3]; + [airUpdate setBuffer:constants offset:0 atIndex:4]; + [airUpdate dispatchThreads:MTLSizeMake(Nx - 2, Ny - 2, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [airUpdate endEncoding]; + + // Rigid Update + id rigidUpdate = [commandBuffer computeCommandEncoder]; + [rigidUpdate setComputePipelineState:rigidUpdateKernel]; + [rigidUpdate setBuffer:u0 offset:0 atIndex:0]; + [rigidUpdate setBuffer:u1 offset:0 atIndex:1]; + [rigidUpdate setBuffer:u2 offset:0 atIndex:2]; + [rigidUpdate setBuffer:bn_ixy offset:0 atIndex:3]; + [rigidUpdate setBuffer:adj_bn offset:0 atIndex:4]; + [rigidUpdate setBuffer:constants offset:0 atIndex:5]; + [rigidUpdate dispatchThreads:MTLSizeMake(Nb, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [rigidUpdate endEncoding]; + + // Rigid Loss + id rigidLoss = [commandBuffer computeCommandEncoder]; + [rigidLoss setComputePipelineState:rigidLossKernel]; + [rigidLoss setBuffer:u0 offset:0 atIndex:0]; + [rigidLoss setBuffer:u2 offset:0 atIndex:1]; + [rigidLoss setBuffer:bn_ixy offset:0 atIndex:2]; + [rigidLoss setBuffer:adj_bn offset:0 atIndex:3]; + [rigidLoss setBuffer:constants offset:0 atIndex:4]; + [rigidLoss dispatchThreads:MTLSizeMake(Nb, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [rigidLoss endEncoding]; + + // Add Source + id addSource = [commandBuffer computeCommandEncoder]; + [addSource setComputePipelineState:addSourceKernel]; + [addSource setBuffer:u0 offset:0 atIndex:0]; + [addSource setBuffer:src_sig offset:0 atIndex:1]; + [addSource setBuffer:constants offset:0 atIndex:2]; + [addSource setBuffer:timestep offset:0 atIndex:3]; + [addSource dispatchThreads:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [addSource endEncoding]; + + // Read Outputs + id readOutput = [commandBuffer computeCommandEncoder]; + [readOutput setComputePipelineState:readOutputKernel]; + [readOutput setBuffer:out offset:0 atIndex:0]; + [readOutput setBuffer:u0 offset:0 atIndex:1]; + [readOutput setBuffer:out_ixy offset:0 atIndex:2]; + [readOutput setBuffer:constants offset:0 atIndex:3]; + [readOutput setBuffer:timestep offset:0 atIndex:4]; + [readOutput dispatchThreads:MTLSizeMake(Nr, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [readOutput endEncoding]; + + // Wait + [commandBuffer commit]; + [commandBuffer waitUntilCompleted]; + PFFDTD_ASSERT(commandBuffer.status == MTLCommandBufferStatusCompleted); + + // Rotate buffers + auto tmp = u2; + u2 = u1; + u1 = u0; + u0 = tmp; + + auto const now = getTime(); + auto const elapsed = now - start; + auto const elapsedSample = now - sampleStart; + + print(ProgressReport{ + .n = n, + .Nt = Nt, + .Npts = Npts, + .Nb = Nb, + .elapsed = elapsed, + .elapsedSample = elapsedSample, + .elapsedAir = {}, + .elapsedSampleAir = {}, + .elapsedBoundary = {}, + .elapsedSampleBoundary = {}, + .numWorkers = 1, + }); + } + + auto outputs_f64 = stdex::mdarray>(Nr, Nt); + + float* outputs_f32 = (float*)[out contents]; + for (auto i{0UL}; i < static_cast(Nr * Nt); ++i) { + outputs_f64.data()[i] = outputs_f32[i]; + } + + return outputs_f64; + } +} + +} // namespace + +auto EngineMETAL2D::operator()(Simulation2D const& sim) const -> stdex::mdarray> { + return run(sim); +} + +} // namespace pffdtd diff --git a/src/cpp/pffdtd/engine_metal_3d.mm b/src/cpp/pffdtd/engine_metal_3d.mm index 2dfb17c..47f2578 100644 --- a/src/cpp/pffdtd/engine_metal_3d.mm +++ b/src/cpp/pffdtd/engine_metal_3d.mm @@ -4,57 +4,22 @@ #include "pffdtd/assert.hpp" #include "pffdtd/engine_metal.hpp" +#include "pffdtd/metal.hpp" #include "pffdtd/progress.hpp" #include "pffdtd/time.hpp" #include -#import -#import - -#include +#include namespace pffdtd { namespace { -void summary(id device) { - fmt::println("- Device {}", [device.name UTF8String]); - fmt::println(" - Unified memory: {}", device.hasUnifiedMemory != 0 ? "true" : "false"); - fmt::println(" - Max buffer length {} MB", device.maxBufferLength / 1'000'000); - fmt::println(" - Recommended max working set size: {} MB", device.recommendedMaxWorkingSetSize / 1'000'000); - fmt::println(" - Max transfer rate: {}", device.maxTransferRate); - fmt::println(" - Max threads per threadgroup: {}", device.maxThreadsPerThreadgroup.width); - fmt::println(" - Max threadgroup memory size: {} bytes\n", device.maxThreadgroupMemoryLength); -} - -id makeComputePipeline(id library, char const* function) { - id kernel = [library newFunctionWithName:[NSString stringWithUTF8String:function]]; - PFFDTD_ASSERT(kernel != nil); - - NSError* error = nullptr; - id pipeline = [library.device newComputePipelineStateWithFunction:kernel error:&error]; - PFFDTD_ASSERT(error == nullptr); - - fmt::println("- {}:", function); - fmt::println(" - threadExecutionWidth = {}", pipeline.threadExecutionWidth); - fmt::println(" - maxTotalThreadsPerThreadgroup = {}", pipeline.maxTotalThreadsPerThreadgroup); - fmt::println(" - staticThreadgroupMemoryLength = {}\n", pipeline.staticThreadgroupMemoryLength); - - return pipeline; -} - -template -id makeBuffer(id device, std::vector const& data, ModeT mode) { - auto const size = data.size() * sizeof(T); - id buf = [device newBufferWithBytes:data.data() length:size options:mode]; - PFFDTD_ASSERT(buf != nil); - return buf; -} - template auto run(Simulation3D const& sim) { @autoreleasepool { + // Device NSArray* devices = MTLCopyAllDevices(); id device = devices[0]; @@ -95,7 +60,7 @@ auto run(Simulation3D const& sim) { auto const a1 = static_cast(sim.a1); auto const a2 = static_cast(sim.a2); - fmt::println("HOST-BUFFERS"); + fmt::println("COPY-BUFFERS"); id out_ixyz = makeBuffer(device, sim.out_ixyz, MTLResourceStorageModeShared); id in_ixyz = makeBuffer(device, sim.in_ixyz, MTLResourceStorageModeShared); id in_sigs = makeBuffer(device, sim.in_sigs, MTLResourceStorageModeShared); @@ -111,170 +76,174 @@ auto run(Simulation3D const& sim) { id mat_quads = makeBuffer(device, sim.mat_quads, MTLResourceStorageModeShared); id Q_bna = makeBuffer(device, sim.Q_bna, MTLResourceStorageModeShared); - fmt::println("DEVICE-BUFFERS"); - id u0 = [device newBufferWithLength:sizeof(Real) * Npts options:MTLResourceStorageModeShared]; - id u1 = [device newBufferWithLength:sizeof(Real) * Npts options:MTLResourceStorageModeShared]; - id u0b = [device newBufferWithLength:sizeof(Real) * Nbl options:MTLResourceStorageModeShared]; - id u1b = [device newBufferWithLength:sizeof(Real) * Nbl options:MTLResourceStorageModeShared]; - id u2b = [device newBufferWithLength:sizeof(Real) * Nbl options:MTLResourceStorageModeShared]; - id u2ba = [device newBufferWithLength:sizeof(Real) * Nba options:MTLResourceStorageModeShared]; - id gh1 = [device newBufferWithLength:sizeof(Real) * Nbl * MMb options:MTLResourceStorageModeShared]; - id vh1 = [device newBufferWithLength:sizeof(Real) * Nbl * MMb options:MTLResourceStorageModeShared]; - id u_out = [device newBufferWithLength:sizeof(Real) * Nr * Nt options:MTLResourceStorageModeShared]; + fmt::println("EMPTY-BUFFERS"); + id u0 = makeEmptyBuffer(device, Npts, MTLResourceStorageModeShared); + id u1 = makeEmptyBuffer(device, Npts, MTLResourceStorageModeShared); + id u0b = makeEmptyBuffer(device, Nbl, MTLResourceStorageModeShared); + id u1b = makeEmptyBuffer(device, Nbl, MTLResourceStorageModeShared); + id u2b = makeEmptyBuffer(device, Nbl, MTLResourceStorageModeShared); + id u2ba = makeEmptyBuffer(device, Nba, MTLResourceStorageModeShared); + id gh1 = makeEmptyBuffer(device, Nbl * MMb, MTLResourceStorageModeShared); + id vh1 = makeEmptyBuffer(device, Nbl * MMb, MTLResourceStorageModeShared); + id u_out = makeEmptyBuffer(device, Nr * Nt, MTLResourceStorageModeShared); + + auto const c = Constants3D{ + .l = l, + .lo2 = lo2, + .sl2 = sl2, + .a1 = a1, + .a2 = a2, + .Nx = Nx, + .Ny = Ny, + .Nz = Nz, + .NzNy = NzNy, + .Nb = Nb, + .Nbl = Nbl, + .Nba = Nba, + .Ns = Ns, + .Nr = Nr, + .Nt = Nt, + }; + id constants = [device newBufferWithBytes:&c length:sizeof(c) options:MTLResourceStorageModeShared]; // Queue id commandQueue = [device newCommandQueue]; - assert(commandQueue != nil); + PFFDTD_ASSERT(commandQueue != nil); fmt::println("START"); auto const start = getTime(); for (int64_t n = 0; n < Nt; n++) { auto const sampleStart = getTime(); - auto const c = Constants3D{ - .n = n, - .Nx = Nx, - .Ny = Ny, - .Nz = Nz, - .NzNy = NzNy, - .Nb = Nb, - .Nbl = Nbl, - .Nba = Nba, - .Ns = Ns, - .Nr = Nr, - .Nt = Nt, - .l = l, - .lo2 = lo2, - .sl2 = sl2, - .a1 = a1, - .a2 = a2, - }; - id constants = [device newBufferWithBytes:&c length:sizeof(c) options:MTLResourceStorageModeShared]; - id commandBuffer = [commandQueue commandBuffer]; + id timestep = [device newBufferWithBytes:&n length:sizeof(n) options:MTLResourceStorageModeShared]; + PFFDTD_ASSERT(timestep != nil); // Rigid - id rigidEncoder = [commandBuffer computeCommandEncoder]; - [rigidEncoder setComputePipelineState:kernelRigidCart]; - [rigidEncoder setBuffer:u0 offset:0 atIndex:0]; - [rigidEncoder setBuffer:u1 offset:0 atIndex:1]; - [rigidEncoder setBuffer:bn_ixyz offset:0 atIndex:2]; - [rigidEncoder setBuffer:adj_bn offset:0 atIndex:3]; - [rigidEncoder setBuffer:constants offset:0 atIndex:4]; - [rigidEncoder dispatchThreads:MTLSizeMake(Nb, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - [rigidEncoder endEncoding]; + id rigidUpdate = [commandBuffer computeCommandEncoder]; + [rigidUpdate setComputePipelineState:kernelRigidCart]; + [rigidUpdate setBuffer:u0 offset:0 atIndex:0]; + [rigidUpdate setBuffer:u1 offset:0 atIndex:1]; + [rigidUpdate setBuffer:bn_ixyz offset:0 atIndex:2]; + [rigidUpdate setBuffer:adj_bn offset:0 atIndex:3]; + [rigidUpdate setBuffer:constants offset:0 atIndex:4]; + [rigidUpdate dispatchThreads:MTLSizeMake(Nb, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [rigidUpdate endEncoding]; // Copy to boundary buffer - id copyToBoundaryEncoder = [commandBuffer computeCommandEncoder]; - [copyToBoundaryEncoder setComputePipelineState:kernelCopyFromGrid]; - [copyToBoundaryEncoder setBuffer:u0b offset:0 atIndex:0]; - [copyToBoundaryEncoder setBuffer:u0 offset:0 atIndex:1]; - [copyToBoundaryEncoder setBuffer:bnl_ixyz offset:0 atIndex:2]; - [copyToBoundaryEncoder dispatchThreads:MTLSizeMake(Nbl, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - [copyToBoundaryEncoder endEncoding]; + id copyToBoundary = [commandBuffer computeCommandEncoder]; + [copyToBoundary setComputePipelineState:kernelCopyFromGrid]; + [copyToBoundary setBuffer:u0b offset:0 atIndex:0]; + [copyToBoundary setBuffer:u0 offset:0 atIndex:1]; + [copyToBoundary setBuffer:bnl_ixyz offset:0 atIndex:2]; + [copyToBoundary dispatchThreads:MTLSizeMake(Nbl, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [copyToBoundary endEncoding]; // Apply rigid loss - id boundaryLossEncoder = [commandBuffer computeCommandEncoder]; - [boundaryLossEncoder setComputePipelineState:kernelBoundaryLossFD]; - [boundaryLossEncoder setBuffer:u0b offset:0 atIndex:0]; - [boundaryLossEncoder setBuffer:u2b offset:0 atIndex:1]; - [boundaryLossEncoder setBuffer:vh1 offset:0 atIndex:2]; - [boundaryLossEncoder setBuffer:gh1 offset:0 atIndex:3]; - [boundaryLossEncoder setBuffer:ssaf_bnl offset:0 atIndex:4]; - [boundaryLossEncoder setBuffer:mat_bnl offset:0 atIndex:5]; - [boundaryLossEncoder setBuffer:mat_beta offset:0 atIndex:6]; - [boundaryLossEncoder setBuffer:mat_quads offset:0 atIndex:7]; - [boundaryLossEncoder setBuffer:Mb offset:0 atIndex:8]; - [boundaryLossEncoder setBuffer:constants offset:0 atIndex:9]; - [boundaryLossEncoder dispatchThreads:MTLSizeMake(Nbl, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - [boundaryLossEncoder endEncoding]; + id boundaryLoss = [commandBuffer computeCommandEncoder]; + [boundaryLoss setComputePipelineState:kernelBoundaryLossFD]; + [boundaryLoss setBuffer:u0b offset:0 atIndex:0]; + [boundaryLoss setBuffer:u2b offset:0 atIndex:1]; + [boundaryLoss setBuffer:vh1 offset:0 atIndex:2]; + [boundaryLoss setBuffer:gh1 offset:0 atIndex:3]; + [boundaryLoss setBuffer:ssaf_bnl offset:0 atIndex:4]; + [boundaryLoss setBuffer:mat_bnl offset:0 atIndex:5]; + [boundaryLoss setBuffer:mat_beta offset:0 atIndex:6]; + [boundaryLoss setBuffer:mat_quads offset:0 atIndex:7]; + [boundaryLoss setBuffer:Mb offset:0 atIndex:8]; + [boundaryLoss setBuffer:constants offset:0 atIndex:9]; + [boundaryLoss dispatchThreads:MTLSizeMake(Nbl, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [boundaryLoss endEncoding]; // Copy from boundary buffer - id copyFromBoundaryEncoder = [commandBuffer computeCommandEncoder]; - [copyFromBoundaryEncoder setComputePipelineState:kernelCopyToGrid]; - [copyFromBoundaryEncoder setBuffer:u0 offset:0 atIndex:0]; - [copyFromBoundaryEncoder setBuffer:u0b offset:0 atIndex:1]; - [copyFromBoundaryEncoder setBuffer:bnl_ixyz offset:0 atIndex:2]; - [copyFromBoundaryEncoder dispatchThreads:MTLSizeMake(Nbl, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - [copyFromBoundaryEncoder endEncoding]; + id copyFromBoundary = [commandBuffer computeCommandEncoder]; + [copyFromBoundary setComputePipelineState:kernelCopyToGrid]; + [copyFromBoundary setBuffer:u0 offset:0 atIndex:0]; + [copyFromBoundary setBuffer:u0b offset:0 atIndex:1]; + [copyFromBoundary setBuffer:bnl_ixyz offset:0 atIndex:2]; + [copyFromBoundary dispatchThreads:MTLSizeMake(Nbl, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [copyFromBoundary endEncoding]; // Copy to ABC buffer - id copyToABCEncoder = [commandBuffer computeCommandEncoder]; - [copyToABCEncoder setComputePipelineState:kernelCopyFromGrid]; - [copyToABCEncoder setBuffer:u2ba offset:0 atIndex:0]; - [copyToABCEncoder setBuffer:u0 offset:0 atIndex:1]; - [copyToABCEncoder setBuffer:bna_ixyz offset:0 atIndex:2]; - [copyToABCEncoder dispatchThreads:MTLSizeMake(Nba, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - [copyToABCEncoder endEncoding]; + id copyToABC = [commandBuffer computeCommandEncoder]; + [copyToABC setComputePipelineState:kernelCopyFromGrid]; + [copyToABC setBuffer:u2ba offset:0 atIndex:0]; + [copyToABC setBuffer:u0 offset:0 atIndex:1]; + [copyToABC setBuffer:bna_ixyz offset:0 atIndex:2]; + [copyToABC dispatchThreads:MTLSizeMake(Nba, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [copyToABC endEncoding]; // Flip halo XY - id flipHaloXYEncoder = [commandBuffer computeCommandEncoder]; - [flipHaloXYEncoder setComputePipelineState:kernelFlipHaloXY]; - [flipHaloXYEncoder setBuffer:u1 offset:0 atIndex:0]; - [flipHaloXYEncoder setBuffer:constants offset:0 atIndex:1]; - [flipHaloXYEncoder dispatchThreads:MTLSizeMake(Nx, Ny, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - [flipHaloXYEncoder endEncoding]; + id flipHaloXY = [commandBuffer computeCommandEncoder]; + [flipHaloXY setComputePipelineState:kernelFlipHaloXY]; + [flipHaloXY setBuffer:u1 offset:0 atIndex:0]; + [flipHaloXY setBuffer:constants offset:0 atIndex:1]; + [flipHaloXY dispatchThreads:MTLSizeMake(Nx, Ny, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [flipHaloXY endEncoding]; // Flip halo XZ - id flipHaloXZEncoder = [commandBuffer computeCommandEncoder]; - [flipHaloXZEncoder setComputePipelineState:kernelFlipHaloXZ]; - [flipHaloXZEncoder setBuffer:u1 offset:0 atIndex:0]; - [flipHaloXZEncoder setBuffer:constants offset:0 atIndex:1]; - [flipHaloXZEncoder dispatchThreads:MTLSizeMake(Nx, Nz, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - [flipHaloXZEncoder endEncoding]; + id flipHaloXZ = [commandBuffer computeCommandEncoder]; + [flipHaloXZ setComputePipelineState:kernelFlipHaloXZ]; + [flipHaloXZ setBuffer:u1 offset:0 atIndex:0]; + [flipHaloXZ setBuffer:constants offset:0 atIndex:1]; + [flipHaloXZ dispatchThreads:MTLSizeMake(Nx, Nz, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [flipHaloXZ endEncoding]; // Flip halo YZ - id flipHaloYZEncoder = [commandBuffer computeCommandEncoder]; - [flipHaloYZEncoder setComputePipelineState:kernelFlipHaloYZ]; - [flipHaloYZEncoder setBuffer:u1 offset:0 atIndex:0]; - [flipHaloYZEncoder setBuffer:constants offset:0 atIndex:1]; - [flipHaloYZEncoder dispatchThreads:MTLSizeMake(Ny, Nz, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - [flipHaloYZEncoder endEncoding]; + id flipHaloYZ = [commandBuffer computeCommandEncoder]; + [flipHaloYZ setComputePipelineState:kernelFlipHaloYZ]; + [flipHaloYZ setBuffer:u1 offset:0 atIndex:0]; + [flipHaloYZ setBuffer:constants offset:0 atIndex:1]; + [flipHaloYZ dispatchThreads:MTLSizeMake(Ny, Nz, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [flipHaloYZ endEncoding]; // Add source - id addSourceEncoder = [commandBuffer computeCommandEncoder]; - [addSourceEncoder setComputePipelineState:kernelAddSource]; - [addSourceEncoder setBuffer:u0 offset:0 atIndex:0]; - [addSourceEncoder setBuffer:in_sigs offset:0 atIndex:1]; - [addSourceEncoder setBuffer:in_ixyz offset:0 atIndex:2]; - [addSourceEncoder setBuffer:constants offset:0 atIndex:3]; - [addSourceEncoder dispatchThreads:MTLSizeMake(Ns, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - [addSourceEncoder endEncoding]; + id addSource = [commandBuffer computeCommandEncoder]; + [addSource setComputePipelineState:kernelAddSource]; + [addSource setBuffer:u0 offset:0 atIndex:0]; + [addSource setBuffer:in_sigs offset:0 atIndex:1]; + [addSource setBuffer:in_ixyz offset:0 atIndex:2]; + [addSource setBuffer:constants offset:0 atIndex:3]; + [addSource setBuffer:timestep offset:0 atIndex:4]; + [addSource dispatchThreads:MTLSizeMake(Ns, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [addSource endEncoding]; // Air update - id airUpdateEncoder = [commandBuffer computeCommandEncoder]; - [airUpdateEncoder setComputePipelineState:kernelAirCart]; - [airUpdateEncoder setBuffer:u0 offset:0 atIndex:0]; - [airUpdateEncoder setBuffer:u1 offset:0 atIndex:1]; - [airUpdateEncoder setBuffer:bn_mask offset:0 atIndex:2]; - [airUpdateEncoder setBuffer:constants offset:0 atIndex:3]; - [airUpdateEncoder dispatchThreads:MTLSizeMake(Nx - 2, Ny - 2, Nz - 2) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - [airUpdateEncoder endEncoding]; + id airUpdate = [commandBuffer computeCommandEncoder]; + [airUpdate setComputePipelineState:kernelAirCart]; + [airUpdate setBuffer:u0 offset:0 atIndex:0]; + [airUpdate setBuffer:u1 offset:0 atIndex:1]; + [airUpdate setBuffer:bn_mask offset:0 atIndex:2]; + [airUpdate setBuffer:constants offset:0 atIndex:3]; + [airUpdate dispatchThreads:MTLSizeMake(Nx - 2, Ny - 2, Nz - 2) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [airUpdate endEncoding]; // ABC loss - id abcEncoder = [commandBuffer computeCommandEncoder]; - [abcEncoder setComputePipelineState:kernelBoundaryLossABC]; - [abcEncoder setBuffer:u0 offset:0 atIndex:0]; - [abcEncoder setBuffer:u2ba offset:0 atIndex:1]; - [abcEncoder setBuffer:Q_bna offset:0 atIndex:2]; - [abcEncoder setBuffer:bna_ixyz offset:0 atIndex:3]; - [abcEncoder setBuffer:constants offset:0 atIndex:4]; - [abcEncoder dispatchThreads:MTLSizeMake(Nba, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - [abcEncoder endEncoding]; + id abcLoss = [commandBuffer computeCommandEncoder]; + [abcLoss setComputePipelineState:kernelBoundaryLossABC]; + [abcLoss setBuffer:u0 offset:0 atIndex:0]; + [abcLoss setBuffer:u2ba offset:0 atIndex:1]; + [abcLoss setBuffer:Q_bna offset:0 atIndex:2]; + [abcLoss setBuffer:bna_ixyz offset:0 atIndex:3]; + [abcLoss setBuffer:constants offset:0 atIndex:4]; + [abcLoss dispatchThreads:MTLSizeMake(Nba, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [abcLoss endEncoding]; // Read receiver - id readOutputEncoder = [commandBuffer computeCommandEncoder]; - [readOutputEncoder setComputePipelineState:kernelReadOutput]; - [readOutputEncoder setBuffer:u_out offset:0 atIndex:0]; - [readOutputEncoder setBuffer:u1 offset:0 atIndex:1]; - [readOutputEncoder setBuffer:out_ixyz offset:0 atIndex:2]; - [readOutputEncoder setBuffer:constants offset:0 atIndex:3]; - [readOutputEncoder dispatchThreads:MTLSizeMake(Nr, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; - [readOutputEncoder endEncoding]; + id readOutput = [commandBuffer computeCommandEncoder]; + [readOutput setComputePipelineState:kernelReadOutput]; + [readOutput setBuffer:u_out offset:0 atIndex:0]; + [readOutput setBuffer:u1 offset:0 atIndex:1]; + [readOutput setBuffer:out_ixyz offset:0 atIndex:2]; + [readOutput setBuffer:constants offset:0 atIndex:3]; + [readOutput setBuffer:timestep offset:0 atIndex:4]; + [readOutput dispatchThreads:MTLSizeMake(Nr, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [readOutput endEncoding]; // Wait [commandBuffer commit]; [commandBuffer waitUntilCompleted]; + PFFDTD_ASSERT(commandBuffer.status == MTLCommandBufferStatusCompleted); // Swap buffers { diff --git a/src/cpp/pffdtd/metal.hpp b/src/cpp/pffdtd/metal.hpp new file mode 100644 index 0000000..4d07375 --- /dev/null +++ b/src/cpp/pffdtd/metal.hpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// SPDX-FileCopyrightText: 2024 Tobias Hienzsch + +#pragma once + +#include "pffdtd/assert.hpp" +#include "pffdtd/time.hpp" + +#if not defined(PFFDTD_HAS_METAL) + #error "SYCL must be enabled in CMake via PFFDTD_ENABLE_METAL" +#endif + +#include + +#include +#include + +namespace pffdtd { + +inline void summary(id device) { + fmt::println("- Device {}", [device.name UTF8String]); + fmt::println(" - Unified memory: {}", device.hasUnifiedMemory != 0 ? "true" : "false"); + fmt::println(" - Max buffer length {} MB", device.maxBufferLength / 1'000'000); + fmt::println(" - Recommended max working set size: {} MB", device.recommendedMaxWorkingSetSize / 1'000'000); + fmt::println(" - Max transfer rate: {}", device.maxTransferRate); + fmt::println(" - Max threads per threadgroup: {}", device.maxThreadsPerThreadgroup.width); + fmt::println(" - Max threadgroup memory size: {} bytes\n", device.maxThreadgroupMemoryLength); +} + +inline id makeComputePipeline(id library, char const* function) { + id kernel = [library newFunctionWithName:[NSString stringWithUTF8String:function]]; + PFFDTD_ASSERT(kernel != nil); + + NSError* error = nullptr; + id pipeline = [library.device newComputePipelineStateWithFunction:kernel error:&error]; + PFFDTD_ASSERT(error == nullptr); + + fmt::println("- {}:", function); + fmt::println(" - threadExecutionWidth = {}", pipeline.threadExecutionWidth); + fmt::println(" - maxTotalThreadsPerThreadgroup = {}", pipeline.maxTotalThreadsPerThreadgroup); + fmt::println(" - staticThreadgroupMemoryLength = {}\n", pipeline.staticThreadgroupMemoryLength); + + return pipeline; +} + +template +id makeBuffer(id device, std::vector const& data, ModeT mode) { + auto const size = data.size() * sizeof(T); + id buf = [device newBufferWithBytes:data.data() length:size options:mode]; + PFFDTD_ASSERT(buf != nil); + return buf; +} + +template +id makeEmptyBuffer(id device, size_t count, ModeT mode) { + auto const size = count * sizeof(T); + id buf = [device newBufferWithLength:size options:mode]; + PFFDTD_ASSERT(buf != nil); + return buf; +} + +} // namespace pffdtd