From 6a59fc8c1c696481d533f16c49e9665f0e1d7da2 Mon Sep 17 00:00:00 2001 From: Simon Byrne Date: Thu, 19 Oct 2023 13:43:08 -0700 Subject: [PATCH 1/2] use blocking synchronize to reduce poll waiting --- src/Spaces/dss_cuda.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Spaces/dss_cuda.jl b/src/Spaces/dss_cuda.jl index b649a56bc9..7afaa5e948 100644 --- a/src/Spaces/dss_cuda.jl +++ b/src/Spaces/dss_cuda.jl @@ -427,13 +427,13 @@ function fill_send_buffer!(::ClimaComms.CUDADevice, dss_buffer::DSSBuffer) if nsend > 0 nitems = nsend * nlevels * nfid nthreads, nblocks = _configure_threadblock(nitems) - CUDA.synchronize() # CUDA MPI uses a separate stream. This will synchronize across streams + CUDA.synchronize(;blocking=true) # CUDA MPI uses a separate stream. This will synchronize across streams @cuda threads = (nthreads) blocks = (nblocks) fill_send_buffer_kernel!( send_data, send_buf_idx, pperimeter_data, ) - CUDA.synchronize() # CUDA MPI uses a separate stream. This will synchronize across streams + CUDA.synchronize(;blocking=true) # CUDA MPI uses a separate stream. This will synchronize across streams end return nothing end @@ -468,13 +468,13 @@ function load_from_recv_buffer!(::ClimaComms.CUDADevice, dss_buffer::DSSBuffer) if nrecv > 0 nitems = nrecv * nlevels * nfid nthreads, nblocks = _configure_threadblock(nitems) - CUDA.synchronize() + CUDA.synchronize(;blocking=true) @cuda threads = (nthreads) blocks = (nblocks) load_from_recv_buffer_kernel!( pperimeter_data, recv_data, recv_buf_idx, ) - CUDA.synchronize() + CUDA.synchronize(;blocking=true) end return nothing end From 5c7da33c92e9f25ad44b7bad09700fc8ac5835b2 Mon Sep 17 00:00:00 2001 From: Simon Byrne Date: Thu, 19 Oct 2023 14:41:30 -0700 Subject: [PATCH 2/2] remove unnecessary synchronizations --- src/Spaces/dss_cuda.jl | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/Spaces/dss_cuda.jl b/src/Spaces/dss_cuda.jl index 7afaa5e948..058186c344 100644 --- a/src/Spaces/dss_cuda.jl +++ b/src/Spaces/dss_cuda.jl @@ -427,13 +427,12 @@ function fill_send_buffer!(::ClimaComms.CUDADevice, dss_buffer::DSSBuffer) if nsend > 0 nitems = nsend * nlevels * nfid nthreads, nblocks = _configure_threadblock(nitems) - CUDA.synchronize(;blocking=true) # CUDA MPI uses a separate stream. This will synchronize across streams @cuda threads = (nthreads) blocks = (nblocks) fill_send_buffer_kernel!( send_data, send_buf_idx, pperimeter_data, ) - CUDA.synchronize(;blocking=true) # CUDA MPI uses a separate stream. This will synchronize across streams + CUDA.synchronize(; blocking = true) # CUDA MPI uses a separate stream. This will synchronize across streams end return nothing end @@ -468,13 +467,11 @@ function load_from_recv_buffer!(::ClimaComms.CUDADevice, dss_buffer::DSSBuffer) if nrecv > 0 nitems = nrecv * nlevels * nfid nthreads, nblocks = _configure_threadblock(nitems) - CUDA.synchronize(;blocking=true) @cuda threads = (nthreads) blocks = (nblocks) load_from_recv_buffer_kernel!( pperimeter_data, recv_data, recv_buf_idx, ) - CUDA.synchronize(;blocking=true) end return nothing end