Add GPU impl of QuasiMonotoneLimiter

wip
CliMA · Nov 14, 2023 · 3b43d09 · 3b43d09
1 parent ac8d329
commit 3b43d09
Show file tree

Hide file tree

Showing 3 changed files with 274 additions and 26 deletions.
diff --git a/src/Limiters/quasimonotone.jl b/src/Limiters/quasimonotone.jl
@@ -106,7 +106,71 @@ function compute_element_bounds!(
     ρq,
     ρ,
     ::ClimaComms.CUDADevice,
-) end
+)
+    S = size(Fields.field_values(ρ))
+    (Ni, Nj, _, Nv, Nh) = S
+    nthreads, nblocks = config_threadblock(Nv, Nh)
+
+    CUDA.@cuda threads = nthreads blocks = nblocks compute_element_bounds_kernel!(
+        limiter,
+        Fields.field_values(Operators.strip_space(ρq, axes(ρq))),
+        Fields.field_values(Operators.strip_space(ρ, axes(ρ))),
+        Nv,
+        Nh,
+        Val(Ni),
+        Val(Nj),
+    )
+    return nothing
+end
+
+function config_threadblock(Nv, Nh)
+    nitems = Nv * Nh
+    nthreads = min(256, nitems)
+    nblocks = cld(nitems, nthreads)
+    return (nthreads, nblocks)
+end
+
+function get_hv(Nv, Nh, blockIdx, threadIdx, blockDim, gridDim)
+    tidx = (blockIdx.x - 1) * blockDim.x + threadIdx.x
+    (h, v) = CartesianIndices((1:Nh, 1:Nv))[tidx].I
+    # @cuprintln("Nv,Nh,v,h=($Nv, $Nh,$v,$h)")
+    return (h, v)
+end
+
+function compute_element_bounds_kernel!(
+    limiter,
+    ρq,
+    ρ,
+    Nv,
+    Nh,
+    ::Val{Ni},
+    ::Val{Nj},
+) where {Ni, Nj}
+    (h, v) = get_hv(Nv, Nh, blockIdx(), threadIdx(), blockDim(), gridDim())
+    if h ≤ Nh && v ≤ Nv
+        (; q_bounds) = limiter
+        local q_min, q_max
+        slab_ρq = slab(ρq, v, h)
+        slab_ρ = slab(ρ, v, h)
+        for j in 1:Nj
+            for i in 1:Ni
+                q = rdiv(slab_ρq[i, j], slab_ρ[i, j])
+                if i == 1 && j == 1
+                    q_min = q
+                    q_max = q
+                else
+                    q_min = rmin(q_min, q)
+                    q_max = rmax(q_max, q)
+                end
+            end
+        end
+        slab_q_bounds = slab(q_bounds, v, h)
+        slab_q_bounds[1] = q_min
+        slab_q_bounds[2] = q_max
+    end
+    return nothing
+end
+
 
 function compute_element_bounds!(
     limiter::QuasiMonotoneLimiter,
@@ -158,7 +222,50 @@ function compute_neighbor_bounds_local!(
     limiter::QuasiMonotoneLimiter,
     ρ,
     ::ClimaComms.CUDADevice,
-) end
+)
+    topology = Spaces.topology(axes(ρ))
+    Ni, Nj, _, Nv, Nh = size(Fields.field_values(ρ))
+    nthreads, nblocks = config_threadblock(Nv, Nh)
+    CUDA.@cuda threads = nthreads blocks = nblocks compute_neighbor_bounds_local_kernel!(
+        limiter,
+        topology.local_neighbor_elem,
+        topology.local_neighbor_elem_offset,
+        Nv,
+        Nh,
+        Val(Ni),
+        Val(Nj),
+    )
+end
+
+function compute_neighbor_bounds_local_kernel!(
+    limiter,
+    local_neighbor_elem,
+    local_neighbor_elem_offset,
+    Nv,
+    Nh,
+    ::Val{Ni},
+    ::Val{Nj},
+) where {Ni, Nj}
+
+    (h, v) = get_hv(Nv, Nh, blockIdx(), threadIdx(), blockDim(), gridDim())
+    if h ≤ Nh && v ≤ Nv
+        (; q_bounds, q_bounds_nbr, ghost_buffer, rtol) = limiter
+        slab_q_bounds = slab(q_bounds, v, h)
+        q_min = slab_q_bounds[1]
+        q_max = slab_q_bounds[2]
+        for lne in
+            local_neighbor_elem_offset[h]:(local_neighbor_elem_offset[h + 1] - 1)
+            h_nbr = local_neighbor_elem[lne]
+            slab_q_bounds = slab(q_bounds, v, h_nbr)
+            q_min = rmin(q_min, slab_q_bounds[1])
+            q_max = rmax(q_max, slab_q_bounds[2])
+        end
+        slab_q_bounds_nbr = slab(q_bounds_nbr, v, h)
+        slab_q_bounds_nbr[1] = q_min
+        slab_q_bounds_nbr[2] = q_max
+    end
+    return nothing
+end
 
 function compute_neighbor_bounds_local!(
     limiter::QuasiMonotoneLimiter,
@@ -279,7 +386,153 @@ function apply_limiter!(
     ρ::Fields.Field,
     limiter::QuasiMonotoneLimiter,
     ::ClimaComms.CUDADevice,
-) end
+)
+    ρq_data = Fields.field_values(ρq)
+    (Ni, Nj, _, Nv, Nh) = size(ρq_data)
+    Nf = DataLayouts.ncomponents(ρq_data)
+    maxiter = Ni * Nj
+    WJ = Spaces.local_geometry_data(axes(ρq)).WJ
+    nthreads, nblocks = config_threadblock(Nv, Nh)
+    CUDA.@cuda threads = nthreads blocks = nblocks apply_limiter_kernel!(
+        limiter,
+        Fields.field_values(Operators.strip_space(ρq, axes(ρq))),
+        Fields.field_values(Operators.strip_space(ρ, axes(ρ))),
+        WJ,
+        Nv,
+        Nh,
+        Val(Nf),
+        Val(Ni),
+        Val(Nj),
+        Val(maxiter),
+    )
+    return nothing
+end
+
+function apply_limiter_kernel!(
+    limiter::QuasiMonotoneLimiter,
+    ρq_data,
+    ρ_data,
+    WJ_data,
+    Nv,
+    Nh,
+    ::Val{Nf},
+    ::Val{Ni},
+    ::Val{Nj},
+    ::Val{maxiter},
+) where {Nf, Ni, Nj, maxiter}
+    (; q_bounds_nbr, rtol) = limiter
+    converged = true
+    (h, v) = get_hv(Nv, Nh, blockIdx(), threadIdx(), blockDim(), gridDim())
+    if h ≤ Nh && v ≤ Nv
+
+        slab_ρ = slab(ρ_data, v, h)
+        slab_ρq = slab(ρq_data, v, h)
+        slab_WJ = slab(WJ_data, v, h)
+        slab_q_bounds = slab(q_bounds_nbr, v, h)
+
+        array_ρq = parent(slab_ρq)
+        array_ρ = parent(slab_ρ)
+        array_w = parent(slab_WJ)
+        array_q_bounds = parent(slab_q_bounds)
+
+        # 1) compute ∫ρ
+        total_mass = zero(eltype(array_ρ))
+        for j in 1:Nj, i in 1:Ni
+            total_mass += array_ρ[i, j, 1] * array_w[i, j, 1]
+        end
+
+        @assert total_mass > 0
+
+        converged = true
+        for f in 1:Nf
+            q_min = array_q_bounds[1, f]
+            q_max = array_q_bounds[2, f]
+
+            # 2) compute ∫ρq
+            tracer_mass = zero(eltype(array_ρq))
+            for j in 1:Nj, i in 1:Ni
+                tracer_mass += array_ρq[i, j, f] * array_w[i, j, 1]
+            end
+
+            # TODO: Should this condition be enforced? (It isn't in HOMME.)
+            # @assert tracer_mass >= 0
+
+            # 3) set bounds
+            q_avg = tracer_mass / total_mass
+            q_min = min(q_min, q_avg)
+            q_max = max(q_max, q_avg)
+
+            # 3) modify ρq
+            for iter in 1:maxiter
+                Δtracer_mass = zero(eltype(array_ρq))
+                for j in 1:Nj, i in 1:Ni
+                    ρ = array_ρ[i, j, 1]
+                    ρq = array_ρq[i, j, f]
+                    ρq_max = ρ * q_max
+                    ρq_min = ρ * q_min
+                    w = array_w[i, j]
+                    if ρq > ρq_max
+                        Δtracer_mass += (ρq - ρq_max) * w
+                        array_ρq[i, j, f] = ρq_max
+                    elseif ρq < ρq_min
+                        Δtracer_mass += (ρq - ρq_min) * w
+                        array_ρq[i, j, f] = ρq_min
+                    end
+                end
+
+                if abs(Δtracer_mass) <= rtol * abs(tracer_mass)
+                    break
+                end
+
+                if Δtracer_mass > 0 # add mass
+                    total_mass_at_Δ_points = zero(eltype(array_ρ))
+                    for j in 1:Nj, i in 1:Ni
+                        ρ = array_ρ[i, j, 1]
+                        ρq = array_ρq[i, j, f]
+                        w = array_w[i, j]
+                        if ρq < ρ * q_max
+                            total_mass_at_Δ_points += ρ * w
+                        end
+                    end
+                    Δq_at_Δ_points = Δtracer_mass / total_mass_at_Δ_points
+                    for j in 1:Nj, i in 1:Ni
+                        ρ = array_ρ[i, j, 1]
+                        ρq = array_ρq[i, j, f]
+                        if ρq < ρ * q_max
+                            array_ρq[i, j, f] += ρ * Δq_at_Δ_points
+                        end
+                    end
+                else # remove mass
+                    total_mass_at_Δ_points = zero(eltype(array_ρ))
+                    for j in 1:Nj, i in 1:Ni
+                        ρ = array_ρ[i, j, 1]
+                        ρq = array_ρq[i, j, f]
+                        w = array_w[i, j]
+                        if ρq > ρ * q_min
+                            total_mass_at_Δ_points += ρ * w
+                        end
+                    end
+                    Δq_at_Δ_points = Δtracer_mass / total_mass_at_Δ_points
+                    for j in 1:Nj, i in 1:Ni
+                        ρ = array_ρ[i, j, 1]
+                        ρq = array_ρq[i, j, f]
+                        if ρq > ρ * q_min
+                            array_ρq[i, j, f] += ρ * Δq_at_Δ_points
+                        end
+                    end
+                end
+
+                if iter == maxiter
+                    converged = false
+                end
+            end
+        end
+
+    end
+    # converged || @warn "Limiter failed to converge with rtol = $rtol"
+
+    return nothing
+end
 
 function apply_limiter!(
     ρq::Fields.Field,

diff --git a/src/Topologies/topology2d.jl b/src/Topologies/topology2d.jl
@@ -34,6 +34,7 @@ struct Topology2D{
     LVO,
     GV,
     GVO,
+    VI,
     BF,
     RGV,
 } <: AbstractDistributedTopology
@@ -81,9 +82,9 @@ struct Topology2D{
     ghost_vertex_offset::GVO
 
     "a vector of the lidx of neighboring local elements of each element"
-    local_neighbor_elem::Vector{Int}
+    local_neighbor_elem::VI
     "the index into `local_neighbor_elem` for the start of each element"
-    local_neighbor_elem_offset::Vector{Int}
+    local_neighbor_elem_offset::VI
     "a vector of the ridx of neighboring ghost elements of each element"
     ghost_neighbor_elem::Vector{Int}
     "the index into `ghost_neighbor_elem` for the start of each element"
@@ -503,8 +504,8 @@ function Topology2D(
         DA(local_vertex_offset),
         DA(ghost_vertices),
         DA(ghost_vertex_offset),
-        local_neighbor_elem,
-        local_neighbor_elem_offset,
+        DA(local_neighbor_elem),
+        DA(local_neighbor_elem_offset),
         ghost_neighbor_elem,
         ghost_neighbor_elem_offset,
         boundaries,

diff --git a/test/Limiters/limiter.jl b/test/Limiters/limiter.jl
@@ -128,30 +128,25 @@ end
         limiter = Limiters.QuasiMonotoneLimiter(ρq)
         Limiters.compute_element_bounds!(limiter, ρq, ρ)
 
-        is_gpu = device isa ClimaComms.CUDADevice
         S = map(Iterators.product(1:n1, 1:n2)) do (h1, h2)
             (h1, h2, slab(limiter.q_bounds, h1 + n1 * (h2 - 1)))
         end
         CUDA.@allowscalar begin
-            @test all(map(T -> T[3][1].x ≈ 2 * (T[1] - 1), S)) broken = is_gpu # q_min
-            @test all(map(T -> T[3][1].y ≈ 3 * (T[2] - 1), S)) broken = is_gpu # q_min
-            @test all(map(T -> T[3][2].x ≈ 2 * T[1], S)) broken = is_gpu # q_max
-            @test all(map(T -> T[3][2].y ≈ 3 * T[2], S)) broken = is_gpu # q_max
+            @test all(map(T -> T[3][1].x ≈ 2 * (T[1] - 1), S)) # q_min
+            @test all(map(T -> T[3][1].y ≈ 3 * (T[2] - 1), S)) # q_min
+            @test all(map(T -> T[3][2].x ≈ 2 * T[1], S)) # q_max
+            @test all(map(T -> T[3][2].y ≈ 3 * T[2], S)) # q_max
         end
 
         Limiters.compute_neighbor_bounds_local!(limiter, ρ)
         SN = map(Iterators.product(1:n1, 1:n2)) do (h1, h2)
             (h1, h2, slab(limiter.q_bounds_nbr, h1 + n1 * (h2 - 1)))
         end
         CUDA.@allowscalar begin
-            @test all(map(T -> T[3][1].x ≈ 2 * max(T[1] - 2, 0), SN)) broken =
-                is_gpu # q_min
-            @test all(map(T -> T[3][1].y ≈ 3 * max(T[2] - 2, 0), SN)) broken =
-                is_gpu # q_min
-            @test all(map(T -> T[3][2].x ≈ 2 * min(T[1] + 1, n1), SN)) broken =
-                is_gpu # q_max
-            @test all(map(T -> T[3][2].y ≈ 3 * min(T[2] + 1, n2), SN)) broken =
-                is_gpu # q_max
+            @test all(map(T -> T[3][1].x ≈ 2 * max(T[1] - 2, 0), SN))  # q_min
+            @test all(map(T -> T[3][1].y ≈ 3 * max(T[2] - 2, 0), SN))  # q_min
+            @test all(map(T -> T[3][2].x ≈ 2 * min(T[1] + 1, n1), SN))  # q_max
+            @test all(map(T -> T[3][2].y ≈ 3 * min(T[2] + 1, n2), SN))  # q_max
         end
     end
 end
@@ -216,8 +211,7 @@ end
         Limiters.apply_limiter!(ρq, ρ, limiter)
 
         @test Array(parent(ρq))[:, :, 1, 1] ≈
-              [FT(0.0) FT(0.0); FT(0.950005) FT(0.950005)] rtol = 10eps(FT) broken =
-            gpu_broken
+              [FT(0.0) FT(0.0); FT(0.950005) FT(0.950005)] rtol = 10eps(FT)
         # Check mass conservation after application of limiter
         @test sum(ρq) ≈ initial_Q_mass rtol = 10eps(FT)
     end
@@ -228,7 +222,6 @@ end
     Nij = 5
     device = ClimaComms.device()
     comms_ctx = ClimaComms.SingletonCommsContext(device)
-    is_gpu = device isa ClimaComms.CUDADevice
 
     for FT in (Float64, Float32)
         space =
@@ -260,7 +253,8 @@ end
 
         @test sum(ρq.x) ≈ total_ρq.x
         @test sum(ρq.y) ≈ total_ρq.y
-        @test all(FT(0) .<= Array(parent(ρq)) .<= FT(1)) broken = is_gpu
+        @test maximum(Array(parent(ρq))) ≈ 1 rtol = eps(FT)
+        @test minimum(Array(parent(ρq))) ≈ 0 rtol = eps(FT)
     end
 end
 
@@ -271,7 +265,6 @@ end
     n3 = 3
     device = ClimaComms.device()
     comms_ctx = ClimaComms.SingletonCommsContext(device)
-    is_gpu = device isa ClimaComms.CUDADevice
 
     for FT in (Float64, Float32)
         horzspace, hv_center_space, hv_face_space = hvspace_3D(
@@ -315,6 +308,7 @@ end
 
         @test sum(ρq.x) ≈ total_ρq.x
         @test sum(ρq.y) ≈ total_ρq.y
-        @test all(FT(0) .<= Array(parent(ρq)) .<= FT(1)) broken = is_gpu
+        @test maximum(Array(parent(ρq))) ≤ 1
+        @test minimum(Array(parent(ρq))) ≥ 0
     end
 end