Skip to content

Commit

Permalink
Add GPU impl of QuasiMonotoneLimiter
Browse files Browse the repository at this point in the history
wip
  • Loading branch information
charleskawczynski committed Nov 14, 2023
1 parent ac8d329 commit 3b43d09
Show file tree
Hide file tree
Showing 3 changed files with 274 additions and 26 deletions.
259 changes: 256 additions & 3 deletions src/Limiters/quasimonotone.jl
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,71 @@ function compute_element_bounds!(
ρq,
ρ,
::ClimaComms.CUDADevice,
) end
)
S = size(Fields.field_values(ρ))
(Ni, Nj, _, Nv, Nh) = S
nthreads, nblocks = config_threadblock(Nv, Nh)

CUDA.@cuda threads = nthreads blocks = nblocks compute_element_bounds_kernel!(
limiter,
Fields.field_values(Operators.strip_space(ρq, axes(ρq))),
Fields.field_values(Operators.strip_space(ρ, axes(ρ))),
Nv,
Nh,
Val(Ni),
Val(Nj),
)
return nothing
end

function config_threadblock(Nv, Nh)
nitems = Nv * Nh
nthreads = min(256, nitems)
nblocks = cld(nitems, nthreads)
return (nthreads, nblocks)
end

function get_hv(Nv, Nh, blockIdx, threadIdx, blockDim, gridDim)
tidx = (blockIdx.x - 1) * blockDim.x + threadIdx.x
(h, v) = CartesianIndices((1:Nh, 1:Nv))[tidx].I
# @cuprintln("Nv,Nh,v,h=($Nv, $Nh,$v,$h)")
return (h, v)
end

function compute_element_bounds_kernel!(
limiter,
ρq,
ρ,
Nv,
Nh,
::Val{Ni},
::Val{Nj},
) where {Ni, Nj}
(h, v) = get_hv(Nv, Nh, blockIdx(), threadIdx(), blockDim(), gridDim())
if h Nh && v Nv
(; q_bounds) = limiter
local q_min, q_max
slab_ρq = slab(ρq, v, h)
slab_ρ = slab(ρ, v, h)
for j in 1:Nj
for i in 1:Ni
q = rdiv(slab_ρq[i, j], slab_ρ[i, j])
if i == 1 && j == 1
q_min = q
q_max = q
else
q_min = rmin(q_min, q)
q_max = rmax(q_max, q)
end
end
end
slab_q_bounds = slab(q_bounds, v, h)
slab_q_bounds[1] = q_min
slab_q_bounds[2] = q_max
end
return nothing
end


function compute_element_bounds!(
limiter::QuasiMonotoneLimiter,
Expand Down Expand Up @@ -158,7 +222,50 @@ function compute_neighbor_bounds_local!(
limiter::QuasiMonotoneLimiter,
ρ,
::ClimaComms.CUDADevice,
) end
)
topology = Spaces.topology(axes(ρ))
Ni, Nj, _, Nv, Nh = size(Fields.field_values(ρ))
nthreads, nblocks = config_threadblock(Nv, Nh)
CUDA.@cuda threads = nthreads blocks = nblocks compute_neighbor_bounds_local_kernel!(
limiter,
topology.local_neighbor_elem,
topology.local_neighbor_elem_offset,
Nv,
Nh,
Val(Ni),
Val(Nj),
)
end

function compute_neighbor_bounds_local_kernel!(
limiter,
local_neighbor_elem,
local_neighbor_elem_offset,
Nv,
Nh,
::Val{Ni},
::Val{Nj},
) where {Ni, Nj}

(h, v) = get_hv(Nv, Nh, blockIdx(), threadIdx(), blockDim(), gridDim())
if h Nh && v Nv
(; q_bounds, q_bounds_nbr, ghost_buffer, rtol) = limiter
slab_q_bounds = slab(q_bounds, v, h)
q_min = slab_q_bounds[1]
q_max = slab_q_bounds[2]
for lne in
local_neighbor_elem_offset[h]:(local_neighbor_elem_offset[h + 1] - 1)
h_nbr = local_neighbor_elem[lne]
slab_q_bounds = slab(q_bounds, v, h_nbr)
q_min = rmin(q_min, slab_q_bounds[1])
q_max = rmax(q_max, slab_q_bounds[2])
end
slab_q_bounds_nbr = slab(q_bounds_nbr, v, h)
slab_q_bounds_nbr[1] = q_min
slab_q_bounds_nbr[2] = q_max
end
return nothing
end

function compute_neighbor_bounds_local!(
limiter::QuasiMonotoneLimiter,
Expand Down Expand Up @@ -279,7 +386,153 @@ function apply_limiter!(
ρ::Fields.Field,
limiter::QuasiMonotoneLimiter,
::ClimaComms.CUDADevice,
) end
)
ρq_data = Fields.field_values(ρq)
(Ni, Nj, _, Nv, Nh) = size(ρq_data)
Nf = DataLayouts.ncomponents(ρq_data)
maxiter = Ni * Nj
WJ = Spaces.local_geometry_data(axes(ρq)).WJ
nthreads, nblocks = config_threadblock(Nv, Nh)
CUDA.@cuda threads = nthreads blocks = nblocks apply_limiter_kernel!(
limiter,
Fields.field_values(Operators.strip_space(ρq, axes(ρq))),
Fields.field_values(Operators.strip_space(ρ, axes(ρ))),
WJ,
Nv,
Nh,
Val(Nf),
Val(Ni),
Val(Nj),
Val(maxiter),
)
return nothing
end

function apply_limiter_kernel!(
limiter::QuasiMonotoneLimiter,
ρq_data,
ρ_data,
WJ_data,
Nv,
Nh,
::Val{Nf},
::Val{Ni},
::Val{Nj},
::Val{maxiter},
) where {Nf, Ni, Nj, maxiter}
(; q_bounds_nbr, rtol) = limiter
converged = true
(h, v) = get_hv(Nv, Nh, blockIdx(), threadIdx(), blockDim(), gridDim())
if h Nh && v Nv

slab_ρ = slab(ρ_data, v, h)
slab_ρq = slab(ρq_data, v, h)
slab_WJ = slab(WJ_data, v, h)
slab_q_bounds = slab(q_bounds_nbr, v, h)

array_ρq = parent(slab_ρq)
array_ρ = parent(slab_ρ)
array_w = parent(slab_WJ)
array_q_bounds = parent(slab_q_bounds)

# 1) compute ∫ρ
total_mass = zero(eltype(array_ρ))
for j in 1:Nj, i in 1:Ni
total_mass += array_ρ[i, j, 1] * array_w[i, j, 1]
end

@assert total_mass > 0

converged = true
for f in 1:Nf
q_min = array_q_bounds[1, f]
q_max = array_q_bounds[2, f]

# 2) compute ∫ρq
tracer_mass = zero(eltype(array_ρq))
for j in 1:Nj, i in 1:Ni
tracer_mass += array_ρq[i, j, f] * array_w[i, j, 1]
end

# TODO: Should this condition be enforced? (It isn't in HOMME.)
# @assert tracer_mass >= 0

# 3) set bounds
q_avg = tracer_mass / total_mass
q_min = min(q_min, q_avg)
q_max = max(q_max, q_avg)

# 3) modify ρq
for iter in 1:maxiter
Δtracer_mass = zero(eltype(array_ρq))
for j in 1:Nj, i in 1:Ni
ρ = array_ρ[i, j, 1]
ρq = array_ρq[i, j, f]
ρq_max = ρ * q_max
ρq_min = ρ * q_min
w = array_w[i, j]
if ρq > ρq_max
Δtracer_mass += (ρq - ρq_max) * w
array_ρq[i, j, f] = ρq_max
elseif ρq < ρq_min
Δtracer_mass += (ρq - ρq_min) * w
array_ρq[i, j, f] = ρq_min
end
end

if abs(Δtracer_mass) <= rtol * abs(tracer_mass)
break
end

if Δtracer_mass > 0 # add mass
total_mass_at_Δ_points = zero(eltype(array_ρ))
for j in 1:Nj, i in 1:Ni
ρ = array_ρ[i, j, 1]
ρq = array_ρq[i, j, f]
w = array_w[i, j]
if ρq < ρ * q_max
total_mass_at_Δ_points += ρ * w
end
end
Δq_at_Δ_points = Δtracer_mass / total_mass_at_Δ_points
for j in 1:Nj, i in 1:Ni
ρ = array_ρ[i, j, 1]
ρq = array_ρq[i, j, f]
if ρq < ρ * q_max
array_ρq[i, j, f] += ρ * Δq_at_Δ_points
end
end
else # remove mass
total_mass_at_Δ_points = zero(eltype(array_ρ))
for j in 1:Nj, i in 1:Ni
ρ = array_ρ[i, j, 1]
ρq = array_ρq[i, j, f]
w = array_w[i, j]
if ρq > ρ * q_min
total_mass_at_Δ_points += ρ * w
end
end
Δq_at_Δ_points = Δtracer_mass / total_mass_at_Δ_points
for j in 1:Nj, i in 1:Ni
ρ = array_ρ[i, j, 1]
ρq = array_ρq[i, j, f]
if ρq > ρ * q_min
array_ρq[i, j, f] += ρ * Δq_at_Δ_points
end
end
end

if iter == maxiter
converged = false
end
end
end

end
# converged || @warn "Limiter failed to converge with rtol = $rtol"

return nothing
end

function apply_limiter!(
ρq::Fields.Field,
Expand Down
9 changes: 5 additions & 4 deletions src/Topologies/topology2d.jl
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ struct Topology2D{
LVO,
GV,
GVO,
VI,
BF,
RGV,
} <: AbstractDistributedTopology
Expand Down Expand Up @@ -81,9 +82,9 @@ struct Topology2D{
ghost_vertex_offset::GVO

"a vector of the lidx of neighboring local elements of each element"
local_neighbor_elem::Vector{Int}
local_neighbor_elem::VI
"the index into `local_neighbor_elem` for the start of each element"
local_neighbor_elem_offset::Vector{Int}
local_neighbor_elem_offset::VI
"a vector of the ridx of neighboring ghost elements of each element"
ghost_neighbor_elem::Vector{Int}
"the index into `ghost_neighbor_elem` for the start of each element"
Expand Down Expand Up @@ -503,8 +504,8 @@ function Topology2D(
DA(local_vertex_offset),
DA(ghost_vertices),
DA(ghost_vertex_offset),
local_neighbor_elem,
local_neighbor_elem_offset,
DA(local_neighbor_elem),
DA(local_neighbor_elem_offset),
ghost_neighbor_elem,
ghost_neighbor_elem_offset,
boundaries,
Expand Down
32 changes: 13 additions & 19 deletions test/Limiters/limiter.jl
Original file line number Diff line number Diff line change
Expand Up @@ -128,30 +128,25 @@ end
limiter = Limiters.QuasiMonotoneLimiter(ρq)
Limiters.compute_element_bounds!(limiter, ρq, ρ)

is_gpu = device isa ClimaComms.CUDADevice
S = map(Iterators.product(1:n1, 1:n2)) do (h1, h2)
(h1, h2, slab(limiter.q_bounds, h1 + n1 * (h2 - 1)))
end
CUDA.@allowscalar begin
@test all(map(T -> T[3][1].x 2 * (T[1] - 1), S)) broken = is_gpu # q_min
@test all(map(T -> T[3][1].y 3 * (T[2] - 1), S)) broken = is_gpu # q_min
@test all(map(T -> T[3][2].x 2 * T[1], S)) broken = is_gpu # q_max
@test all(map(T -> T[3][2].y 3 * T[2], S)) broken = is_gpu # q_max
@test all(map(T -> T[3][1].x 2 * (T[1] - 1), S)) # q_min
@test all(map(T -> T[3][1].y 3 * (T[2] - 1), S)) # q_min
@test all(map(T -> T[3][2].x 2 * T[1], S)) # q_max
@test all(map(T -> T[3][2].y 3 * T[2], S)) # q_max
end

Limiters.compute_neighbor_bounds_local!(limiter, ρ)
SN = map(Iterators.product(1:n1, 1:n2)) do (h1, h2)
(h1, h2, slab(limiter.q_bounds_nbr, h1 + n1 * (h2 - 1)))
end
CUDA.@allowscalar begin
@test all(map(T -> T[3][1].x 2 * max(T[1] - 2, 0), SN)) broken =
is_gpu # q_min
@test all(map(T -> T[3][1].y 3 * max(T[2] - 2, 0), SN)) broken =
is_gpu # q_min
@test all(map(T -> T[3][2].x 2 * min(T[1] + 1, n1), SN)) broken =
is_gpu # q_max
@test all(map(T -> T[3][2].y 3 * min(T[2] + 1, n2), SN)) broken =
is_gpu # q_max
@test all(map(T -> T[3][1].x 2 * max(T[1] - 2, 0), SN)) # q_min
@test all(map(T -> T[3][1].y 3 * max(T[2] - 2, 0), SN)) # q_min
@test all(map(T -> T[3][2].x 2 * min(T[1] + 1, n1), SN)) # q_max
@test all(map(T -> T[3][2].y 3 * min(T[2] + 1, n2), SN)) # q_max
end
end
end
Expand Down Expand Up @@ -216,8 +211,7 @@ end
Limiters.apply_limiter!(ρq, ρ, limiter)

@test Array(parent(ρq))[:, :, 1, 1]
[FT(0.0) FT(0.0); FT(0.950005) FT(0.950005)] rtol = 10eps(FT) broken =
gpu_broken
[FT(0.0) FT(0.0); FT(0.950005) FT(0.950005)] rtol = 10eps(FT)
# Check mass conservation after application of limiter
@test sum(ρq) initial_Q_mass rtol = 10eps(FT)
end
Expand All @@ -228,7 +222,6 @@ end
Nij = 5
device = ClimaComms.device()
comms_ctx = ClimaComms.SingletonCommsContext(device)
is_gpu = device isa ClimaComms.CUDADevice

for FT in (Float64, Float32)
space =
Expand Down Expand Up @@ -260,7 +253,8 @@ end

@test sum(ρq.x) total_ρq.x
@test sum(ρq.y) total_ρq.y
@test all(FT(0) .<= Array(parent(ρq)) .<= FT(1)) broken = is_gpu
@test maximum(Array(parent(ρq))) 1 rtol = eps(FT)
@test minimum(Array(parent(ρq))) 0 rtol = eps(FT)
end
end

Expand All @@ -271,7 +265,6 @@ end
n3 = 3
device = ClimaComms.device()
comms_ctx = ClimaComms.SingletonCommsContext(device)
is_gpu = device isa ClimaComms.CUDADevice

for FT in (Float64, Float32)
horzspace, hv_center_space, hv_face_space = hvspace_3D(
Expand Down Expand Up @@ -315,6 +308,7 @@ end

@test sum(ρq.x) total_ρq.x
@test sum(ρq.y) total_ρq.y
@test all(FT(0) .<= Array(parent(ρq)) .<= FT(1)) broken = is_gpu
@test maximum(Array(parent(ρq))) 1
@test minimum(Array(parent(ρq))) 0
end
end

0 comments on commit 3b43d09

Please sign in to comment.