diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml
deleted file mode 100644
index 7935393..0000000
--- a/.github/workflows/CompatHelper.yml
+++ /dev/null
@@ -1,16 +0,0 @@
-name: CompatHelper
-on:
-  schedule:
-    - cron: 0 0 0 * *
-  workflow_dispatch:
-jobs:
-  CompatHelper:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Pkg.add("CompatHelper")
-        run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
-      - name: CompatHelper.main()
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }}
-        run: julia -e 'using CompatHelper; CompatHelper.main()'
diff --git a/docs/src/details.md b/docs/src/details.md
index 51abb9d..8306ef5 100644
--- a/docs/src/details.md
+++ b/docs/src/details.md
@@ -46,6 +46,13 @@ J. Monaghan, Smoothed Particle Hydrodynamics, “Annual Review of Astronomy and
 
 ### Density diffusion term
 
+```math
+
+\frac{\partial \rho_i}{\partial t} = \sum m_j \textbf{v}_{ij} \cdot \nabla_i W_{ij} + \delta_{\Phi} h c_0 \sum \Psi_{ij} \cdot \nabla_i W_{ij} \frac{m_j}{\rho_j}
+
+
+```
+
 ### XSPH correction
 
diff --git a/examples/profile.jl b/examples/profile.jl
index 398e499..46a3f1d 100644
--- a/examples/profile.jl
+++ b/examples/profile.jl
@@ -42,9 +42,11 @@ sphprob.dpc_l₀ = 0.01
 s = 0.01
 
 stepsolve!(sphprob, 1)
 
-@profile stepsolve!(sphprob, 10000)
+@benchmark stepsolve!(sphprob, 1000)
 
-@profile timesolve!(sphprob; batch = 300, timeframe = 0.2)
+@profile stepsolve!(sphprob, 100)
+
+@profile timesolve!(sphprob; batch = 300, timeframe = 0.001)
 
 pprof()
 
diff --git a/src/celllist.jl b/src/celllist.jl
index 1e08ca2..a34eb86 100644
--- a/src/celllist.jl
+++ b/src/celllist.jl
@@ -100,7 +100,7 @@ end
 
 Full update cell grid.
 """
-@noinline function update!(c::GPUCellList)
+@noinline function update!(c::GPUCellList, fillzero::Bool = true)
     #cellmap_2d!(c.pcell, c.points, (c.cs[2], c.cs[2]), c.offset)
     fill!(c.cellpnum, zero(Int32))
@@ -126,7 +126,7 @@ Full update cell grid.
     if c.pairsn > length(c.pairs) || c.pairsn < length(c.pairs) * 0.6 # if current number of pairs more than pair list or too small - then resize
         CUDA.unsafe_free!(c.pairs)
         c.pairs = CUDA.fill((zero(Int32), zero(Int32)), Int(ceil(c.pairsn/0.8)))
-    else
+    elseif fillzero
         fill!(c.pairs, (zero(Int32), zero(Int32)))
     end
@@ -153,13 +153,13 @@ Full update cell grid.
 end
 
 """
-    partialupdate!(c::GPUCellList)
+    partialupdate!(c::GPUCellList, fillzero::Bool = true)
 
 Update only distance
 """
-@noinline function partialupdate!(c::GPUCellList)
+@noinline function partialupdate!(c::GPUCellList, fillzero::Bool = true)
     fill!(c.cnt, zero(Int32))
-    fill!(c.pairs, (zero(Int32), zero(Int32)))
+    if fillzero fill!(c.pairs, (zero(Int32), zero(Int32))) end
     neib_internal_2d!(c.pairs, c.cnt, c.cellpnum, c.points, c.celllist, c.dist)
     neib_external_2d!(c.pairs, c.cnt, c.cellpnum, c.points, c.celllist, (1, -1), c.dist)
     neib_external_2d!(c.pairs, c.cnt, c.cellpnum, c.points, c.celllist, (0, 1), c.dist)
@@ -177,6 +177,16 @@ function neighborlist(c::GPUCellList)
     c.pairs
 end
 
+"""
+    neighborlistview(c::GPUCellList)
+
+List of pairs with distance (a view of the first `pairsn` entries of `pairs`, without copying).
+"""
+function neighborlistview(c::GPUCellList)
+    view(c.pairs, 1:c.pairsn)
+end
+
+
 function Base.show(io::IO, c::GPUCellList)
     println(io, "  Cell List")
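The celllist.jl changes above add a `fillzero` switch and a copy-free `neighborlistview`. A minimal usage sketch follows (hedged: the `GPUCellList` value `system` is assumed to exist already, as it does in `SPHProblem.system`; only `update!`, `partialupdate!`, `neighborlistview` and the `pairsn` counter come from the code above):

```julia
# Sketch only - assumes `system::GPUCellList` was constructed elsewhere.
update!(system)                    # full grid rebuild; with fillzero = true (default)
                                   # the pair buffer is zero-filled when it is not resized
prs = neighborlistview(system)     # view of the first `pairsn` entries of `system.pairs`, no copy

# assumes the fillzero argument is threaded through partialupdate! as its docstring indicates
partialupdate!(system, false)      # refresh pair distances without re-zeroing `system.pairs`
```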
diff --git a/src/gpukernels2d.jl b/src/gpukernels2d.jl
index 9923b3e..0a13216 100644
--- a/src/gpukernels2d.jl
+++ b/src/gpukernels2d.jl
@@ -143,43 +143,46 @@ function neib_internal_2d!(pairs, cnt, cellpnum, points, celllist, dist)
     cs = fld(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN), Tx * Ty * sizeof(Tuple{Int32, Int32}))
     CUDA.@sync gpukernel(pairs, cnt, cellpnum, points, celllist, dist², cs; threads = threads, blocks = blocks, shmem= Tx * Ty * cs * sizeof(Tuple{Int32, Int32}))
 end
-function kernel_neib_internal_2d!(pairs, cnt, cellpnum, points, celllist, dist², cs)
-    indexᵢ = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x
-    indexⱼ = (blockIdx().y - Int32(1)) * blockDim().y + threadIdx().y
-    #indexₖ = (blockIdx().z - Int32(1)) * blockDim().z + threadIdx().z
+function kernel_neib_internal_2d!(pairs, cnt, cellpnum, points, celllist, dist², cs) # cs - cache length for each thread
+    threadx = threadIdx().x
+    thready = threadIdx().y
+    indexᵢ = (blockIdx().x - Int32(1)) * blockDim().x + threadx
+    indexⱼ = (blockIdx().y - Int32(1)) * blockDim().y + thready
+    #indexₖ = (blockIdx().z - Int32(1)) * blockDim().z + threadIdx().z
     Nx, Ny = size(cellpnum)
-    cache = CuDynamicSharedArray(Tuple{Int32, Int32}, blockDim().x * blockDim().y * cs)
+    cache = CuDynamicSharedArray(Tuple{Int32, Int32}, (cs, blockDim().x, blockDim().y))
     if indexᵢ <= Nx && indexⱼ <= Ny && cellpnum[indexᵢ, indexⱼ] > 1
-        lind = indexᵢ + Ny * (indexⱼ - Int32(1))
         ccnt = zero(Int32)
         len = cellpnum[indexᵢ, indexⱼ]
         for i = 1:len - 1
             indi = celllist[i, indexᵢ, indexⱼ]
-            pᵢ = points[indi]
+            pᵢ   = points[indi]
             for j = i + 1:len
                 indj = celllist[j, indexᵢ, indexⱼ]
-                pⱼ = points[indj]
-                distance = (pᵢ[1] - pⱼ[1])^2 + (pᵢ[2] - pⱼ[2])^2
+                pⱼ   = points[indj]
+                distance = (pᵢ[1] - pⱼ[1])^2 + (pᵢ[2] - pⱼ[2])^2 # calculate r² to avoid sqrt, compare with squared maximum distance dist²
                 if distance < dist²
-                    cache[lind + ccnt] = minmax(indi, indj)
+                    #n = CUDA.@atomic cnt[1] += 1
+                    #pairs[n+1] = minmax(indi, indj)
                     ccnt += 1
+                    cache[ccnt, threadx, thready] = minmax(indi, indj)
                     if ccnt == cs
-                        ccnt = 0
-                        s = CUDA.@atomic cnt[1] += cs
-                        if s + cs <=length(pairs)
-                            for cind in 1:cs
-                                pairs[s+cind] = cache[lind + cind - 1]
+                        s = CUDA.@atomic cnt[1] += ccnt
+                        if s + ccnt <=length(pairs)
+                            for cind in 1:ccnt
+                                pairs[s + cind] = cache[cind, threadx, thready]
                             end
                         end
+                        ccnt = 0
                     end
                 end
             end
         end
     end
-    if ccnt != 0
-        s = CUDA.@atomic cnt[1] += ccnt
+    if ccnt > 0
+        s = CUDA.@atomic cnt[1] += ccnt
         if s + ccnt <=length(pairs)
-            for cind in 1:ccnt
-                pairs[s + cind] = cache[lind + cind - 1]
+            for cind in 1:ccnt
+                pairs[s + cind] = cache[cind, threadx, thready]
             end
         end
     end
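For readers less familiar with the pattern used in the rewritten kernels, here is a stand-alone, hypothetical sketch of the same idea outside the package: each thread buffers hits in dynamic shared memory and flushes them to a global output with one `CUDA.@atomic` counter bump per batch. Names such as `kernel_collect!` and `collect_below` are illustrative only, not part of the package:

```julia
using CUDA

# Illustrative only: each thread scans a grid-stride range of `data`, buffers the
# indices of entries below `limit` in shared memory, and flushes batches to `out`
# with a single atomic bump of the global counter `cnt`.
function kernel_collect!(out, cnt, data, limit, cs)
    t      = threadIdx().x
    index  = (blockIdx().x - Int32(1)) * blockDim().x + t
    stride = gridDim().x * blockDim().x
    cache  = CuDynamicSharedArray(Int32, (cs, blockDim().x))
    ccnt   = 0
    i = index
    while i <= length(data)
        if data[i] < limit
            ccnt += 1
            cache[ccnt, t] = Int32(i)
            if ccnt == cs                          # cache full -> flush
                s = CUDA.@atomic cnt[1] += Int32(ccnt)
                if s + ccnt <= length(out)
                    for k in 1:ccnt
                        out[s + k] = cache[k, t]
                    end
                end
                ccnt = 0
            end
        end
        i += stride
    end
    if ccnt > 0                                    # flush the remainder
        s = CUDA.@atomic cnt[1] += Int32(ccnt)
        if s + ccnt <= length(out)
            for k in 1:ccnt
                out[s + k] = cache[k, t]
            end
        end
    end
    return nothing
end

# Launch sketch mirroring the launchers above (shared memory sized as cs * threads * sizeof(Int32)).
function collect_below(data, limit; cs = 8)
    out = CUDA.zeros(Int32, length(data))
    cnt = CUDA.zeros(Int32, 1)
    kernel = @cuda launch=false kernel_collect!(out, cnt, data, limit, cs)
    Tx = min(launch_configuration(kernel.fun).threads, length(data))
    Bx = cld(length(data), Tx)
    CUDA.@sync kernel(out, cnt, data, limit, cs; threads = Tx, blocks = Bx,
                      shmem = cs * Tx * sizeof(Int32))
    out, cnt
end

# e.g. out, cnt = collect_below(CUDA.rand(10_000), 0.1f0)
# the first Array(cnt)[1] entries of `out` are the hits, in no particular order,
# which is the same contract as `pairs`/`cnt` in the kernels above.
```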
@@ -205,23 +208,19 @@ function neib_external_2d!(pairs, cnt, cellpnum, points, celllist, offset, dist)
     Bx, By = cld(Nx, Tx), cld(Ny, Ty)  # Blocks in grid.
     threads = (Tx, Ty)
     blocks = Bx, By
-    cs = fld(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN) - 8, Tx * Ty * sizeof(Tuple{Int32, Int32}))
-    CUDA.@sync gpukernel(pairs, cnt, cellpnum, points, celllist, offset, dist², cs; threads = threads, blocks = blocks, shmem= Tx * Ty * cs * sizeof(Tuple{Int32, Int32}) + 8)
+    cs = fld(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN), Tx * Ty * sizeof(Tuple{Int32, Int32}))
+    CUDA.@sync gpukernel(pairs, cnt, cellpnum, points, celllist, offset, dist², cs; threads = threads, blocks = blocks, shmem= Tx * Ty * cs * sizeof(Tuple{Int32, Int32}))
 end
 function kernel_neib_external_2d!(pairs, cnt, cellpnum, points, celllist, offset, dist², cs)
-    indexᵢ = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x
-    indexⱼ = (blockIdx().y - Int32(1)) * blockDim().y + threadIdx().y
-    #scnt = CuDynamicSharedArray(Int32, 1)
-    cache = CuDynamicSharedArray(Tuple{Int32, Int32}, blockDim().x * blockDim().y * cs, 8)
-    #if threadIdx().x == 1 && threadIdx().y == 1
-    #    scnt[1] = cnt[1]
-    #end
-    #sync_threads()
+    threadx = threadIdx().x
+    thready = threadIdx().y
+    indexᵢ = (blockIdx().x - Int32(1)) * blockDim().x + threadx
+    indexⱼ = (blockIdx().y - Int32(1)) * blockDim().y + thready
+    cache = CuDynamicSharedArray(Tuple{Int32, Int32}, (cs, blockDim().x, blockDim().y))
     Nx, Ny = size(cellpnum)
     neibcellᵢ = indexᵢ + offset[1]
     neibcellⱼ = indexⱼ + offset[2]
     if 0 < neibcellᵢ <= Nx && 0 < neibcellⱼ <= Ny && indexᵢ <= Nx && indexⱼ <= Ny && cellpnum[indexᵢ, indexⱼ] > 0 #&& cellpnum[neibcellᵢ, neibcellⱼ] > 0
-        lind = indexᵢ + Ny * (indexⱼ - Int32(1))
         ccnt = zero(Int32)
         iinds = view(celllist, 1:cellpnum[indexᵢ, indexⱼ], indexᵢ, indexⱼ)
         jinds = view(celllist, 1:cellpnum[neibcellᵢ, neibcellⱼ], neibcellᵢ, neibcellⱼ)
@@ -231,82 +230,87 @@ function kernel_neib_external_2d!(pairs, cnt, cellpnum, points, celllist, offset, dist², cs)
                 pⱼ = points[j]
                 distance = (pᵢ[1] - pⱼ[1])^2 + (pᵢ[2] - pⱼ[2])^2
                 if distance < dist²
-                    cache[lind + ccnt] = minmax(i, j)
+                    #n = CUDA.@atomic cnt[1] += 1
+                    #pairs[n + 1] = minmax(i, j)
+                    ccnt += 1
+                    cache[ccnt, threadx, thready] = minmax(i, j)
                     if ccnt == cs
                         s = CUDA.@atomic cnt[1] += ccnt
                         if s + ccnt <=length(pairs)
-                            for cind in 1:cs
-                                pairs[s+cind] = cache[lind + cind - 1]
+                            for cind in 1:ccnt
+                                pairs[s + cind] = cache[cind, threadx, thready]
                             end
                         end
                         ccnt = 0
-                    end
+                    end
+                end
             end
-        end
+        end
+        if ccnt > 0
-    if ccnt != 0
-        s = CUDA.@atomic cnt[1] += ccnt
+            s = CUDA.@atomic cnt[1] += ccnt
            if s + ccnt <=length(pairs)
-            for cind in 1:ccnt
-                pairs[s + cind] = cache[lind + cind - 1]
+                for cind in 1:ccnt
+                    pairs[s + cind] = cache[cind, threadx, thready]
                end
            end
        end
+    end
-    #sync_threads()
-    #if threadIdx().x == 1 && threadIdx().y == 1
-    #    cnt[1] = scnt[1]
-    #end
     return nothing
 end
 #####################################################################
-# Make neighbor matrix (list)
+#
 #####################################################################
-#=
-function kernel_neiblist_2d!(nlist, ncnt, points, celllist, cellpnum, pcell, dist, offset)
-    index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x
-    if index <= length(points)
-        # get point cell
-        cell  = pcell[index]
-        celli = cell[1] + offset[1]
-        cellj = cell[2] + offset[2]
-        if 0 < celli <= size(celllist, 2) && 0 < cellj <= size(celllist, 3)
-            snl    = size(nlist, 1)
-            clist  = view(celllist, :, celli, cellj)
-            celln  = cellpnum[celli, cellj]
-            distsq = dist * dist
-            cnt    = ncnt[index]
-            pointi = points[index]
-            pointj = points[indexj]
-            for i = 1:celln
-                indexj = clist[i]
-                if index != indexj && (pointi[1] - pointj[1])^2 + (pointi[2] - pointj[2])^2 < distsq
-                    cnt += 1
-                    if cnt <= snl
-                        nlist[cnt, index] = indexj
-                    end
-                end
+function pranges!(ranges, pairs)
+    gpukernel = @cuda launch=false kernel_pranges!(ranges, pairs)
+    config = launch_configuration(gpukernel.fun)
+    Nx = length(ranges)
+    maxThreads = config.threads
+    Tx = min(maxThreads, Nx)
+    CUDA.@sync gpukernel(ranges, pairs; threads = Tx, blocks = 1, shmem = Tx * sizeof(Int))
+end
+function kernel_pranges!(ranges, pairs)
+    index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x
+    thread = threadIdx().x
+    stride = gridDim().x * blockDim().x
+    tn = blockDim().x
+    cache = CuDynamicSharedArray(Int, tn)
+    cache[thread] = 1
+    rs = 0
+    sync_threads()
+    while index <= length(ranges)
+        s = false
+        for i = cache[thread]:length(pairs)
+            fpi = first(pairs[i])
+            if fpi == index && !s
+                s = true
+                rs = i
+            elseif fpi != index && s
+                ranges[index] = (rs, i - 1)
+                cache[thread] = i
+                break
+            elseif (!s && index < fpi) || fpi == 0
+                ranges[index] = (0, 0)
+                break
            end
        end
-            ncnt[index] = cnt
        end
+        sync_threads()
+        if thread == 1
+            maxindex = 1
+            for i = 1:tn
+                if cache[i] > maxindex maxindex = cache[i] end
+            end
+            for i = 1:tn
+                cache[i] = maxindex
+            end
+        end
+        index += stride
+        sync_threads()
     end
     return nothing
 end
-"""
-    neiblist_2d!(nlist, ncnt, points, celllist, cellpnum, pcell, dist, offset)
-
-"""
-function neiblist_2d!(nlist, ncnt, points, celllist, cellpnum, pcell, dist, offset)
-    gpukernel = @cuda launch=false kernel_neiblist_2d!(nlist, ncnt, points, celllist, cellpnum, pcell, dist, offset)
-    config = launch_configuration(gpukernel.fun)
-    Nx = length(points)
-    maxThreads = config.threads
-    Tx = min(maxThreads, Nx)
-    Bx = cld(Nx, Tx)
-    CUDA.@sync gpukernel(nlist, ncnt, points, celllist, cellpnum, pcell, dist, offset; threads = Tx, blocks = Bx)
-end
-=#
 #####################################################################
 #####################################################################
 # SPH
@@ -501,13 +505,35 @@ function kernel_∑∇W_2d!(∑∇W, pairs, points, kernel, H⁻¹)
 end
 #####################################################################
+# https://discourse.julialang.org/t/can-this-be-written-even-faster-cpu/109924/28
+@inline function powfancy7th(x, γ⁻¹, γ)
+    if γ == 7
+        # todo tune the magic constant
+        # initial guess based on fast inverse sqrt trick but adjusted to compute x^(1/7)
+        t = copysign(reinterpret(Float64, 0x36cd000000000000 + reinterpret(UInt64, abs(x))÷7), x)
+        @fastmath for _ in 1:2
+            # newton's method for t^3 - x/t^4 = 0
+            t2 = t*t
+            t3 = t2*t
+            t4 = t2*t2
+            xot4 = x/t4
+            t = t - t*(t3 - xot4)/(4*t3 + 3*xot4)
+        end
+        return t
+    end
+    return x^γ⁻¹
+end
 """
     ∂ρ∂tDDT!(∑∂ρ∂t, ∇W, pairs, points, h, m₀, δᵩ, c₀, γ, g, ρ₀, ρ, v, ptype)
 
 Compute ∂ρ∂t - density derivative including density diffusion.
+```math
+\\frac{\\partial \\rho_i}{\\partial t} = \\sum m_j \\textbf{v}_{ij} \\cdot \\nabla_i W_{ij} + \\delta_{\\Phi} h c_0 \\sum \\Psi_{ij} \\cdot \\nabla_i W_{ij} \\frac{m_j}{\\rho_j}
+
+```
 """
 function ∂ρ∂tDDT!(∑∂ρ∂t, ∇W, pairs, points, h, m₀, δᵩ, c₀, γ, g, ρ₀, ρ, v, ptype)
@@ -570,7 +596,8 @@ function kernel_∂ρ∂tDDT!(∑∂ρ∂t, ∇W, pairs, points, h, m₀, δᵩ
         ∑∂ρ∂ti = ∑∂ρ∂tj = m₀dot
 
         if ptype[pᵢ] >= 1
-            drhopvp = ρ₀ * (1 + DDTgz * Δx[2])^γ⁻¹ - ρ₀    ## << CHECK
+            drhopvp = ρ₀ * powfancy7th(1 + DDTgz * Δx[2], γ⁻¹, γ) - ρ₀    ## << CHECK
+            #drhopvp = ρ₀ * (1 + DDTgz * Δx[2])^γ⁻¹ - ρ₀    ## << CHECK
             visc_densi = DDTkh * c₀ * (ρⱼ - ρᵢ - drhopvp) / (r² + η²)
             delta_i = visc_densi * dot3 * m₀ / ρⱼ
             ∑∂ρ∂ti += delta_i
@@ -578,7 +605,8 @@
         CUDA.@atomic ∑∂ρ∂t[pᵢ] += ∑∂ρ∂ti
 
         if ptype[pⱼ] >= 1
-            drhopvn = ρ₀ * (1 - DDTgz * Δx[2])^γ⁻¹ - ρ₀
+            drhopvn = ρ₀ * powfancy7th(1 - DDTgz * Δx[2], γ⁻¹, γ) - ρ₀
+            #drhopvn = ρ₀ * (1 - DDTgz * Δx[2])^γ⁻¹ - ρ₀
             visc_densi = DDTkh * c₀ * (ρᵢ - ρⱼ - drhopvn) / (r² + η²)
             delta_j = visc_densi * dot3 * m₀ / ρᵢ
             ∑∂ρ∂tj += delta_j
@@ -608,6 +636,108 @@ function kernel_∂ρ∂tDDT!(∑∂ρ∂t, ∇W, pairs, points, h, m₀, δᵩ
     end
     return nothing
 end
+
+function ∂ρ∂tDDT2!(∑∂ρ∂t, ∇W, pairs, points, ranges, h, m₀, δᵩ, c₀, γ, g, ρ₀, ρ, v, ptype)
+    if length(pairs) != length(∇W) error("Length should be equal") end
+
+    gpukernel = @cuda launch=false kernel_∂ρ∂tDDT2!(∑∂ρ∂t, ∇W, pairs, points, ranges, h, m₀, δᵩ, c₀, γ, g, ρ₀, ρ, v, ptype)
+    config = launch_configuration(gpukernel.fun)
+    Nx = length(ranges)
+    maxThreads = config.threads
+    Tx = min(maxThreads, Nx)
+    Bx = cld(Nx, Tx)
+    CUDA.@sync gpukernel(∑∂ρ∂t, ∇W, pairs, points, ranges, h, m₀, δᵩ, c₀, γ, g, ρ₀, ρ, v, ptype; threads = Tx, blocks = Bx)
+end
+function kernel_∂ρ∂tDDT2!(∑∂ρ∂t, ∇W, pairs, points, ranges, h, m₀, δᵩ, c₀, γ, g, ρ₀, ρ, v, ptype)
+    tindex = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x
+    stride = gridDim().x * blockDim().x
+    index = tindex
+    # move it outside kernel
+    γ⁻¹ = 1/γ
+    η² = (0.1*h)*(0.1*h)
+    Cb = (c₀ * c₀ * ρ₀) * γ⁻¹
+    DDTgz = ρ₀ * g / Cb
+    DDTkh = 2 * h * δᵩ
+
+    while index <= length(ranges)
+        s, e = ranges[index]
+        for pind in s:e
+            pair = pairs[pind]
+            pᵢ = pair[1]; pⱼ = pair[2]
+            if pᵢ > 0 # && !(isboundary[pᵢ] && isboundary[pᵢ])
+                xᵢ = points[pᵢ]
+                xⱼ = points[pⱼ]
+                Δx = (xᵢ[1] - xⱼ[1], xᵢ[2] - xⱼ[2])
+                r² = Δx[1]^2 + Δx[2]^2
+                # for timestep Δt½ d != actual range
+                # one way - not calculate values out of 2h
+                # if r² > (2h)^2 return nothing end
+                #=
+                Cb = (c₀ * c₀ * ρ₀) * γ⁻¹
+                Pᴴ = ρ₀ * g * z
+                ᵸᵀᴴ
+                =#
+                ρᵢ = ρ[pᵢ]
+                ρⱼ = ρ[pⱼ]
+
+                Δv = (v[pᵢ][1] - v[pⱼ][1], v[pᵢ][2] - v[pⱼ][2])
+
+                ∇Wᵢⱼ = ∇W[pind]
+                #=
+                z = Δx[2]
+                Cb = (c₀ * c₀ * ρ₀) * γ⁻¹
+                Pᴴ = ρ₀ * g * z
+                ρᴴ = ρ₀ * (((Pᴴ + 1)/Cb)^γ⁻¹ - 1)
+                ψ = 2 * (ρᵢ - ρⱼ) * Δx / r²
+                =#
+                dot3 = -(Δx[1] * ∇Wᵢⱼ[1] + Δx[2] * ∇Wᵢⱼ[2]) # - Δx ⋅ ∇Wᵢⱼ
+
+                # as actual range at timestep Δt½ may be greater - some problems can be here
+                if 1 + DDTgz * Δx[2] < 0 || 1 - DDTgz * Δx[2] < 0 return nothing end
+
+                m₀dot = m₀ * (Δv[1] * ∇Wᵢⱼ[1] + Δv[2] * ∇Wᵢⱼ[2]) # Δv ⋅ ∇Wᵢⱼ
+                ∑∂ρ∂ti = ∑∂ρ∂tj = m₀dot
+
+                if ptype[pᵢ] >= 1
+                    drhopvp = ρ₀ * powfancy7th(1 + DDTgz * Δx[2], γ⁻¹, γ) - ρ₀ ## << CHECK
+                    visc_densi = DDTkh * c₀ * (ρⱼ - ρᵢ - drhopvp) / (r² + η²)
+                    delta_i = visc_densi * dot3 * m₀ / ρⱼ
+                    ∑∂ρ∂ti += delta_i
+                end
+                CUDA.@atomic ∑∂ρ∂t[pᵢ] += ∑∂ρ∂ti
+
+                if ptype[pⱼ] >= 1
+                    drhopvn = ρ₀ * powfancy7th(1 - DDTgz * Δx[2], γ⁻¹, γ) - ρ₀
+                    visc_densi = DDTkh * c₀ * (ρᵢ - ρⱼ - drhopvn) / (r² + η²)
+                    delta_j = visc_densi * dot3 * m₀ / ρᵢ
+                    ∑∂ρ∂tj += delta_j
+                end
+                CUDA.@atomic ∑∂ρ∂t[pⱼ] += ∑∂ρ∂tj
+
+                #=
+                if isnan(delta_j) || isnan(m₀dot) || isnan(ρᵢ) || isnan(ρⱼ)
+                    @cuprintln "kernel_DDT 1 isnan dx1 = $(Δx[1]) , dx2 = $(Δx[2]) rhoi = $ρᵢ , dot3 = $dot3 , visc_densi = $visc_densi drhopvn = $drhopvn $(∇W[1]) $(Δv[1])"
+                    error()
+                end
+                if isinf(delta_j) || isinf(m₀dot) || isinf(delta_i)
+                    @cuprintln "kernel_DDT 2 inf: dx1 = $(Δx[1]) , dx2 = $(Δx[2]) rhoi = $ρᵢ , rhoj = $ρⱼ , dot3 = $dot3 , delta_i = $delta_i , delta_j = $delta_j , drhopvn = $drhopvn , visc_densi = $visc_densi , $(∇W[1]) , $(Δv[1])"
+                    error()
+                end
+                =#
+                #mlfac = MotionLimiter[pᵢ] * MotionLimiter[pⱼ]
+                #=
+                if isnan(∑∂ρ∂tval1) || isnan(∑∂ρ∂tval2) || abs(∑∂ρ∂tval1) > 10000000 || abs(∑∂ρ∂tval2) > 10000000
+                    @cuprintln "kernel DDT: drhodti = $∑∂ρ∂ti drhodtj = $∑∂ρ∂tj, dx1 = $(Δx[1]), dx2 = $(Δx[2]) rhoi = $ρᵢ, rhoj = $ρⱼ, dot3 = $dot3, visc_densi = $visc_densi, drhopvn = $drhopvn, dw = $(∇W[1]), dv = $(Δv[1])"
+                    error()
+                end
+                =#
+
+            end
+        end
+        index += stride
+    end
+    return nothing
+end
 #####################################################################
 
 """
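A quick host-side sanity check of the `powfancy7th` helper introduced above (a sketch, not part of the package): it compares the two-Newton-step seventh root against the plain `^(1/7)` power over arguments near 1, which is roughly the range the DDT terms feed in via `1 ± DDTgz * Δx[2]`. The sampling interval 0.8 to 1.2 is an arbitrary choice for this check.

```julia
# Sketch: relative error of powfancy7th (defined in the diff above) vs x^(1/7).
γ   = 7
γ⁻¹ = 1 / γ
xs  = range(0.8, 1.2; length = 10_001)   # values near 1, as in the DDT terms
maxrel = maximum(abs(powfancy7th(x, γ⁻¹, γ) - x^γ⁻¹) / x^γ⁻¹ for x in xs)
@show maxrel   # expected to be small; for γ != 7 the helper falls back to x^γ⁻¹ exactly
```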
diff --git a/src/sphproblem.jl b/src/sphproblem.jl
index 76f2e53..0e2470a 100644
--- a/src/sphproblem.jl
+++ b/src/sphproblem.jl
@@ -165,8 +165,8 @@ function _stepsolve!(prob::SPHProblem{T}, n::Int, ::StepByStep; timestepping = f
         else
             update!(prob.system)
             x = prob.system.points
-            pairs = neighborlist(prob.system)
-            sort!(pairs, by = first)
+            pairs = neighborlistview(prob.system)
+            #sort!(pairs, by = first)
             for a in prob.cΔx fill!(a, zero(T)) end
             skipupdate = true
         end
         updaten += 1
diff --git a/test/devtest.jl b/test/devtest.jl
index d445ee8..fcb975e 100644
--- a/test/devtest.jl
+++ b/test/devtest.jl
@@ -866,4 +866,163 @@ verts = empty(MeshCell{WriteVTK.PolyData.Verts,UnitRange{Int64}}[])
     end
     test2!(CUDA.zeros(10))
 
-    @benchmark minmax(34, 23)
\ No newline at end of file
+    @benchmark minmax(34, 23)
+
+
+using CUDA
+
+    function test_2d!(cnt, mat)
+        gpukernel = @cuda launch=false kernel_test_2d!(cnt, mat, 6)
+        config = launch_configuration(gpukernel.fun)
+        maxThreads = config.threads
+        Nx = length(mat)
+        Tx = min(maxThreads, Nx)
+        Bx = 1
+        cs = fld(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN) - 8, Tx * sizeof(Tuple{Int32, Int32}))
+        CUDA.@sync gpukernel(cnt, mat, cs; threads = Tx, blocks = Bx, shmem= Tx * cs * sizeof(Tuple{Int32, Int32}))
+    end
+    function kernel_test_2d!(cnt, mat, cs)
+        index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x
+        stride = gridDim().x * blockDim().x
+        scnt = CuStaticSharedArray(Int32, 1)
+
+        if threadIdx().x == 1
+            scnt[1] = cnt[1]
+        end
+        sync_threads()
+        Nx, Ny = size(mat)
+        while index <= length(mat)
+            y = cld(index, Nx)
+            x = index - Nx * (y - 1)
+            #@cuprintln "index $index ind $x, $y"
+
+            n = CUDA.@atomic scnt[1] += 1
+            mat[x,y] = n + 1
+            index += stride
+        end
+        sync_threads()
+        if threadIdx().x == 1
+            cnt[1] = scnt[1]
+            CUDA.@cuprintln "Down $(cnt[1]) block $(blockIdx().x)"
+        end
+        return nothing
+    end
+    cnt = cu([0])
+    mat = CUDA.zeros(100, 100)
+    test_2d!(cnt, mat)
+
+
+
+    function pranges_test!(ranges, pairs)
+        gpukernel = @cuda launch=false kernel_pranges_test!(ranges, pairs)
+        config = launch_configuration(gpukernel.fun)
+        Nx = length(ranges)
+        maxThreads = config.threads
+        Tx = min(maxThreads, Nx)
+        CUDA.@sync gpukernel(ranges, pairs; threads = 1, blocks = 1, shmem = Tx * sizeof(Int))
+    end
+    function kernel_pranges_test!(ranges, pairs, np)
+        index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x
+        thread = threadIdx().x
+        stride = gridDim().x * blockDim().x
+        tn = blockDim().x
+        cache = CuDynamicSharedArray(Int, tn)
+        cache[thread] = 1
+
+        si = (thread - 1) * np + 1
+        ei = min(length(pairs), thread * np)
+        sync_threads()
+        if thread != 1
+            s = first(pairs[si])
+            for i = si+1:length(pairs)
+                if first(pairs[i]) != s
+                    si = i
+                    break
+                end
+            end
+        end
+
+        sync_threads()
+        for i = si:ei
+
+        end
+    end
+
+
+    @benchmark pranges_test!($pr, $system.pairs)
+
+
+    #=
+function neib_external_2d!(pairs, cnt, cellpnum, points, celllist, offset, dist)
+    dist² = dist^2
+    CLn, CLx, CLy = size(celllist)
+    Nx, Ny = size(cellpnum)
+    if (Nx, Ny) != (CLx, CLy) error("cell list dimension $((CLx, CLy)) not equal cellpnum $(size(cellpnum))...") end
+    gpukernel = @cuda launch=false kernel_neib_external_2d!(pairs, cnt, cellpnum, points, celllist, offset, dist², 6)
+    config = launch_configuration(gpukernel.fun)
+    maxThreads = config.threads
+    Tx = min(maxThreads, Nx)
+    Bx = 1 # Blocks in grid.
+    cs = fld(attribute(device(), CUDA.DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN), Tx * sizeof(Tuple{Int32, Int32}))
+    CUDA.@sync gpukernel(pairs, cnt, cellpnum, points, celllist, offset, dist², cs; threads = Tx, blocks = Bx, shmem = Tx * cs * sizeof(Tuple{Int32, Int32}))
+end
+function kernel_neib_external_2d!(pairs, cnt, cellpnum, points, celllist, offset, dist², cs)
+    index = (blockIdx().x - Int32(1)) * blockDim().x + threadIdx().x
+    stride = gridDim().x * blockDim().x
+    Nx, Ny = size(cellpnum)
+    scnt = CuStaticSharedArray(Int32, 1)
+    cache = CuDynamicSharedArray(Tuple{Int32, Int32}, (cs, blockDim().x))
+
+    if threadIdx().x == 1
+        scnt[1] = cnt[1]
+    end
+    sync_threads()
+    Nx, Ny = size(cellpnum)
+    while index <= length(cellpnum)
+        indexⱼ = cld(index, Nx) # y
+        indexᵢ = index - Nx * (indexⱼ - 1) # x
+        neibcellᵢ = indexᵢ + offset[1]
+        neibcellⱼ = indexⱼ + offset[2]
+
+        if 0 < neibcellᵢ <= Nx && 0 < neibcellⱼ <= Ny && indexᵢ <= Nx && indexⱼ <= Ny && cellpnum[indexᵢ, indexⱼ] > 0 #&& cellpnum[neibcellᵢ, neibcellⱼ] > 0
+            ccnt = zero(Int32)
+            iinds = view(celllist, 1:cellpnum[indexᵢ, indexⱼ], indexᵢ, indexⱼ)
+            jinds = view(celllist, 1:cellpnum[neibcellᵢ, neibcellⱼ], neibcellᵢ, neibcellⱼ)
+            for i in iinds
+                pᵢ = points[i]
+                for j in jinds
+                    pⱼ = points[j]
+                    distance = (pᵢ[1] - pⱼ[1])^2 + (pᵢ[2] - pⱼ[2])^2
+                    if distance < dist²
+                        ccnt += 1
+                        cache[ccnt, threadIdx().x] = minmax(i, j)
+                        if ccnt == cs
+                            s = CUDA.@atomic scnt[1] += ccnt
+                            if s + ccnt <= length(pairs)
+                                for cind in 1:ccnt
+                                    pairs[s + cind] = cache[cind, threadIdx().x]
+                                end
+                            end
+                            ccnt = 0
+                        end
+                    end
+                end
+            end
+            if ccnt > 0
+                s = CUDA.@atomic scnt[1] += ccnt
+                if s + ccnt <= length(pairs)
+                    for cind in 1:ccnt
+                        pairs[s + cind] = cache[cind, threadIdx().x]
+                    end
+                end
+            end
+        end
+        index += stride
+    end
+    sync_threads()
+    if threadIdx().x == 1
+        cnt[1] = scnt[1]
+    end
+    return nothing
+end
+=#
\ No newline at end of file
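Finally, since `pranges!`/`kernel_pranges!` (and the `pranges_test!` scratch code above) are easiest to reason about against a serial reference, here is a hedged CPU-side sketch of the invariant they are meant to produce: `ranges[k]` is the `(start, stop)` index range of particle `k`'s entries in a pair list sorted by first index, or `(0, 0)` if it has none. The function name `pranges_cpu` is illustrative only:

```julia
# CPU reference: assumes `pairs` is sorted by its first element and padded with (0, 0).
function pranges_cpu(pairs, n)
    ranges = fill((0, 0), n)
    i = 1
    while i <= length(pairs) && first(pairs[i]) != 0
        p = first(pairs[i])
        s = i
        while i <= length(pairs) && first(pairs[i]) == p
            i += 1
        end
        ranges[p] = (s, i - 1)
    end
    ranges
end

# pranges_cpu([(1, 2), (1, 3), (2, 3), (0, 0)], 3) == [(1, 2), (3, 3), (0, 0)]
```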