diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 0803a91719..1d545cfb93 100755 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -1198,6 +1198,29 @@ steps: key: "cpu_field_perf" command: "julia --color=yes --project=.buildkite test/Fields/field_opt.jl" + - group: "Perf: Benchmark scripts" + steps: + + - label: "Perf: benchmark scripts index_swapping" + key: perf_index_swapping + command: + - "julia --project=.buildkite -e 'using CUDA; CUDA.versioninfo()'" + - "julia --color=yes --project=.buildkite benchmarks/scripts/index_swapping.jl" + env: + CLIMACOMMS_DEVICE: "CUDA" + agents: + slurm_gpus: 1 + + - label: "Perf: benchmark scripts indexing_and_static_ndranges" + key: indexing_and_static_ndranges + command: + - "julia --project=.buildkite -e 'using CUDA; CUDA.versioninfo()'" + - "julia --color=yes --project=.buildkite benchmarks/scripts/indexing_and_static_ndranges.jl" + env: + CLIMACOMMS_DEVICE: "CUDA" + agents: + slurm_gpus: 1 + - group: "Perf: Operators" steps: diff --git a/benchmarks/scripts/benchmark_utils.jl b/benchmarks/scripts/benchmark_utils.jl new file mode 100644 index 0000000000..4b75b30183 --- /dev/null +++ b/benchmarks/scripts/benchmark_utils.jl @@ -0,0 +1,89 @@ +import CUDA +using BenchmarkTools, Dates +using LazyBroadcast: @lazy + +""" + caller_name(@__FILE__) + +Returns a string of the (single) line pointing to the function that +called the function we're in. 
+""" +macro caller_name(f) + quote + string(readlines($f)[StackTraces.stacktrace()[4].line]) + end +end + +Base.@kwdef mutable struct Benchmark + problem_size::Tuple + float_type::Type + device_bandwidth_GBs::Int = 2_039 + data::Vector = [] +end + +function perf_stats(; bm::Benchmark, kernel_time_s, n_reads_writes) + N = prod(bm.problem_size) + GB = N * n_reads_writes * sizeof(bm.float_type) / 1024^3 + achieved_bandwidth_GBs = GB / kernel_time_s + bandwidth_efficiency = + achieved_bandwidth_GBs / bm.device_bandwidth_GBs * 100 + return (; N, GB, achieved_bandwidth_GBs, bandwidth_efficiency) +end; + +time_and_units_str(x::Real) = + trunc_time(string(compound_period(x, Dates.Second))) +function compound_period(x::Real, ::Type{T}) where {T <: Dates.Period} + nf = Dates.value(convert(Dates.Nanosecond, T(1))) + ns = Dates.Nanosecond(ceil(x * nf)) + return Dates.canonicalize(Dates.CompoundPeriod(ns)) +end +trunc_time(s::String) = count(',', s) > 1 ? join(split(s, ",")[1:2], ",") : s + +abstract type AbstractUniversalSizes{Nv, Nij} end +struct UniversalSizesCC{Nv, Nij} <: AbstractUniversalSizes{Nv, Nij} + Nh::Int +end +struct UniversalSizesStatic{Nv, Nij, Nh} <: AbstractUniversalSizes{Nv, Nij} end + +get_Nv(::AbstractUniversalSizes{Nv}) where {Nv} = Nv +get_Nij(::AbstractUniversalSizes{Nv, Nij}) where {Nv, Nij} = Nij +get_Nh(us::UniversalSizesCC) = us.Nh +get_Nh(::UniversalSizesStatic{Nv, Nij, Nh}) where {Nv, Nij, Nh} = Nh +get_N(us::AbstractUniversalSizes{Nv, Nij}) where {Nv, Nij} = + prod((Nv, Nij, Nij, 1, get_Nh(us))) +UniversalSizesCC(Nv, Nij, Nh) = UniversalSizesCC{Nv, Nij}(Nh) +UniversalSizesStatic(Nv, Nij, Nh) = UniversalSizesStatic{Nv, Nij, Nh}() + +import PrettyTables +function tabulate_benchmark(bm) + funcs = map(x -> x.caller, bm.data) + timings = map(x -> time_and_units_str(x.kernel_time_s), bm.data) + n_reads_writes = map(x -> x.n_reads_writes, bm.data) + nreps = map(x -> x.nreps, bm.data) + achieved_bandwidth_GBs = map(x -> x.achieved_bandwidth_GBs, bm.data) 
+ bandwidth_efficiency = map(x -> x.bandwidth_efficiency, bm.data) + header = [ + "funcs", + "time per call", + "bw %", + "achieved bw", + "n-reads/writes", + "n-reps", + ] + data = hcat( + funcs, + timings, + bandwidth_efficiency, + achieved_bandwidth_GBs, + n_reads_writes, + nreps, + ) + title = "Problem size: $(bm.problem_size), float_type = $(bm.float_type), device_bandwidth_GBs=$(bm.device_bandwidth_GBs)" + PrettyTables.pretty_table( + data; + title, + header, + alignment = :l, + crop = :none, + ) +end diff --git a/benchmarks/scripts/index_swapping.jl b/benchmarks/scripts/index_swapping.jl index ac43c3fd7a..1e02a2874f 100644 --- a/benchmarks/scripts/index_swapping.jl +++ b/benchmarks/scripts/index_swapping.jl @@ -23,89 +23,55 @@ In particular, Clima A100 ``` -at_dot_call!($X_vector, $Y_vector): - 6 milliseconds, 19 microseconds -custom_kernel_bc!($X_array, $Y_array, $uss, swap = 0): - 6 milliseconds, 329 microseconds -custom_kernel_bc!($X_array, $Y_array, $uss, swap = 1): - 14 milliseconds, 232 microseconds -custom_kernel_bc!($X_array, $Y_array, $uss, swap = 2): - 15 milliseconds, 960 microseconds +[ Info: ArrayType = CuArray +Problem size: (63, 4, 4, 1, 5400), float_type = Float32, device_bandwidth_GBs=2039 +┌──────────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐ +│ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps │ +├──────────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤ +│ BIS.at_dot_call!(X_vector, Y_vector; nreps=1000, bm) │ 36 microseconds, 195 nanoseconds │ 54.952 │ 1120.47 │ 2 │ 1000 │ +│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=0, nreps=1000, bm) │ 74 microseconds, 228 nanoseconds │ 26.7955 │ 546.359 │ 2 │ 1000 │ +│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=1, nreps=1000, bm) │ 82 microseconds, 501 nanoseconds │ 
24.1085 │ 491.572 │ 2 │ 1000 │ +│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=2, nreps=1000, bm) │ 72 microseconds, 567 nanoseconds │ 27.4088 │ 558.865 │ 2 │ 1000 │ +└──────────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘ ``` =# #! format: off -import CUDA -using BenchmarkTools, Dates -using LazyBroadcast: @lazy -ArrayType = CUDA.CuArray; -# ArrayType = identity; +module IndexSwapBench + +include("benchmark_utils.jl") -if ArrayType === identity - macro pretty_belapsed(expr) - return quote - println($(string(expr)), ":") - print(" ") - print_time_and_units(BenchmarkTools.@belapsed(esc($expr))) - end - end -else - macro pretty_belapsed(expr) - return quote - println($(string(expr)), ":") - print(" ") - print_time_and_units( - BenchmarkTools.@belapsed(CUDA.@sync((esc($expr)))) - ) - end - end - macro pretty_elapsed(expr) - return quote - println($(string(expr)), ":") - print(" ") - print_time_and_units( - BenchmarkTools.@elapsed(CUDA.@sync((esc($expr)))) - ) - end - end -end -print_time_and_units(x) = println(time_and_units_str(x)) -time_and_units_str(x::Real) = - trunc_time(string(compound_period(x, Dates.Second))) -function compound_period(x::Real, ::Type{T}) where {T <: Dates.Period} - nf = Dates.value(convert(Dates.Nanosecond, T(1))) - ns = Dates.Nanosecond(ceil(x * nf)) - return Dates.canonicalize(Dates.CompoundPeriod(ns)) -end -trunc_time(s::String) = count(',', s) > 1 ? join(split(s, ",")[1:2], ",") : s foo(x1, x2, x3) = x1 -function at_dot_call!(X, Y) +function at_dot_call!(X, Y; nreps = 1, print_info = true, bm=nothing) (; x1, x2, x3) = X (; y1) = Y - for i in 1:100 # reduce variance / impact of launch latency - @. y1 = foo(x1, x2, x3) # 3 reads, 1 write + e = CUDA.@elapsed begin for i in 1:nreps # reduce variance / impact of launch latency + @. 
y1 = foo(x1, x2, x3) # 3 reads, 1 write + end + end + if !isnothing(bm) + kernel_time_s=e/nreps + nt = (; + caller=@caller_name(@__FILE__), + kernel_time_s, + n_reads_writes=2, + nreps, + perf_stats(;bm,kernel_time_s,n_reads_writes=2)... + ) + push!(bm.data, nt) end return nothing end; -struct UniversalSizesStatic{Nv, Nij, Nh} end - -get_Nv(::UniversalSizesStatic{Nv}) where {Nv} = Nv -get_Nij(::UniversalSizesStatic{Nv, Nij}) where {Nv, Nij} = Nij -get_Nh(::UniversalSizesStatic{Nv, Nij, Nh}) where {Nv, Nij, Nh} = Nh -get_N(us::UniversalSizesStatic{Nv, Nij}) where {Nv, Nij} = prod((Nv,Nij,Nij,1,get_Nh(us))) -UniversalSizesStatic(Nv, Nij, Nh) = UniversalSizesStatic{Nv, Nij, Nh}() -using Test - -function custom_kernel_bc!(X, Y, us::UniversalSizesStatic; swap=0, printtb=false) +function custom_kernel_bc!(X, Y, us::UniversalSizesStatic; swap=0, printtb=false, nreps = 1, print_info = true, bm=nothing) (; x1, x2, x3) = X (; y1) = Y bc = @lazy @. y1 = foo(x1, x2, x3) @assert !(y1 isa Array) f = if swap==0 - custom_kernel_knl_bc_no_swap! + custom_kernel_knl_bc_0swap! elseif swap == 1 - custom_kernel_knl_bc_swap! + custom_kernel_knl_bc_1swap! elseif swap == 2 custom_kernel_knl_bc_2swap! else @@ -122,14 +88,27 @@ function custom_kernel_bc!(X, Y, us::UniversalSizesStatic; swap=0, printtb=false threads = min(N, config.threads) blocks = cld(N, threads) printtb && @show blocks, threads - for i in 1:100 # reduce variance / impact of launch latency - kernel(y1, bc,us; threads, blocks) + e = CUDA.@elapsed begin + for i in 1:nreps # reduce variance / impact of launch latency + kernel(y1, bc,us; threads, blocks) + end + end + if !isnothing(bm) + kernel_time_s=e/nreps + nt = (; + caller=@caller_name(@__FILE__), + kernel_time_s, + n_reads_writes=2, + nreps, + perf_stats(;bm,kernel_time_s,n_reads_writes=2)... 
+ ) + push!(bm.data, nt) end return nothing end; # Mimics how indexing works in generalized pointwise kernels -function custom_kernel_knl_bc_swap!(y1, bc, us) +function custom_kernel_knl_bc_1swap!(y1, bc, us) @inbounds begin tidx = (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x if tidx ≤ get_N(us) @@ -145,7 +124,7 @@ function custom_kernel_knl_bc_swap!(y1, bc, us) end # Mimics how indexing works in specialized kernels -function custom_kernel_knl_bc_no_swap!(y1, bc, us) +function custom_kernel_knl_bc_0swap!(y1, bc, us) @inbounds begin tidx = (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x if tidx ≤ get_N(us) @@ -179,35 +158,47 @@ function custom_kernel_knl_bc_2swap!(y1, bc, us) end import Random +using Test function test_custom_kernel_bc!(X_array, Y_array, uss; swap) Random.seed!(1234) X_array.x1 .= typeof(X_array.x1)(rand(eltype(X_array.x1), size(X_array.x1))) Y_array_cp = deepcopy(Y_array) - custom_kernel_bc!(X_array, Y_array_cp, uss; swap=0) - custom_kernel_bc!(X_array, Y_array, uss; swap) + custom_kernel_bc!(X_array, Y_array_cp, uss; swap=0, print_info = false) + custom_kernel_bc!(X_array, Y_array, uss; swap, print_info = false) @test all(Y_array_cp.y1 .== Y_array.y1) end -FT = Float32; -arr(T) = T(zeros(63,4,4,1,5400)) -X_array = (;x1 = arr(ArrayType),x2 = arr(ArrayType),x3 = arr(ArrayType)); -Y_array = (;y1 = arr(ArrayType),); +end # module + +import .IndexSwapBench as BIS + +using CUDA +bm = BIS.Benchmark(;problem_size=(63,4,4,1,5400), float_type=Float32) +# bm = BIS.Benchmark(;problem_size=(63,4,4,1,5400), float_type=Float64) +ArrayType = CUDA.CuArray; +# ArrayType = identity; +arr(bm, T) = T(zeros(bm.float_type, bm.problem_size...)) +X_array = (;x1 = arr(bm, ArrayType),x2 = arr(bm, ArrayType),x3 = arr(bm, ArrayType)); +Y_array = (;y1 = arr(bm, ArrayType),); to_vec(ξ) = (;zip(propertynames(ξ), map(θ -> vec(θ), values(ξ)))...); X_vector = to_vec(X_array); Y_vector = to_vec(Y_array); N = length(X_vector.x1) (Nv, 
Nij, _, _, Nh) = size(Y_array.y1); -uss = UniversalSizesStatic(Nv, Nij, Nh); -at_dot_call!(X_vector, Y_vector) -custom_kernel_bc!(X_array, Y_array, uss; swap=0) -custom_kernel_bc!(X_array, Y_array, uss; swap=1) -custom_kernel_bc!(X_array, Y_array, uss; swap=2) -test_custom_kernel_bc!(X_array, Y_array, uss; swap=1) -test_custom_kernel_bc!(X_array, Y_array, uss; swap=2) - -@pretty_belapsed at_dot_call!($X_vector, $Y_vector) -@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $uss, swap=0) -@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $uss, swap=1) -@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $uss, swap=2) +uss = BIS.UniversalSizesStatic(Nv, Nij, Nh); +BIS.at_dot_call!(X_vector, Y_vector; nreps=1) +BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=0, nreps=1) +BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=1, nreps=1) +BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=2, nreps=1) +BIS.test_custom_kernel_bc!(X_array, Y_array, uss; swap=1) +BIS.test_custom_kernel_bc!(X_array, Y_array, uss; swap=2) + +BIS.at_dot_call!(X_vector, Y_vector; nreps=1000, bm) +BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=0, nreps=1000, bm) +BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=1, nreps=1000, bm) +BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=2, nreps=1000, bm) + +@info "ArrayType = $ArrayType" +BIS.tabulate_benchmark(bm) #! 
format: on diff --git a/benchmarks/scripts/indexing_and_static_ndranges.jl b/benchmarks/scripts/indexing_and_static_ndranges.jl index 4ac1ad0fdd..3e42f0ec16 100644 --- a/benchmarks/scripts/indexing_and_static_ndranges.jl +++ b/benchmarks/scripts/indexing_and_static_ndranges.jl @@ -5,7 +5,7 @@ using Revise; include(joinpath("benchmarks", "scripts", "indexing_and_static_ndr # Info: This script compares two things: - linear vs cartesian indexing - - impact of static vs dynamic NDRanges (https://juliagpu.github.io/KernelAbstractions.jl/dev/examples/memcopy_static/) + - impact of static vs dynamic NDRanges (https://juliagpu.github.io/KernelAbstractions.jl/dev/examples/memcopy_static/) Linear indexing, when possible, has performance advantages over using Cartesian indexing. Julia Base's Broadcast only @@ -30,61 +30,84 @@ setting where linear indexing is allowed. nearly the same benefit as linear indexing. # References: - - https://github.com/CliMA/ClimaCore.jl/issues/1889 - - https://github.com/JuliaLang/julia/issues/28126 - - https://github.com/JuliaLang/julia/issues/32051 + - https://github.com/CliMA/ClimaCore.jl/issues/1889 + - https://github.com/JuliaLang/julia/issues/28126 + - https://github.com/JuliaLang/julia/issues/32051 # Benchmark results: -Local Apple M1 Mac (CPU): +Clima A100: ``` -at_dot_call!($X_array, $Y_array): - 143 milliseconds, 774 microseconds -at_dot_call!($X_vector, $Y_vector): - 65 milliseconds, 567 microseconds -custom_kernel_bc!($X_vector, $Y_vector, $us): - 66 milliseconds, 870 microseconds -custom_kernel_bc!($X_array, $Y_array, $us; use_pw = false): - 143 milliseconds, 643 microseconds -custom_kernel_bc!($X_array, $Y_array, $us; use_pw = true): - 65 milliseconds, 778 microseconds -custom_kernel_bc!($X_vector, $Y_vector, $uss): - 65 milliseconds, 765 microseconds -custom_kernel_bc!($X_array, $Y_array, $uss; use_pw = false): - 144 milliseconds, 271 microseconds -custom_kernel_bc!($X_array, $Y_array, $uss; use_pw = true): - 66
milliseconds, 376 microseconds +[ Info: ArrayType = identity +Problem size: (63, 4, 4, 1, 5400), float_type = Float32, device_bandwidth_GBs=2039 +┌────────────────────────────────────────────────────────────────────────────┬───────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐ +│ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps │ +├────────────────────────────────────────────────────────────────────────────┼───────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤ +│ BSR.at_dot_call!(X_array, Y_array; nreps=1000, bm) # slow │ 422 microseconds, 223 nanoseconds │ 2.35535 │ 48.0256 │ 1 │ 1000 │ +│ BSR.at_dot_call!(X_vector, Y_vector; nreps=1000, bm) # fast │ 242 microseconds, 740 nanoseconds │ 4.09692 │ 83.5362 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_vector, Y_vector, us; nreps=1000, bm) │ 242 microseconds, 30 nanoseconds │ 4.10894 │ 83.7812 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_vector, Y_vector, uss; nreps=1000, bm) │ 244 microseconds, 279 nanoseconds │ 4.0711 │ 83.0097 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=false, nreps=1000, bm) │ 499 microseconds, 283 nanoseconds │ 1.99182 │ 40.6133 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=false, nreps=1000, bm) │ 541 microseconds, 506 nanoseconds │ 1.83651 │ 37.4465 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=true, nreps=1000, bm) │ 247 microseconds, 108 nanoseconds │ 4.02449 │ 82.0593 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=true, nreps=1000, bm) │ 242 microseconds, 209 nanoseconds │ 4.10589 │ 83.7192 │ 1 │ 1000 │ +└────────────────────────────────────────────────────────────────────────────┴───────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘ +[ Info: ArrayType = identity +Problem size: (63, 4, 4, 1, 5400), float_type = Float64, device_bandwidth_GBs=2039 
+┌────────────────────────────────────────────────────────────────────────────┬───────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐ +│ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps │ +├────────────────────────────────────────────────────────────────────────────┼───────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤ +│ BSR.at_dot_call!(X_array, Y_array; nreps=1000, bm) # slow │ 1 millisecond, 446 microseconds │ 1.37517 │ 28.0397 │ 1 │ 1000 │ +│ BSR.at_dot_call!(X_vector, Y_vector; nreps=1000, bm) # fast │ 984 microseconds, 854 nanoseconds │ 2.01955 │ 41.1787 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_vector, Y_vector, us; nreps=1000, bm) │ 987 microseconds, 438 nanoseconds │ 2.01427 │ 41.0709 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_vector, Y_vector, uss; nreps=1000, bm) │ 985 microseconds, 779 nanoseconds │ 2.01766 │ 41.1401 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=false, nreps=1000, bm) │ 1 millisecond, 475 microseconds │ 1.34834 │ 27.4927 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=false, nreps=1000, bm) │ 1 millisecond, 473 microseconds │ 1.34985 │ 27.5234 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=true, nreps=1000, bm) │ 983 microseconds, 811 nanoseconds │ 2.0217 │ 41.2224 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=true, nreps=1000, bm) │ 984 microseconds, 683 nanoseconds │ 2.0199 │ 41.1858 │ 1 │ 1000 │ +└────────────────────────────────────────────────────────────────────────────┴───────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘ ``` Clima A100 ``` -at_dot_call!($X_array, $Y_array): - 6 milliseconds, 775 microseconds -at_dot_call!($X_vector, $Y_vector): - 2 milliseconds, 834 microseconds -custom_sol_kernel!($X_vector, $Y_vector, $(Val(N))): - 2 milliseconds, 547 microseconds -custom_kernel_bc!($X_vector, $Y_vector, $us): - 2 milliseconds, 561 
microseconds -custom_kernel_bc!($X_array, $Y_array, $us; use_pw = false): - 4 milliseconds, 160 microseconds -custom_kernel_bc!($X_array, $Y_array, $us; use_pw = true): - 2 milliseconds, 584 microseconds -custom_kernel_bc!($X_vector, $Y_vector, $uss): - 2 milliseconds, 540 microseconds -custom_kernel_bc!($X_array, $Y_array, $uss; use_pw = false): - 2 milliseconds, 715 microseconds -custom_kernel_bc!($X_array, $Y_array, $uss; use_pw = true): - 2 milliseconds, 547 microseconds +[ Info: ArrayType = CuArray +Problem size: (63, 4, 4, 1, 5400), float_type = Float32, device_bandwidth_GBs=2039 +┌─────────────────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐ +│ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps │ +├─────────────────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤ +│ BSR.at_dot_call!(X_array, Y_array; nreps=1000, bm) # slow │ 84 microseconds, 791 nanoseconds │ 11.7287 │ 239.149 │ 1 │ 1000 │ +│ BSR.at_dot_call!(X_vector, Y_vector; nreps=1000, bm) # fast │ 14 microseconds, 497 nanoseconds │ 68.6003 │ 1398.76 │ 1 │ 1000 │ +│ iscpu || BSR.custom_sol_kernel!(X_vector, Y_vector, Val(N); nreps=1000, bm) │ 13 microseconds, 125 nanoseconds │ 75.7724 │ 1545.0 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_vector, Y_vector, us; nreps=1000, bm) │ 14 microseconds, 212 nanoseconds │ 69.9794 │ 1426.88 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_vector, Y_vector, uss; nreps=1000, bm) │ 13 microseconds, 55 nanoseconds │ 76.1765 │ 1553.24 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=false, nreps=1000, bm) │ 47 microseconds, 258 nanoseconds │ 21.0439 │ 429.084 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=false, nreps=1000, bm) │ 30 microseconds, 637 nanoseconds │ 32.4612 │ 661.884 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_array, 
Y_array, us; use_pw=true, nreps=1000, bm) │ 14 microseconds, 386 nanoseconds │ 69.1326 │ 1409.61 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=true, nreps=1000, bm) │ 13 microseconds, 58 nanoseconds │ 76.1646 │ 1553.0 │ 1 │ 1000 │ +└─────────────────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘ +[ Info: ArrayType = CuArray +Problem size: (63, 4, 4, 1, 5400), float_type = Float64, device_bandwidth_GBs=2039 +┌─────────────────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐ +│ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps │ +├─────────────────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤ +│ BSR.at_dot_call!(X_array, Y_array; nreps=1000, bm) # slow │ 85 microseconds, 69 nanoseconds │ 23.3807 │ 476.732 │ 1 │ 1000 │ +│ BSR.at_dot_call!(X_vector, Y_vector; nreps=1000, bm) # fast │ 28 microseconds, 809 nanoseconds │ 69.0417 │ 1407.76 │ 1 │ 1000 │ +│ iscpu || BSR.custom_sol_kernel!(X_vector, Y_vector, Val(N); nreps=1000, bm) │ 26 microseconds, 183 nanoseconds │ 75.965 │ 1548.93 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_vector, Y_vector, us; nreps=1000, bm) │ 26 microseconds, 426 nanoseconds │ 75.2673 │ 1534.7 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_vector, Y_vector, uss; nreps=1000, bm) │ 26 microseconds, 256 nanoseconds │ 75.7546 │ 1544.64 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=false, nreps=1000, bm) │ 47 microseconds, 819 nanoseconds │ 41.5938 │ 848.098 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=false, nreps=1000, bm) │ 31 microseconds, 442 nanoseconds │ 63.2584 │ 1289.84 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=true, nreps=1000, bm) │ 26 microseconds, 729 
nanoseconds │ 74.4138 │ 1517.3 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=true, nreps=1000, bm) │ 26 microseconds, 642 nanoseconds │ 74.6569 │ 1522.25 │ 1 │ 1000 │ +└─────────────────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘ ``` =# #! format: off -import CUDA -using BenchmarkTools, Dates -using LazyBroadcast: @lazy -ArrayType = CUDA.CuArray; -# ArrayType = identity; + +module IndexStaticRangeBench + +include("benchmark_utils.jl") # ============================================================ Non-extruded broadcast (start) import Base.Broadcast: BroadcastStyle @@ -216,62 +239,32 @@ _axes(bc::PointWiseBC{<:Base.Broadcast.AbstractArrayStyle{0}}, ::Nothing) = () Base.IndexStyle(::Type{<:PointWiseBC{<:Any, <:Tuple{Any}}}) = IndexLinear() # ============================================================ Non-extruded broadcast (end) -if ArrayType === identity - macro pretty_belapsed(expr) - return quote - println($(string(expr)), ":") - print(" ") - print_time_and_units(BenchmarkTools.@belapsed(esc($expr))) - end - end - macro pretty_elapsed(expr) - return quote - println($(string(expr)), ":") - print(" ") - print_time_and_units(BenchmarkTools.@elapsed(esc($expr))) - end - end -else - macro pretty_belapsed(expr) - return quote - println($(string(expr)), ":") - print(" ") - print_time_and_units( - BenchmarkTools.@belapsed(CUDA.@sync((esc($expr)))) - ) - end - end - macro pretty_elapsed(expr) - return quote - println($(string(expr)), ":") - print(" ") - print_time_and_units( - BenchmarkTools.@elapsed(CUDA.@sync((esc($expr)))) - ) - end - end -end -print_time_and_units(x) = println(time_and_units_str(x)) -time_and_units_str(x::Real) = - trunc_time(string(compound_period(x, Dates.Second))) -function compound_period(x::Real, ::Type{T}) where {T <: Dates.Period} - nf = Dates.value(convert(Dates.Nanosecond, T(1))) - ns = Dates.Nanosecond(ceil(x * nf)) - 
return Dates.canonicalize(Dates.CompoundPeriod(ns)) -end -trunc_time(s::String) = count(',', s) > 1 ? join(split(s, ",")[1:2], ",") : s myadd(x1, x2, x3) = zero(x1) -function at_dot_call!(X, Y) +function at_dot_call!(X, Y; nreps = 1, bm=nothing) (; x1, x2, x3) = X (; y1) = Y - for i in 1:100 # reduce variance / impact of launch latency - @. y1 = myadd(x1, x2, x3) # 3 reads, 1 write - # @. y1 = 0 # 3 reads, 1 write + @. y1 = myadd(x1, x2, x3) # compile + e = CUDA.@elapsed begin + for i in 1:nreps # reduce variance / impact of launch latency + @. y1 = myadd(x1, x2, x3) # 3 reads, 1 write + end + end + if !isnothing(bm) + kernel_time_s=e/nreps + n_reads_writes=1 + nt = (; + caller=@caller_name(@__FILE__), + kernel_time_s, + n_reads_writes, + nreps, + perf_stats(;bm,kernel_time_s,n_reads_writes)... + ) + push!(bm.data, nt) end return nothing end; -function custom_sol_kernel!(X, Y, ::Val{N}) where {N} +function custom_sol_kernel!(X, Y, ::Val{N}; nreps = 1, bm=nothing) where {N} (; x1, x2, x3) = X (; y1) = Y kernel = CUDA.@cuda always_inline = true launch = false custom_kernel_knl!( @@ -284,9 +277,25 @@ function custom_sol_kernel!(X, Y, ::Val{N}) where {N} config = CUDA.launch_configuration(kernel.fun) threads = min(N, config.threads) blocks = cld(N, threads) - for i in 1:100 # reduce variance / impact of launch latency - kernel(y1, x1, x2, x3, Val(N); threads, blocks) + kernel(y1, x1, x2, x3, Val(N); threads, blocks) # compile + e = CUDA.@elapsed begin + for i in 1:nreps # reduce variance / impact of launch latency + kernel(y1, x1, x2, x3, Val(N); threads, blocks) + end + end + if !isnothing(bm) + kernel_time_s=e/nreps + n_reads_writes=1 + nt = (; + caller=@caller_name(@__FILE__), + kernel_time_s, + n_reads_writes, + nreps, + perf_stats(;bm,kernel_time_s,n_reads_writes)... 
+ ) + push!(bm.data, nt) end + return nothing end; function custom_kernel_knl!(y1, x1, x2, x3, ::Val{N}) where {N} @@ -299,42 +308,26 @@ function custom_kernel_knl!(y1, x1, x2, x3, ::Val{N}) where {N} return nothing end; -abstract type AbstractUniversalSizes{Nv, Nij} end -struct UniversalSizesCC{Nv, Nij} <: AbstractUniversalSizes{Nv, Nij} - Nh::Int -end -struct UniversalSizesStatic{Nv, Nij, Nh} <: AbstractUniversalSizes{Nv, Nij} end - -get_Nv(::AbstractUniversalSizes{Nv}) where {Nv} = Nv -get_Nij(::AbstractUniversalSizes{Nv, Nij}) where {Nv, Nij} = Nij -get_Nh(us::UniversalSizesCC) = us.Nh -get_Nh(::UniversalSizesStatic{Nv, Nij, Nh}) where {Nv, Nij, Nh} = Nh -get_N(us::AbstractUniversalSizes{Nv, Nij}) where {Nv, Nij} = prod((Nv,Nij,Nij,1,get_Nh(us))) -UniversalSizesCC(Nv, Nij, Nh) = UniversalSizesCC{Nv, Nij}(Nh) -UniversalSizesStatic(Nv, Nij, Nh) = UniversalSizesStatic{Nv, Nij, Nh}() -using Test -us_tup = (1, 2, 3) -@test get_Nv(UniversalSizesCC(us_tup...)) == get_Nv(UniversalSizesStatic(us_tup...)) -@test get_Nij(UniversalSizesCC(us_tup...)) == get_Nij(UniversalSizesStatic(us_tup...)) -@test get_Nh(UniversalSizesCC(us_tup...)) == get_Nh(UniversalSizesStatic(us_tup...)) -@test get_N(UniversalSizesCC(us_tup...)) == get_N(UniversalSizesStatic(us_tup...)) - -function custom_kernel_bc!(X, Y, us::AbstractUniversalSizes; printtb=false, use_pw=true) +function custom_kernel_bc!(X, Y, us::AbstractUniversalSizes; printtb=false, use_pw=true, nreps = 1, bm=nothing) (; x1, x2, x3) = X (; y1) = Y bc_base = @lazy @. y1 = myadd(x1, x2, x3) bc = use_pw ? 
to_pointwise_bc(bc_base) : bc_base if y1 isa Array if bc isa Base.Broadcast.Broadcasted - for i in 1:100 # reduce variance / impact of launch latency - @inbounds @simd for j in eachindex(bc) - y1[j] = bc[j] + e = Base.@elapsed begin + for i in 1:nreps # reduce variance / impact of launch latency + @inbounds @simd for j in eachindex(bc) + y1[j] = bc[j] + end end end else - for i in 1:100 # reduce variance / impact of launch latency - @inbounds @simd for j in 1:get_N(us) - y1[j] = bc[j] + e = Base.@elapsed begin + for i in 1:nreps # reduce variance / impact of launch latency + @inbounds @simd for j in 1:get_N(us) + y1[j] = bc[j] + end end end end @@ -350,10 +343,25 @@ function custom_kernel_bc!(X, Y, us::AbstractUniversalSizes; printtb=false, use_ threads = min(N, config.threads) blocks = cld(N, threads) printtb && @show blocks, threads - for i in 1:100 # reduce variance / impact of launch latency - kernel(y1, bc,us; threads, blocks) + kernel(y1, bc,us; threads, blocks) # compile + e = CUDA.@elapsed begin + for i in 1:nreps # reduce variance / impact of launch latency + kernel(y1, bc,us; threads, blocks) + end end end + if !isnothing(bm) + kernel_time_s=e/nreps + n_reads_writes=1 + nt = (; + caller=@caller_name(@__FILE__), + kernel_time_s, + n_reads_writes, + nreps, + perf_stats(;bm,kernel_time_s,n_reads_writes)... 
+ ) + push!(bm.data, nt) + end return nothing end; @inline get_cart_lin_index(bc, n, I) = I @@ -371,40 +379,61 @@ function custom_kernel_knl_bc!(y1, bc, us) return nothing end; -FT = Float32; -arr(T) = T(zeros(63,4,4,1,5400)) -X_array = (;x1 = arr(ArrayType),x2 = arr(ArrayType),x3 = arr(ArrayType)); -Y_array = (;y1 = arr(ArrayType),); +end # module +import .IndexStaticRangeBench as BSR + +using CUDA +using Test +bm = BSR.Benchmark(;problem_size=(63,4,4,1,5400), float_type=Float32) +# bm = BSR.Benchmark(;problem_size=(63,4,4,1,5400), float_type=Float64) +ArrayType = CUDA.CuArray; +# ArrayType = Base.identity; + +us_tup = (1, 2, 3) +@test BSR.get_Nv(BSR.UniversalSizesCC(us_tup...)) == BSR.get_Nv(BSR.UniversalSizesStatic(us_tup...)) +@test BSR.get_Nij(BSR.UniversalSizesCC(us_tup...)) == BSR.get_Nij(BSR.UniversalSizesStatic(us_tup...)) +@test BSR.get_Nh(BSR.UniversalSizesCC(us_tup...)) == BSR.get_Nh(BSR.UniversalSizesStatic(us_tup...)) +@test BSR.get_N(BSR.UniversalSizesCC(us_tup...)) == BSR.get_N(BSR.UniversalSizesStatic(us_tup...)) + +arr(bm, T) = T(zeros(bm.float_type, bm.problem_size...)) +X_array = (;x1 = arr(bm, ArrayType),x2 = arr(bm, ArrayType),x3 = arr(bm, ArrayType)); +Y_array = (;y1 = arr(bm, ArrayType),); to_vec(ξ) = (;zip(propertynames(ξ), map(θ -> vec(θ), values(ξ)))...); X_vector = to_vec(X_array); Y_vector = to_vec(Y_array); -at_dot_call!(X_array, Y_array) -at_dot_call!(X_vector, Y_vector) +BSR.at_dot_call!(X_array, Y_array) +BSR.at_dot_call!(X_vector, Y_vector) N = length(X_vector.x1) (Nv, Nij, _, Nf, Nh) = size(Y_array.y1); -us = UniversalSizesCC(Nv, Nij, Nh); -uss = UniversalSizesStatic(Nv, Nij, Nh); -@test get_N(us) == N -@test get_N(uss) == N +us = BSR.UniversalSizesCC(Nv, Nij, Nh); +uss = BSR.UniversalSizesStatic(Nv, Nij, Nh); +@test BSR.get_N(us) == N +@test BSR.get_N(uss) == N iscpu = ArrayType === identity -iscpu || custom_sol_kernel!(X_vector, Y_vector, Val(N)) -custom_kernel_bc!(X_vector, Y_vector, us) -custom_kernel_bc!(X_array, Y_array, us; 
use_pw=false) -custom_kernel_bc!(X_array, Y_array, us; use_pw=true) - -custom_kernel_bc!(X_vector, Y_vector, uss) -custom_kernel_bc!(X_array, Y_array, uss; use_pw=false) -custom_kernel_bc!(X_array, Y_array, uss; use_pw=true) - -@pretty_belapsed at_dot_call!($X_array, $Y_array) # slow -@pretty_belapsed at_dot_call!($X_vector, $Y_vector) # fast -iscpu || @pretty_belapsed custom_sol_kernel!($X_vector, $Y_vector, $(Val(N))) -@pretty_belapsed custom_kernel_bc!($X_vector, $Y_vector, $us) -@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $us; use_pw=false) -@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $us; use_pw=true) - -@pretty_belapsed custom_kernel_bc!($X_vector, $Y_vector, $uss) -@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $uss; use_pw=false) -@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $uss; use_pw=true) +BSR.custom_kernel_bc!(X_vector, Y_vector, us) +BSR.custom_kernel_bc!(X_vector, Y_vector, uss) +iscpu || BSR.custom_sol_kernel!(X_vector, Y_vector, Val(N)) + +BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=false) +BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=false) + +BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=true) +BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=true) + +BSR.at_dot_call!(X_array, Y_array; nreps=1000, bm) # slow +BSR.at_dot_call!(X_vector, Y_vector; nreps=1000, bm) # fast +iscpu || BSR.custom_sol_kernel!(X_vector, Y_vector, Val(N); nreps=1000, bm) + +BSR.custom_kernel_bc!(X_vector, Y_vector, us; nreps=1000, bm) +BSR.custom_kernel_bc!(X_vector, Y_vector, uss; nreps=1000, bm) + +BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=false, nreps=1000, bm) +BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=false, nreps=1000, bm) + +BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=true, nreps=1000, bm) +BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=true, nreps=1000, bm) + +@info "ArrayType = $ArrayType" +BSR.tabulate_benchmark(bm) #! format: on