diff --git a/benchmarks/scripts/benchmark_utils.jl b/benchmarks/scripts/benchmark_utils.jl index 4b75b30183..239005a104 100644 --- a/benchmarks/scripts/benchmark_utils.jl +++ b/benchmarks/scripts/benchmark_utils.jl @@ -79,11 +79,18 @@ function tabulate_benchmark(bm) nreps, ) title = "Problem size: $(bm.problem_size), float_type = $(bm.float_type), device_bandwidth_GBs=$(bm.device_bandwidth_GBs)" - PrettyTables.pretty_table( - data; - title, - header, - alignment = :l, - crop = :none, + PrettyTables.pretty_table(data; title, header, alignment = :l, crop = :none) +end + +push_info(bm::Nothing; e, nreps, caller, n_reads_writes) = nothing +function push_info(bm; e, nreps, caller, n_reads_writes) + kernel_time_s = e / nreps + nt = (; + caller, + kernel_time_s, + n_reads_writes, + nreps, + perf_stats(; bm, kernel_time_s, n_reads_writes)..., ) + push!(bm.data, nt) end diff --git a/benchmarks/scripts/index_swapping.jl b/benchmarks/scripts/index_swapping.jl index 1e02a2874f..20410f7d68 100644 --- a/benchmarks/scripts/index_swapping.jl +++ b/benchmarks/scripts/index_swapping.jl @@ -28,10 +28,20 @@ Problem size: (63, 4, 4, 1, 5400), float_type = Float32, device_bandwidth_GBs=20 ┌──────────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐ │ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps │ ├──────────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤ -│ BIS.at_dot_call!(X_vector, Y_vector; nreps=1000, bm) │ 36 microseconds, 195 nanoseconds │ 54.952 │ 1120.47 │ 2 │ 1000 │ -│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=0, nreps=1000, bm) │ 74 microseconds, 228 nanoseconds │ 26.7955 │ 546.359 │ 2 │ 1000 │ -│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=1, nreps=1000, bm) │ 82 microseconds, 501 nanoseconds │ 24.1085 │ 491.572 │ 2 │ 1000 │ -│ 
BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=2, nreps=1000, bm) │ 72 microseconds, 567 nanoseconds │ 27.4088 │ 558.865 │ 2 │ 1000 │ +│ BIS.at_dot_call!(X_vector, Y_vector; nreps=1000, bm) │ 34 microseconds, 617 nanoseconds │ 57.4574 │ 1171.56 │ 2 │ 1000 │ +│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=0, nreps=1000, bm) │ 60 microseconds, 384 nanoseconds │ 32.939 │ 671.627 │ 2 │ 1000 │ +│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=1, nreps=1000, bm) │ 68 microseconds, 108 nanoseconds │ 29.2034 │ 595.458 │ 2 │ 1000 │ +│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=2, nreps=1000, bm) │ 60 microseconds, 395 nanoseconds │ 32.9329 │ 671.502 │ 2 │ 1000 │ +└──────────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘ +[ Info: ArrayType = CuArray +Problem size: (63, 4, 4, 1, 5400), float_type = Float64, device_bandwidth_GBs=2039 +┌──────────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐ +│ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps │ +├──────────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤ +│ BIS.at_dot_call!(X_vector, Y_vector; nreps=1000, bm) │ 59 microseconds, 558 nanoseconds │ 66.791 │ 1361.87 │ 2 │ 1000 │ +│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=0, nreps=1000, bm) │ 63 microseconds, 238 nanoseconds │ 62.905 │ 1282.63 │ 2 │ 1000 │ +│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=1, nreps=1000, bm) │ 80 microseconds, 502 nanoseconds │ 49.4142 │ 1007.56 │ 2 │ 1000 │ +│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=2, nreps=1000, bm) │ 63 microseconds, 228 nanoseconds │ 62.9142 │ 1282.82 │ 2 │ 1000 │ 
└──────────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘ ``` =# @@ -42,28 +52,24 @@ module IndexSwapBench include("benchmark_utils.jl") foo(x1, x2, x3) = x1 -function at_dot_call!(X, Y; nreps = 1, print_info = true, bm=nothing) +function at_dot_call!(X, Y; nreps = 1, print_info = true, bm=nothing, n_trials = 30) (; x1, x2, x3) = X (; y1) = Y - e = CUDA.@elapsed begin for i in 1:nreps # reduce variance / impact of launch latency - @. y1 = foo(x1, x2, x3) # 3 reads, 1 write + e = Inf + @. y1 = foo(x1, x2, x3) # compile + for t in 1:n_trials + et = CUDA.@elapsed begin + for i in 1:nreps # reduce variance / impact of launch latency + @. y1 = foo(x1, x2, x3) # 1 write, 1 read + end end + e = min(e, et) end - if !isnothing(bm) - kernel_time_s=e/nreps - nt = (; - caller=@caller_name(@__FILE__), - kernel_time_s, - n_reads_writes=2, - nreps, - perf_stats(;bm,kernel_time_s,n_reads_writes=2)... - ) - push!(bm.data, nt) - end + push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=2) return nothing end; -function custom_kernel_bc!(X, Y, us::UniversalSizesStatic; swap=0, printtb=false, nreps = 1, print_info = true, bm=nothing) +function custom_kernel_bc!(X, Y, us::UniversalSizesStatic; swap=0, printtb=false, nreps = 1, print_info = true, bm=nothing, n_trials=30) (; x1, x2, x3) = X (; y1) = Y bc = @lazy @. 
y1 = foo(x1, x2, x3) @@ -88,22 +94,17 @@ function custom_kernel_bc!(X, Y, us::UniversalSizesStatic; swap=0, printtb=false threads = min(N, config.threads) blocks = cld(N, threads) printtb && @show blocks, threads - e = CUDA.@elapsed begin - for i in 1:nreps # reduce variance / impact of launch latency - kernel(y1, bc,us; threads, blocks) + kernel(y1, bc,us; threads, blocks) # compile + e = Inf + for t in 1:n_trials + et = CUDA.@elapsed begin + for i in 1:nreps # reduce variance / impact of launch latency + kernel(y1, bc,us; threads, blocks) + end end + e = min(e, et) end - if !isnothing(bm) - kernel_time_s=e/nreps - nt = (; - caller=@caller_name(@__FILE__), - kernel_time_s, - n_reads_writes=2, - nreps, - perf_stats(;bm,kernel_time_s,n_reads_writes=2)... - ) - push!(bm.data, nt) - end + push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=2) return nothing end; diff --git a/benchmarks/scripts/indexing_and_static_ndranges.jl b/benchmarks/scripts/indexing_and_static_ndranges.jl index 3e42f0ec16..defcb3d372 100644 --- a/benchmarks/scripts/indexing_and_static_ndranges.jl +++ b/benchmarks/scripts/indexing_and_static_ndranges.jl @@ -5,7 +5,7 @@ using Revise; include(joinpath("benchmarks", "scripts", "indexing_and_static_ndr # Info: This script compares two things: - linear vs cartesian indexing - - impact of static vs dynamic NDRanges (https://juliagpu.githubSR.io/KernelAbstractions.jl/dev/examples/memcopy_static/) + - impact of static vs dynamic NDRanges (https://juliagpu.github.io/KernelAbstractions.jl/dev/examples/memcopy_static/) Linear indexing, when possible, has performance advantages over using Cartesian indexing. 
Julia Base's Broadcast only @@ -43,8 +43,8 @@ Problem size: (63, 4, 4, 1, 5400), float_type = Float32, device_bandwidth_GBs=20 ┌────────────────────────────────────────────────────────────────────────────┬───────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐ │ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps │ ├────────────────────────────────────────────────────────────────────────────┼───────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤ -│ BSR.at_dot_call!(X_array, Y_array; nreps=1000, bm) # slow │ 422 microseconds, 223 nanoseconds │ 2.35535 │ 48.0256 │ 1 │ 1000 │ -│ BSR.at_dot_call!(X_vector, Y_vector; nreps=1000, bm) # fast │ 242 microseconds, 740 nanoseconds │ 4.09692 │ 83.5362 │ 1 │ 1000 │ +│ BSR.at_dot_call!(X_array, Y_array; nreps=1000, bm) │ 422 microseconds, 223 nanoseconds │ 2.35535 │ 48.0256 │ 1 │ 1000 │ +│ BSR.at_dot_call!(X_vector, Y_vector; nreps=1000, bm) │ 242 microseconds, 740 nanoseconds │ 4.09692 │ 83.5362 │ 1 │ 1000 │ │ BSR.custom_kernel_bc!(X_vector, Y_vector, us; nreps=1000, bm) │ 242 microseconds, 30 nanoseconds │ 4.10894 │ 83.7812 │ 1 │ 1000 │ │ BSR.custom_kernel_bc!(X_vector, Y_vector, uss; nreps=1000, bm) │ 244 microseconds, 279 nanoseconds │ 4.0711 │ 83.0097 │ 1 │ 1000 │ │ BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=false, nreps=1000, bm) │ 499 microseconds, 283 nanoseconds │ 1.99182 │ 40.6133 │ 1 │ 1000 │ @@ -57,8 +57,8 @@ Problem size: (63, 4, 4, 1, 5400), float_type = Float64, device_bandwidth_GBs=20 ┌────────────────────────────────────────────────────────────────────────────┬───────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐ │ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps │ ├────────────────────────────────────────────────────────────────────────────┼───────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤ -│ BSR.at_dot_call!(X_array, Y_array; 
nreps=1000, bm) # slow │ 1 millisecond, 446 microseconds │ 1.37517 │ 28.0397 │ 1 │ 1000 │ -│ BSR.at_dot_call!(X_vector, Y_vector; nreps=1000, bm) # fast │ 984 microseconds, 854 nanoseconds │ 2.01955 │ 41.1787 │ 1 │ 1000 │ +│ BSR.at_dot_call!(X_array, Y_array; nreps=1000, bm) │ 1 millisecond, 446 microseconds │ 1.37517 │ 28.0397 │ 1 │ 1000 │ +│ BSR.at_dot_call!(X_vector, Y_vector; nreps=1000, bm) │ 984 microseconds, 854 nanoseconds │ 2.01955 │ 41.1787 │ 1 │ 1000 │ │ BSR.custom_kernel_bc!(X_vector, Y_vector, us; nreps=1000, bm) │ 987 microseconds, 438 nanoseconds │ 2.01427 │ 41.0709 │ 1 │ 1000 │ │ BSR.custom_kernel_bc!(X_vector, Y_vector, uss; nreps=1000, bm) │ 985 microseconds, 779 nanoseconds │ 2.01766 │ 41.1401 │ 1 │ 1000 │ │ BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=false, nreps=1000, bm) │ 1 millisecond, 475 microseconds │ 1.34834 │ 27.4927 │ 1 │ 1000 │ @@ -75,30 +75,30 @@ Problem size: (63, 4, 4, 1, 5400), float_type = Float32, device_bandwidth_GBs=20 ┌─────────────────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐ │ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps │ ├─────────────────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤ -│ BSR.at_dot_call!(X_array, Y_array; nreps=1000, bm) # slow │ 84 microseconds, 791 nanoseconds │ 11.7287 │ 239.149 │ 1 │ 1000 │ -│ BSR.at_dot_call!(X_vector, Y_vector; nreps=1000, bm) # fast │ 14 microseconds, 497 nanoseconds │ 68.6003 │ 1398.76 │ 1 │ 1000 │ -│ iscpu || BSR.custom_sol_kernel!(X_vector, Y_vector, Val(N); nreps=1000, bm) │ 13 microseconds, 125 nanoseconds │ 75.7724 │ 1545.0 │ 1 │ 1000 │ -│ BSR.custom_kernel_bc!(X_vector, Y_vector, us; nreps=1000, bm) │ 14 microseconds, 212 nanoseconds │ 69.9794 │ 1426.88 │ 1 │ 1000 │ -│ BSR.custom_kernel_bc!(X_vector, Y_vector, uss; nreps=1000, bm) │ 13 
microseconds, 55 nanoseconds │ 76.1765 │ 1553.24 │ 1 │ 1000 │ -│ BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=false, nreps=1000, bm) │ 47 microseconds, 258 nanoseconds │ 21.0439 │ 429.084 │ 1 │ 1000 │ -│ BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=false, nreps=1000, bm) │ 30 microseconds, 637 nanoseconds │ 32.4612 │ 661.884 │ 1 │ 1000 │ -│ BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=true, nreps=1000, bm) │ 14 microseconds, 386 nanoseconds │ 69.1326 │ 1409.61 │ 1 │ 1000 │ -│ BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=true, nreps=1000, bm) │ 13 microseconds, 58 nanoseconds │ 76.1646 │ 1553.0 │ 1 │ 1000 │ +│ BSR.at_dot_call!(X_array, Y_array; nreps=1000, bm) │ 68 microseconds, 641 nanoseconds │ 14.4882 │ 295.415 │ 1 │ 1000 │ +│ BSR.at_dot_call!(X_vector, Y_vector; nreps=1000, bm) │ 13 microseconds, 787 nanoseconds │ 72.1366 │ 1470.86 │ 1 │ 1000 │ +│ iscpu || BSR.custom_sol_kernel!(X_vector, Y_vector, Val(N); nreps=1000, bm) │ 12 microseconds, 925 nanoseconds │ 76.943 │ 1568.87 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_vector, Y_vector, us; nreps=1000, bm) │ 13 microseconds, 364 nanoseconds │ 74.4195 │ 1517.41 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_vector, Y_vector, uss; nreps=1000, bm) │ 12 microseconds, 929 nanoseconds │ 76.9247 │ 1568.49 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=false, nreps=1000, bm) │ 41 microseconds, 5 nanoseconds │ 24.2533 │ 494.525 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=false, nreps=1000, bm) │ 26 microseconds, 652 nanoseconds │ 37.3141 │ 760.835 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=true, nreps=1000, bm) │ 13 microseconds, 582 nanoseconds │ 73.2243 │ 1493.04 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=true, nreps=1000, bm) │ 12 microseconds, 922 nanoseconds │ 76.9613 │ 1569.24 │ 1 │ 1000 │ 
└─────────────────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘ [ Info: ArrayType = CuArray Problem size: (63, 4, 4, 1, 5400), float_type = Float64, device_bandwidth_GBs=2039 ┌─────────────────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐ │ funcs │ time per call │ bw % │ achieved bw │ n-reads/writes │ n-reps │ ├─────────────────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤ -│ BSR.at_dot_call!(X_array, Y_array; nreps=1000, bm) # slow │ 85 microseconds, 69 nanoseconds │ 23.3807 │ 476.732 │ 1 │ 1000 │ -│ BSR.at_dot_call!(X_vector, Y_vector; nreps=1000, bm) # fast │ 28 microseconds, 809 nanoseconds │ 69.0417 │ 1407.76 │ 1 │ 1000 │ -│ iscpu || BSR.custom_sol_kernel!(X_vector, Y_vector, Val(N); nreps=1000, bm) │ 26 microseconds, 183 nanoseconds │ 75.965 │ 1548.93 │ 1 │ 1000 │ -│ BSR.custom_kernel_bc!(X_vector, Y_vector, us; nreps=1000, bm) │ 26 microseconds, 426 nanoseconds │ 75.2673 │ 1534.7 │ 1 │ 1000 │ -│ BSR.custom_kernel_bc!(X_vector, Y_vector, uss; nreps=1000, bm) │ 26 microseconds, 256 nanoseconds │ 75.7546 │ 1544.64 │ 1 │ 1000 │ -│ BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=false, nreps=1000, bm) │ 47 microseconds, 819 nanoseconds │ 41.5938 │ 848.098 │ 1 │ 1000 │ -│ BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=false, nreps=1000, bm) │ 31 microseconds, 442 nanoseconds │ 63.2584 │ 1289.84 │ 1 │ 1000 │ -│ BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=true, nreps=1000, bm) │ 26 microseconds, 729 nanoseconds │ 74.4138 │ 1517.3 │ 1 │ 1000 │ -│ BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=true, nreps=1000, bm) │ 26 microseconds, 642 nanoseconds │ 74.6569 │ 1522.25 │ 1 │ 1000 │ +│ BSR.at_dot_call!(X_array, Y_array; nreps=1000, bm) │ 69 
microseconds, 10 nanoseconds │ 28.8217 │ 587.673 │ 1 │ 1000 │ +│ BSR.at_dot_call!(X_vector, Y_vector; nreps=1000, bm) │ 28 microseconds, 219 nanoseconds │ 70.4848 │ 1437.18 │ 1 │ 1000 │ +│ iscpu || BSR.custom_sol_kernel!(X_vector, Y_vector, Val(N); nreps=1000, bm) │ 25 microseconds, 460 nanoseconds │ 78.1221 │ 1592.91 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_vector, Y_vector, us; nreps=1000, bm) │ 25 microseconds, 625 nanoseconds │ 77.6194 │ 1582.66 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_vector, Y_vector, uss; nreps=1000, bm) │ 25 microseconds, 436 nanoseconds │ 78.1975 │ 1594.45 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=false, nreps=1000, bm) │ 41 microseconds, 621 nanoseconds │ 47.7881 │ 974.4 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=false, nreps=1000, bm) │ 27 microseconds, 111 nanoseconds │ 73.3654 │ 1495.92 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=true, nreps=1000, bm) │ 25 microseconds, 931 nanoseconds │ 76.703 │ 1563.97 │ 1 │ 1000 │ +│ BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=true, nreps=1000, bm) │ 25 microseconds, 464 nanoseconds │ 78.1095 │ 1592.65 │ 1 │ 1000 │ └─────────────────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘ ``` =# @@ -240,31 +240,24 @@ Base.IndexStyle(::Type{<:PointWiseBC{<:Any, <:Tuple{Any}}}) = IndexLinear() # ============================================================ Non-extruded broadcast (end) myadd(x1, x2, x3) = zero(x1) -function at_dot_call!(X, Y; nreps = 1, bm=nothing) +function at_dot_call!(X, Y; nreps = 1, bm=nothing, n_trials = 30) (; x1, x2, x3) = X (; y1) = Y @. y1 = myadd(x1, x2, x3) # compile - e = CUDA.@elapsed begin - for i in 1:nreps # reduce variance / impact of launch latency - @. 
y1 = myadd(x1, x2, x3) # 3 reads, 1 write + e = Inf + for t in 1:n_trials + et = CUDA.@elapsed begin + for i in 1:nreps # reduce variance / impact of launch latency + @. y1 = myadd(x1, x2, x3) # 1 write (myadd returns zero(x1); matches n_reads_writes=1) + end end + e = min(e, et) end - if !isnothing(bm) - kernel_time_s=e/nreps - n_reads_writes=1 - nt = (; - caller=@caller_name(@__FILE__), - kernel_time_s, - n_reads_writes, - nreps, - perf_stats(;bm,kernel_time_s,n_reads_writes)... - ) - push!(bm.data, nt) - end + push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=1) return nothing end; -function custom_sol_kernel!(X, Y, ::Val{N}; nreps = 1, bm=nothing) where {N} +function custom_sol_kernel!(X, Y, ::Val{N}; nreps = 1, bm=nothing, n_trials = 30) where {N} (; x1, x2, x3) = X (; y1) = Y kernel = CUDA.@cuda always_inline = true launch = false custom_kernel_knl!( @@ -278,23 +271,16 @@ function custom_sol_kernel!(X, Y, ::Val{N}; nreps = 1, bm=nothing) where {N} threads = min(N, config.threads) blocks = cld(N, threads) kernel(y1, x1, x2, x3, Val(N); threads, blocks) # compile - e = CUDA.@elapsed begin - for i in 1:nreps # reduce variance / impact of launch latency - kernel(y1, x1, x2, x3, Val(N); threads, blocks) + e = Inf + for t in 1:n_trials + et = CUDA.@elapsed begin + for i in 1:nreps # reduce variance / impact of launch latency + kernel(y1, x1, x2, x3, Val(N); threads, blocks) + end end + e = min(e, et) end - if !isnothing(bm) - kernel_time_s=e/nreps - n_reads_writes=1 - nt = (; - caller=@caller_name(@__FILE__), - kernel_time_s, - n_reads_writes, - nreps, - perf_stats(;bm,kernel_time_s,n_reads_writes)... 
- ) - push!(bm.data, nt) - end + push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=1) return nothing end; @@ -308,27 +294,34 @@ function custom_kernel_knl!(y1, x1, x2, x3, ::Val{N}) where {N} return nothing end; -function custom_kernel_bc!(X, Y, us::AbstractUniversalSizes; printtb=false, use_pw=true, nreps = 1, bm=nothing) +function custom_kernel_bc!(X, Y, us::AbstractUniversalSizes; printtb=false, use_pw=true, nreps = 1, bm=nothing, n_trials = 30) (; x1, x2, x3) = X (; y1) = Y bc_base = @lazy @. y1 = myadd(x1, x2, x3) bc = use_pw ? to_pointwise_bc(bc_base) : bc_base + e = Inf if y1 isa Array if bc isa Base.Broadcast.Broadcasted - e = Base.@elapsed begin - for i in 1:nreps # reduce variance / impact of launch latency - @inbounds @simd for j in eachindex(bc) - y1[j] = bc[j] + for t in 1:n_trials + et = Base.@elapsed begin + for i in 1:nreps # reduce variance / impact of launch latency + @inbounds @simd for j in eachindex(bc) + y1[j] = bc[j] + end end end + e = min(e, et) end else - e = Base.@elapsed begin - for i in 1:nreps # reduce variance / impact of launch latency - @inbounds @simd for j in 1:get_N(us) - y1[j] = bc[j] + for t in 1:n_trials + et = Base.@elapsed begin + for i in 1:nreps # reduce variance / impact of launch latency + @inbounds @simd for j in 1:get_N(us) + y1[j] = bc[j] + end end end + e = min(e, et) end end else @@ -344,24 +337,16 @@ function custom_kernel_bc!(X, Y, us::AbstractUniversalSizes; printtb=false, use_ blocks = cld(N, threads) printtb && @show blocks, threads kernel(y1, bc,us; threads, blocks) # compile - e = CUDA.@elapsed begin - for i in 1:nreps # reduce variance / impact of launch latency - kernel(y1, bc,us; threads, blocks) + for t in 1:n_trials + et = CUDA.@elapsed begin + for i in 1:nreps # reduce variance / impact of launch latency + kernel(y1, bc,us; threads, blocks) + end end + e = min(e, et) end end - if !isnothing(bm) - kernel_time_s=e/nreps - n_reads_writes=1 - nt = (; - caller=@caller_name(@__FILE__), - 
kernel_time_s, - n_reads_writes, - nreps, - perf_stats(;bm,kernel_time_s,n_reads_writes)... - ) - push!(bm.data, nt) - end + push_info(bm; e, nreps, caller = @caller_name(@__FILE__),n_reads_writes=1) return nothing end; @inline get_cart_lin_index(bc, n, I) = I @@ -420,8 +405,8 @@ BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=false) BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=true) BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=true) -BSR.at_dot_call!(X_array, Y_array; nreps=1000, bm) # slow -BSR.at_dot_call!(X_vector, Y_vector; nreps=1000, bm) # fast +BSR.at_dot_call!(X_array, Y_array; nreps=1000, bm) +BSR.at_dot_call!(X_vector, Y_vector; nreps=1000, bm) iscpu || BSR.custom_sol_kernel!(X_vector, Y_vector, Val(N); nreps=1000, bm) BSR.custom_kernel_bc!(X_vector, Y_vector, us; nreps=1000, bm)