diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 0803a91719..1d545cfb93 100755
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -1198,6 +1198,29 @@ steps:
         key: "cpu_field_perf"
         command: "julia --color=yes --project=.buildkite test/Fields/field_opt.jl"
 
+  - group: "Perf: Benchmark scripts" # GPU runs of the stand-alone scripts under benchmarks/scripts
+    steps:
+
+      - label: "Perf: benchmark scripts index_swapping"
+        key: perf_index_swapping
+        command:
+          - "julia --project=.buildkite -e 'using CUDA; CUDA.versioninfo()'" # log the CUDA setup before the benchmark
+          - "julia --color=yes --project=.buildkite benchmarks/scripts/index_swapping.jl"
+        env:
+          CLIMACOMMS_DEVICE: "CUDA"
+        agents:
+          slurm_gpus: 1
+
+      - label: "Perf: benchmark scripts indexing_and_static_ndranges"
+        key: perf_indexing_and_static_ndranges # same `perf_` prefix as the sibling step key
+        command:
+          - "julia --project=.buildkite -e 'using CUDA; CUDA.versioninfo()'" # log the CUDA setup before the benchmark
+          - "julia --color=yes --project=.buildkite benchmarks/scripts/indexing_and_static_ndranges.jl"
+        env:
+          CLIMACOMMS_DEVICE: "CUDA"
+        agents:
+          slurm_gpus: 1
+
   - group: "Perf: Operators"
     steps:
 
diff --git a/benchmarks/scripts/benchmark_utils.jl b/benchmarks/scripts/benchmark_utils.jl
new file mode 100644
index 0000000000..4b75b30183
--- /dev/null
+++ b/benchmarks/scripts/benchmark_utils.jl
@@ -0,0 +1,89 @@
+import CUDA
+using BenchmarkTools, Dates
+using LazyBroadcast: @lazy
+
+"""
+    @caller_name(@__FILE__)
+
+Return, as a string, line `StackTraces.stacktrace()[4].line` of the file `f`, i.e. the
+source line recorded for the 4th stack frame (intended: the benchmark call site).
+"""
+macro caller_name(f)
+    quote
+        string(readlines($f)[StackTraces.stacktrace()[4].line]) # NOTE(review): frame index 4 is hard-coded — confirm it still lands on the call site if wrappers or inlining change
+    end
+end
+
+Base.@kwdef mutable struct Benchmark # benchmark configuration plus the accumulated results
+    problem_size::Tuple # array dimensions; `perf_stats` multiplies them to get the element count
+    float_type::Type # element type; `perf_stats` uses `sizeof(float_type)` as bytes per element
+    device_bandwidth_GBs::Int = 2_039 # reference bandwidth for the efficiency % (presumably the A100 peak in GB/s — confirm)
+    data::Vector = [] # one entry per recorded call; `tabulate_benchmark` reads its fields
+end
+
+function perf_stats(; bm::Benchmark, kernel_time_s, n_reads_writes) # bandwidth statistics for one timed call
+    N = prod(bm.problem_size) # total number of array elements
+    GB = N * n_reads_writes * sizeof(bm.float_type) / 1024^3 # data moved per call; NOTE(review): 1024^3 makes this GiB — confirm `device_bandwidth_GBs` uses the same unit
+    achieved_bandwidth_GBs = GB / kernel_time_s # callers in this patch pass elapsed time divided by `nreps`
+    bandwidth_efficiency =
+        achieved_bandwidth_GBs / bm.device_bandwidth_GBs * 100 # percentage of the reference bandwidth
+    return (; N, GB, achieved_bandwidth_GBs, bandwidth_efficiency)
+end;
+
+time_and_units_str(x::Real) =
+    trunc_time(string(compound_period(x, Dates.Second))) # `x` is a duration in seconds; result keeps at most two units
+function compound_period(x::Real, ::Type{T}) where {T <: Dates.Period} # convert `x` units of `T` into a canonical compound period
+    nf = Dates.value(convert(Dates.Nanosecond, T(1))) # nanoseconds in one `T`
+    ns = Dates.Nanosecond(ceil(x * nf)) # round up to a whole number of nanoseconds
+    return Dates.canonicalize(Dates.CompoundPeriod(ns))
+end
+trunc_time(s::String) = count(',', s) > 1 ? join(split(s, ",")[1:2], ",") : s # keep only the first two comma-separated components
+
+abstract type AbstractUniversalSizes{Nv, Nij} end # sizes with `Nv` and `Nij` carried as type parameters
+struct UniversalSizesCC{Nv, Nij} <: AbstractUniversalSizes{Nv, Nij} # variant that stores `Nh` as a runtime field
+    Nh::Int
+end
+struct UniversalSizesStatic{Nv, Nij, Nh} <: AbstractUniversalSizes{Nv, Nij} end # variant with `Nh` as a type parameter (no fields)
+
+get_Nv(::AbstractUniversalSizes{Nv}) where {Nv} = Nv
+get_Nij(::AbstractUniversalSizes{Nv, Nij}) where {Nv, Nij} = Nij
+get_Nh(us::UniversalSizesCC) = us.Nh # read from the field
+get_Nh(::UniversalSizesStatic{Nv, Nij, Nh}) where {Nv, Nij, Nh} = Nh # read from the type parameter
+get_N(us::AbstractUniversalSizes{Nv, Nij}) where {Nv, Nij} =
+    prod((Nv, Nij, Nij, 1, get_Nh(us))) # total element count: Nv * Nij * Nij * 1 * Nh
+UniversalSizesCC(Nv, Nij, Nh) = UniversalSizesCC{Nv, Nij}(Nh) # positional constructors that lift the sizes into type parameters
+UniversalSizesStatic(Nv, Nij, Nh) = UniversalSizesStatic{Nv, Nij, Nh}()
+
+import PrettyTables
+function tabulate_benchmark(bm) # print every entry of `bm.data` as one row of a table
+    funcs = map(x -> x.caller, bm.data) # recorded `caller` string of each entry (used as the row label)
+    timings = map(x -> time_and_units_str(x.kernel_time_s), bm.data) # human-readable time per call
+    n_reads_writes = map(x -> x.n_reads_writes, bm.data)
+    nreps = map(x -> x.nreps, bm.data)
+    achieved_bandwidth_GBs = map(x -> x.achieved_bandwidth_GBs, bm.data)
+    bandwidth_efficiency = map(x -> x.bandwidth_efficiency, bm.data)
+    header = [
+        "funcs",
+        "time per call",
+        "bw %",
+        "achieved bw",
+        "n-reads/writes",
+        "n-reps",
+    ]
+    data = hcat( # one column per `header` entry, listed in the same order as `header`
+        funcs,
+        timings,
+        bandwidth_efficiency,
+        achieved_bandwidth_GBs,
+        n_reads_writes,
+        nreps,
+    )
+    title = "Problem size: $(bm.problem_size), float_type = $(bm.float_type), device_bandwidth_GBs=$(bm.device_bandwidth_GBs)"
+    PrettyTables.pretty_table(
+        data;
+        title,
+        header,
+        alignment = :l,
+        crop = :none,
+    )
+end
diff --git a/benchmarks/scripts/index_swapping.jl b/benchmarks/scripts/index_swapping.jl
index ac43c3fd7a..1e02a2874f 100644
--- a/benchmarks/scripts/index_swapping.jl
+++ b/benchmarks/scripts/index_swapping.jl
@@ -23,89 +23,55 @@ In particular,
 
 Clima A100
 ```
-at_dot_call!($X_vector, $Y_vector):
-     6 milliseconds, 19 microseconds
-custom_kernel_bc!($X_array, $Y_array, $uss, swap = 0):
-     6 milliseconds, 329 microseconds
-custom_kernel_bc!($X_array, $Y_array, $uss, swap = 1):
-     14 milliseconds, 232 microseconds
-custom_kernel_bc!($X_array, $Y_array, $uss, swap = 2):
-     15 milliseconds, 960 microseconds
+[ Info: ArrayType = CuArray
+Problem size: (63, 4, 4, 1, 5400), float_type = Float32, device_bandwidth_GBs=2039
+┌──────────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐
+│ funcs                                                                │ time per call                    │ bw %    │ achieved bw │ n-reads/writes │ n-reps │
+├──────────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤
+│ BIS.at_dot_call!(X_vector, Y_vector; nreps=1000, bm)                 │ 36 microseconds, 195 nanoseconds │ 54.952  │ 1120.47     │ 2              │ 1000   │
+│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=0, nreps=1000, bm) │ 74 microseconds, 228 nanoseconds │ 26.7955 │ 546.359     │ 2              │ 1000   │
+│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=1, nreps=1000, bm) │ 82 microseconds, 501 nanoseconds │ 24.1085 │ 491.572     │ 2              │ 1000   │
+│ BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=2, nreps=1000, bm) │ 72 microseconds, 567 nanoseconds │ 27.4088 │ 558.865     │ 2              │ 1000   │
+└──────────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘
 ```
 =#
 
 #! format: off
-import CUDA
-using BenchmarkTools, Dates
-using LazyBroadcast: @lazy
-ArrayType = CUDA.CuArray;
-# ArrayType = identity;
+module IndexSwapBench
+
+include("benchmark_utils.jl")
 
-if ArrayType === identity
-    macro pretty_belapsed(expr)
-        return quote
-            println($(string(expr)), ":")
-            print("     ")
-            print_time_and_units(BenchmarkTools.@belapsed(esc($expr)))
-        end
-    end
-else
-    macro pretty_belapsed(expr)
-        return quote
-            println($(string(expr)), ":")
-            print("     ")
-            print_time_and_units(
-                BenchmarkTools.@belapsed(CUDA.@sync((esc($expr))))
-            )
-        end
-    end
-    macro pretty_elapsed(expr)
-        return quote
-            println($(string(expr)), ":")
-            print("     ")
-            print_time_and_units(
-                BenchmarkTools.@elapsed(CUDA.@sync((esc($expr))))
-            )
-        end
-    end
-end
-print_time_and_units(x) = println(time_and_units_str(x))
-time_and_units_str(x::Real) =
-    trunc_time(string(compound_period(x, Dates.Second)))
-function compound_period(x::Real, ::Type{T}) where {T <: Dates.Period}
-    nf = Dates.value(convert(Dates.Nanosecond, T(1)))
-    ns = Dates.Nanosecond(ceil(x * nf))
-    return Dates.canonicalize(Dates.CompoundPeriod(ns))
-end
-trunc_time(s::String) = count(',', s) > 1 ? join(split(s, ",")[1:2], ",") : s
 foo(x1, x2, x3) = x1
-function at_dot_call!(X, Y)
+function at_dot_call!(X, Y; nreps = 1, print_info = true, bm=nothing) # NOTE(review): `print_info` is accepted but never used in this body
     (; x1, x2, x3) = X
     (; y1) = Y
-    for i in 1:100 # reduce variance / impact of launch latency
-        @. y1 = foo(x1, x2, x3) # 3 reads, 1 write
+    e = CUDA.@elapsed begin for i in 1:nreps # reduce variance / impact of launch latency
+            @. y1 = foo(x1, x2, x3) # foo returns x1 only; accounted below as 1 read + 1 write (n_reads_writes=2)
+        end
+    end
+    if !isnothing(bm) # record a result row only when a Benchmark collector is passed
+        kernel_time_s=e/nreps # average time of one broadcast
+        nt = (;
+            caller=@caller_name(@__FILE__),
+            kernel_time_s,
+            n_reads_writes=2,
+            nreps,
+            perf_stats(;bm,kernel_time_s,n_reads_writes=2)...
+        )
+        push!(bm.data, nt)
     end
     return nothing
 end;
 
-struct UniversalSizesStatic{Nv, Nij, Nh} end
-
-get_Nv(::UniversalSizesStatic{Nv}) where {Nv} = Nv
-get_Nij(::UniversalSizesStatic{Nv, Nij}) where {Nv, Nij} = Nij
-get_Nh(::UniversalSizesStatic{Nv, Nij, Nh}) where {Nv, Nij, Nh} = Nh
-get_N(us::UniversalSizesStatic{Nv, Nij}) where {Nv, Nij} = prod((Nv,Nij,Nij,1,get_Nh(us)))
-UniversalSizesStatic(Nv, Nij, Nh) = UniversalSizesStatic{Nv, Nij, Nh}()
-using Test
-
-function custom_kernel_bc!(X, Y, us::UniversalSizesStatic; swap=0, printtb=false)
+function custom_kernel_bc!(X, Y, us::UniversalSizesStatic; swap=0, printtb=false, nreps = 1, print_info = true, bm=nothing)
     (; x1, x2, x3) = X
     (; y1) = Y
     bc = @lazy @. y1 = foo(x1, x2, x3)
     @assert !(y1 isa Array)
     f = if swap==0
-        custom_kernel_knl_bc_no_swap!
+        custom_kernel_knl_bc_0swap!
     elseif swap == 1
-        custom_kernel_knl_bc_swap!
+        custom_kernel_knl_bc_1swap!
     elseif swap == 2
         custom_kernel_knl_bc_2swap!
     else
@@ -122,14 +88,27 @@ function custom_kernel_bc!(X, Y, us::UniversalSizesStatic; swap=0, printtb=false
     threads = min(N, config.threads)
     blocks = cld(N, threads)
     printtb && @show blocks, threads
-    for i in 1:100 # reduce variance / impact of launch latency
-        kernel(y1, bc,us; threads, blocks)
+    e = CUDA.@elapsed begin
+        for i in 1:nreps # reduce variance / impact of launch latency
+            kernel(y1, bc,us; threads, blocks)
+        end
+    end
+    if !isnothing(bm)
+        kernel_time_s=e/nreps
+        nt = (;
+            caller=@caller_name(@__FILE__),
+            kernel_time_s,
+            n_reads_writes=2,
+            nreps,
+            perf_stats(;bm,kernel_time_s,n_reads_writes=2)...
+        )
+        push!(bm.data, nt)
     end
     return nothing
 end;
 
 # Mimics how indexing works in generalized pointwise kernels
-function custom_kernel_knl_bc_swap!(y1, bc, us)
+function custom_kernel_knl_bc_1swap!(y1, bc, us)
     @inbounds begin
         tidx = (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x
         if tidx ≤ get_N(us)
@@ -145,7 +124,7 @@ function custom_kernel_knl_bc_swap!(y1, bc, us)
 end
 
 # Mimics how indexing works in specialized kernels
-function custom_kernel_knl_bc_no_swap!(y1, bc, us)
+function custom_kernel_knl_bc_0swap!(y1, bc, us)
     @inbounds begin
         tidx = (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x
         if tidx ≤ get_N(us)
@@ -179,35 +158,47 @@ function custom_kernel_knl_bc_2swap!(y1, bc, us)
 end
 
 import Random
+using Test
 function test_custom_kernel_bc!(X_array, Y_array, uss; swap)
     Random.seed!(1234)
     X_array.x1 .= typeof(X_array.x1)(rand(eltype(X_array.x1), size(X_array.x1)))
     Y_array_cp = deepcopy(Y_array)
-    custom_kernel_bc!(X_array, Y_array_cp, uss; swap=0)
-    custom_kernel_bc!(X_array, Y_array, uss; swap)
+    custom_kernel_bc!(X_array, Y_array_cp, uss; swap=0, print_info = false) # reference output from the swap=0 kernel, written to the copy
+    custom_kernel_bc!(X_array, Y_array, uss; swap, print_info = false) # output of the variant under test; compared element-wise below
     @test all(Y_array_cp.y1 .== Y_array.y1)
 end
 
-FT = Float32;
-arr(T) = T(zeros(63,4,4,1,5400))
-X_array = (;x1 = arr(ArrayType),x2 = arr(ArrayType),x3 = arr(ArrayType));
-Y_array = (;y1 = arr(ArrayType),);
+end # module
+
+import .IndexSwapBench as BIS
+
+using CUDA
+bm = BIS.Benchmark(;problem_size=(63,4,4,1,5400), float_type=Float32) # result collector; also fixes the array size and element type
+# bm = BIS.Benchmark(;problem_size=(63,4,4,1,5400), float_type=Float64)
+ArrayType = CUDA.CuArray;
+# ArrayType = identity;
+arr(bm, T) = T(zeros(bm.float_type, bm.problem_size...)) # zero array of the benchmark's size and eltype, passed through `T`
+X_array = (;x1 = arr(bm, ArrayType),x2 = arr(bm, ArrayType),x3 = arr(bm, ArrayType));
+Y_array = (;y1 = arr(bm, ArrayType),);
 to_vec(ξ) = (;zip(propertynames(ξ), map(θ -> vec(θ), values(ξ)))...);
 X_vector = to_vec(X_array);
 Y_vector = to_vec(Y_array);
 N = length(X_vector.x1)
 (Nv, Nij, _, _, Nh) = size(Y_array.y1);
-uss = UniversalSizesStatic(Nv, Nij, Nh);
-at_dot_call!(X_vector, Y_vector)
-custom_kernel_bc!(X_array, Y_array, uss; swap=0)
-custom_kernel_bc!(X_array, Y_array, uss; swap=1)
-custom_kernel_bc!(X_array, Y_array, uss; swap=2)
-test_custom_kernel_bc!(X_array, Y_array, uss; swap=1)
-test_custom_kernel_bc!(X_array, Y_array, uss; swap=2)
-
-@pretty_belapsed at_dot_call!($X_vector, $Y_vector)
-@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $uss, swap=0)
-@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $uss, swap=1)
-@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $uss, swap=2)
+uss = BIS.UniversalSizesStatic(Nv, Nij, Nh);
+BIS.at_dot_call!(X_vector, Y_vector; nreps=1) # unrecorded single runs (no `bm`); presumably warm-up so compilation stays out of the recorded timings
+BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=0, nreps=1)
+BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=1, nreps=1)
+BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=2, nreps=1)
+BIS.test_custom_kernel_bc!(X_array, Y_array, uss; swap=1) # the swapped-index kernels must reproduce the swap=0 output
+BIS.test_custom_kernel_bc!(X_array, Y_array, uss; swap=2)
+# Recorded runs: each call pushes one row to `bm.data`, labelled with a source line read back from this file — keep the four lines below free of trailing comments.
+BIS.at_dot_call!(X_vector, Y_vector; nreps=1000, bm)
+BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=0, nreps=1000, bm)
+BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=1, nreps=1000, bm)
+BIS.custom_kernel_bc!(X_array, Y_array, uss; swap=2, nreps=1000, bm)
+
+@info "ArrayType = $ArrayType"
+BIS.tabulate_benchmark(bm)
 
 #! format: on
diff --git a/benchmarks/scripts/indexing_and_static_ndranges.jl b/benchmarks/scripts/indexing_and_static_ndranges.jl
index 4ac1ad0fdd..3e42f0ec16 100644
--- a/benchmarks/scripts/indexing_and_static_ndranges.jl
+++ b/benchmarks/scripts/indexing_and_static_ndranges.jl
@@ -5,7 +5,7 @@ using Revise; include(joinpath("benchmarks", "scripts", "indexing_and_static_ndr
 # Info:
 This script compares two things:
  - linear vs cartesian indexing
- - impact of static vs dynamic NDRanges (https://juliagpu.github.io/KernelAbstractions.jl/dev/examples/memcopy_static/)
+ - impact of static vs dynamic NDRanges (https://juliagpu.github.io/KernelAbstractions.jl/dev/examples/memcopy_static/)
 
 Linear indexing, when possible, has performance advantages
 over using Cartesian indexing. Julia Base's Broadcast only
@@ -30,61 +30,84 @@ setting where linear indexing is allowed.
     nearly the same benefit as linear indexing.
 
 # References:
- - https://github.com/CliMA/ClimaCore.jl/issues/1889
- - https://github.com/JuliaLang/julia/issues/28126
- - https://github.com/JuliaLang/julia/issues/32051
+ - https://github.com/CliMA/ClimaCore.jl/issues/1889
+ - https://github.com/JuliaLang/julia/issues/28126
+ - https://github.com/JuliaLang/julia/issues/32051
 
 # Benchmark results:
 
-Local Apple M1 Mac (CPU):
+Clima A100 (CPU arrays, `ArrayType = identity`):
 ```
-at_dot_call!($X_array, $Y_array):
-     143 milliseconds, 774 microseconds
-at_dot_call!($X_vector, $Y_vector):
-     65 milliseconds, 567 microseconds
-custom_kernel_bc!($X_vector, $Y_vector, $us):
-     66 milliseconds, 870 microseconds
-custom_kernel_bc!($X_array, $Y_array, $us; use_pw = false):
-     143 milliseconds, 643 microseconds
-custom_kernel_bc!($X_array, $Y_array, $us; use_pw = true):
-     65 milliseconds, 778 microseconds
-custom_kernel_bc!($X_vector, $Y_vector, $uss):
-     65 milliseconds, 765 microseconds
-custom_kernel_bc!($X_array, $Y_array, $uss; use_pw = false):
-     144 milliseconds, 271 microseconds
-custom_kernel_bc!($X_array, $Y_array, $uss; use_pw = true):
-     66 milliseconds, 376 microseconds
+[ Info: ArrayType = identity
+Problem size: (63, 4, 4, 1, 5400), float_type = Float32, device_bandwidth_GBs=2039
+┌────────────────────────────────────────────────────────────────────────────┬───────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐
+│ funcs                                                                      │ time per call                     │ bw %    │ achieved bw │ n-reads/writes │ n-reps │
+├────────────────────────────────────────────────────────────────────────────┼───────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤
+│ BSR.at_dot_call!(X_array, Y_array; nreps=1000, bm) # slow                  │ 422 microseconds, 223 nanoseconds │ 2.35535 │ 48.0256     │ 1              │ 1000   │
+│ BSR.at_dot_call!(X_vector, Y_vector; nreps=1000, bm) # fast                │ 242 microseconds, 740 nanoseconds │ 4.09692 │ 83.5362     │ 1              │ 1000   │
+│ BSR.custom_kernel_bc!(X_vector, Y_vector, us; nreps=1000, bm)              │ 242 microseconds, 30 nanoseconds  │ 4.10894 │ 83.7812     │ 1              │ 1000   │
+│ BSR.custom_kernel_bc!(X_vector, Y_vector, uss; nreps=1000, bm)             │ 244 microseconds, 279 nanoseconds │ 4.0711  │ 83.0097     │ 1              │ 1000   │
+│ BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=false, nreps=1000, bm)  │ 499 microseconds, 283 nanoseconds │ 1.99182 │ 40.6133     │ 1              │ 1000   │
+│ BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=false, nreps=1000, bm) │ 541 microseconds, 506 nanoseconds │ 1.83651 │ 37.4465     │ 1              │ 1000   │
+│ BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=true, nreps=1000, bm)   │ 247 microseconds, 108 nanoseconds │ 4.02449 │ 82.0593     │ 1              │ 1000   │
+│ BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=true, nreps=1000, bm)  │ 242 microseconds, 209 nanoseconds │ 4.10589 │ 83.7192     │ 1              │ 1000   │
+└────────────────────────────────────────────────────────────────────────────┴───────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘
+[ Info: ArrayType = identity
+Problem size: (63, 4, 4, 1, 5400), float_type = Float64, device_bandwidth_GBs=2039
+┌────────────────────────────────────────────────────────────────────────────┬───────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐
+│ funcs                                                                      │ time per call                     │ bw %    │ achieved bw │ n-reads/writes │ n-reps │
+├────────────────────────────────────────────────────────────────────────────┼───────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤
+│ BSR.at_dot_call!(X_array, Y_array; nreps=1000, bm) # slow                  │ 1 millisecond, 446 microseconds   │ 1.37517 │ 28.0397     │ 1              │ 1000   │
+│ BSR.at_dot_call!(X_vector, Y_vector; nreps=1000, bm) # fast                │ 984 microseconds, 854 nanoseconds │ 2.01955 │ 41.1787     │ 1              │ 1000   │
+│ BSR.custom_kernel_bc!(X_vector, Y_vector, us; nreps=1000, bm)              │ 987 microseconds, 438 nanoseconds │ 2.01427 │ 41.0709     │ 1              │ 1000   │
+│ BSR.custom_kernel_bc!(X_vector, Y_vector, uss; nreps=1000, bm)             │ 985 microseconds, 779 nanoseconds │ 2.01766 │ 41.1401     │ 1              │ 1000   │
+│ BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=false, nreps=1000, bm)  │ 1 millisecond, 475 microseconds   │ 1.34834 │ 27.4927     │ 1              │ 1000   │
+│ BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=false, nreps=1000, bm) │ 1 millisecond, 473 microseconds   │ 1.34985 │ 27.5234     │ 1              │ 1000   │
+│ BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=true, nreps=1000, bm)   │ 983 microseconds, 811 nanoseconds │ 2.0217  │ 41.2224     │ 1              │ 1000   │
+│ BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=true, nreps=1000, bm)  │ 984 microseconds, 683 nanoseconds │ 2.0199  │ 41.1858     │ 1              │ 1000   │
+└────────────────────────────────────────────────────────────────────────────┴───────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘
 ```
 
 Clima A100
 ```
-at_dot_call!($X_array, $Y_array):
-     6 milliseconds, 775 microseconds
-at_dot_call!($X_vector, $Y_vector):
-     2 milliseconds, 834 microseconds
-custom_sol_kernel!($X_vector, $Y_vector, $(Val(N))):
-     2 milliseconds, 547 microseconds
-custom_kernel_bc!($X_vector, $Y_vector, $us):
-     2 milliseconds, 561 microseconds
-custom_kernel_bc!($X_array, $Y_array, $us; use_pw = false):
-     4 milliseconds, 160 microseconds
-custom_kernel_bc!($X_array, $Y_array, $us; use_pw = true):
-     2 milliseconds, 584 microseconds
-custom_kernel_bc!($X_vector, $Y_vector, $uss):
-     2 milliseconds, 540 microseconds
-custom_kernel_bc!($X_array, $Y_array, $uss; use_pw = false):
-     2 milliseconds, 715 microseconds
-custom_kernel_bc!($X_array, $Y_array, $uss; use_pw = true):
-     2 milliseconds, 547 microseconds
+[ Info: ArrayType = CuArray
+Problem size: (63, 4, 4, 1, 5400), float_type = Float32, device_bandwidth_GBs=2039
+┌─────────────────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐
+│ funcs                                                                       │ time per call                    │ bw %    │ achieved bw │ n-reads/writes │ n-reps │
+├─────────────────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤
+│ BSR.at_dot_call!(X_array, Y_array; nreps=1000, bm) # slow                   │ 84 microseconds, 791 nanoseconds │ 11.7287 │ 239.149     │ 1              │ 1000   │
+│ BSR.at_dot_call!(X_vector, Y_vector; nreps=1000, bm) # fast                 │ 14 microseconds, 497 nanoseconds │ 68.6003 │ 1398.76     │ 1              │ 1000   │
+│ iscpu || BSR.custom_sol_kernel!(X_vector, Y_vector, Val(N); nreps=1000, bm) │ 13 microseconds, 125 nanoseconds │ 75.7724 │ 1545.0      │ 1              │ 1000   │
+│ BSR.custom_kernel_bc!(X_vector, Y_vector, us; nreps=1000, bm)               │ 14 microseconds, 212 nanoseconds │ 69.9794 │ 1426.88     │ 1              │ 1000   │
+│ BSR.custom_kernel_bc!(X_vector, Y_vector, uss; nreps=1000, bm)              │ 13 microseconds, 55 nanoseconds  │ 76.1765 │ 1553.24     │ 1              │ 1000   │
+│ BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=false, nreps=1000, bm)   │ 47 microseconds, 258 nanoseconds │ 21.0439 │ 429.084     │ 1              │ 1000   │
+│ BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=false, nreps=1000, bm)  │ 30 microseconds, 637 nanoseconds │ 32.4612 │ 661.884     │ 1              │ 1000   │
+│ BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=true, nreps=1000, bm)    │ 14 microseconds, 386 nanoseconds │ 69.1326 │ 1409.61     │ 1              │ 1000   │
+│ BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=true, nreps=1000, bm)   │ 13 microseconds, 58 nanoseconds  │ 76.1646 │ 1553.0      │ 1              │ 1000   │
+└─────────────────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘
+[ Info: ArrayType = CuArray
+Problem size: (63, 4, 4, 1, 5400), float_type = Float64, device_bandwidth_GBs=2039
+┌─────────────────────────────────────────────────────────────────────────────┬──────────────────────────────────┬─────────┬─────────────┬────────────────┬────────┐
+│ funcs                                                                       │ time per call                    │ bw %    │ achieved bw │ n-reads/writes │ n-reps │
+├─────────────────────────────────────────────────────────────────────────────┼──────────────────────────────────┼─────────┼─────────────┼────────────────┼────────┤
+│ BSR.at_dot_call!(X_array, Y_array; nreps=1000, bm) # slow                   │ 85 microseconds, 69 nanoseconds  │ 23.3807 │ 476.732     │ 1              │ 1000   │
+│ BSR.at_dot_call!(X_vector, Y_vector; nreps=1000, bm) # fast                 │ 28 microseconds, 809 nanoseconds │ 69.0417 │ 1407.76     │ 1              │ 1000   │
+│ iscpu || BSR.custom_sol_kernel!(X_vector, Y_vector, Val(N); nreps=1000, bm) │ 26 microseconds, 183 nanoseconds │ 75.965  │ 1548.93     │ 1              │ 1000   │
+│ BSR.custom_kernel_bc!(X_vector, Y_vector, us; nreps=1000, bm)               │ 26 microseconds, 426 nanoseconds │ 75.2673 │ 1534.7      │ 1              │ 1000   │
+│ BSR.custom_kernel_bc!(X_vector, Y_vector, uss; nreps=1000, bm)              │ 26 microseconds, 256 nanoseconds │ 75.7546 │ 1544.64     │ 1              │ 1000   │
+│ BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=false, nreps=1000, bm)   │ 47 microseconds, 819 nanoseconds │ 41.5938 │ 848.098     │ 1              │ 1000   │
+│ BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=false, nreps=1000, bm)  │ 31 microseconds, 442 nanoseconds │ 63.2584 │ 1289.84     │ 1              │ 1000   │
+│ BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=true, nreps=1000, bm)    │ 26 microseconds, 729 nanoseconds │ 74.4138 │ 1517.3      │ 1              │ 1000   │
+│ BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=true, nreps=1000, bm)   │ 26 microseconds, 642 nanoseconds │ 74.6569 │ 1522.25     │ 1              │ 1000   │
+└─────────────────────────────────────────────────────────────────────────────┴──────────────────────────────────┴─────────┴─────────────┴────────────────┴────────┘
 ```
 =#
 
 #! format: off
-import CUDA
-using BenchmarkTools, Dates
-using LazyBroadcast: @lazy
-ArrayType = CUDA.CuArray;
-# ArrayType = identity;
+
+module IndexStaticRangeBench
+
+include("benchmark_utils.jl")
 
 # ============================================================ Non-extruded broadcast (start)
 import Base.Broadcast: BroadcastStyle
@@ -216,62 +239,32 @@ _axes(bc::PointWiseBC{<:Base.Broadcast.AbstractArrayStyle{0}}, ::Nothing) = ()
 Base.IndexStyle(::Type{<:PointWiseBC{<:Any, <:Tuple{Any}}}) = IndexLinear()
 # ============================================================ Non-extruded broadcast (end)
 
-if ArrayType === identity
-    macro pretty_belapsed(expr)
-        return quote
-            println($(string(expr)), ":")
-            print("     ")
-            print_time_and_units(BenchmarkTools.@belapsed(esc($expr)))
-        end
-    end
-    macro pretty_elapsed(expr)
-        return quote
-            println($(string(expr)), ":")
-            print("     ")
-            print_time_and_units(BenchmarkTools.@elapsed(esc($expr)))
-        end
-    end
-else
-    macro pretty_belapsed(expr)
-        return quote
-            println($(string(expr)), ":")
-            print("     ")
-            print_time_and_units(
-                BenchmarkTools.@belapsed(CUDA.@sync((esc($expr))))
-            )
-        end
-    end
-    macro pretty_elapsed(expr)
-        return quote
-            println($(string(expr)), ":")
-            print("     ")
-            print_time_and_units(
-                BenchmarkTools.@elapsed(CUDA.@sync((esc($expr))))
-            )
-        end
-    end
-end
-print_time_and_units(x) = println(time_and_units_str(x))
-time_and_units_str(x::Real) =
-    trunc_time(string(compound_period(x, Dates.Second)))
-function compound_period(x::Real, ::Type{T}) where {T <: Dates.Period}
-    nf = Dates.value(convert(Dates.Nanosecond, T(1)))
-    ns = Dates.Nanosecond(ceil(x * nf))
-    return Dates.canonicalize(Dates.CompoundPeriod(ns))
-end
-trunc_time(s::String) = count(',', s) > 1 ? join(split(s, ",")[1:2], ",") : s
 myadd(x1, x2, x3) = zero(x1)
-function at_dot_call!(X, Y)
+function at_dot_call!(X, Y; nreps = 1, bm=nothing) # time `nreps` broadcasts of `myadd` into `Y.y1`; record a row when `bm` is given
     (; x1, x2, x3) = X
     (; y1) = Y
-    for i in 1:100 # reduce variance / impact of launch latency
-        @. y1 = myadd(x1, x2, x3) # 3 reads, 1 write
-        # @. y1 = 0 # 3 reads, 1 write
+    @. y1 = myadd(x1, x2, x3) # compile
+    e = CUDA.@elapsed begin # NOTE(review): CUDA.@elapsed is used even for CPU arrays, whereas custom_kernel_bc! uses Base.@elapsed for `Array` — confirm intended
+        for i in 1:nreps # reduce variance / impact of launch latency
+            @. y1 = myadd(x1, x2, x3) # myadd returns zero(x1), so only the write is counted (n_reads_writes=1)
+        end
+    end
+    if !isnothing(bm)
+        kernel_time_s=e/nreps # average time of one broadcast
+        n_reads_writes=1
+        nt = (;
+            caller=@caller_name(@__FILE__),
+            kernel_time_s,
+            n_reads_writes,
+            nreps,
+            perf_stats(;bm,kernel_time_s,n_reads_writes)...
+        )
+        push!(bm.data, nt)
     end
     return nothing
 end;
 
-function custom_sol_kernel!(X, Y, ::Val{N}) where {N}
+function custom_sol_kernel!(X, Y, ::Val{N}; nreps = 1, bm=nothing) where {N}
     (; x1, x2, x3) = X
     (; y1) = Y
     kernel = CUDA.@cuda always_inline = true launch = false custom_kernel_knl!(
@@ -284,9 +277,25 @@ function custom_sol_kernel!(X, Y, ::Val{N}) where {N}
     config = CUDA.launch_configuration(kernel.fun)
     threads = min(N, config.threads)
     blocks = cld(N, threads)
-    for i in 1:100 # reduce variance / impact of launch latency
-        kernel(y1, x1, x2, x3, Val(N); threads, blocks)
+    kernel(y1, x1, x2, x3, Val(N); threads, blocks) # compile
+    e = CUDA.@elapsed begin
+        for i in 1:nreps # reduce variance / impact of launch latency
+            kernel(y1, x1, x2, x3, Val(N); threads, blocks)
+        end
+    end
+    if !isnothing(bm)
+        kernel_time_s=e/nreps
+        n_reads_writes=1
+        nt = (;
+            caller=@caller_name(@__FILE__),
+            kernel_time_s,
+            n_reads_writes,
+            nreps,
+            perf_stats(;bm,kernel_time_s,n_reads_writes)...
+        )
+        push!(bm.data, nt)
     end
+
     return nothing
 end;
 function custom_kernel_knl!(y1, x1, x2, x3, ::Val{N}) where {N}
@@ -299,42 +308,26 @@ function custom_kernel_knl!(y1, x1, x2, x3, ::Val{N}) where {N}
     return nothing
 end;
 
-abstract type AbstractUniversalSizes{Nv, Nij} end
-struct UniversalSizesCC{Nv, Nij} <: AbstractUniversalSizes{Nv, Nij}
-    Nh::Int
-end
-struct UniversalSizesStatic{Nv, Nij, Nh} <: AbstractUniversalSizes{Nv, Nij} end
-
-get_Nv(::AbstractUniversalSizes{Nv}) where {Nv} = Nv
-get_Nij(::AbstractUniversalSizes{Nv, Nij}) where {Nv, Nij} = Nij
-get_Nh(us::UniversalSizesCC) = us.Nh
-get_Nh(::UniversalSizesStatic{Nv, Nij, Nh}) where {Nv, Nij, Nh} = Nh
-get_N(us::AbstractUniversalSizes{Nv, Nij}) where {Nv, Nij} = prod((Nv,Nij,Nij,1,get_Nh(us)))
-UniversalSizesCC(Nv, Nij, Nh) = UniversalSizesCC{Nv, Nij}(Nh)
-UniversalSizesStatic(Nv, Nij, Nh) = UniversalSizesStatic{Nv, Nij, Nh}()
-using Test
-us_tup = (1, 2, 3)
-@test get_Nv(UniversalSizesCC(us_tup...))  == get_Nv(UniversalSizesStatic(us_tup...))
-@test get_Nij(UniversalSizesCC(us_tup...)) == get_Nij(UniversalSizesStatic(us_tup...))
-@test get_Nh(UniversalSizesCC(us_tup...))  == get_Nh(UniversalSizesStatic(us_tup...))
-@test get_N(UniversalSizesCC(us_tup...))   == get_N(UniversalSizesStatic(us_tup...))
-
-function custom_kernel_bc!(X, Y, us::AbstractUniversalSizes; printtb=false, use_pw=true)
+function custom_kernel_bc!(X, Y, us::AbstractUniversalSizes; printtb=false, use_pw=true, nreps = 1, bm=nothing)
     (; x1, x2, x3) = X
     (; y1) = Y
     bc_base = @lazy @. y1 = myadd(x1, x2, x3)
     bc = use_pw ? to_pointwise_bc(bc_base) : bc_base
     if y1 isa Array
         if bc isa Base.Broadcast.Broadcasted
-            for i in 1:100 # reduce variance / impact of launch latency
-                @inbounds @simd for j in eachindex(bc)
-                    y1[j] = bc[j]
+            e = Base.@elapsed begin
+                for i in 1:nreps # reduce variance / impact of launch latency
+                    @inbounds @simd for j in eachindex(bc)
+                        y1[j] = bc[j]
+                    end
                 end
             end
         else
-            for i in 1:100 # reduce variance / impact of launch latency
-                @inbounds @simd for j in 1:get_N(us)
-                    y1[j] = bc[j]
+            e = Base.@elapsed begin
+                for i in 1:nreps # reduce variance / impact of launch latency
+                    @inbounds @simd for j in 1:get_N(us)
+                        y1[j] = bc[j]
+                    end
                 end
             end
         end
@@ -350,10 +343,25 @@ function custom_kernel_bc!(X, Y, us::AbstractUniversalSizes; printtb=false, use_
         threads = min(N, config.threads)
         blocks = cld(N, threads)
         printtb && @show blocks, threads
-        for i in 1:100 # reduce variance / impact of launch latency
-            kernel(y1, bc,us; threads, blocks)
+        kernel(y1, bc,us; threads, blocks) # compile
+        e = CUDA.@elapsed begin
+            for i in 1:nreps # reduce variance / impact of launch latency
+                kernel(y1, bc,us; threads, blocks)
+            end
         end
     end
+    if !isnothing(bm)
+        kernel_time_s=e/nreps
+        n_reads_writes=1
+        nt = (;
+            caller=@caller_name(@__FILE__),
+            kernel_time_s,
+            n_reads_writes,
+            nreps,
+            perf_stats(;bm,kernel_time_s,n_reads_writes)...
+        )
+        push!(bm.data, nt)
+    end
     return nothing
 end;
 @inline get_cart_lin_index(bc, n, I) = I
@@ -371,40 +379,61 @@ function custom_kernel_knl_bc!(y1, bc, us)
     return nothing
 end;
 
-FT = Float32;
-arr(T) = T(zeros(63,4,4,1,5400))
-X_array = (;x1 = arr(ArrayType),x2 = arr(ArrayType),x3 = arr(ArrayType));
-Y_array = (;y1 = arr(ArrayType),);
+end # module
+import .IndexStaticRangeBench as BSR
+
+using CUDA
+using Test
+bm = BSR.Benchmark(;problem_size=(63,4,4,1,5400), float_type=Float32)
+# bm = BSR.Benchmark(;problem_size=(63,4,4,1,5400), float_type=Float64) # uncomment to benchmark with Float64 instead
+ArrayType = CUDA.CuArray;
+# ArrayType = Base.identity; # uncomment to run on the CPU (plain Array) instead of CUDA
+
+us_tup = (1, 2, 3)
+@test BSR.get_Nv(BSR.UniversalSizesCC(us_tup...))  == BSR.get_Nv(BSR.UniversalSizesStatic(us_tup...))
+@test BSR.get_Nij(BSR.UniversalSizesCC(us_tup...)) == BSR.get_Nij(BSR.UniversalSizesStatic(us_tup...))
+@test BSR.get_Nh(BSR.UniversalSizesCC(us_tup...))  == BSR.get_Nh(BSR.UniversalSizesStatic(us_tup...))
+@test BSR.get_N(BSR.UniversalSizesCC(us_tup...))   == BSR.get_N(BSR.UniversalSizesStatic(us_tup...))
+
+arr(bm, T) = T(zeros(bm.float_type, bm.problem_size...))
+X_array = (;x1 = arr(bm, ArrayType),x2 = arr(bm, ArrayType),x3 = arr(bm, ArrayType));
+Y_array = (;y1 = arr(bm, ArrayType),);
 to_vec(ξ) = (;zip(propertynames(ξ), map(θ -> vec(θ), values(ξ)))...);
 X_vector = to_vec(X_array);
 Y_vector = to_vec(Y_array);
-at_dot_call!(X_array, Y_array)
-at_dot_call!(X_vector, Y_vector)
+BSR.at_dot_call!(X_array, Y_array)
+BSR.at_dot_call!(X_vector, Y_vector)
 N = length(X_vector.x1)
 (Nv, Nij, _, Nf, Nh) = size(Y_array.y1);
-us = UniversalSizesCC(Nv, Nij, Nh);
-uss = UniversalSizesStatic(Nv, Nij, Nh);
-@test get_N(us) == N
-@test get_N(uss) == N
+us = BSR.UniversalSizesCC(Nv, Nij, Nh);
+uss = BSR.UniversalSizesStatic(Nv, Nij, Nh);
+@test BSR.get_N(us) == N
+@test BSR.get_N(uss) == N
 iscpu = ArrayType === identity
-iscpu || custom_sol_kernel!(X_vector, Y_vector, Val(N))
-custom_kernel_bc!(X_vector, Y_vector, us)
-custom_kernel_bc!(X_array, Y_array, us; use_pw=false)
-custom_kernel_bc!(X_array, Y_array, us; use_pw=true)
-
-custom_kernel_bc!(X_vector, Y_vector, uss)
-custom_kernel_bc!(X_array, Y_array, uss; use_pw=false)
-custom_kernel_bc!(X_array, Y_array, uss; use_pw=true)
-
-@pretty_belapsed at_dot_call!($X_array, $Y_array) # slow
-@pretty_belapsed at_dot_call!($X_vector, $Y_vector) # fast
-iscpu || @pretty_belapsed custom_sol_kernel!($X_vector, $Y_vector, $(Val(N)))
-@pretty_belapsed custom_kernel_bc!($X_vector, $Y_vector, $us)
-@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $us; use_pw=false)
-@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $us; use_pw=true)
-
-@pretty_belapsed custom_kernel_bc!($X_vector, $Y_vector, $uss)
-@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $uss; use_pw=false)
-@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $uss; use_pw=true)
+BSR.custom_kernel_bc!(X_vector, Y_vector, us)
+BSR.custom_kernel_bc!(X_vector, Y_vector, uss)
+iscpu || BSR.custom_sol_kernel!(X_vector, Y_vector, Val(N))
+
+BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=false)
+BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=false)
+
+BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=true)
+BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=true)
+
+BSR.at_dot_call!(X_array, Y_array; nreps=1000, bm) # slow
+BSR.at_dot_call!(X_vector, Y_vector; nreps=1000, bm) # fast
+iscpu || BSR.custom_sol_kernel!(X_vector, Y_vector, Val(N); nreps=1000, bm)
+
+BSR.custom_kernel_bc!(X_vector, Y_vector, us; nreps=1000, bm)
+BSR.custom_kernel_bc!(X_vector, Y_vector, uss; nreps=1000, bm)
+
+BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=false, nreps=1000, bm)
+BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=false, nreps=1000, bm)
+
+BSR.custom_kernel_bc!(X_array, Y_array, us; use_pw=true, nreps=1000, bm)
+BSR.custom_kernel_bc!(X_array, Y_array, uss; use_pw=true, nreps=1000, bm)
+
+@info "ArrayType = $ArrayType"
+BSR.tabulate_benchmark(bm)
 
 #! format: on