Skip to content

Commit

Permalink
Adapt for Metal
Browse files Browse the repository at this point in the history
[only benchmarks]
  • Loading branch information
christiangnrd committed Sep 19, 2024
1 parent b9ab40c commit 721ef62
Show file tree
Hide file tree
Showing 8 changed files with 139 additions and 194 deletions.
48 changes: 24 additions & 24 deletions perf/array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,16 @@ const n = 1000

# generate some arrays
cpu_mat = rand(rng, Float32, m, n)
gpu_mat = CuArray{Float32}(undef, size(cpu_mat))
gpu_mat = MtlArray{Float32}(undef, size(cpu_mat))
gpu_vec = reshape(gpu_mat, length(gpu_mat))
gpu_arr_3d = reshape(gpu_mat, (m, 40, 25))
gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10))
gpu_mat_ints = CuArray(rand(rng, Int, m, n))
gpu_mat_ints = MtlArray(rand(rng, Int, m, n))
gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints))
gpu_mat_bools = CuArray(rand(rng, Bool, m, n))
gpu_mat_bools = MtlArray(rand(rng, Bool, m, n))
gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools))

group["construct"] = @benchmarkable CuArray{Int}(undef, 1)
group["construct"] = @benchmarkable MtlArray{Int}(undef, 1)

group["copy"] = @async_benchmarkable copy($gpu_mat)

Expand All @@ -26,7 +26,7 @@ let group = addgroup!(group, "copyto!")
end

let group = addgroup!(group, "iteration")
group["scalar"] = @benchmarkable CUDA.@allowscalar [$gpu_vec[i] for i in 1:10]
group["scalar"] = @benchmarkable Metal.@allowscalar [$gpu_vec[i] for i in 1:10]

group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools]

Expand All @@ -46,12 +46,12 @@ let group = addgroup!(group, "iteration")
end
end

let group = addgroup!(group, "reverse")
group["1d"] = @async_benchmarkable reverse($gpu_vec)
group["2d"] = @async_benchmarkable reverse($gpu_mat; dims=1)
group["1d_inplace"] = @async_benchmarkable reverse!($gpu_vec)
group["2d_inplace"] = @async_benchmarkable reverse!($gpu_mat; dims=1)
end
# let group = addgroup!(group, "reverse")
# group["1d"] = @async_benchmarkable reverse($gpu_vec)
# group["2d"] = @async_benchmarkable reverse($gpu_mat; dims=1)
# group["1d_inplace"] = @async_benchmarkable reverse!($gpu_vec)
# group["2d_inplace"] = @async_benchmarkable reverse!($gpu_mat; dims=1)
# end

group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0

Expand All @@ -77,31 +77,31 @@ end

let group = addgroup!(group, "random")
let group = addgroup!(group, "rand")
group["Float32"] = @async_benchmarkable CUDA.rand(Float32, m*n)
group["Int64"] = @async_benchmarkable CUDA.rand(Int64, m*n)
group["Float32"] = @async_benchmarkable Metal.rand(Float32, m*n)
group["Int64"] = @async_benchmarkable Metal.rand(Int64, m*n)
end

let group = addgroup!(group, "rand!")
group["Float32"] = @async_benchmarkable CUDA.rand!($gpu_vec)
group["Int64"] = @async_benchmarkable CUDA.rand!($gpu_vec_ints)
group["Float32"] = @async_benchmarkable Metal.rand!($gpu_vec)
group["Int64"] = @async_benchmarkable Metal.rand!($gpu_vec_ints)
end

let group = addgroup!(group, "randn")
group["Float32"] = @async_benchmarkable CUDA.randn(Float32, m*n)
#group["Int64"] = @async_benchmarkable CUDA.randn(Int64, m*n)
group["Float32"] = @async_benchmarkable Metal.randn(Float32, m*n)
# group["Int64"] = @async_benchmarkable Metal.randn(Int64, m*n)
end

let group = addgroup!(group, "randn!")
group["Float32"] = @async_benchmarkable CUDA.randn!($gpu_vec)
#group["Int64"] = @async_benchmarkable CUDA.randn!($gpu_vec_ints)
group["Float32"] = @async_benchmarkable Metal.randn!($gpu_vec)
# group["Int64"] = @async_benchmarkable Metal.randn!($gpu_vec_ints)
end
end

let group = addgroup!(group, "sorting")
group["1d"] = @async_benchmarkable sort($gpu_vec)
group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1)
group["by"] = @async_benchmarkable sort($gpu_vec; by=sin)
end
# let group = addgroup!(group, "sorting")
# group["1d"] = @async_benchmarkable sort($gpu_vec)
# group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1)
# group["by"] = @async_benchmarkable sort($gpu_vec; by=sin)
# end

let group = addgroup!(group, "permutedims")
group["2d"] = @async_benchmarkable permutedims($gpu_mat, (2,1))
Expand Down
33 changes: 16 additions & 17 deletions perf/byval.jl
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
module ByVal

using CUDA, BenchmarkTools, Random
using CUDA: i32
using Metal, BenchmarkTools, Random

const threads = 256

# simple add matrixes kernel
function kernel_add_mat(n, x1, x2, y)
i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
i = thread_position_in_grid_1d()
if i <= n
@inbounds y[i] = x1[i] + x2[i]
end
Expand All @@ -20,8 +19,8 @@ end

# add arrays of matrixes kernel
function kernel_add_mat_z_slices(n, vararg...)
x1, x2, y = get_inputs3(blockIdx().y, vararg...)
i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
x1, x2, y = get_inputs3(threadgroup_position_in_grid_2d().y, vararg...)
i = thread_position_in_grid_1d()
if i <= n
@inbounds y[i] = x1[i] + x2[i]
end
Expand All @@ -30,15 +29,15 @@ end

# Launch `kernel_add_mat_z_slices` once for all slices: the slices in `x1`, `x2`
# and `y` are splatted into the kernel's trailing arguments.
# This listing interleaves the pre-commit CUDA lines (`blocks`, `@cuda`) with the
# Metal lines that replace them (`groups`, `@metal`).
# NOTE(review): the element count is taken from x1[1] only — assumes every slice
# has the same size; confirm against the caller.
function add_z_slices!(y, x1, x2)
m1, n1 = size(x1[1]) #get size of first slice
# ceil(m1 * n1 / threads): enough blocks of `threads` threads to cover one slice
blocks = (m1 * n1 + threads - 1) ÷ threads
# get length(x1) more blocks than needed to process 1 slice
@cuda blocks = blocks, length(x1) threads = threads kernel_add_mat_z_slices(m1 * n1, x1..., x2..., y...)
# ceil(m1 * n1 / threads): enough groups of `threads` threads to cover one slice
groups = (m1 * n1 + threads - 1) ÷ threads
# launch length(x1) times as many groups as one slice needs (second grid dimension)
@metal groups = groups, length(x1) threads = threads kernel_add_mat_z_slices(m1 * n1, x1..., x2..., y...)
end

# Launch `kernel_add_mat` over all `m1 * n1` elements of `x1`, passing `x1`, `x2`
# and `y` to the kernel. The `blocks`/`@cuda` lines are the pre-commit CUDA side
# of the diff; the `groups`/`@metal` lines are their Metal replacements.
function add!(y, x1, x2)
m1, n1 = size(x1)
# ceil(m1 * n1 / threads): number of blocks of `threads` threads needed
blocks = (m1 * n1 + threads - 1) ÷ threads
@cuda blocks = blocks, 1 threads = threads kernel_add_mat(m1 * n1, x1, x2, y)
# ceil(m1 * n1 / threads): number of threadgroups of `threads` threads needed
groups = (m1 * n1 + threads - 1) ÷ threads
@metal groups = (groups, 1) threads = threads kernel_add_mat(m1 * n1, x1, x2, y)
end

function main()
Expand All @@ -54,22 +53,22 @@ function main()
m, n = 3072, 1536 # 256 multiplier
#m, n = 6007, 3001 # prime numbers to test memory access correctness

x1 = [cu(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
x2 = [cu(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
x1 = [mtl(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
x2 = [mtl(randn(Float32, (m, n)) .+ Float32(0.5)) for i = 1:num_z_slices]
y1 = [similar(x1[1]) for i = 1:num_z_slices]

# reference down to bones add on GPU
results["reference"] = @benchmark CUDA.@sync blocking=true add!($y1[1], $x1[1], $x2[1])
results["reference"] = @benchmark Metal.@sync add!($y1[1], $x1[1], $x2[1])

# adding arrays in an array
for slices = 1:num_z_slices
results["slices=$slices"] = @benchmark CUDA.@sync blocking=true add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices])
results["slices=$slices"] = @benchmark Metal.@sync add_z_slices!($y1[1:$slices], $x1[1:$slices], $x2[1:$slices])
end

# BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
CUDA.unsafe_free!.(x1)
CUDA.unsafe_free!.(x2)
CUDA.unsafe_free!.(y1)
Metal.unsafe_free!.(x1)
Metal.unsafe_free!.(x2)
Metal.unsafe_free!.(y1)

return results
end
Expand Down
36 changes: 19 additions & 17 deletions perf/kernel.jl
Original file line number Diff line number Diff line change
@@ -1,33 +1,35 @@
using CUDA: i32
# using GPUArrays

group = addgroup!(SUITE, "kernel")

group["launch"] = @benchmarkable @cuda identity(nothing)
group["launch"] = @benchmarkable @metal identity(nothing)

group["occupancy"] = @benchmarkable begin
kernel = @cuda launch=false identity(nothing)
launch_configuration(kernel.fun)
end
# group["occupancy"] = @benchmarkable begin
# kernel = @metal launch=false identity(nothing)
# GPUArrays.launch_heuristic(Metal.mtlArrayBackend(), kernel.f; elements=1, elements_per_thread=1)
# return
# end

src = CUDA.rand(Float32, 512, 1000)
src = Metal.rand(Float32, 512, 1000)
dest = similar(src)
# Kernel that copies one element per thread: `dest[i] = src[i]`, with bounds
# checking disabled through `@inbounds`.
function indexing_kernel(dest, src)
# pre-commit CUDA index computation (block offset plus thread index)
i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
# Metal replacement for the line above
i = thread_position_in_grid_1d()
@inbounds dest[i] = src[i]
return
end
group["indexing"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $indexing_kernel($dest, $src)
group["indexing"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $indexing_kernel($dest, $src)

# Same one-element-per-thread copy as `indexing_kernel`, but the array accesses
# are not wrapped in `@inbounds`.
function checked_indexing_kernel(dest, src)
# pre-commit CUDA index computation (block offset plus thread index)
i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
# Metal replacement for the line above
i = thread_position_in_grid_1d()
dest[i] = src[i]
return
end
group["indexing_checked"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $checked_indexing_kernel($dest, $src)
group["indexing_checked"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $checked_indexing_kernel($dest, $src)

# Pre-commit CUDA kernel: each thread stores one `rand(T)` value into `dest[i]`,
# where `T` is the element type of `dest`. In this listing it is followed by a
# commented-out Metal variant under a "## DELETE" marker.
function rand_kernel(dest::AbstractArray{T}) where {T}
i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x
dest[i] = rand(T)
return
end
group["rand"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $rand_kernel($dest)
## DELETE
# function rand_kernel(dest::AbstractArray{T}) where {T}
# i = thread_position_in_grid_1d()
# dest[i] = Metal.rand(T)
# return
# end
# group["rand"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $rand_kernel($dest)
12 changes: 6 additions & 6 deletions perf/latency.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module Latency

using CUDA
using Metal
using BenchmarkTools

function main()
Expand All @@ -11,24 +11,24 @@ function main()
base_cmd = `$base_cmd --project=$(unsafe_string(Base.JLOptions().project))`
end
# NOTE: we don't use Base.active_project() here because of how CI launches this script,
# starting with --project in the main CUDA.jl project.
# starting with --project in the main Metal.jl project.

# time to precompile the package and its dependencies
precompile_cmd =
`$base_cmd -e "pkg = Base.identify_package(\"CUDA\")
`$base_cmd -e "pkg = Base.identify_package(\"Metal\")
Base.compilecache(pkg)"`
results["precompile"] = @benchmark run($precompile_cmd) evals=1 seconds=60

# time to actually import the package
import_cmd =
`$base_cmd -e "using CUDA"`
`$base_cmd -e "using Metal"`
results["import"] = @benchmark run($import_cmd) evals=1 seconds=30

# time to actually compile a kernel
ttfp_cmd =
`$base_cmd -e "using CUDA
`$base_cmd -e "using Metal
kernel() = return
CUDA.code_ptx(devnull, kernel, Tuple{}; kernel=true)"`
Metal.code_agx(devnull, kernel, Tuple{}; kernel=true)"`
results["ttfp"] = @benchmark run($ttfp_cmd) evals=1 seconds=60

results
Expand Down
14 changes: 3 additions & 11 deletions perf/metal.jl
Original file line number Diff line number Diff line change
@@ -1,14 +1,6 @@
group = addgroup!(SUITE, "cuda")
group = addgroup!(SUITE, "metal")

let group = addgroup!(group, "synchronization")
let group = addgroup!(group, "stream")
group["blocking"] = @benchmarkable synchronize(blocking=true)
group["auto"] = @benchmarkable synchronize()
group["nonblocking"] = @benchmarkable synchronize(spin=false)
end
let group = addgroup!(group, "context")
group["blocking"] = @benchmarkable device_synchronize(blocking=true)
group["auto"] = @benchmarkable device_synchronize()
group["nonblocking"] = @benchmarkable device_synchronize(spin=false)
end
group["stream"] = @benchmarkable synchronize()
group["context"] = @benchmarkable device_synchronize()
end
26 changes: 13 additions & 13 deletions perf/metaldevrt.jl
Original file line number Diff line number Diff line change
@@ -1,42 +1,42 @@
module cudadevrt
module metaldevrt

using CUDA, BenchmarkTools, Random
using Metal, BenchmarkTools, Random

const threads = 256
#simple add matrix and vector kernel
# Computes y[offset + i] = x1[offset + i] + x2[column] for the `m` entries of one
# column, where the column is selected by the threadgroup's x position in the grid.
# The listing interleaves the pre-commit CUDA lines with their Metal replacements.
function kernel_add_mat_vec(m, x1, x2, y)
# one threadgroup (CUDA: block) per column
offset = (blockIdx().x-1) * m
@inbounds xtmp = x2[blockIdx().x]
for i = threadIdx().x : blockDim().x : m
offset = (threadgroup_position_in_grid_2d().x-1) * m
@inbounds xtmp = x2[threadgroup_position_in_grid_2d().x]
# The stride is the number of threads in the threadgroup (the counterpart of
# CUDA's `blockDim().x`), not the number of threadgroups in the grid. With the
# launch used by `add!` (n groups of `threads` threads) a stride of
# `threadgroups_per_grid_2d().x` would leave most rows of each column unvisited.
for i = thread_position_in_threadgroup_2d().x : threads_per_threadgroup_2d().x : m
@inbounds y[offset + i] = x1[offset + i] + xtmp
end
return
end

# Launch `kernel_add_mat_vec` with `n` threadgroups (one per column of `x1`) of
# `threads` threads each.
function add!(y, x1, x2)
m, n = size(x1)
@cuda blocks = n, 1 threads = threads kernel_add_mat_vec(m, x1, x2, y)
@metal groups = n, 1 threads = threads kernel_add_mat_vec(m, x1, x2, y)
end

# Benchmark `add!` on an m-by-n matrix plus a 1-by-n vector and return the
# BenchmarkTools result.
function main()
Random.seed!(1)
m, n = 3072, 1536 # 256 multiplier
x1 = cu(randn(Float32, (m, n)) .+ Float32(0.5))
x2 = cu(randn(Float32, (1, n)) .+ Float32(0.5))
x1 = mtl(randn(Float32, (m, n)) .+ Float32(0.5))
x2 = mtl(randn(Float32, (1, n)) .+ Float32(0.5))
y1 = similar(x1)

results = @benchmark CUDA.@sync blocking=true add!($y1, $x1, $x2)
results = @benchmark Metal.@sync add!($y1, $x1, $x2)

# BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
CUDA.unsafe_free!(x1)
CUDA.unsafe_free!(x2)
CUDA.unsafe_free!(y1)
Metal.unsafe_free!(x1)
Metal.unsafe_free!(x2)
Metal.unsafe_free!(y1)

return results
end

end

cudadevrt.main()
metaldevrt.main()

Loading

0 comments on commit 721ef62

Please sign in to comment.