-
Notifications
You must be signed in to change notification settings - Fork 41
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Showing
8 changed files
with
139 additions
and
194 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,33 +1,35 @@ | ||
using CUDA: i32 | ||
# using GPUArrays | ||
|
||
group = addgroup!(SUITE, "kernel") | ||
|
||
group["launch"] = @benchmarkable @cuda identity(nothing) | ||
group["launch"] = @benchmarkable @metal identity(nothing) | ||
|
||
group["occupancy"] = @benchmarkable begin | ||
kernel = @cuda launch=false identity(nothing) | ||
launch_configuration(kernel.fun) | ||
end | ||
# group["occupancy"] = @benchmarkable begin | ||
# kernel = @metal launch=false identity(nothing) | ||
# GPUArrays.launch_heuristic(Metal.mtlArrayBackend(), kernel.f; elements=1, elements_per_thread=1) | ||
# return | ||
# end | ||
|
||
src = CUDA.rand(Float32, 512, 1000) | ||
src = Metal.rand(Float32, 512, 1000) | ||
dest = similar(src) | ||
# Copy kernel: each thread copies the element at its own index from `src` to `dest`. | ||
# Two index computations appear because this is a diff rendering: the CUDA-side | ||
# block/thread arithmetic and the Metal-side `thread_position_in_grid_1d()` call. | ||
# The access is wrapped in `@inbounds`; compare `checked_indexing_kernel`, which omits it. | ||
function indexing_kernel(dest, src) | ||
i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x | ||
i = thread_position_in_grid_1d() | ||
@inbounds dest[i] = src[i] | ||
return | ||
end | ||
group["indexing"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $indexing_kernel($dest, $src) | ||
group["indexing"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $indexing_kernel($dest, $src) | ||
|
||
# Same element copy as `indexing_kernel`, but without `@inbounds` on the access, so the | ||
# "indexing_checked" benchmark can be compared against the `@inbounds` version. | ||
# The CUDA-side and Metal-side index computations are both shown by the diff rendering. | ||
function checked_indexing_kernel(dest, src) | ||
i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x | ||
i = thread_position_in_grid_1d() | ||
dest[i] = src[i] | ||
return | ||
end | ||
group["indexing_checked"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $checked_indexing_kernel($dest, $src) | ||
group["indexing_checked"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $checked_indexing_kernel($dest, $src) | ||
|
||
# CUDA-side kernel: each thread stores one `rand(T)` value into `dest` at its own index, | ||
# where `T` is the element type of `dest`. The Metal counterpart appears only as | ||
# commented-out code under the "## DELETE" marker that follows. | ||
function rand_kernel(dest::AbstractArray{T}) where {T} | ||
i = (blockIdx().x-1i32) * blockDim().x + threadIdx().x | ||
dest[i] = rand(T) | ||
return | ||
end | ||
group["rand"] = @async_benchmarkable @cuda threads=size(src,1) blocks=size(src,2) $rand_kernel($dest) | ||
## DELETE | ||
# function rand_kernel(dest::AbstractArray{T}) where {T} | ||
# i = thread_position_in_grid_1d() | ||
# dest[i] = Metal.rand(T) | ||
# return | ||
# end | ||
# group["rand"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $rand_kernel($dest) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,6 @@ | ||
group = addgroup!(SUITE, "cuda") | ||
group = addgroup!(SUITE, "metal") | ||
|
||
let group = addgroup!(group, "synchronization") | ||
let group = addgroup!(group, "stream") | ||
group["blocking"] = @benchmarkable synchronize(blocking=true) | ||
group["auto"] = @benchmarkable synchronize() | ||
group["nonblocking"] = @benchmarkable synchronize(spin=false) | ||
end | ||
let group = addgroup!(group, "context") | ||
group["blocking"] = @benchmarkable device_synchronize(blocking=true) | ||
group["auto"] = @benchmarkable device_synchronize() | ||
group["nonblocking"] = @benchmarkable device_synchronize(spin=false) | ||
end | ||
group["stream"] = @benchmarkable synchronize() | ||
group["context"] = @benchmarkable device_synchronize() | ||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,42 +1,42 @@ | ||
module cudadevrt | ||
module metaldevrt | ||
|
||
using CUDA, BenchmarkTools, Random | ||
using Metal, BenchmarkTools, Random | ||
|
||
const threads = 256 | ||
# Simple matrix-plus-vector kernel: for column j, y[:, j] = x1[:, j] .+ x2[j]. | ||
# `m` is the column length; x1 and y are indexed linearly as `offset + i`, where | ||
# `offset` is the start of the column owned by the current block / threadgroup. | ||
# CUDA-side and Metal-side lines are both shown because this is a diff rendering. | ||
function kernel_add_mat_vec(m, x1, x2, y) | ||
# one block per column | ||
offset = (blockIdx().x-1) * m | ||
@inbounds xtmp = x2[blockIdx().x] | ||
for i = threadIdx().x : blockDim().x : m | ||
offset = (threadgroup_position_in_grid_2d().x-1) * m | ||
@inbounds xtmp = x2[threadgroup_position_in_grid_2d().x] | ||
# Stride by the threadgroup size (the counterpart of CUDA's blockDim), not by the | ||
# number of threadgroups in the grid; otherwise the threads of a group skip rows | ||
# whenever the group count differs from the threads-per-group count. | ||
for i = thread_position_in_threadgroup_2d().x : threads_per_threadgroup_2d().x : m | ||
@inbounds y[offset + i] = x1[offset + i] + xtmp | ||
end | ||
return | ||
end | ||
|
||
# Launch `kernel_add_mat_vec` over `x1`, writing the result into `y`. | ||
# With (m, n) = size(x1), the launch uses n blocks / threadgroups in the first grid | ||
# dimension (1 in the second) and `threads` threads per group, i.e. one group per column. | ||
# The `@cuda` and `@metal` launches are the two sides of the diff rendering. | ||
function add!(y, x1, x2) | ||
m, n = size(x1) | ||
@cuda blocks = n, 1 threads = threads kernel_add_mat_vec(m, x1, x2, y) | ||
@metal groups = n, 1 threads = threads kernel_add_mat_vec(m, x1, x2, y) | ||
end | ||
|
||
# Benchmark driver: builds the inputs, benchmarks `add!`, frees the device arrays and | ||
# returns the BenchmarkTools results. CUDA-side (`cu`, `CUDA.*`) and Metal-side | ||
# (`mtl`, `Metal.*`) lines are both shown because this is a diff rendering. | ||
function main() | ||
# Fixed seed so the host-side random inputs are reproducible. | ||
Random.seed!(1) | ||
m, n = 3072, 1536 # 256 multiplier | ||
# x1 is an m-by-n Float32 matrix, x2 a 1-by-n row; both are shifted by 0.5 and | ||
# uploaded to the device. y1 receives the result and has the same shape as x1. | ||
x1 = cu(randn(Float32, (m, n)) .+ Float32(0.5)) | ||
x2 = cu(randn(Float32, (1, n)) .+ Float32(0.5)) | ||
x1 = mtl(randn(Float32, (m, n)) .+ Float32(0.5)) | ||
x2 = mtl(randn(Float32, (1, n)) .+ Float32(0.5)) | ||
y1 = similar(x1) | ||
|
||
# Each sample runs `add!` under the backend's `@sync` macro. | ||
results = @benchmark CUDA.@sync blocking=true add!($y1, $x1, $x2) | ||
results = @benchmark Metal.@sync add!($y1, $x1, $x2) | ||
|
||
# BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them | ||
CUDA.unsafe_free!(x1) | ||
CUDA.unsafe_free!(x2) | ||
CUDA.unsafe_free!(y1) | ||
Metal.unsafe_free!(x1) | ||
Metal.unsafe_free!(x2) | ||
Metal.unsafe_free!(y1) | ||
|
||
return results | ||
end | ||
|
||
end | ||
|
||
cudadevrt.main() | ||
metaldevrt.main() | ||
|
Oops, something went wrong.