From 3bbda32f9ca219a85f72272ea31eeb1a564e0b44 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Mon, 30 Sep 2024 09:51:57 -0300 Subject: [PATCH 1/4] Make `lu` results have same storage mode as input (#435) --- lib/mps/linalg.jl | 6 +++--- test/mps/linalg.jl | 8 ++++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/lib/mps/linalg.jl b/lib/mps/linalg.jl index 1c29e75c0..8558f756a 100644 --- a/lib/mps/linalg.jl +++ b/lib/mps/linalg.jl @@ -128,7 +128,7 @@ LinearAlgebra.ipiv2perm(v::MtlVector{T}, maxi::Integer) where T = encode!(cbuf, kernel, descriptor) end - P = MtlMatrix{UInt32}(undef, 1, min(N, M)) + P = similar(A, UInt32, 1, min(N, M)) status = MtlArray{MPSMatrixDecompositionStatus}(undef) commitAndContinue!(cmdbuf) do cbuf @@ -137,7 +137,7 @@ LinearAlgebra.ipiv2perm(v::MtlVector{T}, maxi::Integer) where T = encode!(cbuf, kernel, mps_at, mps_at, mps_p, status) end - B = MtlMatrix{T}(undef, M, N) + B = similar(A, M, N) commit!(cmdbuf) do cbuf mps_b = MPSMatrix(B) @@ -186,7 +186,7 @@ end encode!(cbuf, kernel, descriptor) end - P = MtlMatrix{UInt32}(undef, 1, min(N, M)) + P = similar(A, UInt32, 1, min(N, M)) status = MtlArray{MPSMatrixDecompositionStatus}(undef) commitAndContinue!(cmdbuf) do cbuf diff --git a/test/mps/linalg.jl b/test/mps/linalg.jl index d0f982489..106d7669c 100644 --- a/test/mps/linalg.jl +++ b/test/mps/linalg.jl @@ -190,6 +190,7 @@ end end end +using Metal: storagemode @testset "decompositions" begin A = MtlMatrix(rand(Float32, 1024, 1024)) lua = lu(A) @@ -211,6 +212,13 @@ end A = MtlMatrix{Float32}([1 2; 0 0]) @test_throws SingularException lu(A) + + altStorage = Metal.DefaultStorageMode != Metal.PrivateStorage ? Metal.PrivateStorage : Metal.SharedStorage + A = MtlMatrix{Float32,altStorage}(rand(Float32, 1024, 1024)) + lua = lu(A) + @test storagemode(lua.factors) == storagemode(lua.ipiv) == storagemode(A) + lua = lu!(A) + @test storagemode(lua.factors) == storagemode(lua.ipiv) == storagemode(A) end using .MPS: MPSMatrixSoftMax, MPSMatrixLogSoftMax From b999285e11a5dcd7c202245dbdc5ccebf19b7db6 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Mon, 30 Sep 2024 09:52:46 -0300 Subject: [PATCH 2/4] Fix benchmarking CI and benchmark Shared and Private storage modes (#437) --- .buildkite/pipeline.yml | 2 - .github/workflows/Benchmark.yml | 4 +- perf/array.jl | 176 ++++++++++++++++---------------- 3 files changed, 92 insertions(+), 90 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index b35a38233..788ca020b 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -118,9 +118,7 @@ steps: build.message !~ /\[skip special\]/ timeout_in_minutes: 60 - # we want to benchmark every commit on the master branch, even if it failed CI - wait: ~ - # continue_on_failure: true - group: ":racehorse: Benchmarks" steps: diff --git a/.github/workflows/Benchmark.yml b/.github/workflows/Benchmark.yml index 1904c1640..5bf747779 100644 --- a/.github/workflows/Benchmark.yml +++ b/.github/workflows/Benchmark.yml @@ -11,6 +11,7 @@ on: - main paths: - "src/**/*" + - "lib/**/*" - "ext/**/*" - "perf/**/*" - ".buildkite/**/*" @@ -21,8 +22,9 @@ on: - main paths: - "src/**/*" + - "lib/**/*" - "ext/**/*" - - "benchmarks/**/*" + - "perf/**/*" - ".buildkite/**/*" - "Project.toml" - ".github/workflows/Benchmark.yml" diff --git a/perf/array.jl b/perf/array.jl index 0c57a7dfa..857a05970 100644 --- a/perf/array.jl +++ b/perf/array.jl @@ -1,110 +1,112 @@ -group = addgroup!(SUITE, "array") - const m = 512 const n = 1000 -# generate some arrays -cpu_mat = rand(rng, Float32, m, n) -gpu_mat = MtlArray{Float32}(undef, size(cpu_mat)) -gpu_vec = reshape(gpu_mat, length(gpu_mat)) -gpu_arr_3d = reshape(gpu_mat, (m, 40, 25)) -gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10)) -gpu_mat_ints = MtlArray(rand(rng, Int, m, n)) -gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints)) -gpu_mat_bools = MtlArray(rand(rng, Bool, m, n)) -gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools)) - -group["construct"] = @benchmarkable MtlArray{Int}(undef, 1) - -group["copy"] = @async_benchmarkable copy($gpu_mat) - -gpu_mat2 = copy(gpu_mat) -let group = addgroup!(group, "copyto!") - group["cpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat, $cpu_mat) - group["gpu_to_cpu"] = @async_benchmarkable copyto!($cpu_mat, $gpu_mat) - group["gpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat2, $gpu_mat) -end +for (S, smname) in [(Metal.PrivateStorage,"private"), (Metal.SharedStorage,"shared")] + group = addgroup!(SUITE, "$smname array") + + # generate some arrays + cpu_mat = rand(rng, Float32, m, n) + gpu_mat = MtlMatrix{Float32,S}(undef, size(cpu_mat)) + gpu_vec = reshape(gpu_mat, length(gpu_mat)) + gpu_arr_3d = reshape(gpu_mat, (m, 40, 25)) + gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10)) + gpu_mat_ints = MtlMatrix{Int,S}(rand(rng, Int, m, n)) + gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints)) + gpu_mat_bools = MtlMatrix{Bool,S}(rand(rng, Bool, m, n)) + gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools)) + + group["construct"] = @benchmarkable MtlArray{Int,1,$S}(undef, 1) + + group["copy"] = @benchmarkable Metal.@sync copy($gpu_mat) + + gpu_mat2 = copy(gpu_mat) + let group = addgroup!(group, "copyto!") + group["cpu_to_gpu"] = @benchmarkable Metal.@sync copyto!($gpu_mat, $cpu_mat) + group["gpu_to_cpu"] = @benchmarkable Metal.@sync copyto!($cpu_mat, $gpu_mat) + group["gpu_to_gpu"] = @benchmarkable Metal.@sync copyto!($gpu_mat2, $gpu_mat) + end -let group = addgroup!(group, "iteration") - group["scalar"] = @benchmarkable Metal.@allowscalar [$gpu_vec[i] for i in 1:10] + let group = addgroup!(group, "iteration") + group["scalar"] = @benchmarkable Metal.@allowscalar [$gpu_vec[i] for i in 1:10] - group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools] + group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools] - let group = addgroup!(group, "findall") - group["bool"] = @benchmarkable findall($gpu_vec_bools) - group["int"] = @benchmarkable findall(isodd, $gpu_vec_ints) - end + let group = addgroup!(group, "findall") + group["bool"] = @benchmarkable findall($gpu_vec_bools) + group["int"] = @benchmarkable findall(isodd, $gpu_vec_ints) + end - let group = addgroup!(group, "findfirst") - group["bool"] = @benchmarkable findfirst($gpu_vec_bools) - group["int"] = @benchmarkable findfirst(isodd, $gpu_vec_ints) - end + let group = addgroup!(group, "findfirst") + group["bool"] = @benchmarkable findfirst($gpu_vec_bools) + group["int"] = @benchmarkable findfirst(isodd, $gpu_vec_ints) + end - let group = addgroup!(group, "findmin") # findmax - group["1d"] = @async_benchmarkable findmin($gpu_vec) - group["2d"] = @async_benchmarkable findmin($gpu_mat; dims=1) + let group = addgroup!(group, "findmin") # findmax + group["1d"] = @benchmarkable Metal.@sync findmin($gpu_vec) + group["2d"] = @benchmarkable Metal.@sync findmin($gpu_mat; dims=1) + end end -end - -# let group = addgroup!(group, "reverse") -# group["1d"] = @async_benchmarkable reverse($gpu_vec) -# group["2d"] = @async_benchmarkable reverse($gpu_mat; dims=1) -# group["1d_inplace"] = @async_benchmarkable reverse!($gpu_vec) -# group["2d_inplace"] = @async_benchmarkable reverse!($gpu_mat; dims=1) -# end -group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0 + # let group = addgroup!(group, "reverse") + # group["1d"] = @benchmarkable Metal.@sync reverse($gpu_vec) + # group["2d"] = @benchmarkable Metal.@sync reverse($gpu_mat; dims=1) + # group["1d_inplace"] = @benchmarkable Metal.@sync reverse!($gpu_vec) + # group["2d_inplace"] = @benchmarkable Metal.@sync reverse!($gpu_mat; dims=1) + # end -# no need to test inplace version, which performs the same operation (but with an alloc) -let group = addgroup!(group, "accumulate") - group["1d"] = @async_benchmarkable accumulate(+, $gpu_vec) - group["2d"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=1) -end + group["broadcast"] = @benchmarkable Metal.@sync $gpu_mat .= 0f0 -let group = addgroup!(group, "reductions") - let group = addgroup!(group, "reduce") - group["1d"] = @async_benchmarkable reduce(+, $gpu_vec) - group["2d"] = @async_benchmarkable reduce(+, $gpu_mat; dims=1) + # no need to test inplace version, which performs the same operation (but with an alloc) + let group = addgroup!(group, "accumulate") + group["1d"] = @benchmarkable Metal.@sync accumulate(+, $gpu_vec) + group["2d"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims=1) end - let group = addgroup!(group, "mapreduce") - group["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec) - group["2d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=1) - end + let group = addgroup!(group, "reductions") + let group = addgroup!(group, "reduce") + group["1d"] = @benchmarkable Metal.@sync reduce(+, $gpu_vec) + group["2d"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims=1) + end - # used by sum, prod, minimum, maximum, all, any, count -end + let group = addgroup!(group, "mapreduce") + group["1d"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_vec) + group["2d"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat; dims=1) + end -let group = addgroup!(group, "random") - let group = addgroup!(group, "rand") - group["Float32"] = @async_benchmarkable Metal.rand(Float32, m*n) - group["Int64"] = @async_benchmarkable Metal.rand(Int64, m*n) + # used by sum, prod, minimum, maximum, all, any, count end - let group = addgroup!(group, "rand!") - group["Float32"] = @async_benchmarkable Metal.rand!($gpu_vec) - group["Int64"] = @async_benchmarkable Metal.rand!($gpu_vec_ints) + let group = addgroup!(group, "random") + let group = addgroup!(group, "rand") + group["Float32"] = @benchmarkable Metal.@sync Metal.rand(Float32, m*n) + group["Int64"] = @benchmarkable Metal.@sync Metal.rand(Int64, m*n) + end + + let group = addgroup!(group, "rand!") + group["Float32"] = @benchmarkable Metal.@sync Metal.rand!($gpu_vec) + group["Int64"] = @benchmarkable Metal.@sync Metal.rand!($gpu_vec_ints) + end + + let group = addgroup!(group, "randn") + group["Float32"] = @benchmarkable Metal.@sync Metal.randn(Float32, m*n) + # group["Int64"] = @benchmarkable Metal.@sync Metal.randn(Int64, m*n) + end + + let group = addgroup!(group, "randn!") + group["Float32"] = @benchmarkable Metal.@sync Metal.randn!($gpu_vec) + # group["Int64"] = @benchmarkable Metal.@sync Metal.randn!($gpu_vec_ints) + end end - let group = addgroup!(group, "randn") - group["Float32"] = @async_benchmarkable Metal.randn(Float32, m*n) - # group["Int64"] = @async_benchmarkable Metal.randn(Int64, m*n) - end + # let group = addgroup!(group, "sorting") + # group["1d"] = @benchmarkable Metal.@sync sort($gpu_vec) + # group["2d"] = @benchmarkable Metal.@sync sort($gpu_mat; dims=1) + # group["by"] = @benchmarkable Metal.@sync sort($gpu_vec; by=sin) + # end - let group = addgroup!(group, "randn!") - group["Float32"] = @async_benchmarkable Metal.randn!($gpu_vec) - # group["Int64"] = @async_benchmarkable Metal.randn!($gpu_vec_ints) + let group = addgroup!(group, "permutedims") + group["2d"] = @benchmarkable Metal.@sync permutedims($gpu_mat, (2,1)) + group["3d"] = @benchmarkable Metal.@sync permutedims($gpu_arr_3d, (3,1,2)) + group["4d"] = @benchmarkable Metal.@sync permutedims($gpu_arr_4d, (2,1,4,3)) end end - -# let group = addgroup!(group, "sorting") -# group["1d"] = @async_benchmarkable sort($gpu_vec) -# group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1) -# group["by"] = @async_benchmarkable sort($gpu_vec; by=sin) -# end - -let group = addgroup!(group, "permutedims") - group["2d"] = @async_benchmarkable permutedims($gpu_mat, (2,1)) - group["3d"] = @async_benchmarkable permutedims($gpu_arr_3d, (3,1,2)) - group["4d"] = @async_benchmarkable permutedims($gpu_arr_4d, (2,1,4,3)) -end From f605bcb4961c4c353a917ecdd82f0b2b966a83ff Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Mon, 30 Sep 2024 15:46:39 -0300 Subject: [PATCH 3/4] Get more descriptive errors from flaky test (#440) [only tests] --- test/array.jl | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/test/array.jl b/test/array.jl index 567323842..e2aaaacc2 100644 --- a/test/array.jl +++ b/test/array.jl @@ -210,28 +210,34 @@ end # Dims in tuple let A = Metal.fill(b, (10, 10, 10, 1000)) - @test all(Array(A) .== b) + B = fill(b, (10, 10, 10, 1000)) + @test Array(A) == B end - let M = Metal.fill(b, (10, 10)) - @test all(Array(M) .== b) + let M = Metal.fill(b, (10, 10, 10, 1000)) + B = fill(b, (10, 10, 10, 1000)) + @test Array(M) == B end let V = Metal.fill(b, (10,)) - @test all(Array(V) .== b) + B = fill(b, (10,)) + @test Array(V) == B end #Dims already unpacked let A = Metal.fill(b, 10, 10, 10, 1000) - @test all(Array(A) .== b) + B = fill(b, 10, 10, 10, 1000) + @test Array(A) == B end let M = Metal.fill(b, 10, 10) - @test all(Array(M) .== b) + B = fill(b, 10, 10) + @test Array(M) == B end let V = Metal.fill(b, 10) - @test all(Array(V) .== b) + B = fill(b, 10) + @test Array(V) == B end end @@ -420,7 +426,7 @@ end @testset "broadcast" begin testf(f, x) = Array(f(MtlArray(x))) ≈ f(x) - + @test testf(x->max.(x, zero(Float32)), randn(Float32, 1000)) @test testf(x->min.(x, one(Float32)), randn(Float32, 1000)) @test testf(x->min.(max.(x, zero(Float32)), one(Float32)), randn(Float32, 1000)) From 438ab8f8017b6123824f96ef91483f48c70b29b6 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 1 Oct 2024 21:26:44 +0200 Subject: [PATCH 4/4] Bump version. --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index b8e38d94b..2ad06e39e 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "Metal" uuid = "dde4c033-4e86-420c-a63e-0dd931031962" -version = "1.3.0" +version = "1.4.0" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"