From 3bbda32f9ca219a85f72272ea31eeb1a564e0b44 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Mon, 30 Sep 2024 09:51:57 -0300
Subject: [PATCH 1/4] Make `lu` results have same storage mode as input (#435)

---
 lib/mps/linalg.jl  | 6 +++---
 test/mps/linalg.jl | 8 ++++++++
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/lib/mps/linalg.jl b/lib/mps/linalg.jl
index 1c29e75c0..8558f756a 100644
--- a/lib/mps/linalg.jl
+++ b/lib/mps/linalg.jl
@@ -128,7 +128,7 @@ LinearAlgebra.ipiv2perm(v::MtlVector{T}, maxi::Integer) where T =
         encode!(cbuf, kernel, descriptor)
     end
 
-    P = MtlMatrix{UInt32}(undef, 1, min(N, M))
+    P = similar(A, UInt32, 1, min(N, M))
     status = MtlArray{MPSMatrixDecompositionStatus}(undef)
 
     commitAndContinue!(cmdbuf) do cbuf
@@ -137,7 +137,7 @@ LinearAlgebra.ipiv2perm(v::MtlVector{T}, maxi::Integer) where T =
         encode!(cbuf, kernel, mps_at, mps_at, mps_p, status)
     end
 
-    B = MtlMatrix{T}(undef, M, N)
+    B = similar(A, M, N)
 
     commit!(cmdbuf) do cbuf
         mps_b = MPSMatrix(B)
@@ -186,7 +186,7 @@ end
         encode!(cbuf, kernel, descriptor)
     end
 
-    P = MtlMatrix{UInt32}(undef, 1, min(N, M))
+    P = similar(A, UInt32, 1, min(N, M))
     status = MtlArray{MPSMatrixDecompositionStatus}(undef)
 
     commitAndContinue!(cmdbuf) do cbuf
diff --git a/test/mps/linalg.jl b/test/mps/linalg.jl
index d0f982489..106d7669c 100644
--- a/test/mps/linalg.jl
+++ b/test/mps/linalg.jl
@@ -190,6 +190,7 @@ end
     end
 end
 
+using Metal: storagemode
 @testset "decompositions" begin
     A = MtlMatrix(rand(Float32, 1024, 1024))
     lua = lu(A)
@@ -211,6 +212,13 @@ end
 
     A = MtlMatrix{Float32}([1 2; 0 0])
     @test_throws SingularException lu(A)
+
+    altStorage = Metal.DefaultStorageMode != Metal.PrivateStorage ? Metal.PrivateStorage : Metal.SharedStorage
+    A = MtlMatrix{Float32,altStorage}(rand(Float32, 1024, 1024))
+    lua = lu(A)
+    @test storagemode(lua.factors) == storagemode(lua.ipiv) == storagemode(A)
+    lua = lu!(A)
+    @test storagemode(lua.factors) == storagemode(lua.ipiv) == storagemode(A)
 end
 
 using .MPS: MPSMatrixSoftMax, MPSMatrixLogSoftMax

From b999285e11a5dcd7c202245dbdc5ccebf19b7db6 Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Mon, 30 Sep 2024 09:52:46 -0300
Subject: [PATCH 2/4] Fix benchmarking CI and benchmark Shared and Private
 storage modes (#437)

---
 .buildkite/pipeline.yml         |   2 -
 .github/workflows/Benchmark.yml |   4 +-
 perf/array.jl                   | 176 ++++++++++++++++----------------
 3 files changed, 92 insertions(+), 90 deletions(-)

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index b35a38233..788ca020b 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -118,9 +118,7 @@ steps:
             build.message !~ /\[skip special\]/
         timeout_in_minutes: 60
 
-  # we want to benchmark every commit on the master branch, even if it failed CI
   - wait: ~
-    # continue_on_failure: true
 
   - group: ":racehorse: Benchmarks"
     steps:
diff --git a/.github/workflows/Benchmark.yml b/.github/workflows/Benchmark.yml
index 1904c1640..5bf747779 100644
--- a/.github/workflows/Benchmark.yml
+++ b/.github/workflows/Benchmark.yml
@@ -11,6 +11,7 @@ on:
       - main
     paths:
       - "src/**/*"
+      - "lib/**/*"
       - "ext/**/*"
       - "perf/**/*"
       - ".buildkite/**/*"
@@ -21,8 +22,9 @@ on:
       - main
     paths:
       - "src/**/*"
+      - "lib/**/*"
       - "ext/**/*"
-      - "benchmarks/**/*"
+      - "perf/**/*"
       - ".buildkite/**/*"
       - "Project.toml"
       - ".github/workflows/Benchmark.yml"
diff --git a/perf/array.jl b/perf/array.jl
index 0c57a7dfa..857a05970 100644
--- a/perf/array.jl
+++ b/perf/array.jl
@@ -1,110 +1,112 @@
-group = addgroup!(SUITE, "array")
-
 const m = 512
 const n = 1000
 
-# generate some arrays
-cpu_mat = rand(rng, Float32, m, n)
-gpu_mat = MtlArray{Float32}(undef, size(cpu_mat))
-gpu_vec = reshape(gpu_mat, length(gpu_mat))
-gpu_arr_3d = reshape(gpu_mat, (m, 40, 25))
-gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10))
-gpu_mat_ints = MtlArray(rand(rng, Int, m, n))
-gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints))
-gpu_mat_bools = MtlArray(rand(rng, Bool, m, n))
-gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools))
-
-group["construct"] = @benchmarkable MtlArray{Int}(undef, 1)
-
-group["copy"] = @async_benchmarkable copy($gpu_mat)
-
-gpu_mat2 = copy(gpu_mat)
-let group = addgroup!(group, "copyto!")
-    group["cpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat, $cpu_mat)
-    group["gpu_to_cpu"] = @async_benchmarkable copyto!($cpu_mat, $gpu_mat)
-    group["gpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat2, $gpu_mat)
-end
+for (S, smname) in [(Metal.PrivateStorage,"private"), (Metal.SharedStorage,"shared")]
+    group = addgroup!(SUITE, "$smname array")
+
+    # generate some arrays
+    cpu_mat = rand(rng, Float32, m, n)
+    gpu_mat = MtlMatrix{Float32,S}(undef, size(cpu_mat))
+    gpu_vec = reshape(gpu_mat, length(gpu_mat))
+    gpu_arr_3d = reshape(gpu_mat, (m, 40, 25))
+    gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10))
+    gpu_mat_ints = MtlMatrix{Int,S}(rand(rng, Int, m, n))
+    gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints))
+    gpu_mat_bools = MtlMatrix{Bool,S}(rand(rng, Bool, m, n))
+    gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools))
+
+    group["construct"] = @benchmarkable MtlArray{Int,1,$S}(undef, 1)
+
+    group["copy"] = @benchmarkable Metal.@sync copy($gpu_mat)
+
+    gpu_mat2 = copy(gpu_mat)
+    let group = addgroup!(group, "copyto!")
+        group["cpu_to_gpu"] = @benchmarkable Metal.@sync copyto!($gpu_mat, $cpu_mat)
+        group["gpu_to_cpu"] = @benchmarkable Metal.@sync copyto!($cpu_mat, $gpu_mat)
+        group["gpu_to_gpu"] = @benchmarkable Metal.@sync copyto!($gpu_mat2, $gpu_mat)
+    end
 
-let group = addgroup!(group, "iteration")
-    group["scalar"] = @benchmarkable Metal.@allowscalar [$gpu_vec[i] for i in 1:10]
+    let group = addgroup!(group, "iteration")
+        group["scalar"] = @benchmarkable Metal.@allowscalar [$gpu_vec[i] for i in 1:10]
 
-    group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools]
+        group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools]
 
-    let group = addgroup!(group, "findall")
-        group["bool"] = @benchmarkable findall($gpu_vec_bools)
-        group["int"] = @benchmarkable findall(isodd, $gpu_vec_ints)
-    end
+        let group = addgroup!(group, "findall")
+            group["bool"] = @benchmarkable findall($gpu_vec_bools)
+            group["int"] = @benchmarkable findall(isodd, $gpu_vec_ints)
+        end
 
-    let group = addgroup!(group, "findfirst")
-        group["bool"] = @benchmarkable findfirst($gpu_vec_bools)
-        group["int"] = @benchmarkable findfirst(isodd, $gpu_vec_ints)
-    end
+        let group = addgroup!(group, "findfirst")
+            group["bool"] = @benchmarkable findfirst($gpu_vec_bools)
+            group["int"] = @benchmarkable findfirst(isodd, $gpu_vec_ints)
+        end
 
-    let group = addgroup!(group, "findmin") # findmax
-        group["1d"] = @async_benchmarkable findmin($gpu_vec)
-        group["2d"] = @async_benchmarkable findmin($gpu_mat; dims=1)
+        let group = addgroup!(group, "findmin") # findmax
+            group["1d"] = @benchmarkable Metal.@sync findmin($gpu_vec)
+            group["2d"] = @benchmarkable Metal.@sync findmin($gpu_mat; dims=1)
+        end
     end
-end
-
-# let group = addgroup!(group, "reverse")
-#     group["1d"] = @async_benchmarkable reverse($gpu_vec)
-#     group["2d"] = @async_benchmarkable reverse($gpu_mat; dims=1)
-#     group["1d_inplace"] = @async_benchmarkable reverse!($gpu_vec)
-#     group["2d_inplace"] = @async_benchmarkable reverse!($gpu_mat; dims=1)
-# end
 
-group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0
+    # let group = addgroup!(group, "reverse")
+    #     group["1d"] = @benchmarkable Metal.@sync reverse($gpu_vec)
+    #     group["2d"] = @benchmarkable Metal.@sync reverse($gpu_mat; dims=1)
+    #     group["1d_inplace"] = @benchmarkable Metal.@sync reverse!($gpu_vec)
+    #     group["2d_inplace"] = @benchmarkable Metal.@sync reverse!($gpu_mat; dims=1)
+    # end
 
-# no need to test inplace version, which performs the same operation (but with an alloc)
-let group = addgroup!(group, "accumulate")
-    group["1d"] = @async_benchmarkable accumulate(+, $gpu_vec)
-    group["2d"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=1)
-end
+    group["broadcast"] = @benchmarkable Metal.@sync $gpu_mat .= 0f0
 
-let group = addgroup!(group, "reductions")
-    let group = addgroup!(group, "reduce")
-        group["1d"] = @async_benchmarkable reduce(+, $gpu_vec)
-        group["2d"] = @async_benchmarkable reduce(+, $gpu_mat; dims=1)
+    # no need to test inplace version, which performs the same operation (but with an alloc)
+    let group = addgroup!(group, "accumulate")
+        group["1d"] = @benchmarkable Metal.@sync accumulate(+, $gpu_vec)
+        group["2d"] = @benchmarkable Metal.@sync accumulate(+, $gpu_mat; dims=1)
     end
 
-    let group = addgroup!(group, "mapreduce")
-        group["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec)
-        group["2d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=1)
-    end
+    let group = addgroup!(group, "reductions")
+        let group = addgroup!(group, "reduce")
+            group["1d"] = @benchmarkable Metal.@sync reduce(+, $gpu_vec)
+            group["2d"] = @benchmarkable Metal.@sync reduce(+, $gpu_mat; dims=1)
+        end
 
-    # used by sum, prod, minimum, maximum, all, any, count
-end
+        let group = addgroup!(group, "mapreduce")
+            group["1d"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_vec)
+            group["2d"] = @benchmarkable Metal.@sync mapreduce(x->x+1, +, $gpu_mat; dims=1)
+        end
 
-let group = addgroup!(group, "random")
-    let group = addgroup!(group, "rand")
-        group["Float32"] = @async_benchmarkable Metal.rand(Float32, m*n)
-        group["Int64"] = @async_benchmarkable Metal.rand(Int64, m*n)
+        # used by sum, prod, minimum, maximum, all, any, count
     end
 
-    let group = addgroup!(group, "rand!")
-        group["Float32"] = @async_benchmarkable Metal.rand!($gpu_vec)
-        group["Int64"] = @async_benchmarkable Metal.rand!($gpu_vec_ints)
+    let group = addgroup!(group, "random")
+        let group = addgroup!(group, "rand")
+            group["Float32"] = @benchmarkable Metal.@sync Metal.rand(Float32, m*n)
+            group["Int64"] = @benchmarkable Metal.@sync Metal.rand(Int64, m*n)
+        end
+
+        let group = addgroup!(group, "rand!")
+            group["Float32"] = @benchmarkable Metal.@sync Metal.rand!($gpu_vec)
+            group["Int64"] = @benchmarkable Metal.@sync Metal.rand!($gpu_vec_ints)
+        end
+
+        let group = addgroup!(group, "randn")
+            group["Float32"] = @benchmarkable Metal.@sync Metal.randn(Float32, m*n)
+            # group["Int64"] = @benchmarkable Metal.@sync Metal.randn(Int64, m*n)
+        end
+
+        let group = addgroup!(group, "randn!")
+            group["Float32"] = @benchmarkable Metal.@sync Metal.randn!($gpu_vec)
+            # group["Int64"] = @benchmarkable Metal.@sync Metal.randn!($gpu_vec_ints)
+        end
     end
 
-    let group = addgroup!(group, "randn")
-        group["Float32"] = @async_benchmarkable Metal.randn(Float32, m*n)
-        # group["Int64"] = @async_benchmarkable Metal.randn(Int64, m*n)
-    end
+    # let group = addgroup!(group, "sorting")
+    #     group["1d"] = @benchmarkable Metal.@sync sort($gpu_vec)
+    #     group["2d"] = @benchmarkable Metal.@sync sort($gpu_mat; dims=1)
+    #     group["by"] = @benchmarkable Metal.@sync sort($gpu_vec; by=sin)
+    # end
 
-    let group = addgroup!(group, "randn!")
-        group["Float32"] = @async_benchmarkable Metal.randn!($gpu_vec)
-        # group["Int64"] = @async_benchmarkable Metal.randn!($gpu_vec_ints)
+    let group = addgroup!(group, "permutedims")
+        group["2d"] = @benchmarkable Metal.@sync permutedims($gpu_mat, (2,1))
+        group["3d"] = @benchmarkable Metal.@sync permutedims($gpu_arr_3d, (3,1,2))
+        group["4d"] = @benchmarkable Metal.@sync permutedims($gpu_arr_4d, (2,1,4,3))
     end
 end
-
-# let group = addgroup!(group, "sorting")
-#     group["1d"] = @async_benchmarkable sort($gpu_vec)
-#     group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1)
-#     group["by"] = @async_benchmarkable sort($gpu_vec; by=sin)
-# end
-
-let group = addgroup!(group, "permutedims")
-    group["2d"] = @async_benchmarkable permutedims($gpu_mat, (2,1))
-    group["3d"] = @async_benchmarkable permutedims($gpu_arr_3d, (3,1,2))
-    group["4d"] = @async_benchmarkable permutedims($gpu_arr_4d, (2,1,4,3))
-end

From f605bcb4961c4c353a917ecdd82f0b2b966a83ff Mon Sep 17 00:00:00 2001
From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com>
Date: Mon, 30 Sep 2024 15:46:39 -0300
Subject: [PATCH 3/4] Get more descriptive errors from flaky test (#440)

[only tests]
---
 test/array.jl | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/test/array.jl b/test/array.jl
index 567323842..e2aaaacc2 100644
--- a/test/array.jl
+++ b/test/array.jl
@@ -210,28 +210,34 @@ end
 
     # Dims in tuple
     let A = Metal.fill(b, (10, 10, 10, 1000))
-        @test all(Array(A) .== b)
+        B = fill(b, (10, 10, 10, 1000))
+        @test Array(A) == B
     end
 
-    let M = Metal.fill(b, (10, 10))
-        @test all(Array(M) .== b)
+    let M = Metal.fill(b, (10, 10))  # 2-D tuple dims
+        B = fill(b, (10, 10))
+        @test Array(M) == B
     end
 
     let V = Metal.fill(b, (10,))
-        @test all(Array(V) .== b)
+        B = fill(b, (10,))
+        @test Array(V) == B
     end
 
     #Dims already unpacked
     let A = Metal.fill(b, 10, 10, 10, 1000)
-        @test all(Array(A) .== b)
+        B = fill(b, 10, 10, 10, 1000)
+        @test Array(A) == B
     end
 
     let M = Metal.fill(b, 10, 10)
-        @test all(Array(M) .== b)
+        B = fill(b, 10, 10)
+        @test Array(M) == B
     end
 
     let V = Metal.fill(b, 10)
-        @test all(Array(V) .== b)
+        B = fill(b, 10)
+        @test Array(V) == B
     end
 end
 
@@ -420,7 +426,7 @@ end
 
 @testset "broadcast" begin
     testf(f, x) = Array(f(MtlArray(x))) ≈ f(x)
-    
+
     @test testf(x->max.(x, zero(Float32)), randn(Float32, 1000))
     @test testf(x->min.(x, one(Float32)), randn(Float32, 1000))
     @test testf(x->min.(max.(x, zero(Float32)), one(Float32)), randn(Float32, 1000))

From 438ab8f8017b6123824f96ef91483f48c70b29b6 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 1 Oct 2024 21:26:44 +0200
Subject: [PATCH 4/4] Bump version.

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index b8e38d94b..2ad06e39e 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "Metal"
 uuid = "dde4c033-4e86-420c-a63e-0dd931031962"
-version = "1.3.0"
+version = "1.4.0"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"