JuliaGPU · christiangnrd · Sep 26, 2024 · Sep 26, 2024 · Oct 12, 2024
diff --git a/Project.toml b/Project.toml
@@ -5,6 +5,7 @@ version = "1.4.0"
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 Artifacts = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
+BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
 CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82"
 CodecBzip2 = "523fee87-0ab8-5b00-afb7-3ecf72e48cfd"
 ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
@@ -26,11 +27,9 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
 
 [weakdeps]
-BFloat16s = "ab4f0b2a-ad5b-11e8-123f-65d77653426b"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 
 [extensions]
-BFloat16sExt = "BFloat16s"
 SpecialFunctionsExt = "SpecialFunctions"
 
 [compat]

diff --git a/ext/BFloat16sExt.jl b/ext/BFloat16sExt.jl
diff --git a/lib/mps/MPS.jl b/lib/mps/MPS.jl
@@ -16,7 +16,9 @@ using ObjectiveC, .Foundation
 
 import GPUArrays
 
-const MtlFloat = Union{Float32, Float16}
+using BFloat16s: BFloat16
+
+const MtlFloat = Union{Float32, Float16, BFloat16}
 
 const MPSShape = NSArray#{NSNumber}
 Base.convert(::Type{MPSShape}, tuple::Union{Vector{N},NTuple{N, <:Integer}}) where N = NSArray(NSNumber.(collect(tuple)))

diff --git a/lib/mps/matrix.jl b/lib/mps/matrix.jl
@@ -5,8 +5,8 @@ Base.convert(::Type{MPSDataType}, x::Integer) = MPSDataType(x)
 
 # Conversions for MPSDataTypes with Julia equivalents
 const jl_mps_to_typ = Dict{MPSDataType, DataType}()
-for type in [UInt8,UInt16,UInt32,UInt64,Int8,Int16,Int32,Int64,Float16,Float32,(ComplexF16,:MPSDataTypeComplexFloat16),(ComplexF32,:MPSDataTypeComplexFloat32),Bool]
-    jltype, mpstype = if type isa Type
+for type in [:UInt8,:UInt16,:UInt32,:UInt64,:Int8,:Int16,:Int32,:Int64,:Float16,:BFloat16,:Float32,(:ComplexF16,:MPSDataTypeComplexFloat16),(:ComplexF32,:MPSDataTypeComplexFloat32),:Bool]
+    jltype, mpstype = if type isa Symbol
         type, Symbol(:MPSDataType, type)
     else
         type

diff --git a/src/Metal.jl b/src/Metal.jl
@@ -13,6 +13,7 @@ using ExprTools: splitdef, combinedef
 using Artifacts
 using ObjectiveC, .CoreFoundation, .Foundation, .Dispatch, .OS
 import KernelAbstractions
+using BFloat16s
 
 include("version.jl")
 

diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl
@@ -18,7 +18,8 @@ function GPUCompiler.finish_ir!(@nospecialize(job::MetalCompilerJob),
     # pointer type information for typed intrinsics
     # (this is consumed by the LLVM IR downgrader)
     for (jltyp, llvmtyp) in (Int32 => :i32, Int64 => :i64,
-                             Float16 => :f16, Float32 => :f32),
+                             Float16 => :f16, Float32 => :f32,
+                             BFloat16 => :bf16),
         (as, asname) in (AS.Device => "global", AS.ThreadGroup => "local")
 
         # map of intrinsics to pointer operand indices and eltypes

diff --git a/src/device/intrinsics/simd.jl b/src/device/intrinsics/simd.jl
@@ -7,7 +7,7 @@ function convert_origin(origin::NTuple{2, Int64})
     return (VecElement{Int64}(origin[1]-1), VecElement{Int64}(origin[2]-1))
 end
 
-for (jltype, suffix) in ((:Float16, "f16"), (:Float32, "f32"))
+for (jltype, suffix) in ((:Float16, "f16"), (:Float32, "f32"), (:BFloat16, "bf16"))
     for as in (AS.Device, AS.ThreadGroup)
         @eval begin
             @device_function simdgroup_load(
@@ -55,7 +55,7 @@ end
     simdgroup_load(data::MtlDeviceArray{T}, matrix_origin=(1, 1))
 
 Loads data from device or threadgroup memory into an 8x8 SIMD-group matrix
-and returns it. `T` must be either `Float16` or `Float32`.
+and returns it. `T` must be either `Float16`, `Float32`, or `BFloat16`.
 
 # Arguments
 - `matrix_origin::NTuple{2, Int64}=(1, 1)`: origin in the source memory to load from.
@@ -65,7 +65,7 @@ and returns it. `T` must be either `Float16` or `Float32`.
     simdgroup_store(src, dest::MtlDeviceArray{T}, matrix_origin=(1, 1))
 
 Stores data from an 8x8 SIMD-group matrix into device or threadgroup memory.
-`T` must be either `Float16` or `Float32`.
+`T` must be either `Float16`, `Float32`, or `BFloat16`.
 
 # Arguments
 - `matrix_origin::NTuple{2, Int64}=(1, 1)`: origin in the destination memory to store to.
@@ -88,6 +88,7 @@ Returns `a * b + c`.
 
 simd_shuffle_map = ((Float32, "f32"),
                     (Float16, "f16"),
+                    (BFloat16, "bf16"),
                     (Int32,   "s.i32"),
                     (UInt32,  "u.i32"),
                     (Int16,   "s.i16"),
@@ -118,7 +119,7 @@ The value for delta must be the same for all threads in the SIMD-group. This fun
 doesn’t modify the upper delta lanes of data because it doesn’t wrap values around
 the SIMD-group.
 
-T must be one of the following: Float32, Float16, Int32, UInt32, Int16, UInt16, Int8, or UInt8
+T must be one of the following: Float32, Float16, BFloat16, Int32, UInt32, Int16, UInt16, Int8, or UInt8
 """
 simd_shuffle_down
 
@@ -131,6 +132,6 @@ lane ID minus delta.
 The value of delta must be the same for all threads in a SIMD-group. This function doesn’t
 modify the lower delta lanes of data because it doesn’t wrap values around the SIMD-group.
 
-T must be one of the following: Float32, Float16, Int32, UInt32, Int16, UInt16, Int8, or UInt8
+T must be one of the following: Float32, Float16, BFloat16, Int32, UInt32, Int16, UInt16, Int8, or UInt8
 """
 simd_shuffle_up
diff --git a/test/device/intrinsics.jl b/test/device/intrinsics.jl
@@ -1,4 +1,5 @@
 using SpecialFunctions
+using BFloat16s
 using Metal: metal_support
 
 @testset "arguments" begin
@@ -308,8 +309,9 @@ end
 ############################################################################################
 
 @testset "simd intrinsics" begin
-
-@testset "shuffle($typ)" for typ in [Float32, Float16, Int32, UInt32, Int16, UInt16, Int8, UInt8]
+types = [Float32, Float16, Int32, UInt32, Int16, UInt16, Int8, UInt8]
+metal_support() >= v"3.1" && push!(types, BFloat16)
+@testset "shuffle($typ)" for typ in types
     function kernel(a::MtlDeviceVector{T}, b::MtlDeviceVector{T}) where T
         idx = thread_position_in_grid_1d()
         idx_in_simd = thread_index_in_simdgroup()
@@ -344,7 +346,9 @@ end
 end
 
 @testset "matrix functions" begin
-    @testset "load_store($typ)" for typ in [Float16, Float32]
+    simdgroup_types = [Float16, Float32]
+    metal_support() >= v"3.1" && push!(simdgroup_types, BFloat16)
+    @testset "load_store($typ)" for typ in simdgroup_types
         function kernel(a::MtlDeviceArray{T}, b::MtlDeviceArray{T},
                             origin_a=(1, 1), origin_b=(1, 1)) where {T}
             sg_a = simdgroup_load(a, origin_a)
@@ -367,7 +371,7 @@ end
         end
     end
 
-    @testset "load_store_tg($typ)" for typ in [Float16, Float32]
+    @testset "load_store_tg($typ)" for typ in simdgroup_types
         function kernel(a::MtlDeviceArray{T}, b::MtlDeviceArray{T}) where {T}
             pos = thread_position_in_threadgroup_2d()
 
@@ -391,7 +395,7 @@ end
         @test Array(a) == Array(b)
     end
 
-    @testset "mul($typ)" for typ in [Float16, Float32]
+    @testset "mul($typ)" for typ in simdgroup_types
         function kernel(a::MtlDeviceArray{T}, b::MtlDeviceArray{T}, c::MtlDeviceArray{T}) where {T}
             sg_a = simdgroup_load(a)
             sg_b = simdgroup_load(b)
@@ -407,7 +411,7 @@ end
         @test Array(a) * Array(b) ≈ Array(c)
     end
 
-    @testset "mad($typ)" for typ in [Float16, Float32]
+    @testset "mad($typ)" for typ in simdgroup_types
         function kernel(a::MtlDeviceArray{T}, b::MtlDeviceArray{T}, c::MtlDeviceArray{T},
                     d::MtlDeviceArray{T}) where {T}
             sg_a = simdgroup_load(a)