JuliaGPU · christiangnrd · Dec 26, 2024 · Dec 26, 2024
diff --git a/Artifacts.toml b/Artifacts.toml
diff --git a/Project.toml b/Project.toml
@@ -4,7 +4,6 @@ version = "1.4.0"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
-Artifacts = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
 CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82"
 CodecBzip2 = "523fee87-0ab8-5b00-afb7-3ecf72e48cfd"
 ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
@@ -14,12 +13,10 @@ KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
 LLVMDowngrader_jll = "f52de702-fb25-5922-94ba-81dd59b07444"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-ObjectFile = "d8793406-e978-5875-9003-1fc021f44a92"
 ObjectiveC = "e86c9b32-1129-44ac-8ea0-90d5bb39ded9"
 PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
 Preferences = "21216c6a-2e73-6563-6e65-726566657250"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
-Python_jll = "93d3a430-8e7c-50da-8e8d-3dfcfb3baf05"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
@@ -35,7 +32,6 @@ SpecialFunctionsExt = "SpecialFunctions"
 
 [compat]
 Adapt = "4"
-Artifacts = "1"
 BFloat16s = "0.5"
 CEnum = "0.4, 0.5"
 CodecBzip2 = "0.8.5"
@@ -45,11 +41,14 @@ GPUCompiler = "0.26, 0.27, 1"
 KernelAbstractions = "0.9.1"
 LLVM = "7.2, 8, 9"
 LLVMDowngrader_jll = "0.6"
-ObjectFile = "0.4"
+LinearAlgebra = "1"
 ObjectiveC = "2.1, 3"
 PrecompileTools = "1"
 Preferences = "1"
+Printf = "1"
+Random = "1"
 SHA = "0.7"
 SpecialFunctions = "2"
 StaticArrays = "1"
+UUIDs = "1"
 julia = "1.10"
diff --git a/docs/src/api/compiler.md b/docs/src/api/compiler.md
@@ -26,10 +26,9 @@ the InteractiveUtils standard library:
 @device_code_typed
 @device_code_warntype
 @device_code_llvm
+@device_code_metal
 @device_code_native
-@device_code_agx
 @device_code
 ```
 
-For more information, please consult the GPUCompiler.jl documentation. `code_agx` is
-actually `code_native`:
+For more information, please consult the GPUCompiler.jl documentation.
diff --git a/perf/latency.jl b/perf/latency.jl
@@ -28,7 +28,7 @@ function main()
     ttfp_cmd =
         `$base_cmd -e "using Metal
                        kernel() = return
-                       Metal.code_agx(devnull, kernel, Tuple{}; kernel=true)"`
+                       Metal.code_native(devnull, kernel, Tuple{}; kernel=true)"`
     results["ttfp"] = @benchmark run($ttfp_cmd) evals=1 seconds=60
 
     results

diff --git a/src/Metal.jl b/src/Metal.jl
@@ -7,10 +7,7 @@ using LLVM
 using LLVM.Interop
 import LLVMDowngrader_jll
 using Preferences: @load_preference, load_preference
-using Python_jll
-using ObjectFile
 using ExprTools: splitdef, combinedef
-using Artifacts
 using ObjectiveC, .CoreFoundation, .Foundation, .Dispatch, .OS
 import KernelAbstractions
 

diff --git a/src/compiler/compilation.jl b/src/compiler/compilation.jl
@@ -176,8 +176,7 @@ function compile(@nospecialize(job::CompilerJob))
 end
 
 # link into an executable kernel
-@autoreleasepool function link(@nospecialize(job::CompilerJob), compiled;
-                               return_function=false)
+@autoreleasepool function link(@nospecialize(job::CompilerJob), compiled)
     @signpost_event log=log_compiler() "Link" "Job=$job"
 
     @signpost_interval log=log_compiler() "Instantiate compute pipeline" begin
@@ -211,7 +210,5 @@ end
         end
     end
 
-    # most of the time, we don't need the function object,
-    # so don't keep it alive unconditionally in GPUCompiler's caches
-    pipeline_state, return_function ? fun : nothing
+    pipeline_state
 end
diff --git a/src/compiler/execution.jl b/src/compiler/execution.jl
@@ -186,7 +186,7 @@ function mtlfunction(f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
         cache = compiler_cache(dev)
         source = methodinstance(F, tt)
         config = compiler_config(dev; name, kwargs...)::MetalCompilerConfig
-        pipeline, _ = GPUCompiler.cached_compilation(cache, source, config, compile, link)
+        pipeline = GPUCompiler.cached_compilation(cache, source, config, compile, link)
 
         # create a callable object that captures the function instance. we don't need to think
         # about world age here, as GPUCompiler already does and will return a different object

diff --git a/src/compiler/reflection.jl b/src/compiler/reflection.jl
@@ -19,161 +19,8 @@ function split_kwargs_runtime(kwargs, wanted::Vector{Symbol})
     return extracted, remaining
 end
 
-"""
-    code_agx([io], f, types, cap::VersionNumber)
-
-Prints the AGX code generated for the method matching the given generic function and type
-signature to `io` which defaults to `stdout`.
-
-See also: [`@device_code_agx`](@ref)
-"""
-function code_agx(io::IO, @nospecialize(func::Base.Callable), @nospecialize(types),
-                  kernel::Bool=true; kwargs...)
-    compiler_kwargs, kwargs = split_kwargs_runtime(kwargs, COMPILER_KWARGS)
-    source = methodinstance(typeof(func), Base.to_tuple_type(types))
-    config = compiler_config(device(); kernel, compiler_kwargs...)
-    job = CompilerJob(source, config)
-    code_agx(io, job)
-end
-
-@autoreleasepool function code_agx(io::IO, job::MetalCompilerJob)
-    if !job.config.kernel
-        error("Can only generate AGX code for kernel functions")
-    end
-
-    # compile the kernel
-    compiled = compile(job)
-    pipeline, fun = link(job, compiled; return_function=true)
-    # XXX: can we re-use this pipeline?
-
-    # register it with a pipeline descriptor
-    pipeline_desc = MTLComputePipelineDescriptor()
-    pipeline_desc.computeFunction = fun
-
-    # create a binary archive
-    bin_desc = MTLBinaryArchiveDescriptor()
-    bin = MTLBinaryArchive(device(), bin_desc)
-    add_functions!(bin, pipeline_desc)
-
-    mktempdir() do dir
-        # serialize the archive to a file
-        binary = joinpath(dir, "kernel.macho")
-        write(binary, bin)
-
-        # disassemble the main function
-        first = true
-        i = 0
-        extract_gpu_code(binary) do name, code
-            # skip all-zero functions
-            all(code .== 0) && return
-
-            i += 1
-            file = joinpath(dir, "function$(i).bin")
-            write(file, code)
-
-            # disassemble the function
-            first || println(io)
-            println(io, "$name:")
-            print(io, disassemble(file))
-
-            first = false
-        end
-    end
-end
-
-@enum GPUMachineType::UInt32 begin
-    AppleGPU = 0x1000013
-    AMDGPU   = 0x1000014
-    IntelGPU = 0x1000015
-    AIR64    = 0x1000017
-end
-
-function extract_gpu_code(f, binary)
-    fat_handle = readmeta(open(binary))
-    fat_handle isa FatMachOHandle || error("Expected a universal binary, got a $(typeof(fat_handle))")
-
-    # the universal binary contains several architectures; extract the GPU one
-    arch = findfirst(fat_handle) do arch
-        arch.header isa MachO.MachOHeader64 && GPUMachineType(arch.header.cputype) == AppleGPU
-    end
-    arch === nothing && error("Could not find GPU architecture in universal binary")
-
-    # the GPU binary contains several sections...
-    ## ... extract the compute section, which is another Mach-O binary
-    compute_section = findfirst(Sections(fat_handle[arch]), "__TEXT,__compute")
-    compute_section === nothing && error("Could not find __compute section in GPU binary")
-    compute_binary = read(compute_section)
-    native_handle = only(readmeta(IOBuffer(compute_binary)))
-    ## ... extract the metallib section, which is a Metal library
-    metallib_section = findfirst(Sections(fat_handle[arch]), "__TEXT,__metallib")
-    metallib_section === nothing && error("Could not find __metallib section in GPU binary")
-    metallib_binary = read(metallib_section)
-    metallib = read(IOBuffer(metallib_binary), MetalLib)
-    # TODO: use this to implement a do-block device_code_air like CUDA.jl?
-
-    # identify the kernel name
-    kernel_name = "unknown_kernel"
-    # XXX: does it happen that these metallibs contain multiple functions?
-    if length(metallib.functions) == 1
-        kernel_name = metallib.functions[1].name
-    end
-    # XXX: we used to be able to identify the kernel by looking at symbols in
-    #      the fat binary, one of which aliased with the start of the compute
-    #      section. these symbols have disappeared on macOS 15.
-    #compute_symbol = nothing
-    #for symbol in Symbols(fat_handle[arch])
-    #    symbol_value(symbol) == section_offset(compute_section) || continue
-    #    endswith(symbol_name(symbol), "_begin") || continue
-    #    compute_symbol = symbol
-    #end
-    #compute_symbol === nothing && error("Could not find symbol for __compute section")
-    #kernel_name = symbol_name(compute_symbol)[1:end-6]
-
-    # within the native GPU binary, isolate the section containing code
-    section = findfirst(Sections(native_handle), "__TEXT,__text")
-    isnothing(section) && error("Could not find __TEXT,__text section")
-
-    # get all symbols, and sort them by address
-    symbols = sort(collect(Symbols(native_handle)), by=symbol_value)
-
-    # extract relevant functions
-    code = read(section)
-    function extract_function(fn)
-        # find the symbol
-        symbol = findfirst(isequal(fn) , symbols)
-        symbol ===  nothing && return nothing
-        offset = symbol_value(symbols[symbol])
-
-        # extract the function
-        size = if symbol < length(symbols)
-            # up until the next symbol
-            symbol_value(symbols[symbol + 1])
-        else
-            # up until the end of the section
-            section_size(section)
-        end - offset
-        return code[offset + 1 : offset + size]
-    end
-    for sym in symbols
-        f("$kernel_name.$(symbol_name(sym))", extract_function(sym))
-    end
-    return
-end
-
-function disassemble(path)
-    io = IOBuffer()
-    disassembler = joinpath(only(readdir(artifact"applegpu"; join=true)), "disassemble.py")
-    run(pipeline(`$(python()) $disassembler $path`, stdout=io))
-    return String(take!(io))
-end
-
-code_agx(@nospecialize(func::Base.Callable), @nospecialize(types); kwargs...) =
-    code_agx(stdout, func, types; kwargs...)
-
-const code_native = code_agx
-
 # forward the rest to GPUCompiler with an appropriate CompilerJob
-for method in (:code_typed, :code_warntype, :code_llvm)
+for method in (:code_typed, :code_warntype, :code_llvm, :code_native)
     # only code_typed doesn't take a io argument
     args = method === :code_typed ? (:job,) : (:io, :job)
 
@@ -191,37 +38,21 @@ for method in (:code_typed, :code_warntype, :code_llvm)
     end
 end
 
-
 #
 # @device_code_* functions
 #
 
 export @device_code_lowered, @device_code_typed, @device_code_warntype,
-       @device_code_llvm, @device_code_native, @device_code_agx, @device_code
-
-"""
-    @device_code_agx [io::IO=stdout, ...] ex
-
-Evaluates the expression `ex` and prints the result of [`Metal.code_agx`](@ref) to
-`io` for every compiled Metal kernel. For other supported keywords, see
-[`Metal.code_agx`](@ref).
-"""
-macro device_code_agx(ex...)
-    function hook(job::MetalCompilerJob; io::IO=stdout, kwargs...)
-        println(io, "; $job")
-        println(io)
-        code_agx(io, job; kwargs...)
-    end
-    GPUCompiler.emit_hooked_compilation(hook, ex...)
-end
-
-const var"@device_code_native" = var"@device_code_agx"
+       @device_code_llvm, @device_code_native, @device_code_metal, @device_code
 
 # forward to GPUCompiler
 @eval $(Symbol("@device_code_lowered")) = $(getfield(GPUCompiler, Symbol("@device_code_lowered")))
 @eval $(Symbol("@device_code_typed")) = $(getfield(GPUCompiler, Symbol("@device_code_typed")))
 @eval $(Symbol("@device_code_warntype")) = $(getfield(GPUCompiler, Symbol("@device_code_warntype")))
 @eval $(Symbol("@device_code_llvm")) = $(getfield(GPUCompiler, Symbol("@device_code_llvm")))
+@eval $(Symbol("@device_code_metal")) = $(getfield(GPUCompiler, Symbol("@device_code_native")))
+# `@device_code_native` was exported before this change; keep the name as an alias
+@eval $(Symbol("@device_code_native")) = $(getfield(GPUCompiler, Symbol("@device_code_native")))
 @eval $(Symbol("@device_code")) = $(getfield(GPUCompiler, Symbol("@device_code")))
 
 

diff --git a/src/initialization.jl b/src/initialization.jl
@@ -40,11 +40,6 @@ function __init__()
         @warn "Metal.jl has not been tested on macOS 16 or later, you may run into issues."
     end
 
-    # we use Python_jll, but don't actually want its environment to be active
-    # (this breaks the call to pygmentize in GPUCompiler).
-    # XXX: the JLL should only set PYTHONHOME when the executable is called
-    delete!(ENV, "PYTHONHOME")
-
     if Base.JLOptions().debug_level >= 2
         # enable Metal API validation
         ENV["MTL_DEBUG_LAYER"] = "1"

diff --git a/test/execution.jl b/test/execution.jl
@@ -59,13 +59,11 @@ end
     Metal.code_typed(dummy, Tuple{})
     Metal.code_warntype(devnull, dummy, Tuple{})
     Metal.code_llvm(devnull, dummy, Tuple{})
-    shader_validation || Metal.code_agx(devnull, dummy, Tuple{})
 
     @device_code_lowered @metal dummy()
     @device_code_typed @metal dummy()
     @device_code_warntype io=devnull @metal dummy()
     @device_code_llvm io=devnull @metal dummy()
-    shader_validation || @device_code_agx io=devnull @metal dummy()
 
     mktempdir() do dir
         @device_code dir=dir @metal dummy()
@@ -76,7 +74,6 @@ end
     # make sure kernel name aliases are preserved in the generated code
     @test occursin("dummy", sprint(io->(@device_code_llvm io=io optimize=false @metal dummy())))
     @test occursin("dummy", sprint(io->(@device_code_llvm io=io @metal dummy())))
-    shader_validation || @test occursin("dummy", sprint(io->(@device_code_agx io=io @metal dummy())))
 
     # make sure invalid kernels can be partially reflected upon
     let