Skip to content

Commit

Permalink
Remove device_code_agx (#512)
Browse files Browse the repository at this point in the history
It doesn't work on M3 anyway, and the Python
dependency is quite heavy.
  • Loading branch information
christiangnrd authored Jan 6, 2025
1 parent cf21f9d commit de8eed5
Show file tree
Hide file tree
Showing 10 changed files with 13 additions and 206 deletions.
6 changes: 0 additions & 6 deletions Artifacts.toml

This file was deleted.

9 changes: 4 additions & 5 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ version = "1.4.0"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
Artifacts = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82"
CodecBzip2 = "523fee87-0ab8-5b00-afb7-3ecf72e48cfd"
ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
Expand All @@ -14,12 +13,10 @@ KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
LLVMDowngrader_jll = "f52de702-fb25-5922-94ba-81dd59b07444"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
ObjectFile = "d8793406-e978-5875-9003-1fc021f44a92"
ObjectiveC = "e86c9b32-1129-44ac-8ea0-90d5bb39ded9"
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
Preferences = "21216c6a-2e73-6563-6e65-726566657250"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Python_jll = "93d3a430-8e7c-50da-8e8d-3dfcfb3baf05"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
Expand All @@ -35,7 +32,6 @@ SpecialFunctionsExt = "SpecialFunctions"

[compat]
Adapt = "4"
Artifacts = "1"
BFloat16s = "0.5"
CEnum = "0.4, 0.5"
CodecBzip2 = "0.8.5"
Expand All @@ -45,11 +41,14 @@ GPUCompiler = "0.26, 0.27, 1"
KernelAbstractions = "0.9.1"
LLVM = "7.2, 8, 9"
LLVMDowngrader_jll = "0.6"
ObjectFile = "0.4"
LinearAlgebra = "1"
ObjectiveC = "2.1, 3"
PrecompileTools = "1"
Preferences = "1"
Printf = "1"
Random = "1"
SHA = "0.7"
SpecialFunctions = "2"
StaticArrays = "1"
UUIDs = "1"
julia = "1.10"
5 changes: 2 additions & 3 deletions docs/src/api/compiler.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,9 @@ the InteractiveUtils standard library:
@device_code_typed
@device_code_warntype
@device_code_llvm
@device_code_air
@device_code_native
@device_code_agx
@device_code
```

For more information, please consult the GPUCompiler.jl documentation. `code_agx` is
actually `code_native`:
For more information, please consult the GPUCompiler.jl documentation.
2 changes: 1 addition & 1 deletion perf/latency.jl
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ function main()
ttfp_cmd =
`$base_cmd -e "using Metal
kernel() = return
Metal.code_agx(devnull, kernel, Tuple{}; kernel=true)"`
Metal.code_native(devnull, kernel, Tuple{}; kernel=true)"`
results["ttfp"] = @benchmark run($ttfp_cmd) evals=1 seconds=60

results
Expand Down
3 changes: 0 additions & 3 deletions src/Metal.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,7 @@ using LLVM
using LLVM.Interop
import LLVMDowngrader_jll
using Preferences: @load_preference, load_preference
using Python_jll
using ObjectFile
using ExprTools: splitdef, combinedef
using Artifacts
using ObjectiveC, .CoreFoundation, .Foundation, .Dispatch, .OS
import KernelAbstractions

Expand Down
7 changes: 2 additions & 5 deletions src/compiler/compilation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -176,8 +176,7 @@ function compile(@nospecialize(job::CompilerJob))
end

# link into an executable kernel
@autoreleasepool function link(@nospecialize(job::CompilerJob), compiled;
return_function=false)
@autoreleasepool function link(@nospecialize(job::CompilerJob), compiled)
@signpost_event log=log_compiler() "Link" "Job=$job"

@signpost_interval log=log_compiler() "Instantiate compute pipeline" begin
Expand Down Expand Up @@ -211,7 +210,5 @@ end
end
end

# most of the time, we don't need the function object,
# so don't keep it alive unconditionally in GPUCompiler's caches
pipeline_state, return_function ? fun : nothing
pipeline_state
end
2 changes: 1 addition & 1 deletion src/compiler/execution.jl
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ function mtlfunction(f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
cache = compiler_cache(dev)
source = methodinstance(F, tt)
config = compiler_config(dev; name, kwargs...)::MetalCompilerConfig
pipeline, _ = GPUCompiler.cached_compilation(cache, source, config, compile, link)
pipeline = GPUCompiler.cached_compilation(cache, source, config, compile, link)

# create a callable object that captures the function instance. we don't need to think
# about world age here, as GPUCompiler already does and will return a different object
Expand Down
177 changes: 3 additions & 174 deletions src/compiler/reflection.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,161 +19,8 @@ function split_kwargs_runtime(kwargs, wanted::Vector{Symbol})
return extracted, remaining
end

"""
    code_agx(io::IO, f, types, kernel::Bool=true; kwargs...)
    code_agx(f, types; kwargs...)

Print the AGX code generated for the method matching the given generic function `f` and
type signature `types` to `io`. The method without an `io` argument prints to `stdout`.

# Arguments
- `io`: stream the disassembly is printed to.
- `f`, `types`: the function and its argument types (anything accepted by
  `Base.to_tuple_type`).
- `kernel`: passed on to `compiler_config` as the `kernel` keyword.

# Keywords
Keywords listed in `COMPILER_KWARGS` are forwarded to `compiler_config`; any other
keyword is ignored.

See also: [`@device_code_agx`](@ref)
"""
function code_agx(io::IO, @nospecialize(func::Base.Callable), @nospecialize(types),
                  kernel::Bool=true; kwargs...)
    # only the compiler-related keywords are consumed here; the remainder is dropped
    compiler_kwargs, _ = split_kwargs_runtime(kwargs, COMPILER_KWARGS)
    source = methodinstance(typeof(func), Base.to_tuple_type(types))
    config = compiler_config(device(); kernel, compiler_kwargs...)
    job = CompilerJob(source, config)
    return code_agx(io, job)
end

# Print the disassembly of an already-configured kernel job to `io`.
#
# The job is compiled and linked, its function is added to a Metal binary archive, the
# archive is written to a temporary directory, and every non-empty function that
# `extract_gpu_code` finds in it is handed to `disassemble`.
@autoreleasepool function code_agx(io::IO, job::MetalCompilerJob)
    job.config.kernel || error("Can only generate AGX code for kernel functions")

    # compile and link the kernel, asking `link` to also return the function object
    # XXX: can we re-use this pipeline?
    compiled = compile(job)
    _, fun = link(job, compiled; return_function=true)

    # register the function with a pipeline descriptor
    pipeline_desc = MTLComputePipelineDescriptor()
    pipeline_desc.computeFunction = fun

    # create a binary archive containing that pipeline
    archive_desc = MTLBinaryArchiveDescriptor()
    archive = MTLBinaryArchive(device(), archive_desc)
    add_functions!(archive, pipeline_desc)

    mktempdir() do dir
        # serialize the archive to a file
        archive_path = joinpath(dir, "kernel.macho")
        write(archive_path, archive)

        # disassemble each function, separating consecutive listings by a blank line
        printed_any = false
        i = 0
        extract_gpu_code(archive_path) do name, code
            # skip all-zero functions
            all(code .== 0) && return

            i += 1
            file = joinpath(dir, "function$(i).bin")
            write(file, code)

            printed_any && println(io)
            println(io, "$name:")
            print(io, disassemble(file))

            printed_any = true
        end
    end
end

# Machine-type identifiers, stored as `UInt32`. `extract_gpu_code` compares the `cputype`
# field of a Mach-O header against `AppleGPU` to locate the GPU slice of a binary archive.
@enum GPUMachineType::UInt32 begin
    AppleGPU = 0x1000013
    AMDGPU   = 0x1000014
    IntelGPU = 0x1000015
    AIR64    = 0x1000017
end

"""
    extract_gpu_code(f, binary)

Open the universal Mach-O file at path `binary`, locate its Apple GPU slice, and call
`f(name, code)` once for every symbol of the native GPU binary embedded in that slice.

`name` is `"\$kernel_name.\$symbol_name"`, where `kernel_name` is taken from the embedded
Metal library when it contains exactly one function and is `"unknown_kernel"` otherwise.
`code` is the part of the `__TEXT,__text` section that starts at the symbol and runs up
to the next symbol (or to the end of the section for the last symbol).

Throws an error when the file is not a universal binary or when an expected architecture
or section is missing. Returns `nothing`.
"""
function extract_gpu_code(f, binary)
    # the object handles read from the stream, so keep it open for the whole extraction
    # and make sure it is closed afterwards
    open(binary) do io
        fat_handle = readmeta(io)
        fat_handle isa FatMachOHandle ||
            error("Expected a universal binary, got a $(typeof(fat_handle))")

        # the universal binary contains several architectures; extract the GPU one.
        # compare raw values: converting an unknown cputype to the enum would throw
        arch = findfirst(fat_handle) do arch
            arch.header isa MachO.MachOHeader64 &&
                arch.header.cputype == UInt32(AppleGPU)
        end
        arch === nothing && error("Could not find GPU architecture in universal binary")

        # the GPU binary contains several sections...
        ## ... extract the compute section, which is another Mach-O binary
        compute_section = findfirst(Sections(fat_handle[arch]), "__TEXT,__compute")
        compute_section === nothing &&
            error("Could not find __compute section in GPU binary")
        compute_binary = read(compute_section)
        native_handle = only(readmeta(IOBuffer(compute_binary)))
        ## ... extract the metallib section, which is a Metal library
        metallib_section = findfirst(Sections(fat_handle[arch]), "__TEXT,__metallib")
        metallib_section === nothing &&
            error("Could not find __metallib section in GPU binary")
        metallib_binary = read(metallib_section)
        metallib = read(IOBuffer(metallib_binary), MetalLib)
        # TODO: use this to implement a do-block device_code_air like CUDA.jl?

        # identify the kernel name
        # XXX: does it happen that these metallibs contain multiple functions?
        kernel_name = if length(metallib.functions) == 1
            metallib.functions[1].name
        else
            "unknown_kernel"
        end
        # XXX: we used to be able to identify the kernel by looking at symbols in
        #      the fat binary, one of which (named `<kernel>_begin`) aliased with the
        #      start of the compute section. these symbols have disappeared on macOS 15.

        # within the native GPU binary, isolate the section containing code
        section = findfirst(Sections(native_handle), "__TEXT,__text")
        isnothing(section) && error("Could not find __TEXT,__text section")
        code = read(section)

        # get all symbols, sorted by address, so that each function ends where the
        # next symbol starts
        symbols = sort(collect(Symbols(native_handle)), by=symbol_value)
        for (idx, sym) in enumerate(symbols)
            offset = symbol_value(sym)
            stop = if idx < length(symbols)
                # up until the next symbol
                symbol_value(symbols[idx + 1])
            else
                # up until the end of the section
                section_size(section)
            end
            f("$kernel_name.$(symbol_name(sym))", code[offset + 1 : stop])
        end
    end
    return nothing
end

# Run the `disassemble.py` script from the `applegpu` artifact on the file at `path`,
# using the interpreter returned by `python()`, and return what it prints to stdout.
function disassemble(path)
    return sprint() do io
        disassembler = joinpath(only(readdir(artifact"applegpu"; join=true)),
                                "disassemble.py")
        run(pipeline(`$(python()) $disassembler $path`, stdout=io))
    end
end

# Convenience method: print to `stdout` when no `io` is given.
function code_agx(@nospecialize(func::Base.Callable), @nospecialize(types); kwargs...)
    return code_agx(stdout, func, types; kwargs...)
end

# `code_native` is a second name for the same function.
const code_native = code_agx

# forward the rest to GPUCompiler with an appropriate CompilerJob
for method in (:code_typed, :code_warntype, :code_llvm)
for method in (:code_typed, :code_warntype, :code_llvm, :code_native)
# only code_typed doesn't take a io argument
args = method === :code_typed ? (:job,) : (:io, :job)

Expand All @@ -191,37 +38,19 @@ for method in (:code_typed, :code_warntype, :code_llvm)
end
end


#
# @device_code_* functions
#

export @device_code_lowered, @device_code_typed, @device_code_warntype,
@device_code_llvm, @device_code_native, @device_code_agx, @device_code

"""
    @device_code_agx [io::IO=stdout, ...] ex

Evaluate the expression `ex` and print the result of [`Metal.code_agx`](@ref) to `io` for
every compiled Metal kernel. For other supported keywords, see [`Metal.code_agx`](@ref).
"""
macro device_code_agx(ex...)
# hook handed to GPUCompiler: for a compiler job it prints a `; <job>` header line and a
# blank line to `io`, followed by the output of `code_agx` for that job.
# NOTE(review): the `code_agx(io, job)` method in this file takes no keyword arguments,
# so any keyword other than `io` would presumably raise a MethodError here -- confirm.
function hook(job::MetalCompilerJob; io::IO=stdout, kwargs...)
println(io, "; $job")
println(io)
code_agx(io, job; kwargs...)
end
# the macro result is whatever expression GPUCompiler builds from the hook and `ex`
GPUCompiler.emit_hooked_compilation(hook, ex...)
end

const var"@device_code_native" = var"@device_code_agx"
@device_code_llvm, @device_code_metal, @device_code

# forward to GPUCompiler: bind each macro name on the left to the GPUCompiler macro named
# on the right (note that `@device_code_metal` maps to GPUCompiler's `@device_code_native`)
for (alias, target) in (
        "@device_code_lowered"  => "@device_code_lowered",
        "@device_code_typed"    => "@device_code_typed",
        "@device_code_warntype" => "@device_code_warntype",
        "@device_code_llvm"     => "@device_code_llvm",
        "@device_code_metal"    => "@device_code_native",
        "@device_code"          => "@device_code",
    )
    @eval $(Symbol(alias)) = $(getfield(GPUCompiler, Symbol(target)))
end


Expand Down
5 changes: 0 additions & 5 deletions src/initialization.jl
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,6 @@ function __init__()
@warn "Metal.jl has not been tested on macOS 16 or later, you may run into issues."
end

# we use Python_jll, but don't actually want its environment to be active
# (this breaks the call to pygmentize in GPUCompiler).
# XXX: the JLL should only set PYTHONHOME when the executable is called
delete!(ENV, "PYTHONHOME")

if Base.JLOptions().debug_level >= 2
# enable Metal API validation
ENV["MTL_DEBUG_LAYER"] = "1"
Expand Down
3 changes: 0 additions & 3 deletions test/execution.jl
Original file line number Diff line number Diff line change
Expand Up @@ -59,13 +59,11 @@ end
Metal.code_typed(dummy, Tuple{})
Metal.code_warntype(devnull, dummy, Tuple{})
Metal.code_llvm(devnull, dummy, Tuple{})
shader_validation || Metal.code_agx(devnull, dummy, Tuple{})

@device_code_lowered @metal dummy()
@device_code_typed @metal dummy()
@device_code_warntype io=devnull @metal dummy()
@device_code_llvm io=devnull @metal dummy()
shader_validation || @device_code_agx io=devnull @metal dummy()

mktempdir() do dir
@device_code dir=dir @metal dummy()
Expand All @@ -76,7 +74,6 @@ end
# make sure kernel name aliases are preserved in the generated code
@test occursin("dummy", sprint(io->(@device_code_llvm io=io optimize=false @metal dummy())))
@test occursin("dummy", sprint(io->(@device_code_llvm io=io @metal dummy())))
shader_validation || @test occursin("dummy", sprint(io->(@device_code_agx io=io @metal dummy())))

# make sure invalid kernels can be partially reflected upon
let
Expand Down

0 comments on commit de8eed5

Please sign in to comment.