Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove device_code_agx #512

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 0 additions & 6 deletions Artifacts.toml

This file was deleted.

9 changes: 4 additions & 5 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ version = "1.4.0"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
Artifacts = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82"
CodecBzip2 = "523fee87-0ab8-5b00-afb7-3ecf72e48cfd"
ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
Expand All @@ -14,12 +13,10 @@ KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
LLVMDowngrader_jll = "f52de702-fb25-5922-94ba-81dd59b07444"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
ObjectFile = "d8793406-e978-5875-9003-1fc021f44a92"
ObjectiveC = "e86c9b32-1129-44ac-8ea0-90d5bb39ded9"
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
Preferences = "21216c6a-2e73-6563-6e65-726566657250"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Python_jll = "93d3a430-8e7c-50da-8e8d-3dfcfb3baf05"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
Expand All @@ -35,7 +32,6 @@ SpecialFunctionsExt = "SpecialFunctions"

[compat]
Adapt = "4"
Artifacts = "1"
BFloat16s = "0.5"
CEnum = "0.4, 0.5"
CodecBzip2 = "0.8.5"
Expand All @@ -45,11 +41,14 @@ GPUCompiler = "0.26, 0.27, 1"
KernelAbstractions = "0.9.1"
LLVM = "7.2, 8, 9"
LLVMDowngrader_jll = "0.6"
ObjectFile = "0.4"
LinearAlgebra = "1"
ObjectiveC = "2.1, 3"
PrecompileTools = "1"
Preferences = "1"
Printf = "1"
Random = "1"
SHA = "0.7"
SpecialFunctions = "2"
StaticArrays = "1"
UUIDs = "1"
julia = "1.10"
5 changes: 2 additions & 3 deletions docs/src/api/compiler.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,9 @@ the InteractiveUtils standard library:
@device_code_typed
@device_code_warntype
@device_code_llvm
@device_code_air
@device_code_native
@device_code_agx
@device_code
```

For more information, please consult the GPUCompiler.jl documentation. `code_agx` is
actually `code_native`:
For more information, please consult the GPUCompiler.jl documentation.
2 changes: 1 addition & 1 deletion perf/latency.jl
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ function main()
ttfp_cmd =
`$base_cmd -e "using Metal
kernel() = return
Metal.code_agx(devnull, kernel, Tuple{}; kernel=true)"`
Metal.code_native(devnull, kernel, Tuple{}; kernel=true)"`
results["ttfp"] = @benchmark run($ttfp_cmd) evals=1 seconds=60

results
Expand Down
3 changes: 0 additions & 3 deletions src/Metal.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,7 @@ using LLVM
using LLVM.Interop
import LLVMDowngrader_jll
using Preferences: @load_preference, load_preference
using Python_jll
using ObjectFile
using ExprTools: splitdef, combinedef
using Artifacts
using ObjectiveC, .CoreFoundation, .Foundation, .Dispatch, .OS
import KernelAbstractions

Expand Down
7 changes: 2 additions & 5 deletions src/compiler/compilation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -176,8 +176,7 @@ function compile(@nospecialize(job::CompilerJob))
end

# link into an executable kernel
@autoreleasepool function link(@nospecialize(job::CompilerJob), compiled;
return_function=false)
@autoreleasepool function link(@nospecialize(job::CompilerJob), compiled)
@signpost_event log=log_compiler() "Link" "Job=$job"

@signpost_interval log=log_compiler() "Instantiate compute pipeline" begin
Expand Down Expand Up @@ -211,7 +210,5 @@ end
end
end

# most of the time, we don't need the function object,
# so don't keep it alive unconditionally in GPUCompiler's caches
pipeline_state, return_function ? fun : nothing
pipeline_state
end
2 changes: 1 addition & 1 deletion src/compiler/execution.jl
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ function mtlfunction(f::F, tt::TT=Tuple{}; name=nothing, kwargs...) where {F,TT}
cache = compiler_cache(dev)
source = methodinstance(F, tt)
config = compiler_config(dev; name, kwargs...)::MetalCompilerConfig
pipeline, _ = GPUCompiler.cached_compilation(cache, source, config, compile, link)
pipeline = GPUCompiler.cached_compilation(cache, source, config, compile, link)

# create a callable object that captures the function instance. we don't need to think
# about world age here, as GPUCompiler already does and will return a different object
Expand Down
177 changes: 3 additions & 174 deletions src/compiler/reflection.jl
Original file line number Diff line number Diff line change
Expand Up @@ -19,161 +19,8 @@ function split_kwargs_runtime(kwargs, wanted::Vector{Symbol})
return extracted, remaining
end

"""
code_agx([io], f, types, cap::VersionNumber)

Prints the AGX code generated for the method matching the given generic function and type
signature to `io` which defaults to `stdout`.

See also: [`@device_code_agx`](@ref)
"""
function code_agx(io::IO, @nospecialize(func::Base.Callable), @nospecialize(types),
kernel::Bool=true; kwargs...)
compiler_kwargs, kwargs = split_kwargs_runtime(kwargs, COMPILER_KWARGS)
source = methodinstance(typeof(func), Base.to_tuple_type(types))
config = compiler_config(device(); kernel, compiler_kwargs...)
job = CompilerJob(source, config)
code_agx(io, job)
end

# Compile `job`, serialize the resulting function into a Metal binary archive, and print a
# disassembly of every non-empty GPU function found in that archive to `io`.
# Errors when `job` is not configured as a kernel.
@autoreleasepool function code_agx(io::IO, job::MetalCompilerJob)
    if !job.config.kernel
        error("Can only generate AGX code for kernel functions")
    end

    # compile the kernel, asking the linker to also hand back the function object
    compiled = compile(job)
    _, fun = link(job, compiled; return_function=true)
    # XXX: can we re-use the pipeline returned by `link`?

    # register it with a pipeline descriptor
    pipeline_desc = MTLComputePipelineDescriptor()
    pipeline_desc.computeFunction = fun

    # create a binary archive
    bin_desc = MTLBinaryArchiveDescriptor()
    bin = MTLBinaryArchive(device(), bin_desc)
    add_functions!(bin, pipeline_desc)

    mktempdir() do dir
        # serialize the archive to a file
        binary = joinpath(dir, "kernel.macho")
        write(binary, bin)

        # number of functions emitted so far; a `Ref` so the closure below mutates it
        # in place instead of reassigning (and thereby boxing) a captured local
        emitted = Ref(0)
        extract_gpu_code(binary) do name, code
            # skip all-zero functions
            all(iszero, code) && return

            emitted[] += 1
            file = joinpath(dir, "function$(emitted[]).bin")
            write(file, code)

            # separate consecutive functions with a blank line
            emitted[] > 1 && println(io)
            println(io, "$name:")
            print(io, disassemble(file))
            return
        end
    end
end

# Machine type identifiers, stored as UInt32; `extract_gpu_code` compares them against the
# `cputype` field of 64-bit Mach-O headers to find the GPU slice of a universal binary.
@enum GPUMachineType::UInt32 AppleGPU=0x1000013 AMDGPU=0x1000014 IntelGPU=0x1000015 AIR64=0x1000017

"""
    extract_gpu_code(f, binary)

Read the universal Mach-O file at path `binary`, locate its Apple GPU slice, and call
`f(name, code)` once for every symbol of the native GPU binary embedded in the slice's
`__TEXT,__compute` section.

`name` is `"<kernel>.<symbol>"`, where `<kernel>` comes from the embedded Metal library
(or is `unknown_kernel` when that library does not contain exactly one function). `code`
is the part of the native binary's `__TEXT,__text` section contents that starts at the
symbol's address and runs up to the next symbol, or to the end of the section for the last
symbol.

Raises an error when `binary` is not a universal binary, or when the GPU architecture or
one of the expected sections cannot be found. Returns `nothing`.
"""
function extract_gpu_code(f, binary)
    # keep the file open only for as long as we read from it, so the handle is released
    # deterministically instead of whenever the stream gets finalized
    open(binary) do io
        fat_handle = readmeta(io)
        fat_handle isa FatMachOHandle ||
            error("Expected a universal binary, got a $(typeof(fat_handle))")

        # the universal binary contains several architectures; extract the GPU one
        arch = findfirst(fat_handle) do arch
            arch.header isa MachO.MachOHeader64 &&
                GPUMachineType(arch.header.cputype) == AppleGPU
        end
        arch === nothing && error("Could not find GPU architecture in universal binary")

        # the GPU binary contains several sections...
        ## ... extract the compute section, which is another Mach-O binary
        compute_section = findfirst(Sections(fat_handle[arch]), "__TEXT,__compute")
        compute_section === nothing &&
            error("Could not find __compute section in GPU binary")
        compute_binary = read(compute_section)
        native_handle = only(readmeta(IOBuffer(compute_binary)))
        ## ... extract the metallib section, which is a Metal library
        metallib_section = findfirst(Sections(fat_handle[arch]), "__TEXT,__metallib")
        metallib_section === nothing &&
            error("Could not find __metallib section in GPU binary")
        metallib_binary = read(metallib_section)
        metallib = read(IOBuffer(metallib_binary), MetalLib)
        # TODO: use this to implement a do-block device_code_air like CUDA.jl?

        # identify the kernel name
        kernel_name = "unknown_kernel"
        # XXX: does it happen that these metallibs contain multiple functions?
        if length(metallib.functions) == 1
            kernel_name = metallib.functions[1].name
        end
        # XXX: we used to be able to identify the kernel by looking at symbols in
        #      the fat binary, one of which aliased with the start of the compute
        #      section. these symbols have disappeared on macOS 15.
        #compute_symbol = nothing
        #for symbol in Symbols(fat_handle[arch])
        #    symbol_value(symbol) == section_offset(compute_section) || continue
        #    endswith(symbol_name(symbol), "_begin") || continue
        #    compute_symbol = symbol
        #end
        #compute_symbol === nothing && error("Could not find symbol for __compute section")
        #kernel_name = symbol_name(compute_symbol)[1:end-6]

        # within the native GPU binary, isolate the section containing code
        section = findfirst(Sections(native_handle), "__TEXT,__text")
        section === nothing && error("Could not find __TEXT,__text section")

        # get all symbols, and sort them by address
        symbols = sort(collect(Symbols(native_handle)), by=symbol_value)

        # hand every function to the callback; walking the sorted vector by index gives
        # direct access to the successor, so no per-symbol search is needed
        code = read(section)
        for (idx, sym) in enumerate(symbols)
            start = symbol_value(sym)
            stop = if idx < length(symbols)
                # up until the next symbol
                symbol_value(symbols[idx + 1])
            else
                # up until the end of the section
                section_size(section)
            end
            f("$kernel_name.$(symbol_name(sym))", code[start + 1 : stop])
        end
    end
    return
end

# Run the `disassemble.py` script from the `applegpu` artifact on the file at `path`,
# and return everything the script wrote to its standard output as a `String`.
function disassemble(path)
    # the artifact is expected to contain exactly one top-level entry
    artifact_root = only(readdir(artifact"applegpu"; join=true))
    script = joinpath(artifact_root, "disassemble.py")

    output = IOBuffer()
    cmd = `$(python()) $script $path`
    run(pipeline(cmd; stdout=output))
    return String(take!(output))
end

# convenience method: same as the `io`-taking method, printing to `stdout`
function code_agx(@nospecialize(func::Base.Callable), @nospecialize(types); kwargs...)
    return code_agx(stdout, func, types; kwargs...)
end

const code_native = code_agx

# forward the rest to GPUCompiler with an appropriate CompilerJob
for method in (:code_typed, :code_warntype, :code_llvm)
for method in (:code_typed, :code_warntype, :code_llvm, :code_native)
# only code_typed doesn't take a io argument
args = method === :code_typed ? (:job,) : (:io, :job)

Expand All @@ -191,37 +38,19 @@ for method in (:code_typed, :code_warntype, :code_llvm)
end
end


#
# @device_code_* functions
#

export @device_code_lowered, @device_code_typed, @device_code_warntype,
@device_code_llvm, @device_code_native, @device_code_agx, @device_code

"""
@device_code_agx [io::IO=stdout, ...] ex

Evaluates the expression `ex` and prints the result of [`Metal.code_agx`](@ref) to
`io` for every compiled Metal kernel. For other supported keywords, see
[`Metal.code_agx`](@ref).
"""
macro device_code_agx(ex...)
function hook(job::MetalCompilerJob; io::IO=stdout, kwargs...)
println(io, "; $job")
println(io)
code_agx(io, job; kwargs...)
end
GPUCompiler.emit_hooked_compilation(hook, ex...)
end

const var"@device_code_native" = var"@device_code_agx"
@device_code_llvm, @device_code_metal, @device_code

# forward to GPUCompiler: bind each macro name on the left to the GPUCompiler macro named
# on the right (note that `@device_code_metal` is bound to `@device_code_native`)
for (alias, target) in (
        "@device_code_lowered"  => "@device_code_lowered",
        "@device_code_typed"    => "@device_code_typed",
        "@device_code_warntype" => "@device_code_warntype",
        "@device_code_llvm"     => "@device_code_llvm",
        "@device_code_metal"    => "@device_code_native",
        "@device_code"          => "@device_code",
    )
    @eval $(Symbol(alias)) = $(getfield(GPUCompiler, Symbol(target)))
end


Expand Down
5 changes: 0 additions & 5 deletions src/initialization.jl
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,6 @@ function __init__()
@warn "Metal.jl has not been tested on macOS 16 or later, you may run into issues."
end

# we use Python_jll, but don't actually want its environment to be active
# (this breaks the call to pygmentize in GPUCompiler).
# XXX: the JLL should only set PYTHONHOME when the executable is called
delete!(ENV, "PYTHONHOME")

if Base.JLOptions().debug_level >= 2
# enable Metal API validation
ENV["MTL_DEBUG_LAYER"] = "1"
Expand Down
3 changes: 0 additions & 3 deletions test/execution.jl
Original file line number Diff line number Diff line change
Expand Up @@ -59,13 +59,11 @@ end
Metal.code_typed(dummy, Tuple{})
Metal.code_warntype(devnull, dummy, Tuple{})
Metal.code_llvm(devnull, dummy, Tuple{})
shader_validation || Metal.code_agx(devnull, dummy, Tuple{})

@device_code_lowered @metal dummy()
@device_code_typed @metal dummy()
@device_code_warntype io=devnull @metal dummy()
@device_code_llvm io=devnull @metal dummy()
shader_validation || @device_code_agx io=devnull @metal dummy()

mktempdir() do dir
@device_code dir=dir @metal dummy()
Expand All @@ -76,7 +74,6 @@ end
# make sure kernel name aliases are preserved in the generated code
@test occursin("dummy", sprint(io->(@device_code_llvm io=io optimize=false @metal dummy())))
@test occursin("dummy", sprint(io->(@device_code_llvm io=io @metal dummy())))
shader_validation || @test occursin("dummy", sprint(io->(@device_code_agx io=io @metal dummy())))

# make sure invalid kernels can be partially reflected upon
let
Expand Down
Loading