Skip to content

Commit

Permalink
perf: add direct Lux + CUDA numbers
Browse files Browse the repository at this point in the history
  • Loading branch information
avik-pal committed Nov 2, 2024
1 parent 5a390e1 commit 96e9ce8
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 7 deletions.
1 change: 1 addition & 0 deletions perf/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
Boltz = "4544d5e4-abc5-4dea-817f-29e4c205d9c8"
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda"
Metalhead = "dbeba491-748d-5e0e-a39e-b530a07fa0cc"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Reactant = "3c362404-f566-11ee-1572-e11a4b42c853"
12 changes: 6 additions & 6 deletions perf/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@

The benchmark was run on a single NVIDIA RTX 4050 GPU with 6 GB of memory.

| Batch Size | Best Timing (Flax) | Best Timing (Lux + Reactant) |
| ---------- | ------------------ | ---------------------------- |
| 1 | 0.00403 s | 0.00057587 s |
| 4 | 0.00788 s | 0.000712372 s |
| 32 | 0.05146 s | 0.000810471 s |
| 128 | 0.20071 s | 0.009914158 s |
| Batch Size | Best Timing (Flax) | Best Timing (Lux + Reactant) | Best Timing (Lux + CUDA) |
| ---------- | ------------------ | ---------------------------- | ------------------------ |
| 1 | 0.00403 s | 0.000575870 s | 0.004382536 s |
| 4 | 0.00788 s | 0.000712372 s | 0.011562075 s |
| 32 | 0.05146 s | 0.000810471 s | 0.103826668 s |
| 128 | 0.20071 s | 0.009914158 s | 0.430018518 s |
55 changes: 55 additions & 0 deletions perf/resnet/lux.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
using ArgParse, BenchmarkTools
import Metalhead
using Lux, LuxCUDA, Random, Boltz

"""
    parse_commandline()

Parse the benchmark's command-line options and return them as a `Dict` keyed by
option name (without the leading dashes).

# Options
- `--batch-size`: one or more batch sizes to benchmark, given as separate
  tokens, e.g. `--batch-size 1 4 32`. Defaults to `[1, 4, 32, 128]`.
- `--model-size`: integer model size. Defaults to `50`.
"""
function parse_commandline()
    s = ArgParseSettings()

    #! format: off
    @add_arg_table! s begin
        "--batch-size"
            help = "Batch size"
            # ArgParse converts each command-line token to `arg_type` on its own.
            # A single string cannot be converted to `Vector{Int}`, so the option
            # is declared as a multi-valued `Int` argument instead; the parsed
            # value is still a `Vector{Int}`.
            arg_type = Int
            nargs = '+'
            default = [1, 4, 32, 128]

        "--model-size"
            help = "Model size"
            arg_type = Int
            default = 50
    end
    #! format: on

    return parse_args(s)
end

"""
    main()

Benchmark the forward pass of a `Vision.ResNet` model on the GPU device.

Reads the options from [`parse_commandline`](@ref), builds the model, moves its
parameters and test-mode states to the device, and then times one forward pass
(followed by a `CUDA.synchronize()`) for every requested batch size using
`@belapsed`. The per-batch timing is printed as it is measured, and the full
`batch size => seconds` dictionary is printed at the end.
"""
function main()
    args = parse_commandline()
    dev = gpu_device(; force=true)

    model = Vision.ResNet(args["model-size"])
    ps, st = Lux.setup(Random.default_rng(), model)

    # Move parameters and states to the device once, outside the timed region.
    ps_gpu = dev(ps)
    st_gpu = dev(Lux.testmode(st))

    println("Param count: $(Lux.parameterlength(ps_gpu))")
    println("State count: $(Lux.statelength(st_gpu))")

    timings = Dict{Int, Float64}()
    batch_sizes = args["batch-size"]

    for b in batch_sizes
        println("batch_size=$b")

        input = dev(rand(Float32, 224, 224, 3, b))

        # Synchronize inside the timed expression so the measurement covers the
        # completed GPU work rather than just the kernel launches.
        elapsed = @belapsed begin
            out, _ = $model($input, $ps_gpu, $st_gpu)
            CUDA.synchronize()
        end
        timings[b] = elapsed

        println("Best timing: $(timings[b]) s")
    end

    println(timings)
end

# Script entry point: run the benchmark when this file is executed.
main()
2 changes: 1 addition & 1 deletion perf/resnet/main.jl → perf/resnet/lux_reactant.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ end

function main()
parsed_args = parse_commandline()
dev = xla_device()
dev = xla_device(; force=true)

model = Vision.ResNet(parsed_args["model-size"])
ps, st = Lux.setup(Random.default_rng(), model)
Expand Down

0 comments on commit 96e9ce8

Please sign in to comment.