From 96e9ce8f9ec20e81714cfec7811c2c96933288ab Mon Sep 17 00:00:00 2001
From: Avik Pal
Date: Fri, 1 Nov 2024 21:22:00 -0400
Subject: [PATCH] perf: add direct Lux + CUDA numbers

---
 perf/Project.toml                        |  1 +
 perf/README.md                           | 12 +++---
 perf/resnet/lux.jl                       | 55 ++++++++++++++++++++++++
 perf/resnet/{main.jl => lux_reactant.jl} |  2 +-
 4 files changed, 63 insertions(+), 7 deletions(-)
 create mode 100644 perf/resnet/lux.jl
 rename perf/resnet/{main.jl => lux_reactant.jl} (97%)

diff --git a/perf/Project.toml b/perf/Project.toml
index 70fbf5734..e7f44f72d 100644
--- a/perf/Project.toml
+++ b/perf/Project.toml
@@ -4,6 +4,7 @@ BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 Boltz = "4544d5e4-abc5-4dea-817f-29e4c205d9c8"
 Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
+LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda"
 Metalhead = "dbeba491-748d-5e0e-a39e-b530a07fa0cc"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Reactant = "3c362404-f566-11ee-1572-e11a4b42c853"
diff --git a/perf/README.md b/perf/README.md
index ef3c59945..ad248f1ff 100644
--- a/perf/README.md
+++ b/perf/README.md
@@ -6,9 +6,9 @@
 
 Benchmark was run on a single NVIDIA RTX 4050 GPU with 6GB of memory.
 
-| Batch Size | Best Timing (Flax) | Best Timing (Lux + Reactant) |
-| ---------- | ------------------ | ---------------------------- |
-| 1          | 0.00403 s          | 0.00057587 s                 |
-| 4          | 0.00788 s          | 0.000712372 s                |
-| 32         | 0.05146 s          | 0.000810471 s                |
-| 128        | 0.20071 s          | 0.009914158 s                |
+| Batch Size | Best Timing (Flax) | Best Timing (Lux + Reactant) | Best Timing (Lux) |
+| ---------- | ------------------ | ---------------------------- | ----------------- |
+| 1          | 0.00403 s          | 0.000575870 s                | 0.004382536 s     |
+| 4          | 0.00788 s          | 0.000712372 s                | 0.011562075 s     |
+| 32         | 0.05146 s          | 0.000810471 s                | 0.103826668 s     |
+| 128        | 0.20071 s          | 0.009914158 s                | 0.430018518 s     |
diff --git a/perf/resnet/lux.jl b/perf/resnet/lux.jl
new file mode 100644
index 000000000..872d40358
--- /dev/null
+++ b/perf/resnet/lux.jl
@@ -0,0 +1,55 @@
+using ArgParse, BenchmarkTools
+import Metalhead
+using Lux, LuxCUDA, Random, Boltz
+
+function parse_commandline()
+    s = ArgParseSettings()
+
+    #! format: off
+    @add_arg_table! s begin
+        "--batch-size"
+            help = "Batch size"
+            arg_type = Vector{Int}
+            default = [1, 4, 32, 128]
+
+        "--model-size"
+            help = "Model size"
+            arg_type = Int
+            default = 50
+    end
+    #! format: on
+
+    return parse_args(s)
+end
+
+function main()
+    parsed_args = parse_commandline()
+    dev = gpu_device(; force=true)
+
+    model = Vision.ResNet(parsed_args["model-size"])
+    ps, st = Lux.setup(Random.default_rng(), model)
+    ps_ra = ps |> dev
+    st_ra = Lux.testmode(st) |> dev
+
+    println("Param count: $(Lux.parameterlength(ps_ra))")
+    println("State count: $(Lux.statelength(st_ra))")
+
+    timings = Dict{Int, Float64}()
+
+    for b in parsed_args["batch-size"]
+        println("batch_size=$b")
+
+        x = rand(Float32, 224, 224, 3, b) |> dev
+
+        timings[b] = @belapsed begin
+            y, _ = $(model)($(x), $(ps_ra), $(st_ra))
+            CUDA.synchronize()
+        end
+
+        println("Best timing: $(timings[b]) s")
+    end
+
+    println(timings)
+end
+
+main()
diff --git a/perf/resnet/main.jl b/perf/resnet/lux_reactant.jl
similarity index 97%
rename from perf/resnet/main.jl
rename to perf/resnet/lux_reactant.jl
index 47cec54a2..805bc5d5b 100644
--- a/perf/resnet/main.jl
+++ b/perf/resnet/lux_reactant.jl
@@ -26,7 +26,7 @@ end
 
 function main()
     parsed_args = parse_commandline()
-    dev = xla_device()
+    dev = xla_device(; force=true)
 
     model = Vision.ResNet(parsed_args["model-size"])
     ps, st = Lux.setup(Random.default_rng(), model)