From c81629b7a62522809d91ab7aec0a9d6249dbd5aa Mon Sep 17 00:00:00 2001
From: Avik Pal
Date: Wed, 8 Jan 2025 15:37:45 -0500
Subject: [PATCH] feat: allow no grad option for reactant (#1190)

---
 examples/ConditionalVAE/main.jl      |  3 +-
 ext/LuxReactantExt/LuxReactantExt.jl |  2 +-
 ext/LuxReactantExt/training.jl       | 27 +++++++++++------
 src/helpers/training.jl              | 45 ++++++++++++++++++++--------
 4 files changed, 53 insertions(+), 24 deletions(-)

diff --git a/examples/ConditionalVAE/main.jl b/examples/ConditionalVAE/main.jl
index 380ae1d39..d9813e833 100644
--- a/examples/ConditionalVAE/main.jl
+++ b/examples/ConditionalVAE/main.jl
@@ -244,7 +244,8 @@ function main(; batchsize=128, image_size=(64, 64), num_latent_dims=8, max_num_f
         start_time = time()
         for (i, X) in enumerate(train_dataloader)
             (_, loss, _, train_state) = Training.single_train_step!(
-                AutoEnzyme(), loss_function, X, train_state)
+                AutoEnzyme(), loss_function, X, train_state; return_gradients=Val(false)
+            )
 
             loss_total += loss
             total_samples += size(X, ndims(X))
diff --git a/ext/LuxReactantExt/LuxReactantExt.jl b/ext/LuxReactantExt/LuxReactantExt.jl
index 93a1b1279..292132759 100644
--- a/ext/LuxReactantExt/LuxReactantExt.jl
+++ b/ext/LuxReactantExt/LuxReactantExt.jl
@@ -4,7 +4,7 @@ using Enzyme: Enzyme, Const, Duplicated, Active
 using Optimisers: Optimisers
 using Reactant: Reactant, @compile, @code_hlo, AnyTracedRArray, TracedRArray, TracedRNumber
 using Setfield: @set!
-using Static: False
+using Static: True, False
 
 using Lux: Lux, LuxOps, Training, Utils
 using Lux.Training: TrainingBackendCache, ReactantBackend
diff --git a/ext/LuxReactantExt/training.jl b/ext/LuxReactantExt/training.jl
index 2462bd252..d6c0c1c8d 100644
--- a/ext/LuxReactantExt/training.jl
+++ b/ext/LuxReactantExt/training.jl
@@ -55,7 +55,7 @@ function Lux.Training.compute_gradients_impl(
 end
 
 function Lux.Training.compute_gradients_impl(::ReactantBackend, obj_fn::F, data,
-        ts::Training.TrainState{<:TrainingBackendCache{ReactantBackend}, F}) where {F}
+        ts::Training.TrainState{<:TrainingBackendCache{<:ReactantBackend}, F}) where {F}
     grads, loss, stats, st = ts.cache.extras.compiled_gradient_function(
         obj_fn, ts.model, data, ts.parameters, ts.states)
     @set! ts.states = st
@@ -70,7 +70,7 @@ for inplace in ("!", "")
 
     # Ideally users never hit this dispatch but it is still good to have as a fallback
     @eval function Lux.Training.$(apply_gradients_fn)(
-            ts::Training.TrainState{<:TrainingBackendCache{ReactantBackend}}, grads
+            ts::Training.TrainState{<:TrainingBackendCache{<:ReactantBackend}}, grads
     )
         if hasfield(typeof(ts.cache.extras), :update_function)
             update_function = ts.cache.extras.update_function
@@ -94,15 +94,15 @@ for inplace in ("!", "")
     @eval function Lux.Training.$(fname)(backend::ReactantBackend, objective_function::F,
            data, ts::Training.TrainState) where {F}
         maybe_dump_to_mlir_file!($(internal_fn), objective_function, ts.model, data,
-            ts.parameters, ts.states, ts.optimizer_state)
+            ts.parameters, ts.states, ts.optimizer_state, backend.return_gradients)
 
         compiled_grad_and_step_function = @compile $(internal_fn)(
             objective_function, ts.model, data, ts.parameters, ts.states,
-            ts.optimizer_state)
+            ts.optimizer_state, backend.return_gradients)
 
         grads, ps, loss, stats, st, opt_state = compiled_grad_and_step_function(
             objective_function, ts.model, data, ts.parameters, ts.states,
-            ts.optimizer_state)
+            ts.optimizer_state, backend.return_gradients)
 
         cache = TrainingBackendCache(
             backend, False(), nothing, (; compiled_grad_and_step_function))
@@ -116,10 +116,11 @@ for inplace in ("!", "")
         return grads, loss, stats, ts
     end
 
-    @eval function Lux.Training.$(fname)(::ReactantBackend, obj_fn::F, data,
-            ts::Training.TrainState{<:TrainingBackendCache{ReactantBackend}, F}) where {F}
+    @eval function Lux.Training.$(fname)(backend::ReactantBackend, obj_fn::F, data,
+            ts::Training.TrainState{<:TrainingBackendCache{<:ReactantBackend}, F}) where {F}
         grads, ps, loss, stats, st, opt_state = ts.cache.extras.compiled_grad_and_step_function(
-            obj_fn, ts.model, data, ts.parameters, ts.states, ts.optimizer_state)
+            obj_fn, ts.model, data, ts.parameters, ts.states,
+            ts.optimizer_state, backend.return_gradients)
 
         @set! ts.states = st
         @set! ts.parameters = ps
@@ -131,7 +132,15 @@ for inplace in ("!", "")
 
     # XXX: Inplace version not actually inplace
     @eval function $(internal_fn)(
-            objective_function::F, model, data, ps, st, opt_state) where {F}
+            objective_function::F, model, data, ps, st, opt_state, ::False) where {F}
+        dps, loss, stats, stₙ = compute_gradients_internal(
+            objective_function, model, data, ps, st)
+        opt_state, ps = Optimisers.$(update_fn)(opt_state, ps, dps)
+        return nothing, ps, loss, stats, stₙ, opt_state
+    end
+
+    @eval function $(internal_fn)(
+            objective_function::F, model, data, ps, st, opt_state, ::True) where {F}
         dps, loss, stats, stₙ = compute_gradients_internal(
             objective_function, model, data, ps, st)
         opt_state, ps = Optimisers.$(update_fn)(opt_state, ps, dps)
diff --git a/src/helpers/training.jl b/src/helpers/training.jl
index c11f74b93..45b93b49d 100644
--- a/src/helpers/training.jl
+++ b/src/helpers/training.jl
@@ -7,7 +7,7 @@ using FastClosures: @closure
 using Functors: Functors, fmap
 using Optimisers: Optimisers
 using Setfield: @set!
-using Static: StaticBool, Static, False, True
+using Static: StaticBool, Static, False, True, static
 
 using ..Lux: Lux, Utils, ReactantCompatibleOptimisers
 using LuxCore: LuxCore, AbstractLuxLayer
@@ -104,7 +104,9 @@ function Base.show(io::IO, ::MIME"text/plain", ts::TrainState)
     print(io, "\n  objective_function: ", nameof(typeof(ts.objective_function)))
 end
 
-struct ReactantBackend end
+@concrete struct ReactantBackend
+    return_gradients <: StaticBool
+end
 
 const APPLY_GRAD_DOCSTRING = """
 ## Arguments
@@ -198,10 +200,13 @@ function compute_gradients(ad, obj_fn::F, data, ts::TrainState) where {F}
     return compute_gradients_impl(maybe_wrap_adtype(ad, dev_type), obj_fn, data, ts)
 end
 
-maybe_wrap_adtype(backend::ReactantBackend, _) = backend
-maybe_wrap_adtype(ad::AbstractADType, _) = ad
-function maybe_wrap_adtype(ad::AbstractADType, ::Type{ReactantDevice})
-    ad isa AutoEnzyme && return ReactantBackend()
+maybe_wrap_adtype(backend::ReactantBackend, ::Any; kwargs...) = backend
+maybe_wrap_adtype(ad::AbstractADType, ::Any; kwargs...) = ad
+function maybe_wrap_adtype(
+        ad::AbstractADType, ::Type{ReactantDevice};
+        return_gradients::Utils.BoolType=True()
+)
+    ad isa AutoEnzyme && return ReactantBackend(static(return_gradients))
     throw(ArgumentError("Computing gradients for models on XLA is supported only with \
                          Enzyme.jl (`AutoEnzyme`)."))
 end
@@ -258,12 +263,17 @@ function wrap_objective_function(
 end
 
 """
-    single_train_step!(backend, obj_fn::F, data, ts::TrainState)
+    single_train_step!(backend, obj_fn::F, data, ts::TrainState; return_gradients=True())
 
 Perform a single training step. Computes the gradients using [`compute_gradients`](@ref) and
 updates the parameters using [`apply_gradients!`](@ref). All backends supported via
 [`compute_gradients`](@ref) are supported here.
 
+## Keyword Arguments
+
+  - `return_gradients`: If `True()`, the gradients are returned. If `False()`, the returned
+    gradients are `nothing`. Defaults to `True()`. This is only used for Reactant Backend.
+
 ## Return
 
 Returned values are the same as [`compute_gradients`](@ref). Note that despite the `!`,
@@ -271,13 +281,15 @@ only the parameters in `ts` are updated inplace. Users should be using the retur
 object for further training steps, else there is no caching and performance will be
 suboptimal (and absolutely terrible for backends like `AutoReactant`).
 """
-function single_train_step!(backend, obj_fn::F, data, ts::TrainState) where {F}
-    backend = maybe_wrap_adtype(backend, get_device_type((ts.parameters, ts.states)))
+function single_train_step!(backend, obj_fn::F, data, ts::TrainState;
+        return_gradients::Utils.BoolType=True()) where {F}
+    backend = maybe_wrap_adtype(
+        backend, get_device_type((ts.parameters, ts.states)); return_gradients)
     return single_train_step_impl!(backend, obj_fn, data, ts)
 end
 
 """
-    single_train_step(backend, obj_fn::F, data, ts::TrainState)
+    single_train_step(backend, obj_fn::F, data, ts::TrainState; return_gradients=True())
 
 Perform a single training step. Computes the gradients using [`compute_gradients`](@ref) and
 updates the parameters using [`apply_gradients`](@ref). All backends supported via
@@ -285,12 +297,19 @@ updates the parameters using [`apply_gradients`](@ref). All backends supported v
 
 In most cases you should use [`single_train_step!`](@ref) instead of this function.
 
+## Keyword Arguments
+
+  - `return_gradients`: If `True()`, the gradients are returned. If `False()`, the returned
+    gradients are `nothing`. Defaults to `True()`. This is only used for Reactant Backend.
+
 ## Return
 
-Returned values are the same as [`compute_gradients`](@ref).
+Returned values are the same as [`single_train_step!`](@ref).
 """
-function single_train_step(backend, obj_fn::F, data, ts::TrainState) where {F}
-    backend = maybe_wrap_adtype(backend, get_device_type((ts.parameters, ts.states)))
+function single_train_step(backend, obj_fn::F, data, ts::TrainState;
+        return_gradients::Utils.BoolType=True()) where {F}
+    backend = maybe_wrap_adtype(
+        backend, get_device_type((ts.parameters, ts.states)); return_gradients)
     return single_train_step_impl(backend, obj_fn, data, ts)
 end
 
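
A usage sketch of the new keyword from the caller's side. The model, optimiser, data, and device setup below are illustrative placeholders, not taken from this patch; only the `return_gradients` keyword (here passed as `Val(false)`, as in the ConditionalVAE example above) comes from these changes.

```julia
using Lux, Enzyme, Optimisers, Random, Reactant

# Assumes `reactant_device()` (from MLDataDevices) is available in this environment.
xdev = reactant_device()

model = Dense(4 => 2)
ps, st = Lux.setup(Random.default_rng(), model) |> xdev
train_state = Training.TrainState(model, ps, st, Adam(0.001f0))

x = rand(Float32, 4, 32) |> xdev
y = rand(Float32, 2, 32) |> xdev

# With `return_gradients=Val(false)` the compiled Reactant step does not hand the
# gradients back to the caller; the first returned value is `nothing`.
grads, loss, stats, train_state = Training.single_train_step!(
    AutoEnzyme(), MSELoss(), (x, y), train_state; return_gradients=Val(false)
)
```

Per the updated docstring, the keyword only affects the Reactant backend; for other backends it is accepted and ignored, and gradients are returned as before.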
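Design note: the flag is stored in `ReactantBackend` as a `Static.StaticBool` and threaded into the compiled step function as a positional `::True` / `::False` argument, so the choice is baked into the traced/compiled program by dispatch rather than decided by a runtime branch. A minimal standalone sketch of that Static.jl pattern (not Lux code, names are illustrative):

```julia
using Static: True, False, static

# Two methods selected purely by the static flag's type, mirroring (in simplified form)
# the pair of `$(internal_fn)` methods added in ext/LuxReactantExt/training.jl.
maybe_return_grads(dps, ::True) = dps
maybe_return_grads(dps, ::False) = nothing

maybe_return_grads([1.0, 2.0], static(true))   # returns [1.0, 2.0]
maybe_return_grads([1.0, 2.0], static(false))  # returns nothing
```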