pytorch · ebsmothers · Nov 1, 2024 · Nov 1, 2024 · felipemello1 · Nov 1, 2024
diff --git a/torchtune/config/_utils.py b/torchtune/config/_utils.py
@@ -173,6 +173,11 @@ def _merge_yaml_and_cli_args(yaml_args: Namespace, cli_args: List[str]) -> DictC
         # key string to reflect this
         if k in yaml_kwargs and _has_component(yaml_kwargs[k]):
             k += "._component_"
+
+        # None passed via CLI will be parsed as string, but we really want OmegaConf null
+        if v == "None":
+            v = "!!null"
+
         # TODO: this is a hack but otherwise we can't pass strings with leading zeroes
         # to define the checkpoint file format. We manually override OmegaConf behavior
         # by prepending the value with !!str to force a string type

diff --git a/torchtune/training/_grad_scaler.py b/torchtune/training/_grad_scaler.py
@@ -21,6 +21,11 @@ def scale_grads(model: nn.Module, scaler: torch.Tensor) -> None:
     Outputs:
         None (grad fields are modified in place)
     """
+    device = None
     for p in model.parameters():
+        # First ensure scaler is on the same device as the model
+        if not device:
+            device = p.device
+            scaler = scaler.to(device)
         if p.grad is not None:
             p.grad *= scaler