pytorch · facebook-github-bot · Nov 8, 2024 · Nov 7, 2024
diff --git a/torchtune/training/_distributed.py b/torchtune/training/_distributed.py
@@ -300,7 +300,9 @@ def load_from_full_model_state_dict(
     for param_name, full_tensor in full_sd.items():
         sharded_meta_param = meta_sharded_sd.get(param_name)
         full_tensor = full_tensor.to(sharded_meta_param.dtype).to(device)
-        if isinstance(sharded_meta_param._local_tensor, NF4Tensor):
+        if hasattr(sharded_meta_param, "_local_tensor") and isinstance(
+            sharded_meta_param._local_tensor, NF4Tensor
+        ):
             full_tensor = to_nf4(full_tensor)
             # replicating logic from `_fsdp_param.py`` `_init_sharded_param`
             # otherwise `distribute_tensor(DTensor(local=NF4))`