2024-12-12 nightly release (9cfa288)
pytorchbot committed Dec 12, 2024
1 parent 6cd1681 commit ba4f453
Showing 13 changed files with 259 additions and 53 deletions.
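The recurring change across these recipes: they stop importing the standalone `load_dora_magnitudes` helper from `torchtune.modules.peft` and instead walk the model, calling `initialize_dora_magnitude()` on any module that defines it. Below is a minimal sketch of that dispatch pattern; `ToyDoRALinear` is a hypothetical stand-in for torchtune's `DoRALinear`, not the real implementation.

```python
import torch
from torch import nn


class ToyDoRALinear(nn.Linear):
    """Hypothetical DoRA-style layer: keeps a learnable magnitude per output row."""

    def __init__(self, in_features: int, out_features: int):
        super().__init__(in_features, out_features, bias=False)
        self.magnitude = nn.Parameter(torch.empty(out_features))

    def initialize_dora_magnitude(self) -> None:
        # Derive the magnitude from the (already loaded) weight's row norms.
        self.magnitude.data = torch.linalg.norm(self.weight.detach(), dim=1)


model = nn.Sequential(ToyDoRALinear(16, 16), nn.ReLU(), nn.Linear(16, 4))

# The pattern the recipes now use: no is_dora bookkeeping, no separate helper.
# Plain nn.Linear / nn.ReLU modules are simply skipped by the hasattr check.
for m in model.modules():
    if hasattr(m, "initialize_dora_magnitude"):
        m.initialize_dora_magnitude()
```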
recipes/knowledge_distillation_distributed.py (9 changes: 2 additions & 7 deletions)
@@ -28,7 +28,6 @@
     get_adapter_state_dict,
     get_lora_module_names,
     get_merged_lora_ckpt,
-    load_dora_magnitudes,
     LoRALinear,
     set_trainable_params,
     validate_missing_and_unexpected_for_lora,
@@ -477,8 +476,7 @@ def _setup_model(
             ) and not lora_weights_state_dict:
                 # lora may not be covered in state dict
                 # if finetune for the 1st time
-                m.lora_a.to_empty(device=lora_device)
-                m.lora_b.to_empty(device=lora_device)
+                m.to_empty(device=lora_device)
                 m.initialize_parameters()
             # RoPE is not covered in state dict
             if hasattr(m, "rope_init"):
@@ -491,13 +489,10 @@
             self._is_rank_zero,
             cpu_offload=fsdp_cpu_offload,
         )
-        is_dora = False
         for m in model.modules():
             if hasattr(m, "initialize_dora_magnitude"):
-                is_dora = True
                 m.initialize_dora_magnitude()
-        if is_dora:
-            load_dora_magnitudes(model)
+
         validate_missing_and_unexpected_for_lora(
             lora_attn_modules=self._lora_attn_modules,
             apply_lora_to_mlp=self._apply_lora_to_mlp,
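Several recipes also collapse the separate `m.lora_a.to_empty(...)` and `m.lora_b.to_empty(...)` calls into a single `m.to_empty(device=lora_device)`. The sketch below illustrates why one call is enough: `nn.Module.to_empty` materializes every parameter and buffer of the module (recursively) on the target device without copying data, after which the module's own initializer writes real values. `ToyLoRALinear` and its `initialize_parameters` are illustrative assumptions, not torchtune's classes; the meta-device context manager assumes PyTorch 2.x.

```python
import torch
from torch import nn


class ToyLoRALinear(nn.Module):
    """Hypothetical LoRA adapter holding two low-rank projections."""

    def __init__(self, in_features: int, out_features: int, rank: int = 4):
        super().__init__()
        self.lora_a = nn.Linear(in_features, rank, bias=False)
        self.lora_b = nn.Linear(rank, out_features, bias=False)

    def initialize_parameters(self) -> None:
        # Typical LoRA init: A ~ Kaiming-uniform, B = 0, so the adapter starts as a no-op.
        nn.init.kaiming_uniform_(self.lora_a.weight, a=5**0.5)
        nn.init.zeros_(self.lora_b.weight)


# Build the adapter on the meta device (no real storage yet).
with torch.device("meta"):
    m = ToyLoRALinear(32, 32)

device = "cuda" if torch.cuda.is_available() else "cpu"
m.to_empty(device=device)   # materializes lora_a AND lora_b in one call
m.initialize_parameters()   # now safe to fill in real values
```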
recipes/knowledge_distillation_single_device.py (5 changes: 3 additions & 2 deletions)
@@ -26,7 +26,6 @@
     get_adapter_state_dict,
     get_lora_module_names,
     get_merged_lora_ckpt,
-    load_dora_magnitudes,
     set_trainable_params,
     validate_missing_and_unexpected_for_lora,
 )
@@ -421,7 +420,9 @@ def _setup_model(
         # This is for any adapters that need to be initialized after base weights
         # have been loaded (e.g. DoRA).
         if self._is_dora:
-            load_dora_magnitudes(model)
+            for m in model.modules():
+                if hasattr(m, "initialize_dora_magnitude"):
+                    m.initialize_dora_magnitude()
         if lora_weights_state_dict:
             lora_missing, lora_unexpected = model.load_state_dict(
                 lora_weights_state_dict, strict=False
recipes/lora_dpo_distributed.py (8 changes: 4 additions & 4 deletions)
@@ -27,7 +27,6 @@
     get_adapter_params,
     get_adapter_state_dict,
     get_merged_lora_ckpt,
-    load_dora_magnitudes,
     LoRALinear,
     set_trainable_params,
     validate_missing_and_unexpected_for_lora,
@@ -400,8 +399,7 @@ def _setup_model(
             ) and not lora_weights_state_dict:
                 # lora may not be covered in state dict
                 # if finetune for the 1st time
-                m.lora_a.to_empty(device=lora_device)
-                m.lora_b.to_empty(device=lora_device)
+                m.to_empty(device=lora_device)
                 m.initialize_parameters()
             # RoPE is not covered in state dict
             if hasattr(m, "rope_init"):
@@ -420,7 +418,9 @@
                 is_dora = True
                 m.initialize_dora_magnitude()
         if is_dora:
-            load_dora_magnitudes(model)
+            for m in model.modules():
+                if hasattr(m, "initialize_dora_magnitude"):
+                    m.initialize_dora_magnitude()
         validate_missing_and_unexpected_for_lora(
             lora_attn_modules=self._lora_attn_modules,
             apply_lora_to_mlp=self._apply_lora_to_mlp,
recipes/lora_finetune_distributed.py (11 changes: 3 additions & 8 deletions)
@@ -29,7 +29,6 @@
     get_adapter_state_dict,
     get_lora_module_names,
     get_merged_lora_ckpt,
-    load_dora_magnitudes,
     LoRALinear,
     set_trainable_params,
     validate_missing_and_unexpected_for_lora,
@@ -496,10 +495,9 @@ def _setup_model(
             ) and not lora_weights_state_dict:
                 # lora may not be covered in state dict
                 # if finetune for the 1st time
-                m.lora_a.to_empty(device=lora_device)
-                m.lora_b.to_empty(device=lora_device)
+                m.to_empty(device=lora_device)
                 m.initialize_parameters()
-            # RoPE is not covered in state dict
+
             if hasattr(m, "rope_init"):
                 m.rope_init()
@@ -510,13 +508,10 @@
             self._is_rank_zero,
             cpu_offload=fsdp_cpu_offload,
         )
-        is_dora = False
         for m in model.modules():
             if hasattr(m, "initialize_dora_magnitude"):
-                is_dora = True
                 m.initialize_dora_magnitude()
-        if is_dora:
-            load_dora_magnitudes(model)
+
         validate_missing_and_unexpected_for_lora(
             lora_attn_modules=self._lora_attn_modules,
             apply_lora_to_mlp=self._apply_lora_to_mlp,
recipes/lora_finetune_single_device.py (2 changes: 0 additions & 2 deletions)
@@ -27,7 +27,6 @@
     get_adapter_state_dict,
     get_lora_module_names,
     get_merged_lora_ckpt,
-    load_dora_magnitudes,
     set_trainable_params,
     validate_missing_and_unexpected_for_lora,
 )
@@ -450,7 +449,6 @@ def _setup_model(
             for m in model.modules():
                 if hasattr(m, "initialize_dora_magnitude"):
                     m.initialize_dora_magnitude()
-            load_dora_magnitudes(model)
         if lora_weights_state_dict:
             lora_missing, lora_unexpected = model.load_state_dict(
                 lora_weights_state_dict, strict=False
recipes/qat_lora_finetune_distributed.py (3 changes: 1 addition & 2 deletions)
@@ -534,8 +534,7 @@ def _setup_model(
             ) and not lora_weights_state_dict:
                 # lora may not be covered in state dict
                 # if finetune for the 1st time
-                m.lora_a.to_empty(device=lora_device)
-                m.lora_b.to_empty(device=lora_device)
+                m.to_empty(device=lora_device)
                 m.initialize_parameters()
             # RoPE is not covered in state dict
             if hasattr(m, "rope_init"):
tests/recipes/test_lora_finetune_distributed.py (14 changes: 7 additions & 7 deletions)
@@ -215,15 +215,15 @@ def test_training_state_on_resume(
 
     @pytest.mark.integration_test
     @pytest.mark.parametrize(
-        "recipe_config, model_type, ckpt_type",
+        "recipe_config, model_type, ckpt_type, use_dora",
         [
-            ("llama2/7B_lora", "llama2", "tune"),
-            ("llama3/8B_lora", "llama3", "tune"),
+            ("llama2/7B_lora", "llama2", "tune", True),
+            ("llama3/8B_lora", "llama3", "tune", False),
         ],
     )
     @gpu_test(gpu_count=2)
     def test_save_and_load_merged_weights(
-        self, recipe_config, model_type, ckpt_type, tmpdir, monkeypatch
+        self, recipe_config, model_type, ckpt_type, use_dora, tmpdir, monkeypatch
     ):
         ckpt_component = CKPT_COMPONENT_MAP[ckpt_type]
         ckpt = model_type + "_" + ckpt_type
@@ -249,9 +249,9 @@ def test_save_and_load_merged_weights(
             enable_activation_checkpointing=True \
             enable_activation_offloading=True \
         """.split()
-
-        model_config = MODEL_TEST_CONFIGS[model_type + "_lora"]
-
+        model_config = MODEL_TEST_CONFIGS[
+            model_type + ("_dora" if use_dora else "_lora")
+        ]
         cmd = cmd + self._get_test_config_overrides() + model_config
         monkeypatch.setattr(sys, "argv", cmd)
         runpy.run_path(TUNE_PATH, run_name="__main__")
tests/torchtune/models/llama2/scripts/compare_dora.py (9 changes: 2 additions & 7 deletions)
@@ -13,12 +13,7 @@
 from torch import nn
 from torchao.dtypes.nf4tensor import linear_nf4, to_nf4
 from torchtune import training
-from torchtune.modules.peft import (
-    DoRALinear,
-    get_merged_lora_ckpt,
-    load_dora_magnitudes,
-    LoRALinear,
-)
+from torchtune.modules.peft import DoRALinear, get_merged_lora_ckpt, LoRALinear
 from torchtune.training.seed import set_seed
@@ -91,7 +86,7 @@ def _dora_is_the_same_as_lora():
     # Verify that this is true.
     assert not _dora_is_the_same_as_lora()
     module.initialize_dora_magnitude()
-    load_dora_magnitudes(module)
+
     assert _dora_is_the_same_as_lora()
 
     def _compare_params():