Merge branch 'main' into code_llama2_evaluation
ReemaAlzaid authored Dec 26, 2024
2 parents 8495571 + aa8f365 commit 26126c6
Showing 19 changed files with 584 additions and 433 deletions.
300 changes: 166 additions & 134 deletions docs/source/deep_dives/checkpointer.rst

Large diffs are not rendered by default.

601 changes: 332 additions & 269 deletions docs/source/tutorials/e2e_flow.rst

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion recipes/configs/llama2/7B_lora_dpo.yaml
@@ -32,7 +32,7 @@ model:
 tokenizer:
   _component_: torchtune.models.llama2.llama2_tokenizer
   path: /tmp/Llama-2-7b-hf/tokenizer.model
-  max_seq_len: 1024
+  max_seq_len: 1024 # higher increases memory

 checkpointer:
   _component_: torchtune.training.FullModelHFCheckpointer
2 changes: 1 addition & 1 deletion recipes/configs/llama2/7B_lora_dpo_single_device.yaml
@@ -31,7 +31,7 @@ model:
 tokenizer:
   _component_: torchtune.models.llama2.llama2_tokenizer
   path: /tmp/Llama-2-7b-hf/tokenizer.model
-  max_seq_len: 1024
+  max_seq_len: 1024 # higher increases memory

 checkpointer:
   _component_: torchtune.training.FullModelHFCheckpointer
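
The two llama2 DPO configs above keep tokenizer.max_seq_len at 1024 and add a note that raising it increases peak memory. As a minimal sketch (the override value of 2048 is an arbitrary example, not part of this commit), the field can also be adjusted at launch time with torchtune's key=value CLI overrides instead of editing the YAML:

# Hedged sketch: override the tokenizer's max sequence length from the CLI.
# 2048 is an illustrative value; higher settings use more memory.
tune run lora_dpo_single_device \
    --config llama2/7B_lora_dpo_single_device \
    tokenizer.max_seq_len=2048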
9 changes: 4 additions & 5 deletions recipes/configs/llama3/8B_qat_lora.yaml
@@ -83,6 +83,10 @@ dtype: bf16
 enable_activation_checkpointing: False # True reduces memory
 enable_activation_offloading: False # True reduces memory

+# QAT arguments
+quantizer:
+  _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer
+  groupsize: 256
+
 # Profiler (disabled)
 profiler:

@@ -108,8 +112,3 @@ profiler:
   warmup_steps: 3
   active_steps: 2
   num_cycles: 1
-
-# QAT arguments
-quantizer:
-  _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer
-  groupsize: 256
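
The two hunks above move the quantizer block next to the other training flags; its values (Int8DynActInt4WeightQATQuantizer with groupsize 256) are unchanged. A minimal launch sketch, assuming a 2-GPU node (the --nproc_per_node value is a placeholder, not part of this commit):

# Hedged sketch: run the distributed QAT + LoRA recipe with this config.
tune run --nproc_per_node 2 qat_lora_finetune_distributed \
    --config llama3/8B_qat_lora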
2 changes: 1 addition & 1 deletion recipes/configs/llama3_1/8B_lora_dpo.yaml
@@ -32,7 +32,7 @@ model:
 tokenizer:
   _component_: torchtune.models.llama3.llama3_tokenizer
   path: /tmp/Meta-Llama-3.1-8B-Instruct/original/tokenizer.model
-  max_seq_len: null
+  max_seq_len: 1024 # higher increases memory

 checkpointer:
   _component_: torchtune.training.FullModelHFCheckpointer
2 changes: 1 addition & 1 deletion recipes/configs/llama3_1/8B_lora_dpo_single_device.yaml
@@ -31,7 +31,7 @@ model:
 tokenizer:
   _component_: torchtune.models.llama3.llama3_tokenizer
   path: /tmp/Meta-Llama-3.1-8B-Instruct/original/tokenizer.model
-  max_seq_len: null
+  max_seq_len: 1024 # higher increases memory

 checkpointer:
   _component_: torchtune.training.FullModelHFCheckpointer
9 changes: 4 additions & 5 deletions recipes/configs/llama3_1/8B_qat_lora.yaml
@@ -86,6 +86,10 @@ dtype: bf16
 enable_activation_checkpointing: False # True reduces memory
 enable_activation_offloading: False # True reduces memory

+# QAT arguments
+quantizer:
+  _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer
+  groupsize: 256
+
 # Profiler (disabled)
 profiler:

@@ -111,8 +115,3 @@ profiler:
   warmup_steps: 3
   active_steps: 2
   num_cycles: 1
-
-# QAT arguments
-quantizer:
-  _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer
-  groupsize: 256
9 changes: 4 additions & 5 deletions recipes/configs/llama3_2/1B_qat_lora.yaml
@@ -82,6 +82,10 @@ dtype: bf16
 enable_activation_checkpointing: False # True reduces memory
 enable_activation_offloading: False # True reduces memory

+# QAT arguments
+quantizer:
+  _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer
+  groupsize: 256
+
 # Profiler (disabled)
 profiler:

@@ -107,8 +111,3 @@ profiler:
   warmup_steps: 3
   active_steps: 2
   num_cycles: 1
-
-# QAT arguments
-quantizer:
-  _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer
-  groupsize: 256
9 changes: 4 additions & 5 deletions recipes/configs/llama3_2/3B_qat_lora.yaml
@@ -83,6 +83,10 @@ dtype: bf16
 enable_activation_checkpointing: False # True reduces memory
 enable_activation_offloading: False # True reduces memory

+# QAT arguments
+quantizer:
+  _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer
+  groupsize: 256
+
 # Profiler (disabled)
 profiler:

@@ -108,8 +112,3 @@ profiler:
   warmup_steps: 3
   active_steps: 2
   num_cycles: 1
-
-# QAT arguments
-quantizer:
-  _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer
-  groupsize: 256
42 changes: 42 additions & 0 deletions recipes/configs/llama3_2/evaluation.yaml
@@ -0,0 +1,42 @@
+# Config for EleutherEvalRecipe in eleuther_eval.py
+#
+# To launch, run the following command:
+# tune run eleuther_eval --config llama3_2/evaluation
+
+# Model Arguments
+model:
+  _component_: torchtune.models.llama3_2.llama3_2_3b
+
+# Checkpointer
+checkpointer:
+  _component_: torchtune.training.FullModelHFCheckpointer
+  checkpoint_dir: /tmp/Llama-3.2-3B-Instruct
+  checkpoint_files: [
+    model-00001-of-00002.safetensors,
+    model-00002-of-00002.safetensors,
+  ]
+  recipe_checkpoint: null
+  output_dir: ${output_dir}
+  model_type: LLAMA3_2
+resume_from_checkpoint: False
+
+# Tokenizer
+tokenizer:
+  _component_: torchtune.models.llama3.llama3_tokenizer
+  path: /tmp/Llama-3.2-3B-Instruct/original/tokenizer.model
+  max_seq_len: null
+
+# Environment
+device: cpu
+dtype: bf16
+seed: 1234 # It is not recommended to change this seed, b/c it matches EleutherAI's default seed
+
+# EleutherAI specific eval args
+tasks: ["truthfulqa_mc2"]
+limit: null
+max_seq_length: 4096
+batch_size: 8
+enable_kv_cache: True
+
+# Quantization specific args
+quantizer: null
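
The new file registers a CPU-based EleutherAI evaluation config for Llama 3.2 3B. Per its header comment it is launched with tune run eleuther_eval --config llama3_2/evaluation; as a minimal sketch (the override values below are arbitrary examples, not part of this commit), individual fields can also be overridden from the CLI:

# Run the new eval config as written.
tune run eleuther_eval --config llama3_2/evaluation

# Hedged sketch: override selected fields at launch time; limit, batch_size,
# and device here are illustrative values only.
tune run eleuther_eval --config llama3_2/evaluation \
    limit=100 batch_size=4 device=cuda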
6 changes: 6 additions & 0 deletions recipes/qat_distributed.py
@@ -118,6 +118,7 @@ class QATRecipeDistributed(FTRecipeInterface):
     Raises:
         ValueError: If ``dtype`` is set to fp16.
+        ValueError: If ``compile`` is set to True.
         RuntimeError: If ``dtype`` is set to bf16 and the hardware does not support bf16.
         RuntimeError: If ``left_pad_sequence`` is set as the data collator.
         RuntimeError: If ``enable_activation_offloading`` is True and device is not CUDA.

@@ -133,6 +134,11 @@ def __init__(self, cfg: DictConfig) -> None:
                 "full fp16 training is not supported with this recipe. Please use bf16 or fp32 instead."
             )

+        if cfg.get("compile", False):
+            raise ValueError(
+                "Compile is not yet supported for QAT. Please set compile=False."
+            )
+
         # logging attributes
         self._output_dir = cfg.output_dir
         self._log_every_n_steps = cfg.get("log_every_n_steps", 1)
8 changes: 7 additions & 1 deletion recipes/qat_lora_finetune_distributed.py
@@ -126,7 +126,8 @@ class QATLoRAFinetuneRecipeDistributed(FTRecipeInterface):
     Raises:
         ValueError: If ``dtype`` is set to fp16.
-        ValueError: If world_size is 1
+        ValueError: If world_size is 1.
+        ValueError: If ``compile`` is set to True.
         RuntimeError: If ``dtype`` is set to bf16 and the hardware does not support bf16.
         RuntimeError: If ``left_pad_sequence`` is set as the data collator.
         RuntimeError: If ``enable_activation_offloading`` is True and device is not CUDA.

@@ -149,6 +150,11 @@ def __init__(self, cfg: DictConfig) -> None:
                 "full fp16 training is not supported with this recipe. Please use bf16 or fp32 instead."
            )

+        if cfg.get("compile", False):
+            raise ValueError(
+                "Compile is not yet supported for QAT. Please set compile=False."
+            )
+
         _, rank = utils.get_world_size_and_rank()

         # _is_rank_zero is used primarily for logging. In the future, the logger
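
Both QAT recipes above now fail fast when compilation is requested, raising a ValueError from __init__ instead of starting a run. A hedged sketch of the behavior, assuming the config exposes a compile flag or accepts one as a CLI override (neither is shown in this diff):

# Hedged sketch: with this change, requesting compile on a QAT recipe raises
# "Compile is not yet supported for QAT. Please set compile=False."
tune run --nproc_per_node 2 qat_lora_finetune_distributed \
    --config llama3/8B_qat_lora compile=True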
1 change: 1 addition & 0 deletions tests/torchtune/modules/_export/test_attention.py
@@ -159,6 +159,7 @@ def test_attention_export(self):
             (self.x, self.x),
             kwargs={"input_pos": self.input_pos},
             dynamic_shapes=self.dynamic_shapes,
+            strict=True,
         )
         et_res = et_mha_ep.module()(self.x, self.x, input_pos=self.input_pos)
         tt_res = self.tt_mha(self.x, self.x, input_pos=self.input_pos)
(Changes to the tile positional embedding export tests; the file header was not rendered.)

@@ -51,14 +51,14 @@ def test_tile_positional_embedding_smoke(self):
         torch_version_ge("2.6.0.dev20241117"), reason="Need recent fixes for export"
     )
     def test_tile_positional_embedding_export(self):
-
         tpe_ep = torch.export.export(
             self.tpe,
             (self.x, self.aspect_ratio),
             dynamic_shapes=(
                 self.dynamic_shape,
                 None,
             ), # assuming aspect ratio is static
+            strict=True,
         )

         y = tpe_ep.module()(self.x, self.aspect_ratio)

@@ -129,14 +129,14 @@ def test_tiled_token_positional_embedding_smoke(self):
         torch_version_ge("2.6.0.dev20241117"), reason="Need recent fixes for export"
     )
     def test_tiled_token_positional_embedding_export(self):
-
         tpe_ep = torch.export.export(
             self.tpe,
             (self.x, self.aspect_ratio),
             dynamic_shapes=(
                 self.dynamic_shape,
                 None,
             ), # assuming aspect ratio is static
+            strict=True,
         )

         y = tpe_ep.module()(self.x, self.aspect_ratio)

@@ -155,6 +155,7 @@ def test_tiled_token_positional_embedding_aoti(self):
                 self.dynamic_shape,
                 None,
             ), # assuming aspect ratio is static
+            strict=True,
         )

         with tempfile.TemporaryDirectory() as tmpdir:
4 changes: 4 additions & 0 deletions torchtune/_recipe_registry.py
@@ -469,6 +469,10 @@ class Recipe:
             name="mistral/evaluation",
             file_path="mistral/evaluation.yaml",
         ),
+        Config(
+            name="llama3_2/evaluation",
+            file_path="llama3_2/evaluation.yaml",
+        ),
         Config(
             name="code_llama2/evaluation",
             file_path="code_llama2/evaluation.yaml",
2 changes: 1 addition & 1 deletion torchtune/generation/_generation.py
@@ -139,7 +139,7 @@ def get_causal_mask_from_padding_mask(
             - [bsz, seq_length, target_seq_len] if ``target_seq_len`` was specified.

     Raises:
-        AssertionError: if ``target_seq_len > seq_len``, the sequence length of the padding mask.
+        AssertionError: if ``target_seq_len < seq_len``, the sequence length of the padding mask.

     Example:
         >>> padding_mask = torch.tensor([[False, True, True, True]])
2 changes: 1 addition & 1 deletion torchtune/training/metric_logging.py
@@ -222,7 +222,7 @@ def log_config(self, config: DictConfig) -> None:
         try:
             output_config_fname = Path(
                 os.path.join(
-                    config.checkpointer.checkpoint_dir,
+                    config.output_dir,
                     "torchtune_config.yaml",
                 )
             )
2 changes: 1 addition & 1 deletion version.txt
@@ -1 +1 @@
-0.5.0
+0.6.0
