From de8b57c3c195efea2375d5417952d0902376a91a Mon Sep 17 00:00:00 2001 From: Felipe Mello Date: Thu, 19 Dec 2024 20:18:43 -0500 Subject: [PATCH 1/9] raise compile error (#2188) Co-authored-by: Felipe Mello --- recipes/configs/llama3/8B_qat_lora.yaml | 9 ++++----- recipes/configs/llama3_1/8B_qat_lora.yaml | 9 ++++----- recipes/configs/llama3_2/1B_qat_lora.yaml | 9 ++++----- recipes/configs/llama3_2/3B_qat_lora.yaml | 9 ++++----- recipes/qat_distributed.py | 6 ++++++ recipes/qat_lora_finetune_distributed.py | 8 +++++++- 6 files changed, 29 insertions(+), 21 deletions(-) diff --git a/recipes/configs/llama3/8B_qat_lora.yaml b/recipes/configs/llama3/8B_qat_lora.yaml index 5a889a3d63..5f88f175ec 100644 --- a/recipes/configs/llama3/8B_qat_lora.yaml +++ b/recipes/configs/llama3/8B_qat_lora.yaml @@ -83,6 +83,10 @@ dtype: bf16 enable_activation_checkpointing: False # True reduces memory enable_activation_offloading: False # True reduces memory +# QAT arguments +quantizer: + _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer + groupsize: 256 # Profiler (disabled) profiler: @@ -108,8 +112,3 @@ profiler: warmup_steps: 3 active_steps: 2 num_cycles: 1 - -# QAT arguments -quantizer: - _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer - groupsize: 256 diff --git a/recipes/configs/llama3_1/8B_qat_lora.yaml b/recipes/configs/llama3_1/8B_qat_lora.yaml index d25351a0e4..3d7c94744e 100644 --- a/recipes/configs/llama3_1/8B_qat_lora.yaml +++ b/recipes/configs/llama3_1/8B_qat_lora.yaml @@ -86,6 +86,10 @@ dtype: bf16 enable_activation_checkpointing: False # True reduces memory enable_activation_offloading: False # True reduces memory +# QAT arguments +quantizer: + _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer + groupsize: 256 # Profiler (disabled) profiler: @@ -111,8 +115,3 @@ profiler: warmup_steps: 3 active_steps: 2 num_cycles: 1 - -# QAT arguments -quantizer: - _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer - groupsize: 256 diff --git a/recipes/configs/llama3_2/1B_qat_lora.yaml b/recipes/configs/llama3_2/1B_qat_lora.yaml index 79f628367f..bffc52a4ac 100644 --- a/recipes/configs/llama3_2/1B_qat_lora.yaml +++ b/recipes/configs/llama3_2/1B_qat_lora.yaml @@ -82,6 +82,10 @@ dtype: bf16 enable_activation_checkpointing: False # True reduces memory enable_activation_offloading: False # True reduces memory +# QAT arguments +quantizer: + _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer + groupsize: 256 # Profiler (disabled) profiler: @@ -107,8 +111,3 @@ profiler: warmup_steps: 3 active_steps: 2 num_cycles: 1 - -# QAT arguments -quantizer: - _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer - groupsize: 256 diff --git a/recipes/configs/llama3_2/3B_qat_lora.yaml b/recipes/configs/llama3_2/3B_qat_lora.yaml index 6b69aebac2..64985de1ea 100644 --- a/recipes/configs/llama3_2/3B_qat_lora.yaml +++ b/recipes/configs/llama3_2/3B_qat_lora.yaml @@ -83,6 +83,10 @@ dtype: bf16 enable_activation_checkpointing: False # True reduces memory enable_activation_offloading: False # True reduces memory +# QAT arguments +quantizer: + _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer + groupsize: 256 # Profiler (disabled) profiler: @@ -108,8 +112,3 @@ profiler: warmup_steps: 3 active_steps: 2 num_cycles: 1 - -# QAT arguments -quantizer: - _component_: torchtune.training.quantization.Int8DynActInt4WeightQATQuantizer - groupsize: 256 diff --git 
a/recipes/qat_distributed.py b/recipes/qat_distributed.py index 6c79a6cefa..efb9c4c2b5 100644 --- a/recipes/qat_distributed.py +++ b/recipes/qat_distributed.py @@ -118,6 +118,7 @@ class QATRecipeDistributed(FTRecipeInterface): Raises: ValueError: If ``dtype`` is set to fp16. + ValueError: If ``compile`` is set to True. RuntimeError: If ``dtype`` is set to bf16 and the hardware does not support bf16. RuntimeError: If ``left_pad_sequence`` is set as the data collator. RuntimeError: If ``enable_activation_offloading`` is True and device is not CUDA. @@ -133,6 +134,11 @@ def __init__(self, cfg: DictConfig) -> None: "full fp16 training is not supported with this recipe. Please use bf16 or fp32 instead." ) + if cfg.get("compile", False): + raise ValueError( + "Compile is not yet supported for QAT. Please set compile=False." + ) + # logging attributes self._output_dir = cfg.output_dir self._log_every_n_steps = cfg.get("log_every_n_steps", 1) diff --git a/recipes/qat_lora_finetune_distributed.py b/recipes/qat_lora_finetune_distributed.py index d047d77d41..57e4a09108 100644 --- a/recipes/qat_lora_finetune_distributed.py +++ b/recipes/qat_lora_finetune_distributed.py @@ -126,7 +126,8 @@ class QATLoRAFinetuneRecipeDistributed(FTRecipeInterface): Raises: ValueError: If ``dtype`` is set to fp16. - ValueError: If world_size is 1 + ValueError: If world_size is 1. + ValueError: If ``compile`` is set to True. RuntimeError: If ``dtype`` is set to bf16 and the hardware does not support bf16. RuntimeError: If ``left_pad_sequence`` is set as the data collator. RuntimeError: If ``enable_activation_offloading`` is True and device is not CUDA. @@ -149,6 +150,11 @@ def __init__(self, cfg: DictConfig) -> None: "full fp16 training is not supported with this recipe. Please use bf16 or fp32 instead." ) + if cfg.get("compile", False): + raise ValueError( + "Compile is not yet supported for QAT. Please set compile=False." + ) + _, rank = utils.get_world_size_and_rank() # _is_rank_zero is used primarily for logging. 
In the future, the logger From 74e6e7b6dbe76ac6c8a3515349c1e1b2952a4841 Mon Sep 17 00:00:00 2001 From: Philip Bontrager Date: Thu, 19 Dec 2024 20:19:11 -0500 Subject: [PATCH 2/9] Update DPO Max Seq Len (#2176) --- recipes/configs/llama2/7B_lora_dpo.yaml | 2 +- recipes/configs/llama2/7B_lora_dpo_single_device.yaml | 2 +- recipes/configs/llama3_1/8B_lora_dpo.yaml | 2 +- recipes/configs/llama3_1/8B_lora_dpo_single_device.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/recipes/configs/llama2/7B_lora_dpo.yaml b/recipes/configs/llama2/7B_lora_dpo.yaml index 250d62db44..887be92925 100644 --- a/recipes/configs/llama2/7B_lora_dpo.yaml +++ b/recipes/configs/llama2/7B_lora_dpo.yaml @@ -32,7 +32,7 @@ model: tokenizer: _component_: torchtune.models.llama2.llama2_tokenizer path: /tmp/Llama-2-7b-hf/tokenizer.model - max_seq_len: 1024 + max_seq_len: 1024 # higher increases memory checkpointer: _component_: torchtune.training.FullModelHFCheckpointer diff --git a/recipes/configs/llama2/7B_lora_dpo_single_device.yaml b/recipes/configs/llama2/7B_lora_dpo_single_device.yaml index 4d154c38ce..6e0049cfd5 100644 --- a/recipes/configs/llama2/7B_lora_dpo_single_device.yaml +++ b/recipes/configs/llama2/7B_lora_dpo_single_device.yaml @@ -31,7 +31,7 @@ model: tokenizer: _component_: torchtune.models.llama2.llama2_tokenizer path: /tmp/Llama-2-7b-hf/tokenizer.model - max_seq_len: 1024 + max_seq_len: 1024 # higher increases memory checkpointer: _component_: torchtune.training.FullModelHFCheckpointer diff --git a/recipes/configs/llama3_1/8B_lora_dpo.yaml b/recipes/configs/llama3_1/8B_lora_dpo.yaml index 7160362b2a..4425e7414b 100644 --- a/recipes/configs/llama3_1/8B_lora_dpo.yaml +++ b/recipes/configs/llama3_1/8B_lora_dpo.yaml @@ -32,7 +32,7 @@ model: tokenizer: _component_: torchtune.models.llama3.llama3_tokenizer path: /tmp/Meta-Llama-3.1-8B-Instruct/original/tokenizer.model - max_seq_len: null + max_seq_len: 1024 # higher increases memory checkpointer: _component_: torchtune.training.FullModelHFCheckpointer diff --git a/recipes/configs/llama3_1/8B_lora_dpo_single_device.yaml b/recipes/configs/llama3_1/8B_lora_dpo_single_device.yaml index 81d6158b28..236b623f7d 100644 --- a/recipes/configs/llama3_1/8B_lora_dpo_single_device.yaml +++ b/recipes/configs/llama3_1/8B_lora_dpo_single_device.yaml @@ -31,7 +31,7 @@ model: tokenizer: _component_: torchtune.models.llama3.llama3_tokenizer path: /tmp/Meta-Llama-3.1-8B-Instruct/original/tokenizer.model - max_seq_len: null + max_seq_len: 1024 # higher increases memory checkpointer: _component_: torchtune.training.FullModelHFCheckpointer From 076bf81fb4594500ae7dd20ba5d989b14d328781 Mon Sep 17 00:00:00 2001 From: Reema Alzaid <80041251+ReemaAlzaid@users.noreply.github.com> Date: Fri, 20 Dec 2024 21:10:27 +0300 Subject: [PATCH 3/9] Llama3.2 3B eval (#2186) Co-authored-by: Reema Alzaid --- recipes/configs/llama3_2/evaluation.yaml | 42 ++++++++++++++++++++++++ torchtune/_recipe_registry.py | 4 +++ 2 files changed, 46 insertions(+) create mode 100644 recipes/configs/llama3_2/evaluation.yaml diff --git a/recipes/configs/llama3_2/evaluation.yaml b/recipes/configs/llama3_2/evaluation.yaml new file mode 100644 index 0000000000..44b4ebcead --- /dev/null +++ b/recipes/configs/llama3_2/evaluation.yaml @@ -0,0 +1,42 @@ +# Config for EleutherEvalRecipe in eleuther_eval.py +# +# To launch, run the following command: +# tune run eleuther_eval --config llama3_2/evaluation + +# Model Arguments +model: + _component_: torchtune.models.llama3_2.llama3_2_3b + +# Checkpointer 
+checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Llama-3.2-3B-Instruct + checkpoint_files: [ + model-00001-of-00002.safetensors, + model-00002-of-00002.safetensors, + ] + recipe_checkpoint: null + output_dir: ${output_dir} + model_type: LLAMA3_2 +resume_from_checkpoint: False + +# Tokenizer +tokenizer: + _component_: torchtune.models.llama3.llama3_tokenizer + path: /tmp/Llama-3.2-3B-Instruct/original/tokenizer.model + max_seq_len: null + +# Environment +device: cpu +dtype: bf16 +seed: 1234 # It is not recommended to change this seed, b/c it matches EleutherAI's default seed + +# EleutherAI specific eval args +tasks: ["truthfulqa_mc2"] +limit: null +max_seq_length: 4096 +batch_size: 8 +enable_kv_cache: True + +# Quantization specific args +quantizer: null diff --git a/torchtune/_recipe_registry.py b/torchtune/_recipe_registry.py index faf1ec7124..6ce06a3585 100644 --- a/torchtune/_recipe_registry.py +++ b/torchtune/_recipe_registry.py @@ -469,6 +469,10 @@ class Recipe: name="mistral/evaluation", file_path="mistral/evaluation.yaml", ), + Config( + name="llama3_2/evaluation", + file_path="llama3_2/evaluation.yaml", + ), ], supports_distributed=False, ), From e97870057dddaa00db64361796da526502e08d7a Mon Sep 17 00:00:00 2001 From: Paul Soulos Date: Fri, 20 Dec 2024 13:13:17 -0500 Subject: [PATCH 4/9] =?UTF-8?q?Update=20typo=20in=20docstring=20for=20=5Fg?= =?UTF-8?q?eneration.get=5Fcausal=5Fmask=5Ffrom=5Fpadding=E2=80=A6=20(#218?= =?UTF-8?q?7)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- torchtune/generation/_generation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchtune/generation/_generation.py b/torchtune/generation/_generation.py index bb4b1ff0b0..b73dd186a8 100644 --- a/torchtune/generation/_generation.py +++ b/torchtune/generation/_generation.py @@ -139,7 +139,7 @@ def get_causal_mask_from_padding_mask( - [bsz, seq_length, target_seq_len] if ``target_seq_len`` was specified. Raises: - AssertionError: if ``target_seq_len > seq_len``, the sequence length of the padding mask. + AssertionError: if ``target_seq_len < seq_len``, the sequence length of the padding mask. Example: >>> padding_mask = torch.tensor([[False, True, True, True]]) From 1db6f663026b69b3af35ffdea183cd95d4bb22bb Mon Sep 17 00:00:00 2001 From: Felipe Mello Date: Fri, 20 Dec 2024 13:27:39 -0500 Subject: [PATCH 5/9] new docs for checkpointing (#2189) Co-authored-by: Felipe Mello --- docs/source/deep_dives/checkpointer.rst | 300 +++++++++------- docs/source/tutorials/e2e_flow.rst | 457 +++++++++++++++--------- 2 files changed, 447 insertions(+), 310 deletions(-) diff --git a/docs/source/deep_dives/checkpointer.rst b/docs/source/deep_dives/checkpointer.rst index 13aac698c6..92dffc878d 100644 --- a/docs/source/deep_dives/checkpointer.rst +++ b/docs/source/deep_dives/checkpointer.rst @@ -42,11 +42,11 @@ inference will not work as expected). In addition to the keys lining up, you als of the weights (values in the state_dict) to match up exactly with those expected by the model definition. -Let's look at the two popular formats for Llama2. +Let's look at the two popular formats for Llama 3.2. **Meta Format** -This is the format supported by the official Llama2 implementation. When you download the Llama2 7B model +This is the format supported by the official Llama 3.2 implementation. When you download the Llama 3.2 3B model from the `meta-llama website `_, you'll get access to a single ``.pth`` checkpoint file. 
You can inspect the contents of this checkpoint easily with ``torch.load`` @@ -58,39 +58,43 @@ from the `meta-llama website `_, you'll >>> for key, value in state_dict.items(): >>> print(f'{key}: {value.shape}') - tok_embeddings.weight: torch.Size([32000, 4096]) + tok_embeddings.weight: torch.Size([128256, 3072]) ... ... >>> print(len(state_dict.keys())) - 292 + 255 -The state_dict contains 292 keys, including an input embedding table called ``tok_embeddings``. The -model definition for this state_dict expects an embedding layer with ``32000`` tokens each having a -embedding with dim of ``4096``. +The state_dict contains 255 keys, including an input embedding table called ``tok_embeddings``. The +model definition for this state_dict expects an embedding layer with ``128256`` tokens each having a +embedding with dim of ``3072``. **HF Format** This is the most popular format within the Hugging Face Model Hub and is the default format in every torchtune config. This is also the format you get when you download the -llama2 model from the `Llama-2-7b-hf `_ repo. +llama3.2 model from the `Llama-3.2-3B-Instruct `_ repo. -The first big difference is that the state_dict is split across two ``.bin`` files. To correctly +The first big difference is that the state_dict is split across two ``.safetensors`` files. To correctly load the checkpoint, you'll need to piece these files together. Let's inspect one of the files. .. code-block:: python - >>> import torch - >>> state_dict = torch.load('pytorch_model-00001-of-00002.bin', mmap=True, weights_only=True, map_location='cpu') + >>> from safetensors import safe_open + >>> state_dict = {} + >>> with safe_open("model-00001-of-00002.safetensors", framework="pt", device="cpu") as f: + >>> for k in f.keys(): + >>> state_dict[k] = f.get_tensor(k) + >>> # inspect the keys and the shapes of the associated tensors >>> for key, value in state_dict.items(): >>> print(f'{key}: {value.shape}') - model.embed_tokens.weight: torch.Size([32000, 4096]) + model.embed_tokens.weight: torch.Size([128256, 3072]) ... ... >>> print(len(state_dict.keys())) - 241 + 187 Not only does the state_dict contain fewer keys (expected since this is one of two files), but the embedding table is called ``model.embed_tokens`` instead of ``tok_embeddings``. This mismatch @@ -145,16 +149,16 @@ Model Hub and is the default format in every torchtune config. For this checkpointer to work correctly, we assume that ``checkpoint_dir`` contains the necessary checkpoint and json files. The easiest way to make sure everything works correctly is to use the following flow: -- Download the model from the HF repo using tune download. By default, this will ignore the "safetensors" - files. +- Download the model from the HF repo using tune download. This will ignore the "pth" + files, since we will be loading the "safetensors". | .. code-block:: bash - tune download meta-llama/Llama-2-7b-hf \ - --output-dir \ - --hf-token + tune download meta-llama/Llama-3.2-3B-Instruct \ + --output-dir /tmp/Llama-3.2-3B-Instruct \ + --ignore-patterns "original/consolidated.00.pth" - Use ``output_dir`` specified here as the ``checkpoint_dir`` argument for the checkpointer. 
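Before kicking off a recipe, it can be worth sanity checking that the download produced the sharded ``.safetensors`` files the checkpointer expects. Below is a minimal sketch (not something the checkpointer requires you to do) of piecing the shards together into a single state_dict, assuming the ``/tmp/Llama-3.2-3B-Instruct`` output directory used in the command above:

.. code-block:: python

    import glob
    import os

    from safetensors import safe_open

    # assumed download location from the tune download command above
    checkpoint_dir = "/tmp/Llama-3.2-3B-Instruct"

    # piece the sharded checkpoint files together into one state_dict
    state_dict = {}
    for shard in sorted(glob.glob(os.path.join(checkpoint_dir, "*.safetensors"))):
        with safe_open(shard, framework="pt", device="cpu") as f:
            for key in f.keys():
                state_dict[key] = f.get_tensor(key)

    print(f"Loaded {len(state_dict)} tensors from {checkpoint_dir}")

This is only a quick check; the checkpointer performs the equivalent loading for you when the recipe runs.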
@@ -170,31 +174,25 @@ The following snippet explains how the HFCheckpointer is setup in torchtune conf _component_: torchtune.training.FullModelHFCheckpointer # directory with the checkpoint files - # this should match the output_dir above - checkpoint_dir: + # this should match the folder you used when downloading the model + checkpoint_dir: /tmp/Llama-3.2-3B-Instruct - # checkpoint files. For the llama2-7b-hf model we have - # 2 .bin files. The checkpointer takes care of sorting + # checkpoint files. For the Llama-3.2-3B-Instruct model we have + # 2 .safetensor files. The checkpointer takes care of sorting # by id and so the order here does not matter checkpoint_files: [ - pytorch_model-00001-of-00002.bin, - pytorch_model-00002-of-00002.bin, + model-00001-of-00002.safetensors, + model-00002-of-00002.safetensors, ] - # if we're restarting a previous run, we need to specify - # the file with the checkpoint state. More on this in the - # next section - recipe_checkpoint: null - - # dir for saving the output checkpoints. Usually set - # to be the same as checkpoint_dir - output_dir: + # dir for saving the output checkpoints + output_dir: # model_type which specifies how to convert the state_dict # into a format which torchtune understands - model_type: LLAMA2 + model_type: LLAMA3_2 - # set to True if restarting training + # set to True if restarting training. More on that later. resume_from_checkpoint: False .. note:: @@ -222,9 +220,9 @@ and json files. The easiest way to make sure everything works correctly is to us .. code-block:: bash - tune download meta-llama/Llama-2-7b \ - --output-dir \ - --hf-token + tune download meta-llama/Llama-3.2-3B-Instruct \ + --output-dir /tmp/Llama-3.2-3B-Instruct \ + --ignore-patterns "*.safetensors" - Use ``output_dir`` above as the ``checkpoint_dir`` for the checkpointer. @@ -240,27 +238,21 @@ The following snippet explains how the MetaCheckpointer is setup in torchtune co _component_: torchtune.training.FullModelMetaCheckpointer # directory with the checkpoint files - # this should match the output_dir above + # this should match the folder you used when downloading the model checkpoint_dir: - # checkpoint files. For the llama2-7b model we have + # checkpoint files. For the llama3.2 3B model we have # a single .pth file checkpoint_files: [consolidated.00.pth] - # if we're restarting a previous run, we need to specify - # the file with the checkpoint state. More on this in the - # next section - recipe_checkpoint: null - - # dir for saving the output checkpoints. Usually set - # to be the same as checkpoint_dir + # dir for saving the output checkpoints. output_dir: # model_type which specifies how to convert the state_dict # into a format which torchtune understands - model_type: LLAMA2 + model_type: LLAMA3_2 - # set to True if restarting training + # set to True if restarting training. More on that later. resume_from_checkpoint: False | @@ -274,6 +266,73 @@ for testing or for loading quantized models for generation. | +Checkpoint Output +--------------------------------- + +Congrats for getting this far! Let's say you have followed our :ref:`End-to-End Workflow with torchtune ` and trained a llama 3.2 3B using one of our LoRA recipes. + +Now let's visualize the outputs. A simple way of doing this is by running :code:`tree -a path/to/outputdir`, which should show something like the tree below. 
+There are 3 types of folders: + +1) **recipe_state**: Holds recipe_state.pt with the information necessary to restart your training run from the last intermediate epoch. More on that later; +2) **logs**: Outputs of your metric_logger, if any; +3) **epoch_{}**: Contains your trained model weights plus model metadata. If running inference or pushing to a model hub, you should use this folder directly; + +.. note:: + For each epoch, we copy the contents of the original checkpoint folder, excluding the original checkpoints and large files. + These files are lightweight, mostly configuration files, and make it easier for the user to use the epoch folders directly in downstream applications. + +For more details about each file, please check the End-to-End tutorial mentioned above. + + .. code-block:: bash + + >>> tree -a /tmp/torchtune/llama3_2_3B/lora_single_device + /tmp/torchtune/llama3_2_3B/lora_single_device + ├── epoch_0 + │ ├── adapter_config.json + │ ├── adapter_model.pt + │ ├── adapter_model.safetensors + │ ├── config.json + │ ├── ft-model-00001-of-00002.safetensors + │ ├── ft-model-00002-of-00002.safetensors + │ ├── generation_config.json + │ ├── LICENSE.txt + │ ├── model.safetensors.index.json + │ ├── original + │ │ ├── orig_params.json + │ │ ├── params.json + │ │ └── tokenizer.model + │ ├── original_repo_id.json + │ ├── README.md + │ ├── special_tokens_map.json + │ ├── tokenizer_config.json + │ ├── tokenizer.json + │ └── USE_POLICY.md + ├── epoch_1 + │ ├── adapter_config.json + │ ├── adapter_model.pt + │ ├── adapter_model.safetensors + │ ├── config.json + │ ├── ft-model-00001-of-00002.safetensors + │ ├── ft-model-00002-of-00002.safetensors + │ ├── generation_config.json + │ ├── LICENSE.txt + │ ├── model.safetensors.index.json + │ ├── original + │ │ ├── orig_params.json + │ │ ├── params.json + │ │ └── tokenizer.model + │ ├── original_repo_id.json + │ ├── README.md + │ ├── special_tokens_map.json + │ ├── tokenizer_config.json + │ ├── tokenizer.json + │ └── USE_POLICY.md + ├── logs + │ └── log_1734652101.txt + └── recipe_state + └── recipe_state.pt + Intermediate vs Final Checkpoints --------------------------------- @@ -327,93 +386,67 @@ The output state dicts have the following formats: ... } -To restart from a previous checkpoint file, you'll need to make the following changes -to the config file +Resuming from checkpoint - Full Finetuning +------------------------------------------ -.. code-block:: yaml +Sometimes our training is interrupted for some reason. To restart training from a previous checkpoint file, +you'll need to **update** the following fields in your configs: - checkpointer: +**resume_from_checkpoint**: Set it to True; - # checkpointer to use - _component_: torchtune.training.FullModelHFCheckpointer +**checkpoint_files**: change the path to ``epoch_{YOUR_EPOCH}/ft-model={}-of-{}.safetensors``; - checkpoint_dir: +Notice that we do **not** change our checkpoint_dir or output_dir. Since we are resuming from checkpoint, we know +to look for it in the output_dir. + +.. code-block:: yaml + checkpointer: # checkpoint files. 
Note that you will need to update this
        # section of the config with the intermediate checkpoint files
        checkpoint_files: [
-            hf_model_0001_0.pt,
-            hf_model_0002_0.pt,
+            epoch_{YOUR_EPOCH}/ft-model-00001-of-00002.safetensors,
+            epoch_{YOUR_EPOCH}/ft-model-00002-of-00002.safetensors,
        ]

-        # if we're restarting a previous run, we need to specify
-        # the file with the checkpoint state
-        recipe_checkpoint: recipe_state.pt
-
-        # dir for saving the output checkpoints. Usually set
-        # to be the same as checkpoint_dir
-        output_dir: 
-
-        # model_type which specifies how to convert the state_dict
-        # into a format which torchtune understands
-        model_type: LLAMA2
-
        # set to True if restarting training
        resume_from_checkpoint: True

-Checkpointing for LoRA
-----------------------
-
-In torchtune, we output both the adapter weights and the full model "merged" weights
-for LoRA. The "merged" checkpoint can be used just like you would use the source
-checkpoint with any post-training tools. For more details, take a look at our
-:ref:`LoRA Finetuning Tutorial `.Additionally, by setting the option "save_adapter_weights_only" to True when saving a checkpoint, you can choose to save only the adapter weights.
-
-The primary difference between the two use cases is when you want to resume training
-from a checkpoint. In this case, the checkpointer needs access to both the initial frozen
-base model weights as well as the learnt adapter weights. The config for this scenario
-looks something like this:
+Resuming from checkpoint - LoRA Finetuning
+------------------------------------------
+
+Similarly to full finetuning, we only need to modify two fields: ``resume_from_checkpoint``
+and ``adapter_checkpoint``, which will be loaded from the output_dir. We do not have to modify ``checkpoint_files``,
+because the base model being loaded is still the same.

 .. code-block:: yaml

    checkpointer:
-        # checkpointer to use
-        _component_: torchtune.training.FullModelHFCheckpointer
-
-        # directory with the checkpoint files
-        # this should match the output_dir above
-        checkpoint_dir: 
-
-        # checkpoint files. This is the ORIGINAL frozen checkpoint
-        # and NOT the merged checkpoint output during training
-        checkpoint_files: [
-            pytorch_model-00001-of-00002.bin,
-            pytorch_model-00002-of-00002.bin,
-        ]
-
-        # this refers to the adapter weights learnt during training
-        adapter_checkpoint: adapter_0.pt
-
-        # the file with the checkpoint state
-        recipe_checkpoint: recipe_state.pt
-
-        # dir for saving the output checkpoints. Usually set
-        # to be the same as checkpoint_dir
-        output_dir: 
-
-        # model_type which specifies how to convert the state_dict
-        # into a format which torchtune understands
-        model_type: LLAMA2
+        # adapter_checkpoint. Note that you will need to update this
+        # section of the config with the intermediate checkpoint files
+        adapter_checkpoint: epoch_{YOUR_EPOCH}/adapter_model.safetensors

        # set to True if restarting training
        resume_from_checkpoint: True

-        # Set to True to save only the adapter weights
+        # set to True to save only the adapter weights
+        # it does not influence resuming from checkpoint
        save_adapter_weights_only: False

+.. note::
+    In torchtune, we output both the adapter weights and the full model merged weights
+    for LoRA. The merged checkpoint is a convenience, since it can be used without having special
+    tooling to handle the adapters. However, they should **not** be used when resuming
+    training, as loading the merged weights + adapter would be an error.
Therefore, when resuming for LoRA,
+    we will take the original untrained weights from the checkpoint dir, and the trained
+    adapters from the output_dir. For more details, take a look at our :ref:`LoRA Finetuning Tutorial `.
+
+.. note::
+    Additionally, by setting the option :code:`save_adapter_weights_only`, you can choose to **only** save the adapter weights.
+    This reduces the amount of storage and time needed to save the checkpoint, but has no influence over resuming from checkpoint.
+
 |

 Putting this all together
 -------------------------

 Let's now put all of this knowledge together! We'll load some checkpoints, create some models and
 run a simple forward.

-For this section we'll use the Llama2 13B model in HF format.
+For this section we'll use the Llama-3.2-3B-Instruct model in HF format.

 .. code-block:: python

     import torch
-    from torchtune.training import FullModelHFCheckpointer, ModelType
-    from torchtune.models.llama2 import llama2_13b
+    from torchtune.models.llama3_2 import llama3_2_3b
+    from torchtune.training import FullModelHFCheckpointer

     # Set the right directory and files
-    checkpoint_dir = 'Llama-2-13b-hf/'
+    checkpoint_dir = "/tmp/Llama-3.2-3B-Instruct/"
+    output_dir = "/tmp/torchtune/llama3_2_3B/full_single_device"
+
     pytorch_files = [
-        'pytorch_model-00001-of-00003.bin',
-        'pytorch_model-00002-of-00003.bin',
-        'pytorch_model-00003-of-00003.bin'
+        "model-00001-of-00002.safetensors",
+        "model-00002-of-00002.safetensors",
     ]

     # Set up the checkpointer and load state dict
     checkpointer = FullModelHFCheckpointer(
        checkpoint_dir=checkpoint_dir,
        checkpoint_files=pytorch_files,
-       output_dir=checkpoint_dir,
-       model_type="LLAMA2"
+       output_dir=output_dir,
+       model_type="LLAMA3_2",
     )
     torchtune_sd = checkpointer.load_checkpoint()

     # Setup the model and the input
-    model = llama2_13b()
+    model = llama3_2_3b()

     # Model weights are stored with the key="model"
     model.load_state_dict(torchtune_sd["model"])
-
-
-    # We have 32000 vocab tokens; lets generate an input with 70 tokens
-    x = torch.randint(0, 32000, (1, 70))
-
-    with torch.no_grad():
-        model(x)
-
-    tensor([[[ -6.3989,  -9.0531,   3.2375,  ...,  -5.2822,  -4.4872,  -5.7469],
-         [ -8.6737, -11.0023,   6.8235,  ...,  -2.6819,  -4.2424,  -4.0109],
-         [ -4.6915,  -7.3618,   4.1628,  ...,  -2.8594,  -2.5857,  -3.1151],
-         ...,
-         [ -7.7808,  -8.2322,   2.8850,  ...,  -1.9604,  -4.7624,  -1.6040],
-         [ -7.3159,  -8.5849,   1.8039,  ...,  -0.9322,  -5.2010,  -1.6824],
-         [ -7.8929,  -8.8465,   3.3794,  ...,  -1.3500,  -4.6145,  -2.5931]]])
+    model.to("cuda")
+
+    # We have 128256 vocab tokens; let's generate an input with 24 tokens
+    x = torch.randint(0, 128256, (1, 24), dtype=torch.long, device="cuda")
+
+    with torch.no_grad():
+        model(x)
+
+    tensor([[[ 1.4299,  1.1658,  4.2459,  ..., -2.3259, -2.3262, -2.3259],
+         [ 6.5942,  7.2284,  2.4090,  ..., -6.0129, -6.0121, -6.0127],
+         [ 5.6462,  4.8787,  4.0950,  ..., -4.6460, -4.6455, -4.6457],
+         ...,
+         [-0.4156, -0.0626, -0.0362,  ..., -3.6432, -3.6437, -3.6427],
+         [-0.5679, -0.6902,  0.5267,  ..., -2.6137, -2.6138, -2.6127],
+         [ 0.3688, -0.1350,  1.1764,  ..., -3.4563, -3.4565, -3.4564]]],
+       device='cuda:0')

 You can do this with any model supported by torchtune.
You can find a full list diff --git a/docs/source/tutorials/e2e_flow.rst b/docs/source/tutorials/e2e_flow.rst index c615aacef5..a6c1d561bf 100644 --- a/docs/source/tutorials/e2e_flow.rst +++ b/docs/source/tutorials/e2e_flow.rst @@ -43,26 +43,24 @@ might have can look something like this: In this tutorial, we'll cover how you can use torchtune for all of the above, leveraging integrations with popular tools and libraries from the ecosystem. -We'll use the Llama2 7B model for this tutorial. You can find a complete set of models supported +We'll use the Llama-3.2-3B-Instruct model for this tutorial. You can find a complete set of models supported by torchtune `here `_. | -Download Llama2 7B ------------------- +Download Llama-3.2-3B-Instruct +------------------------------ -In this tutorial, we'll use the Hugging Face model weights for the Llama2 7B mode. For more information on checkpoint formats and how these are handled in torchtune, take a look at this tutorial on :ref:`checkpoints `. -To download the HF format Llama2 7B model, we'll use the tune CLI. +To download the HF format Llama-3.2-3B-Instruct, we'll use the tune CLI. .. code-block:: bash - tune download \ - meta-llama/Llama-2-7b-hf \ - --output-dir \ - --hf-token + tune download meta-llama/Llama-3.2-3B-Instruct \ + --output-dir /tmp/Llama-3.2-3B-Instruct \ + --ignore-patterns "original/consolidated.00.pth" Make a note of ````, we'll use this many times in this tutorial. @@ -75,14 +73,14 @@ For this tutorial, we'll fine-tune the model using LoRA. LoRA is a parameter eff technique which is especially helpful when you don't have a lot of GPU memory to play with. LoRA freezes the base LLM and adds a very small percentage of learnable parameters. This helps keep memory associated with gradients and optimizer state low. Using torchtune, you should be able to -fine-tune a Llama2 7B model with LoRA in less than 16GB of GPU memory using bfloat16 on a +fine-tune a Llama-3.2-3B-Instruct model with LoRA in less than 16GB of GPU memory using bfloat16 on a RTX 3090/4090. For more information on how to use LoRA, take a look at our :ref:`LoRA Tutorial `. We'll fine-tune using our `single device LoRA recipe `_ and use the standard settings from the -`default config `_. +`default config `_. This will fine-tune our model using a ``batch_size=2`` and ``dtype=bfloat16``. With these settings the model should have a peak memory usage of ~16GB and total training time of around two hours for each epoch. @@ -95,19 +93,39 @@ Let's look for the right config for this use case by using the tune CLI. tune ls - RECIPE CONFIG - full_finetune_single_device llama2/7B_full_low_memory - mistral/7B_full_low_memory - full_finetune_distributed llama2/7B_full - llama2/13B_full - mistral/7B_full - lora_finetune_single_device llama2/7B_lora_single_device - llama2/7B_qlora_single_device - mistral/7B_lora_single_device + RECIPE CONFIG + full_finetune_single_device llama2/7B_full_low_memory + code_llama2/7B_full_low_memory + llama3/8B_full_single_device + llama3_1/8B_full_single_device + llama3_2/1B_full_single_device + llama3_2/3B_full_single_device + mistral/7B_full_low_memory + phi3/mini_full_low_memory + qwen2/7B_full_single_device + ... + + + full_finetune_distributed llama2/7B_full + llama2/13B_full + llama3/8B_full + llama3_1/8B_full + llama3_2/1B_full + llama3_2/3B_full + mistral/7B_full + gemma2/9B_full + gemma2/27B_full + phi3/mini_full + qwen2/7B_full + ... 
+ + lora_finetune_single_device llama2/7B_lora_single_device + llama2/7B_qlora_single_device + llama3/8B_lora_single_device ... -For this tutorial we'll use the ``llama2/7B_lora_single_device`` config. +For this tutorial we'll use the ``llama3_2/3B_lora_single_device`` config. The config already points to the HF Checkpointer and the right checkpoint files. All we need to do is update the checkpoint directory for both the model and the @@ -116,31 +134,76 @@ tokenizer. Let's do this using the overrides in the tune CLI while starting trai .. code-block:: bash - tune run lora_finetune_single_device \ - --config llama2/7B_lora_single_device \ - checkpointer.checkpoint_dir= \ - tokenizer.path=/tokenizer.model \ - checkpointer.output_dir= + tune run lora_finetune_single_device --config llama3_2/3B_lora_single_device -Once training is complete, you'll see the following in the logs. +Preparing your artifacts for inference +-------------------------------------- -.. code-block:: bash - - [_checkpointer.py:473] Model checkpoint of size 9.98 GB saved to /hf_model_0001_0.pt +Congrats for getting this far! You have loaded your weights, trained your model, now it's time to visualize +the outputs. A simple way of doing this is by running `tree -a path/to/outputdir`, which should show something like the tree below. +There are 4 types of folders: - [_checkpointer.py:473] Model checkpoint of size 3.50 GB saved to /hf_model_0002_0.pt - - [_checkpointer.py:484] Adapter checkpoint of size 0.01 GB saved to /adapter_0.pt +1) **recipe_state**: Holds recipe_state.pt with the information necessary to restart the last intermediate epoch. For more information, please check our deep-dive :ref:`Checkpointing in torchtune `.; +2) **logs**: Defined in your config in metric_logger; +3) **epoch_{}**: Contains your new trained model weights plus all original files of the model, except the checkpoints, making it easy for you to choose an specific epoch to run inference on or push to a model hub; +.. code-block:: bash -The final trained weights are merged with the original model and split across two checkpoint files -similar to the source checkpoints from the HF Hub -(see the :ref:`LoRA Tutorial ` for more details). -In fact the keys will be identical between these checkpoints. -We also have a third checkpoint file which is much smaller in size -and contains the learnt LoRA adapter weights. For this tutorial, we'll only use the model -checkpoints and not the adapter weights. 
+ >>> tree -a /tmp/torchtune/llama3_2_3B/lora_single_device + /tmp/torchtune/llama3_2_3B/lora_single_device + ├── epoch_0 + │ ├── adapter_config.json + │ ├── adapter_model.pt + │ ├── adapter_model.safetensors + │ ├── config.json + │ ├── ft-model-00001-of-00002.safetensors + │ ├── ft-model-00002-of-00002.safetensors + │ ├── generation_config.json + │ ├── LICENSE.txt + │ ├── model.safetensors.index.json + │ ├── original + │ │ ├── orig_params.json + │ │ ├── params.json + │ │ └── tokenizer.model + │ ├── original_repo_id.json + │ ├── README.md + │ ├── special_tokens_map.json + │ ├── tokenizer_config.json + │ ├── tokenizer.json + │ └── USE_POLICY.md + ├── epoch_1 + │ ├── adapter_config.json + │ ├── adapter_model.pt + │ ├── adapter_model.safetensors + │ ├── config.json + │ ├── ft-model-00001-of-00002.safetensors + │ ├── ft-model-00002-of-00002.safetensors + │ ├── generation_config.json + │ ├── LICENSE.txt + │ ├── model.safetensors.index.json + │ ├── original + │ │ ├── orig_params.json + │ │ ├── params.json + │ │ └── tokenizer.model + │ ├── original_repo_id.json + │ ├── README.md + │ ├── special_tokens_map.json + │ ├── tokenizer_config.json + │ ├── tokenizer.json + │ └── USE_POLICY.md + ├── logs + │ └── log_1734652101.txt + └── recipe_state + └── recipe_state.pt + +Let's understand the files: + +- `adapter_model.safetensors` and `adapter_model.pt` are your LoRA trained adapter weights. We save a duplicated .pt version of it to facilitate resuming from checkpoint. +- `ft-model-{}-of-{}.safetensors` are your trained full model weights (not adapters). When LoRA finetuning, these are only present if we set ``save_adapter_weights_only=False``. In that case, we merge the merged base model with trained adapters, making inference easier. +- `adapter_config.json` is used by Huggingface PEFT when loading an adapter (more on that later); +- `model.safetensors.index.json` is used by Huggingface .from_pretrained when loading the model weights (more on that later) +- All other files were originally in the checkpoint_dir. They are automatically copied during training. Files over 100MiB and ending on .safetensors, .pth, .pt, .bin are ignored, making it lightweight. | @@ -164,78 +227,68 @@ modifying its associated config ``eleuther_evaluation.yaml``. to install the EleutherAI evaluation harness. Since we plan to update all of the checkpoint files to point to our fine-tuned checkpoints, -let's first copy over the config to our local working directory so we can make changes. This -will be easier than overriding all of these elements through the CLI. +let's first copy over the config to our local working directory so we can make changes. .. code-block:: bash tune cp eleuther_evaluation ./custom_eval_config.yaml \ -For this tutorial we'll use the `truthfulqa_mc2 `_ task from the harness. -This task measures a model's propensity to be truthful when answering questions and -measures the model's zero-shot accuracy on a question followed by one or more true -responses and one or more false responses. Let's first run a baseline without fine-tuning. - - -.. code-block:: bash - - tune run eleuther_eval --config ./custom_eval_config.yaml - checkpointer.checkpoint_dir= \ - tokenizer.path=/tokenizer.model - - [evaluator.py:324] Running loglikelihood requests - [eleuther_eval.py:195] Eval completed in 121.27 seconds. - [eleuther_eval.py:197] truthfulqa_mc2: {'acc,none': 0.388... +Then, in your config, you only need to replace two fields: ``output_dir`` and ``checkpoint_files``. 
Notice +that we are using the merged weights, and not the LoRA adapters. -The model has an accuracy around 38.8%. Let's compare this with the fine-tuned model. +.. code-block:: yaml + # TODO: update to your desired epoch + output_dir: /tmp/torchtune/llama3_2_3B/lora_single_device/epoch_0 -First, we modify ``custom_eval_config.yaml`` to include the fine-tuned checkpoints. + # Tokenizer + tokenizer: + _component_: torchtune.models.llama3.llama3_tokenizer + path: ${output_dir}/original/tokenizer.model -.. code-block:: yaml + model: + # Notice that we don't pass the lora model. We are using the merged weights, + _component_: torchtune.models.llama3_2.llama3_2_3b checkpointer: _component_: torchtune.training.FullModelHFCheckpointer - - # directory with the checkpoint files - # this should match the output_dir specified during - # finetuning - checkpoint_dir: - - # checkpoint files for the fine-tuned model. This should - # match what's shown in the logs above + checkpoint_dir: ${output_dir} checkpoint_files: [ - hf_model_0001_0.pt, - hf_model_0002_0.pt, + ft-model-00001-of-00002.safetensors, + ft-model-00002-of-00002.safetensors, ] + output_dir: ${output_dir} + model_type: LLAMA3_2 - output_dir: - model_type: LLAMA2 + ### OTHER PARAMETERS -- NOT RELATED TO THIS CHECKPOINT - # Make sure to update the tokenizer path to the right - # checkpoint directory as well - tokenizer: - _component_: torchtune.models.llama2.llama2_tokenizer - path: /tokenizer.model + # Environment + device: cuda + dtype: bf16 + seed: 1234 # It is not recommended to change this seed, b/c it matches EleutherAI's default seed + # EleutherAI specific eval args + tasks: ["truthfulqa_mc2"] + limit: null + max_seq_length: 4096 + batch_size: 8 + enable_kv_cache: True -Now, let's run the recipe. + # Quantization specific args + quantizer: null -.. code-block:: bash +For this tutorial we'll use the `truthfulqa_mc2 `_ task from the harness. - tune run eleuther_eval --config ./custom_eval_config.yaml +This task measures a model's propensity to be truthful when answering questions and +measures the model's zero-shot accuracy on a question followed by one or more true +responses and one or more false responses -The results should look something like this. +.. code-block:: yaml -.. code-block:: bash + tune run eleuther_eval --config ./custom_eval_config.yaml [evaluator.py:324] Running loglikelihood requests - [eleuther_eval.py:195] Eval completed in 121.27 seconds. - [eleuther_eval.py:197] truthfulqa_mc2: {'acc,none': 0.489 ... - -Our fine-tuned model gets ~48% on this task, which is ~10 points -better than the baseline. Great! Seems like our fine-tuning helped. | @@ -257,64 +310,74 @@ Let's first copy over the config to our local working directory so we can make c tune cp generation ./custom_generation_config.yaml -Let's modify ``custom_generation_config.yaml`` to include the following changes. +Let's modify ``custom_generation_config.yaml`` to include the following changes. Again, you only need + to replace two fields: ``output_dir`` and ``checkpoint_files`` .. 
code-block:: yaml - checkpointer: - _component_: torchtune.training.FullModelHFCheckpointer + output_dir: /tmp/torchtune/llama3_2_3B/lora_single_device/epoch_0 - # directory with the checkpoint files - # this should match the output_dir specified during - # finetuning - checkpoint_dir: + # Tokenizer + tokenizer: + _component_: torchtune.models.llama3.llama3_tokenizer + path: ${output_dir}/original/tokenizer.model + prompt_template: null - # checkpoint files for the fine-tuned model. This should - # match what's shown in the logs above + model: + # Notice that we don't pass the lora model. We are using the merged weights, + _component_: torchtune.models.llama3_2.llama3_2_3b + + checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: ${output_dir} checkpoint_files: [ - hf_model_0001_0.pt, - hf_model_0002_0.pt, + ft-model-00001-of-00002.safetensors, + ft-model-00002-of-00002.safetensors, ] + output_dir: ${output_dir} + model_type: LLAMA3_2 - output_dir: - model_type: LLAMA2 + ### OTHER PARAMETERS -- NOT RELATED TO THIS CHECKPOINT - # Make sure to update the tokenizer path to the right - # checkpoint directory as well - tokenizer: - _component_: torchtune.models.llama2.llama2_tokenizer - path: /tokenizer.model + device: cuda + dtype: bf16 + + seed: 1234 + + # Generation arguments; defaults taken from gpt-fast + prompt: + system: null + user: "Tell me a joke. " + max_new_tokens: 300 + temperature: 0.6 # 0.8 and 0.6 are popular values to try + top_k: 300 + + enable_kv_cache: True + quantizer: null Once the config is updated, let's kick off generation! We'll use the default settings for sampling with ``top_k=300`` and a ``temperature=0.8``. These parameters control how the probabilities for -sampling are computed. These are standard settings for Llama2 7B and -we recommend inspecting the model with these before playing around with +sampling are computed. We recommend inspecting the model with these before playing around with these parameters. -We'll use a different prompt from the one in the config - .. code-block:: bash tune run generate --config ./custom_generation_config.yaml \ - prompt="What are some interesting sites to visit in the Bay Area?" + prompt="tell me a joke. " Once generation is complete, you'll see the following in the logs. -.. code-block:: bash - - [generate.py:92] Exploratorium in San Francisco has made the cover of Time Magazine, - and its awesome. And the bridge is pretty cool... +.. code-block:: - [generate.py:96] Time for inference: 11.61 sec total, 25.83 tokens/sec - [generate.py:99] Memory used: 15.72 GB + Tell me a joke. Here's a joke for you: + What do you call a fake noodle? -Indeed, the bridge is pretty cool! Seems like our LLM knows a little something about the -Bay Area! + An impasta! | @@ -348,101 +411,143 @@ conversion is that you can directly work with standard formats. This helps with interoperability with other libraries since torchtune doesn't add yet another format to the mix. -Let's take a look at an example of how this would work with a popular codebase -used for running performant inference with LLMs - -`gpt-fast `_. This section -assumes that you've cloned that repository on your machine. +Let's start with huggingface -``gpt-fast`` makes some assumptions about the checkpoint and the availability of -the key-to-file mapping i.e. a file mapping parameter names to the files containing them. -Let's satisfy these assumptions, by creating this mapping -file. 
Let's assume we'll be using ``/Llama-2-7B-hf`` as the directory -for this. ``gpt-fast`` assumes that the directory with checkpoints has the -same format at the HF repo-id. +**Case 1: HF using BASE MODEL + trained adapter** + +Here we load the base model from HF model hub. Then we load the adapters on top of it using PeftModel. +It will look for the files adapter_model.safetensors for the weights and adapter_config.json for where to insert them. .. code-block:: python - import json - import torch + from peft import PeftModel + from transformers import AutoModelForCausalLM, AutoTokenizer - # create the output dictionary - output_dict = {"weight_map": {}} + #TODO: update it to your chosen epoch + trained_model_path = "/tmp/torchtune/llama3_2_3B/lora_single_device/epoch_0" - # Load the checkpoints - sd_1 = torch.load('/hf_model_0001_0.pt', mmap=True, map_location='cpu') - sd_2 = torch.load('/hf_model_0002_0.pt', mmap=True, map_location='cpu') + # Define the model and adapter paths + original_model_name = "meta-llama/Llama-3.2-1B-Instruct" - # create the weight map - for key in sd_1.keys(): - output_dict['weight_map'][key] = "hf_model_0001_0.pt" - for key in sd_2.keys(): - output_dict['weight_map'][key] = "hf_model_0002_0.pt" + model = AutoModelForCausalLM.from_pretrained(original_model_name) - with open('/Llama-2-7B-hf/pytorch_model.bin.index.json', 'w') as f: - json.dump(output_dict, f) + # huggingface will look for adapter_model.safetensors and adapter_config.json + peft_model = PeftModel.from_pretrained(model, trained_model_path) + # Load the tokenizer + tokenizer = AutoTokenizer.from_pretrained(original_model_name) -Now that we've created the weight_map, let's copy over our checkpoints. + # Function to generate text + def generate_text(model, tokenizer, prompt, max_length=50): + inputs = tokenizer(prompt, return_tensors="pt") + outputs = model.generate(**inputs, max_length=max_length) + return tokenizer.decode(outputs[0], skip_special_tokens=True) -.. code-block:: bash + prompt = "tell me a joke: '" + print("Base model output:", generate_text(peft_model, tokenizer, prompt)) - cp /hf_model_0001_0.pt /Llama-2-7B-hf/ - cp /hf_model_0002_0.pt /Llama-2-7B-hf/ - cp /tokenizer.model /Llama-2-7B-hf/ +**Case 2: HF using merged full+adapter weights** -Once the directory structure is setup, let's convert the checkpoints and run inference! +In this case, HF will check in model.safetensors.index.json which files it should load. -.. code-block:: bash +.. 
code-block:: python + + from transformers import AutoModelForCausalLM, AutoTokenizer + + #TODO: update it to your chosen epoch + trained_model_path = "/tmp/torchtune/llama3_2_3B/lora_single_device/epoch_0" + + model = AutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path=trained_model_path, + ) - cd gpt-fast/ + # Load the tokenizer + tokenizer = AutoTokenizer.from_pretrained(trained_model_path, safetensors=True) - # convert the checkpoints into a format readable by gpt-fast - python scripts/convert_hf_checkpoint.py \ - --checkpoint_dir /Llama-2-7B-hf/ \ - --model 7B - # run inference using the converted model - python generate.py \ - --compile \ - --checkpoint_path /Llama-2-7B-hf/model.pth \ - --device cuda + # Function to generate text + def generate_text(model, tokenizer, prompt, max_length=50): + inputs = tokenizer(prompt, return_tensors="pt") + outputs = model.generate(**inputs, max_length=max_length) + return tokenizer.decode(outputs[0], skip_special_tokens=True) -The output should look something like this: + + prompt = "Complete the sentence: 'Once upon a time...'" + print("Base model output:", generate_text(model, tokenizer, prompt)) + +**Case 3: vLLM using merged full+adapter weights** + +It will load any .safetensors file. Since here we mixed both the full model weights and adapter weights, we have to delete the +adapter weights to succesfully load it. .. code-block:: bash - Hello, my name is Justin. I am a middle school math teacher - at WS Middle School ... + rm /tmp/torchtune/llama3_2_3B/lora_single_device/base_model/adapter_model.safetensors - Time for inference 5: 1.94 sec total, 103.28 tokens/sec - Bandwidth achieved: 1391.84 GB/sec +Now we can run the script +.. code-block:: python -And thats it! Try your own prompt! + from vllm import LLM, SamplingParams + + def print_outputs(outputs): + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + print("-" * 80) + + #TODO: update it to your chosen epoch + llm = LLM( + model="/tmp/torchtune/llama3_2_3B/lora_single_device/epoch_0", + load_format="safetensors", + kv_cache_dtype="auto", + ) + sampling_params = SamplingParams(max_tokens=16, temperature=0.5) + + conversation = [ + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hello! How can I assist you today?"}, + { + "role": "user", + "content": "Write an essay about the importance of higher education.", + }, + ] + outputs = llm.chat(conversation, sampling_params=sampling_params, use_tqdm=False) + print_outputs(outputs) Uploading your model to the Hugging Face Hub -------------------------------------------- Your new model is working great and you want to share it with the world. The easiest way to do this -is utilizing the `huggingface-cli `_ command, which works seamlessly with torchtune. Simply point the CLI -to your finetuned model directory like so: +is utilizing the `huggingface_hub `_. -.. code-block:: bash +.. code-block:: python - huggingface-cli upload + import huggingface_hub + api = huggingface_hub.HfApi() -The command should output a link to your repository on the Hub. If the repository doesn't exist yet, it will -be created automatically: + #TODO: update it to your chosen epoch + trained_model_path = "/tmp/torchtune/llama3_2_3B/lora_single_device/epoch_0" -.. 
code-block:: text + username = huggingface_hub.whoami()["name"] + repo_name = "my-model-trained-with-torchtune" - https://huggingface.co//tree/main/. + # if the repo doesn't exist + repo_id = huggingface_hub.create_repo(repo_name).repo_id -.. note:: + # if it already exists + repo_id = f"{username}/{repo_name}" - Before uploading, make sure you are `authenticated with Hugging Face `_ by running ``huggingface-cli login``. + api.upload_folder( + folder_path=trained_model_path, + repo_id=repo_id, + repo_type="model", + create_pr=False + ) -For more details on the ``huggingface-cli upload`` feature check out the `Hugging Face docs `_. +If you prefer, you can also try the cli version `huggingface-cli upload `_. | From 0cd8bc4ca57db6f04c37be41511c3a33b94d7fcf Mon Sep 17 00:00:00 2001 From: Joe Cummings Date: Fri, 20 Dec 2024 18:52:50 +0000 Subject: [PATCH 6/9] Update E2E Tutorial w/ vLLM and HF Hub (#2192) Co-authored-by: Felipe Mello Co-authored-by: salman --- docs/source/tutorials/e2e_flow.rst | 320 +++++++++++++---------------- 1 file changed, 139 insertions(+), 181 deletions(-) diff --git a/docs/source/tutorials/e2e_flow.rst b/docs/source/tutorials/e2e_flow.rst index a6c1d561bf..21571d2e30 100644 --- a/docs/source/tutorials/e2e_flow.rst +++ b/docs/source/tutorials/e2e_flow.rst @@ -7,7 +7,7 @@ End-to-End Workflow with torchtune In this tutorial, we'll walk through an end-to-end example of how you can fine-tune, evaluate, optionally quantize and then run generation with your favorite LLM using torchtune. We'll also go over how you can use some popular tools and libraries -from the community seemlessly with torchtune. +from the community seamlessly with torchtune. .. grid:: 2 @@ -25,49 +25,32 @@ from the community seemlessly with torchtune. :ref:`checkpoints ` -Overview --------- +Finetune your model +------------------- -Fine-tuning an LLM is usually only one step in a larger workflow. An example workflow that you -might have can look something like this: +First, let's download a model using the tune CLI. The following command will download the `Llama3.2 3B Instruct `_ +model from the Hugging Face Hub and save it the local filesystem. Hugging Face uploaded the original +weights (``consolidated.00.pth``) and the weights compatible with the `from_pretrained() `_ API (``*.safetensors``). +We don't need both so we'll ignore the original weights when downloading. -- Download a popular model from `HF Hub `_ -- Fine-tune the model using a relevant fine-tuning technique. The exact technique used - will depend on factors such as the model, amount and nature of training data, your hardware - setup and the end task for which the model will be used -- Evaluate the model on some benchmarks to validate model quality -- Run some generations to make sure the model output looks reasonable -- Quantize the model for efficient inference -- [Optional] Export the model for specific environments such as inference on a mobile phone +.. code-block:: text -In this tutorial, we'll cover how you can use torchtune for all of the above, leveraging -integrations with popular tools and libraries from the ecosystem. - -We'll use the Llama-3.2-3B-Instruct model for this tutorial. You can find a complete set of models supported -by torchtune `here `_. - -| - -Download Llama-3.2-3B-Instruct ------------------------------- - -For more information on checkpoint formats and how these are handled in torchtune, take a look at -this tutorial on :ref:`checkpoints `. 
- -To download the HF format Llama-3.2-3B-Instruct, we'll use the tune CLI. - -.. code-block:: bash - - tune download meta-llama/Llama-3.2-3B-Instruct \ - --output-dir /tmp/Llama-3.2-3B-Instruct \ - --ignore-patterns "original/consolidated.00.pth" - -Make a note of ````, we'll use this many times in this tutorial. + $ tune download meta-llama/Llama-3.2-3B-Instruct --ignore-patterns "original/consolidated.00.pth" + Successfully downloaded model repo and wrote to the following locations: + /tmp/Llama-3.2-3B-Instruct/.cache + /tmp/Llama-3.2-3B-Instruct/.gitattributes + /tmp/Llama-3.2-3B-Instruct/LICENSE.txt + /tmp/Llama-3.2-3B-Instruct/README.md + /tmp/Llama-3.2-3B-Instruct/USE_POLICY.md + /tmp/Llama-3.2-3B-Instruct/config.json + /tmp/Llama-3.2-3B-Instruct/generation_config.json + /tmp/Llama-3.2-3B-Instruct/model-00001-of-00002.safetensors + ... -| +.. note:: -Finetune the model using LoRA ------------------------------ + For a list of all other models you can finetune out-of-the-box with torchtune, check out + our :ref:`models page`. For this tutorial, we'll fine-tune the model using LoRA. LoRA is a parameter efficient fine-tuning technique which is especially helpful when you don't have a lot of GPU memory to play with. LoRA @@ -77,22 +60,11 @@ fine-tune a Llama-3.2-3B-Instruct model with LoRA in less than 16GB of GPU memor RTX 3090/4090. For more information on how to use LoRA, take a look at our :ref:`LoRA Tutorial `. -We'll fine-tune using our -`single device LoRA recipe `_ -and use the standard settings from the -`default config `_. - -This will fine-tune our model using a ``batch_size=2`` and ``dtype=bfloat16``. With these settings the model -should have a peak memory usage of ~16GB and total training time of around two hours for each epoch. -We'll need to make some changes to the config to make sure our recipe can access the -right checkpoints. - Let's look for the right config for this use case by using the tune CLI. -.. code-block:: bash - - tune ls +.. code-block:: text + $ tune ls RECIPE CONFIG full_finetune_single_device llama2/7B_full_low_memory code_llama2/7B_full_low_memory @@ -125,105 +97,102 @@ Let's look for the right config for this use case by using the tune CLI. ... -For this tutorial we'll use the ``llama3_2/3B_lora_single_device`` config. - -The config already points to the HF Checkpointer and the right checkpoint files. -All we need to do is update the checkpoint directory for both the model and the -tokenizer. Let's do this using the overrides in the tune CLI while starting training! - - -.. code-block:: bash - - tune run lora_finetune_single_device --config llama3_2/3B_lora_single_device - - -Preparing your artifacts for inference --------------------------------------- +We'll fine-tune using our +:ref:`single device LoRA recipe ` +and use the standard settings from the +`default config `_. -Congrats for getting this far! You have loaded your weights, trained your model, now it's time to visualize -the outputs. A simple way of doing this is by running `tree -a path/to/outputdir`, which should show something like the tree below. -There are 4 types of folders: +This will fine-tune our model using a ``batch_size=4`` and ``dtype=bfloat16``. With these settings the model +should have a peak memory usage of ~16GB and total training time of around 2-3 hours for each epoch. + +.. code-block:: text + + $ tune run lora_finetune_single_device --config llama3_2/3B_lora_single_device + Setting manual seed to local seed 3977464327. 
Local seed is seed + rank = 3977464327 + 0 + Hint: enable_activation_checkpointing is True, but enable_activation_offloading isn't. Enabling activation offloading should reduce memory further. + Writing logs to /tmp/torchtune/llama3_2_3B/lora_single_device/logs/log_1734708879.txt + Model is initialized with precision torch.bfloat16. + Memory stats after model init: + GPU peak memory allocation: 6.21 GiB + GPU peak memory reserved: 6.27 GiB + GPU peak memory active: 6.21 GiB + Tokenizer is initialized from file. + Optimizer and loss are initialized. + Loss is initialized. + Dataset and Sampler are initialized. + Learning rate scheduler is initialized. + Profiling disabled. + Profiler config after instantiation: {'enabled': False} + 1|3|Loss: 1.943998098373413: 0%| | 3/1617 [00:21<3:04:47, 6.87s/it] + +Congrats on training your model! Let's take a look at the artifacts produced by torchtune. A simple way of doing this is by running :code:`tree -a path/to/outputdir`, which should show something like the tree below. +There are 3 types of folders: 1) **recipe_state**: Holds recipe_state.pt with the information necessary to restart the last intermediate epoch. For more information, please check our deep-dive :ref:`Checkpointing in torchtune `.; -2) **logs**: Defined in your config in metric_logger; -3) **epoch_{}**: Contains your new trained model weights plus all original files of the model, except the checkpoints, making it easy for you to choose an specific epoch to run inference on or push to a model hub; - -.. code-block:: bash - - >>> tree -a /tmp/torchtune/llama3_2_3B/lora_single_device - /tmp/torchtune/llama3_2_3B/lora_single_device - ├── epoch_0 - │ ├── adapter_config.json - │ ├── adapter_model.pt - │ ├── adapter_model.safetensors - │ ├── config.json - │ ├── ft-model-00001-of-00002.safetensors - │ ├── ft-model-00002-of-00002.safetensors - │ ├── generation_config.json - │ ├── LICENSE.txt - │ ├── model.safetensors.index.json - │ ├── original - │ │ ├── orig_params.json - │ │ ├── params.json - │ │ └── tokenizer.model - │ ├── original_repo_id.json - │ ├── README.md - │ ├── special_tokens_map.json - │ ├── tokenizer_config.json - │ ├── tokenizer.json - │ └── USE_POLICY.md - ├── epoch_1 - │ ├── adapter_config.json - │ ├── adapter_model.pt - │ ├── adapter_model.safetensors - │ ├── config.json - │ ├── ft-model-00001-of-00002.safetensors - │ ├── ft-model-00002-of-00002.safetensors - │ ├── generation_config.json - │ ├── LICENSE.txt - │ ├── model.safetensors.index.json - │ ├── original - │ │ ├── orig_params.json - │ │ ├── params.json - │ │ └── tokenizer.model - │ ├── original_repo_id.json - │ ├── README.md - │ ├── special_tokens_map.json - │ ├── tokenizer_config.json - │ ├── tokenizer.json - │ └── USE_POLICY.md - ├── logs - │ └── log_1734652101.txt - └── recipe_state - └── recipe_state.pt +2) **logs**: Contains all the logging output from your training run: loss, memory, exceptions, etc. +3) **epoch_{}**: Contains your trained model weights plus model metadata. If running inference or pushing to a model hub, you should use this folder directly. + + +.. 
code-block:: text
+
+    $ tree -a /tmp/torchtune/llama3_2_3B/lora_single_device
+    /tmp/torchtune/llama3_2_3B/lora_single_device
+    ├── epoch_0
+    │   ├── adapter_config.json
+    │   ├── adapter_model.pt
+    │   ├── adapter_model.safetensors
+    │   ├── config.json
+    │   ├── ft-model-00001-of-00002.safetensors
+    │   ├── ft-model-00002-of-00002.safetensors
+    │   ├── generation_config.json
+    │   ├── LICENSE.txt
+    │   ├── model.safetensors.index.json
+    │   ├── original
+    │   │   ├── orig_params.json
+    │   │   ├── params.json
+    │   │   └── tokenizer.model
+    │   ├── original_repo_id.json
+    │   ├── README.md
+    │   ├── special_tokens_map.json
+    │   ├── tokenizer_config.json
+    │   ├── tokenizer.json
+    │   └── USE_POLICY.md
+    ├── epoch_1
+    │   ├── adapter_config.json
+    │   ...
+    ├── logs
+    │   └── log_1734652101.txt
+    └── recipe_state
+        └── recipe_state.pt

 Let's understand the files:

-- `adapter_model.safetensors` and `adapter_model.pt` are your LoRA trained adapter weights. We save a duplicated .pt version of it to facilitate resuming from checkpoint.
-- `ft-model-{}-of-{}.safetensors` are your trained full model weights (not adapters). When LoRA finetuning, these are only present if we set ``save_adapter_weights_only=False``. In that case, we merge the merged base model with trained adapters, making inference easier.
-- `adapter_config.json` is used by Huggingface PEFT when loading an adapter (more on that later);
-- `model.safetensors.index.json` is used by Huggingface .from_pretrained when loading the model weights (more on that later)
+- ``adapter_model.safetensors`` and ``adapter_model.pt`` are your LoRA-trained adapter weights. We save a duplicate .pt version to make it easier to resume from a checkpoint.
+- ``ft-model-{}-of-{}.safetensors`` are your trained full model weights (not adapters). When LoRA finetuning, these are only present if we set ``save_adapter_weights_only=False``. In that case, we merge the base model with the trained adapters, making inference easier.
+- ``adapter_config.json`` is used by Hugging Face PEFT when loading an adapter (more on that later);
+- ``model.safetensors.index.json`` is used by Hugging Face ``from_pretrained()`` when loading the model weights (more on that later)
 - All other files were originally in the checkpoint_dir. They are automatically copied during training. Files over 100MiB and ending in .safetensors, .pth, .pt, .bin are ignored, keeping the epoch folders lightweight.

-|
+Evaluate your model
+-------------------

-.. _eval_harness_label:
+We've fine-tuned a model. But how well does this model really do? Let's determine this through structured evaluation and playing around with it.

-Run Evaluation using EleutherAI's Eval Harness
-----------------------------------------------
+.. _eval_harness_label:

-We've fine-tuned a model. But how well does this model really do? Let's run some Evaluations!
+Run evals using EleutherAI's Eval Harness
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. TODO (SalmanMohammadi) ref eval recipe docs

 torchtune integrates with `EleutherAI's evaluation harness `_. An example of this is available through the
-``eleuther_eval`` recipe. In this tutorial, we're going to directly use this recipe by
-modifying its associated config ``eleuther_evaluation.yaml``.
+`eleuther_eval `_ recipe. In this tutorial, we're going to directly use this recipe by
+modifying its associated config `eleuther_evaluation.yaml `_.

 .. 
note:: - For this section of the tutorial, you should first run :code:`pip install lm_eval==0.4.*` + For this section of the tutorial, you should first run :code:`pip install lm_eval>=0.4.5` to install the EleutherAI evaluation harness. Since we plan to update all of the checkpoint files to point to our fine-tuned checkpoints, @@ -231,10 +200,10 @@ let's first copy over the config to our local working directory so we can make c .. code-block:: bash - tune cp eleuther_evaluation ./custom_eval_config.yaml \ + $ tune cp eleuther_evaluation ./custom_eval_config.yaml + Copied file to custom_eval_config.yaml -Then, in your config, you only need to replace two fields: ``output_dir`` and ``checkpoint_files``. Notice -that we are using the merged weights, and not the LoRA adapters. +Notice that we are using the merged weights, and not the LoRA adapters. .. code-block:: yaml @@ -281,19 +250,16 @@ For this tutorial we'll use the `truthfulqa_mc2 `_. - Let's first copy over the config to our local working directory so we can make changes. -.. code-block:: bash +.. code-block:: text - tune cp generation ./custom_generation_config.yaml + $ tune cp generation ./custom_generation_config.yaml + Copied file to custom_generation_config.yaml Let's modify ``custom_generation_config.yaml`` to include the following changes. Again, you only need to replace two fields: ``output_dir`` and ``checkpoint_files`` @@ -362,27 +328,17 @@ default settings for sampling with ``top_k=300`` and a sampling are computed. We recommend inspecting the model with these before playing around with these parameters. -.. code-block:: bash - - tune run generate --config ./custom_generation_config.yaml \ - prompt="tell me a joke. " - - -Once generation is complete, you'll see the following in the logs. - - -.. code-block:: +.. code-block:: text + $ tune run generate --config ./custom_generation_config.yaml prompt="tell me a joke. " Tell me a joke. Here's a joke for you: What do you call a fake noodle? An impasta! -| - -Speeding up Generation using Quantization ------------------------------------------ +Introduce some quantization +~~~~~~~~~~~~~~~~~~~~~~~~~~~ We rely on `torchao `_ for `post-training quantization `_. To quantize the fine-tuned model after installing torchao we can run the following command:: @@ -401,22 +357,20 @@ For Llama models, you can run generation directly in torchao on the quantized mo discussed in `this readme `_. This way you can compare your own results to those in the previously-linked table. -| - -Using torchtune checkpoints with other libraries ------------------------------------------------- +Use your model in the wild +-------------------------- -As we mentioned above, one of the benefits of handling of the checkpoint -conversion is that you can directly work with standard formats. This helps -with interoperability with other libraries since torchtune doesn't add yet -another format to the mix. +Let's say we're happy with how our model is performing at this point - we want to do something with it! Productionize for serving, publish on the Hugging Face Hub, etc. +As we mentioned above, one of the benefits of handling of the checkpoint conversion is that you can directly work with standard formats. This helps +with interoperability with other libraries since torchtune doesn't add yet another format to the mix. 
-Let's start with huggingface
+Use with Hugging Face ``from_pretrained()``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-**Case 1: HF using BASE MODEL + trained adapter**
+**Case 1: Hugging Face using base model + trained adapters**

-Here we load the base model from HF model hub. Then we load the adapters on top of it using PeftModel.
-It will look for the files adapter_model.safetensors for the weights and adapter_config.json for where to insert them.
+Here we load the base model from the Hugging Face model hub. Then we load the adapters on top of it using `PeftModel `_.
+It will look for the files ``adapter_model.safetensors`` for the weights and ``adapter_config.json`` for where to insert them.

 .. code-block:: python

@@ -446,9 +400,9 @@ It will look for the files adapter_model.safetensors for the weights and adapter
     prompt = "tell me a joke: '"
     print("Base model output:", generate_text(peft_model, tokenizer, prompt))

-**Case 2: HF using merged full+adapter weights**
+**Case 2: Hugging Face using merged weights**

-In this case, HF will check in model.safetensors.index.json which files it should load.
+In this case, Hugging Face will check in ``model.safetensors.index.json`` for which files it should load.

 .. code-block:: python

@@ -475,16 +429,20 @@ In this case, HF will check in model.safetensors.index.json which files it shoul
     prompt = "Complete the sentence: 'Once upon a time...'"
     print("Base model output:", generate_text(model, tokenizer, prompt))

-**Case 3: vLLM using merged full+adapter weights**
+Use with vLLM
+~~~~~~~~~~~~~

-It will load any .safetensors file. Since here we mixed both the full model weights and adapter weights, we have to delete the
+`vLLM `_ is a fast and easy-to-use library for LLM inference and serving. It includes a lot of awesome features like
+state-of-the-art serving throughput, continuous batching of incoming requests, quantization, and speculative decoding.
+
+The library will load any .safetensors file. Since this output folder contains both the full model weights and the adapter weights, we have to delete the
 adapter weights to successfully load it.

-.. code-block:: bash
+.. code-block:: bash

    rm /tmp/torchtune/llama3_2_3B/lora_single_device/base_model/adapter_model.safetensors

-Now we can run the script
+Now we can run the following script:

 .. code-block:: python

@@ -517,8 +475,8 @@ Now we can run the script
     outputs = llm.chat(conversation, sampling_params=sampling_params, use_tqdm=False)
     print_outputs(outputs)

-Uploading your model to the Hugging Face Hub
---------------------------------------------
+Upload your model to the Hugging Face Hub
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 Your new model is working great and you want to share it with the world. The easiest way to do this is utilizing the
 `huggingface_hub `_.
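
For reference, a minimal upload sketch (editor's note, not part of the patch above) could look like the following. It assumes you have already authenticated with ``huggingface-cli login``, that the merged weights live in the ``epoch_1`` folder shown earlier in the tutorial, and that the repo name is a placeholder you would replace with your own::

    import huggingface_hub

    # Path and repo name are placeholders for illustration only
    trained_model_path = "/tmp/torchtune/llama3_2_3B/lora_single_device/epoch_1"
    repo_name = "my-model-trained-with-torchtune"

    # Create the repo on first use; exist_ok=True makes the call idempotent
    repo_id = huggingface_hub.create_repo(repo_name, exist_ok=True).repo_id

    # Upload every file in the epoch folder to a model repo
    huggingface_hub.upload_folder(
        folder_path=trained_model_path,
        repo_id=repo_id,
        repo_type="model",
    )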
From 6a53242d095887f638ae30c63efde8ed837c0bd5 Mon Sep 17 00:00:00 2001 From: gmagogsfm Date: Fri, 20 Dec 2024 12:58:23 -0800 Subject: [PATCH 7/9] pytorch/torchtune/tests/torchtune/modules/_export Differential Revision: D67388194 Pull Request resolved: https://github.com/pytorch/torchtune/pull/2179 --- tests/torchtune/modules/_export/test_attention.py | 1 + .../modules/_export/test_export_position_embeddings.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/torchtune/modules/_export/test_attention.py b/tests/torchtune/modules/_export/test_attention.py index ed2c022c3e..b65778c55b 100644 --- a/tests/torchtune/modules/_export/test_attention.py +++ b/tests/torchtune/modules/_export/test_attention.py @@ -159,6 +159,7 @@ def test_attention_export(self): (self.x, self.x), kwargs={"input_pos": self.input_pos}, dynamic_shapes=self.dynamic_shapes, + strict=True, ) et_res = et_mha_ep.module()(self.x, self.x, input_pos=self.input_pos) tt_res = self.tt_mha(self.x, self.x, input_pos=self.input_pos) diff --git a/tests/torchtune/modules/_export/test_export_position_embeddings.py b/tests/torchtune/modules/_export/test_export_position_embeddings.py index 20bfb84deb..6907ca3edd 100644 --- a/tests/torchtune/modules/_export/test_export_position_embeddings.py +++ b/tests/torchtune/modules/_export/test_export_position_embeddings.py @@ -51,7 +51,6 @@ def test_tile_positional_embedding_smoke(self): torch_version_ge("2.6.0.dev20241117"), reason="Need recent fixes for export" ) def test_tile_positional_embedding_export(self): - tpe_ep = torch.export.export( self.tpe, (self.x, self.aspect_ratio), @@ -59,6 +58,7 @@ def test_tile_positional_embedding_export(self): self.dynamic_shape, None, ), # assuming aspect ratio is static + strict=True, ) y = tpe_ep.module()(self.x, self.aspect_ratio) @@ -129,7 +129,6 @@ def test_tiled_token_positional_embedding_smoke(self): torch_version_ge("2.6.0.dev20241117"), reason="Need recent fixes for export" ) def test_tiled_token_positional_embedding_export(self): - tpe_ep = torch.export.export( self.tpe, (self.x, self.aspect_ratio), @@ -137,6 +136,7 @@ def test_tiled_token_positional_embedding_export(self): self.dynamic_shape, None, ), # assuming aspect ratio is static + strict=True, ) y = tpe_ep.module()(self.x, self.aspect_ratio) @@ -155,6 +155,7 @@ def test_tiled_token_positional_embedding_aoti(self): self.dynamic_shape, None, ), # assuming aspect ratio is static + strict=True, ) with tempfile.TemporaryDirectory() as tmpdir: From 002b17cf8204e1e98fcfa0aca5dfb3b485d913fe Mon Sep 17 00:00:00 2001 From: Felipe Mello Date: Fri, 20 Dec 2024 16:34:26 -0500 Subject: [PATCH 8/9] update torchtune version (#2195) Co-authored-by: Felipe Mello --- version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.txt b/version.txt index 8f0916f768..a918a2aa18 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.5.0 +0.6.0 From aa8f365f91a69aa36aaea14cf6f03ccd45310bb6 Mon Sep 17 00:00:00 2001 From: akashc1 <43617927+akashc1@users.noreply.github.com> Date: Sat, 21 Dec 2024 06:39:30 -0800 Subject: [PATCH 9/9] [metric_logging][wandb] Fix wandb metric logger config save path (#2196) --- torchtune/training/metric_logging.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchtune/training/metric_logging.py b/torchtune/training/metric_logging.py index a6189f10e1..42aa1f9d72 100644 --- a/torchtune/training/metric_logging.py +++ b/torchtune/training/metric_logging.py @@ -222,7 +222,7 @@ def log_config(self, config: DictConfig) -> 
None: try: output_config_fname = Path( os.path.join( - config.checkpointer.checkpoint_dir, + config.output_dir, "torchtune_config.yaml", ) )
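
To see what this last fix changes in practice, here is a small illustrative sketch (editor's note, not part of the patch). The config values are placeholders borrowed from the tutorial above; the snippet only shows how the resolved path for the saved config moves from the source checkpoint directory to the run's output directory::

    import os
    from pathlib import Path

    from omegaconf import OmegaConf

    # Hypothetical config mirroring the two fields the logger reads
    config = OmegaConf.create(
        {
            "output_dir": "/tmp/torchtune/llama3_2_3B/lora_single_device",
            "checkpointer": {"checkpoint_dir": "/tmp/Llama-3.2-3B-Instruct"},
        }
    )

    # Before the fix: the config copy landed next to the *source* checkpoints
    old_path = Path(os.path.join(config.checkpointer.checkpoint_dir, "torchtune_config.yaml"))

    # After the fix: it lands in the run's output directory, next to logs/ and epoch_{} folders
    new_path = Path(os.path.join(config.output_dir, "torchtune_config.yaml"))

    print(old_path)  # /tmp/Llama-3.2-3B-Instruct/torchtune_config.yaml
    print(new_path)  # /tmp/torchtune/llama3_2_3B/lora_single_device/torchtune_config.yaml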