From 515228b128e2a80a83cacf19e0bbe65006148d68 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Wed, 10 Apr 2024 11:34:51 +0000 Subject: [PATCH] 2024-04-10 nightly release (d252375ac98d5690fde4cbcef8ffc21e84394031) --- .github/workflows/recipe_test_multi_gpu.yaml | 10 ++++- README.md | 7 ++++ docs/source/examples/finetune_llm.rst | 2 +- docs/source/examples/lora_finetune.rst | 4 +- recipes/configs/gemma/2B_full.yaml | 6 +-- recipes/configs/llama2/13B_full.yaml | 2 +- recipes/configs/llama2/13B_lora.yaml | 2 +- recipes/configs/llama2/7B_full.yaml | 2 +- .../configs/llama2/7B_full_single_device.yaml | 3 +- .../7B_full_single_device_low_memory.yaml | 5 ++- recipes/configs/llama2/7B_lora.yaml | 2 +- .../configs/llama2/7B_lora_single_device.yaml | 3 +- .../llama2/7B_qlora_single_device.yaml | 5 ++- recipes/full_finetune_distributed.py | 4 +- recipes/full_finetune_single_device.py | 18 ++++++-- recipes/gemma_full_finetune_distributed.py | 4 +- recipes/lora_finetune_single_device.py | 19 +++++++-- tests/cache_artifacts.sh | 1 - .../test_full_finetune_single_device.py | 7 +++- .../test_lora_finetune_single_device.py | 21 +++++++++- tests/regression_tests/test_llama2_7b.py | 4 +- tests/test_utils.py | 8 ++++ tests/torchtune/utils/test_wrap_compile.py | 26 ++++++++++++ torchtune/utils/__init__.py | 3 ++ .../utils/_checkpointing/_checkpointer.py | 4 +- torchtune/utils/_compile_utils.py | 42 +++++++++++++++++++ 26 files changed, 179 insertions(+), 35 deletions(-) create mode 100644 tests/torchtune/utils/test_wrap_compile.py create mode 100644 torchtune/utils/_compile_utils.py diff --git a/.github/workflows/recipe_test_multi_gpu.yaml b/.github/workflows/recipe_test_multi_gpu.yaml index 7b5c182c82..2d66feaa64 100644 --- a/.github/workflows/recipe_test_multi_gpu.yaml +++ b/.github/workflows/recipe_test_multi_gpu.yaml @@ -24,6 +24,7 @@ jobs: strategy: matrix: python-version: ['3.8', '3.9', '3.10', '3.11'] + torch-version: ["nightly", "stable"] steps: - name: Check out repo uses: actions/checkout@v3 @@ -45,9 +46,14 @@ jobs: - name: Install S3 CLI run: | python -m pip install awscli==1.32.6 - - name: Install dependencies + - name: Install torch nightly + if: ${{ matrix.torch-version == 'nightly' }} + run: python -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu118 + - name: Install torch stable + if: ${{ matrix.torch-version == 'stable' }} + run: python -m pip install torch + - name: Install remaining dependencies run: | - python -m pip install torch python -m pip install -e ".[dev]" python -m pip install lm-eval==0.4.* - name: Run recipe tests with coverage diff --git a/README.md b/README.md index df1de940f7..bf732beb30 100644 --- a/README.md +++ b/README.md @@ -176,6 +176,13 @@ Check out `tune --help` for all possible CLI commands and options. --- +#### Support for torch.compile + +Note that support for ``torch.compile`` is currently in progress and only enabled for a couple of recipes. In particular, ``torch.compile`` support is +enabled for [full_finetune_single_device](https://github.com/pytorch/torchtune/blob/main/recipes/full_finetune_single_device.py) and [lora_finetune_single_device](https://github.com/pytorch/torchtune/blob/main/recipes/lora_finetune_single_device.py) and validated on Llama2-7b. Other recipes and models are currently not tested, though we welcome contributions from the community. 
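As a rough sketch of what this flag does inside the two supported recipes (illustrative only, not an excerpt from the diff): the recipe reads the new `compile` config entry, threads it into `_setup_model`, and wraps the model with the `utils.wrap_compile` helper added in this patch, so the first forward pass bears the one-time compilation cost.

```python
# Simplified sketch of the wiring this patch adds to
# recipes/full_finetune_single_device.py and recipes/lora_finetune_single_device.py;
# this function is illustrative and not itself part of the diff.
from torch import nn
from torchtune import utils


def setup_model(model: nn.Module, compile_model: bool) -> nn.Module:
    if compile_model:
        # utils.wrap_compile (added in this patch) calls torch.compile and
        # registers a state_dict hook so checkpoints stay loadable by eager models.
        model = utils.wrap_compile(model)
    return model
```

In practice the flag is toggled as a config override, e.g. passing `compile=True` on the `tune run` command line, which is exactly what the updated recipe tests in this patch do.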
+ +--- + ## Design Principles TorchTune embodies PyTorch’s design philosophy [[details](https://pytorch.org/docs/stable/community/design.html)], especially "usability over everything else". diff --git a/docs/source/examples/finetune_llm.rst b/docs/source/examples/finetune_llm.rst index bc454e1415..33223f0341 100644 --- a/docs/source/examples/finetune_llm.rst +++ b/docs/source/examples/finetune_llm.rst @@ -70,7 +70,7 @@ To run the recipe without any changes on 4 GPUs, launch a training run using Tun .. code-block:: bash - tune --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config full_finetune_distributed + tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config full_finetune_distributed Dataset ------- diff --git a/docs/source/examples/lora_finetune.rst b/docs/source/examples/lora_finetune.rst index 5f14b236aa..6a3fe5244b 100644 --- a/docs/source/examples/lora_finetune.rst +++ b/docs/source/examples/lora_finetune.rst @@ -253,7 +253,7 @@ You can then run the following command to perform a LoRA finetune of Llama2-7B u .. code-block:: bash - tune --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config lora_finetune_distributed + tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config lora_finetune_distributed .. note:: Make sure to point to the location of your Llama2 weights and tokenizer. This can be done @@ -288,7 +288,7 @@ Let's run this experiment. We can also increase alpha (in general it is good pra .. code-block:: bash - tune --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config lora_finetune_distributed \ + tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config lora_finetune_distributed \ lora_attn_modules='[q_proj, k_proj, v_proj, output_proj]' \ lora_rank=32 lora_alpha=64 output_dir=./lora_experiment_1 diff --git a/recipes/configs/gemma/2B_full.yaml b/recipes/configs/gemma/2B_full.yaml index ebabb23c24..53dc03f6e7 100644 --- a/recipes/configs/gemma/2B_full.yaml +++ b/recipes/configs/gemma/2B_full.yaml @@ -3,18 +3,18 @@ # # This config assumes that you've run the following command before launching # this run: -# tune download --repo-id google/gemma-2b \ +# tune download google/gemma-2b \ # --hf-token \ # --output-dir /tmp/gemma2 # # To launch on 4 devices, run the following command from root: -# tune --nnodes 1 --nproc_per_node 4 full_finetune_distributed \ +# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed \ # --config gemma/2B_full \ # # You can add specific overrides through the command line. For example # to override the checkpointer directory while launching training # you can run: -# tune --nnodes 1 --nproc_per_node 4 full_finetune_distributed \ +# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed \ # --config gemma/2B_full \ # checkpointer.checkpoint_dir= # diff --git a/recipes/configs/llama2/13B_full.yaml b/recipes/configs/llama2/13B_full.yaml index abbd9c45c5..73c9c0c922 100644 --- a/recipes/configs/llama2/13B_full.yaml +++ b/recipes/configs/llama2/13B_full.yaml @@ -14,7 +14,7 @@ # You can add specific overrides through the command line. 
For example # to override the checkpointer directory while launching training # you can run: -# tune --nnodes 1 --nproc_per_node 4 full_finetune_distributed \ +# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed \ # --config llama2/13B_full \ # checkpointer.checkpoint_dir= # diff --git a/recipes/configs/llama2/13B_lora.yaml b/recipes/configs/llama2/13B_lora.yaml index 947faf7c6a..a82d2dde6b 100644 --- a/recipes/configs/llama2/13B_lora.yaml +++ b/recipes/configs/llama2/13B_lora.yaml @@ -14,7 +14,7 @@ # You can add specific overrides through the command line. For example # to override the checkpointer directory while launching training # you can run: -# tune --nnodes 1 --nproc_per_node 4 lora_finetune_distributed \ +# tune run --nnodes 1 --nproc_per_node 4 lora_finetune_distributed \ # --config llama2/13B_lora \ # checkpointer.checkpoint_dir= # diff --git a/recipes/configs/llama2/7B_full.yaml b/recipes/configs/llama2/7B_full.yaml index 16f3dcb3ec..c510efa8b4 100644 --- a/recipes/configs/llama2/7B_full.yaml +++ b/recipes/configs/llama2/7B_full.yaml @@ -14,7 +14,7 @@ # You can add specific overrides through the command line. For example # to override the checkpointer directory while launching training # you can run: -# tune --nnodes 1 --nproc_per_node 4 full_finetune_distributed \ +# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed \ # --config llama2/7B_full \ # checkpointer.checkpoint_dir= # diff --git a/recipes/configs/llama2/7B_full_single_device.yaml b/recipes/configs/llama2/7B_full_single_device.yaml index 1d297a28ec..b1d4e59a90 100644 --- a/recipes/configs/llama2/7B_full_single_device.yaml +++ b/recipes/configs/llama2/7B_full_single_device.yaml @@ -14,7 +14,7 @@ # You can add specific overrides through the command line. For example # to override the checkpointer directory while launching training # you can run: -# tune --nnodes 1 --nproc_per_node 1 full_finetune_single_device \ +# tune run --nnodes 1 --nproc_per_node 1 full_finetune_single_device \ # --config llama2/7B_full_single_device \ # checkpointer.checkpoint_dir= # @@ -56,6 +56,7 @@ loss: _component_: torch.nn.CrossEntropyLoss max_steps_per_epoch: null gradient_accumulation_steps: 1 +compile: False optimizer_in_bwd: False diff --git a/recipes/configs/llama2/7B_full_single_device_low_memory.yaml b/recipes/configs/llama2/7B_full_single_device_low_memory.yaml index c1bfd5cb6f..4c36bf774d 100644 --- a/recipes/configs/llama2/7B_full_single_device_low_memory.yaml +++ b/recipes/configs/llama2/7B_full_single_device_low_memory.yaml @@ -14,7 +14,7 @@ # You can add specific overrides through the command line. For example # to override the checkpointer directory while launching training # you can run: -# tune --nnodes 1 --nproc_per_node 1 full_finetune_single_device \ +# tune run --nnodes 1 --nproc_per_node 1 full_finetune_single_device \ # --config llama2/7B_full_single_device_low_memory \ # checkpointer.checkpoint_dir= # @@ -68,6 +68,9 @@ enable_activation_checkpointing: True # Reduced precision dtype: bf16 +# Model compilation +compile: False + # Logging metric_logger: _component_: torchtune.utils.metric_logging.DiskLogger diff --git a/recipes/configs/llama2/7B_lora.yaml b/recipes/configs/llama2/7B_lora.yaml index 16053b7168..ddfe03f9cb 100644 --- a/recipes/configs/llama2/7B_lora.yaml +++ b/recipes/configs/llama2/7B_lora.yaml @@ -14,7 +14,7 @@ # You can add specific overrides through the command line. 
For example # to override the checkpointer directory while launching training # you can run: -# tune --nnodes 1 --nproc_per_node 4 lora_finetune_distributed \ +# tune run --nnodes 1 --nproc_per_node 4 lora_finetune_distributed \ # --config llama2/7B_lora \ # checkpointer.checkpoint_dir= # diff --git a/recipes/configs/llama2/7B_lora_single_device.yaml b/recipes/configs/llama2/7B_lora_single_device.yaml index 249baf7187..eb5d0448e2 100644 --- a/recipes/configs/llama2/7B_lora_single_device.yaml +++ b/recipes/configs/llama2/7B_lora_single_device.yaml @@ -14,7 +14,7 @@ # You can add specific overrides through the command line. For example # to override the checkpointer directory while launching training # you can run: -# tune --nnodes 1 --nproc_per_node 1 lora_finetune_single_device \ +# tune run --nnodes 1 --nproc_per_node 1 lora_finetune_single_device \ # --config 7B_lora_single_device \ # checkpointer.checkpoint_dir= # @@ -69,6 +69,7 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 1 +compile: False # Logging output_dir: /tmp/lora_finetune_output diff --git a/recipes/configs/llama2/7B_qlora_single_device.yaml b/recipes/configs/llama2/7B_qlora_single_device.yaml index 644541b1bf..7f916b2b55 100644 --- a/recipes/configs/llama2/7B_qlora_single_device.yaml +++ b/recipes/configs/llama2/7B_qlora_single_device.yaml @@ -14,7 +14,7 @@ # You can add specific overrides through the command line. For example # to override the checkpointer directory while launching training # you can run: -# tune --nnodes 1 --nproc_per_node 1 lora_finetune_single_device \ +# tune run --nnodes 1 --nproc_per_node 1 lora_finetune_single_device \ # --config 7B_qlora_single_device \ # checkpointer.checkpoint_dir= # @@ -69,6 +69,9 @@ loss: epochs: 1 max_steps_per_epoch: null gradient_accumulation_steps: 1 +# Note: compile for QLoRA is only supported on nightly +# PyTorch (>= 2.4.0.dev20240408) +compile: False # Logging output_dir: /tmp/qlora_finetune_output/ diff --git a/recipes/full_finetune_distributed.py b/recipes/full_finetune_distributed.py index db1e3ce6f3..75e513defd 100644 --- a/recipes/full_finetune_distributed.py +++ b/recipes/full_finetune_distributed.py @@ -101,13 +101,13 @@ def __init__(self, cfg: DictConfig) -> None: self.max_steps_per_epoch = cfg.max_steps_per_epoch self.total_training_steps = 0 - def load_checkpoint(self, cfg: DictConfig) -> Dict[str, Any]: + def load_checkpoint(self, cfg_checkpointer: DictConfig) -> Dict[str, Any]: """ Extract the checkpoint state from file and validate. If resume_from_checkpoint is True, this also includes the recipe state. """ self._checkpointer = config.instantiate( - cfg, + cfg_checkpointer, resume_from_checkpoint=self._resume_from_checkpoint, ) checkpoint_dict = self._checkpointer.load_checkpoint() diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py index 96167713ba..3658fa8205 100644 --- a/recipes/full_finetune_single_device.py +++ b/recipes/full_finetune_single_device.py @@ -5,7 +5,6 @@ # LICENSE file in the root directory of this source tree. import sys - from functools import partial from typing import Any, Dict, Optional, Tuple from warnings import warn @@ -103,13 +102,13 @@ def __init__(self, cfg: DictConfig) -> None: self.max_steps_per_epoch = cfg.max_steps_per_epoch self.total_training_steps = 0 - def load_checkpoint(self, cfg: DictConfig) -> Dict[str, Any]: + def load_checkpoint(self, cfg_checkpointer: DictConfig) -> Dict[str, Any]: """ Extract the checkpoint state from file and validate. 
If resume_from_checkpoint is True, this also includes the recipe state. """ self._checkpointer = config.instantiate( - cfg, + cfg_checkpointer, resume_from_checkpoint=self._resume_from_checkpoint, ) checkpoint_dict = self._checkpointer.load_checkpoint() @@ -156,9 +155,11 @@ def setup(self, cfg: DictConfig) -> None: # ``_setup_model`` handles initialization and loading the state dict. This method # should be called before ``_setup_optimizer`` since transforming the optimizer # state dict requires the model + self._model_compile = cfg.compile self._model = self._setup_model( cfg_model=cfg.model, enable_activation_checkpointing=cfg.enable_activation_checkpointing, + compile_model=self._model_compile, model_state_dict=ckpt_dict[utils.MODEL_KEY], ) self._tokenizer = config.instantiate(cfg.tokenizer) @@ -206,6 +207,7 @@ def _setup_model( self, cfg_model: DictConfig, enable_activation_checkpointing: bool, + compile_model: bool, model_state_dict: Dict[str, Any], ) -> nn.Module: """ @@ -224,6 +226,11 @@ def _setup_model( # Validate model was loaded in with the expected dtype. utils.validate_expected_param_dtype(model.named_parameters(), dtype=self._dtype) log.info(f"Model is initialized with precision {self._dtype}.") + + # Compile model, if enabled. + if compile_model: + log.info("Compiling model with torch.compile...") + model = utils.wrap_compile(model) log.info( utils.memory_stats_log( "Memory Stats after model init:", device=self._device @@ -341,6 +348,11 @@ def train(self) -> None: The core training loop. Supports training on subsets of the dataset using the ``max_steps_per_epoch``. """ + + if self._model_compile: + log.info( + "NOTE: torch.compile is enabled and model is compiled in first forward. Expect a relatively slow first iteration." + ) # zero out the gradients before starting training if not self._optimizer_in_bwd: self._optimizer.zero_grad() diff --git a/recipes/gemma_full_finetune_distributed.py b/recipes/gemma_full_finetune_distributed.py index dad0ded7bb..cd918a9ebc 100644 --- a/recipes/gemma_full_finetune_distributed.py +++ b/recipes/gemma_full_finetune_distributed.py @@ -100,13 +100,13 @@ def __init__(self, cfg: DictConfig) -> None: self.max_steps_per_epoch = cfg.max_steps_per_epoch self.total_training_steps = 0 - def load_checkpoint(self, cfg: DictConfig) -> Dict[str, Any]: + def load_checkpoint(self, cfg_checkpointer: DictConfig) -> Dict[str, Any]: """ Extract the checkpoint state from file and validate. If resume_from_checkpoint is True, this also includes the recipe state. """ self._checkpointer = config.instantiate( - cfg, + cfg_checkpointer, resume_from_checkpoint=self._resume_from_checkpoint, ) checkpoint_dict = self._checkpointer.load_checkpoint() diff --git a/recipes/lora_finetune_single_device.py b/recipes/lora_finetune_single_device.py index 3f4711eb72..335fd11169 100644 --- a/recipes/lora_finetune_single_device.py +++ b/recipes/lora_finetune_single_device.py @@ -98,14 +98,14 @@ def __init__(self, cfg: DictConfig) -> None: self._resume_from_checkpoint = cfg.resume_from_checkpoint self._gradient_accumulation_steps = cfg.gradient_accumulation_steps - def load_checkpoint(self, cfg: DictConfig) -> Dict[str, Any]: + def load_checkpoint(self, cfg_checkpointer: DictConfig) -> Dict[str, Any]: """ Extract the checkpoint state from file and validate. This includes the base model weights. 
If resume_from_checkpoint is True, this also includes the adapter weights and recipe state """ self._checkpointer = config.instantiate( - cfg, + cfg_checkpointer, resume_from_checkpoint=self._resume_from_checkpoint, ) checkpoint_dict = self._checkpointer.load_checkpoint() @@ -146,12 +146,13 @@ def setup(self, cfg: DictConfig) -> None: model, tokenizer, loss, optimizer, learning rate scheduler, sampler, and dataloader. """ self._metric_logger = config.instantiate(cfg.metric_logger) - - checkpoint_dict = self.load_checkpoint(cfg=cfg.checkpointer) + self._model_compile = cfg.compile + checkpoint_dict = self.load_checkpoint(cfg_checkpointer=cfg.checkpointer) self._model = self._setup_model( cfg_model=cfg.model, enable_activation_checkpointing=cfg.enable_activation_checkpointing, + compile_model=cfg.compile, base_model_state_dict=checkpoint_dict[utils.MODEL_KEY], lora_weights_state_dict=( checkpoint_dict[utils.ADAPTER_KEY] @@ -214,6 +215,7 @@ def _setup_model( self, cfg_model: DictConfig, enable_activation_checkpointing: bool, + compile_model: bool, base_model_state_dict: Dict[str, Any], lora_weights_state_dict: Optional[Dict[str, Any]] = None, ) -> nn.Module: @@ -257,6 +259,10 @@ def _setup_model( ) log.info(f"Model is initialized with precision {self._dtype}.") + # Compile model, if enabled. + if compile_model: + log.info("Compiling model with torch.compile...") + model = utils.wrap_compile(model) log.info( utils.memory_stats_log( "Memory Stats after model init:", device=self._device @@ -380,6 +386,11 @@ def train(self) -> None: The core training loop. """ + if self._model_compile: + log.info( + "NOTE: torch.compile is enabled and model is compiled in first forward. Expect a relatively slow first iteration." + ) + # self.epochs_run should be non-zero when we're resuming from a checkpoint for curr_epoch in range(self.epochs_run, self.total_epochs): # Update the sampler to ensure data is correctly shuffled across epochs diff --git a/tests/cache_artifacts.sh b/tests/cache_artifacts.sh index 0a54c593c9..df4079670d 100755 --- a/tests/cache_artifacts.sh +++ b/tests/cache_artifacts.sh @@ -13,7 +13,6 @@ # In all cases, if the files already exist locally they will not be downloaded from S3. 
SMALL_MODEL_URLS=( - "s3://pytorch-multimodal/small-ckpt-01242024" "s3://pytorch-multimodal/small-ckpt-tune-03082024.pt" "s3://pytorch-multimodal/small-ckpt-meta-03082024.pt" "s3://pytorch-multimodal/small-ckpt-hf-03082024.pt" diff --git a/tests/recipes/test_full_finetune_single_device.py b/tests/recipes/test_full_finetune_single_device.py index 55a95d2b55..b992df6bc9 100644 --- a/tests/recipes/test_full_finetune_single_device.py +++ b/tests/recipes/test_full_finetune_single_device.py @@ -50,12 +50,16 @@ def _fetch_expected_loss_values(self): @pytest.mark.parametrize( "config", ["full_single_device_low_memory", "full_single_device"] ) - def test_loss(self, config, tmpdir, monkeypatch): + @pytest.mark.parametrize("compile", [True, False]) + def test_loss(self, compile, config, tmpdir, monkeypatch): ckpt = "small_test_ckpt_meta" ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) ckpt_dir = ckpt_path.parent log_file = gen_log_file_name(tmpdir) + # To workaround https://github.com/pytorch/torchtune/issues/676 + if compile: + os.environ["TORCH_COMPILE_BACKEND"] = "aot_eager" cmd = f""" tune run full_finetune_single_device \ --config llama2/7B_{config} \ @@ -66,6 +70,7 @@ def test_loss(self, config, tmpdir, monkeypatch): checkpointer.output_dir={tmpdir} \ checkpointer.model_type=LLAMA2 \ metric_logger.filename={log_file} \ + compile={compile} \ """.split() model_config = llama2_test_config() diff --git a/tests/recipes/test_lora_finetune_single_device.py b/tests/recipes/test_lora_finetune_single_device.py index b74d7d8faf..3eece0ef24 100644 --- a/tests/recipes/test_lora_finetune_single_device.py +++ b/tests/recipes/test_lora_finetune_single_device.py @@ -22,6 +22,7 @@ CKPT_MODEL_PATHS, gen_log_file_name, get_loss_values_from_metric_logger, + torch_version_ge, ) from torchtune import config @@ -52,12 +53,17 @@ def _fetch_qlora_expected_loss_values(self, dtype): return [10.5059, 10.5571, 10.5181, 10.4897] @pytest.mark.integration_test - def test_loss(self, tmpdir, monkeypatch): + @pytest.mark.parametrize("compile", [True, False]) + def test_loss(self, compile, tmpdir, monkeypatch): ckpt = "small_test_ckpt_meta" ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) ckpt_dir = ckpt_path.parent log_file = gen_log_file_name(tmpdir) + # To workaround https://github.com/pytorch/torchtune/issues/676 + if compile: + os.environ["TORCH_COMPILE_BACKEND"] = "aot_eager" + cmd = f""" tune run lora_finetune_single_device \ --config llama2/7B_lora_single_device \ @@ -68,6 +74,7 @@ def test_loss(self, tmpdir, monkeypatch): checkpointer.output_dir={tmpdir} \ checkpointer.model_type=LLAMA2 \ metric_logger.filename={log_file} \ + compile={compile} \ """.split() model_config = lora_llama2_test_config( @@ -91,12 +98,21 @@ def test_loss(self, tmpdir, monkeypatch): @pytest.mark.integration_test @pytest.mark.parametrize("dtype", ["fp32", "bf16"]) - def test_loss_qlora(self, dtype, tmpdir, monkeypatch): + @pytest.mark.parametrize("compile", [True, False]) + @pytest.mark.skipif( + not torch_version_ge("2.4.0"), + reason="Please install a nightly build of torch to run this test.", + ) + def test_loss_qlora(self, compile, dtype, tmpdir, monkeypatch): ckpt = "small_test_ckpt_meta" ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) ckpt_dir = ckpt_path.parent log_file = gen_log_file_name(tmpdir) + # To workaround https://github.com/pytorch/torchtune/issues/676 + if compile: + os.environ["TORCH_COMPILE_BACKEND"] = "aot_eager" + cmd = f""" tune run lora_finetune_single_device --config llama2/7B_qlora_single_device \ @@ -107,6 +123,7 @@ def test_loss_qlora(self, 
dtype, tmpdir, monkeypatch): checkpointer.output_dir={tmpdir} \ checkpointer.model_type=LLAMA2 \ metric_logger.filename={log_file} \ + compile={compile} \ """.split() model_config = lora_llama2_test_config( diff --git a/tests/regression_tests/test_llama2_7b.py b/tests/regression_tests/test_llama2_7b.py index a4743711ce..32de865b02 100644 --- a/tests/regression_tests/test_llama2_7b.py +++ b/tests/regression_tests/test_llama2_7b.py @@ -55,7 +55,7 @@ def test_loss(self, tmpdir, monkeypatch): log_file = gen_log_file_name(tmpdir) cmd = f""" - tune --nnodes 1 --nproc_per_node 2 full_finetune_distributed + tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config llama2/7B_full \ output_dir={tmpdir} \ checkpointer=torchtune.utils.FullModelTorchTuneCheckpointer @@ -99,7 +99,7 @@ def test_finetune_and_eval(self, tmpdir, capsys, monkeypatch): # Run on prod LoRA FT config but with only 10 steps for now ft_cmd = f""" - tune --nnodes 1 --nproc_per_node 2 lora_finetune_distributed + tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config llama2/7B_lora \ output_dir={tmpdir} \ checkpointer=torchtune.utils.FullModelTorchTuneCheckpointer diff --git a/tests/test_utils.py b/tests/test_utils.py index 2ca3fcfee6..c575134e8f 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -31,6 +31,14 @@ "llama2_7b": "/tmp/test-artifacts/llama2-7b-torchtune.pt", } + +def torch_version_ge(version: str) -> bool: + """ + Check if torch version is greater than or equal to the given version + """ + return version in torch.__version__ or torch.__version__ >= version + + # Inherit from tokenizer class to reuse its tokenize_messages method class DummyTokenizer(Tokenizer): def __init__(self): diff --git a/tests/torchtune/utils/test_wrap_compile.py b/tests/torchtune/utils/test_wrap_compile.py new file mode 100644 index 0000000000..83e4707eb6 --- /dev/null +++ b/tests/torchtune/utils/test_wrap_compile.py @@ -0,0 +1,26 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + + +import torch +from torchtune import utils + + +class TestWrapCompile: + def test_wrap_compile(self) -> None: + """ + Ensures that compile prefix is removed in compiled model + state_dict and can be loaded into non-compiled model. 
+ """ + m = torch.nn.Linear(5, 5) + m = utils.wrap_compile(m) + assert isinstance(m, torch._dynamo.eval_frame.OptimizedModule) + load_m = torch.nn.Linear(5, 5) + missing, unexpected = load_m.load_state_dict(m.state_dict(), strict=False) + assert not missing + assert not unexpected diff --git a/torchtune/utils/__init__.py b/torchtune/utils/__init__.py index 27365deb56..7d23fc723a 100644 --- a/torchtune/utils/__init__.py +++ b/torchtune/utils/__init__.py @@ -11,6 +11,8 @@ ModelType, transform_opt_state_dict, ) + +from ._compile_utils import wrap_compile from ._device import get_device from ._distributed import ( # noqa contains_fsdp, @@ -77,6 +79,7 @@ "set_default_dtype", "set_seed", "validate_expected_param_dtype", + "wrap_compile", "TuneRecipeArgumentParser", "CheckpointableDataLoader", "OptimizerInBackwardWrapper", diff --git a/torchtune/utils/_checkpointing/_checkpointer.py b/torchtune/utils/_checkpointing/_checkpointer.py index 7c4e589a64..35352d4430 100644 --- a/torchtune/utils/_checkpointing/_checkpointer.py +++ b/torchtune/utils/_checkpointing/_checkpointer.py @@ -265,7 +265,7 @@ class FullModelHFCheckpointer(_CheckpointerInterface): the following flow: 1. Download the model from the HF repo using tune download - tune download --repo-id meta-llama/Llama-2-7b-hf \ + tune download meta-llama/Llama-2-7b-hf \ --output-dir \ --hf-token @@ -511,7 +511,7 @@ class FullModelMetaCheckpointer(_CheckpointerInterface): the following flow: 1. Download the model from the HF repo using tune download - tune download --repo-id meta-llama/Llama-2-7b \ + tune download meta-llama/Llama-2-7b \ --output-dir \ --hf-token diff --git a/torchtune/utils/_compile_utils.py b/torchtune/utils/_compile_utils.py new file mode 100644 index 0000000000..273292c026 --- /dev/null +++ b/torchtune/utils/_compile_utils.py @@ -0,0 +1,42 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import os + +import torch +from torch import nn + +_TORCH_COMPILE_WRAPPER_PREFIX = "_orig_mod." + + +def wrap_compile(model: nn.Module) -> None: + """ + Wraps the model with torch.compile. This function will also + register a state_dict post hook that allows state_dicts produced + with torch.compile training to behave as regular eager mode models. + In particular, it strips away a torch.compile specific prefix + added to the state_dict by torch.compile. + + Args: + model (nn.Module): model to wrap with compile. + + Returns: + None + """ + # TORCH_COMPILE_BACKEND can be set as an env var to override default torch.compile backend. + # Currently only used in unittesting to work around https://github.com/pytorch/torchtune/issues/676 + backend = os.environ.get("TORCH_COMPILE_BACKEND", "inductor") + model = torch.compile(model, backend=backend) + model._register_state_dict_hook(_remove_torch_compile_prefix) + return model + + +def _remove_torch_compile_prefix(model, state_dict, *args, **kwargs): + keys = list(state_dict.keys()) + for key in keys: + if key.startswith(_TORCH_COMPILE_WRAPPER_PREFIX): + newkey = key[len(_TORCH_COMPILE_WRAPPER_PREFIX) :] + state_dict[newkey] = state_dict.pop(key)
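
For readers skimming the patch, the behaviour exercised by `TestWrapCompile` above can be reproduced in a few lines: `torch.compile` returns an `OptimizedModule` whose state_dict keys carry an `_orig_mod.` prefix, and the hook registered by `wrap_compile` strips that prefix so the checkpoint loads into a plain eager module (note that `wrap_compile` returns the wrapped model even though its signature is annotated `-> None`). The snippet below is a standalone sketch of that mechanism using only stock PyTorch; it does not depend on torchtune.

```python
# Standalone sketch of the prefix-stripping behaviour that wrap_compile's
# state_dict hook implements; uses plain PyTorch only.
import os

import torch
from torch import nn

_PREFIX = "_orig_mod."  # same prefix as _TORCH_COMPILE_WRAPPER_PREFIX above

# TORCH_COMPILE_BACKEND mirrors the env var honored by wrap_compile in this patch.
backend = os.environ.get("TORCH_COMPILE_BACKEND", "inductor")
compiled = torch.compile(nn.Linear(5, 5), backend=backend)

# Compiled modules prefix their parameters, e.g. "_orig_mod.weight".
assert all(k.startswith(_PREFIX) for k in compiled.state_dict())

# Stripping the prefix (what the registered hook does) makes the state_dict
# loadable by an ordinary eager module again.
cleaned = {k[len(_PREFIX):]: v for k, v in compiled.state_dict().items()}
eager = nn.Linear(5, 5)
missing, unexpected = eager.load_state_dict(cleaned, strict=False)
assert not missing and not unexpected
```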