From 985727467ca6b34c7925babc035a6b2d170714a9 Mon Sep 17 00:00:00 2001
From: joecummings <jrcummings27@gmail.com>
Date: Tue, 2 Apr 2024 11:59:52 -0700
Subject: [PATCH 01/12] Rename files

---
 recipes/configs/llama2/13B_full.yaml          | 30 +++----
 recipes/configs/llama2/13B_lora.yaml          | 35 ++++----
 recipes/configs/llama2/7B_full.yaml           | 35 ++++----
 .../configs/llama2/7B_full_single_device.yaml | 76 -----------------
 .../7B_full_single_device_low_memory.yaml     | 76 -----------------
 recipes/configs/llama2/7B_lora.yaml           | 35 ++++----
 .../configs/llama2/7B_lora_single_device.yaml | 83 -------------------
 .../llama2/7B_qlora_single_device.yaml        | 83 -------------------
 recipes/configs/mistral/7B_full.yaml          |  3 +-
 recipes/configs/mistral/7B_lora.yaml          |  5 +-
 torchtune/_recipe_registry.py                 | 24 ++----
 11 files changed, 77 insertions(+), 408 deletions(-)
 delete mode 100644 recipes/configs/llama2/7B_full_single_device.yaml
 delete mode 100644 recipes/configs/llama2/7B_full_single_device_low_memory.yaml
 delete mode 100644 recipes/configs/llama2/7B_lora_single_device.yaml
 delete mode 100644 recipes/configs/llama2/7B_qlora_single_device.yaml

diff --git a/recipes/configs/llama2/13B_full.yaml b/recipes/configs/llama2/13B_full.yaml
index abbd9c45c5..b54821c93a 100644
--- a/recipes/configs/llama2/13B_full.yaml
+++ b/recipes/configs/llama2/13B_full.yaml
@@ -1,27 +1,19 @@
-# Config for multi-device full finetuning in full_finetune_distributed.py
-# using a Llama2 13B model
+# Config for multi-device full finetuning with full_finetune_distributed.py using a Llama2 13B model
 #
-# This config assumes that you've run the following command before launching
-# this run:
-#   tune download --repo-id meta-llama/Llama-2-13b-hf \
-#   --hf-token <HF_TOKEN> \
-#   --output-dir /tmp/llama2-13b-hf
+# This config assumes that you've run the following command before launching:
+# $ tune download meta-llama/Llama-2-13b \
+#       --hf-token <HF_TOKEN> \
+#       --output-dir /tmp/llama2
 #
 # To launch on 4 devices, run the following command from root:
-#   tune run --nproc_per_node 4 full_finetune_distributed \
-#   --config llama2/13B_full \
+# $ tune run --nproc_per_node 4 full_finetune_distributed \
+#       --config llama2/13B_full
 #
 # You can add specific overrides through the command line. For example
-# to override the checkpointer directory while launching training
-# you can run:
-#   tune --nnodes 1 --nproc_per_node 4 full_finetune_distributed \
-#   --config llama2/13B_full \
-#   checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
-#
-# This config should be used with 2+ GPUs. Single device full fine-tuning
-# requires several memory optimizations which are exposed through
-# 7B_full_single_device.yaml. Please update the model and checkpoints to 13B
-# in that config.
+# to override the checkpointer directory while launching training:
+# $ tune run --nproc_per_node 4 full_finetune_distributed \
+#       --config llama2/13B_full \
+#       checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
 
 
 # Tokenizer
diff --git a/recipes/configs/llama2/13B_lora.yaml b/recipes/configs/llama2/13B_lora.yaml
index 947faf7c6a..042a986277 100644
--- a/recipes/configs/llama2/13B_lora.yaml
+++ b/recipes/configs/llama2/13B_lora.yaml
@@ -1,27 +1,24 @@
-# Config for multi-device LoRA in lora_finetune_distributed.py
-# using a Llama2 13B model
+# Config for multi-device with lora_finetune_distributed.py or single-device LoRA
+# finetuning with lora_finetune_single_device.py using a Llama2 13B model
 #
-# This config assumes that you've run the following command before launching
-# this run:
-#   tune download --repo-id meta-llama/Llama-2-13b-hf \
-#   --hf-token <HF_TOKEN> \
-#   --output-dir /tmp/llama2-13b-hf
+# This config assumes that you've run the following command before launching:
+# $ tune download meta-llama/Llama-2-13b \
+#       --hf-token <HF_TOKEN> \
+#       --output-dir /tmp/llama2
 #
 # To launch on 4 devices, run the following command from root:
-#   tune run --nproc_per_node 4 lora_finetune_distributed \
-#   --config llama2/13B_lora \
+# $ tune run --nproc_per_node 4 lora_finetune_distributed \
+#       --config llama2/13B_lora
 #
-# You can add specific overrides through the command line. For example
-# to override the checkpointer directory while launching training
-# you can run:
-#   tune --nnodes 1 --nproc_per_node 4 lora_finetune_distributed \
-#   --config llama2/13B_lora \
-#   checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
+# To launch on a single device, run the following command:
+# $ tune run lora_finetune_single_device \
+#       --config llama2/13B_lora
 #
-# This config works best when the model is being fine-tuned on 2+ GPUs.
-# For single device lora finetuning please use 7B_lora_single_device.yaml
-# or 7B_qlora_single_device.yaml and update the model and checkpoints to
-# the 13B model.
+# You can add specific overrides through the command line. For example
+# to override the checkpointer directory while launching training:
+# $ tune run --nproc_per_node 4 lora_finetune_distributed \
+#       --config llama2/13B_lora \
+#       checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
 
 
 # Model Arguments
diff --git a/recipes/configs/llama2/7B_full.yaml b/recipes/configs/llama2/7B_full.yaml
index 16f3dcb3ec..7a94ebd221 100644
--- a/recipes/configs/llama2/7B_full.yaml
+++ b/recipes/configs/llama2/7B_full.yaml
@@ -1,26 +1,27 @@
-# Config for multi-device full finetuning in full_finetune_distributed.py
-# using a Llama2 7B model
+# Config for multi-device with full_finetune_distributed.py or single-device full finetuning
+# with full_finetune_single_device.py using a Llama2 7B model
 #
-# This config assumes that you've run the following command before launching
-# this run:
-#   tune download --repo-id meta-llama/Llama-2-7b \
-#   --hf-token <HF_TOKEN> \
-#   --output-dir /tmp/llama2
+# This config assumes that you've run the following command before launching:
+# $ tune download meta-llama/Llama-2-7b \
+#       --hf-token <HF_TOKEN> \
+#       --output-dir /tmp/llama2
 #
 # To launch on 4 devices, run the following command from root:
-#   tune run --nproc_per_node 4 full_finetune_distributed \
-#   --config llama2/7B_full \
+# $ tune run --nproc_per_node 4 full_finetune_distributed \
+#       --config llama2/7B_full
+#
+# To launch on a single device, run the following command:
+# $ tune run full_finetune_single_device \
+#       --config llama2/7B_full
 #
 # You can add specific overrides through the command line. For example
-# to override the checkpointer directory while launching training
-# you can run:
-#   tune --nnodes 1 --nproc_per_node 4 full_finetune_distributed \
-#   --config llama2/7B_full \
-#   checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
+# to override the checkpointer directory while launching training:
+# $ tune run --nproc_per_node 4 full_finetune_distributed \
+#       --config llama2/7B_full \
+#       checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
 #
-# This config works best when the model is being fine-tuned on 2+ GPUs.
-# Single device full finetuning requires more memory optimizations. It's
-# best to use 7B_full_single_device.yaml for those cases
+# For more memory optimizations, such as those needed when running on a single GPU,
+# use llama2/7B_full_low_memory
 
 
 # Tokenizer
diff --git a/recipes/configs/llama2/7B_full_single_device.yaml b/recipes/configs/llama2/7B_full_single_device.yaml
deleted file mode 100644
index 1d297a28ec..0000000000
--- a/recipes/configs/llama2/7B_full_single_device.yaml
+++ /dev/null
@@ -1,76 +0,0 @@
-# Config for single device full finetuning in full_finetune_single_device.py
-# using a Llama2 7B model
-#
-# This config assumes that you've run the following command before launching
-# this run:
-#   tune download --repo-id meta-llama/Llama-2-7b \
-#   --hf-token <HF_TOKEN> \
-#   --output-dir /tmp/llama2
-#
-# To launch on a single device, run the following command from root:
-#   tune run full_finetune_single_device \
-#   --config llama2/7B_full_single_device \
-#
-# You can add specific overrides through the command line. For example
-# to override the checkpointer directory while launching training
-# you can run:
-#   tune --nnodes 1 --nproc_per_node 1 full_finetune_single_device \
-#   --config llama2/7B_full_single_device \
-#   checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
-#
-# This config works only for training on single device.
-
-
-# Tokenizer
-tokenizer:
-  _component_: torchtune.models.llama2.llama2_tokenizer
-  path: /tmp/llama2/tokenizer.model
-
-# Dataset
-dataset:
-  _component_: torchtune.datasets.alpaca_dataset
-  train_on_input: True
-seed: null
-shuffle: True
-
-# Model Arguments
-model:
-  _component_: torchtune.models.llama2.llama2_7b
-
-checkpointer:
-  _component_: torchtune.utils.FullModelMetaCheckpointer
-  checkpoint_dir: /tmp/llama2
-  checkpoint_files: [consolidated.00.pth]
-  recipe_checkpoint: null
-  output_dir: /tmp/llama2
-  model_type: LLAMA2
-resume_from_checkpoint: False
-
-# Fine-tuning arguments
-batch_size: 2
-epochs: 3
-optimizer:
-  _component_: torch.optim.SGD
-  lr: 2e-5
-loss:
-  _component_: torch.nn.CrossEntropyLoss
-max_steps_per_epoch: null
-gradient_accumulation_steps: 1
-optimizer_in_bwd: False
-
-
-# Training environment
-device: cuda
-
-# Memory management
-enable_activation_checkpointing: True
-
-# Reduced precision
-dtype: bf16
-
-# Logging
-metric_logger:
-  _component_: torchtune.utils.metric_logging.DiskLogger
-  log_dir: ${output_dir}
-output_dir: /tmp/alpaca-llama2-finetune
-log_every_n_steps: null
diff --git a/recipes/configs/llama2/7B_full_single_device_low_memory.yaml b/recipes/configs/llama2/7B_full_single_device_low_memory.yaml
deleted file mode 100644
index c1bfd5cb6f..0000000000
--- a/recipes/configs/llama2/7B_full_single_device_low_memory.yaml
+++ /dev/null
@@ -1,76 +0,0 @@
-# Config for single device full finetuning in full_finetune_single_device.py
-# using a Llama2 7B model
-#
-# This config assumes that you've run the following command before launching
-# this run:
-#   tune download --repo-id meta-llama/Llama-2-7b \
-#   --hf-token <HF_TOKEN> \
-#   --output-dir /tmp/llama2
-#
-# To launch on a single device, run the following command from root:
-#   tune run full_finetune_single_device \
-#   --config llama2/7B_full_single_device_low_memory \
-#
-# You can add specific overrides through the command line. For example
-# to override the checkpointer directory while launching training
-# you can run:
-#   tune --nnodes 1 --nproc_per_node 1 full_finetune_single_device \
-#   --config llama2/7B_full_single_device_low_memory \
-#   checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
-#
-# This config works only for training on single device.
-
-
-# Tokenizer
-tokenizer:
-  _component_: torchtune.models.llama2.llama2_tokenizer
-  path: /tmp/llama2/tokenizer.model
-
-# Dataset
-dataset:
-  _component_: torchtune.datasets.alpaca_dataset
-  train_on_input: True
-seed: null
-shuffle: True
-
-# Model Arguments
-model:
-  _component_: torchtune.models.llama2.llama2_7b
-
-checkpointer:
-  _component_: torchtune.utils.FullModelMetaCheckpointer
-  checkpoint_dir: /tmp/llama2
-  checkpoint_files: [consolidated.00.pth]
-  recipe_checkpoint: null
-  output_dir: /tmp/llama2
-  model_type: LLAMA2
-resume_from_checkpoint: False
-
-# Fine-tuning arguments
-batch_size: 2
-epochs: 1
-optimizer:
-  _component_: bitsandbytes.optim.PagedAdamW
-  lr: 2e-5
-optimizer_in_bwd: True
-loss:
-  _component_: torch.nn.CrossEntropyLoss
-max_steps_per_epoch: null
-gradient_accumulation_steps: 1
-
-
-# Training environment
-device: cuda
-
-# Memory management
-enable_activation_checkpointing: True
-
-# Reduced precision
-dtype: bf16
-
-# Logging
-metric_logger:
-  _component_: torchtune.utils.metric_logging.DiskLogger
-  log_dir: ${output_dir}
-output_dir: /tmp/alpaca-llama2-finetune
-log_every_n_steps: null
diff --git a/recipes/configs/llama2/7B_lora.yaml b/recipes/configs/llama2/7B_lora.yaml
index d9035e64c9..fdf5971303 100644
--- a/recipes/configs/llama2/7B_lora.yaml
+++ b/recipes/configs/llama2/7B_lora.yaml
@@ -1,26 +1,27 @@
-# Config for multi-device LoRA finetuning in lora_finetune_distributed.py
-# using a Llama2 7B model
+# Config for multi-device with lora_finetune_distributed.py or single-device LoRA
+# finetuning with lora_finetune_single_device.py using a Llama2 7B model
 #
-# This config assumes that you've run the following command before launching
-# this run:
-#   tune download --repo-id meta-llama/Llama-2-7b \
-#   --hf-token <HF_TOKEN> \
-#   --output-dir /tmp/llama2
+# This config assumes that you've run the following command before launching:
+# $ tune download meta-llama/Llama-2-7b \
+#       --hf-token <HF_TOKEN> \
+#       --output-dir /tmp/llama2
 #
 # To launch on 4 devices, run the following command from root:
-#   tune run --nproc_per_node 4 lora_finetune_distributed \
-#   --config llama2/7B_lora \
+# $ tune run --nproc_per_node 4 lora_finetune_distributed \
+#       --config llama2/7B_lora
+#
+# To launch on a single device, run the following command:
+# $ tune run lora_finetune_single_device \
+#       --config llama2/7B_lora
 #
 # You can add specific overrides through the command line. For example
-# to override the checkpointer directory while launching training
-# you can run:
-#   tune --nnodes 1 --nproc_per_node 4 lora_finetune_distributed \
-#   --config llama2/7B_lora \
-#   checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
+# to override the checkpointer directory while launching training:
+# $ tune run --nproc_per_node 4 lora_finetune_distributed \
+#       --config llama2/7B_lora \
+#       checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
 #
-# This config works best when the model is being fine-tuned on 2+ GPUs.
-# For single device lora finetuning please use 7B_lora_single_device.yaml
-# or 7B_qlora_single_device.yaml
+# For more memory optimizations, such as those needed when running on a single GPU,
+# use llama2/7B_qlora
 
 
 # Model Arguments
diff --git a/recipes/configs/llama2/7B_lora_single_device.yaml b/recipes/configs/llama2/7B_lora_single_device.yaml
deleted file mode 100644
index de19afe428..0000000000
--- a/recipes/configs/llama2/7B_lora_single_device.yaml
+++ /dev/null
@@ -1,83 +0,0 @@
-# Config for single device LoRA finetuning in lora_finetune_single_device.py
-# using a Llama2 7B model
-#
-# This config assumes that you've run the following command before launching
-# this run:
-#   tune download --repo-id meta-llama/Llama-2-7b \
-#   --hf-token <HF_TOKEN> \
-#   --output-dir /tmp/llama2
-#
-# To launch on a single device, run the following command from root:
-#   tune run lora_finetune_single_device \
-#   --config llama2/7B_lora_single_device \
-#
-# You can add specific overrides through the command line. For example
-# to override the checkpointer directory while launching training
-# you can run:
-#   tune --nnodes 1 --nproc_per_node 1 lora_finetune_single_device \
-#   --config 7B_lora_single_device \
-#   checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
-#
-# This config works only for training on single device.
-
-
-# Model Arguments
-model:
-  _component_: torchtune.models.llama2.lora_llama2_7b
-  lora_attn_modules: ['q_proj', 'v_proj']
-  apply_lora_to_mlp: False
-  apply_lora_to_output: False
-  lora_rank: 8
-  lora_alpha: 16
-
-checkpointer:
-  _component_: torchtune.utils.FullModelMetaCheckpointer
-  checkpoint_dir: /tmp/llama2/
-  checkpoint_files: [consolidated.00.pth]
-  adapter_checkpoint: null
-  recipe_checkpoint: null
-  output_dir: /tmp/llama2/
-  model_type: LLAMA2
-resume_from_checkpoint: False
-
-# Tokenizer
-tokenizer:
-  _component_: torchtune.models.llama2.llama2_tokenizer
-  path: /tmp/llama2/tokenizer.model
-
-# Dataset and Sampler
-dataset:
-  _component_: torchtune.datasets.alpaca_cleaned_dataset
-  train_on_input: True
-seed: null
-shuffle: True
-batch_size: 2
-
-# Optimizer and Scheduler
-optimizer:
-  _component_: torch.optim.AdamW
-  weight_decay: 0.01
-  lr: 3e-4
-lr_scheduler:
-  _component_: torchtune.modules.get_cosine_schedule_with_warmup
-  num_warmup_steps: 100
-
-loss:
-  _component_: torch.nn.CrossEntropyLoss
-
-# Training
-epochs: 1
-max_steps_per_epoch: null
-gradient_accumulation_steps: 1
-
-# Logging
-output_dir: /tmp/lora_finetune_output
-metric_logger:
-  _component_: torchtune.utils.metric_logging.DiskLogger
-  log_dir: ${output_dir}
-log_every_n_steps: null
-
-# Environment
-device: cuda
-dtype: bf16
-enable_activation_checkpointing: True
diff --git a/recipes/configs/llama2/7B_qlora_single_device.yaml b/recipes/configs/llama2/7B_qlora_single_device.yaml
deleted file mode 100644
index 26510f1642..0000000000
--- a/recipes/configs/llama2/7B_qlora_single_device.yaml
+++ /dev/null
@@ -1,83 +0,0 @@
-# Config for single device QLoRA with lora_finetune_single_device.py
-# using a Llama2 7B model
-#
-# This config assumes that you've run the following command before launching
-# this run:
-#   tune download --repo-id meta-llama/Llama-2-7b \
-#   --hf-token <HF_TOKEN> \
-#   --output-dir /tmp/llama2
-#
-# To launch on a single device, run the following command from root:
-#   tune run lora_finetune_single_device \
-#   --config llama2\7B_qlora_single_device \
-#
-# You can add specific overrides through the command line. For example
-# to override the checkpointer directory while launching training
-# you can run:
-#   tune --nnodes 1 --nproc_per_node 1 lora_finetune_single_device \
-#   --config 7B_qlora_single_device \
-#   checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
-#
-# This config works only for training on single device.
-
-# Model Arguments
-model:
-  _component_: torchtune.models.llama2.qlora_llama2_7b
-  lora_attn_modules: ['q_proj', 'v_proj', 'k_proj', 'output_proj']
-  apply_lora_to_mlp: True
-  apply_lora_to_output: False
-  lora_rank: 8
-  lora_alpha: 16
-  quantize_base: True
-
-checkpointer:
-  _component_: torchtune.utils.FullModelMetaCheckpointer
-  checkpoint_dir: /tmp/llama2
-  checkpoint_files: [consolidated.00.pth]
-  adapter_checkpoint: null
-  recipe_checkpoint: null
-  output_dir: /tmp/llama2/
-  model_type: LLAMA2
-resume_from_checkpoint: False
-
-# Tokenizer
-tokenizer:
-  _component_: torchtune.models.llama2.llama2_tokenizer
-  path: /tmp/llama2/tokenizer.model
-
-# Dataset and Sampler
-dataset:
-  _component_: torchtune.datasets.alpaca_cleaned_dataset
-  train_on_input: True
-seed: null
-shuffle: True
-batch_size: 2
-
-# Optimizer and Scheduler
-optimizer:
-  _component_: torch.optim.AdamW
-  weight_decay: 0.01
-  lr: 3e-4
-lr_scheduler:
-  _component_: torchtune.modules.get_cosine_schedule_with_warmup
-  num_warmup_steps: 100
-
-loss:
-  _component_: torch.nn.CrossEntropyLoss
-
-# Training
-epochs: 1
-max_steps_per_epoch: null
-gradient_accumulation_steps: 1
-
-# Logging
-output_dir: /tmp/qlora_finetune_output/
-metric_logger:
-  _component_: torchtune.utils.metric_logging.DiskLogger
-  log_dir: ${output_dir}
-log_every_n_steps: 1
-
-# Environment
-device: cuda
-dtype: bf16
-enable_activation_checkpointing: True
diff --git a/recipes/configs/mistral/7B_full.yaml b/recipes/configs/mistral/7B_full.yaml
index 211c5526ac..fcaa7f593b 100644
--- a/recipes/configs/mistral/7B_full.yaml
+++ b/recipes/configs/mistral/7B_full.yaml
@@ -3,7 +3,8 @@
 # from the paper
 #
 # Run this config on 4 GPUs using the following:
-# tune run --nproc_per_node 4 full_finetune_distributed --config mistral/7B_full
+# $ tune run --nproc_per_node 4 full_finetune_distributed --config mistral/7B_full
+
 
 # Tokenizer
 tokenizer:
diff --git a/recipes/configs/mistral/7B_lora.yaml b/recipes/configs/mistral/7B_lora.yaml
index 53f926de74..a62d8f4a78 100644
--- a/recipes/configs/mistral/7B_lora.yaml
+++ b/recipes/configs/mistral/7B_lora.yaml
@@ -3,7 +3,10 @@
 # from the paper
 #
 # Run this config on 4 GPUs using the following:
-# tune run --nproc_per_node 4 lora_finetune_distributed --config mistral/7B_lora
+# $ tune run --nproc_per_node 4 lora_finetune_distributed --config mistral/7B_lora
+#
+# Run this config on a single GPU with the following:
+# $ tune run lora_finetune_single_device --config mistral/7B_lora
 
 
 # Tokenizer
diff --git a/torchtune/_recipe_registry.py b/torchtune/_recipe_registry.py
index 20e578e306..61da86ea4f 100644
--- a/torchtune/_recipe_registry.py
+++ b/torchtune/_recipe_registry.py
@@ -28,16 +28,12 @@ class Recipe:
         file_path="full_finetune_single_device.py",
         configs=[
             Config(
-                name="llama2/7B_full_single_device",
-                file_path="llama2/7B_full_single_device.yaml",
+                name="llama2/7B_full",
+                file_path="llama2/7B_full.yaml",
             ),
             Config(
-                name="llama2/7B_full_single_device_low_memory",
-                file_path="llama2/7B_full_single_device_low_memory.yaml",
-            ),
-            Config(
-                name="mistral/7B_full",
-                file_path="mistral/7B_full.yaml",
+                name="llama2/7B_full_low_memory",
+                file_path="llama2/7B_full_low_memory.yaml",
             ),
         ],
         supports_distributed=False,
@@ -57,16 +53,12 @@ class Recipe:
         file_path="lora_finetune_single_device.py",
         configs=[
             Config(
-                name="llama2/7B_lora_single_device",
-                file_path="llama2/7B_lora_single_device.yaml",
-            ),
-            Config(
-                name="llama2/7B_qlora_single_device",
-                file_path="llama2/7B_qlora_single_device.yaml",
+                name="llama2/7B_lora",
+                file_path="llama2/7B_lora.yaml",
             ),
             Config(
-                name="mistral/7B_lora",
-                file_path="mistral/7B_lora.yaml",
+                name="llama2/7B_qlora",
+                file_path="llama2/7B_qlora.yaml",
             ),
         ],
         supports_distributed=False,

From 9c36605ba28450248376d5fea91bd8aff939c588 Mon Sep 17 00:00:00 2001
From: joecummings <jrcummings27@gmail.com>
Date: Tue, 2 Apr 2024 13:05:00 -0700
Subject: [PATCH 02/12] Add grad acc steps

---
 recipes/configs/llama2/13B_lora.yaml          |  1 +
 .../configs/llama2/7B_full_low_memory.yaml    | 73 ++++++++++++++++
 recipes/configs/llama2/7B_lora.yaml           |  1 +
 recipes/configs/llama2/7B_qlora.yaml          | 84 +++++++++++++++++++
 4 files changed, 159 insertions(+)
 create mode 100644 recipes/configs/llama2/7B_full_low_memory.yaml
 create mode 100644 recipes/configs/llama2/7B_qlora.yaml

diff --git a/recipes/configs/llama2/13B_lora.yaml b/recipes/configs/llama2/13B_lora.yaml
index 042a986277..022e0ccfb6 100644
--- a/recipes/configs/llama2/13B_lora.yaml
+++ b/recipes/configs/llama2/13B_lora.yaml
@@ -72,6 +72,7 @@ loss:
 # Training
 epochs: 1
 max_steps_per_epoch: null
+gradient_accumulation_steps: 1
 
 # Logging
 output_dir: /tmp/lora_finetune_output
diff --git a/recipes/configs/llama2/7B_full_low_memory.yaml b/recipes/configs/llama2/7B_full_low_memory.yaml
new file mode 100644
index 0000000000..124e95353b
--- /dev/null
+++ b/recipes/configs/llama2/7B_full_low_memory.yaml
@@ -0,0 +1,73 @@
+# Config for single-device full finetuning with full_finetune_single_device.py
+# using a Llama2 7B model
+#
+# This config assumes that you've run the following command before launching:
+# $ tune download meta-llama/Llama-2-7b \
+#       --hf-token <HF_TOKEN> \
+#       --output-dir /tmp/llama2
+#
+# To launch on a single device, run the following command:
+# $ tune run full_finetune_single_device \
+#       --config llama2/7B_full_low_memory
+#
+# You can add specific overrides through the command line. For example
+# to override the checkpointer directory while launching training:
+# $ tune run full_finetune_single_device \
+#       --config llama2/7B_full_low_memory \
+#       checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
+#
+# This config uses components from `bitsandbytes`; make sure you have it installed.
+
+# Tokenizer
+tokenizer:
+  _component_: torchtune.models.llama2.llama2_tokenizer
+  path: /tmp/llama2/tokenizer.model
+
+# Dataset
+dataset:
+  _component_: torchtune.datasets.alpaca_dataset
+  train_on_input: True
+seed: null
+shuffle: True
+
+# Model Arguments
+model:
+  _component_: torchtune.models.llama2.llama2_7b
+
+checkpointer:
+  _component_: torchtune.utils.FullModelMetaCheckpointer
+  checkpoint_dir: /tmp/llama2
+  checkpoint_files: [consolidated.00.pth]
+  recipe_checkpoint: null
+  output_dir: /tmp/llama2
+  model_type: LLAMA2
+resume_from_checkpoint: False
+
+# Fine-tuning arguments
+batch_size: 2
+epochs: 1
+optimizer:
+  _component_: bitsandbytes.optim.PagedAdamW
+  lr: 2e-5
+optimizer_in_bwd: True
+loss:
+  _component_: torch.nn.CrossEntropyLoss
+max_steps_per_epoch: null
+gradient_accumulation_steps: 1
+
+
+# Training environment
+device: cuda
+
+# Memory management
+enable_activation_checkpointing: True
+
+# Reduced precision
+dtype: bf16
+
+# Logging
+metric_logger:
+  _component_: torchtune.utils.metric_logging.DiskLogger
+  log_dir: ${output_dir}
+output_dir: /tmp/alpaca-llama2-finetune
+log_every_n_steps: null
diff --git a/recipes/configs/llama2/7B_lora.yaml b/recipes/configs/llama2/7B_lora.yaml
index fdf5971303..b1e9f01e4b 100644
--- a/recipes/configs/llama2/7B_lora.yaml
+++ b/recipes/configs/llama2/7B_lora.yaml
@@ -71,6 +71,7 @@ loss:
 # Training
 epochs: 1
 max_steps_per_epoch: null
+gradient_accumulation_steps: 1
 
 # Logging
 output_dir: /tmp/lora_finetune_output
diff --git a/recipes/configs/llama2/7B_qlora.yaml b/recipes/configs/llama2/7B_qlora.yaml
new file mode 100644
index 0000000000..d3c847579f
--- /dev/null
+++ b/recipes/configs/llama2/7B_qlora.yaml
@@ -0,0 +1,84 @@
+# Config for single device QLoRA with lora_finetune_single_device.py
+# using a Llama2 7B model
+#
+# This config assumes that you've run the following command before launching
+# this run:
+# $ tune download meta-llama/Llama-2-7b \
+#       --hf-token <HF_TOKEN> \
+#       --output-dir /tmp/llama2
+#
+# To launch on a single device, run the following command from root:
+# $ tune run lora_finetune_single_device \
+#       --config llama2/7B_qlora
+#
+# You can add specific overrides through the command line. For example
+# to override the checkpointer directory while launching training:
+# $ tune run lora_finetune_single_device \
+#       --config llama2/7B_qlora \
+#       checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
+#
+# This config works only for training on single device.
+
+
+# Model Arguments
+model:
+  _component_: torchtune.models.llama2.qlora_llama2_7b
+  lora_attn_modules: ['q_proj', 'v_proj', 'k_proj', 'output_proj']
+  apply_lora_to_mlp: True
+  apply_lora_to_output: False
+  lora_rank: 8
+  lora_alpha: 16
+  quantize_base: True
+
+checkpointer:
+  _component_: torchtune.utils.FullModelMetaCheckpointer
+  checkpoint_dir: /tmp/llama2
+  checkpoint_files: [consolidated.00.pth]
+  adapter_checkpoint: null
+  recipe_checkpoint: null
+  output_dir: /tmp/llama2/
+  model_type: LLAMA2
+resume_from_checkpoint: False
+
+# Tokenizer
+tokenizer:
+  _component_: torchtune.models.llama2.llama2_tokenizer
+  path: /tmp/llama2/tokenizer.model
+
+# Dataset and Sampler
+dataset:
+  _component_: torchtune.datasets.alpaca_dataset
+  train_on_input: True
+  use_clean: True
+seed: null
+shuffle: True
+batch_size: 2
+
+# Optimizer and Scheduler
+optimizer:
+  _component_: torch.optim.AdamW
+  weight_decay: 0.01
+  lr: 3e-4
+lr_scheduler:
+  _component_: torchtune.modules.get_cosine_schedule_with_warmup
+  num_warmup_steps: 100
+
+loss:
+  _component_: torch.nn.CrossEntropyLoss
+
+# Training
+epochs: 1
+max_steps_per_epoch: null
+gradient_accumulation_steps: 1
+
+# Logging
+output_dir: /tmp/qlora_finetune_output/
+metric_logger:
+  _component_: torchtune.utils.metric_logging.DiskLogger
+  log_dir: ${output_dir}
+log_every_n_steps: 1
+
+# Environment
+device: cuda
+dtype: bf16
+enable_activation_checkpointing: True

From da54679f4b0e372c3c481240be781273e8cb20f5 Mon Sep 17 00:00:00 2001
From: joecummings <jrcummings27@gmail.com>
Date: Tue, 2 Apr 2024 15:15:24 -0700
Subject: [PATCH 03/12] Optimizer in bwd

---
 recipes/configs/llama2/7B_full.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/recipes/configs/llama2/7B_full.yaml b/recipes/configs/llama2/7B_full.yaml
index 7a94ebd221..9a46de2d17 100644
--- a/recipes/configs/llama2/7B_full.yaml
+++ b/recipes/configs/llama2/7B_full.yaml
@@ -55,6 +55,7 @@ epochs: 3
 optimizer:
   _component_: torch.optim.AdamW
   lr: 2e-5
+optimizer_in_bwd: False
 loss:
   _component_: torch.nn.CrossEntropyLoss
 max_steps_per_epoch: null

From ca12ce00a4cd3667e97f43d21236819467b13add Mon Sep 17 00:00:00 2001
From: joecummings <jrcummings27@gmail.com>
Date: Tue, 2 Apr 2024 15:18:04 -0700
Subject: [PATCH 04/12] Update tests

---
 tests/recipes/test_full_finetune_single_device.py |  2 +-
 tests/recipes/test_lora_finetune_single_device.py | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/tests/recipes/test_full_finetune_single_device.py b/tests/recipes/test_full_finetune_single_device.py
index 55a95d2b55..7578b68c4a 100644
--- a/tests/recipes/test_full_finetune_single_device.py
+++ b/tests/recipes/test_full_finetune_single_device.py
@@ -48,7 +48,7 @@ def _fetch_expected_loss_values(self):
 
     @pytest.mark.integration_test
     @pytest.mark.parametrize(
-        "config", ["full_single_device_low_memory", "full_single_device"]
+        "config", ["full_low_memory", "full"]
     )
     def test_loss(self, config, tmpdir, monkeypatch):
         ckpt = "small_test_ckpt_meta"
diff --git a/tests/recipes/test_lora_finetune_single_device.py b/tests/recipes/test_lora_finetune_single_device.py
index b74d7d8faf..1142c20b06 100644
--- a/tests/recipes/test_lora_finetune_single_device.py
+++ b/tests/recipes/test_lora_finetune_single_device.py
@@ -60,7 +60,7 @@ def test_loss(self, tmpdir, monkeypatch):
 
         cmd = f"""
         tune run lora_finetune_single_device \
-            --config llama2/7B_lora_single_device \
+            --config llama2/7B_lora \
             output_dir={tmpdir} \
             checkpointer=torchtune.utils.FullModelMetaCheckpointer \
             checkpointer.checkpoint_dir='{ckpt_dir}' \
@@ -99,7 +99,7 @@ def test_loss_qlora(self, dtype, tmpdir, monkeypatch):
 
         cmd = f"""
         tune run lora_finetune_single_device
-            --config llama2/7B_qlora_single_device \
+            --config llama2/7B_qlora \
             output_dir={tmpdir} \
             checkpointer=torchtune.utils.FullModelMetaCheckpointer
             checkpointer.checkpoint_dir='{ckpt_dir}' \
@@ -151,7 +151,7 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch):
         # Train for two epochs
         cmd_1 = f"""
         tune run lora_finetune_single_device \
-            --config llama2/7B_lora_single_device \
+            --config llama2/7B_lora \
             output_dir={tmpdir} \
             checkpointer=torchtune.utils.FullModelHFCheckpointer \
             checkpointer.checkpoint_dir='{ckpt_dir}' \
@@ -176,7 +176,7 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch):
         # Resume training
         cmd_2 = f"""
         tune run lora_finetune_single_device \
-            --config llama2/7B_lora_single_device \
+            --config llama2/7B_lora \
             output_dir={tmpdir} \
             checkpointer=torchtune.utils.FullModelHFCheckpointer \
             checkpointer.checkpoint_dir={tmpdir} \
@@ -209,7 +209,7 @@ def test_save_and_load_merged_weights(self, tmpdir, monkeypatch):
 
         cmd = f"""
         tune run lora_finetune_single_device \
-            --config llama2/7B_lora_single_device \
+            --config llama2/7B_lora \
             output_dir={tmpdir} \
             checkpointer=torchtune.utils.FullModelTorchTuneCheckpointer \
             checkpointer.checkpoint_dir='{ckpt_dir}' \

From 47e01c5dc9207b655fca39736ac4c04d44178421 Mon Sep 17 00:00:00 2001
From: joecummings <jrcummings27@gmail.com>
Date: Tue, 2 Apr 2024 15:21:36 -0700
Subject: [PATCH 05/12] Lint

---
 tests/recipes/test_full_finetune_single_device.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/recipes/test_full_finetune_single_device.py b/tests/recipes/test_full_finetune_single_device.py
index 7578b68c4a..2494f82f30 100644
--- a/tests/recipes/test_full_finetune_single_device.py
+++ b/tests/recipes/test_full_finetune_single_device.py
@@ -47,9 +47,7 @@ def _fetch_expected_loss_values(self):
         return [10.5074, 10.5563, 10.5152, 10.4851]
 
     @pytest.mark.integration_test
-    @pytest.mark.parametrize(
-        "config", ["full_low_memory", "full"]
-    )
+    @pytest.mark.parametrize("config", ["full_low_memory", "full"])
     def test_loss(self, config, tmpdir, monkeypatch):
         ckpt = "small_test_ckpt_meta"
         ckpt_path = Path(CKPT_MODEL_PATHS[ckpt])

From d032735304beb5118844c3faca2c6dfccfb81420 Mon Sep 17 00:00:00 2001
From: joecummings <jrcummings27@gmail.com>
Date: Tue, 2 Apr 2024 15:48:13 -0700
Subject: [PATCH 06/12] Update CP tests

---
 tests/torchtune/_cli/test_cp.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/torchtune/_cli/test_cp.py b/tests/torchtune/_cli/test_cp.py
index 0b6ae3b944..0d90cd6bbb 100644
--- a/tests/torchtune/_cli/test_cp.py
+++ b/tests/torchtune/_cli/test_cp.py
@@ -62,7 +62,7 @@ def test_copy_skips_when_dest_already_exists_and_no_clobber_is_true(
         existing_file = tmpdir_path / "existing_file.yaml"
         existing_file.touch()
 
-        args = f"tune cp llama2/7B_full_single_device {existing_file} -n".split()
+        args = f"tune cp llama2/7B_full {existing_file} -n".split()
 
         monkeypatch.setattr(sys, "argv", args)
         runpy.run_path(TUNE_PATH, run_name="__main__")
@@ -80,7 +80,7 @@ def test_adds_correct_suffix_to_dest_when_no_suffix_is_provided(
         tmpdir_path = Path(tmpdir)
         dest = tmpdir_path / "my_custom_finetune"
 
-        args = f"tune cp llama2/7B_full_single_device {dest}".split()
+        args = f"tune cp llama2/7B_full {dest}".split()
 
         monkeypatch.setattr(sys, "argv", args)
         runpy.run_path(TUNE_PATH, run_name="__main__")

From 400eeaaba396b11f407b660ea4c3d88cc6b53f59 Mon Sep 17 00:00:00 2001
From: joecummings <jrcummings27@gmail.com>
Date: Tue, 2 Apr 2024 16:46:06 -0700
Subject: [PATCH 07/12] Remove 'use_clean'

---
 recipes/configs/llama2/7B_qlora.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/recipes/configs/llama2/7B_qlora.yaml b/recipes/configs/llama2/7B_qlora.yaml
index d3c847579f..65bcd7972d 100644
--- a/recipes/configs/llama2/7B_qlora.yaml
+++ b/recipes/configs/llama2/7B_qlora.yaml
@@ -49,7 +49,6 @@ tokenizer:
 dataset:
   _component_: torchtune.datasets.alpaca_dataset
   train_on_input: True
-  use_clean: True
 seed: null
 shuffle: True
 batch_size: 2

From c4cf3eb6991b600f35fd82d13d3b95ea88e90dbb Mon Sep 17 00:00:00 2001
From: joecummings <jrcummings27@gmail.com>
Date: Tue, 2 Apr 2024 16:47:29 -0700
Subject: [PATCH 08/12] Properly remove file names

---
 tests/recipes/test_full_finetune_single_device.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/recipes/test_full_finetune_single_device.py b/tests/recipes/test_full_finetune_single_device.py
index 2494f82f30..d759c7c53d 100644
--- a/tests/recipes/test_full_finetune_single_device.py
+++ b/tests/recipes/test_full_finetune_single_device.py
@@ -102,7 +102,7 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch):
         # Train for two epochs
         cmd_1 = f"""
         tune run full_finetune_single_device \
-            --config llama2/7B_full_single_device \
+            --config llama2/7B_full \
             output_dir={tmpdir} \
             checkpointer._component_=torchtune.utils.FullModelHFCheckpointer \
             checkpointer.checkpoint_dir='{ckpt_dir}' \
@@ -121,7 +121,7 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch):
         # Resume training
         cmd_2 = f"""
         tune run full_finetune_single_device \
-            --config llama2/7B_full_single_device \
+            --config llama2/7B_full \
             output_dir={tmpdir} \
             checkpointer._component_=torchtune.utils.FullModelHFCheckpointer \
             checkpointer.checkpoint_dir={tmpdir} \
@@ -184,7 +184,7 @@ def test_gradient_accumulation(self, tmpdir, monkeypatch):
 
         cmd_1 = f"""
         tune run full_finetune_single_device \
-            --config llama2/7B_full_single_device \
+            --config llama2/7B_full \
             checkpointer._component_=torchtune.utils.FullModelTorchTuneCheckpointer \
             checkpointer.checkpoint_dir={ckpt_dir} \
             checkpointer.checkpoint_files=[{ckpt_path}]\
@@ -210,7 +210,7 @@ def test_gradient_accumulation(self, tmpdir, monkeypatch):
         # Update the cmd with new values for gradient accumulation
         cmd_2 = f"""
         tune run full_finetune_single_device \
-            --config llama2/7B_full_single_device \
+            --config llama2/7B_full \
             checkpointer._component_=torchtune.utils.FullModelTorchTuneCheckpointer \
             checkpointer.checkpoint_dir={ckpt_dir} \
             checkpointer.checkpoint_files=[{ckpt_path}]\

From 9f5cf33ef2f4ca9c3f367d4b5a4f60e12090c710 Mon Sep 17 00:00:00 2001
From: joecummings <jrcummings27@gmail.com>
Date: Wed, 3 Apr 2024 08:17:01 -0700
Subject: [PATCH 09/12] Updates

---
 recipes/configs/llama2/13B_lora.yaml   | 1 -
 recipes/configs/llama2/7B_full.yaml    | 5 ++---
 recipes/configs/llama2/7B_lora.yaml    | 1 -
 recipes/configs/llama2/7B_qlora.yaml   | 2 +-
 recipes/configs/mistral/7B_lora.yaml   | 1 -
 recipes/full_finetune_single_device.py | 4 ++--
 recipes/lora_finetune_single_device.py | 6 +++---
 torchtune/_recipe_registry.py          | 2 ++
 8 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/recipes/configs/llama2/13B_lora.yaml b/recipes/configs/llama2/13B_lora.yaml
index 022e0ccfb6..042a986277 100644
--- a/recipes/configs/llama2/13B_lora.yaml
+++ b/recipes/configs/llama2/13B_lora.yaml
@@ -72,7 +72,6 @@ loss:
 # Training
 epochs: 1
 max_steps_per_epoch: null
-gradient_accumulation_steps: 1
 
 # Logging
 output_dir: /tmp/lora_finetune_output
diff --git a/recipes/configs/llama2/7B_full.yaml b/recipes/configs/llama2/7B_full.yaml
index 9a46de2d17..fb13247867 100644
--- a/recipes/configs/llama2/7B_full.yaml
+++ b/recipes/configs/llama2/7B_full.yaml
@@ -1,5 +1,5 @@
-# Config for multi-device with full_finetune_distributed.py or single-device full finetuning
-# with full_finetune_single_device.py using a Llama2 7B model
+# Config settings for multi-device using full_finetune_distributed.py or single-device full finetuning
+# with full_finetune_single_device.py, specifically for a Llama2 7B model
 #
 # This config assumes that you've run the following command before launching:
 # $ tune download meta-llama/Llama-2-7b \
@@ -55,7 +55,6 @@ epochs: 3
 optimizer:
   _component_: torch.optim.AdamW
   lr: 2e-5
-optimizer_in_bwd: False
 loss:
   _component_: torch.nn.CrossEntropyLoss
 max_steps_per_epoch: null
diff --git a/recipes/configs/llama2/7B_lora.yaml b/recipes/configs/llama2/7B_lora.yaml
index b1e9f01e4b..fdf5971303 100644
--- a/recipes/configs/llama2/7B_lora.yaml
+++ b/recipes/configs/llama2/7B_lora.yaml
@@ -71,7 +71,6 @@ loss:
 # Training
 epochs: 1
 max_steps_per_epoch: null
-gradient_accumulation_steps: 1
 
 # Logging
 output_dir: /tmp/lora_finetune_output
diff --git a/recipes/configs/llama2/7B_qlora.yaml b/recipes/configs/llama2/7B_qlora.yaml
index 65bcd7972d..4eeb6f3ff9 100644
--- a/recipes/configs/llama2/7B_qlora.yaml
+++ b/recipes/configs/llama2/7B_qlora.yaml
@@ -47,7 +47,7 @@ tokenizer:
 
 # Dataset and Sampler
 dataset:
-  _component_: torchtune.datasets.alpaca_dataset
+  _component_: torchtune.datasets.alpaca_cleaned_dataset
   train_on_input: True
 seed: null
 shuffle: True
diff --git a/recipes/configs/mistral/7B_lora.yaml b/recipes/configs/mistral/7B_lora.yaml
index a62d8f4a78..9e575fd810 100644
--- a/recipes/configs/mistral/7B_lora.yaml
+++ b/recipes/configs/mistral/7B_lora.yaml
@@ -57,7 +57,6 @@ loss:
 batch_size: 4
 epochs: 3
 max_steps_per_epoch: null
-gradient_accumulation_steps: 1
 
 # Training env
 device: cuda
diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py
index 3e6dd2fb1f..a043a16929 100644
--- a/recipes/full_finetune_single_device.py
+++ b/recipes/full_finetune_single_device.py
@@ -51,7 +51,7 @@ class FullFinetuneRecipeSingleDevice(FTRecipeInterface):
     The following configs can be used to run this recipe:
         >>> tune ls
         RECIPE                             CONFIG
-        full_finetune_single_device        llama2/7B_full_single_device
+        full_finetune_single_device        llama2/7B_full
 
     Args:
         cfg (DictConfig): OmegaConf object parsed from yaml file
@@ -87,7 +87,7 @@ def __init__(self, cfg: DictConfig) -> None:
         # Training cfg
         self._resume_from_checkpoint = cfg.resume_from_checkpoint
         self._gradient_accumulation_steps = cfg.gradient_accumulation_steps
-        self._optimizer_in_bwd = cfg.optimizer_in_bwd
+        self._optimizer_in_bwd = cfg.get("optimizer_in_bwd", False)
         # TODO: find a better place / way to perform validation of args that don't yet
         # compose with each other.
         if self._gradient_accumulation_steps > 1 and self._optimizer_in_bwd:
diff --git a/recipes/lora_finetune_single_device.py b/recipes/lora_finetune_single_device.py
index 37e36cd9ef..15545b1106 100644
--- a/recipes/lora_finetune_single_device.py
+++ b/recipes/lora_finetune_single_device.py
@@ -52,8 +52,8 @@ class LoRAFinetuneRecipeSingleDevice(FTRecipeInterface):
     The following configs can be used to run this recipe:
         >>> tune ls
         RECIPE                          CONFIG
-        lora_finetune_single_device     llama2/7B_lora_single_device
-                                        llama2/7B_qlora_single_device
+        lora_finetune_single_device     llama2/7B_lora
+                                        llama2/7B_qlora
 
     Args:
         cfg (DictConfig): OmegaConf object parsed from yaml file
@@ -96,7 +96,7 @@ def __init__(self, cfg: DictConfig) -> None:
         self.total_training_steps = 0
 
         self._resume_from_checkpoint = cfg.resume_from_checkpoint
-        self._gradient_accumulation_steps = cfg.gradient_accumulation_steps
+        self._gradient_accumulation_steps = cfg.get("gradient_accumulation_steps", 1)
 
     def load_checkpoint(self, cfg: DictConfig) -> Dict[str, Any]:
         """
diff --git a/torchtune/_recipe_registry.py b/torchtune/_recipe_registry.py
index 61da86ea4f..eddaec4906 100644
--- a/torchtune/_recipe_registry.py
+++ b/torchtune/_recipe_registry.py
@@ -35,6 +35,7 @@ class Recipe:
                 name="llama2/7B_full_low_memory",
                 file_path="llama2/7B_full_low_memory.yaml",
             ),
+            Config(name="mistral/7B_full", file_path="mistral/7B_full.yaml"),
         ],
         supports_distributed=False,
     ),
@@ -60,6 +61,7 @@ class Recipe:
                 name="llama2/7B_qlora",
                 file_path="llama2/7B_qlora.yaml",
             ),
+            Config(name="mistral/7B_lora", file_path="mistral/7B_lora.yaml"),
         ],
         supports_distributed=False,
     ),

From e0aae755a23b3c2cb5db9824b389e03defbdfb4e Mon Sep 17 00:00:00 2001
From: joecummings <jrcummings27@gmail.com>
Date: Wed, 3 Apr 2024 08:25:45 -0700
Subject: [PATCH 10/12] Fix bug: pass self._optimizer_in_bwd to _setup_optimizer

---
 recipes/full_finetune_single_device.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py
index a043a16929..acbaf17421 100644
--- a/recipes/full_finetune_single_device.py
+++ b/recipes/full_finetune_single_device.py
@@ -168,7 +168,7 @@ def setup(self, cfg: DictConfig) -> None:
         # checkpoint. Transforming the opt state dict is handled by this method
         self._optimizer = self._setup_optimizer(
             cfg_optimizer=cfg.optimizer,
-            optimizer_in_bwd=cfg.optimizer_in_bwd,
+            optimizer_in_bwd=self._optimizer_in_bwd,
             opt_state_dict=(
                 ckpt_dict[utils.OPT_KEY] if self._resume_from_checkpoint else None
             ),

From d8e4796d31cd40daf891d19bd51d1b3b913c2ac5 Mon Sep 17 00:00:00 2001
From: joecummings <jrcummings27@gmail.com>
Date: Wed, 3 Apr 2024 08:31:39 -0700
Subject: [PATCH 11/12] Updates to config comments

---
 recipes/configs/llama2/7B_full.yaml            | 2 +-
 recipes/configs/llama2/7B_full_low_memory.yaml | 2 +-
 recipes/configs/llama2/7B_qlora.yaml           | 7 +++----
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/recipes/configs/llama2/7B_full.yaml b/recipes/configs/llama2/7B_full.yaml
index fb13247867..b5ee6a8431 100644
--- a/recipes/configs/llama2/7B_full.yaml
+++ b/recipes/configs/llama2/7B_full.yaml
@@ -1,4 +1,4 @@
-# Config settings for multi-device using full_finetune_distributed.py or single-device full finetuning
+# Config settings for multi-device using full_finetune_distributed.py or single-device
 # with full_finetune_single_device.py, specifically for a Llama2 7B model
 #
 # This config assumes that you've run the following command before launching:
diff --git a/recipes/configs/llama2/7B_full_low_memory.yaml b/recipes/configs/llama2/7B_full_low_memory.yaml
index 124e95353b..a07714ae9d 100644
--- a/recipes/configs/llama2/7B_full_low_memory.yaml
+++ b/recipes/configs/llama2/7B_full_low_memory.yaml
@@ -6,7 +6,7 @@
 #       --hf-token <HF_TOKEN> \
 #       --output-dir /tmp/llama2
 #
-# To launch on a single device, run the following command:
+# To launch, run the following command:
 # $ tune run full_finetune_single_device \
 #       --config llama2/7B_full_low_memory
 #
diff --git a/recipes/configs/llama2/7B_qlora.yaml b/recipes/configs/llama2/7B_qlora.yaml
index 4eeb6f3ff9..3f88d4d745 100644
--- a/recipes/configs/llama2/7B_qlora.yaml
+++ b/recipes/configs/llama2/7B_qlora.yaml
@@ -1,13 +1,12 @@
-# Config for single device QLoRA with lora_finetune_single_device.py
+# Config for single-device QLoRA with lora_finetune_single_device.py
 # using a Llama2 7B model
 #
-# This config assumes that you've run the following command before launching
-# this run:
+# This config assumes that you've run the following command before launching:
 # $ tune download --repo-id meta-llama/Llama-2-7b \
 #       --hf-token <HF_TOKEN> \
 #       --output-dir /tmp/llama2
 #
-# To launch on a single device, run the following command from root:
+# To launch, run the following command from root:
 # $ tune run lora_finetune_single_device \
 #       --config llama2/7B_qlora
 #

From 01cfdce1ae61563d497206c7fd1ffe6caf974aaf Mon Sep 17 00:00:00 2001
From: joecummings <jrcummings27@gmail.com>
Date: Wed, 3 Apr 2024 08:41:16 -0700
Subject: [PATCH 12/12] Remove unsupported single-device commands in configs

---
 recipes/configs/llama2/13B_lora.yaml | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/recipes/configs/llama2/13B_lora.yaml b/recipes/configs/llama2/13B_lora.yaml
index 042a986277..d2fc1fed72 100644
--- a/recipes/configs/llama2/13B_lora.yaml
+++ b/recipes/configs/llama2/13B_lora.yaml
@@ -1,5 +1,4 @@
-# Config for multi-device with lora_finetune_distributed.py or single-device LoRA
-# finetuning with lora_finetune_single_device.py using a Llama2 13B model
+# Config for multi-device with lora_finetune_distributed.py using a Llama2 13B model
 #
 # This config assumes that you've run the following command before launching:
 # $ tune download meta-llama/Llama-2-13b \
@@ -10,16 +9,13 @@
 # $ tune run --nproc_per_node 4 lora_finetune_distributed \
 #       --config llama2/13B_lora
 #
-# To launch on a single device, run the following command:
-# $ tune run lora_finetune_single_device \
-#       --config llama2/13B_lora
-#
 # You can add specific overrides through the command line. For example
 # to override the checkpointer directory while launching training:
 # $ tune run --nproc_per_node 4 lora_finetune_distributed \
 #       --config llama2/13B_lora \
 #       checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
-
+#
+# This config is only tested on a multi-device setup
 
 # Model Arguments
 model: