
Rename configs for consistency #640

Closed · wants to merge 12 commits
30 changes: 11 additions & 19 deletions recipes/configs/llama2/13B_full.yaml
@@ -1,27 +1,19 @@
-# Config for multi-device full finetuning in full_finetune_distributed.py
-# using a Llama2 13B model
+# Config for multi-device with full_finetune_distributed.py using a Llama2 13B model
 #
-# This config assumes that you've run the following command before launching
-# this run:
-#   tune download --repo-id meta-llama/Llama-2-13b-hf \
-#     --hf-token <HF_TOKEN> \
-#     --output-dir /tmp/llama2-13b-hf
+# This config assumes that you've run the following command before launching:
+#   $ tune download meta-llama/Llama-2-13b \
+#       --hf-token <HF_TOKEN> \
+#       --output-dir /tmp/llama2
 #
 # To launch on 4 devices, run the following command from root:
-#   tune run --nproc_per_node 4 full_finetune_distributed \
-#     --config llama2/13B_full \
+#   $ tune run --nproc_per_node 4 full_finetune_distributed \
+#       --config llama2/13B_full

Contributor: Do we want to split out with an example single device command here too? (I guess for 13B this is maybe less likely anyways)

Contributor: Nvm, I think these are not registered for the single-device recipes. Then maybe update the comment in 13B_lora.yaml?

Contributor (author): Update what comment? The topline comment says it's just for multi-device.

Contributor: Sorry, it's in the other yaml; commented at the specific lines.

 #
 # You can add specific overrides through the command line. For example
-# to override the checkpointer directory while launching training
-# you can run:
-#   tune --nnodes 1 --nproc_per_node 4 full_finetune_distributed \
-#     --config llama2/13B_full \
-#     checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
-#
-# This config should be used with 2+ GPUs. Single device full fine-tuning
-# requires several memory optimizations which are exposed through
-# 7B_full_single_device.yaml. Please update the model and checkpoints to 13B
-# in that config.
+# to override the checkpointer directory while launching training:
+#   $ tune run --nproc_per_node 4 full_finetune_distributed \
+#       --config llama2/13B_full \
+#       checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>


# Tokenizer
31 changes: 12 additions & 19 deletions recipes/configs/llama2/13B_lora.yaml
@@ -1,28 +1,21 @@
-# Config for multi-device LoRA in lora_finetune_distributed.py
-# using a Llama2 13B model
+# Config for multi-device with lora_finetune_distributed.py using a Llama2 13B model
 #
-# This config assumes that you've run the following command before launching
-# this run:
-#   tune download --repo-id meta-llama/Llama-2-13b-hf \
-#     --hf-token <HF_TOKEN> \
-#     --output-dir /tmp/llama2-13b-hf
+# This config assumes that you've run the following command before launching:
+#   $ tune download meta-llama/Llama-2-13b \
+#       --hf-token <HF_TOKEN> \
+#       --output-dir /tmp/llama2
 #
 # To launch on 4 devices, run the following command from root:
-#   tune run --nproc_per_node 4 lora_finetune_distributed \
-#     --config llama2/13B_lora \
+#   $ tune run --nproc_per_node 4 lora_finetune_distributed \
+#       --config llama2/13B_lora
 #
 # You can add specific overrides through the command line. For example
-# to override the checkpointer directory while launching training
-# you can run:
-#   tune --nnodes 1 --nproc_per_node 4 lora_finetune_distributed \
-#     --config llama2/13B_lora \
-#     checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
+# to override the checkpointer directory while launching training:
+#   $ tune run --nproc_per_node 4 lora_finetune_distributed \
+#       --config llama2/13B_lora \
+#       checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
 #
-# This config works best when the model is being fine-tuned on 2+ GPUs.
-# For single device lora finetuning please use 7B_lora_single_device.yaml
-# or 7B_qlora_single_device.yaml and update the model and checkpoints to
-# the 13B model.
+# This config is only tested on a multi-device setup

# Model Arguments
model:
35 changes: 18 additions & 17 deletions recipes/configs/llama2/7B_full.yaml
@@ -1,26 +1,27 @@
-# Config for multi-device full finetuning in full_finetune_distributed.py
-# using a Llama2 7B model
+# Config settings for multi-device using full_finetune_distributed.py or single-device
+# with full_finetune_single_device.py, specifically for a Llama2 7B model
 #
-# This config assumes that you've run the following command before launching
-# this run:
-#   tune download --repo-id meta-llama/Llama-2-7b \
-#     --hf-token <HF_TOKEN> \
-#     --output-dir /tmp/llama2
+# This config assumes that you've run the following command before launching:
+#   $ tune download meta-llama/Llama-2-7b \
+#       --hf-token <HF_TOKEN> \
+#       --output-dir /tmp/llama2
 #
 # To launch on 4 devices, run the following command from root:
-#   tune run --nproc_per_node 4 full_finetune_distributed \
-#     --config llama2/7B_full \
+#   $ tune run --nproc_per_node 4 full_finetune_distributed \
+#       --config llama2/7B_full
+#
+# To launch on a single device, run the following command:
+#   $ tune run full_finetune_single_device \
+#       --config llama2/7B_full
 #
 # You can add specific overrides through the command line. For example
-# to override the checkpointer directory while launching training
-# you can run:
-#   tune --nnodes 1 --nproc_per_node 4 full_finetune_distributed \
-#     --config llama2/7B_full \
-#     checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
+# to override the checkpointer directory while launching training:
+#   $ tune run --nproc_per_node 4 full_finetune_distributed \
+#       --config llama2/7B_full \
+#       checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
 #
-# This config works best when the model is being fine-tuned on 2+ GPUs.
-# Single device full finetuning requires more memory optimizations. It's
-# best to use 7B_full_single_device.yaml for those cases
+# For more memory optimizations, such as those needed when running on a single GPU,
+# use llama2/7B_full_low_memory_example


# Tokenizer
recipes/configs/llama2/7B_full_low_memory.yaml
@@ -1,25 +1,22 @@
-# Config for single device full finetuning in full_finetune_single_device.py
+# Config for single-device full finetuning with full_finetune_single_device.py
 # using a Llama2 7B model
 #
-# This config assumes that you've run the following command before launching
-# this run:
-#   tune download --repo-id meta-llama/Llama-2-7b \
-#     --hf-token <HF_TOKEN> \
-#     --output-dir /tmp/llama2
+# This config assumes that you've run the following command before launching:
+#   $ tune download meta-llama/Llama-2-7b \
+#       --hf-token <HF_TOKEN> \
+#       --output-dir /tmp/llama2
 #
-# To launch on a single device, run the following command from root:
-#   tune run full_finetune_single_device \
-#     --config llama2/7B_full_single_device_low_memory \
+# To launch, run the following command:
+#   $ tune run full_finetune_single_device \
+#       --config llama2/7B_full_low_memory
 #
 # You can add specific overrides through the command line. For example
-# to override the checkpointer directory while launching training
-# you can run:
-#   tune --nnodes 1 --nproc_per_node 1 full_finetune_single_device \
-#     --config llama2/7B_full_single_device_low_memory \
-#     checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
+# to override the checkpointer directory while launching training:
+#   $ tune run full_finetune_single_device \
+#       --config llama2/7B_full_low_memory \
+#       checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
 #
-# This config works only for training on single device.
+# This config specifies components from `bitsandbytes`, make sure you have it installed

# Tokenizer
tokenizer:
76 changes: 0 additions & 76 deletions recipes/configs/llama2/7B_full_single_device.yaml

This file was deleted.

35 changes: 18 additions & 17 deletions recipes/configs/llama2/7B_lora.yaml
@@ -1,26 +1,27 @@
-# Config for multi-device LoRA finetuning in lora_finetune_distributed.py
-# using a Llama2 7B model
+# Config for multi-device with lora_finetune_distributed.py or single-device LoRA
+# finetuning with lora_finetune_single_device.py using a Llama2 7B model
 #
-# This config assumes that you've run the following command before launching
-# this run:
-#   tune download --repo-id meta-llama/Llama-2-7b \
-#     --hf-token <HF_TOKEN> \
-#     --output-dir /tmp/llama2
+# This config assumes that you've run the following command before launching:
+#   $ tune download meta-llama/Llama-2-7b \
+#       --hf-token <HF_TOKEN> \
+#       --output-dir /tmp/llama2
 #
 # To launch on 4 devices, run the following command from root:
-#   tune run --nproc_per_node 4 lora_finetune_distributed \
-#     --config llama2/7B_lora \
+#   $ tune run --nproc_per_node 4 lora_finetune_distributed \
+#       --config llama2/7B_lora
+#
+# To launch on a single device, run the following command:
+#   $ tune run lora_finetune_single_device \
+#       --config llama2/7B_lora
 #
 # You can add specific overrides through the command line. For example
-# to override the checkpointer directory while launching training
-# you can run:
-#   tune --nnodes 1 --nproc_per_node 4 lora_finetune_distributed \
-#     --config llama2/7B_lora \
-#     checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
+# to override the checkpointer directory while launching training:
+#   $ tune run --nproc_per_node 4 lora_finetune_distributed \
+#       --config llama2/7B_lora \
+#       checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
 #
-# This config works best when the model is being fine-tuned on 2+ GPUs.
-# For single device lora finetuning please use 7B_lora_single_device.yaml
-# or 7B_qlora_single_device.yaml
+# For more memory optimizations, such as those needed when running on a single GPU,
+# use llama2/7B_qlora


# Model Arguments
83 changes: 0 additions & 83 deletions recipes/configs/llama2/7B_lora_single_device.yaml

This file was deleted.
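The configs above all document command-line overrides of the form `checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>`. As a rough illustration of how such dotted-key overrides can be merged into a nested config, here is a minimal Python sketch; this is a hypothetical example of the general mechanism, not torchtune's actual implementation, and the function name `apply_overrides` is made up for this sketch.

```python
def apply_overrides(config: dict, overrides: list) -> dict:
    """Merge "a.b.c=value" strings into a nested dict, creating keys as needed.

    Hypothetical sketch: real config systems also coerce value types and
    validate keys, which is omitted here.
    """
    for item in overrides:
        dotted_key, _, value = item.partition("=")
        keys = dotted_key.split(".")
        node = config
        for key in keys[:-1]:
            # Walk (or create) intermediate dicts for each dotted segment.
            node = node.setdefault(key, {})
        node[keys[-1]] = value
    return config


config = {"checkpointer": {"checkpoint_dir": "/tmp/llama2"}}
apply_overrides(config, ["checkpointer.checkpoint_dir=/my/ckpts"])
print(config["checkpointer"]["checkpoint_dir"])  # /my/ckpts
```

This is why a rename like `7B_full_single_device_low_memory` to `7B_full_low_memory` is user-visible: the config name appears verbatim in every `--config` flag and override example.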
