Mistral QLoRA and config spring cleaning (#670)
ebsmothers authored Apr 11, 2024
1 parent 8bb3aae commit 6e9ea22
Showing 23 changed files with 362 additions and 87 deletions.
3 changes: 0 additions & 3 deletions recipes/configs/gemma/2B_full.yaml
@@ -69,9 +69,6 @@ gradient_accumulation_steps: 1
# Training env
device: cuda

# Distributed
cpu_offload: False

# Memory management
enable_activation_checkpointing: True

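A note on the `cpu_offload` key deleted here (and in the other full-finetune configs below): in the distributed recipes this flag presumably mapped to FSDP-style parameter CPU offloading. As a rough illustration only, not torchtune's actual recipe code, enabling that behavior in plain PyTorch looks like the sketch below; the process-group setup and the stand-in model are assumptions made for the example.

import torch
import torch.distributed as dist
from torch.distributed.fsdp import CPUOffload
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

# Assumes a torchrun-style launch so the process-group env vars are set.
dist.init_process_group(backend="nccl")
torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

model = torch.nn.Linear(4096, 4096).cuda()  # stand-in for a full transformer

# offload_params=True keeps sharded parameters in CPU memory between uses,
# trading speed for a smaller GPU footprint; this is the behavior the removed
# `cpu_offload` key presumably toggled.
model = FSDP(model, cpu_offload=CPUOffload(offload_params=True))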
5 changes: 1 addition & 4 deletions recipes/configs/llama2/13B_full.yaml
@@ -3,7 +3,7 @@
#
# This config assumes that you've run the following command before launching
# this run:
# tune download --repo-id meta-llama/Llama-2-13b-hf \
# tune download meta-llama/Llama-2-13b-hf \
# --hf-token <HF_TOKEN> \
# --output-dir /tmp/llama2-13b-hf
#
@@ -68,9 +68,6 @@ gradient_accumulation_steps: 1
# Training env
device: cuda

# Distributed
cpu_offload: False

# Memory management
enable_activation_checkpointing: True

4 changes: 2 additions & 2 deletions recipes/configs/llama2/13B_lora.yaml
@@ -3,7 +3,7 @@
#
# This config assumes that you've run the following command before launching
# this run:
# tune download --repo-id meta-llama/Llama-2-13b-hf \
# tune download meta-llama/Llama-2-13b-hf \
# --hf-token <HF_TOKEN> \
# --output-dir /tmp/llama2-13b-hf
#
@@ -19,7 +19,7 @@
# checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works best when the model is being fine-tuned on 2+ GPUs.
# For single device lora finetuning please use 7B_lora_single_device.yaml
# For single device LoRA finetuning please use 7B_lora_single_device.yaml
# or 7B_qlora_single_device.yaml and update the model and checkpoints to
# the 13B model.

5 changes: 1 addition & 4 deletions recipes/configs/llama2/7B_full.yaml
@@ -3,7 +3,7 @@
#
# This config assumes that you've run the following command before launching
# this run:
# tune download --repo-id meta-llama/Llama-2-7b \
# tune download meta-llama/Llama-2-7b \
# --hf-token <HF_TOKEN> \
# --output-dir /tmp/llama2
#
@@ -63,9 +63,6 @@ gradient_accumulation_steps: 1
# Training env
device: cuda

# Distributed
cpu_offload: False

# Memory management
enable_activation_checkpointing: True

@@ -3,19 +3,19 @@
#
# This config assumes that you've run the following command before launching
# this run:
# tune download --repo-id meta-llama/Llama-2-7b \
# tune download meta-llama/Llama-2-7b \
# --hf-token <HF_TOKEN> \
# --output-dir /tmp/llama2
#
# To launch on a single device, run the following command from root:
# tune run full_finetune_single_device \
# --config llama2/7B_full_single_device_low_memory \
# --config llama2/7B_full_low_memory \
#
# You can add specific overrides through the command line. For example
# to override the checkpointer directory while launching training
# you can run:
# tune run --nnodes 1 --nproc_per_node 1 full_finetune_single_device \
# --config llama2/7B_full_single_device_low_memory \
# tune run full_finetune_single_device \
# --config llama2/7B_full_low_memory \
# checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works only for training on single device.
@@ -48,7 +48,7 @@ resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 2
epochs: 1
epochs: 3
optimizer:
_component_: bitsandbytes.optim.PagedAdamW
lr: 2e-5
@@ -57,7 +57,7 @@ loss:
_component_: torch.nn.CrossEntropyLoss
max_steps_per_epoch: null
gradient_accumulation_steps: 1

compile: False

# Training environment
device: cuda
@@ -68,9 +68,6 @@ enable_activation_checkpointing: True
# Reduced precision
dtype: bf16

# Model compilation
compile: False

# Logging
metric_logger:
_component_: torchtune.utils.metric_logging.DiskLogger
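In the low-memory config above, the `compile` flag also moves up next to the other fine-tuning arguments instead of living in its own section at the bottom. Assuming the flag simply gates `torch.compile` on the model (a reading of the config, not something this diff shows), the corresponding recipe logic would be on the order of:

import torch

def maybe_compile(model: torch.nn.Module, compile_model: bool) -> torch.nn.Module:
    # When the config's `compile` flag is true, wrap the model with torch.compile;
    # the returned module is used by the training loop exactly like the original.
    return torch.compile(model) if compile_model else model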
4 changes: 2 additions & 2 deletions recipes/configs/llama2/7B_lora.yaml
@@ -3,7 +3,7 @@
#
# This config assumes that you've run the following command before launching
# this run:
# tune download --repo-id meta-llama/Llama-2-7b \
# tune download meta-llama/Llama-2-7b \
# --hf-token <HF_TOKEN> \
# --output-dir /tmp/llama2
#
@@ -19,7 +19,7 @@
# checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works best when the model is being fine-tuned on 2+ GPUs.
# For single device lora finetuning please use 7B_lora_single_device.yaml
# For single device LoRA finetuning please use 7B_lora_single_device.yaml
# or 7B_qlora_single_device.yaml


2 changes: 1 addition & 1 deletion recipes/configs/llama2/7B_lora_single_device.yaml
@@ -3,7 +3,7 @@
#
# This config assumes that you've run the following command before launching
# this run:
# tune download --repo-id meta-llama/Llama-2-7b \
# tune download meta-llama/Llama-2-7b \
# --hf-token <HF_TOKEN> \
# --output-dir /tmp/llama2
#
3 changes: 1 addition & 2 deletions recipes/configs/llama2/7B_qlora_single_device.yaml
@@ -3,7 +3,7 @@
#
# This config assumes that you've run the following command before launching
# this run:
# tune download --repo-id meta-llama/Llama-2-7b \
# tune download meta-llama/Llama-2-7b \
# --hf-token <HF_TOKEN> \
# --output-dir /tmp/llama2
#
@@ -28,7 +28,6 @@ model:
apply_lora_to_output: False
lora_rank: 8
lora_alpha: 16
quantize_base: True

checkpointer:
_component_: torchtune.utils.FullModelMetaCheckpointer
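The QLoRA config above drops its explicit `quantize_base: True` entry; that flag is what requests quantized (NF4) storage of the frozen base weights, the "Q" in QLoRA, while the adapter hyperparameters (`lora_rank: 8`, `lora_alpha: 16`) stay as before. Purely as an illustration of the structure involved, not torchtune's implementation, a minimal LoRA-style linear with a frozen base looks like this; the 4-bit storage of the frozen weight that QLoRA adds is only noted in a comment.

import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """Frozen base projection plus a trainable low-rank update."""

    def __init__(self, in_dim: int, out_dim: int, rank: int = 8, alpha: float = 16.0):
        super().__init__()
        self.base = nn.Linear(in_dim, out_dim, bias=False)
        self.base.weight.requires_grad_(False)  # frozen; QLoRA would also store this weight in 4-bit NF4
        self.lora_a = nn.Linear(in_dim, rank, bias=False)   # trainable down-projection
        self.lora_b = nn.Linear(rank, out_dim, bias=False)  # trainable up-projection
        nn.init.zeros_(self.lora_b.weight)       # adapter starts as a no-op
        self.scaling = alpha / rank

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.base(x) + self.scaling * self.lora_b(self.lora_a(x))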
23 changes: 20 additions & 3 deletions recipes/configs/mistral/7B_full.yaml
@@ -1,9 +1,29 @@
# Config for multi-device full finetuning in full_finetune_distributed.py
# using a Mistral 7B model
#
# This config uses hyperparameters based on small set of experiments and information
# available on various forums. These are not meant to replicate the numbers
# from the paper
#
# This config assumes that you've run the following command before launching
# this run:
# tune download mistralai/Mistral-7B-v0.1 \
# --hf-token <HF_TOKEN> \
# --output-dir /tmp/Mistral-7B-v0.1
#
# Run this config on 4 GPUs using the following:
# tune run --nproc_per_node 4 full_finetune_distributed --config mistral/7B_full
#
# You can add specific overrides through the command line. For example
# to override the checkpointer directory while launching training
# you can run:
# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed \
# --config mistral/7B_full \
# checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works best when the model is being fine-tuned on 2+ GPUs.
# Single device full finetuning requires more memory optimizations. It's
# best to use 7B_full_single_device.yaml for those cases

# Tokenizer
tokenizer:
@@ -48,9 +68,6 @@ gradient_accumulation_steps: 1
# Training env
device: cuda

# Distributed
cpu_offload: False

# Memory management
enable_activation_checkpointing: True

@@ -1,30 +1,33 @@
# Config for single device full finetuning in full_finetune_single_device.py
# using a Llama2 7B model
# using a Mistral 7B model
#
# This config uses hyperparameters based on small set of experiments and information
# available on various forums. These are not meant to replicate the numbers
# from the paper
#
# This config assumes that you've run the following command before launching
# this run:
# tune download --repo-id meta-llama/Llama-2-7b \
# tune download mistralai/Mistral-7B-v0.1 \
# --hf-token <HF_TOKEN> \
# --output-dir /tmp/llama2
# --output-dir /tmp/Mistral-7B-v0.1
#
# To launch on a single device, run the following command from root:
# tune run full_finetune_single_device \
# --config llama2/7B_full_single_device \
# --config mistral/7B_full_low_memory \
#
# You can add specific overrides through the command line. For example
# to override the checkpointer directory while launching training
# you can run:
# tune run --nnodes 1 --nproc_per_node 1 full_finetune_single_device \
# --config llama2/7B_full_single_device \
# tune run full_finetune_single_device \
# --config mistral/7B_full_low_memory \
# checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works only for training on single device.


# Tokenizer
tokenizer:
_component_: torchtune.models.llama2.llama2_tokenizer
path: /tmp/llama2/tokenizer.model
_component_: torchtune.models.mistral.mistral_tokenizer
path: /tmp/Mistral-7B-v0.1/tokenizer.model

# Dataset
dataset:
@@ -35,32 +38,33 @@ shuffle: True

# Model Arguments
model:
_component_: torchtune.models.llama2.llama2_7b
_component_: torchtune.models.mistral.mistral_7b

checkpointer:
_component_: torchtune.utils.FullModelMetaCheckpointer
checkpoint_dir: /tmp/llama2
checkpoint_files: [consolidated.00.pth]
_component_: torchtune.utils.FullModelHFCheckpointer
checkpoint_dir: /tmp/Mistral-7B-v0.1
checkpoint_files: [
pytorch_model-00001-of-00002.bin,
pytorch_model-00002-of-00002.bin
]
recipe_checkpoint: null
output_dir: /tmp/llama2
model_type: LLAMA2
output_dir: /tmp/Mistral-7B-v0.1/
model_type: MISTRAL
resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 2
epochs: 3
optimizer:
_component_: torch.optim.SGD
lr: 2e-5
_component_: bitsandbytes.optim.PagedAdamW
lr: 5e-6
loss:
_component_: torch.nn.CrossEntropyLoss
max_steps_per_epoch: null
gradient_accumulation_steps: 1
compile: False
optimizer_in_bwd: False
optimizer_in_bwd: True


# Training environment
# Training env
device: cuda

# Memory management
@@ -69,9 +73,12 @@ enable_activation_checkpointing: True
# Reduced precision
dtype: bf16

# Model compilation
compile: False

# Logging
metric_logger:
_component_: torchtune.utils.metric_logging.DiskLogger
log_dir: ${output_dir}
output_dir: /tmp/alpaca-llama2-finetune
output_dir: /tmp/Mistral-7B-v0.1/
log_every_n_steps: null
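Beyond the Llama2-to-Mistral fixes, the low-memory config above changes how optimization itself runs: bitsandbytes' paged AdamW replaces SGD, the learning rate drops to 5e-6, and `optimizer_in_bwd` is turned on. Running the optimizer "in the backward pass" means stepping each parameter as soon as its gradient is ready and freeing that gradient immediately, so a full set of gradients is never resident at once. A generic PyTorch sketch of that pattern, not torchtune's recipe code and with a stand-in model, is:

import torch
import bitsandbytes as bnb

model = torch.nn.Sequential(
    torch.nn.Linear(1024, 1024), torch.nn.ReLU(), torch.nn.Linear(1024, 1024)
).cuda()

# One small optimizer per parameter. PagedAdamW keeps its state in paged memory
# that can spill to CPU under pressure, which is what the config buys by using it.
optimizers = {p: bnb.optim.PagedAdamW([p], lr=5e-6) for p in model.parameters()}

def step_now(param: torch.Tensor) -> None:
    # Called once this parameter's gradient is fully accumulated: apply the
    # update and drop the gradient immediately instead of waiting for a global step.
    optimizers[param].step()
    optimizers[param].zero_grad()

for p in model.parameters():
    p.register_post_accumulate_grad_hook(step_now)

x = torch.randn(8, 1024, device="cuda")
loss = model(x).pow(2).mean()
loss.backward()  # parameter updates happen inside backward; no separate optimizer.step()

Note that fusing the step into backward is incompatible with accumulating gradients across batches, which is consistent with the config keeping gradient_accumulation_steps at 1.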
20 changes: 20 additions & 0 deletions recipes/configs/mistral/7B_lora.yaml
@@ -1,9 +1,29 @@
# Config for multi-device LoRA finetuning in lora_finetune_distributed.py
# using a Mistral 7B model
#
# This config uses hyperparameters based on small set of experiments and information
# available on various forums. These are not meant to replicate the numbers
# from the paper
#
# This config assumes that you've run the following command before launching
# this run:
# tune download mistralai/Mistral-7B-v0.1 \
# --hf-token <HF_TOKEN> \
# --output-dir /tmp/Mistral-7B-v0.1
#
# Run this config on 4 GPUs using the following:
# tune run --nproc_per_node 4 lora_finetune_distributed --config mistral/7B_lora
#
# You can add specific overrides through the command line. For example
# to override the checkpointer directory while launching training
# you can run:
# tune run --nnodes 1 --nproc_per_node 4 lora_finetune_distributed \
# --config mistral/7B_lora \
# checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works best when the model is being fine-tuned on 2+ GPUs.
# For single device LoRA finetuning please use 7B_lora_single_device.yaml
# or 7B_qlora_single_device.yaml for those cases


# Tokenizer