Mistral QLoRA and config spring cleaning #670

Merged: 11 commits merged Apr 11, 2024
Changes from 6 commits
3 changes: 0 additions & 3 deletions recipes/configs/gemma/2B_full.yaml
@@ -69,9 +69,6 @@ gradient_accumulation_steps: 1
# Training env
device: cuda

# Distributed
cpu_offload: False

# Memory management
enable_activation_checkpointing: True

3 changes: 0 additions & 3 deletions recipes/configs/llama2/13B_full.yaml
@@ -68,9 +68,6 @@ gradient_accumulation_steps: 1
# Training env
device: cuda

# Distributed
cpu_offload: False

# Memory management
enable_activation_checkpointing: True

2 changes: 1 addition & 1 deletion recipes/configs/llama2/13B_lora.yaml
@@ -19,7 +19,7 @@
# checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works best when the model is being fine-tuned on 2+ GPUs.
# For single device lora finetuning please use 7B_lora_single_device.yaml
# For single device LoRA finetuning please use 7B_lora_single_device.yaml
Contributor: Sorry, GitHub doesn't let me comment on the exact line, but mind updating the tune download command here as well? The command should remove repo-id:

     tune download meta-llama/Llama-2-13b-hf \
     --hf-token <HF_TOKEN> \
     --output-dir /tmp/llama2-13b-hf

# or 7B_qlora_single_device.yaml and update the model and checkpoints to
# the 13B model.

3 changes: 0 additions & 3 deletions recipes/configs/llama2/7B_full.yaml
@@ -63,9 +63,6 @@ gradient_accumulation_steps: 1
# Training env
device: cuda

# Distributed
cpu_offload: False

# Memory management
enable_activation_checkpointing: True

8 changes: 4 additions & 4 deletions recipes/configs/llama2/7B_full_single_device.yaml
@@ -14,7 +14,7 @@
# You can add specific overrides through the command line. For example
# to override the checkpointer directory while launching training
# you can run:
# tune run --nnodes 1 --nproc_per_node 1 full_finetune_single_device \
# tune run full_finetune_single_device \
Contributor: Same comment on tune download.

# --config llama2/7B_full_single_device \
# checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
@@ -48,15 +48,15 @@ resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 2
epochs: 3
epochs: 1
optimizer:
_component_: torch.optim.SGD
_component_: bitsandbytes.optim.PagedAdamW
Contributor: Awesome!

lr: 2e-5
Contributor: Why does mistral use 5e-6 but llama uses a different LR?

Contributor Author: Mistral FT hyperparams have not really been extensively tuned, cc @kartikayk who may have more context there.

optimizer_in_bwd: True
loss:
_component_: torch.nn.CrossEntropyLoss
max_steps_per_epoch: null
gradient_accumulation_steps: 1
optimizer_in_bwd: False


# Training environment
2 changes: 1 addition & 1 deletion recipes/configs/llama2/7B_lora.yaml
@@ -19,7 +19,7 @@
# checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works best when the model is being fine-tuned on 2+ GPUs.
# For single device lora finetuning please use 7B_lora_single_device.yaml
# For single device LoRA finetuning please use 7B_lora_single_device.yaml
# or 7B_qlora_single_device.yaml


1 change: 0 additions & 1 deletion recipes/configs/llama2/7B_qlora_single_device.yaml
@@ -28,7 +28,6 @@ model:
apply_lora_to_output: False
lora_rank: 8
lora_alpha: 16
quantize_base: True
Contributor: Why does this go away?

Contributor Author: It wasn't actually needed to begin with. qlora_llama2_7b is just a partial of lora_llama2_7b with quantize_base=True.
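For illustration, a minimal sketch of what such a partial builder might look like, assuming lora_llama2_7b exposes a quantize_base flag (the exact import path is an assumption):

    from functools import partial
    from torchtune.models.llama2 import lora_llama2_7b

    # Sketch: the QLoRA builder as a thin wrapper around the LoRA builder,
    # differing only in that the base model weights are quantized.
    qlora_llama2_7b = partial(lora_llama2_7b, quantize_base=True)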


checkpointer:
_component_: torchtune.utils.FullModelMetaCheckpointer
23 changes: 20 additions & 3 deletions recipes/configs/mistral/7B_full.yaml
@@ -1,9 +1,29 @@
# Config for multi-device full finetuning in full_finetune_distributed.py
# using a Mistral 7B model
#
# This config uses hyperparameters based on a small set of experiments and information
# available on various forums. These are not meant to replicate the numbers
# from the paper
#
# This config assumes that you've run the following command before launching
# this run:
# tune download --repo-id mistralai/Mistral-7B-v0.1 \
Contributor: The repo-id flag is deprecated; pass it as a positional arg.

Contributor: This actually lets me comment :) Same comment on tune download.
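Presumably the updated command would mirror the corrected 13B command earlier in this thread, with the repo passed positionally (paths as in this config):

    tune download mistralai/Mistral-7B-v0.1 \
    --hf-token <HF_TOKEN> \
    --output-dir /tmp/Mistral-7B-v0.1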

# --hf-token <HF_TOKEN> \
# --output-dir /tmp/Mistral-7B-v0.1
#
# Run this config on 4 GPUs using the following:
# tune run --nproc_per_node 4 full_finetune_distributed --config mistral/7B_full
#
# You can add specific overrides through the command line. For example
# to override the checkpointer directory while launching training
# you can run:
# tune --nnodes 1 --nproc_per_node 4 full_finetune_distributed \
Contributor: Suggested change:
    - # tune --nnodes 1 --nproc_per_node 4 full_finetune_distributed \
    + # tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed \

Contributor: Btw can we group all flag arguments together and do tune run recipe --flags instead of in between? cc @joecummings

Contributor: Can you explain more @RdoubleA?
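Presumably the proposal is to group the launcher flags after the recipe name rather than between run and the recipe, e.g. (illustrative only; whether the CLI accepts this ordering is an assumption):

    # current
    tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config mistral/7B_full
    # proposed
    tune run full_finetune_distributed --nnodes 1 --nproc_per_node 4 --config mistral/7B_full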

# --config mistral/7B_full \
# checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works best when the model is being fine-tuned on 2+ GPUs.
# Single device full finetuning requires more memory optimizations. It's
# best to use 7B_full_single_device.yaml for those cases

# Tokenizer
tokenizer:
@@ -48,9 +68,6 @@ gradient_accumulation_steps: 1
# Training env
device: cuda

# Distributed
cpu_offload: False

# Memory management
enable_activation_checkpointing: True

@@ -1,30 +1,33 @@
# Config for single device full finetuning in full_finetune_single_device.py
# using a Llama2 7B model
# using a Mistral 7B model
#
# This config uses hyperparameters based on a small set of experiments and information
# available on various forums. These are not meant to replicate the numbers
# from the paper
#
# This config assumes that you've run the following command before launching
# this run:
# tune download --repo-id meta-llama/Llama-2-7b \
# tune download --repo-id mistralai/Mistral-7B-v0.1 \
Contributor: Remove repo id.

# --hf-token <HF_TOKEN> \
# --output-dir /tmp/llama2
# --output-dir /tmp/Mistral-7B-v0.1
#
# To launch on a single device, run the following command from root:
# tune run full_finetune_single_device \
# --config llama2/7B_full_single_device_low_memory \
# --config mistral/7B_full_single_device \
#
# You can add specific overrides through the command line. For example
# to override the checkpointer directory while launching training
# you can run:
# tune run --nnodes 1 --nproc_per_node 1 full_finetune_single_device \
# --config llama2/7B_full_single_device_low_memory \
# tune run full_finetune_single_device \
# --config llama2/7B_full_single_device \
Contributor: Suggested change:
    - # --config llama2/7B_full_single_device \
    + # --config mistral/7B_full_single_device \

Contributor Author: I knew some of those copy-pastes would come back to bite me. Thanks.

# checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works only for training on single device.


# Tokenizer
tokenizer:
_component_: torchtune.models.llama2.llama2_tokenizer
path: /tmp/llama2/tokenizer.model
_component_: torchtune.models.mistral.mistral_tokenizer
path: /tmp/Mistral-7B-v0.1/tokenizer.model

# Dataset
dataset:
@@ -35,31 +38,33 @@ shuffle: True

# Model Arguments
model:
_component_: torchtune.models.llama2.llama2_7b
_component_: torchtune.models.mistral.mistral_7b

checkpointer:
_component_: torchtune.utils.FullModelMetaCheckpointer
checkpoint_dir: /tmp/llama2
checkpoint_files: [consolidated.00.pth]
_component_: torchtune.utils.FullModelHFCheckpointer
checkpoint_dir: /tmp/Mistral-7B-v0.1
checkpoint_files: [
pytorch_model-00001-of-00002.bin,
pytorch_model-00002-of-00002.bin
]
recipe_checkpoint: null
output_dir: /tmp/llama2
model_type: LLAMA2
output_dir: /tmp/Mistral-7B-v0.1/
model_type: MISTRAL
resume_from_checkpoint: False

# Fine-tuning arguments
batch_size: 2
epochs: 1
epochs: 3
Contributor: You did epochs=1 in another config? What's the reason for the difference?

Contributor Author: I alluded to this in the PR summary, but from my perspective there is no rhyme or reason as to how we are setting epochs in our configs currently. Did a quick pass; here's the current state of the world:

3 epochs
Gemma 2B full
Mistral 7B lora
Mistral 7B full
Llama2 7B full single device
Llama2 13B full
Llama2 7B full

1 epoch
Llama2 7B LoRA
Llama2 13B LoRA
Llama2 7B LoRA single device
Llama2 7B QLoRA single device
Llama2 7B full single device low memory

So it seems like 1 epoch is used only for the Llama2 LoRA configs, but then also, weirdly, the low-memory single-device full finetune (but not the regular single-device full finetune, which I am scrapping anyway).

In that case, I would keep this one as-is and change the Llama2 single-device one to 3 epochs so that the dividing line is just "Llama2 LoRA configs train for one epoch, all others train for 3 epochs". Honestly I don't really understand that either and have half a mind to set everything to one epoch. Is there any reason not to do that?

optimizer:
_component_: bitsandbytes.optim.PagedAdamW
lr: 2e-5
optimizer_in_bwd: True
lr: 5e-6
loss:
_component_: torch.nn.CrossEntropyLoss
max_steps_per_epoch: null
gradient_accumulation_steps: 1
optimizer_in_bwd: True


# Training environment
# Training env
device: cuda

# Memory management
@@ -72,5 +77,5 @@ dtype: bf16
metric_logger:
_component_: torchtune.utils.metric_logging.DiskLogger
log_dir: ${output_dir}
output_dir: /tmp/alpaca-llama2-finetune
output_dir: /tmp/Mistral-7B-v0.1/
log_every_n_steps: null
20 changes: 20 additions & 0 deletions recipes/configs/mistral/7B_lora.yaml
@@ -1,9 +1,29 @@
# Config for multi-device LoRA finetuning in lora_finetune_distributed.py
# using a Mistral 7B model
#
# This config uses hyperparameters based on a small set of experiments and information
# available on various forums. These are not meant to replicate the numbers
# from the paper
#
# This config assumes that you've run the following command before launching
# this run:
# tune download --repo-id mistralai/Mistral-7B-v0.1 \
Contributor: Repo id.

# --hf-token <HF_TOKEN> \
# --output-dir /tmp/Mistral-7B-v0.1
#
# Run this config on 4 GPUs using the following:
# tune run --nproc_per_node 4 lora_finetune_distributed --config mistral/7B_lora
#
# You can add specific overrides through the command line. For example
# to override the checkpointer directory while launching training
# you can run:
# tune --nnodes 1 --nproc_per_node 4 lora_finetune_distributed \
Contributor: Suggested change:
    - # tune --nnodes 1 --nproc_per_node 4 lora_finetune_distributed \
    + # tune run --nnodes 1 --nproc_per_node 4 lora_finetune_distributed \

# --config mistral/7B_lora \
# checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works best when the model is being fine-tuned on 2+ GPUs.
# For single device LoRA finetuning please use 7B_lora_single_device.yaml
# or 7B_qlora_single_device.yaml for those cases


# Tokenizer
98 changes: 98 additions & 0 deletions recipes/configs/mistral/7B_lora_single_device.yaml
@@ -0,0 +1,98 @@
# Config for single device LoRA finetuning in lora_finetune_single_device.py
# using a Mistral 7B model
#
# This config uses hyperparameters based on a small set of experiments and information
# available on various forums. These are not meant to replicate the numbers
# from the paper
#
# This config assumes that you've run the following command before launching
# this run:
# tune download --repo-id mistralai/Mistral-7B-v0.1 \
# --hf-token <HF_TOKEN> \
# --output-dir /tmp/Mistral-7B-v0.1
#
# To launch on a single device, run the following command from root:
# tune run lora_finetune_single_device \
# --config mistral/7B_lora_single_device \
#
# You can add specific overrides through the command line. For example
# to override the checkpointer directory while launching training
# you can run:
# tune run lora_finetune_single_device \
# --config mistral/7B_lora_single_device \
# checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works only for training on single device.

# Tokenizer
tokenizer:
_component_: torchtune.models.mistral.mistral_tokenizer
path: /tmp/Mistral-7B-v0.1/tokenizer.model

# Dataset
dataset:
_component_: torchtune.datasets.alpaca_dataset
train_on_input: True
seed: null
shuffle: True

# Model Arguments
model:
_component_: torchtune.models.mistral.lora_mistral_7b
lora_attn_modules: ['q_proj', 'k_proj', 'v_proj']
apply_lora_to_mlp: True
apply_lora_to_output: True
lora_rank: 64
lora_alpha: 16

checkpointer:
_component_: torchtune.utils.FullModelHFCheckpointer
checkpoint_dir: /tmp/Mistral-7B-v0.1
checkpoint_files: [
pytorch_model-00001-of-00002.bin,
pytorch_model-00002-of-00002.bin
]
recipe_checkpoint: null
output_dir: /tmp/Mistral-7B-v0.1
model_type: MISTRAL
resume_from_checkpoint: False

optimizer:
_component_: torch.optim.AdamW
lr: 2e-5

lr_scheduler:
_component_: torchtune.modules.get_cosine_schedule_with_warmup
num_warmup_steps: 100

loss:
_component_: torch.nn.CrossEntropyLoss

# Fine-tuning arguments
batch_size: 4
epochs: 3
max_steps_per_epoch: null
gradient_accumulation_steps: 1

# Training env
device: cuda

# Memory management
enable_activation_checkpointing: True

# Reduced precision
dtype: bf16

# Logging
metric_logger:
_component_: torchtune.utils.metric_logging.DiskLogger
log_dir: ${output_dir}
output_dir: /tmp/Mistral-7B-v0.1
log_every_n_steps: null

# Showcase the usage of the PyTorch profiler
# Set enabled to False as it's only needed for debugging training
profiler:
_component_: torchtune.utils.profiler
enabled: False
output_dir: /tmp/alpaca-llama2-finetune/torchtune_perf_tracing.json