From 985727467ca6b34c7925babc035a6b2d170714a9 Mon Sep 17 00:00:00 2001 From: joecummings Date: Tue, 2 Apr 2024 11:59:52 -0700 Subject: [PATCH 01/12] Rename files --- recipes/configs/llama2/13B_full.yaml | 30 +++---- recipes/configs/llama2/13B_lora.yaml | 35 ++++---- recipes/configs/llama2/7B_full.yaml | 35 ++++---- .../configs/llama2/7B_full_single_device.yaml | 76 ----------------- .../7B_full_single_device_low_memory.yaml | 76 ----------------- recipes/configs/llama2/7B_lora.yaml | 35 ++++---- .../configs/llama2/7B_lora_single_device.yaml | 83 ------------------- .../llama2/7B_qlora_single_device.yaml | 83 ------------------- recipes/configs/mistral/7B_full.yaml | 3 +- recipes/configs/mistral/7B_lora.yaml | 5 +- torchtune/_recipe_registry.py | 24 ++---- 11 files changed, 77 insertions(+), 408 deletions(-) delete mode 100644 recipes/configs/llama2/7B_full_single_device.yaml delete mode 100644 recipes/configs/llama2/7B_full_single_device_low_memory.yaml delete mode 100644 recipes/configs/llama2/7B_lora_single_device.yaml delete mode 100644 recipes/configs/llama2/7B_qlora_single_device.yaml diff --git a/recipes/configs/llama2/13B_full.yaml b/recipes/configs/llama2/13B_full.yaml index abbd9c45c5..b54821c93a 100644 --- a/recipes/configs/llama2/13B_full.yaml +++ b/recipes/configs/llama2/13B_full.yaml @@ -1,27 +1,19 @@ -# Config for multi-device full finetuning in full_finetune_distributed.py -# using a Llama2 13B model +# Config for multi-device with full_finetune_distributed.py using a Llama2 13B model # -# This config assumes that you've run the following command before launching -# this run: -# tune download --repo-id meta-llama/Llama-2-13b-hf \ -# --hf-token \ -# --output-dir /tmp/llama2-13b-hf +# This config assumes that you've run the following command before launching: +# $ tune download meta-llama/Llama-2-13b \ +# --hf-token \ +# --output-dir /tmp/llama2 # # To launch on 4 devices, run the following command from root: -# tune run --nproc_per_node 4 full_finetune_distributed \ -# --config llama2/13B_full \ +# $ tune run --nproc_per_node 4 full_finetune_distributed \ +# --config llama2/13B_full # # You can add specific overrides through the command line. For example -# to override the checkpointer directory while launching training -# you can run: -# tune --nnodes 1 --nproc_per_node 4 full_finetune_distributed \ -# --config llama2/13B_full \ -# checkpointer.checkpoint_dir= -# -# This config should be used with 2+ GPUs. Single device full fine-tuning -# requires several memory optimizations which are exposed through -# 7B_full_single_device.yaml. Please update the model and checkpoints to 13B -# in that config. +# to override the checkpointer directory while launching training: +# $ tune run --nproc_per_node 4 full_finetune_distributed \ +# --config llama2/13B_full \ +# checkpointer.checkpoint_dir= # Tokenizer diff --git a/recipes/configs/llama2/13B_lora.yaml b/recipes/configs/llama2/13B_lora.yaml index 947faf7c6a..042a986277 100644 --- a/recipes/configs/llama2/13B_lora.yaml +++ b/recipes/configs/llama2/13B_lora.yaml @@ -1,27 +1,24 @@ -# Config for multi-device LoRA in lora_finetune_distributed.py -# using a Llama2 13B model +# Config for multi-device with lora_finetune_distributed.py or single-device LoRA +# finetuning with lora_finetune_single_device.py using a Llama2 13B model # -# This config assumes that you've run the following command before launching -# this run: -# tune download --repo-id meta-llama/Llama-2-13b-hf \ -# --hf-token \ -# --output-dir /tmp/llama2-13b-hf +# This config assumes that you've run the following command before launching: +# $ tune download meta-llama/Llama-2-13b \ +# --hf-token \ +# --output-dir /tmp/llama2 # # To launch on 4 devices, run the following command from root: -# tune run --nproc_per_node 4 lora_finetune_distributed \ -# --config llama2/13B_lora \ +# $ tune run --nproc_per_node 4 lora_finetune_distributed \ +# --config llama2/13B_lora # -# You can add specific overrides through the command line. For example -# to override the checkpointer directory while launching training -# you can run: -# tune --nnodes 1 --nproc_per_node 4 lora_finetune_distributed \ -# --config llama2/13B_lora \ -# checkpointer.checkpoint_dir= +# To launch on a single device, run the following command: +# $ tune run lora_finetune_single_device \ +# --config llama2/13B_lora # -# This config works best when the model is being fine-tuned on 2+ GPUs. -# For single device lora finetuning please use 7B_lora_single_device.yaml -# or 7B_qlora_single_device.yaml and update the model and checkpoints to -# the 13B model. +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training: +# $ tune run --nproc_per_node 4 lora_finetune_distributed \ +# --config llama2/13B_lora \ +# checkpointer.checkpoint_dir= # Model Arguments diff --git a/recipes/configs/llama2/7B_full.yaml b/recipes/configs/llama2/7B_full.yaml index 16f3dcb3ec..7a94ebd221 100644 --- a/recipes/configs/llama2/7B_full.yaml +++ b/recipes/configs/llama2/7B_full.yaml @@ -1,26 +1,27 @@ -# Config for multi-device full finetuning in full_finetune_distributed.py -# using a Llama2 7B model +# Config for multi-device with full_finetune_distributed.py or single-device full finetuning +# with full_finetune_single_device.py using a Llama2 7B model # -# This config assumes that you've run the following command before launching -# this run: -# tune download --repo-id meta-llama/Llama-2-7b \ -# --hf-token \ -# --output-dir /tmp/llama2 +# This config assumes that you've run the following command before launching: +# $ tune download meta-llama/Llama-2-7b \ +# --hf-token \ +# --output-dir /tmp/llama2 # # To launch on 4 devices, run the following command from root: -# tune run --nproc_per_node 4 full_finetune_distributed \ -# --config llama2/7B_full \ +# $ tune run --nproc_per_node 4 full_finetune_distributed \ +# --config llama2/7B_full +# +# To launch on a single device, run the following command: +# $ tune run full_finetune_single_device \ +# --config llama2/7B_full # # You can add specific overrides through the command line. For example -# to override the checkpointer directory while launching training -# you can run: -# tune --nnodes 1 --nproc_per_node 4 full_finetune_distributed \ -# --config llama2/7B_full \ -# checkpointer.checkpoint_dir= +# to override the checkpointer directory while launching training: +# $ tune run --nproc_per_node 4 full_finetune_distributed \ +# --config llama2/7B_full \ +# checkpointer.checkpoint_dir= # -# This config works best when the model is being fine-tuned on 2+ GPUs. -# Single device full finetuning requires more memory optimizations. It's -# best to use 7B_full_single_device.yaml for those cases +# For more memory optimizations, such as those needed when running on a single GPU, +# use llama2/7B_full_low_memory_example # Tokenizer diff --git a/recipes/configs/llama2/7B_full_single_device.yaml b/recipes/configs/llama2/7B_full_single_device.yaml deleted file mode 100644 index 1d297a28ec..0000000000 --- a/recipes/configs/llama2/7B_full_single_device.yaml +++ /dev/null @@ -1,76 +0,0 @@ -# Config for single device full finetuning in full_finetune_single_device.py -# using a Llama2 7B model -# -# This config assumes that you've run the following command before launching -# this run: -# tune download --repo-id meta-llama/Llama-2-7b \ -# --hf-token \ -# --output-dir /tmp/llama2 -# -# To launch on a single device, run the following command from root: -# tune run full_finetune_single_device \ -# --config llama2/7B_full_single_device \ -# -# You can add specific overrides through the command line. For example -# to override the checkpointer directory while launching training -# you can run: -# tune --nnodes 1 --nproc_per_node 1 full_finetune_single_device \ -# --config llama2/7B_full_single_device \ -# checkpointer.checkpoint_dir= -# -# This config works only for training on single device. - - -# Tokenizer -tokenizer: - _component_: torchtune.models.llama2.llama2_tokenizer - path: /tmp/llama2/tokenizer.model - -# Dataset -dataset: - _component_: torchtune.datasets.alpaca_dataset - train_on_input: True -seed: null -shuffle: True - -# Model Arguments -model: - _component_: torchtune.models.llama2.llama2_7b - -checkpointer: - _component_: torchtune.utils.FullModelMetaCheckpointer - checkpoint_dir: /tmp/llama2 - checkpoint_files: [consolidated.00.pth] - recipe_checkpoint: null - output_dir: /tmp/llama2 - model_type: LLAMA2 -resume_from_checkpoint: False - -# Fine-tuning arguments -batch_size: 2 -epochs: 3 -optimizer: - _component_: torch.optim.SGD - lr: 2e-5 -loss: - _component_: torch.nn.CrossEntropyLoss -max_steps_per_epoch: null -gradient_accumulation_steps: 1 -optimizer_in_bwd: False - - -# Training environment -device: cuda - -# Memory management -enable_activation_checkpointing: True - -# Reduced precision -dtype: bf16 - -# Logging -metric_logger: - _component_: torchtune.utils.metric_logging.DiskLogger - log_dir: ${output_dir} -output_dir: /tmp/alpaca-llama2-finetune -log_every_n_steps: null diff --git a/recipes/configs/llama2/7B_full_single_device_low_memory.yaml b/recipes/configs/llama2/7B_full_single_device_low_memory.yaml deleted file mode 100644 index c1bfd5cb6f..0000000000 --- a/recipes/configs/llama2/7B_full_single_device_low_memory.yaml +++ /dev/null @@ -1,76 +0,0 @@ -# Config for single device full finetuning in full_finetune_single_device.py -# using a Llama2 7B model -# -# This config assumes that you've run the following command before launching -# this run: -# tune download --repo-id meta-llama/Llama-2-7b \ -# --hf-token \ -# --output-dir /tmp/llama2 -# -# To launch on a single device, run the following command from root: -# tune run full_finetune_single_device \ -# --config llama2/7B_full_single_device_low_memory \ -# -# You can add specific overrides through the command line. For example -# to override the checkpointer directory while launching training -# you can run: -# tune --nnodes 1 --nproc_per_node 1 full_finetune_single_device \ -# --config llama2/7B_full_single_device_low_memory \ -# checkpointer.checkpoint_dir= -# -# This config works only for training on single device. - - -# Tokenizer -tokenizer: - _component_: torchtune.models.llama2.llama2_tokenizer - path: /tmp/llama2/tokenizer.model - -# Dataset -dataset: - _component_: torchtune.datasets.alpaca_dataset - train_on_input: True -seed: null -shuffle: True - -# Model Arguments -model: - _component_: torchtune.models.llama2.llama2_7b - -checkpointer: - _component_: torchtune.utils.FullModelMetaCheckpointer - checkpoint_dir: /tmp/llama2 - checkpoint_files: [consolidated.00.pth] - recipe_checkpoint: null - output_dir: /tmp/llama2 - model_type: LLAMA2 -resume_from_checkpoint: False - -# Fine-tuning arguments -batch_size: 2 -epochs: 1 -optimizer: - _component_: bitsandbytes.optim.PagedAdamW - lr: 2e-5 -optimizer_in_bwd: True -loss: - _component_: torch.nn.CrossEntropyLoss -max_steps_per_epoch: null -gradient_accumulation_steps: 1 - - -# Training environment -device: cuda - -# Memory management -enable_activation_checkpointing: True - -# Reduced precision -dtype: bf16 - -# Logging -metric_logger: - _component_: torchtune.utils.metric_logging.DiskLogger - log_dir: ${output_dir} -output_dir: /tmp/alpaca-llama2-finetune -log_every_n_steps: null diff --git a/recipes/configs/llama2/7B_lora.yaml b/recipes/configs/llama2/7B_lora.yaml index d9035e64c9..fdf5971303 100644 --- a/recipes/configs/llama2/7B_lora.yaml +++ b/recipes/configs/llama2/7B_lora.yaml @@ -1,26 +1,27 @@ -# Config for multi-device LoRA finetuning in lora_finetune_distributed.py -# using a Llama2 7B model +# Config for multi-device with lora_finetune_distributed.py or single-device LoRA +# finetuning with lora_finetune_single_device.py using a Llama2 7B model # -# This config assumes that you've run the following command before launching -# this run: -# tune download --repo-id meta-llama/Llama-2-7b \ -# --hf-token \ -# --output-dir /tmp/llama2 +# This config assumes that you've run the following command before launching: +# $ tune download meta-llama/Llama-2-7b \ +# --hf-token \ +# --output-dir /tmp/llama2 # # To launch on 4 devices, run the following command from root: -# tune run --nproc_per_node 4 lora_finetune_distributed \ -# --config llama2/7B_lora \ +# $ tune run --nproc_per_node 4 lora_finetune_distributed \ +# --config llama2/7B_lora +# +# To launch on a single device, run the following command: +# $ tune run lora_finetune_single_device \ +# --config llama2/7B_lora # # You can add specific overrides through the command line. For example -# to override the checkpointer directory while launching training -# you can run: -# tune --nnodes 1 --nproc_per_node 4 lora_finetune_distributed \ -# --config llama2/7B_lora \ -# checkpointer.checkpoint_dir= +# to override the checkpointer directory while launching training: +# $ tune run --nproc_per_node 4 lora_finetune_distributed \ +# --config llama2/7B_lora \ +# checkpointer.checkpoint_dir= # -# This config works best when the model is being fine-tuned on 2+ GPUs. -# For single device lora finetuning please use 7B_lora_single_device.yaml -# or 7B_qlora_single_device.yaml +# For more memory optimizations, such as those needed when running on a single GPU, +# use llama2/7B_qlora # Model Arguments diff --git a/recipes/configs/llama2/7B_lora_single_device.yaml b/recipes/configs/llama2/7B_lora_single_device.yaml deleted file mode 100644 index de19afe428..0000000000 --- a/recipes/configs/llama2/7B_lora_single_device.yaml +++ /dev/null @@ -1,83 +0,0 @@ -# Config for single device LoRA finetuning in lora_finetune_single_device.py -# using a Llama2 7B model -# -# This config assumes that you've run the following command before launching -# this run: -# tune download --repo-id meta-llama/Llama-2-7b \ -# --hf-token \ -# --output-dir /tmp/llama2 -# -# To launch on a single device, run the following command from root: -# tune run lora_finetune_single_device \ -# --config llama2/7B_lora_single_device \ -# -# You can add specific overrides through the command line. For example -# to override the checkpointer directory while launching training -# you can run: -# tune --nnodes 1 --nproc_per_node 1 lora_finetune_single_device \ -# --config 7B_lora_single_device \ -# checkpointer.checkpoint_dir= -# -# This config works only for training on single device. - - -# Model Arguments -model: - _component_: torchtune.models.llama2.lora_llama2_7b - lora_attn_modules: ['q_proj', 'v_proj'] - apply_lora_to_mlp: False - apply_lora_to_output: False - lora_rank: 8 - lora_alpha: 16 - -checkpointer: - _component_: torchtune.utils.FullModelMetaCheckpointer - checkpoint_dir: /tmp/llama2/ - checkpoint_files: [consolidated.00.pth] - adapter_checkpoint: null - recipe_checkpoint: null - output_dir: /tmp/llama2/ - model_type: LLAMA2 -resume_from_checkpoint: False - -# Tokenizer -tokenizer: - _component_: torchtune.models.llama2.llama2_tokenizer - path: /tmp/llama2/tokenizer.model - -# Dataset and Sampler -dataset: - _component_: torchtune.datasets.alpaca_cleaned_dataset - train_on_input: True -seed: null -shuffle: True -batch_size: 2 - -# Optimizer and Scheduler -optimizer: - _component_: torch.optim.AdamW - weight_decay: 0.01 - lr: 3e-4 -lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup - num_warmup_steps: 100 - -loss: - _component_: torch.nn.CrossEntropyLoss - -# Training -epochs: 1 -max_steps_per_epoch: null -gradient_accumulation_steps: 1 - -# Logging -output_dir: /tmp/lora_finetune_output -metric_logger: - _component_: torchtune.utils.metric_logging.DiskLogger - log_dir: ${output_dir} -log_every_n_steps: null - -# Environment -device: cuda -dtype: bf16 -enable_activation_checkpointing: True diff --git a/recipes/configs/llama2/7B_qlora_single_device.yaml b/recipes/configs/llama2/7B_qlora_single_device.yaml deleted file mode 100644 index 26510f1642..0000000000 --- a/recipes/configs/llama2/7B_qlora_single_device.yaml +++ /dev/null @@ -1,83 +0,0 @@ -# Config for single device QLoRA with lora_finetune_single_device.py -# using a Llama2 7B model -# -# This config assumes that you've run the following command before launching -# this run: -# tune download --repo-id meta-llama/Llama-2-7b \ -# --hf-token \ -# --output-dir /tmp/llama2 -# -# To launch on a single device, run the following command from root: -# tune run lora_finetune_single_device \ -# --config llama2\7B_qlora_single_device \ -# -# You can add specific overrides through the command line. For example -# to override the checkpointer directory while launching training -# you can run: -# tune --nnodes 1 --nproc_per_node 1 lora_finetune_single_device \ -# --config 7B_qlora_single_device \ -# checkpointer.checkpoint_dir= -# -# This config works only for training on single device. - -# Model Arguments -model: - _component_: torchtune.models.llama2.qlora_llama2_7b - lora_attn_modules: ['q_proj', 'v_proj', 'k_proj', 'output_proj'] - apply_lora_to_mlp: True - apply_lora_to_output: False - lora_rank: 8 - lora_alpha: 16 - quantize_base: True - -checkpointer: - _component_: torchtune.utils.FullModelMetaCheckpointer - checkpoint_dir: /tmp/llama2 - checkpoint_files: [consolidated.00.pth] - adapter_checkpoint: null - recipe_checkpoint: null - output_dir: /tmp/llama2/ - model_type: LLAMA2 -resume_from_checkpoint: False - -# Tokenizer -tokenizer: - _component_: torchtune.models.llama2.llama2_tokenizer - path: /tmp/llama2/tokenizer.model - -# Dataset and Sampler -dataset: - _component_: torchtune.datasets.alpaca_cleaned_dataset - train_on_input: True -seed: null -shuffle: True -batch_size: 2 - -# Optimizer and Scheduler -optimizer: - _component_: torch.optim.AdamW - weight_decay: 0.01 - lr: 3e-4 -lr_scheduler: - _component_: torchtune.modules.get_cosine_schedule_with_warmup - num_warmup_steps: 100 - -loss: - _component_: torch.nn.CrossEntropyLoss - -# Training -epochs: 1 -max_steps_per_epoch: null -gradient_accumulation_steps: 1 - -# Logging -output_dir: /tmp/qlora_finetune_output/ -metric_logger: - _component_: torchtune.utils.metric_logging.DiskLogger - log_dir: ${output_dir} -log_every_n_steps: 1 - -# Environment -device: cuda -dtype: bf16 -enable_activation_checkpointing: True diff --git a/recipes/configs/mistral/7B_full.yaml b/recipes/configs/mistral/7B_full.yaml index 211c5526ac..fcaa7f593b 100644 --- a/recipes/configs/mistral/7B_full.yaml +++ b/recipes/configs/mistral/7B_full.yaml @@ -3,7 +3,8 @@ # from the paper # # Run this config on 4 GPUs using the following: -# tune run --nproc_per_node 4 full_finetune_distributed --config mistral/7B_full +# $ tune run --nproc_per_node 4 full_finetune_distributed --config mistral/7B_full + # Tokenizer tokenizer: diff --git a/recipes/configs/mistral/7B_lora.yaml b/recipes/configs/mistral/7B_lora.yaml index 53f926de74..a62d8f4a78 100644 --- a/recipes/configs/mistral/7B_lora.yaml +++ b/recipes/configs/mistral/7B_lora.yaml @@ -3,7 +3,10 @@ # from the paper # # Run this config on 4 GPUs using the following: -# tune run --nproc_per_node 4 lora_finetune_distributed --config mistral/7B_lora +# $ tune run --nproc_per_node 4 lora_finetune_distributed --config mistral/7B_lora +# +# Run this config on a single GPU with the following: +# $ tune run lora_finetune_single_device --config mistral/7B_lora # Tokenizer diff --git a/torchtune/_recipe_registry.py b/torchtune/_recipe_registry.py index 20e578e306..61da86ea4f 100644 --- a/torchtune/_recipe_registry.py +++ b/torchtune/_recipe_registry.py @@ -28,16 +28,12 @@ class Recipe: file_path="full_finetune_single_device.py", configs=[ Config( - name="llama2/7B_full_single_device", - file_path="llama2/7B_full_single_device.yaml", + name="llama2/7B_full", + file_path="llama2/7B_full.yaml", ), Config( - name="llama2/7B_full_single_device_low_memory", - file_path="llama2/7B_full_single_device_low_memory.yaml", - ), - Config( - name="mistral/7B_full", - file_path="mistral/7B_full.yaml", + name="llama2/7B_full_low_memory", + file_path="llama2/7B_full_low_memory.yaml", ), ], supports_distributed=False, @@ -57,16 +53,12 @@ class Recipe: file_path="lora_finetune_single_device.py", configs=[ Config( - name="llama2/7B_lora_single_device", - file_path="llama2/7B_lora_single_device.yaml", - ), - Config( - name="llama2/7B_qlora_single_device", - file_path="llama2/7B_qlora_single_device.yaml", + name="llama2/7B_lora", + file_path="llama2/7B_lora.yaml", ), Config( - name="mistral/7B_lora", - file_path="mistral/7B_lora.yaml", + name="llama2/7B_qlora", + file_path="llama2/7B_qlora.yaml", ), ], supports_distributed=False, From 9c36605ba28450248376d5fea91bd8aff939c588 Mon Sep 17 00:00:00 2001 From: joecummings Date: Tue, 2 Apr 2024 13:05:00 -0700 Subject: [PATCH 02/12] Add grad acc steps --- recipes/configs/llama2/13B_lora.yaml | 1 + .../configs/llama2/7B_full_low_memory.yaml | 73 ++++++++++++++++ recipes/configs/llama2/7B_lora.yaml | 1 + recipes/configs/llama2/7B_qlora.yaml | 84 +++++++++++++++++++ 4 files changed, 159 insertions(+) create mode 100644 recipes/configs/llama2/7B_full_low_memory.yaml create mode 100644 recipes/configs/llama2/7B_qlora.yaml diff --git a/recipes/configs/llama2/13B_lora.yaml b/recipes/configs/llama2/13B_lora.yaml index 042a986277..022e0ccfb6 100644 --- a/recipes/configs/llama2/13B_lora.yaml +++ b/recipes/configs/llama2/13B_lora.yaml @@ -72,6 +72,7 @@ loss: # Training epochs: 1 max_steps_per_epoch: null +gradient_accumulation_steps: 1 # Logging output_dir: /tmp/lora_finetune_output diff --git a/recipes/configs/llama2/7B_full_low_memory.yaml b/recipes/configs/llama2/7B_full_low_memory.yaml new file mode 100644 index 0000000000..124e95353b --- /dev/null +++ b/recipes/configs/llama2/7B_full_low_memory.yaml @@ -0,0 +1,73 @@ +# Config for single-device full finetuning with full_finetune_single_device.py +# using a Llama2 7B model +# +# This config assumes that you've run the following command before launching: +# $ tune download meta-llama/Llama-2-7b \ +# --hf-token \ +# --output-dir /tmp/llama2 +# +# To launch on a single device, run the following command: +# $ tune run full_finetune_single_device \ +# --config llama2/7B_full_low_memory +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training: +# $ tune run full_finetune_single_device \ +# --config llama2/7B_full_low_memory \ +# checkpointer.checkpoint_dir= +# +# This config specifies components from `bitsandbytes`, make sure you have it installed + +# Tokenizer +tokenizer: + _component_: torchtune.models.llama2.llama2_tokenizer + path: /tmp/llama2/tokenizer.model + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_dataset + train_on_input: True +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.llama2.llama2_7b + +checkpointer: + _component_: torchtune.utils.FullModelMetaCheckpointer + checkpoint_dir: /tmp/llama2 + checkpoint_files: [consolidated.00.pth] + recipe_checkpoint: null + output_dir: /tmp/llama2 + model_type: LLAMA2 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 1 +optimizer: + _component_: bitsandbytes.optim.PagedAdamW + lr: 2e-5 +optimizer_in_bwd: True +loss: + _component_: torch.nn.CrossEntropyLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 1 + + +# Training environment +device: cuda + +# Memory management +enable_activation_checkpointing: True + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.utils.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/alpaca-llama2-finetune +log_every_n_steps: null diff --git a/recipes/configs/llama2/7B_lora.yaml b/recipes/configs/llama2/7B_lora.yaml index fdf5971303..b1e9f01e4b 100644 --- a/recipes/configs/llama2/7B_lora.yaml +++ b/recipes/configs/llama2/7B_lora.yaml @@ -71,6 +71,7 @@ loss: # Training epochs: 1 max_steps_per_epoch: null +gradient_accumulation_steps: 1 # Logging output_dir: /tmp/lora_finetune_output diff --git a/recipes/configs/llama2/7B_qlora.yaml b/recipes/configs/llama2/7B_qlora.yaml new file mode 100644 index 0000000000..d3c847579f --- /dev/null +++ b/recipes/configs/llama2/7B_qlora.yaml @@ -0,0 +1,84 @@ +# Config for single device QLoRA with lora_finetune_single_device.py +# using a Llama2 7B model +# +# This config assumes that you've run the following command before launching +# this run: +# $ tune download --repo-id meta-llama/Llama-2-7b \ +# --hf-token \ +# --output-dir /tmp/llama2 +# +# To launch on a single device, run the following command from root: +# $ tune run lora_finetune_single_device \ +# --config llama2/7B_qlora +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training: +# $ tune run lora_finetune_single_device \ +# --config llama2/7B_qlora \ +# checkpointer.checkpoint_dir= +# +# This config works only for training on single device. + + +# Model Arguments +model: + _component_: torchtune.models.llama2.qlora_llama2_7b + lora_attn_modules: ['q_proj', 'v_proj', 'k_proj', 'output_proj'] + apply_lora_to_mlp: True + apply_lora_to_output: False + lora_rank: 8 + lora_alpha: 16 + quantize_base: True + +checkpointer: + _component_: torchtune.utils.FullModelMetaCheckpointer + checkpoint_dir: /tmp/llama2 + checkpoint_files: [consolidated.00.pth] + adapter_checkpoint: null + recipe_checkpoint: null + output_dir: /tmp/llama2/ + model_type: LLAMA2 +resume_from_checkpoint: False + +# Tokenizer +tokenizer: + _component_: torchtune.models.llama2.llama2_tokenizer + path: /tmp/llama2/tokenizer.model + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_dataset + train_on_input: True + use_clean: True +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.modules.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torch.nn.CrossEntropyLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 1 + +# Logging +output_dir: /tmp/qlora_finetune_output/ +metric_logger: + _component_: torchtune.utils.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: True From da54679f4b0e372c3c481240be781273e8cb20f5 Mon Sep 17 00:00:00 2001 From: joecummings Date: Tue, 2 Apr 2024 15:15:24 -0700 Subject: [PATCH 03/12] Optimizer in bwd --- recipes/configs/llama2/7B_full.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/recipes/configs/llama2/7B_full.yaml b/recipes/configs/llama2/7B_full.yaml index 7a94ebd221..9a46de2d17 100644 --- a/recipes/configs/llama2/7B_full.yaml +++ b/recipes/configs/llama2/7B_full.yaml @@ -55,6 +55,7 @@ epochs: 3 optimizer: _component_: torch.optim.AdamW lr: 2e-5 +optimizer_in_bwd: False loss: _component_: torch.nn.CrossEntropyLoss max_steps_per_epoch: null From ca12ce00a4cd3667e97f43d21236819467b13add Mon Sep 17 00:00:00 2001 From: joecummings Date: Tue, 2 Apr 2024 15:18:04 -0700 Subject: [PATCH 04/12] Update tests --- tests/recipes/test_full_finetune_single_device.py | 2 +- tests/recipes/test_lora_finetune_single_device.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/recipes/test_full_finetune_single_device.py b/tests/recipes/test_full_finetune_single_device.py index 55a95d2b55..7578b68c4a 100644 --- a/tests/recipes/test_full_finetune_single_device.py +++ b/tests/recipes/test_full_finetune_single_device.py @@ -48,7 +48,7 @@ def _fetch_expected_loss_values(self): @pytest.mark.integration_test @pytest.mark.parametrize( - "config", ["full_single_device_low_memory", "full_single_device"] + "config", ["full_low_memory", "full"] ) def test_loss(self, config, tmpdir, monkeypatch): ckpt = "small_test_ckpt_meta" diff --git a/tests/recipes/test_lora_finetune_single_device.py b/tests/recipes/test_lora_finetune_single_device.py index b74d7d8faf..1142c20b06 100644 --- a/tests/recipes/test_lora_finetune_single_device.py +++ b/tests/recipes/test_lora_finetune_single_device.py @@ -60,7 +60,7 @@ def test_loss(self, tmpdir, monkeypatch): cmd = f""" tune run lora_finetune_single_device \ - --config llama2/7B_lora_single_device \ + --config llama2/7B_lora \ output_dir={tmpdir} \ checkpointer=torchtune.utils.FullModelMetaCheckpointer \ checkpointer.checkpoint_dir='{ckpt_dir}' \ @@ -99,7 +99,7 @@ def test_loss_qlora(self, dtype, tmpdir, monkeypatch): cmd = f""" tune run lora_finetune_single_device - --config llama2/7B_qlora_single_device \ + --config llama2/7B_qlora \ output_dir={tmpdir} \ checkpointer=torchtune.utils.FullModelMetaCheckpointer checkpointer.checkpoint_dir='{ckpt_dir}' \ @@ -151,7 +151,7 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch): # Train for two epochs cmd_1 = f""" tune run lora_finetune_single_device \ - --config llama2/7B_lora_single_device \ + --config llama2/7B_lora \ output_dir={tmpdir} \ checkpointer=torchtune.utils.FullModelHFCheckpointer \ checkpointer.checkpoint_dir='{ckpt_dir}' \ @@ -176,7 +176,7 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch): # Resume training cmd_2 = f""" tune run lora_finetune_single_device \ - --config llama2/7B_lora_single_device \ + --config llama2/7B_lora \ output_dir={tmpdir} \ checkpointer=torchtune.utils.FullModelHFCheckpointer \ checkpointer.checkpoint_dir={tmpdir} \ @@ -209,7 +209,7 @@ def test_save_and_load_merged_weights(self, tmpdir, monkeypatch): cmd = f""" tune run lora_finetune_single_device \ - --config llama2/7B_lora_single_device \ + --config llama2/7B_lora \ output_dir={tmpdir} \ checkpointer=torchtune.utils.FullModelTorchTuneCheckpointer \ checkpointer.checkpoint_dir='{ckpt_dir}' \ From 47e01c5dc9207b655fca39736ac4c04d44178421 Mon Sep 17 00:00:00 2001 From: joecummings Date: Tue, 2 Apr 2024 15:21:36 -0700 Subject: [PATCH 05/12] Lint --- tests/recipes/test_full_finetune_single_device.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/recipes/test_full_finetune_single_device.py b/tests/recipes/test_full_finetune_single_device.py index 7578b68c4a..2494f82f30 100644 --- a/tests/recipes/test_full_finetune_single_device.py +++ b/tests/recipes/test_full_finetune_single_device.py @@ -47,9 +47,7 @@ def _fetch_expected_loss_values(self): return [10.5074, 10.5563, 10.5152, 10.4851] @pytest.mark.integration_test - @pytest.mark.parametrize( - "config", ["full_low_memory", "full"] - ) + @pytest.mark.parametrize("config", ["full_low_memory", "full"]) def test_loss(self, config, tmpdir, monkeypatch): ckpt = "small_test_ckpt_meta" ckpt_path = Path(CKPT_MODEL_PATHS[ckpt]) From d032735304beb5118844c3faca2c6dfccfb81420 Mon Sep 17 00:00:00 2001 From: joecummings Date: Tue, 2 Apr 2024 15:48:13 -0700 Subject: [PATCH 06/12] Update CP tests --- tests/torchtune/_cli/test_cp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/torchtune/_cli/test_cp.py b/tests/torchtune/_cli/test_cp.py index 0b6ae3b944..0d90cd6bbb 100644 --- a/tests/torchtune/_cli/test_cp.py +++ b/tests/torchtune/_cli/test_cp.py @@ -62,7 +62,7 @@ def test_copy_skips_when_dest_already_exists_and_no_clobber_is_true( existing_file = tmpdir_path / "existing_file.yaml" existing_file.touch() - args = f"tune cp llama2/7B_full_single_device {existing_file} -n".split() + args = f"tune cp llama2/7B_full {existing_file} -n".split() monkeypatch.setattr(sys, "argv", args) runpy.run_path(TUNE_PATH, run_name="__main__") @@ -80,7 +80,7 @@ def test_adds_correct_suffix_to_dest_when_no_suffix_is_provided( tmpdir_path = Path(tmpdir) dest = tmpdir_path / "my_custom_finetune" - args = f"tune cp llama2/7B_full_single_device {dest}".split() + args = f"tune cp llama2/7B_full {dest}".split() monkeypatch.setattr(sys, "argv", args) runpy.run_path(TUNE_PATH, run_name="__main__") From 400eeaaba396b11f407b660ea4c3d88cc6b53f59 Mon Sep 17 00:00:00 2001 From: joecummings Date: Tue, 2 Apr 2024 16:46:06 -0700 Subject: [PATCH 07/12] Remove 'use_clean' --- recipes/configs/llama2/7B_qlora.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/recipes/configs/llama2/7B_qlora.yaml b/recipes/configs/llama2/7B_qlora.yaml index d3c847579f..65bcd7972d 100644 --- a/recipes/configs/llama2/7B_qlora.yaml +++ b/recipes/configs/llama2/7B_qlora.yaml @@ -49,7 +49,6 @@ tokenizer: dataset: _component_: torchtune.datasets.alpaca_dataset train_on_input: True - use_clean: True seed: null shuffle: True batch_size: 2 From c4cf3eb6991b600f35fd82d13d3b95ea88e90dbb Mon Sep 17 00:00:00 2001 From: joecummings Date: Tue, 2 Apr 2024 16:47:29 -0700 Subject: [PATCH 08/12] Properly remove file names --- tests/recipes/test_full_finetune_single_device.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/recipes/test_full_finetune_single_device.py b/tests/recipes/test_full_finetune_single_device.py index 2494f82f30..d759c7c53d 100644 --- a/tests/recipes/test_full_finetune_single_device.py +++ b/tests/recipes/test_full_finetune_single_device.py @@ -102,7 +102,7 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch): # Train for two epochs cmd_1 = f""" tune run full_finetune_single_device \ - --config llama2/7B_full_single_device \ + --config llama2/7B_full \ output_dir={tmpdir} \ checkpointer._component_=torchtune.utils.FullModelHFCheckpointer \ checkpointer.checkpoint_dir='{ckpt_dir}' \ @@ -121,7 +121,7 @@ def test_training_state_on_resume(self, tmpdir, monkeypatch): # Resume training cmd_2 = f""" tune run full_finetune_single_device \ - --config llama2/7B_full_single_device \ + --config llama2/7B_full \ output_dir={tmpdir} \ checkpointer._component_=torchtune.utils.FullModelHFCheckpointer \ checkpointer.checkpoint_dir={tmpdir} \ @@ -184,7 +184,7 @@ def test_gradient_accumulation(self, tmpdir, monkeypatch): cmd_1 = f""" tune run full_finetune_single_device \ - --config llama2/7B_full_single_device \ + --config llama2/7B_full \ checkpointer._component_=torchtune.utils.FullModelTorchTuneCheckpointer \ checkpointer.checkpoint_dir={ckpt_dir} \ checkpointer.checkpoint_files=[{ckpt_path}]\ @@ -210,7 +210,7 @@ def test_gradient_accumulation(self, tmpdir, monkeypatch): # Update the cmd with new values for gradient accumulation cmd_2 = f""" tune run full_finetune_single_device \ - --config llama2/7B_full_single_device \ + --config llama2/7B_full \ checkpointer._component_=torchtune.utils.FullModelTorchTuneCheckpointer \ checkpointer.checkpoint_dir={ckpt_dir} \ checkpointer.checkpoint_files=[{ckpt_path}]\ From 9f5cf33ef2f4ca9c3f367d4b5a4f60e12090c710 Mon Sep 17 00:00:00 2001 From: joecummings Date: Wed, 3 Apr 2024 08:17:01 -0700 Subject: [PATCH 09/12] Updates --- recipes/configs/llama2/13B_lora.yaml | 1 - recipes/configs/llama2/7B_full.yaml | 5 ++--- recipes/configs/llama2/7B_lora.yaml | 1 - recipes/configs/llama2/7B_qlora.yaml | 2 +- recipes/configs/mistral/7B_lora.yaml | 1 - recipes/full_finetune_single_device.py | 4 ++-- recipes/lora_finetune_single_device.py | 6 +++--- torchtune/_recipe_registry.py | 2 ++ 8 files changed, 10 insertions(+), 12 deletions(-) diff --git a/recipes/configs/llama2/13B_lora.yaml b/recipes/configs/llama2/13B_lora.yaml index 022e0ccfb6..042a986277 100644 --- a/recipes/configs/llama2/13B_lora.yaml +++ b/recipes/configs/llama2/13B_lora.yaml @@ -72,7 +72,6 @@ loss: # Training epochs: 1 max_steps_per_epoch: null -gradient_accumulation_steps: 1 # Logging output_dir: /tmp/lora_finetune_output diff --git a/recipes/configs/llama2/7B_full.yaml b/recipes/configs/llama2/7B_full.yaml index 9a46de2d17..fb13247867 100644 --- a/recipes/configs/llama2/7B_full.yaml +++ b/recipes/configs/llama2/7B_full.yaml @@ -1,5 +1,5 @@ -# Config for multi-device with full_finetune_distributed.py or single-device full finetuning -# with full_finetune_single_device.py using a Llama2 7B model +# Config settings for multi-device using full_finetune_distributed.py or single-device full finetuning +# with full_finetune_single_device.py, specifically for a Llama2 7B model # # This config assumes that you've run the following command before launching: # $ tune download meta-llama/Llama-2-7b \ @@ -55,7 +55,6 @@ epochs: 3 optimizer: _component_: torch.optim.AdamW lr: 2e-5 -optimizer_in_bwd: False loss: _component_: torch.nn.CrossEntropyLoss max_steps_per_epoch: null diff --git a/recipes/configs/llama2/7B_lora.yaml b/recipes/configs/llama2/7B_lora.yaml index b1e9f01e4b..fdf5971303 100644 --- a/recipes/configs/llama2/7B_lora.yaml +++ b/recipes/configs/llama2/7B_lora.yaml @@ -71,7 +71,6 @@ loss: # Training epochs: 1 max_steps_per_epoch: null -gradient_accumulation_steps: 1 # Logging output_dir: /tmp/lora_finetune_output diff --git a/recipes/configs/llama2/7B_qlora.yaml b/recipes/configs/llama2/7B_qlora.yaml index 65bcd7972d..4eeb6f3ff9 100644 --- a/recipes/configs/llama2/7B_qlora.yaml +++ b/recipes/configs/llama2/7B_qlora.yaml @@ -47,7 +47,7 @@ tokenizer: # Dataset and Sampler dataset: - _component_: torchtune.datasets.alpaca_dataset + _component_: torchtune.datasets.alpaca_cleaned_dataset train_on_input: True seed: null shuffle: True diff --git a/recipes/configs/mistral/7B_lora.yaml b/recipes/configs/mistral/7B_lora.yaml index a62d8f4a78..9e575fd810 100644 --- a/recipes/configs/mistral/7B_lora.yaml +++ b/recipes/configs/mistral/7B_lora.yaml @@ -57,7 +57,6 @@ loss: batch_size: 4 epochs: 3 max_steps_per_epoch: null -gradient_accumulation_steps: 1 # Training env device: cuda diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py index 3e6dd2fb1f..a043a16929 100644 --- a/recipes/full_finetune_single_device.py +++ b/recipes/full_finetune_single_device.py @@ -51,7 +51,7 @@ class FullFinetuneRecipeSingleDevice(FTRecipeInterface): The following configs can be used to run this recipe: >>> tune ls RECIPE CONFIG - full_finetune_single_device llama2/7B_full_single_device + full_finetune_single_device llama2/7B_full Args: cfg (DictConfig): OmegaConf object parsed from yaml file @@ -87,7 +87,7 @@ def __init__(self, cfg: DictConfig) -> None: # Training cfg self._resume_from_checkpoint = cfg.resume_from_checkpoint self._gradient_accumulation_steps = cfg.gradient_accumulation_steps - self._optimizer_in_bwd = cfg.optimizer_in_bwd + self._optimizer_in_bwd = cfg.get("optimizer_in_bwd", False) # TODO: find a better place / way to perform validation of args that don't yet # compose with each other. if self._gradient_accumulation_steps > 1 and self._optimizer_in_bwd: diff --git a/recipes/lora_finetune_single_device.py b/recipes/lora_finetune_single_device.py index 37e36cd9ef..15545b1106 100644 --- a/recipes/lora_finetune_single_device.py +++ b/recipes/lora_finetune_single_device.py @@ -52,8 +52,8 @@ class LoRAFinetuneRecipeSingleDevice(FTRecipeInterface): The following configs can be used to run this recipe: >>> tune ls RECIPE CONFIG - lora_finetune_single_device llama2/7B_lora_single_device - llama2/7B_qlora_single_device + lora_finetune_single_device llama2/7B_lora + llama2/7B_qlora Args: cfg (DictConfig): OmegaConf object parsed from yaml file @@ -96,7 +96,7 @@ def __init__(self, cfg: DictConfig) -> None: self.total_training_steps = 0 self._resume_from_checkpoint = cfg.resume_from_checkpoint - self._gradient_accumulation_steps = cfg.gradient_accumulation_steps + self._gradient_accumulation_steps = cfg.get("gradient_accumulation_steps", 1) def load_checkpoint(self, cfg: DictConfig) -> Dict[str, Any]: """ diff --git a/torchtune/_recipe_registry.py b/torchtune/_recipe_registry.py index 61da86ea4f..eddaec4906 100644 --- a/torchtune/_recipe_registry.py +++ b/torchtune/_recipe_registry.py @@ -35,6 +35,7 @@ class Recipe: name="llama2/7B_full_low_memory", file_path="llama2/7B_full_low_memory.yaml", ), + Config(name="mistral/7B_full", file_path="mistral/7B_full.yaml"), ], supports_distributed=False, ), @@ -60,6 +61,7 @@ class Recipe: name="llama2/7B_qlora", file_path="llama2/7B_qlora.yaml", ), + Config(name="mistral/7B_lora", file_path="mistral/7B_lora.yaml"), ], supports_distributed=False, ), From e0aae755a23b3c2cb5db9824b389e03defbdfb4e Mon Sep 17 00:00:00 2001 From: joecummings Date: Wed, 3 Apr 2024 08:25:45 -0700 Subject: [PATCH 10/12] Fix bug --- recipes/full_finetune_single_device.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/recipes/full_finetune_single_device.py b/recipes/full_finetune_single_device.py index a043a16929..acbaf17421 100644 --- a/recipes/full_finetune_single_device.py +++ b/recipes/full_finetune_single_device.py @@ -168,7 +168,7 @@ def setup(self, cfg: DictConfig) -> None: # checkpoint. Transforming the opt state dict is handled by this method self._optimizer = self._setup_optimizer( cfg_optimizer=cfg.optimizer, - optimizer_in_bwd=cfg.optimizer_in_bwd, + optimizer_in_bwd=self._optimizer_in_bwd, opt_state_dict=( ckpt_dict[utils.OPT_KEY] if self._resume_from_checkpoint else None ), From d8e4796d31cd40daf891d19bd51d1b3b913c2ac5 Mon Sep 17 00:00:00 2001 From: joecummings Date: Wed, 3 Apr 2024 08:31:39 -0700 Subject: [PATCH 11/12] Updates to config comments --- recipes/configs/llama2/7B_full.yaml | 2 +- recipes/configs/llama2/7B_full_low_memory.yaml | 2 +- recipes/configs/llama2/7B_qlora.yaml | 7 +++---- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/recipes/configs/llama2/7B_full.yaml b/recipes/configs/llama2/7B_full.yaml index fb13247867..b5ee6a8431 100644 --- a/recipes/configs/llama2/7B_full.yaml +++ b/recipes/configs/llama2/7B_full.yaml @@ -1,4 +1,4 @@ -# Config settings for multi-device using full_finetune_distributed.py or single-device full finetuning +# Config settings for multi-device using full_finetune_distributed.py or single-device # with full_finetune_single_device.py, specifically for a Llama2 7B model # # This config assumes that you've run the following command before launching: diff --git a/recipes/configs/llama2/7B_full_low_memory.yaml b/recipes/configs/llama2/7B_full_low_memory.yaml index 124e95353b..a07714ae9d 100644 --- a/recipes/configs/llama2/7B_full_low_memory.yaml +++ b/recipes/configs/llama2/7B_full_low_memory.yaml @@ -6,7 +6,7 @@ # --hf-token \ # --output-dir /tmp/llama2 # -# To launch on a single device, run the following command: +# To launch, run the following command: # $ tune run full_finetune_single_device \ # --config llama2/7B_full_low_memory # diff --git a/recipes/configs/llama2/7B_qlora.yaml b/recipes/configs/llama2/7B_qlora.yaml index 4eeb6f3ff9..3f88d4d745 100644 --- a/recipes/configs/llama2/7B_qlora.yaml +++ b/recipes/configs/llama2/7B_qlora.yaml @@ -1,13 +1,12 @@ -# Config for single device QLoRA with lora_finetune_single_device.py +# Config for single-device QLoRA with lora_finetune_single_device.py # using a Llama2 7B model # -# This config assumes that you've run the following command before launching -# this run: +# This config assumes that you've run the following command before launching: # $ tune download --repo-id meta-llama/Llama-2-7b \ # --hf-token \ # --output-dir /tmp/llama2 # -# To launch on a single device, run the following command from root: +# To launch, run the following command from root: # $ tune run lora_finetune_single_device \ # --config llama2/7B_qlora # From 01cfdce1ae61563d497206c7fd1ffe6caf974aaf Mon Sep 17 00:00:00 2001 From: joecummings Date: Wed, 3 Apr 2024 08:41:16 -0700 Subject: [PATCH 12/12] Remove unsupported singel device commands in configs --- recipes/configs/llama2/13B_lora.yaml | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/recipes/configs/llama2/13B_lora.yaml b/recipes/configs/llama2/13B_lora.yaml index 042a986277..d2fc1fed72 100644 --- a/recipes/configs/llama2/13B_lora.yaml +++ b/recipes/configs/llama2/13B_lora.yaml @@ -1,5 +1,4 @@ -# Config for multi-device with lora_finetune_distributed.py or single-device LoRA -# finetuning with lora_finetune_single_device.py using a Llama2 13B model +# Config for multi-device with lora_finetune_distributed.py using a Llama2 13B model # # This config assumes that you've run the following command before launching: # $ tune download meta-llama/Llama-2-13b \ @@ -10,16 +9,13 @@ # $ tune run --nproc_per_node 4 lora_finetune_distributed \ # --config llama2/13B_lora # -# To launch on a single device, run the following command: -# $ tune run lora_finetune_single_device \ -# --config llama2/13B_lora -# # You can add specific overrides through the command line. For example # to override the checkpointer directory while launching training: # $ tune run --nproc_per_node 4 lora_finetune_distributed \ # --config llama2/13B_lora \ # checkpointer.checkpoint_dir= - +# +# This config is only tested on a multi-device setup # Model Arguments model: