diff --git a/docs/source/api_ref_models.rst b/docs/source/api_ref_models.rst
index e658de7294..645e7f2c19 100644
--- a/docs/source/api_ref_models.rst
+++ b/docs/source/api_ref_models.rst
@@ -6,6 +6,31 @@ torchtune.models
 
 .. currentmodule:: torchtune.models
 
+llama3.3
+--------
+
+Text-only models from the 3.3 version of `Llama3 family `_.
+
+Important: You need to request access on `Hugging Face `__ before downloading it.
+
+To download the Llama-3.3-70B-Instruct model:
+
+.. code-block:: bash
+
+    tune download meta-llama/Llama-3.3-70B-Instruct --ignore-patterns "original/consolidated.00.pth" --hf-token <HF_TOKEN>
+
+.. autosummary::
+    :toctree: generated/
+    :nosignatures:
+
+    llama3_3.llama3_3_70b
+    llama3_3.lora_llama3_3_70b
+    llama3_3.qlora_llama3_3_70b
+
+.. note::
+
+    The Llama3.3 tokenizer reuses the :class:`~torchtune.models.llama3.llama3_tokenizer` class.
+
 llama3.2
 --------
 
diff --git a/recipes/configs/llama3_3/70B_full.yaml b/recipes/configs/llama3_3/70B_full.yaml
new file mode 100644
index 0000000000..8f96a5fbd7
--- /dev/null
+++ b/recipes/configs/llama3_3/70B_full.yaml
@@ -0,0 +1,138 @@
+# Config for multi-device full finetuning in full_finetune_distributed.py
+# using a Llama3.3 70B Instruct model
+#
+# This config assumes that you've run the following command before launching
+# this run:
+#   tune download meta-llama/Llama-3.3-70B-Instruct --ignore-patterns "original/consolidated*"
+#
+# To launch on 8 devices, run the following command from root:
+#   tune run --nproc_per_node 8 full_finetune_distributed --config llama3_3/70B_full
+#
+# You can add specific overrides through the command line. For example
+# to override the checkpointer directory while launching training
+# you can run:
+#   tune run --nproc_per_node 8 full_finetune_distributed --config llama3_3/70B_full checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
+#
+# This config is only tested on an 8xA100 machine.
+#
+
+# Tokenizer
+tokenizer:
+  _component_: torchtune.models.llama3.llama3_tokenizer
+  path: /tmp/Llama-3.3-70B-Instruct/original/tokenizer.model
+  max_seq_len: null
+
+# Dataset
+dataset:
+  _component_: torchtune.datasets.alpaca_dataset
+  packed: False  # True increases speed
+seed: null
+shuffle: True
+
+# Model Arguments
+model:
+  _component_: torchtune.models.llama3_3.llama3_3_70b
+
+checkpointer:
+  _component_: torchtune.training.FullModelHFCheckpointer
+  checkpoint_dir: /tmp/Llama-3.3-70B-Instruct/
+  checkpoint_files: [
+    model-00001-of-00030.safetensors,
+    model-00002-of-00030.safetensors,
+    model-00003-of-00030.safetensors,
+    model-00004-of-00030.safetensors,
+    model-00005-of-00030.safetensors,
+    model-00006-of-00030.safetensors,
+    model-00007-of-00030.safetensors,
+    model-00008-of-00030.safetensors,
+    model-00009-of-00030.safetensors,
+    model-00010-of-00030.safetensors,
+    model-00011-of-00030.safetensors,
+    model-00012-of-00030.safetensors,
+    model-00013-of-00030.safetensors,
+    model-00014-of-00030.safetensors,
+    model-00015-of-00030.safetensors,
+    model-00016-of-00030.safetensors,
+    model-00017-of-00030.safetensors,
+    model-00018-of-00030.safetensors,
+    model-00019-of-00030.safetensors,
+    model-00020-of-00030.safetensors,
+    model-00021-of-00030.safetensors,
+    model-00022-of-00030.safetensors,
+    model-00023-of-00030.safetensors,
+    model-00024-of-00030.safetensors,
+    model-00025-of-00030.safetensors,
+    model-00026-of-00030.safetensors,
+    model-00027-of-00030.safetensors,
+    model-00028-of-00030.safetensors,
+    model-00029-of-00030.safetensors,
+    model-00030-of-00030.safetensors,
+  ]
+  recipe_checkpoint: null
+  output_dir: /tmp/Llama-3.3-70B-Instruct/
+  model_type: LLAMA3
+resume_from_checkpoint: False
+
+# Fine-tuning arguments
+batch_size: 2
+epochs: 1
+
+optimizer:
+  _component_: torch.optim.AdamW
+  lr: 2e-5
+  # Note: highly recommended to use fused=True optimizer flag
+  # with CPU offload for faster optimizer step.
+  fused: True
+
+loss:
+  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+max_steps_per_epoch: null
+gradient_accumulation_steps: 1  # Use to increase virtual batch size
+
+# Training env
+device: cuda
+
+# Memory management
+enable_activation_checkpointing: True  # True reduces memory
+enable_activation_offloading: False  # True reduces memory
+custom_sharded_layers: ['tok_embeddings', 'output']  # Layers to shard separately (useful for large vocab size models). Lower Memory, but lower speed.
+fsdp_cpu_offload: True
+compile: False  # pytorch compile, set to true for better perf/memory
+optimizer_in_bwd: False  # True saves memory. Requires gradient_accumulation_steps=1
+
+# Reduced precision
+dtype: bf16
+
+# Logging
+metric_logger:
+  _component_: torchtune.training.metric_logging.DiskLogger
+  log_dir: ${output_dir}
+output_dir: /tmp/full-llama3_3-finetune
+log_every_n_steps: 1
+log_peak_memory_stats: True
+
+# Profiler (disabled)
+profiler:
+  _component_: torchtune.training.setup_torch_profiler
+  enabled: False
+
+  #Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  #`torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  #trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 3
+  active_steps: 2
+  num_cycles: 1
diff --git a/recipes/configs/llama3_3/70B_lora.yaml b/recipes/configs/llama3_3/70B_lora.yaml
new file mode 100644
index 0000000000..901c700c22
--- /dev/null
+++ b/recipes/configs/llama3_3/70B_lora.yaml
@@ -0,0 +1,132 @@
+# Config for multi-device LoRA in lora_finetune_distributed.py
+# using a Llama3.3 70B model
+#
+# This config assumes that you've run the following command before launching
+# this run:
+#   tune download meta-llama/Llama-3.3-70B-Instruct --ignore-patterns "original/consolidated*"
+#
+# This config needs 8 GPUs to run
+#   tune run --nproc_per_node 8 lora_finetune_distributed --config llama3_3/70B_lora
+
+# Model Arguments
+model:
+  _component_: torchtune.models.llama3_3.lora_llama3_3_70b
+  lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
+  apply_lora_to_mlp: True
+  apply_lora_to_output: False
+  lora_rank: 16  # higher increases accuracy and memory
+  lora_alpha: 32  # usually alpha=2*rank
+  lora_dropout: 0.0
+
+tokenizer:
+  _component_: torchtune.models.llama3.llama3_tokenizer
+  path: /tmp/Llama-3.3-70B-Instruct/original/tokenizer.model
+  max_seq_len: null
+
+checkpointer:
+  _component_: torchtune.training.FullModelHFCheckpointer
+  checkpoint_dir: /tmp/Llama-3.3-70B-Instruct/
+  checkpoint_files: [
+    model-00001-of-00030.safetensors,
+    model-00002-of-00030.safetensors,
+    model-00003-of-00030.safetensors,
+    model-00004-of-00030.safetensors,
+    model-00005-of-00030.safetensors,
+    model-00006-of-00030.safetensors,
+    model-00007-of-00030.safetensors,
+    model-00008-of-00030.safetensors,
+    model-00009-of-00030.safetensors,
+    model-00010-of-00030.safetensors,
+    model-00011-of-00030.safetensors,
+    model-00012-of-00030.safetensors,
+    model-00013-of-00030.safetensors,
+    model-00014-of-00030.safetensors,
+    model-00015-of-00030.safetensors,
+    model-00016-of-00030.safetensors,
+    model-00017-of-00030.safetensors,
+    model-00018-of-00030.safetensors,
+    model-00019-of-00030.safetensors,
+    model-00020-of-00030.safetensors,
+    model-00021-of-00030.safetensors,
+    model-00022-of-00030.safetensors,
+    model-00023-of-00030.safetensors,
+    model-00024-of-00030.safetensors,
+    model-00025-of-00030.safetensors,
+    model-00026-of-00030.safetensors,
+    model-00027-of-00030.safetensors,
+    model-00028-of-00030.safetensors,
+    model-00029-of-00030.safetensors,
+    model-00030-of-00030.safetensors,
+  ]
+  recipe_checkpoint: null
+  output_dir: /tmp/Llama-3.3-70B-Instruct/
+  model_type: LLAMA3
+resume_from_checkpoint: False
+save_adapter_weights_only: True  # Set to false to save the whole model + adapter merged
+
+# Dataset and Sampler
+dataset:
+  _component_: torchtune.datasets.alpaca_dataset
+  packed: False  # True increases speed
+seed: null
+shuffle: True
+batch_size: 2
+
+# Optimizer and Scheduler
+optimizer:
+  _component_: torch.optim.AdamW
+  fused: True
+  weight_decay: 0.01
+  lr: 3e-4
+lr_scheduler:
+  _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup
+  num_warmup_steps: 100
+
+loss:
+  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+
+# Training
+epochs: 1
+max_steps_per_epoch: null
+gradient_accumulation_steps: 1  # Use to increase virtual batch size
+compile: False  # pytorch compile, set to true for better perf/memory
+
+# Logging
+output_dir: /tmp/lora-llama3_3-finetune-output
+metric_logger:
+  _component_: torchtune.training.metric_logging.DiskLogger
+  log_dir: ${output_dir}
+log_every_n_steps: 1
+log_peak_memory_stats: True
+
+# Environment
+device: cuda
+dtype: bf16
+enable_activation_checkpointing: True  # True reduces memory
+enable_activation_offloading: False  # True reduces memory
+# custom_sharded_layers: ['tok_embeddings', 'output']  # Layers to shard separately (useful for large vocab size models). Lower Memory, but lower speed.
+
+# Profiler (disabled)
+profiler:
+  _component_: torchtune.training.setup_torch_profiler
+  enabled: False
+
+  #Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  #`torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  #trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 3
+  active_steps: 2
+  num_cycles: 1
diff --git a/recipes/configs/llama3_3/70B_qlora.yaml b/recipes/configs/llama3_3/70B_qlora.yaml
new file mode 100644
index 0000000000..e25b196927
--- /dev/null
+++ b/recipes/configs/llama3_3/70B_qlora.yaml
@@ -0,0 +1,132 @@
+# Config for multi-device QLoRA in lora_finetune_distributed.py
+# using a Llama3.3 70B model
+#
+# This config assumes that you've run the following command before launching
+# this run:
+#   tune download meta-llama/Llama-3.3-70B-Instruct --ignore-patterns "original/consolidated*"
+#
+# This config needs 8 GPUs to run
+#   tune run --nproc_per_node 8 lora_finetune_distributed --config llama3_3/70B_qlora
+
+# Model Arguments
+model:
+  _component_: torchtune.models.llama3_3.qlora_llama3_3_70b
+  lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
+  apply_lora_to_mlp: True
+  apply_lora_to_output: False
+  lora_rank: 16  # higher increases accuracy and memory
+  lora_alpha: 32  # usually alpha=2*rank
+  lora_dropout: 0.0
+
+tokenizer:
+  _component_: torchtune.models.llama3.llama3_tokenizer
+  path: /tmp/Llama-3.3-70B-Instruct/original/tokenizer.model
+  max_seq_len: null
+
+checkpointer:
+  _component_: torchtune.training.FullModelHFCheckpointer
+  checkpoint_dir: /tmp/Llama-3.3-70B-Instruct/
+  checkpoint_files: [
+    model-00001-of-00030.safetensors,
+    model-00002-of-00030.safetensors,
+    model-00003-of-00030.safetensors,
+    model-00004-of-00030.safetensors,
+    model-00005-of-00030.safetensors,
+    model-00006-of-00030.safetensors,
+    model-00007-of-00030.safetensors,
+    model-00008-of-00030.safetensors,
+    model-00009-of-00030.safetensors,
+    model-00010-of-00030.safetensors,
+    model-00011-of-00030.safetensors,
+    model-00012-of-00030.safetensors,
+    model-00013-of-00030.safetensors,
+    model-00014-of-00030.safetensors,
+    model-00015-of-00030.safetensors,
+    model-00016-of-00030.safetensors,
+    model-00017-of-00030.safetensors,
+    model-00018-of-00030.safetensors,
+    model-00019-of-00030.safetensors,
+    model-00020-of-00030.safetensors,
+    model-00021-of-00030.safetensors,
+    model-00022-of-00030.safetensors,
+    model-00023-of-00030.safetensors,
+    model-00024-of-00030.safetensors,
+    model-00025-of-00030.safetensors,
+    model-00026-of-00030.safetensors,
+    model-00027-of-00030.safetensors,
+    model-00028-of-00030.safetensors,
+    model-00029-of-00030.safetensors,
+    model-00030-of-00030.safetensors,
+  ]
+  recipe_checkpoint: null
+  output_dir: /tmp/Llama-3.3-70B-Instruct/
+  model_type: LLAMA3
+resume_from_checkpoint: False
+save_adapter_weights_only: True  # Set to false to save the whole model + adapter merged
+
+# Dataset and Sampler
+dataset:
+  _component_: torchtune.datasets.alpaca_dataset
+  packed: False  # True increases speed
+seed: null
+shuffle: True
+batch_size: 2
+
+# Optimizer and Scheduler
+optimizer:
+  _component_: torch.optim.AdamW
+  fused: True
+  weight_decay: 0.01
+  lr: 3e-4
+lr_scheduler:
+  _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup
+  num_warmup_steps: 100
+
+loss:
+  _component_: torchtune.modules.loss.CEWithChunkedOutputLoss
+
+# Training
+epochs: 1
+max_steps_per_epoch: null
+gradient_accumulation_steps: 1  # Use to increase virtual batch size
+compile: False  # pytorch compile, set to true for better perf/memory
+
+# Logging
+output_dir: /tmp/lora-llama3_3-finetune-output
+metric_logger:
+  _component_: torchtune.training.metric_logging.DiskLogger
+  log_dir: ${output_dir}
+log_every_n_steps: 1
+log_peak_memory_stats: True
+
+# Environment
+device: cuda
+dtype: bf16
+enable_activation_checkpointing: True  # True reduces memory
+enable_activation_offloading: False  # True reduces memory
+# custom_sharded_layers: ['tok_embeddings', 'output']  # Layers to shard separately (useful for large vocab size models). Lower Memory, but lower speed.
+
+# Profiler (disabled)
+profiler:
+  _component_: torchtune.training.setup_torch_profiler
+  enabled: False
+
+  #Output directory of trace artifacts
+  output_dir: ${output_dir}/profiling_outputs
+
+  #`torch.profiler.ProfilerActivity` types to trace
+  cpu: True
+  cuda: True
+
+  #trace options passed to `torch.profiler.profile`
+  profile_memory: False
+  with_stack: False
+  record_shapes: True
+  with_flops: False
+
+  # `torch.profiler.schedule` options:
+  # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
+  wait_steps: 5
+  warmup_steps: 3
+  active_steps: 2
+  num_cycles: 1
diff --git a/torchtune/_recipe_registry.py b/torchtune/_recipe_registry.py
index 2a4bf25a8b..efa0175d23 100644
--- a/torchtune/_recipe_registry.py
+++ b/torchtune/_recipe_registry.py
@@ -106,6 +106,7 @@ class Recipe:
         Config(name="llama3_2/3B_full", file_path="llama3_2/3B_full.yaml"),
         Config(name="llama3/70B_full", file_path="llama3/70B_full.yaml"),
         Config(name="llama3_1/70B_full", file_path="llama3_1/70B_full.yaml"),
+        Config(name="llama3_3/70B_full", file_path="llama3_3/70B_full.yaml"),
         Config(name="mistral/7B_full", file_path="mistral/7B_full.yaml"),
         Config(name="gemma/2B_full", file_path="gemma/2B_full.yaml"),
         Config(name="gemma/7B_full", file_path="gemma/7B_full.yaml"),
@@ -353,6 +354,8 @@ class Recipe:
         Config(name="llama3/8B_dora", file_path="llama3/8B_dora.yaml"),
         Config(name="llama3/70B_lora", file_path="llama3/70B_lora.yaml"),
         Config(name="llama3_1/70B_lora", file_path="llama3_1/70B_lora.yaml"),
+        Config(name="llama3_3/70B_lora", file_path="llama3_3/70B_lora.yaml"),
+        Config(name="llama3_3/70B_qlora", file_path="llama3_3/70B_qlora.yaml"),
         Config(name="llama3/8B_lora", file_path="llama3/8B_lora.yaml"),
         Config(name="llama3_1/8B_lora", file_path="llama3_1/8B_lora.yaml"),
         Config(name="llama3_2/1B_lora", file_path="llama3_2/1B_lora.yaml"),
diff --git a/torchtune/models/llama3_3/__init__.py b/torchtune/models/llama3_3/__init__.py
new file mode 100644
index 0000000000..cd5ac4d306
--- /dev/null
+++ b/torchtune/models/llama3_3/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from ._model_builders import llama3_3_70b, lora_llama3_3_70b, qlora_llama3_3_70b  # noqa
+
+__all__ = [
+    "llama3_3_70b",
+    "lora_llama3_3_70b",
+    "qlora_llama3_3_70b",
+]
diff --git a/torchtune/models/llama3_3/_model_builders.py b/torchtune/models/llama3_3/_model_builders.py
new file mode 100644
index 0000000000..a55973e136
--- /dev/null
+++ b/torchtune/models/llama3_3/_model_builders.py
@@ -0,0 +1,37 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+from torchtune.models.llama3_1._model_builders import (
+    llama3_1_70b,
+    lora_llama3_1_70b,
+    qlora_llama3_1_70b,
+)
+
+"""
+Model builders build specific instantiations using component builders. The Llama3.3 model
+builders all call the Llama3.1 models as they're identical models apart from the checkpoints.
+"""
+
+llama3_3_70b = llama3_1_70b
+
+llama3_3_70b.__doc__ = """
+Builder for creating a Llama3.3 model initialized w/ the default 70B parameter values.
+Please see `llama3_1_70b` for full API arguments.
+"""
+
+lora_llama3_3_70b = lora_llama3_1_70b
+
+lora_llama3_3_70b.__doc__ = """
+Builder for creating a Llama3.3 70B model with LoRA enabled.
+Please see `lora_llama3_1_70b` for full API arguments.
+"""
+
+qlora_llama3_3_70b = qlora_llama3_1_70b
+
+qlora_llama3_3_70b.__doc__ = """
+Builder for creating a Llama3.3 70B model with QLoRA enabled. Base model weights in linear layers
+that LoRA is applied to are quantized per the QLoRA paper: https://arxiv.org/abs/2305.14314.
+Please see `lora_llama3_1_70b` for full API arguments.
+"""
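A minimal usage sketch (not part of the patch) of the new builders. It assumes the Llama3.3 builders behave exactly like the Llama3.1 functions they alias and that construction under PyTorch's "meta" device works as it does in torchtune's distributed recipes; the LoRA kwargs are copied from recipes/configs/llama3_3/70B_lora.yaml above.

    # usage_sketch.py -- illustrative only, not shipped with this change
    import torch

    from torchtune.models.llama3_3 import llama3_3_70b, lora_llama3_3_70b

    # Build on the meta device to inspect the architecture without
    # materializing 70B parameters in memory.
    with torch.device("meta"):
        base = llama3_3_70b()
        lora = lora_llama3_3_70b(
            lora_attn_modules=["q_proj", "v_proj", "output_proj"],
            apply_lora_to_mlp=True,
            apply_lora_to_output=False,
            lora_rank=16,
            lora_alpha=32,
            lora_dropout=0.0,
        )

    # Parameter counts are available even for meta tensors (shape-only).
    print(f"base params: {sum(p.numel() for p in base.parameters()):,}")
    print(f"LoRA-wrapped params: {sum(p.numel() for p in lora.parameters()):,}")

Because the 3.3 builders are plain aliases of the 3.1 functions, any argument documented for `lora_llama3_1_70b` applies unchanged; only the downloaded checkpoints differ.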