diff --git a/recipes/configs/qwen2_5/0_5B_full.yaml b/recipes/configs/qwen2_5/0.5B_full.yaml similarity index 64% rename from recipes/configs/qwen2_5/0_5B_full.yaml rename to recipes/configs/qwen2_5/0.5B_full.yaml index 93c94c666a..775e5db2d1 100644 --- a/recipes/configs/qwen2_5/0_5B_full.yaml +++ b/recipes/configs/qwen2_5/0.5B_full.yaml @@ -1,29 +1,39 @@ # Config for multi-device full finetuning in full_finetune_distributed.py # using a Qwen2.5 0.5B model # -# This config assumes that you've run the following command before launching -# this run: -# tune download Qwen/Qwen2.5-0.5B-Instruct --output-dir /tmp/Qwen2_5-0_5B-Instruct --ignore-patterns None +# This config assumes that you've run the following command before launching: +# tune download Qwen/Qwen2.5-0.5B-Instruct --ignore-patterns None # # To launch on 2 devices, run the following command from root: -# tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config qwen2_5/0_5B_full +# tune run --nproc_per_node 2 full_finetune_distributed --config qwen2_5/0.5B_full # # You can add specific overrides through the command line. For example -# to override the checkpointer directory while launching training -# you can run: -# tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config qwen2_5/0_5B_full checkpointer.checkpoint_dir= +# to override the checkpointer directory while launching training: +# tune run --nproc_per_node 2 full_finetune_distributed --config qwen2_5/0.5B_full checkpointer.checkpoint_dir= # -# This config works best when the model is being fine-tuned on 2+ GPUs. -# Single device full finetuning requires more memory optimizations. It's -# best to use 0_5B_full_single_device.yaml for those cases +# This config is for fine-tuning on 2+ GPUs. 
+ +# Model arguments +model: + _component_: torchtune.models.qwen2_5.qwen2_5_0_5b # Tokenizer tokenizer: _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer - path: /tmp/Qwen2_5-0_5B-Instruct/vocab.json - merges_file: /tmp/Qwen2_5-0_5B-Instruct/merges.txt + path: /tmp/Qwen2.5-0.5B-Instruct/vocab.json + merges_file: /tmp/Qwen2.5-0.5B-Instruct/merges.txt max_seq_len: null +# Checkpointer +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2.5-0.5B-Instruct + checkpoint_files: [model.safetensors] + recipe_checkpoint: null + output_dir: /tmp/Qwen2.5-0.5B-Instruct-finetune + model_type: QWEN2 +resume_from_checkpoint: False + # Dataset dataset: _component_: torchtune.datasets.alpaca_cleaned_dataset @@ -31,52 +41,35 @@ dataset: seed: null shuffle: True -# Model Arguments -model: - _component_: torchtune.models.qwen2_5.qwen2_5_0_5b - -checkpointer: - _component_: torchtune.training.FullModelHFCheckpointer - checkpoint_dir: /tmp/Qwen2_5-0_5B-Instruct - checkpoint_files: [ - model.safetensors - ] - recipe_checkpoint: null - output_dir: /tmp/Qwen2_5-0_5B-Instruct-finetune - model_type: QWEN2 -resume_from_checkpoint: False - # Fine-tuning arguments -batch_size: 2 epochs: 1 +max_steps_per_epoch: null +batch_size: 2 +gradient_accumulation_steps: 8 # Use to increase virtual batch size optimizer: _component_: torch.optim.AdamW fused: True lr: 2e-5 +optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1 loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss -max_steps_per_epoch: null -gradient_accumulation_steps: 8 # Use to increase virtual batch size -compile: False # pytorch compile, set to true for better perf/memory -optimizer_in_bwd: False # True saves memory. 
Requires gradient_accumulation_steps=1 # Training env device: cuda -# Memory management -enable_activation_checkpointing: True # True reduces memory +# Memory management / performance +enable_activation_checkpointing: False # True reduces memory enable_activation_offloading: False # True reduces memory - -# Reduced precision dtype: bf16 +compile: False # torch.compile the model + loss, True increases speed + decreases memory # Logging +output_dir: /tmp/Qwen2.5-0.5B-Instruct-finetune metric_logger: _component_: torchtune.training.metric_logging.DiskLogger - log_dir: ${output_dir} -output_dir: /tmp/Qwen2_5-0_5B-Instruct-finetune + log_dir: ${output_dir}/logs log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Profiler (disabled) profiler: diff --git a/recipes/configs/qwen2_5/0_5B_full_single_device.yaml b/recipes/configs/qwen2_5/0.5B_full_single_device.yaml similarity index 68% rename from recipes/configs/qwen2_5/0_5B_full_single_device.yaml rename to recipes/configs/qwen2_5/0.5B_full_single_device.yaml index 707cbaa0f2..f668f8416b 100644 --- a/recipes/configs/qwen2_5/0_5B_full_single_device.yaml +++ b/recipes/configs/qwen2_5/0.5B_full_single_device.yaml @@ -1,31 +1,39 @@ # Config for single device full finetuning in full_finetune_single_device.py # using a Qwen2.5 0.5B # -# This config assumes that you've run the following command before launching -# this run: -# tune download Qwen/Qwen2.5-0.5B-Instruct --output-dir /tmp/Qwen2_5-0_5B-Instruct --ignore-patterns None -# -# The default config uses an optimizer from bitsandbytes. 
If you do not have it installed, -# you can install it with -# pip install bitsandbytes +# This config assumes that you've run the following command before launching: +# tune download Qwen/Qwen2.5-0.5B-Instruct --ignore-patterns None # # To launch on a single device, run the following command from root: -# tune run full_finetune_single_device --config qwen2_5/0_5B_full_single_device +# tune run full_finetune_single_device --config qwen2_5/0.5B_full_single_device # # You can add specific overrides through the command line. For example -# to override the checkpointer directory while launching training -# you can run: -# tune run full_finetune_single_device --config qwen2_5/0_5B_full_single_device checkpointer.checkpoint_dir= +# to override the checkpointer directory while launching training: +# tune run full_finetune_single_device --config qwen2_5/0.5B_full_single_device checkpointer.checkpoint_dir= # # This config works only for training on single device. +# Model arguments +model: + _component_: torchtune.models.qwen2_5.qwen2_5_0_5b + # Tokenizer tokenizer: _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer - path: /tmp/Qwen2_5-0_5B-Instruct/vocab.json - merges_file: /tmp/Qwen2_5-0_5B-Instruct/merges.txt + path: /tmp/Qwen2.5-0.5B-Instruct/vocab.json + merges_file: /tmp/Qwen2.5-0.5B-Instruct/merges.txt max_seq_len: null +# Checkpointer +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2.5-0.5B-Instruct + checkpoint_files: [model.safetensors] + recipe_checkpoint: null + output_dir: /tmp/Qwen2.5-0.5B-Instruct-finetune + model_type: QWEN2 +resume_from_checkpoint: False + # Dataset dataset: _component_: torchtune.datasets.alpaca_cleaned_dataset @@ -33,54 +41,35 @@ dataset: seed: null shuffle: True -# Model Arguments -model: - _component_: torchtune.models.qwen2_5.qwen2_5_0_5b - -checkpointer: - _component_: torchtune.training.FullModelHFCheckpointer - checkpoint_dir: /tmp/Qwen2_5-0_5B-Instruct - checkpoint_files: [ 
- model.safetensors - ] - recipe_checkpoint: null - output_dir: /tmp/Qwen2_5-0_5B-Instruct-finetune - model_type: QWEN2 -resume_from_checkpoint: False - # Fine-tuning arguments -batch_size: 2 epochs: 1 +max_steps_per_epoch: null +batch_size: 2 +gradient_accumulation_steps: 8 # Use to increase virtual batch size optimizer: _component_: torch.optim.AdamW fused: True lr: 2e-5 - +optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1 loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss -optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1 -max_steps_per_epoch: null -gradient_accumulation_steps: 8 # Use to increase virtual batch size -compile: False # pytorch compile, set to true for better perf/memory - -# Training environment +# Training env device: cuda -# Memory management -enable_activation_checkpointing: True # True reduces memory +# Memory management / performance +enable_activation_checkpointing: False # True reduces memory enable_activation_offloading: False # True reduces memory - -# Reduced precision dtype: bf16 +compile: False # torch.compile the model + loss, True increases speed + decreases memory # Logging +output_dir: /tmp/Qwen2.5-0.5B-Instruct-finetune metric_logger: _component_: torchtune.training.metric_logging.DiskLogger - log_dir: ${output_dir} -output_dir: /tmp/Qwen2_5-0_5B-Instruct-finetune + log_dir: ${output_dir}/logs log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Profiler (disabled) profiler: diff --git a/recipes/configs/qwen2_5/0_5B_lora.yaml b/recipes/configs/qwen2_5/0.5B_lora.yaml similarity index 63% rename from recipes/configs/qwen2_5/0_5B_lora.yaml rename to recipes/configs/qwen2_5/0.5B_lora.yaml index 63ec87897c..3507542663 100644 --- a/recipes/configs/qwen2_5/0_5B_lora.yaml +++ b/recipes/configs/qwen2_5/0.5B_lora.yaml @@ -1,23 +1,19 @@ # Config for multi-device LoRA finetuning in lora_finetune_distributed.py # using a Qwen2.5 0.5B 
model # -# This config assumes that you've run the following command before launching -# this run: -# tune download Qwen/Qwen2.5-0.5B-Instruct --output-dir /tmp/Qwen2_5-0_5B-Instruct --ignore-patterns None +# This config assumes that you've run the following command before launching: +# tune download Qwen/Qwen2.5-0.5B-Instruct --ignore-patterns None # # To launch on 2 devices, run the following command from root: -# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/0_5B_lora +# tune run --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/0.5B_lora # # You can add specific overrides through the command line. For example -# to override the checkpointer directory while launching training -# you can run: -# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/0_5B_lora checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR> +# to override the checkpointer directory while launching training: +# tune run --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/0.5B_lora checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR> # -# This config works best when the model is being fine-tuned on 2+ GPUs. -# For single device LoRA finetuning please use 0_5B_lora_single_device.yaml +# This config is for fine-tuning on 2+ GPUs. 
- -# Model Arguments +# Model arguments model: _component_: torchtune.models.qwen2_5.lora_qwen2_5_0_5b lora_attn_modules: ['q_proj', 'v_proj', 'output_proj'] @@ -26,71 +22,66 @@ model: lora_alpha: 64 # usually alpha=2*rank lora_dropout: 0.0 +# Tokenizer tokenizer: _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer - path: /tmp/Qwen2_5-0_5B-Instruct/vocab.json - merges_file: /tmp/Qwen2_5-0_5B-Instruct/merges.txt + path: /tmp/Qwen2.5-0.5B-Instruct/vocab.json + merges_file: /tmp/Qwen2.5-0.5B-Instruct/merges.txt max_seq_len: null +# Checkpointer checkpointer: _component_: torchtune.training.FullModelHFCheckpointer - checkpoint_dir: /tmp/Qwen2_5-0_5B-Instruct - checkpoint_files: [ - model.safetensors - ] + checkpoint_dir: /tmp/Qwen2.5-0.5B-Instruct + checkpoint_files: [model.safetensors] recipe_checkpoint: null - output_dir: /tmp/Qwen2_5-0_5B-Instruct-lora-finetune + output_dir: /tmp/Qwen2.5-0.5B-Instruct-lora-finetune model_type: QWEN2 resume_from_checkpoint: False -# Dataset and Sampler +# Dataset dataset: _component_: torchtune.datasets.alpaca_cleaned_dataset packed: False # True increases speed - seed: null shuffle: True -batch_size: 4 -# Optimizer and Scheduler +# Fine-tuning arguments +epochs: 1 +max_steps_per_epoch: null +batch_size: 2 +gradient_accumulation_steps: 8 # Use to increase virtual batch size optimizer: _component_: torch.optim.AdamW fused: True weight_decay: 0.01 lr: 2e-3 - lr_scheduler: _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 - loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss -# Training -epochs: 1 -max_steps_per_epoch: null -gradient_accumulation_steps: 8 # Use to increase virtual batch size -compile: False # pytorch compile, set to true for better perf/memory +# Training env +device: cuda + +# Memory management / performance +enable_activation_checkpointing: False # True reduces memory +enable_activation_offloading: False # True reduces memory +dtype: bf16 +compile: 
False # torch.compile the model + loss, True increases speed + decreases memory # Logging -output_dir: /tmp/Qwen2_5-0_5B-Instruct-lora-finetune +output_dir: /tmp/Qwen2.5-0.5B-Instruct-lora-finetune metric_logger: _component_: torchtune.training.metric_logging.DiskLogger - log_dir: ${output_dir} + log_dir: ${output_dir}/logs log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True -# Environment -device: cuda -dtype: bf16 -enable_activation_checkpointing: True # True reduces memory -enable_activation_offloading: False # True reduces memory - -# Show case the usage of pytorch profiler -# Set enabled to False as it's only needed for debugging training +# Profiler (disabled) profiler: _component_: torchtune.training.setup_torch_profiler - enabled: False #Output directory of trace artifacts @@ -109,6 +100,6 @@ profiler: # `torch.profiler.schedule` options: # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat wait_steps: 5 - warmup_steps: 5 + warmup_steps: 3 active_steps: 2 num_cycles: 1 diff --git a/recipes/configs/qwen2_5/0_5B_lora_single_device.yaml b/recipes/configs/qwen2_5/0.5B_lora_single_device.yaml similarity index 69% rename from recipes/configs/qwen2_5/0_5B_lora_single_device.yaml rename to recipes/configs/qwen2_5/0.5B_lora_single_device.yaml index e11e34bcb7..8f4e309279 100644 --- a/recipes/configs/qwen2_5/0_5B_lora_single_device.yaml +++ b/recipes/configs/qwen2_5/0.5B_lora_single_device.yaml @@ -2,21 +2,18 @@ # using a Qwen2.5 0.5B model # # This config assumes that you've run the following command before launching -# this run: -# tune download Qwen/Qwen2.5-0.5B-Instruct --output-dir /tmp/Qwen2_5-0_5B-Instruct --ignore-patterns None +# tune download Qwen/Qwen2.5-0.5B-Instruct --ignore-patterns None # # To launch on a single device, run the following command from root: -# tune run lora_finetune_single_device --config qwen2_5/0_5B_lora_single_device +# tune run lora_finetune_single_device --config 
qwen2_5/0.5B_lora_single_device # # You can add specific overrides through the command line. For example -# to override the checkpointer directory while launching training -# you can run: -# tune run lora_finetune_single_device --config qwen2_5/0_5B_lora_single_device checkpointer.checkpoint_dir= +# to override the checkpointer directory while launching training: +# tune run lora_finetune_single_device --config qwen2_5/0.5B_lora_single_device checkpointer.checkpoint_dir= # # This config works only for training on single device. - -# Model Arguments +# Model arguments model: _component_: torchtune.models.qwen2_5.lora_qwen2_5_0_5b lora_attn_modules: ['q_proj', 'v_proj', 'output_proj'] @@ -25,69 +22,64 @@ model: lora_alpha: 64 # usually alpha=2*rank lora_dropout: 0.0 +# Tokenizer tokenizer: _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer - path: /tmp/Qwen2_5-0_5B-Instruct/vocab.json - merges_file: /tmp/Qwen2_5-0_5B-Instruct/merges.txt + path: /tmp/Qwen2.5-0.5B-Instruct/vocab.json + merges_file: /tmp/Qwen2.5-0.5B-Instruct/merges.txt max_seq_len: null +# Checkpointer checkpointer: _component_: torchtune.training.FullModelHFCheckpointer - checkpoint_dir: /tmp/Qwen2_5-0_5B-Instruct - checkpoint_files: [ - model.safetensors - ] + checkpoint_dir: /tmp/Qwen2.5-0.5B-Instruct + checkpoint_files: [model.safetensors] recipe_checkpoint: null - output_dir: /tmp/Qwen2_5-0_5B-Instruct-lora-finetune + output_dir: /tmp/Qwen2.5-0.5B-Instruct-lora-finetune model_type: QWEN2 resume_from_checkpoint: False -# Dataset and Sampler +# Dataset dataset: _component_: torchtune.datasets.alpaca_cleaned_dataset packed: False # True increases speed seed: null shuffle: True -batch_size: 4 -# Optimizer and Scheduler +# Fine-tuning arguments +epochs: 1 +max_steps_per_epoch: null +batch_size: 2 +gradient_accumulation_steps: 8 # Use to increase virtual batch size optimizer: _component_: torch.optim.AdamW fused: True weight_decay: 0.01 lr: 2e-3 - lr_scheduler: _component_: 
torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 - loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss -# Training -epochs: 1 -max_steps_per_epoch: null -gradient_accumulation_steps: 8 # Use to increase virtual batch size -compile: False # pytorch compile, set to true for better perf/memory +# Training env +device: cuda + +# Memory management / performance +enable_activation_checkpointing: False # True reduces memory +enable_activation_offloading: False # True reduces memory +dtype: bf16 +compile: False # torch.compile the model + loss, True increases speed + decreases memory # Logging -output_dir: /tmp/Qwen2_5-0_5B-Instruct-lora-finetune +output_dir: /tmp/Qwen2.5-0.5B-Instruct-lora-finetune metric_logger: _component_: torchtune.training.metric_logging.DiskLogger - log_dir: ${output_dir} + log_dir: ${output_dir}/logs log_every_n_steps: 1 -log_peak_memory_stats: False - -# Environment -device: cuda -dtype: bf16 - -# Activations Offloading -enable_activation_checkpointing: True # True reduces memory -enable_activation_offloading: False # True reduces memory +log_peak_memory_stats: True -# Show case the usage of pytorch profiler -# Set enabled to False as it's only needed for debugging training +# Profiler (disabled) profiler: _component_: torchtune.training.setup_torch_profiler enabled: False @@ -108,6 +100,6 @@ profiler: # `torch.profiler.schedule` options: # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat wait_steps: 5 - warmup_steps: 5 + warmup_steps: 3 active_steps: 2 num_cycles: 1 diff --git a/recipes/configs/qwen2_5/1_5B_full.yaml b/recipes/configs/qwen2_5/1.5B_full.yaml similarity index 64% rename from recipes/configs/qwen2_5/1_5B_full.yaml rename to recipes/configs/qwen2_5/1.5B_full.yaml index 13999e478d..8eb535df2c 100644 --- a/recipes/configs/qwen2_5/1_5B_full.yaml +++ b/recipes/configs/qwen2_5/1.5B_full.yaml @@ -1,29 +1,39 @@ # Config for multi-device full 
finetuning in full_finetune_distributed.py # using a Qwen2.5 1.5B model # -# This config assumes that you've run the following command before launching -# this run: -# tune download Qwen/Qwen2.5-1.5B-Instruct --output-dir /tmp/Qwen2_5-1_5B-Instruct --ignore-patterns None +# This config assumes that you've run the following command before launching: +# tune download Qwen/Qwen2.5-1.5B-Instruct --ignore-patterns None # # To launch on 2 devices, run the following command from root: -# tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config qwen2_5/1_5B_full +# tune run --nproc_per_node 2 full_finetune_distributed --config qwen2_5/1.5B_full # # You can add specific overrides through the command line. For example -# to override the checkpointer directory while launching training -# you can run: -# tune run --nnodes 1 --nproc_per_node 2 full_finetune_distributed --config qwen2_5/1_5B_full checkpointer.checkpoint_dir= +# to override the checkpointer directory while launching training: +# tune run --nproc_per_node 2 full_finetune_distributed --config qwen2_5/1.5B_full checkpointer.checkpoint_dir= # -# This config works best when the model is being fine-tuned on 2+ GPUs. -# Single device full finetuning requires more memory optimizations. It's -# best to use 1_5B_full_single_device.yaml for those cases +# This config is for fine-tuning on 2+ GPUs. 
+ +# Model arguments +model: + _component_: torchtune.models.qwen2_5.qwen2_5_1_5b_instruct # Tokenizer tokenizer: _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer - path: /tmp/Qwen2_5-1_5B-Instruct/vocab.json - merges_file: /tmp/Qwen2_5-1_5B-Instruct/merges.txt + path: /tmp/Qwen2.5-1.5B-Instruct/vocab.json + merges_file: /tmp/Qwen2.5-1.5B-Instruct/merges.txt max_seq_len: null +# Checkpointer +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2.5-1.5B-Instruct + checkpoint_files: [model.safetensors] + recipe_checkpoint: null + output_dir: /tmp/Qwen2.5-1.5B-Instruct-finetune + model_type: QWEN2 +resume_from_checkpoint: False + # Dataset dataset: _component_: torchtune.datasets.alpaca_cleaned_dataset @@ -31,52 +41,35 @@ dataset: seed: null shuffle: True -# Model Arguments -model: - _component_: torchtune.models.qwen2_5.qwen2_5_1_5b_instruct - -checkpointer: - _component_: torchtune.training.FullModelHFCheckpointer - checkpoint_dir: /tmp/Qwen2_5-1_5B-Instruct - checkpoint_files: [ - model.safetensors - ] - recipe_checkpoint: null - output_dir: /tmp/Qwen2_5-1_5B-Instruct-finetune - model_type: QWEN2 -resume_from_checkpoint: False - # Fine-tuning arguments -batch_size: 2 epochs: 1 +max_steps_per_epoch: null +batch_size: 2 +gradient_accumulation_steps: 1 # Use to increase virtual batch size optimizer: _component_: torch.optim.AdamW fused: True lr: 2e-5 +optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1 loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss -max_steps_per_epoch: null -gradient_accumulation_steps: 1 # Use to increase virtual batch size -compile: False # pytorch compile, set to true for better perf/memory -optimizer_in_bwd: False # True saves memory. 
Requires gradient_accumulation_steps=1 # Training env device: cuda -# Memory management -enable_activation_checkpointing: False # True reduces memory +# Memory management / performance +enable_activation_checkpointing: True # True reduces memory enable_activation_offloading: False # True reduces memory - -# Reduced precision dtype: bf16 +compile: False # torch.compile the model + loss, True increases speed + decreases memory # Logging +output_dir: /tmp/Qwen2.5-1.5B-Instruct-finetune metric_logger: _component_: torchtune.training.metric_logging.DiskLogger - log_dir: ${output_dir} -output_dir: /tmp/Qwen2_5-1_5B-Instruct-finetune + log_dir: ${output_dir}/logs log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Profiler (disabled) profiler: diff --git a/recipes/configs/qwen2_5/1_5B_full_single_device.yaml b/recipes/configs/qwen2_5/1.5B_full_single_device.yaml similarity index 73% rename from recipes/configs/qwen2_5/1_5B_full_single_device.yaml rename to recipes/configs/qwen2_5/1.5B_full_single_device.yaml index 9d23055ab5..b0e860548d 100644 --- a/recipes/configs/qwen2_5/1_5B_full_single_device.yaml +++ b/recipes/configs/qwen2_5/1.5B_full_single_device.yaml @@ -1,31 +1,43 @@ # Config for single device full finetuning in full_finetune_single_device.py # using a Qwen2.5 1.5B # -# This config assumes that you've run the following command before launching -# this run: -# tune download Qwen/Qwen2.5-1.5B-Instruct --output-dir /tmp/Qwen2_5-1_5B-Instruct --ignore-patterns None +# This config assumes that you've run the following command before launching: +# tune download Qwen/Qwen2.5-1.5B-Instruct --ignore-patterns None # # The default config uses an optimizer from bitsandbytes. 
If you do not have it installed, -# you can install it with +# you can install it with: # pip install bitsandbytes # # To launch on a single device, run the following command from root: -# tune run full_finetune_single_device --config qwen2_5/1_5B_full_single_device +# tune run full_finetune_single_device --config qwen2_5/1.5B_full_single_device # # You can add specific overrides through the command line. For example -# to override the checkpointer directory while launching training -# you can run: -# tune run full_finetune_single_device --config qwen2_5/1_5B_full_single_device checkpointer.checkpoint_dir= +# to override the checkpointer directory while launching training: +# tune run full_finetune_single_device --config qwen2_5/1.5B_full_single_device checkpointer.checkpoint_dir= # # This config works only for training on single device. +# Model arguments +model: + _component_: torchtune.models.qwen2_5.qwen2_5_1_5b_instruct + # Tokenizer tokenizer: _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer - path: /tmp/Qwen2_5-1_5B-Instruct/vocab.json - merges_file: /tmp/Qwen2_5-1_5B-Instruct/merges.txt + path: /tmp/Qwen2.5-1.5B-Instruct/vocab.json + merges_file: /tmp/Qwen2.5-1.5B-Instruct/merges.txt max_seq_len: null +# Checkpointer +checkpointer: + _component_: torchtune.training.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2.5-1.5B-Instruct + checkpoint_files: [model.safetensors] + recipe_checkpoint: null + output_dir: /tmp/Qwen2.5-1.5B-Instruct-finetune + model_type: QWEN2 +resume_from_checkpoint: False + # Dataset dataset: _component_: torchtune.datasets.alpaca_cleaned_dataset @@ -33,54 +45,34 @@ dataset: seed: null shuffle: True -# Model Arguments -model: - _component_: torchtune.models.qwen2_5.qwen2_5_1_5b_instruct - -checkpointer: - _component_: torchtune.training.FullModelHFCheckpointer - checkpoint_dir: /tmp/Qwen2_5-1_5B-Instruct - checkpoint_files: [ - model.safetensors - ] - recipe_checkpoint: null - output_dir: /tmp/Qwen2_5-1_5B-Instruct-finetune - 
model_type: QWEN2 -resume_from_checkpoint: False - # Fine-tuning arguments -batch_size: 2 epochs: 1 +max_steps_per_epoch: null +batch_size: 4 +gradient_accumulation_steps: 1 # Use to increase virtual batch size optimizer: _component_: bitsandbytes.optim.PagedAdamW lr: 2e-5 - optimizer_in_bwd: True # True saves memory. Requires gradient_accumulation_steps=1 - loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss -max_steps_per_epoch: null -gradient_accumulation_steps: 1 # Use to increase virtual batch size -compile: False # pytorch compile, set to true for better perf/memory - -# Training environment +# Training env device: cuda -# Memory management +# Memory management / performance enable_activation_checkpointing: True # True reduces memory enable_activation_offloading: False # True reduces memory - -# Reduced precision dtype: bf16 +compile: False # torch.compile the model + loss, True increases speed + decreases memory # Logging +output_dir: /tmp/Qwen2.5-1.5B-Instruct-finetune metric_logger: _component_: torchtune.training.metric_logging.DiskLogger - log_dir: ${output_dir} -output_dir: /tmp/Qwen2_5-1_5B-Instruct-finetune + log_dir: ${output_dir}/logs log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True # Profiler (disabled) profiler: diff --git a/recipes/configs/qwen2_5/1_5B_lora.yaml b/recipes/configs/qwen2_5/1.5B_lora.yaml similarity index 63% rename from recipes/configs/qwen2_5/1_5B_lora.yaml rename to recipes/configs/qwen2_5/1.5B_lora.yaml index d47835d0b5..c44bea6124 100644 --- a/recipes/configs/qwen2_5/1_5B_lora.yaml +++ b/recipes/configs/qwen2_5/1.5B_lora.yaml @@ -1,23 +1,19 @@ # Config for multi-device LoRA finetuning in lora_finetune_distributed.py # using a Qwen2.5 1.5B model # -# This config assumes that you've run the following command before launching -# this run: -# tune download Qwen/Qwen2.5-1.5B-Instruct --output-dir /tmp/Qwen2_5-1_5B-Instruct --ignore-patterns None +# This config assumes that you've run the 
following command before launching: +# tune download Qwen/Qwen2.5-1.5B-Instruct --ignore-patterns None # # To launch on 2 devices, run the following command from root: -# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/1_5B_lora +# tune run --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/1.5B_lora # # You can add specific overrides through the command line. For example -# to override the checkpointer directory while launching training -# you can run: -# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/1_5B_lora checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR> +# to override the checkpointer directory while launching training: +# tune run --nproc_per_node 2 lora_finetune_distributed --config qwen2_5/1.5B_lora checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR> # -# This config works best when the model is being fine-tuned on 2+ GPUs. -# For single device LoRA finetuning please use 1_5B_lora_single_device.yaml +# This config is for fine-tuning on 2+ GPUs. 
- -# Model Arguments +# Model arguments model: _component_: torchtune.models.qwen2_5.lora_qwen2_5_1_5b_instruct lora_attn_modules: ['q_proj', 'v_proj', 'output_proj'] @@ -26,69 +22,65 @@ model: lora_alpha: 64 # usually alpha=2*rank lora_dropout: 0.0 +# Tokenizer tokenizer: _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer - path: /tmp/Qwen2_5-1_5B-Instruct/vocab.json - merges_file: /tmp/Qwen2_5-1_5B-Instruct/merges.txt + path: /tmp/Qwen2.5-1.5B-Instruct/vocab.json + merges_file: /tmp/Qwen2.5-1.5B-Instruct/merges.txt max_seq_len: null +# Checkpointer checkpointer: _component_: torchtune.training.FullModelHFCheckpointer - checkpoint_dir: /tmp/Qwen2_5-1_5B-Instruct - checkpoint_files: [ - model.safetensors - ] + checkpoint_dir: /tmp/Qwen2.5-1.5B-Instruct + checkpoint_files: [model.safetensors] recipe_checkpoint: null - output_dir: /tmp/Qwen2_5-1_5B-Instruct-lora-finetune + output_dir: /tmp/Qwen2.5-1.5B-Instruct-lora-finetune model_type: QWEN2 resume_from_checkpoint: False -# Dataset and Sampler +# Dataset dataset: _component_: torchtune.datasets.alpaca_cleaned_dataset packed: False # True increases speed seed: null shuffle: True -batch_size: 2 -# Optimizer and Scheduler +# Fine-tuning arguments +epochs: 1 +max_steps_per_epoch: null +batch_size: 2 +gradient_accumulation_steps: 8 # Use to increase virtual batch size optimizer: _component_: torch.optim.AdamW fused: True lr: 2e-5 - lr_scheduler: _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 - loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss -# Training -epochs: 1 -max_steps_per_epoch: null -gradient_accumulation_steps: 8 # Use to increase virtual batch size -compile: False # pytorch compile, set to true for better perf/memory +# Training env +device: cuda + +# Memory management / performance +enable_activation_checkpointing: False # True reduces memory +enable_activation_offloading: False # True reduces memory +dtype: bf16 +compile: False # 
torch.compile the model + loss, True increases speed + decreases memory # Logging -output_dir: /tmp/Qwen2_5-1_5B-Instruct-lora-finetune +output_dir: /tmp/Qwen2.5-1.5B-Instruct-lora-finetune metric_logger: _component_: torchtune.training.metric_logging.DiskLogger - log_dir: ${output_dir} + log_dir: ${output_dir}/logs log_every_n_steps: 1 -log_peak_memory_stats: False +log_peak_memory_stats: True -# Environment -device: cuda -dtype: bf16 -enable_activation_checkpointing: True # True reduces memory -enable_activation_offloading: False # True reduces memory - -# Show case the usage of pytorch profiler -# Set enabled to False as it's only needed for debugging training +# Profiler (disabled) profiler: _component_: torchtune.training.setup_torch_profiler - enabled: False #Output directory of trace artifacts @@ -107,6 +99,6 @@ profiler: # `torch.profiler.schedule` options: # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat wait_steps: 5 - warmup_steps: 5 + warmup_steps: 3 active_steps: 2 num_cycles: 1 diff --git a/recipes/configs/qwen2_5/1_5B_lora_single_device.yaml b/recipes/configs/qwen2_5/1.5B_lora_single_device.yaml similarity index 69% rename from recipes/configs/qwen2_5/1_5B_lora_single_device.yaml rename to recipes/configs/qwen2_5/1.5B_lora_single_device.yaml index e9583ea62a..168dea810c 100644 --- a/recipes/configs/qwen2_5/1_5B_lora_single_device.yaml +++ b/recipes/configs/qwen2_5/1.5B_lora_single_device.yaml @@ -1,22 +1,19 @@ # Config for single device LoRA finetuning in lora_finetune_single_device.py # using a Qwen2.5 1.5B model # -# This config assumes that you've run the following command before launching -# this run: -# tune download Qwen/Qwen2.5-1.5B-Instruct --output-dir /tmp/Qwen2_5-1_5B-Instruct --ignore-patterns None +# This config assumes that you've run the following command before launching: +# tune download Qwen/Qwen2.5-1.5B-Instruct --ignore-patterns None # # To launch on a single device, run the following 
command from root: -# tune run lora_finetune_single_device --config qwen2_5/1_5B_lora_single_device +# tune run lora_finetune_single_device --config qwen2_5/1.5B_lora_single_device # # You can add specific overrides through the command line. For example -# to override the checkpointer directory while launching training -# you can run: -# tune run lora_finetune_single_device --config qwen2_5/1_5B_lora_single_device checkpointer.checkpoint_dir= +# to override the checkpointer directory while launching training: +# tune run lora_finetune_single_device --config qwen2_5/1.5B_lora_single_device checkpointer.checkpoint_dir= # # This config works only for training on single device. - -# Model Arguments +# Model arguments model: _component_: torchtune.models.qwen2_5.lora_qwen2_5_1_5b_instruct lora_attn_modules: ['q_proj', 'v_proj', 'output_proj'] @@ -25,68 +22,63 @@ model: lora_alpha: 64 # usually alpha=2*rank lora_dropout: 0.0 +# Tokenizer tokenizer: _component_: torchtune.models.qwen2_5.qwen2_5_tokenizer - path: /tmp/Qwen2_5-1_5B-Instruct/vocab.json - merges_file: /tmp/Qwen2_5-1_5B-Instruct/merges.txt + path: /tmp/Qwen2.5-1.5B-Instruct/vocab.json + merges_file: /tmp/Qwen2.5-1.5B-Instruct/merges.txt max_seq_len: null +# Checkpointer checkpointer: _component_: torchtune.training.FullModelHFCheckpointer - checkpoint_dir: /tmp/Qwen2_5-1_5B-Instruct - checkpoint_files: [ - model.safetensors - ] + checkpoint_dir: /tmp/Qwen2.5-1.5B-Instruct + checkpoint_files: [model.safetensors] recipe_checkpoint: null - output_dir: /tmp/Qwen2_5-1_5B-Instruct-lora-finetune + output_dir: /tmp/Qwen2.5-1.5B-Instruct-lora-finetune model_type: QWEN2 resume_from_checkpoint: False -# Dataset and Sampler +# Dataset dataset: _component_: torchtune.datasets.alpaca_cleaned_dataset packed: False # True increases speed seed: null shuffle: True -batch_size: 2 -# Optimizer and Scheduler +# Fine-tuning arguments +epochs: 1 +max_steps_per_epoch: null +batch_size: 2 +gradient_accumulation_steps: 8 # Use to 
increase virtual batch size optimizer: _component_: torch.optim.AdamW fused: True lr: 2e-3 - lr_scheduler: _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup num_warmup_steps: 100 - loss: _component_: torchtune.modules.loss.CEWithChunkedOutputLoss -# Training -epochs: 1 -max_steps_per_epoch: null -gradient_accumulation_steps: 8 # Use to increase virtual batch size -compile: False # pytorch compile, set to true for better perf/memory +# Training env +device: cuda + +# Memory management / performance +enable_activation_checkpointing: False # True reduces memory +enable_activation_offloading: False # True reduces memory +dtype: bf16 +compile: False # torch.compile the model + loss, True increases speed + decreases memory # Logging -output_dir: /tmp/Qwen2_5-1_5B-Instruct-lora-finetune +output_dir: /tmp/Qwen2.5-1.5B-Instruct-lora-finetune metric_logger: _component_: torchtune.training.metric_logging.DiskLogger - log_dir: ${output_dir} + log_dir: ${output_dir}/logs log_every_n_steps: 1 -log_peak_memory_stats: False - -# Environment -device: cuda -dtype: bf16 - -# Activations Offloading -enable_activation_checkpointing: True # True reduces memory -enable_activation_offloading: False # True reduces memory +log_peak_memory_stats: True -# Show case the usage of pytorch profiler -# Set enabled to False as it's only needed for debugging training +# Profiler (disabled) profiler: _component_: torchtune.training.setup_torch_profiler enabled: False @@ -107,6 +99,6 @@ profiler: # `torch.profiler.schedule` options: # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat wait_steps: 5 - warmup_steps: 5 + warmup_steps: 3 active_steps: 2 num_cycles: 1 diff --git a/torchtune/_recipe_registry.py b/torchtune/_recipe_registry.py index c40e89184b..5bbf860482 100644 --- a/torchtune/_recipe_registry.py +++ b/torchtune/_recipe_registry.py @@ -72,12 +72,12 @@ class Recipe: file_path="qwen2/1.5B_full_single_device.yaml", ), Config( - 
name="qwen2_5/0_5B_full_single_device", - file_path="qwen2_5/0_5B_full_single_device.yaml", + name="qwen2_5/0.5B_full_single_device", + file_path="qwen2_5/0.5B_full_single_device.yaml", ), Config( - name="qwen2_5/1_5B_full_single_device", - file_path="qwen2_5/1_5B_full_single_device.yaml", + name="qwen2_5/1.5B_full_single_device", + file_path="qwen2_5/1.5B_full_single_device.yaml", ), Config( name="qwen2_5/3B_full_single_device", @@ -116,8 +116,8 @@ class Recipe: Config(name="qwen2/7B_full", file_path="qwen2/7B_full.yaml"), Config(name="qwen2/0.5B_full", file_path="qwen2/0.5B_full.yaml"), Config(name="qwen2/1.5B_full", file_path="qwen2/1.5B_full.yaml"), - Config(name="qwen2_5/0_5B_full", file_path="qwen2_5/0_5B_full.yaml"), - Config(name="qwen2_5/1_5B_full", file_path="qwen2_5/1_5B_full.yaml"), + Config(name="qwen2_5/0.5B_full", file_path="qwen2_5/0.5B_full.yaml"), + Config(name="qwen2_5/1.5B_full", file_path="qwen2_5/1.5B_full.yaml"), Config(name="qwen2_5/3B_full", file_path="qwen2_5/3B_full.yaml"), Config(name="qwen2_5/7B_full", file_path="qwen2_5/7B_full.yaml"), Config( @@ -264,12 +264,12 @@ class Recipe: file_path="qwen2/1.5B_lora_single_device.yaml", ), Config( - name="qwen2_5/0_5B_lora_single_device", - file_path="qwen2_5/0_5B_lora_single_device.yaml", + name="qwen2_5/0.5B_lora_single_device", + file_path="qwen2_5/0.5B_lora_single_device.yaml", ), Config( - name="qwen2_5/1_5B_lora_single_device", - file_path="qwen2_5/1_5B_lora_single_device.yaml", + name="qwen2_5/1.5B_lora_single_device", + file_path="qwen2_5/1.5B_lora_single_device.yaml", ), Config( name="qwen2_5/3B_lora_single_device", @@ -363,8 +363,8 @@ class Recipe: Config(name="qwen2/7B_lora", file_path="qwen2/7B_lora.yaml"), Config(name="qwen2/0.5B_lora", file_path="qwen2/0.5B_lora.yaml"), Config(name="qwen2/1.5B_lora", file_path="qwen2/1.5B_lora.yaml"), - Config(name="qwen2_5/0_5B_lora", file_path="qwen2_5/0_5B_lora.yaml"), - Config(name="qwen2_5/1_5B_lora", file_path="qwen2_5/1_5B_lora.yaml"), + 
Config(name="qwen2_5/0.5B_lora", file_path="qwen2_5/0.5B_lora.yaml"), + Config(name="qwen2_5/1.5B_lora", file_path="qwen2_5/1.5B_lora.yaml"), Config(name="qwen2_5/3B_lora", file_path="qwen2_5/3B_lora.yaml"), Config(name="qwen2_5/7B_lora", file_path="qwen2_5/7B_lora.yaml"), Config(name="qwen2_5/32B_lora", file_path="qwen2_5/32B_lora.yaml"),