update configs #1954

Merged: 14 commits, Nov 7, 2024
35 changes: 30 additions & 5 deletions recipes/configs/code_llama2/7B_full_low_memory.yaml
@@ -45,8 +45,8 @@ resume_from_checkpoint: False

# Dataset
dataset:
packed: False # Set to true for great speed ups
_component_: torchtune.datasets.alpaca_dataset
packed: False # True increases speed

seed: null
shuffle: True
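
The packed flag that appears in every dataset block here controls sample packing: when True, tokenized samples are concatenated up to the model's max sequence length, so far fewer pad tokens are processed and throughput goes up. A rough conceptual sketch of greedy packing (illustrative only, not torchtune's implementation; the sample lists are made up):

# Conceptual greedy packing: concatenate tokenized samples until max_seq_len
# is reached (real packing also handles samples longer than max_seq_len).
from typing import List

def pack_samples(tokenized: List[List[int]], max_seq_len: int) -> List[List[int]]:
    packs, current = [], []
    for tokens in tokenized:
        if current and len(current) + len(tokens) > max_seq_len:
            packs.append(current)
            current = []
        current = current + tokens
    if current:
        packs.append(current)
    return packs

# Three short samples become one packed sequence instead of three padded ones.
print(pack_samples([[1, 2, 3], [4, 5], [6, 7, 8, 9]], max_seq_len=16))
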
@@ -55,20 +55,20 @@ shuffle: True
epochs: 1
max_steps_per_epoch: null
batch_size: 2
gradient_accumulation_steps: 1
gradient_accumulation_steps: 1 # Use to increase virtual batch size
optimizer:
_component_: bitsandbytes.optim.PagedAdamW
lr: 2e-5
optimizer_in_bwd: True
optimizer_in_bwd: True # True saves memory. Requires gradient_accumulation_steps=1
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
compile: False
compile: False # pytorch compile, set to true for better perf/memory

# Training env
device: cuda

# Memory management
enable_activation_checkpointing: True
enable_activation_checkpointing: True # True reduces memory
enable_activation_offloading: True # True reduces memory
dtype: bf16
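
optimizer_in_bwd: True runs the optimizer step inside the backward pass: each parameter is updated, and its gradient freed, as soon as that gradient is ready, so a full set of gradients never sits in memory at once. Because gradients are consumed immediately, it cannot be combined with gradient accumulation, which is why the comment pins gradient_accumulation_steps=1. A rough sketch of the mechanism using PyTorch's post-accumulate-grad hooks (illustrative only; the recipe and bitsandbytes' PagedAdamW handle the real details):

# Sketch: one optimizer per parameter, stepped from a hook during backward()
# (illustrative; not the recipe's actual implementation).
import torch
from torch import nn

model = nn.Linear(16, 16)
opts = {p: torch.optim.AdamW([p], lr=2e-5) for p in model.parameters()}

def step_in_backward(param: torch.Tensor) -> None:
    # Runs right after param.grad has been accumulated during backward().
    opts[param].step()
    opts[param].zero_grad()  # free the gradient immediately

for p in model.parameters():
    p.register_post_accumulate_grad_hook(step_in_backward)

loss = model(torch.randn(4, 16)).sum()
loss.backward()  # parameters are updated here; no separate optimizer.step()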

@@ -79,3 +79,28 @@ metric_logger:
log_dir: /tmp/CodeLlama-7b-hf/logs
log_every_n_steps: 1
log_peak_memory_stats: True

# Profiler (disabled)
profiler:
_component_: torchtune.training.setup_torch_profiler
enabled: False

#Output directory of trace artifacts
output_dir: ${output_dir}/profiling_outputs

#`torch.profiler.ProfilerActivity` types to trace
cpu: True
cuda: True

#trace options passed to `torch.profiler.profile`
profile_memory: False
with_stack: False
record_shapes: True
with_flops: False

# `torch.profiler.schedule` options:
# wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
wait_steps: 5
warmup_steps: 3
active_steps: 2
num_cycles: 1
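
The profiler block added at the bottom of these configs maps almost one-to-one onto torch.profiler: cpu/cuda select ProfilerActivity types, wait_steps/warmup_steps/active_steps/num_cycles feed torch.profiler.schedule, and the remaining booleans are passed to torch.profiler.profile. Roughly, enabling it amounts to the following (illustrative; torchtune.training.setup_torch_profiler builds this for you, and the output path below is a placeholder for ${output_dir}/profiling_outputs):

# Approximate equivalent of the profiler section with enabled: True
# (illustrative; the real wiring lives in torchtune.training.setup_torch_profiler).
from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler

prof = profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],   # cpu: True, cuda: True
    schedule=schedule(wait=5, warmup=3, active=2, repeat=1),    # wait/warmup/active_steps, num_cycles
    on_trace_ready=tensorboard_trace_handler("/tmp/profiling_outputs"),  # placeholder output dir
    profile_memory=False,
    with_stack=False,
    record_shapes=True,
    with_flops=False,
)

with prof:
    for step in range(12):   # 12 steps covers wait(5) + warmup(3) + active(2)
        # ... one training step goes here ...
        prof.step()          # advances the wait -> warmup -> active schedule
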
16 changes: 8 additions & 8 deletions recipes/configs/code_llama2/7B_lora_single_device.yaml
@@ -18,11 +18,11 @@
# Model Arguments
model:
_component_: torchtune.models.code_llama2.lora_code_llama2_7b
lora_attn_modules: ['q_proj', 'v_proj']
apply_lora_to_mlp: False
lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
apply_lora_to_mlp: True
apply_lora_to_output: False
lora_rank: 8
lora_alpha: 16
lora_rank: 8 # higher increases accuracy and memory
lora_alpha: 16 # usually alpha=2*rank
lora_dropout: 0.0

# Tokenizer
@@ -49,8 +49,8 @@ save_adapter_weights_only: False

# Dataset
dataset:
packed: False # Set to true for great speed ups
_component_: torchtune.datasets.alpaca_cleaned_dataset
packed: False # True increases speed

seed: null
shuffle: True
@@ -59,7 +59,7 @@ shuffle: True
epochs: 1
max_steps_per_epoch: null
batch_size: 2
gradient_accumulation_steps: 16
gradient_accumulation_steps: 8 # Use to increase virtual batch size
Contributor:
Sorry to be a pain but I feel like "effective batch size" is the more common term?

Contributor Author:
:D
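
Whatever the name, the arithmetic is the same: gradients are summed over gradient_accumulation_steps micro-batches before a single optimizer step, so the effective (or virtual) batch size is batch_size * gradient_accumulation_steps per device. With batch_size: 2 and gradient_accumulation_steps: 8 as above, each optimizer step sees 16 samples. A minimal sketch of the accumulation loop (illustrative, not the recipe code):

# Gradient accumulation: effective batch = batch_size * grad_accum_steps.
def train_epoch(model, loader, optimizer, loss_fn, grad_accum_steps: int = 8):
    optimizer.zero_grad()
    for i, (inputs, labels) in enumerate(loader):
        loss = loss_fn(model(inputs), labels) / grad_accum_steps  # average over micro-batches
        loss.backward()                                           # gradients accumulate in .grad
        if (i + 1) % grad_accum_steps == 0:
            optimizer.step()      # one update per grad_accum_steps micro-batches
            optimizer.zero_grad()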

optimizer:
_component_: torch.optim.AdamW
fused: True
@@ -70,13 +70,13 @@ lr_scheduler:
num_warmup_steps: 100
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
compile: False
compile: False # pytorch compile, set to true for better perf/memory

# Training env
device: cuda

# Memory management
enable_activation_checkpointing: True
enable_activation_checkpointing: True # True reduces memory
enable_activation_offloading: False # True reduces memory
dtype: bf16
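
On the rank/alpha comments above: LoRA leaves the pretrained weight frozen and learns a low-rank update, so the layer computes W x + (alpha/rank) * B A x with A and B of rank lora_rank. A higher rank means more trainable parameters (more capacity, more memory), and alpha rescales the update, which is why it is conventionally set to about twice the rank. A bare-bones sketch of a LoRA linear layer (illustrative; torchtune ships its own LoRALinear):

# Minimal LoRA linear layer (illustrative; not torchtune's implementation).
import torch
from torch import nn

class LoRALinear(nn.Module):
    def __init__(self, in_dim: int, out_dim: int, rank: int = 8, alpha: float = 16.0):
        super().__init__()
        self.base = nn.Linear(in_dim, out_dim, bias=False)
        self.base.weight.requires_grad_(False)        # frozen pretrained weight
        self.lora_a = nn.Linear(in_dim, rank, bias=False)
        self.lora_b = nn.Linear(rank, out_dim, bias=False)
        nn.init.zeros_(self.lora_b.weight)            # update starts as a no-op
        self.scaling = alpha / rank                   # alpha = 2 * rank => scaling of 2

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.base(x) + self.scaling * self.lora_b(self.lora_a(x))

layer = LoRALinear(64, 64, rank=8, alpha=16.0)
print(sum(p.numel() for p in layer.parameters() if p.requires_grad))  # only A/B train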

14 changes: 7 additions & 7 deletions recipes/configs/code_llama2/7B_qlora_single_device.yaml
@@ -18,11 +18,11 @@
# Model Arguments
model:
_component_: torchtune.models.code_llama2.qlora_code_llama2_7b
lora_attn_modules: ['q_proj', 'v_proj', 'k_proj', 'output_proj']
lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
apply_lora_to_mlp: True
apply_lora_to_output: False
lora_rank: 8
lora_alpha: 16
lora_rank: 8 # higher increases accuracy and memory
lora_alpha: 16 # usually alpha=2*rank
lora_dropout: 0.0

# Tokenizer
@@ -49,16 +49,16 @@ save_adapter_weights_only: False

# Dataset
dataset:
packed: False # Set to true for great speed ups
_component_: torchtune.datasets.alpaca_cleaned_dataset
packed: False # True increases speed
seed: null
shuffle: True

# Fine-tuning arguments
epochs: 1
max_steps_per_epoch: null
batch_size: 2
gradient_accumulation_steps: 16
gradient_accumulation_steps: 8 # Use to increase virtual batch size
optimizer:
_component_: torch.optim.AdamW
fused: True
@@ -69,13 +69,13 @@ lr_scheduler:
num_warmup_steps: 100
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
compile: False
compile: False # pytorch compile, set to true for better perf/memory

# Training env
device: cuda

# Memory management
enable_activation_checkpointing: True
enable_activation_checkpointing: True # True reduces memory
enable_activation_offloading: False # True reduces memory
dtype: bf16
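
CEWithChunkedOutputLoss, used as the loss throughout these configs, is about peak memory: instead of materializing and upcasting the full [batch, seq, vocab] logit tensor for a single cross-entropy call, the output is split into chunks and the loss is accumulated chunk by chunk. A simplified sketch of the idea (not the torchtune implementation):

# Simplified chunked cross-entropy (conceptual; torchtune's version differs in detail).
import torch
import torch.nn.functional as F

def chunked_cross_entropy(logits: torch.Tensor, labels: torch.Tensor,
                          num_chunks: int = 8, ignore_index: int = -100) -> torch.Tensor:
    # logits: [batch, seq, vocab], labels: [batch, seq]
    total, count = 0.0, 0
    for lg, lb in zip(logits.chunk(num_chunks, dim=1), labels.chunk(num_chunks, dim=1)):
        total = total + F.cross_entropy(
            lg.float().reshape(-1, lg.size(-1)),   # upcast one chunk at a time
            lb.reshape(-1),
            reduction="sum",
            ignore_index=ignore_index,
        )
        count = count + (lb != ignore_index).sum()
    return total / count.clamp(min=1)

loss = chunked_cross_entropy(torch.randn(2, 128, 32000), torch.randint(0, 32000, (2, 128)))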

33 changes: 29 additions & 4 deletions recipes/configs/dev/8B_full_experimental.yaml
@@ -26,8 +26,8 @@ tokenizer:

# Dataset
dataset:
packed: False # Set to true for great speed ups
_component_: torchtune.datasets.alpaca_dataset
packed: False # True increases speed
seed: null
shuffle: True

@@ -57,14 +57,14 @@ optimizer:
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
max_steps_per_epoch: null
gradient_accumulation_steps: 1
compile: False
gradient_accumulation_steps: 1 # Use to increase virtual batch size
compile: False # pytorch compile, set to true for better perf/memory

# Training env
device: cuda

# Memory management
enable_activation_checkpointing: False
enable_activation_checkpointing: False # True reduces memory
enable_activation_offloading: False # True reduces memory
ac_mode: 'selective' # ['selective', 'full']
ac_option: 2 # [int] = ac every positive int layer
@@ -81,3 +81,28 @@ metric_logger:
output_dir: /tmp/alpaca-llama3-finetune
log_every_n_steps: null
log_peak_memory_stats: True

# Profiler (disabled)
profiler:
_component_: torchtune.training.setup_torch_profiler
enabled: False

#Output directory of trace artifacts
output_dir: ${output_dir}/profiling_outputs

#`torch.profiler.ProfilerActivity` types to trace
cpu: True
cuda: True

#trace options passed to `torch.profiler.profile`
profile_memory: False
with_stack: False
record_shapes: True
with_flops: False

# `torch.profiler.schedule` options:
# wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
wait_steps: 5
warmup_steps: 3
active_steps: 2
num_cycles: 1
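
ac_mode: 'selective' with ac_option: 2 is a finer dial than full activation checkpointing: only every second transformer layer is checkpointed, so only part of the network pays the recompute cost in backward. A rough sketch of the pattern (illustrative; the recipe applies it to the real transformer blocks):

# Selective activation checkpointing: checkpoint every Nth layer only
# (illustrative, not the recipe's implementation).
import torch
from torch import nn
from torch.utils.checkpoint import checkpoint

class SelectiveACStack(nn.Module):
    def __init__(self, layers: nn.ModuleList, ac_option: int = 2):
        super().__init__()
        self.layers = layers
        self.ac_option = ac_option   # checkpoint layers 0, ac_option, 2*ac_option, ...

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for i, layer in enumerate(self.layers):
            if i % self.ac_option == 0:
                x = checkpoint(layer, x, use_reentrant=False)  # recomputed in backward, saves memory
            else:
                x = layer(x)
        return x

stack = SelectiveACStack(nn.ModuleList(nn.Linear(32, 32) for _ in range(8)))
stack(torch.randn(4, 32, requires_grad=True)).sum().backward()
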
34 changes: 30 additions & 4 deletions recipes/configs/gemma/2B_full.yaml
@@ -23,8 +23,8 @@ tokenizer:

# Dataset
dataset:
packed: False # Set to true for great speed ups
_component_: torchtune.datasets.alpaca_dataset
packed: False # True increases speed
seed: null
shuffle: True

@@ -54,14 +54,15 @@ optimizer:
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
max_steps_per_epoch: null
gradient_accumulation_steps: 1
compile: False
gradient_accumulation_steps: 1 # Use to increase virtual batch size
compile: False # pytorch compile, set to true for better perf/memory
optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

# Training env
device: cuda

# Memory management
enable_activation_checkpointing: True
enable_activation_checkpointing: True # True reduces memory
enable_activation_offloading: False # True reduces memory

# Reduced precision
@@ -74,3 +74,28 @@ metric_logger:
output_dir: /tmp/alpaca-gemma-finetune
log_every_n_steps: 1
log_peak_memory_stats: True

# Profiler (disabled)
profiler:
_component_: torchtune.training.setup_torch_profiler
enabled: False

#Output directory of trace artifacts
output_dir: ${output_dir}/profiling_outputs

#`torch.profiler.ProfilerActivity` types to trace
cpu: True
cuda: True

#trace options passed to `torch.profiler.profile`
profile_memory: False
with_stack: False
record_shapes: True
with_flops: False

# `torch.profiler.schedule` options:
# wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
wait_steps: 5
warmup_steps: 3
active_steps: 2
num_cycles: 1
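
enable_activation_offloading moves activations saved for backward off the GPU between the forward and backward pass, trading transfer traffic for memory. PyTorch exposes the basic mechanism as saved-tensor hooks; the sketch below uses torch.autograd.graph.save_on_cpu to show the idea (torchtune's own offloading is more elaborate than this):

# Idea behind activation offloading: park saved activations in CPU memory
# until backward needs them (illustrative; torchtune's implementation differs).
import torch
from torch import nn

device = "cuda" if torch.cuda.is_available() else "cpu"
model = nn.Sequential(nn.Linear(1024, 1024), nn.ReLU(), nn.Linear(1024, 1024)).to(device)
x = torch.randn(8, 1024, device=device)

with torch.autograd.graph.save_on_cpu(pin_memory=torch.cuda.is_available()):
    loss = model(x).sum()   # tensors saved for backward land in (pinned) CPU memory

loss.backward()             # they are copied back to the GPU as backward consumes them
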
39 changes: 32 additions & 7 deletions recipes/configs/gemma/2B_lora.yaml
@@ -22,18 +22,18 @@ tokenizer:

# Dataset
dataset:
packed: False # Set to true for great speed ups
_component_: torchtune.datasets.alpaca_dataset
packed: False # True increases speed
seed: null
shuffle: True

# Model Arguments
model:
_component_: torchtune.models.gemma.lora_gemma_2b
lora_attn_modules: ['q_proj', 'k_proj', 'v_proj']
lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
apply_lora_to_mlp: True
lora_rank: 64
lora_alpha: 128
lora_rank: 64 # higher increases accuracy and memory
lora_alpha: 128 # usually alpha=2*rank
lora_dropout: 0.0

checkpointer:
@@ -66,14 +66,14 @@ loss:
batch_size: 4
epochs: 3
max_steps_per_epoch: null
gradient_accumulation_steps: 1
compile: False
gradient_accumulation_steps: 1 # Use to increase virtual batch size
compile: False # pytorch compile, set to true for better perf/memory

# Training env
device: cuda

# Memory management
enable_activation_checkpointing: True
enable_activation_checkpointing: True # True reduces memory
enable_activation_offloading: False # True reduces memory

# Reduced precision
@@ -86,3 +86,28 @@ metric_logger:
output_dir: /tmp/alpaca-gemma-lora
log_every_n_steps: 1
log_peak_memory_stats: True

# Profiler (disabled)
profiler:
_component_: torchtune.training.setup_torch_profiler
enabled: False

#Output directory of trace artifacts
output_dir: ${output_dir}/profiling_outputs

#`torch.profiler.ProfilerActivity` types to trace
cpu: True
cuda: True

#trace options passed to `torch.profiler.profile`
profile_memory: False
with_stack: False
record_shapes: True
with_flops: False

# `torch.profiler.schedule` options:
# wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
wait_steps: 5
warmup_steps: 3
active_steps: 2
num_cycles: 1
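
The compile flag enables torch.compile in the recipe (typically on the model and the loss), which usually improves step time and can reduce memory after an initial compilation warm-up. A minimal sketch of what the flag amounts to (illustrative; the recipe decides exactly what gets compiled):

# compile: True roughly corresponds to wrapping the hot path in torch.compile
# (illustrative; the recipe controls what is compiled).
import torch
from torch import nn

model = nn.Sequential(nn.Linear(256, 256), nn.GELU(), nn.Linear(256, 256))
compiled_model = torch.compile(model)    # first call compiles, later calls run the optimized graph

out = compiled_model(torch.randn(4, 256))
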
14 changes: 7 additions & 7 deletions recipes/configs/gemma/2B_lora_single_device.yaml
@@ -22,18 +22,18 @@ tokenizer:

# Dataset
dataset:
packed: False # Set to true for great speed ups
_component_: torchtune.datasets.alpaca_dataset
packed: False # True increases speed
seed: null
shuffle: True

# Model Arguments
model:
_component_: torchtune.models.gemma.lora_gemma_2b
lora_attn_modules: ['q_proj', 'k_proj', 'v_proj']
lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
apply_lora_to_mlp: True
lora_rank: 64
lora_alpha: 128
lora_rank: 64 # higher increases accuracy and memory
lora_alpha: 128 # usually alpha=2*rank
lora_dropout: 0.0

checkpointer:
@@ -65,14 +65,14 @@ loss:
batch_size: 4
epochs: 3
max_steps_per_epoch: null
gradient_accumulation_steps: 4
compile: False
gradient_accumulation_steps: 8 # Use to increase virtual batch size
compile: False # pytorch compile, set to true for better perf/memory

# Training env
device: cuda

# Memory management
enable_activation_checkpointing: True
enable_activation_checkpointing: True # True reduces memory
enable_activation_offloading: False # True reduces memory

# Reduced precision
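
The dtype: bf16 entry that closes out these configs selects bfloat16 training: parameters, activations, and gradients are all held in bf16, roughly halving their memory on hardware with bf16 support. A minimal sketch (illustrative):

# dtype: bf16 amounts to holding parameters (and therefore activations and
# gradients) in bfloat16 (illustrative sketch).
import torch
from torch import nn

model = nn.Linear(512, 512).to(dtype=torch.bfloat16)
x = torch.randn(4, 512, dtype=torch.bfloat16)

loss = model(x).float().sum()   # small upcast for the reduction
loss.backward()                 # gradients come out in bf16, matching the parameters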