Skip to content

Commit

Permalink
Merge branch 'main' into mm_tests
Browse files Browse the repository at this point in the history
  • Loading branch information
SalmanMohammadi committed Nov 8, 2024
2 parents f0a94d7 + 7bfb333 commit df3402c
Show file tree
Hide file tree
Showing 117 changed files with 2,212 additions and 732 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/regression_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ jobs:
python-version: ['3.11']
torch-version: ["stable", "nightly"]
fail-fast: false
env:
PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True
steps:
- name: Check out repo
uses: actions/checkout@v3
Expand Down
35 changes: 30 additions & 5 deletions recipes/configs/code_llama2/7B_full_low_memory.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@ resume_from_checkpoint: False

# Dataset
dataset:
packed: False # Set to true for great speed ups
_component_: torchtune.datasets.alpaca_dataset
packed: False # True increases speed

seed: null
shuffle: True
Expand All @@ -55,20 +55,20 @@ shuffle: True
epochs: 1
max_steps_per_epoch: null
batch_size: 2
gradient_accumulation_steps: 1
gradient_accumulation_steps: 1 # Use to increase virtual batch size
optimizer:
_component_: bitsandbytes.optim.PagedAdamW
lr: 2e-5
optimizer_in_bwd: True
optimizer_in_bwd: True # True saves memory. Requires gradient_accumulation_steps=1
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
compile: False
compile: False # pytorch compile, set to true for better perf/memory

# Training env
device: cuda

# Memory management
enable_activation_checkpointing: True
enable_activation_checkpointing: True # True reduces memory
enable_activation_offloading: True # True reduces memory
dtype: bf16

Expand All @@ -79,3 +79,28 @@ metric_logger:
log_dir: /tmp/CodeLlama-7b-hf/logs
log_every_n_steps: 1
log_peak_memory_stats: True

# Profiler (disabled)
profiler:
_component_: torchtune.training.setup_torch_profiler
enabled: False

#Output directory of trace artifacts
output_dir: ${output_dir}/profiling_outputs

#`torch.profiler.ProfilerActivity` types to trace
cpu: True
cuda: True

#trace options passed to `torch.profiler.profile`
profile_memory: False
with_stack: False
record_shapes: True
with_flops: False

# `torch.profiler.schedule` options:
# wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
wait_steps: 5
warmup_steps: 3
active_steps: 2
num_cycles: 1
16 changes: 8 additions & 8 deletions recipes/configs/code_llama2/7B_lora_single_device.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@
# Model Arguments
model:
_component_: torchtune.models.code_llama2.lora_code_llama2_7b
lora_attn_modules: ['q_proj', 'v_proj']
apply_lora_to_mlp: False
lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
apply_lora_to_mlp: True
apply_lora_to_output: False
lora_rank: 8
lora_alpha: 16
lora_rank: 8 # higher increases accuracy and memory
lora_alpha: 16 # usually alpha=2*rank
lora_dropout: 0.0

# Tokenizer
Expand All @@ -49,8 +49,8 @@ save_adapter_weights_only: False

# Dataset
dataset:
packed: False # Set to true for great speed ups
_component_: torchtune.datasets.alpaca_cleaned_dataset
packed: False # True increases speed

seed: null
shuffle: True
Expand All @@ -59,7 +59,7 @@ shuffle: True
epochs: 1
max_steps_per_epoch: null
batch_size: 2
gradient_accumulation_steps: 16
gradient_accumulation_steps: 8 # Use to increase virtual batch size
optimizer:
_component_: torch.optim.AdamW
fused: True
Expand All @@ -70,13 +70,13 @@ lr_scheduler:
num_warmup_steps: 100
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
compile: False
compile: False # pytorch compile, set to true for better perf/memory

# Training env
device: cuda

# Memory management
enable_activation_checkpointing: True
enable_activation_checkpointing: True # True reduces memory
enable_activation_offloading: False # True reduces memory
dtype: bf16

Expand Down
14 changes: 7 additions & 7 deletions recipes/configs/code_llama2/7B_qlora_single_device.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@
# Model Arguments
model:
_component_: torchtune.models.code_llama2.qlora_code_llama2_7b
lora_attn_modules: ['q_proj', 'v_proj', 'k_proj', 'output_proj']
lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
apply_lora_to_mlp: True
apply_lora_to_output: False
lora_rank: 8
lora_alpha: 16
lora_rank: 8 # higher increases accuracy and memory
lora_alpha: 16 # usually alpha=2*rank
lora_dropout: 0.0

# Tokenizer
Expand All @@ -49,16 +49,16 @@ save_adapter_weights_only: False

# Dataset
dataset:
packed: False # Set to true for great speed ups
_component_: torchtune.datasets.alpaca_cleaned_dataset
packed: False # True increases speed
seed: null
shuffle: True

# Fine-tuning arguments
epochs: 1
max_steps_per_epoch: null
batch_size: 2
gradient_accumulation_steps: 16
gradient_accumulation_steps: 8 # Use to increase virtual batch size
optimizer:
_component_: torch.optim.AdamW
fused: True
Expand All @@ -69,13 +69,13 @@ lr_scheduler:
num_warmup_steps: 100
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
compile: False
compile: False # pytorch compile, set to true for better perf/memory

# Training env
device: cuda

# Memory management
enable_activation_checkpointing: True
enable_activation_checkpointing: True # True reduces memory
enable_activation_offloading: False # True reduces memory
dtype: bf16

Expand Down
33 changes: 29 additions & 4 deletions recipes/configs/dev/8B_full_experimental.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ tokenizer:

# Dataset
dataset:
packed: False # Set to true for great speed ups
_component_: torchtune.datasets.alpaca_dataset
packed: False # True increases speed
seed: null
shuffle: True

Expand Down Expand Up @@ -57,14 +57,14 @@ optimizer:
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
max_steps_per_epoch: null
gradient_accumulation_steps: 1
compile: False
gradient_accumulation_steps: 1 # Use to increase virtual batch size
compile: False # pytorch compile, set to true for better perf/memory

# Training env
device: cuda

# Memory management
enable_activation_checkpointing: False
enable_activation_checkpointing: False # True reduces memory
enable_activation_offloading: False # True reduces memory
ac_mode: 'selective' # ['selective', 'full']
ac_option: 2 # [int] = ac every positive int layer
Expand All @@ -81,3 +81,28 @@ metric_logger:
output_dir: /tmp/alpaca-llama3-finetune
log_every_n_steps: null
log_peak_memory_stats: True

# Profiler (disabled)
profiler:
_component_: torchtune.training.setup_torch_profiler
enabled: False

#Output directory of trace artifacts
output_dir: ${output_dir}/profiling_outputs

#`torch.profiler.ProfilerActivity` types to trace
cpu: True
cuda: True

#trace options passed to `torch.profiler.profile`
profile_memory: False
with_stack: False
record_shapes: True
with_flops: False

# `torch.profiler.schedule` options:
# wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
wait_steps: 5
warmup_steps: 3
active_steps: 2
num_cycles: 1
34 changes: 30 additions & 4 deletions recipes/configs/gemma/2B_full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ tokenizer:

# Dataset
dataset:
packed: False # Set to true for great speed ups
_component_: torchtune.datasets.alpaca_dataset
packed: False # True increases speed
seed: null
shuffle: True

Expand Down Expand Up @@ -54,14 +54,15 @@ optimizer:
loss:
_component_: torchtune.modules.loss.CEWithChunkedOutputLoss
max_steps_per_epoch: null
gradient_accumulation_steps: 1
compile: False
gradient_accumulation_steps: 1 # Use to increase virtual batch size
compile: False # pytorch compile, set to true for better perf/memory
optimizer_in_bwd: False # True saves memory. Requires gradient_accumulation_steps=1

# Training env
device: cuda

# Memory management
enable_activation_checkpointing: True
enable_activation_checkpointing: True # True reduces memory
enable_activation_offloading: False # True reduces memory

# Reduced precision
Expand All @@ -74,3 +75,28 @@ metric_logger:
output_dir: /tmp/alpaca-gemma-finetune
log_every_n_steps: 1
log_peak_memory_stats: True

# Profiler (disabled)
profiler:
_component_: torchtune.training.setup_torch_profiler
enabled: False

#Output directory of trace artifacts
output_dir: ${output_dir}/profiling_outputs

#`torch.profiler.ProfilerActivity` types to trace
cpu: True
cuda: True

#trace options passed to `torch.profiler.profile`
profile_memory: False
with_stack: False
record_shapes: True
with_flops: False

# `torch.profiler.schedule` options:
# wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
wait_steps: 5
warmup_steps: 3
active_steps: 2
num_cycles: 1
39 changes: 32 additions & 7 deletions recipes/configs/gemma/2B_lora.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,18 @@ tokenizer:

# Dataset
dataset:
packed: False # Set to true for great speed ups
_component_: torchtune.datasets.alpaca_dataset
packed: False # True increases speed
seed: null
shuffle: True

# Model Arguments
model:
_component_: torchtune.models.gemma.lora_gemma_2b
lora_attn_modules: ['q_proj', 'k_proj', 'v_proj']
lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
apply_lora_to_mlp: True
lora_rank: 64
lora_alpha: 128
lora_rank: 64 # higher increases accuracy and memory
lora_alpha: 128 # usually alpha=2*rank
lora_dropout: 0.0

checkpointer:
Expand Down Expand Up @@ -66,14 +66,14 @@ loss:
batch_size: 4
epochs: 3
max_steps_per_epoch: null
gradient_accumulation_steps: 1
compile: False
gradient_accumulation_steps: 1 # Use to increase virtual batch size
compile: False # pytorch compile, set to true for better perf/memory

# Training env
device: cuda

# Memory management
enable_activation_checkpointing: True
enable_activation_checkpointing: True # True reduces memory
enable_activation_offloading: False # True reduces memory

# Reduced precision
Expand All @@ -86,3 +86,28 @@ metric_logger:
output_dir: /tmp/alpaca-gemma-lora
log_every_n_steps: 1
log_peak_memory_stats: True

# Profiler (disabled)
profiler:
_component_: torchtune.training.setup_torch_profiler
enabled: False

#Output directory of trace artifacts
output_dir: ${output_dir}/profiling_outputs

#`torch.profiler.ProfilerActivity` types to trace
cpu: True
cuda: True

#trace options passed to `torch.profiler.profile`
profile_memory: False
with_stack: False
record_shapes: True
with_flops: False

# `torch.profiler.schedule` options:
# wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat
wait_steps: 5
warmup_steps: 3
active_steps: 2
num_cycles: 1
14 changes: 7 additions & 7 deletions recipes/configs/gemma/2B_lora_single_device.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,18 +22,18 @@ tokenizer:

# Dataset
dataset:
packed: False # Set to true for great speed ups
_component_: torchtune.datasets.alpaca_dataset
packed: False # True increases speed
seed: null
shuffle: True

# Model Arguments
model:
_component_: torchtune.models.gemma.lora_gemma_2b
lora_attn_modules: ['q_proj', 'k_proj', 'v_proj']
lora_attn_modules: ['q_proj', 'v_proj', 'output_proj']
apply_lora_to_mlp: True
lora_rank: 64
lora_alpha: 128
lora_rank: 64 # higher increases accuracy and memory
lora_alpha: 128 # usually alpha=2*rank
lora_dropout: 0.0

checkpointer:
Expand Down Expand Up @@ -65,14 +65,14 @@ loss:
batch_size: 4
epochs: 3
max_steps_per_epoch: null
gradient_accumulation_steps: 4
compile: False
gradient_accumulation_steps: 8 # Use to increase virtual batch size
compile: False # pytorch compile, set to true for better perf/memory

# Training env
device: cuda

# Memory management
enable_activation_checkpointing: True
enable_activation_checkpointing: True # True reduces memory
enable_activation_offloading: False # True reduces memory

# Reduced precision
Expand Down
Loading

0 comments on commit df3402c

Please sign in to comment.