Added multinode capabilities for distributed training (#19)

* multinode fsdp finetuning * full dpo
sarvamai · Dec 23, 2024 · b1e0666 · b1e0666
1 parent 0fb4536
commit b1e0666
Show file tree

Hide file tree

Showing 5 changed files with 918 additions and 12 deletions.
diff --git a/recipes/configs/sarvam1/full_dpo.yaml b/recipes/configs/sarvam1/full_dpo.yaml
@@ -0,0 +1,86 @@
+# Config for multi-device full DPO alignment in full_dpo_distributed.py
+# using a Llama2 7B model
+#
+# This config assumes that you've run the following command before launching
+# this run:
+#   tune download meta-llama/Llama-2-7b-hf --output-dir /tmp/Llama-2-7b-hf --ignore-patterns "*.safetensors" --hf-token <HF_TOKEN>
+#
+# To launch on 2 devices, run the following command from root:
+#   tune run --nnodes 1 --nproc_per_node 2 full_dpo_distributed --config llama2/7B_lora_dpo
+#
+# You can add specific overrides through the command line. For example
+# to override the checkpointer directory while launching training
+# you can run:
+#   tune run --nnodes 1 --nproc_per_node 2 full_dpo_distributed --config llama2/7B_lora_dpo checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
+#
+# This config works best when the model is being fine-tuned on 2+ GPUs.
+# For single device LoRA DPO alignment please use 7B_lora_dpo_single_device.yaml
+
+# Model Arguments
+model:
+  _component_: torchtune.models.sarvam1.sarvam1
+
+# Tokenizer
+tokenizer:
+  _component_: torchtune.models.llama2.llama2_tokenizer
+  path: /projects/data/rahul_sarvam_ai/nemo_models/sarvam-1-pt/tokenizer.model
+  max_seq_len: 8192
+output_dir: /projects/data/rahul_sarvam_ai/torchtune_models/dpo_test
+
+checkpointer:
+  _component_: torchtune.training.FullModelHFCheckpointer
+  checkpoint_dir: /projects/data/rahul_sarvam_ai/models/sarvam-1-torchtune-sft
+  checkpoint_files:
+    [model-00001-of-00002.safetensors, model-00002-of-00002.safetensors]
+  recipe_checkpoint: null
+  output_dir: ${output_dir}
+  model_type: LLAMA3
+  safe_serialization: True
+resume_from_checkpoint: False
+save_adapter_weights_only: False
+
+# Dataset and Sampler
+dataset:
+  _component_: torchtune.datasets.preference_dataset
+  source: allenai/llama-3.1-tulu-3-70b-preference-mixture
+  split: train
+seed: null
+shuffle: True
+batch_size: 1
+
+# Optimizer and Scheduler
+optimizer:
+  _component_: torch.optim.AdamW
+  fused: True
+  weight_decay: 0.01
+  lr: 1e-5
+lr_scheduler:
+  _component_: torchtune.training.lr_schedulers.get_cosine_schedule_with_warmup
+  num_warmup_steps: 100
+
+loss:
+  _component_: torchtune.rlhf.loss.DPOLoss
+  beta: 0.1
+  label_smoothing: 0
+
+# Training
+epochs: 1
+max_steps_per_epoch: 1_000_000
+gradient_accumulation_steps: 8  # Use to increase virtual batch size
+compile: False  # pytorch compile, set to true for better perf/memory
+
+# Logging
+metric_logger:
+  _component_: torchtune.training.metric_logging.WandBLogger
+  # the W&B project to log to
+  project: torchtune
+log_every_n_steps: 10
+log_peak_memory_stats: True
+
+# Environment
+device: cuda
+dtype: bf16
+
+# Memory management
+enable_activation_checkpointing: True  # True reduces memory
+enable_activation_offloading: False  # True reduces memory
diff --git a/recipes/configs/sarvam1/full_finetune.yaml b/recipes/configs/sarvam1/full_finetune.yaml
@@ -23,7 +23,7 @@ model:
 
 # Tokenizer
 tokenizer:
-  _component_: torchtune.models.sarvam1.sarvam1_tokenizer
+  _component_: torchtune.models.llama2.llama2_tokenizer
   path: /projects/data/rahul_sarvam_ai/nemo_models/sarvam-1-pt/tokenizer.model
   max_seq_len: 8192
 
@@ -35,10 +35,10 @@ dataset:
   conversation_style: openai
   conversation_column: messages
   source: json
-  packs_cache_path: /projects/data/mohit_sarvam_ai/torchtune/data/microsoft_agent
+  packs_cache_path: /projects/data/rahul_sarvam_ai/torchtune_models/cache/ft_train_cache_phase_1
   data_files: [
-    # /projects/data/mohit_sarvam_ai/torchtune/data/ft_train/sample.json,
-    /projects/data/mohit_sarvam_ai/torchtune/data/microsoft-agent.jsonl
+    /projects/data/mohit_sarvam_ai/torchtune/data/ft_train/sample.json,
+    # /projects/data/mohit_sarvam_ai/torchtune/data/microsoft-agent.jsonl
     # /projects/data/mohit_sarvam_ai/torchtune/data/ft_train/data_0.json,
     # /projects/data/mohit_sarvam_ai/torchtune/data/ft_train/data_1.json,
     # /projects/data/mohit_sarvam_ai/torchtune/data/ft_train/data_2.json,
@@ -63,13 +63,14 @@ dataset:
   split: train
 seed: null
 shuffle: True
-output_dir: /projects/data/mohit_sarvam_ai/torchtune/output-sft/sarvam1-sft-microsft-agent
+output_dir: /projects/data/rahul_sarvam_ai/torchtune_models/sft-phase-1
 
 checkpointer:
   _component_: torchtune.training.FullModelHFCheckpointer
   checkpoint_dir: /projects/data/rahul_sarvam_ai/nemo_models/sarvam-1-pt
   checkpoint_files: [
-    pytorch_model.bin
+    # pytorch_model.bin
+    ft-model-00001-of-00001.safetensors
   ]
   recipe_checkpoint: null
   output_dir: ${output_dir}
@@ -78,20 +79,20 @@ save_interval: 2000
 resume_from_checkpoint: False
 
 # Fine-tuning arguments
-batch_size: 1
+batch_size: 8
 epochs: 1
 optimizer:
   _component_: torch.optim.AdamW
   fused: True
-  lr: 7e-6
+  lr: 3e-5
   weight_decay: 0.01
   betas: [0.9, 0.98]
 lr_scheduler:
   _component_: torchtune.training.lr_schedulers.get_linear_schedule_with_warmup
-  min_lr: 1e6
+  min_lr: 1e-8
   max_lr: ${optimizer.lr}
-  num_warmup_steps: 500
-  constant_steps: 500
+  num_warmup_steps: 1000
+  constant_steps: 0
 
 loss:
   _component_: torchtune.modules.loss.CEWithChunkedOutputLoss

diff --git a/recipes/configs/sarvam1/lora_dpo.yaml b/recipes/configs/sarvam1/lora_dpo.yaml
@@ -71,7 +71,7 @@ loss:
 
 # Training
 epochs: 1
-max_steps_per_epoch: 1000_000
+max_steps_per_epoch: 1_000_000
 gradient_accumulation_steps: 4  # Use to increase virtual batch size
 compile: False  # pytorch compile, set to true for better perf/memory