From 338130aef1b357ccd19f67d6d7faaf7dc67f4ad3 Mon Sep 17 00:00:00 2001
From: Mehant Kammakomati
Date: Wed, 13 Mar 2024 14:14:52 +0530
Subject: [PATCH] feat: move to accelerate for distributed training launch

Signed-off-by: Mehant Kammakomati
---
 README.md                                | 21 ++++++++++++-------
 .../accelerate_fsdp_llama_2_procs.yaml   |  3 +++
 .../README.md                            | 10 ++++-----
 tuning/config/fsdp_config.json           | 12 -----------
 4 files changed, 21 insertions(+), 25 deletions(-)
 rename tuning/config/accelerate_fsdp_config.yaml => config/accelerate_fsdp_llama_2_procs.yaml (76%)
 delete mode 100644 tuning/config/fsdp_config.json

diff --git a/README.md b/README.md
index d839a132e..0f9fa0162 100644
--- a/README.md
+++ b/README.md
@@ -60,6 +60,10 @@ Current supported and tested models are `Llama2` (7 and 13B configurations have
 # if you want to use one GPU on multi-gpu machine
 export CUDA_VISIBLE_DEVICES=0
 
+MODEL_PATH=llama-7b-hf # Hugging Face model ID or path to a checkpoint
+DATA_PATH=twitter_complaints.json # Path to the dataset
+OUTPUT_PATH=out # Path to the output folder where the checkpoints are saved
+
 python tuning/sft_trainer.py \
 --model_name_or_path $MODEL_PATH \
 --data_path $DATA_PATH \
@@ -83,11 +87,16 @@ python tuning/sft_trainer.py \
 ```
 
 ### Multiple GPUs with FSDP
+
 ```bash
-torchrun \
---nnodes=1 \
---nproc_per_node=8 \
---master_port=1234 \
+MODEL_PATH=llama-7b-hf # Hugging Face model ID or path to a checkpoint
+DATA_PATH=twitter_complaints.json # Path to the dataset
+OUTPUT_PATH=out # Path to the output folder where the checkpoints are saved
+MASTER_PORT=1234 # The port on which the rank 0 process listens
+MASTER_ADDR=x.x.x.x # The IP address of the node with rank 0
+
+accelerate launch --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \
+--config_file config/accelerate_fsdp_llama_2_procs.yaml \
 tuning/sft_trainer.py \
 --model_name_or_path $MODEL_PATH \
 --data_path $DATA_PATH \
@@ -104,8 +113,6 @@ tuning/sft_trainer.py \
 --warmup_ratio 0.03 \
 --lr_scheduler_type "cosine" \
 --logging_steps 1 \
---fsdp "full_shard auto_wrap" \
---fsdp_config tuning/config/fsdp_config.json \
 --include_tokens_per_second \
 --packing False \
 --response_template "\n### Response:" \
@@ -113,7 +120,7 @@ tuning/sft_trainer.py \
 ```
 
 
-For `GPTBigCode` models, Hugging Face has enabled Flash v2 and one can simply replace the `'LlamaDecoderLayer'` with `'GPTBigCodeBlock'` in `tuning/config/fsdp_config.json` for proper sharding of the model.
+Typically, the model's transformer layer class is passed to form the FSDP wrapping unit. For `GPTBigCode` models, Hugging Face has enabled Flash v2, and one can simply replace `'LlamaDecoderLayer'` with `'GPTBigCodeBlock'` in `config/accelerate_fsdp_llama_2_procs.yaml` for proper sharding of the model.
 
 
 ### LoRA Tuning Example
diff --git a/tuning/config/accelerate_fsdp_config.yaml b/config/accelerate_fsdp_llama_2_procs.yaml
similarity index 76%
rename from tuning/config/accelerate_fsdp_config.yaml
rename to config/accelerate_fsdp_llama_2_procs.yaml
index 359ec9efc..170c9d28f 100644
--- a/tuning/config/accelerate_fsdp_config.yaml
+++ b/config/accelerate_fsdp_llama_2_procs.yaml
@@ -1,3 +1,6 @@
+# Options that can be used with accelerate config are documented here:
+# https://github.com/huggingface/accelerate/blob/ee163b66fb7848892519e804688cb4ae981aacbe/docs/source/package_reference/cli.md
+
 compute_environment: LOCAL_MACHINE
 debug: false
 distributed_type: FSDP
diff --git a/examples/prompt_tuning_twitter_complaints/README.md b/examples/prompt_tuning_twitter_complaints/README.md
index f3c663b35..cfccfc509 100644
--- a/examples/prompt_tuning_twitter_complaints/README.md
+++ b/examples/prompt_tuning_twitter_complaints/README.md
@@ -34,11 +34,11 @@ We will switch our PEFT method from LORA to Prompt Tuning (pt)
 MODEL_PATH=llama-7b-hf
 DATA_PATH=twitter_complaints.json
 OUTPUT_PATH=out
+MASTER_PORT=1234 # The port on which the rank 0 process listens
+MASTER_ADDR=x.x.x.x # The IP address of the node with rank 0
 
-torchrun \
---nnodes=1 \
---nproc_per_node=8 \
---master_port=1234 \
+accelerate launch --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \
+--config_file config/accelerate_fsdp_llama_2_procs.yaml \
 tuning/sft_trainer.py \
 --model_name_or_path $MODEL_PATH \
 --data_path $DATA_PATH \
@@ -56,8 +56,6 @@ tuning/sft_trainer.py \
 --warmup_ratio 0.03 \
 --lr_scheduler_type "cosine" \
 --logging_steps 1 \
---fsdp "full_shard auto_wrap" \
---fsdp_config tuning/config/fsdp_config.json \
 --include_tokens_per_second \
 --packing False \
 --response_template "\n### Label:" \
diff --git a/tuning/config/fsdp_config.json b/tuning/config/fsdp_config.json
deleted file mode 100644
index 3cae95ff4..000000000
--- a/tuning/config/fsdp_config.json
+++ /dev/null
@@ -1,12 +0,0 @@
-{
-    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
-    "fsdp_backward_prefetch_policy": "BACKWARD_PRE",
-    "fsdp_cpu_ram_efficient_loading": "False",
-    "fsdp_forward_prefetch": "True",
-    "fsdp_offload_params": "False",
-    "fsdp_state_dict_type": "SHARDED_STATE_DICT",
-    "fsdp_sync_module_states": "False",
-    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
-    "fsdp_use_orig_params": "True",
-    "activation_checkpointing": false
-}
\ No newline at end of file
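
The hunk above shows only the first few lines of the renamed `config/accelerate_fsdp_llama_2_procs.yaml`. For reference, here is a minimal sketch of what the complete accelerate FSDP config for a 2-process run might look like: the key names are standard `accelerate` config fields, the FSDP values mirror the deleted `tuning/config/fsdp_config.json` and the removed `--fsdp "full_shard auto_wrap"` flag, and anything not visible in the diff is an assumption rather than the file's actual contents.

```yaml
# Sketch of a full accelerate FSDP config (assumed; only the first few lines appear in this patch).
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: FSDP
downcast_bf16: 'no'
fsdp_config:
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP     # wrap on the transformer layer class
  fsdp_backward_prefetch_policy: BACKWARD_PRE
  fsdp_cpu_ram_efficient_loading: false
  fsdp_forward_prefetch: true
  fsdp_offload_params: false
  fsdp_sharding_strategy: 1                          # 1 = FULL_SHARD, matching the removed --fsdp "full_shard auto_wrap"
  fsdp_state_dict_type: SHARDED_STATE_DICT
  fsdp_sync_module_states: false
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer  # swap for GPTBigCodeBlock as noted in the README
  fsdp_use_orig_params: true
machine_rank: 0
main_training_function: main
mixed_precision: 'no'
num_machines: 1
num_processes: 2            # "_2_procs" in the file name suggests two processes (GPUs)
rdzv_backend: static
same_network: true
use_cpu: false
```

With such a file in place, `accelerate launch --config_file config/accelerate_fsdp_llama_2_procs.yaml` carries the FSDP settings that were previously split between the `torchrun` flags and the `--fsdp`/`--fsdp_config` arguments removed from the README commands above.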