From 338130aef1b357ccd19f67d6d7faaf7dc67f4ad3 Mon Sep 17 00:00:00 2001
From: Mehant Kammakomati
Date: Wed, 13 Mar 2024 14:14:52 +0530
Subject: [PATCH] feat: move to accelerate for distributed training launch

Signed-off-by: Mehant Kammakomati
---
 README.md                                | 21 ++++++++++++-------
 .../accelerate_fsdp_llama_2_procs.yaml   |  3 +++
 .../README.md                            | 10 ++++-----
 tuning/config/fsdp_config.json           | 12 -----------
 4 files changed, 21 insertions(+), 25 deletions(-)
 rename tuning/config/accelerate_fsdp_config.yaml => config/accelerate_fsdp_llama_2_procs.yaml (76%)
 delete mode 100644 tuning/config/fsdp_config.json

diff --git a/README.md b/README.md
index d839a132e..0f9fa0162 100644
--- a/README.md
+++ b/README.md
@@ -60,6 +60,10 @@ Current supported and tested models are `Llama2` (7 and 13B configurations have
 # if you want to use one GPU on multi-gpu machine
 export CUDA_VISIBLE_DEVICES=0
 
+MODEL_PATH=llama-7b-hf # Hugging Face model ID or path to a checkpoint
+DATA_PATH=twitter_complaints.json # Path to the dataset
+OUTPUT_PATH=out # Path to the output folder where the checkpoints are saved
+
 python tuning/sft_trainer.py \
 --model_name_or_path $MODEL_PATH \
 --data_path $DATA_PATH \
@@ -83,11 +87,16 @@ python tuning/sft_trainer.py \
 ```
 
 ### Multiple GPUs with FSDP
+
 ```bash
-torchrun \
---nnodes=1 \
---nproc_per_node=8 \
---master_port=1234 \
+MODEL_PATH=llama-7b-hf # Hugging Face model ID or path to a checkpoint
+DATA_PATH=twitter_complaints.json # Path to the dataset
+OUTPUT_PATH=out # Path to the output folder where the checkpoints are saved
+MASTER_PORT=1234 # The port on which the rank 0 process listens
+MASTER_ADDR=x.x.x.x # The IP address of the node with rank 0
+
+accelerate launch --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \
+--config_file config/accelerate_fsdp_llama_2_procs.yaml \
 tuning/sft_trainer.py \
 --model_name_or_path $MODEL_PATH \
 --data_path $DATA_PATH \
@@ -104,8 +113,6 @@ tuning/sft_trainer.py \
 --warmup_ratio 0.03 \
 --lr_scheduler_type "cosine" \
 --logging_steps 1 \
---fsdp "full_shard auto_wrap" \
---fsdp_config tuning/config/fsdp_config.json \
 --include_tokens_per_second \
 --packing False \
 --response_template "\n### Response:" \
@@ -113,7 +120,7 @@ tuning/sft_trainer.py \
 ```
 
 
-For `GPTBigCode` models, Hugging Face has enabled Flash v2 and one can simply replace the `'LlamaDecoderLayer'` with `'GPTBigCodeBlock'` in `tuning/config/fsdp_config.json` for proper sharding of the model.
+Typically, the model's transformer layer class is passed to form the FSDP wrapping unit. For `GPTBigCode` models, Hugging Face has enabled Flash v2, and one can simply replace `'LlamaDecoderLayer'` with `'GPTBigCodeBlock'` in `config/accelerate_fsdp_llama_2_procs.yaml` for proper sharding of the model.
 
 
 ### LoRA Tuning Example
diff --git a/tuning/config/accelerate_fsdp_config.yaml b/config/accelerate_fsdp_llama_2_procs.yaml
similarity index 76%
rename from tuning/config/accelerate_fsdp_config.yaml
rename to config/accelerate_fsdp_llama_2_procs.yaml
index 359ec9efc..170c9d28f 100644
--- a/tuning/config/accelerate_fsdp_config.yaml
+++ b/config/accelerate_fsdp_llama_2_procs.yaml
@@ -1,3 +1,6 @@
+# Options that can be used with accelerate config are documented here:
+# https://github.com/huggingface/accelerate/blob/ee163b66fb7848892519e804688cb4ae981aacbe/docs/source/package_reference/cli.md
+
 compute_environment: LOCAL_MACHINE
 debug: false
 distributed_type: FSDP
diff --git a/examples/prompt_tuning_twitter_complaints/README.md b/examples/prompt_tuning_twitter_complaints/README.md
index f3c663b35..cfccfc509 100644
--- a/examples/prompt_tuning_twitter_complaints/README.md
+++ b/examples/prompt_tuning_twitter_complaints/README.md
@@ -34,11 +34,11 @@ We will switch our PEFT method from LORA to Prompt Tuning (pt)
 MODEL_PATH=llama-7b-hf
 DATA_PATH=twitter_complaints.json
 OUTPUT_PATH=out
+MASTER_PORT=1234 # The port on which the rank 0 process listens
+MASTER_ADDR=x.x.x.x # The IP address of the node with rank 0
 
-torchrun \
---nnodes=1 \
---nproc_per_node=8 \
---master_port=1234 \
+accelerate launch --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \
+--config_file config/accelerate_fsdp_llama_2_procs.yaml \
 tuning/sft_trainer.py \
 --model_name_or_path $MODEL_PATH \
 --data_path $DATA_PATH \
@@ -56,8 +56,6 @@ tuning/sft_trainer.py \
 --warmup_ratio 0.03 \
 --lr_scheduler_type "cosine" \
 --logging_steps 1 \
---fsdp "full_shard auto_wrap" \
---fsdp_config tuning/config/fsdp_config.json \
 --include_tokens_per_second \
 --packing False \
 --response_template "\n### Label:" \
diff --git a/tuning/config/fsdp_config.json b/tuning/config/fsdp_config.json
deleted file mode 100644
index 3cae95ff4..000000000
--- a/tuning/config/fsdp_config.json
+++ /dev/null
@@ -1,12 +0,0 @@
-{
-    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
-    "fsdp_backward_prefetch_policy": "BACKWARD_PRE",
-    "fsdp_cpu_ram_efficient_loading": "False",
-    "fsdp_forward_prefetch": "True",
-    "fsdp_offload_params": "False",
-    "fsdp_state_dict_type": "SHARDED_STATE_DICT",
-    "fsdp_sync_module_states": "False",
-    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
-    "fsdp_use_orig_params": "True",
-    "activation_checkpointing": false
-}
\ No newline at end of file
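
The hunk above shows only the first few lines of the renamed `config/accelerate_fsdp_llama_2_procs.yaml`. For reference, here is a minimal sketch of what the complete accelerate FSDP config for a 2-process run might look like: the key names are standard `accelerate` config fields, the FSDP values mirror the deleted `tuning/config/fsdp_config.json` and the removed `--fsdp "full_shard auto_wrap"` flag, and anything not visible in the diff is an assumption rather than the file's actual contents.

```yaml
# Sketch of a full accelerate FSDP config (assumed; only the first few lines appear in this patch).
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: FSDP
downcast_bf16: 'no'
fsdp_config:
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP     # wrap on the transformer layer class
  fsdp_backward_prefetch_policy: BACKWARD_PRE
  fsdp_cpu_ram_efficient_loading: false
  fsdp_forward_prefetch: true
  fsdp_offload_params: false
  fsdp_sharding_strategy: 1                          # 1 = FULL_SHARD, matching the removed --fsdp "full_shard auto_wrap"
  fsdp_state_dict_type: SHARDED_STATE_DICT
  fsdp_sync_module_states: false
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer  # swap for GPTBigCodeBlock as noted in the README
  fsdp_use_orig_params: true
machine_rank: 0
main_training_function: main
mixed_precision: 'no'
num_machines: 1
num_processes: 2            # "_2_procs" in the file name suggests two processes (GPUs)
rdzv_backend: static
same_network: true
use_cpu: false
```

With such a file in place, `accelerate launch --config_file config/accelerate_fsdp_llama_2_procs.yaml` carries the FSDP settings that were previously split between the `torchrun` flags and the `--fsdp`/`--fsdp_config` arguments removed from the README commands above.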