From 5056095df6f30979a71cfd4aa9a379b78fbd1faf Mon Sep 17 00:00:00 2001
From: Yu Chin Fabian Lim
Date: Wed, 13 Mar 2024 16:58:07 +0800
Subject: [PATCH] update README, replaced .json with accelerate.yaml

Signed-off-by: Yu Chin Fabian Lim
---
 README.md                              | 26 ++++++++-----
 fixtures/accelerate_fsdp_defaults.yaml | 52 ++++++++++++++++++++++++++
 tuning/config/fsdp_config.json         | 12 ------
 3 files changed, 68 insertions(+), 22 deletions(-)
 create mode 100644 fixtures/accelerate_fsdp_defaults.yaml
 delete mode 100644 tuning/config/fsdp_config.json

diff --git a/README.md b/README.md
index d839a132e..363633b77 100644
--- a/README.md
+++ b/README.md
@@ -83,15 +83,26 @@ python tuning/sft_trainer.py \
 ```
 
 ### Multiple GPUs with FSDP
+
+The recommendation is to use [Hugging Face accelerate](https://huggingface.co/docs/accelerate/en/index) to launch multi-GPU jobs, in particular when using FSDP:
+- `accelerate` is written on top of [`torch.distributed.run`](https://github.com/pytorch/pytorch/blob/main/torch/distributed/run.py).
+- the `accelerate launch` CLI is highly similar to `torchrun`; it spawns multiple jobs (one for each GPU).
+- it is tightly integrated with the [Hugging Face Trainer](https://github.com/huggingface/transformers/blob/main/src/transformers/trainer.py).
+
+The `accelerate launch` CLI should be run with specific command line arguments; see the example below. Default arguments can be supplied by passing in a
+`--config_file` argument; see the [reference docs](https://huggingface.co/docs/accelerate/en/package_reference/cli#accelerate-launch) and [fixtures/accelerate_fsdp_defaults.yaml](./fixtures/accelerate_fsdp_defaults.yaml) for sample defaults.
+
+
 ```bash
-torchrun \
---nnodes=1 \
---nproc_per_node=8 \
---master_port=1234 \
+accelerate launch \
+--config_file fixtures/accelerate_fsdp_defaults.yaml \
+--num_machines=1 \
+--num_processes=8 \
+--main_process_port=1234 \
 tuning/sft_trainer.py \
 --model_name_or_path $MODEL_PATH \
 --data_path $DATA_PATH \
---bf16 True \
+--torch_dtype bfloat16 \
 --output_dir $OUTPUT_PATH \
 --num_train_epochs 5 \
 --per_device_train_batch_size 4 \
@@ -104,17 +115,12 @@ tuning/sft_trainer.py \
 --warmup_ratio 0.03 \
 --lr_scheduler_type "cosine" \
 --logging_steps 1 \
---fsdp "full_shard auto_wrap" \
---fsdp_config tuning/config/fsdp_config.json \
 --include_tokens_per_second \
 --packing False \
 --response_template "\n### Response:" \
 --dataset_text_field "output"
 ```
 
-
-For `GPTBigCode` models, Hugging Face has enabled Flash v2 and one can simply replace the `'LlamaDecoderLayer'` with `'GPTBigCodeBlock'` in `tuning/config/fsdp_config.json` for proper sharding of the model.
-
 ### LoRA Tuning Example
 
 ```bash
diff --git a/fixtures/accelerate_fsdp_defaults.yaml b/fixtures/accelerate_fsdp_defaults.yaml
new file mode 100644
index 000000000..5174923b2
--- /dev/null
+++ b/fixtures/accelerate_fsdp_defaults.yaml
@@ -0,0 +1,52 @@
+# type of compute environment, no need to change
+compute_environment: LOCAL_MACHINE # AMAZON_SAGEMAKER
+
+# use FSDP distributed compute
+distributed_type: FSDP
+
+# FSDP specific configurations
+fsdp_config:
+
+  # use this for training transformers
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+
+  # this controls the FSDP pipelining
+  fsdp_backward_prefetch_policy: BACKWARD_PRE # set to BACKWARD_PRE for the most time-efficient pipeline,
+                                              # but it requires the most memory. BACKWARD_POST is the less
+                                              # memory-intensive option
+
+  # setting this to true will increase forward memory by prefetching the next FSDP all-gather while performing
+  # the current forward pass.
+  fsdp_forward_prefetch: false
+
+  # setting this to true will offload model and optimizer parameters to the CPU, to save GPU memory at a significant
+  # increase in CPU time.
+  fsdp_offload_params: false
+
+  fsdp_sharding_strategy: 1 # set to FULL_SHARD (1), SHARD_GRAD_OP (2),
+                            # 3 is NO_SHARD, effectively disabling FSDP
+                            # 4, 5 are HYBRID_ modes for multi-node training only.
+
+  fsdp_state_dict_type: FULL_STATE_DICT # set to FULL_STATE_DICT (1), SHARDED_STATE_DICT (3)
+                                        # 2 is LOCAL_STATE_DICT where parameters are still flattened
+                                        # 3 is efficient, but requires know-how to use the sharded checkpoint.
+
+  fsdp_cpu_ram_efficient_loading: true # for large models set to true, model loaded on single process
+  fsdp_sync_module_states: true # for large models set to true, model loaded on single process
+
+  # not needed for HF models that have ._no_split_modules
+  # the example below is for GPTBigCode
+  # fsdp_transformer_layer_cls_to_wrap: "GPTBigCodeBlock"
+
+# for "autocast" mixed precision training, where the weights of the model are kept at a higher precision, but the
+# learning products (e.g., gradients, model parameters) are kept at a lower precision. Default is 'no'. Other options
+# would be fp16, bf16, etc.
+mixed_precision: 'no'
+
+machine_rank: 0 # rank of the machine where accelerate is launched
+num_machines: 1
+num_processes: 1 # default, override with --num_processes
+
+# the rendezvous method to use in distributed training. Other option is c10d
+rdzv_backend: static
+same_network: true
\ No newline at end of file
diff --git a/tuning/config/fsdp_config.json b/tuning/config/fsdp_config.json
deleted file mode 100644
index cb96df45d..000000000
--- a/tuning/config/fsdp_config.json
+++ /dev/null
@@ -1,12 +0,0 @@
-{
-    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
-    "fsdp_backward_prefetch_policy": "BACKWARD_PRE",
-    "fsdp_cpu_ram_efficient_loading": "False",
-    "fsdp_forward_prefetch": "True",
-    "fsdp_offload_params": "False",
-    "fsdp_state_dict_type": "SHARDED_STATE_DICT",
-    "fsdp_sync_module_states": "False",
-    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
-    "fsdp_use_orig_params": "True",
-    "activation_checkpointing": "True"
-}
\ No newline at end of file
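
A minimal usage sketch of the new defaults file, assuming the same `MODEL_PATH`, `DATA_PATH` and `OUTPUT_PATH` variables as the existing README examples; the two-process count, the port number and the reduced flag set are illustrative placeholders rather than values taken from this patch:

```bash
# Launch the fine-tuning entry point with the shipped FSDP defaults,
# overriding only the process count and main-process port on the command line;
# everything else falls back to fixtures/accelerate_fsdp_defaults.yaml.
accelerate launch \
--config_file fixtures/accelerate_fsdp_defaults.yaml \
--num_processes=2 \
--main_process_port=29500 \
tuning/sft_trainer.py \
--model_name_or_path $MODEL_PATH \
--data_path $DATA_PATH \
--torch_dtype bfloat16 \
--output_dir $OUTPUT_PATH \
--num_train_epochs 5 \
--per_device_train_batch_size 4 \
--response_template "\n### Response:" \
--dataset_text_field "output"
```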