diff --git a/README.md b/README.md
index 301f1842e..3e0be3095 100644
--- a/README.md
+++ b/README.md
@@ -88,19 +88,35 @@ python tuning/sft_trainer.py \
 
 ### Multiple GPUs with FSDP
 
+We recommend using [huggingface accelerate](https://huggingface.co/docs/accelerate/en/index) to launch multi-GPU jobs, in particular when using FSDP:
+- `accelerate` is written on top of [`torch.distributed.run`](https://github.com/pytorch/pytorch/blob/main/torch/distributed/run.py).
+- the `accelerate launch` CLI is highly similar to `torchrun`; it spawns multiple jobs (one for each GPU).
+- `accelerate` is tightly integrated with the [huggingface Trainer](https://github.com/huggingface/transformers/blob/main/src/transformers/trainer.py).
+
+`accelerate launch` is run with specific command line arguments; see the example below. Default arguments are handled by passing in a
+`--config_file` argument; see the [reference docs](https://huggingface.co/docs/accelerate/en/package_reference/cli#accelerate-launch) and [fixtures/accelerate_fsdp_defaults.yaml](./fixtures/accelerate_fsdp_defaults.yaml) for sample defaults.
+
 ```bash
 MODEL_PATH=llama-7b-hf # Huggingface model id or path to a checkpoint
 DATA_PATH=twitter_complaints.json # Path to the dataset
 OUTPUT_PATH=out # Path to the output folder where the checkpoints are saved
+
+# MASTER_PORT and MASTER_ADDR are essential for multi-node training and
+# not needed for multi-GPU training on a single node
 MASTER_PORT=1234 # The port at which the process with rank 0 listens to
 MASTER_ADDR=x.x.x.x # The IP addresss of the node with rank 0
-accelerate launch --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \
---config_file config/accelerate_fsdp_llama_2_procs.yaml \
+
+accelerate launch \
+--main_process_ip $MASTER_ADDR \
+--main_process_port $MASTER_PORT \
+--config_file fixtures/accelerate_fsdp_defaults.yaml \
+--num_machines=1 \
+--num_processes=8 \
 tuning/sft_trainer.py \
 --model_name_or_path $MODEL_PATH \
---training_data_path $DATA_PATH \
---bf16 True \
+--data_path $DATA_PATH \
+--torch_dtype bfloat16 \
 --output_dir $OUTPUT_PATH \
 --num_train_epochs 5 \
 --per_device_train_batch_size 4 \
diff --git a/config/accelerate_fsdp_llama_2_procs.yaml b/config/accelerate_fsdp_llama_2_procs.yaml
deleted file mode 100644
index 170c9d28f..000000000
--- a/config/accelerate_fsdp_llama_2_procs.yaml
+++ /dev/null
@@ -1,27 +0,0 @@
-# options that can be used with accelerate config are neatly documented here -
-# https://github.com/huggingface/accelerate/blob/ee163b66fb7848892519e804688cb4ae981aacbe/docs/source/package_reference/cli.md
-
-compute_environment: LOCAL_MACHINE
-debug: false
-distributed_type: FSDP
-fsdp_config:
-  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-  fsdp_backward_prefetch_policy: BACKWARD_PRE
-  fsdp_forward_prefetch: true
-  fsdp_cpu_ram_efficient_loading: true
-  fsdp_offload_params: false
-  fsdp_sharding_strategy: FULL_SHARD
-  fsdp_state_dict_type: SHARDED_STATE_DICT
-  fsdp_sync_module_states: true
-  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
-  fsdp_use_orig_params: true
-machine_rank: 0
-main_training_function: main
-num_machines: 1
-num_processes: 2
-rdzv_backend: static
-same_network: true
-tpu_env: []
-tpu_use_cluster: false
-tpu_use_sudo: false
-use_cpu: false
\ No newline at end of file
diff --git a/examples/prompt_tuning_twitter_complaints/README.md b/examples/prompt_tuning_twitter_complaints/README.md
index 221850086..4e49eb8a2 100644
--- a/examples/prompt_tuning_twitter_complaints/README.md
+++ b/examples/prompt_tuning_twitter_complaints/README.md
@@ -34,16 +34,22 @@ We will switch our PEFT method from LORA to Prompt Tuning (pt)
 MODEL_PATH=llama-7b-hf
 DATA_PATH=twitter_complaints.json
 OUTPUT_PATH=out
+
+# MASTER_PORT and MASTER_ADDR are essential for multi-node training and
+# not needed for multi-GPU training on a single node
 MASTER_PORT=1234 # The port at which the process with rank 0 listens to
 MASTER_ADDR=x.x.x.x # The IP addresss of the node with rank 0
-accelerate launch --main_process_ip $MASTER_ADDR --main_process_port $MASTER_PORT \
---config_file config/accelerate_fsdp_llama_2_procs.yaml \
+accelerate launch \
+--main_process_ip $MASTER_ADDR \
+--main_process_port $MASTER_PORT \
+--config_file fixtures/accelerate_fsdp_defaults.yaml \
 tuning/sft_trainer.py \
 --model_name_or_path $MODEL_PATH \
 --training_data_path $DATA_PATH \
 --output_dir $OUTPUT_PATH \
 --peft_method pt \
+--torch_dtype bfloat16 \
 --tokenizer_name_or_path $MODEL_PATH \
 --num_train_epochs 5 \
 --per_device_train_batch_size 1 \
diff --git a/fixtures/accelerate_fsdp_defaults.yaml b/fixtures/accelerate_fsdp_defaults.yaml
new file mode 100644
index 000000000..f70d74faa
--- /dev/null
+++ b/fixtures/accelerate_fsdp_defaults.yaml
@@ -0,0 +1,60 @@
+# options that can be used with accelerate config are neatly documented here -
+# https://github.com/huggingface/accelerate/blob/ee163b66fb7848892519e804688cb4ae981aacbe/docs/source/package_reference/cli.md
+
+# type of compute environment, no need to change
+compute_environment: LOCAL_MACHINE # AMAZON_SAGEMAKER
+
+# use FSDP distributed compute
+distributed_type: FSDP
+
+# FSDP specific configurations
+fsdp_config:
+
+    # use this for training transformers
+    fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+
+    # this controls the FSDP pipelining
+    fsdp_backward_prefetch_policy: BACKWARD_PRE # set to BACKWARD_PRE for the most time-efficient pipeline,
+                                                # but it requires the most memory. BACKWARD_POST is the less
+                                                # memory-intensive option
+
+    # setting this to true will increase forward memory by prefetching the next FSDP all-gather while performing
+    # the current forward pass.
+    fsdp_forward_prefetch: false
+
+    # setting this to true will offload model and optimizer parameters to the CPU, saving GPU memory at a significant
+    # cost in CPU time.
+    fsdp_offload_params: false
+
+    fsdp_sharding_strategy: 1 # set to FULL_SHARD (1) or SHARD_GRAD_OP (2);
+                              # 3 is NO_SHARD, effectively disabling FSDP;
+                              # 4 and 5 are HYBRID_ modes for multi-node training only.
+
+    fsdp_state_dict_type: FULL_STATE_DICT # set to FULL_STATE_DICT (1) or SHARDED_STATE_DICT (3);
+                                          # 2 is LOCAL_STATE_DICT, where parameters are still flattened;
+                                          # 3 is efficient, but requires know-how to use the sharded checkpoint.
+
+    fsdp_cpu_ram_efficient_loading: true # for large models set to true, so the model is loaded on a single process
+    fsdp_sync_module_states: true # for large models set to true, so weights are synced from the single loading process
+
+    # not needed for HF models that define _no_split_modules;
+    # the example below is for GPTBigCode
+    # fsdp_transformer_layer_cls_to_wrap: "GPTBigCodeBlock"
+
+# for "autocast" mixed precision training, where the weights of the model are kept at higher precision, but the
+# forward and backward computations (e.g., activations and gradients) are performed at lower precision. Default is 'no'. Other options
+# would be fp16, bf16, etc.
+mixed_precision: 'no'
+
+machine_rank: 0 # rank of the machine where accelerate is launched
+num_machines: 1
+num_processes: 1 # default, override with --num_processes
+
+# the rendezvous method to use in distributed training. Other option is c10d
+rdzv_backend: static
+same_network: true
+
+# below arguments are required when training in multi-node setup
+# for multi-gpu single node, the below values default to
+# main_process_ip: 127.0.0.1 # override with --main_process_ip
+# main_process_port: 29500 # override with --main_process_port
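
For reference, a minimal sketch (not part of the diff above) of how the new defaults file might be used for a multi-node launch, as suggested by the main_process_ip / main_process_port comments in the yaml: the two-node, 8-GPU-per-node topology is an assumption, $MASTER_ADDR / $MASTER_PORT are the variables defined in the README example, and the trailing placeholder stands for the sft_trainer.py arguments shown there.

```bash
# Sketch of a two-node launch (assumed topology: 2 nodes x 8 GPUs = 16 processes).
# Run the same command on each node, changing only --machine_rank (0 on the main
# node, 1 on the second). MASTER_ADDR and MASTER_PORT are set as in the README.
accelerate launch \
--config_file fixtures/accelerate_fsdp_defaults.yaml \
--num_machines=2 \
--num_processes=16 \
--machine_rank=0 \
--main_process_ip $MASTER_ADDR \
--main_process_port $MASTER_PORT \
tuning/sft_trainer.py <training args as in the README example>
```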