From 813b323dcf3525d3cc5cbf241f03491f2a9790df Mon Sep 17 00:00:00 2001
From: aresnow
Date: Fri, 21 Jul 2023 19:02:55 +0800
Subject: [PATCH 1/2] Set ddp_find_unused_parameters to False when using ddp

---
 qlora.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/qlora.py b/qlora.py
index 45cbe889..2d4885a7 100644
--- a/qlora.py
+++ b/qlora.py
@@ -675,7 +675,12 @@ def train():
     set_seed(args.seed)
 
     data_module = make_data_module(tokenizer=tokenizer, args=args)
-
+
+    # When using distributed training, the value of the flag find_unused_parameters passed to
+    # DistributedDataParallel. Will default to False if gradient checkpointing is used, True otherwise.
+    if os.environ.get('LOCAL_RANK') is not None:
+        training_args.ddp_find_unused_parameters = False
+
     trainer = Seq2SeqTrainer(
         model=model,
         tokenizer=tokenizer,

From dcb4c14ba71bdc2aea76c28d8f39c0392595f5b0 Mon Sep 17 00:00:00 2001
From: aresnow
Date: Fri, 21 Jul 2023 22:06:14 +0800
Subject: [PATCH 2/2] Fix

---
 qlora.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/qlora.py b/qlora.py
index 2d4885a7..027a3378 100644
--- a/qlora.py
+++ b/qlora.py
@@ -679,7 +679,10 @@ def train():
     # When using distributed training, the value of the flag find_unused_parameters passed to
     # DistributedDataParallel. Will default to False if gradient checkpointing is used, True otherwise.
     if os.environ.get('LOCAL_RANK') is not None:
-        training_args.ddp_find_unused_parameters = False
+        if training_args.gradient_checkpointing:
+            training_args.ddp_find_unused_parameters = False
+        else:
+            training_args.ddp_find_unused_parameters = True
 
     trainer = Seq2SeqTrainer(
         model=model,
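
Note on the net effect of the two patches: when a distributed launch is detected (the LOCAL_RANK
environment variable is set by torchrun / torch.distributed.launch for each worker),
ddp_find_unused_parameters is set to the opposite of gradient_checkpointing, matching the default
behaviour the copied comment describes. A minimal standalone sketch of that logic follows; the
helper name set_ddp_find_unused_parameters and the SimpleNamespace stand-in are illustrative only,
not part of qlora.py, where training_args is the parsed Seq2SeqTrainingArguments instance.

    import os
    from types import SimpleNamespace

    def set_ddp_find_unused_parameters(training_args):
        """Sketch of the patched logic in train().

        Equivalent to the if/else added in PATCH 2/2: under DDP the flag is
        simply the negation of gradient_checkpointing.
        """
        # LOCAL_RANK is exported per worker by the distributed launcher, so its
        # presence is used as a proxy for "we are running under DDP".
        if os.environ.get("LOCAL_RANK") is not None:
            training_args.ddp_find_unused_parameters = not training_args.gradient_checkpointing
        return training_args

    # Hypothetical usage with a stand-in object instead of real TrainingArguments:
    args = SimpleNamespace(gradient_checkpointing=True, ddp_find_unused_parameters=None)
    set_ddp_find_unused_parameters(args)
    # If LOCAL_RANK is set, ddp_find_unused_parameters is now False; otherwise it is untouched.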