From 813b323dcf3525d3cc5cbf241f03491f2a9790df Mon Sep 17 00:00:00 2001
From: aresnow
Date: Fri, 21 Jul 2023 19:02:55 +0800
Subject: [PATCH 1/2] Set ddp_find_unused_parameters to False when using ddp

---
 qlora.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/qlora.py b/qlora.py
index 45cbe889..2d4885a7 100644
--- a/qlora.py
+++ b/qlora.py
@@ -675,7 +675,12 @@ def train():
     set_seed(args.seed)
 
     data_module = make_data_module(tokenizer=tokenizer, args=args)
-
+
+    # When using distributed training, the value of the flag find_unused_parameters passed to
+    # DistributedDataParallel. Will default to False if gradient checkpointing is used, True otherwise.
+    if os.environ.get('LOCAL_RANK') is not None:
+        training_args.ddp_find_unused_parameters = False
+
     trainer = Seq2SeqTrainer(
         model=model,
         tokenizer=tokenizer,

From dcb4c14ba71bdc2aea76c28d8f39c0392595f5b0 Mon Sep 17 00:00:00 2001
From: aresnow
Date: Fri, 21 Jul 2023 22:06:14 +0800
Subject: [PATCH 2/2] Fix

---
 qlora.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/qlora.py b/qlora.py
index 2d4885a7..027a3378 100644
--- a/qlora.py
+++ b/qlora.py
@@ -679,7 +679,10 @@ def train():
     # When using distributed training, the value of the flag find_unused_parameters passed to
     # DistributedDataParallel. Will default to False if gradient checkpointing is used, True otherwise.
     if os.environ.get('LOCAL_RANK') is not None:
-        training_args.ddp_find_unused_parameters = False
+        if training_args.gradient_checkpointing:
+            training_args.ddp_find_unused_parameters = False
+        else:
+            training_args.ddp_find_unused_parameters = True
 
     trainer = Seq2SeqTrainer(
         model=model,
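
Note on the net effect of the two patches: when a distributed launch is detected (the LOCAL_RANK
environment variable is set by torchrun / torch.distributed.launch for each worker),
ddp_find_unused_parameters is set to the opposite of gradient_checkpointing, matching the default
behaviour the copied comment describes. A minimal standalone sketch of that logic follows; the
helper name set_ddp_find_unused_parameters and the SimpleNamespace stand-in are illustrative only,
not part of qlora.py, where training_args is the parsed Seq2SeqTrainingArguments instance.

    import os
    from types import SimpleNamespace

    def set_ddp_find_unused_parameters(training_args):
        """Sketch of the patched logic in train().

        Equivalent to the if/else added in PATCH 2/2: under DDP the flag is
        simply the negation of gradient_checkpointing.
        """
        # LOCAL_RANK is exported per worker by the distributed launcher, so its
        # presence is used as a proxy for "we are running under DDP".
        if os.environ.get("LOCAL_RANK") is not None:
            training_args.ddp_find_unused_parameters = not training_args.gradient_checkpointing
        return training_args

    # Hypothetical usage with a stand-in object instead of real TrainingArguments:
    args = SimpleNamespace(gradient_checkpointing=True, ddp_find_unused_parameters=None)
    set_ddp_find_unused_parameters(args)
    # If LOCAL_RANK is set, ddp_find_unused_parameters is now False; otherwise it is untouched.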