From 313e1bd0de2b44aaa71797464f1e8b6a041a6f18 Mon Sep 17 00:00:00 2001
From: Marc Sun <57196510+SunMarc@users.noreply.github.com>
Date: Wed, 25 Oct 2023 15:27:15 +0200
Subject: [PATCH] disable_exllamav2_for_quantization (#1482)

disable_exllamav2
---
 optimum/gptq/quantizer.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py
index c4b3cb46a73..1d10697a97a 100644
--- a/optimum/gptq/quantizer.py
+++ b/optimum/gptq/quantizer.py
@@ -457,11 +457,6 @@ def tmp(_, input, output):
                     "Found modules on cpu/disk. Using Exllama backend requires all the modules to be on GPU. Setting `disable_exllama=True`"
                 )
                 self.disable_exllama = True
-            if not self.disable_exllamav2:
-                logger.warning(
-                    "Found modules on cpu/disk. Using Exllamav2 backend requires all the modules to be on GPU. Setting `disable_exllamav2=True`"
-                )
-                self.disable_exllamav2 = True
         # act order and exllama
         elif self.desc_act and not self.disable_exllama:
             logger.warning(
@@ -469,6 +464,12 @@ def tmp(_, input, output):
                 "Setting `disable_exllama=True`. You should only use Exllama backend with act_order for inference. "
             )
             self.disable_exllama = True
+        elif not self.disable_exllamav2:
+            logger.warning(
+                "Using Exllamav2 backend will reorder the weights offline, thus you will not be able to save the model with the right weights."
+                "Setting `disable_exllamav2=True`. You should only use Exllamav2 backend for inference. "
+            )
+            self.disable_exllamav2 = True
 
         # Step 4: Pack the model at the end (Replacing the layers)
         self.pack_model(model=model, quantizers=quantizers)
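
Note for context (not part of the patch): the change moves the exllamav2 fallback out of the cpu/disk branch and into the if/elif chain, so during quantization the v2 backend is switched off before pack_model runs whenever the earlier device and act-order fallbacks do not fire; per the warning string, exllamav2 reorders weights offline, so a model quantized through it could not be saved with the right weights. Below is a minimal, runnable sketch of the post-patch selection logic; BackendFlags and resolve_backends are hypothetical names used for illustration, not optimum APIs.

import logging
from dataclasses import dataclass

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)


@dataclass
class BackendFlags:
    # Hypothetical stand-in for the GPTQQuantizer attributes the patch touches.
    disable_exllama: bool = False
    disable_exllamav2: bool = False
    desc_act: bool = False


def resolve_backends(flags: BackendFlags, all_modules_on_gpu: bool) -> BackendFlags:
    # Mirrors the post-patch if/elif chain: at most one fallback fires per call.
    if not all_modules_on_gpu:
        # Exllama v1 requires every module on GPU.
        if not flags.disable_exllama:
            logger.warning("Modules on cpu/disk: setting disable_exllama=True")
            flags.disable_exllama = True
    elif flags.desc_act and not flags.disable_exllama:
        # act_order (desc_act) should only be combined with exllama at inference.
        logger.warning("desc_act with exllama: setting disable_exllama=True")
        flags.disable_exllama = True
    elif not flags.disable_exllamav2:
        # Exllamav2 reorders weights offline, so it must stay off while
        # quantizing/packing or the saved checkpoint would have wrong weights.
        logger.warning("Quantization path: setting disable_exllamav2=True")
        flags.disable_exllamav2 = True
    return flags


# Example: everything on GPU, no desc_act -> only exllamav2 gets disabled,
# which is exactly the new behavior this patch introduces for quantization.
print(resolve_backends(BackendFlags(), all_modules_on_gpu=True))

Design-wise, folding the v2 check into the elif chain (rather than keeping it inside the cpu/disk branch) makes the fallbacks mutually exclusive and guarantees the quantization path never packs weights that a v2 kernel has already reordered.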