From 313e1bd0de2b44aaa71797464f1e8b6a041a6f18 Mon Sep 17 00:00:00 2001
From: Marc Sun <57196510+SunMarc@users.noreply.github.com>
Date: Wed, 25 Oct 2023 15:27:15 +0200
Subject: [PATCH] disable_exllamav2_for_quantization (#1482)

disable_exllamav2
---
 optimum/gptq/quantizer.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py
index c4b3cb46a73..1d10697a97a 100644
--- a/optimum/gptq/quantizer.py
+++ b/optimum/gptq/quantizer.py
@@ -457,11 +457,6 @@ def tmp(_, input, output):
                     "Found modules on cpu/disk. Using Exllama backend requires all the modules to be on GPU. Setting `disable_exllama=True`"
                 )
                 self.disable_exllama = True
-            if not self.disable_exllamav2:
-                logger.warning(
-                    "Found modules on cpu/disk. Using Exllamav2 backend requires all the modules to be on GPU. Setting `disable_exllamav2=True`"
-                )
-                self.disable_exllamav2 = True
         # act order and exllama
         elif self.desc_act and not self.disable_exllama:
             logger.warning(
@@ -469,6 +464,12 @@ def tmp(_, input, output):
                 "Setting `disable_exllama=True`. You should only use Exllama backend with act_order for inference. "
             )
             self.disable_exllama = True
+        elif not self.disable_exllamav2:
+            logger.warning(
+                "Using Exllamav2 backend will reorder the weights offline, thus you will not be able to save the model with the right weights."
+                "Setting `disable_exllamav2=True`. You should only use Exllamav2 backend for inference. "
+            )
+            self.disable_exllamav2 = True
 
         # Step 4: Pack the model at the end (Replacing the layers)
         self.pack_model(model=model, quantizers=quantizers)
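
Note for context (not part of the patch): the change moves the exllamav2 fallback out of the cpu/disk branch and into the if/elif chain, so during quantization the v2 backend is switched off before pack_model runs whenever the earlier device and act-order fallbacks do not fire; per the warning string, exllamav2 reorders weights offline, so a model quantized through it could not be saved with the right weights. Below is a minimal, runnable sketch of the post-patch selection logic; BackendFlags and resolve_backends are hypothetical names used for illustration, not optimum APIs.

import logging
from dataclasses import dataclass

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)


@dataclass
class BackendFlags:
    # Hypothetical stand-in for the GPTQQuantizer attributes the patch touches.
    disable_exllama: bool = False
    disable_exllamav2: bool = False
    desc_act: bool = False


def resolve_backends(flags: BackendFlags, all_modules_on_gpu: bool) -> BackendFlags:
    # Mirrors the post-patch if/elif chain: at most one fallback fires per call.
    if not all_modules_on_gpu:
        # Exllama v1 requires every module on GPU.
        if not flags.disable_exllama:
            logger.warning("Modules on cpu/disk: setting disable_exllama=True")
            flags.disable_exllama = True
    elif flags.desc_act and not flags.disable_exllama:
        # act_order (desc_act) should only be combined with exllama at inference.
        logger.warning("desc_act with exllama: setting disable_exllama=True")
        flags.disable_exllama = True
    elif not flags.disable_exllamav2:
        # Exllamav2 reorders weights offline, so it must stay off while
        # quantizing/packing or the saved checkpoint would have wrong weights.
        logger.warning("Quantization path: setting disable_exllamav2=True")
        flags.disable_exllamav2 = True
    return flags


# Example: everything on GPU, no desc_act -> only exllamav2 gets disabled,
# which is exactly the new behavior this patch introduces for quantization.
print(resolve_backends(BackendFlags(), all_modules_on_gpu=True))

Design-wise, folding the v2 check into the elif chain (rather than keeping it inside the cpu/disk branch) makes the fallbacks mutually exclusive and guarantees the quantization path never packs weights that a v2 kernel has already reordered.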