diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py
index 844da3e315..cbfb5dca07 100644
--- a/optimum/gptq/quantizer.py
+++ b/optimum/gptq/quantizer.py
@@ -220,7 +220,7 @@ def __init__(
                 )
         self.exllama_version = self.exllama_config["version"]
 
-    def select_quant_linear(self, device_map: Union[str, dict]):
+    def select_quant_linear(self, device_map: Union[str, dict], pack: bool = False):
         if is_gptqmodel_available():
             self.quant_linear = hf_select_quant_linear(
                 bits=self.bits,
@@ -231,6 +231,7 @@ def select_quant_linear(self, device_map: Union[str, dict]):
                 meta=self.meta,
                 device_map=device_map,
                 backend=self.backend,
+                pack=pack,
             )
         else:
             self.quant_linear = hf_select_quant_linear(
@@ -301,7 +302,7 @@ def convert_model(self, model: nn.Module, **kwargs):
                 )
                 del layers_to_be_replaced[name]
 
-        self.select_quant_linear(device_map=kwargs.get("device_map", None))
+        self.select_quant_linear(device_map=kwargs.get("device_map", None), pack=False)
 
         self._replace_by_quant_layers(model, layers_to_be_replaced)
 
@@ -761,7 +762,7 @@ def pack_model(
         layers = get_layers(model)
         layers = {n: layers[n] for n in quantizers}
 
-        self.select_quant_linear(device_map=model.hf_device_map)
+        self.select_quant_linear(device_map=model.hf_device_map, pack=True)
 
         self._replace_by_quant_layers(model, quantizers)
         qlayers = get_layers(model, [self.quant_linear])
diff --git a/optimum/utils/import_utils.py b/optimum/utils/import_utils.py
index d0f4c85db2..7174084a45 100644
--- a/optimum/utils/import_utils.py
+++ b/optimum/utils/import_utils.py
@@ -52,7 +52,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[
 TRANSFORMERS_MINIMUM_VERSION = version.parse("4.25.0")
 DIFFUSERS_MINIMUM_VERSION = version.parse("0.22.0")
 AUTOGPTQ_MINIMUM_VERSION = version.parse("0.4.99")  # Allows 0.5.0.dev0
-GPTQMODEL_MINIMUM_VERSION = version.parse("1.4.2")
+GPTQMODEL_MINIMUM_VERSION = version.parse("1.6.0")
 
 # This is the minimal required version to support some ONNX Runtime features
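
For context: the new `pack` flag lets the gptqmodel branch of `hf_select_quant_linear` distinguish the two call sites above. `convert_model` only needs inference-capable quantized layers (`pack=False`), while `pack_model` needs an implementation that can pack freshly quantized weights (`pack=True`), so the selector may return a different QuantLinear class for each. The bump of GPTQMODEL_MINIMUM_VERSION to 1.6.0 presumably tracks the first gptqmodel release whose selector accepts the `pack` keyword. Below is a minimal sketch of that dispatch pattern; `select_linear_impl`, `InferenceLinear`, and `PackableLinear` are hypothetical stand-ins for illustration, not gptqmodel's actual API.

# Hypothetical sketch of pack-aware layer selection; the names below are
# illustrative stand-ins, not gptqmodel's real classes.
import torch
import torch.nn as nn


class InferenceLinear(nn.Module):
    """Backend-specific quantized layer that only needs to run forward passes."""


class PackableLinear(InferenceLinear):
    """Layer that can additionally pack float weights into quantized storage."""

    def pack(self, linear: nn.Linear, scales: torch.Tensor, zeros: torch.Tensor):
        ...  # convert fp weights into the packed int format used at inference


def select_linear_impl(pack: bool = False) -> type:
    # Mirrors the intent of the diff: packing requires a class with a `pack()`
    # method, while pure inference may use any (possibly faster) backend.
    return PackableLinear if pack else InferenceLinear


# convert_model(): empty quantized layers are created and then loaded from a
# checkpoint, so no packing support is needed.
inference_cls = select_linear_impl(pack=False)

# pack_model(): freshly quantized weights must be packed into the layers.
packing_cls = select_linear_impl(pack=True)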