diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py
index a8cee79ab..08539135c 100644
--- a/gptqmodel/models/base.py
+++ b/gptqmodel/models/base.py
@@ -625,7 +625,7 @@ def store_lm_head_input_hook(_, args, kwargs):
                     sym=sym,
                     mse=mse,
                 )
-
+                print("gptq[name]", gptq[name])
             for name in skipped_modules:
                 subset.pop(name)
 
@@ -670,11 +670,11 @@ def tmp(_, inp: Tuple[torch.Tensor, ...], out: torch.Tensor):
                 if layer.reuse_kv:
                     additional_layer_inputs["kv_last_layer"] = shared_kv_cache_dict.get(i - 1)
 
-                layer_output = layer(*layer_input) if self.quantize_config.lm_head else layer(*layer_input, **additional_layer_inputs)
+                layer_output = layer(*layer_input) if is_lm_head else layer(*layer_input, **additional_layer_inputs)
                 if shared_kv_cache_dict.get(i) is None:
                     shared_kv_cache_dict[i] = layer_output[-1]
             else:
-                layer(*layer_input) if self.quantize_config.lm_head else layer(*layer_input, **additional_layer_inputs)
+                layer(*layer_input) if is_lm_head else layer(*layer_input, **additional_layer_inputs)
 
             del layer_input
             del additional_layer_inputs
@@ -731,7 +731,7 @@ def tmp(_, inp: Tuple[torch.Tensor, ...], out: torch.Tensor):
                 self.quant_log.append(stat)
                 logger.info(stat)
 
-                quantizers[f"{self.layers_node}.{i}.{name}"] = (
+                quantizers[self.lm_head if is_lm_head else f"{self.layers_node}.{i}.{name}"] = (
                     gptq[name].quantizer.to(CPU),
                     move_to(scale, CPU),
                     move_to(zero, CPU),
@@ -760,7 +760,7 @@ def tmp(_, inp: Tuple[torch.Tensor, ...], out: torch.Tensor):
 
             with torch.no_grad():
                 layer_output = move_to(
-                    layer(*layer_input)[0] if self.quantize_config.lm_head else layer(*layer_input, **additional_layer_inputs)[0],
+                    layer(*layer_input)[0] if is_lm_head else layer(*layer_input, **additional_layer_inputs)[0],
                     cur_layer_device if calibration_enable_gpu_cache else CPU,
                 )
             layer_outputs.append([layer_output])
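
Note on the pattern above (a reviewer's sketch, not part of the patch): the diff replaces repeated `self.quantize_config.lm_head` lookups at the layer call sites with the precomputed per-iteration flag `is_lm_head`, and keys the lm_head quantizer under `self.lm_head` instead of the per-decoder-layer `f"{self.layers_node}.{i}.{name}"` path. A minimal illustration of the two shapes, where `forward_layer` and `quantizer_key` are hypothetical helpers and `layer`, `layer_input`, and `additional_layer_inputs` are stand-ins assumed from the diff context:

```python
# Reviewer's sketch (hypothetical helper names, not the gptqmodel API):
# illustrates the two branch shapes this diff converges on.
from typing import Any, Dict, Sequence


def forward_layer(layer, layer_input: Sequence, additional_layer_inputs: Dict[str, Any], is_lm_head: bool):
    # The lm_head is a plain Linear head: it takes only the positional hidden
    # states, so transformer-block kwargs (attention masks, kv caches, ...)
    # must not be forwarded to it.
    if is_lm_head:
        return layer(*layer_input)
    return layer(*layer_input, **additional_layer_inputs)


def quantizer_key(is_lm_head: bool, lm_head: str, layers_node: str, i: int, name: str) -> str:
    # The lm_head quantizer is stored under its own module path (e.g. "lm_head")
    # rather than the per-layer "{layers_node}.{i}.{name}" form, so it can be
    # looked up by module name when the quantized weights are packed and saved.
    return lm_head if is_lm_head else f"{layers_node}.{i}.{name}"
```

Presumably the earlier `self.quantize_config.lm_head` reads were only correct when lm_head quantization was enabled globally; branching on `is_lm_head` ties each call site to the specific module being processed in the current iteration.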