diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py
index 39ac14ac5..3473ade4b 100644
--- a/gptqmodel/models/base.py
+++ b/gptqmodel/models/base.py
@@ -725,6 +725,9 @@ def tmp(_, inp: Tuple[torch.Tensor, ...], out: torch.Tensor):
 
             if not is_lm_head:
                 layers[i] = move_to(layer, CPU)
+            else:
+                move_to(layer, CPU)
+
             del layer
             del gptq
             del layer_inputs
@@ -732,6 +735,13 @@ def tmp(_, inp: Tuple[torch.Tensor, ...], out: torch.Tensor):
                 layer_outputs,
                 [],
             )  # TODO: is it really OK to cache only the first positional argument?
+
+            # if i == layer_count - 1:
+            #     print("saved", layer_inputs)
+            #     torch.save(layer_inputs, "lm_head_layer_inputs.pt")
+            #     layer_inputs = torch.load("lm_head_layer_inputs.pt")
+            #     print("loaded", layer_inputs)
+
             torch_empty_cache()
 
         logger.info(f"Quantization summary:\n{self.quant_log}")
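
A minimal, self-contained sketch of the device shuffle the first hunk changes. This is not the file's actual code: the `move_to` body, layer sizes, loop, and device constants are assumptions for illustration; the point is that decoder layers owned by `layers` get their CPU copy written back, while the lm_head (which lives on the model, not in `layers`) only needs to be moved off the accelerator.

import torch
import torch.nn as nn

CPU = torch.device("cpu")
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def move_to(module: nn.Module, device: torch.device) -> nn.Module:
    # assumed behavior of the move_to helper: relocate and return the module
    return module.to(device)

layers = nn.ModuleList([nn.Linear(16, 16) for _ in range(4)])
lm_head = nn.Linear(16, 100)  # quantized last; referenced outside `layers`

for i in range(len(layers) + 1):
    is_lm_head = i == len(layers)
    layer = move_to(lm_head if is_lm_head else layers[i], DEVICE)
    # ... per-layer quantization would happen here ...
    if not is_lm_head:
        # decoder layers are owned by `layers`, so store the CPU copy back
        layers[i] = move_to(layer, CPU)
    else:
        # the lm_head is held by the model itself; moving it to CPU without
        # reassignment is enough to release its accelerator memory
        move_to(layer, CPU)
    del layer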