diff --git a/tests/basic_correctness/test_cpu_offload.py b/tests/basic_correctness/test_cpu_offload.py
index 180b926637ecb..f0f09ee63c0e6 100644
--- a/tests/basic_correctness/test_cpu_offload.py
+++ b/tests/basic_correctness/test_cpu_offload.py
@@ -22,11 +22,28 @@ def test_cpu_offload_fp8():
                          ["--cpu-offload-gb", "2"])
 
 
-@pytest.mark.skipif(not is_quant_method_supported("awq"),
-                    reason="awq is not supported on this GPU type.")
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
+                    reason="gptq_marlin is not supported on this GPU type.")
+def test_cpu_offload_gptq():
+    # Test GPTQ Marlin
+    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4", [],
+                         ["--cpu-offload-gb", "1"])
+    # Test GPTQ
+    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4",
+                         ["--quantization", "gptq"],
+                         ["--quantization", "gptq", "--cpu-offload-gb", "1"])
+
+
+@pytest.mark.skipif(not is_quant_method_supported("awq_marlin"),
+                    reason="awq_marlin is not supported on this GPU type.")
 def test_cpu_offload_awq():
-    compare_two_settings("casperhansen/llama-3-8b-instruct-awq", [],
-                         ["--cpu-offload-gb", "2"])
+    # Test AWQ Marlin
+    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ", [],
+                         ["--cpu-offload-gb", "1"])
+    # Test AWQ
+    compare_two_settings("Qwen/Qwen2-1.5B-Instruct-AWQ",
+                         ["--quantization", "awq"],
+                         ["--quantization", "awq", "--cpu-offload-gb", "1"])
 
 
 @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
diff --git a/tests/utils.py b/tests/utils.py
index 666694299d397..bd431b85d2663 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -266,8 +266,9 @@ def compare_two_settings(model: str,
     arg1_results = results[:n]
     arg2_results = results[n:]
     for arg1_result, arg2_result in zip(arg1_results, arg2_results):
-        assert arg1_result == arg2_result, \
-            f"Results for {model=} are not the same with {arg1=} and {arg2=}"
+        assert arg1_result == arg2_result, (
+            f"Results for {model=} are not the same with {arg1=} and {arg2=}. "
+            f"{arg1_result=} != {arg2_result=}")
 
 
 def init_test_distributed_environment(
diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py
index 510c9dd49ef03..aa04fcf8310bf 100644
--- a/vllm/model_executor/layers/quantization/gptq.py
+++ b/vllm/model_executor/layers/quantization/gptq.py
@@ -204,13 +204,7 @@ def create_weights(
 
         layer.exllama_state = exllama_state
 
-    def apply(self,
-              layer: torch.nn.Module,
-              x: torch.Tensor,
-              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
-        qweight = layer.qweight
-        out_shape = x.shape[:-1] + (qweight.shape[-1], )
-        reshaped_x = x.reshape(-1, x.shape[-1])
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         # exllama needs to shuffle the weight after the weight is loaded
         # here we do the shuffle on first forward pass
         if layer.exllama_state == ExllamaState.UNINITIALIZED:
@@ -222,6 +216,14 @@ def apply(self,
             layer.exllama_state = ExllamaState.READY
             ops.gptq_shuffle(layer.qweight, layer.g_idx,
                              self.quant_config.weight_bits)
+
+    def apply(self,
+              layer: torch.nn.Module,
+              x: torch.Tensor,
+              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        out_shape = x.shape[:-1] + (layer.qweight.shape[-1], )
+        reshaped_x = x.reshape(-1, x.shape[-1])
+
         output = ops.gptq_gemm(reshaped_x, layer.qweight, layer.qzeros,
                                layer.scales, layer.g_idx,
                                layer.exllama_state == ExllamaState.READY,
diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py
index 4a11b14971076..066102f3a01c0 100644
--- a/vllm/model_executor/layers/quantization/gptq_marlin.py
+++ b/vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -251,7 +251,6 @@ def create_weights(
                 scales_and_zp_size,
                 output_size_per_partition // self.quant_config.pack_factor,
                 dtype=torch.int32,
-                device="meta",
             ),
             requires_grad=False,
         )