From 5dbf854553cb6ac97f0c633ed36ba64e0fc9bb29 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Mon, 30 Dec 2024 18:17:04 +0800 Subject: [PATCH] [CI/Build][CPU] Fix CPU CI by lazy importing triton FP8 kernels (#11618) Signed-off-by: jiang1.li --- vllm/model_executor/layers/quantization/fp8.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 7f779ac8d3b3e..2fe22903a385b 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -15,8 +15,6 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig, QuantizeMethodBase) from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod -from vllm.model_executor.layers.quantization.utils.fp8_utils import ( - apply_w8a8_block_fp8_linear) from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import ( apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin) from vllm.model_executor.layers.quantization.utils.quant_utils import ( @@ -337,6 +335,9 @@ def apply(self, size_k=layer.input_size_per_partition, bias=bias) + # Note: lazy import to avoid triton import error. + from vllm.model_executor.layers.quantization.utils.fp8_utils import ( + apply_w8a8_block_fp8_linear) if self.block_quant: assert self.quant_config.weight_block_size is not None return apply_w8a8_block_fp8_linear(