Skip to content

Commit

Permalink
[CI/Build][CPU] Fix CPU CI by lazy importing triton FP8 kernels (vllm-project#11618)
Browse files Browse the repository at this point in the history

Signed-off-by: jiang1.li <[email protected]>
  • Loading branch information
bigPYJ1151 authored Dec 30, 2024
1 parent 970d6d0 commit 5dbf854
Showing 1 changed file with 3 additions and 2 deletions.
5 changes: 3 additions & 2 deletions vllm/model_executor/layers/quantization/fp8.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig, QuantizeMethodBase)
from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
apply_w8a8_block_fp8_linear)
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
apply_fp8_marlin_linear, prepare_fp8_layer_for_marlin)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
Expand Down Expand Up @@ -337,6 +335,9 @@ def apply(self,
size_k=layer.input_size_per_partition,
bias=bias)

# Note: lazy import to avoid triton import error.
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
apply_w8a8_block_fp8_linear)
if self.block_quant:
assert self.quant_config.weight_block_size is not None
return apply_w8a8_block_fp8_linear(
Expand Down

0 comments on commit 5dbf854

Please sign in to comment.