diff --git a/docs/source/index.rst b/docs/source/index.rst
index ebf1361976c5e..7c545a85ac2be 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -121,6 +121,7 @@ Documentation
    quantization/auto_awq
    quantization/bnb
    quantization/gguf
+   quantization/inc
    quantization/int8
    quantization/fp8
    quantization/fp8_e5m2_kvcache
diff --git a/docs/source/quantization/inc.rst b/docs/source/quantization/inc.rst
index 76d5c662409df..ad7e21af54c40 100644
--- a/docs/source/quantization/inc.rst
+++ b/docs/source/quantization/inc.rst
@@ -4,7 +4,7 @@ FP8 INC
 =======
 
 vLLM supports FP8 (8-bit floating point) weight and activation quantization using Intel® Neural Compressor (INC) on Intel® Gaudi® 2 and Intel® Gaudi® 3 AI accelerators.
-Currently, quantization is supported only for Llama models.
+Currently, quantization is validated only for Llama models.
 
 Intel Gaudi supports quantization of various modules and functions, including, but not limited to ``Linear``, ``KVCache``, ``Matmul`` and ``Softmax``. For more information, please refer to:
 `Supported Modules\\Supported Functions\\Custom Patched Modules `_.
diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py
index 314cd98212e9c..d75f146ecf4b8 100644
--- a/vllm/platforms/hpu.py
+++ b/vllm/platforms/hpu.py
@@ -13,6 +13,7 @@ class HpuPlatform(Platform):
     device_name: str = "hpu"
     device_type: str = "hpu"
     dispatch_key: str = "HPU"
+    supported_quantization: list[str] = ["inc"]
 
     @classmethod
     def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
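
For context, here is a minimal usage sketch of the ``inc`` quantization mode that ``HpuPlatform.supported_quantization`` now advertises, using vLLM's offline ``LLM`` API. The model name, prompt, and sampling settings are placeholders, and on Gaudi an INC measurement/quantization configuration is typically supplied separately per the Intel Gaudi INC documentation; this is an illustrative assumption, not part of the diff.

```python
# Hypothetical example (not part of this diff): running a Llama model with
# quantization="inc" on an Intel Gaudi host with the HPU build of vLLM.
from vllm import LLM, SamplingParams

llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder Llama checkpoint
    quantization="inc",                        # select the INC FP8 path
)

outputs = llm.generate(
    ["The capital of France is"],
    SamplingParams(temperature=0.0, max_tokens=32),
)
print(outputs[0].outputs[0].text)
```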