Commit 2c0c3cb: fix cr comments

nirda7 committed Jan 9, 2025
1 parent f4d3c92 commit 2c0c3cb
Showing 3 changed files with 3 additions and 1 deletion.
1 change: 1 addition & 0 deletions docs/source/index.rst
@@ -121,6 +121,7 @@ Documentation
    quantization/auto_awq
    quantization/bnb
    quantization/gguf
+   quantization/inc
    quantization/int8
    quantization/fp8
    quantization/fp8_e5m2_kvcache
2 changes: 1 addition & 1 deletion docs/source/quantization/inc.rst
@@ -4,7 +4,7 @@ FP8 INC
 =======
 
 vLLM supports FP8 (8-bit floating point) weight and activation quantization using Intel® Neural Compressor (INC) on Intel® Gaudi® 2 and Intel® Gaudi® 3 AI accelerators.
-Currently, quantization is supported only for Llama models.
+Currently, quantization is validated only for Llama models.
 
 Intel Gaudi supports quantization of various modules and functions, including, but not limited to ``Linear``, ``KVCache``, ``Matmul`` and ``Softmax``. For more information, please refer to:
 `Supported Modules\\Supported Functions\\Custom Patched Modules <https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html#supported-modules>`_.
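
For context, the following is a minimal usage sketch, not part of this commit, of how the documented "inc" scheme is typically selected through vLLM's LLM entry point on Gaudi; the model name and the fp8_inc KV-cache dtype are illustrative assumptions rather than values taken from this diff.

from vllm import LLM, SamplingParams

# Assumed example: load a Llama checkpoint with INC-based FP8 quantization.
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",  # illustrative model choice
    quantization="inc",        # matches the "inc" scheme documented above
    kv_cache_dtype="fp8_inc",  # assumption: FP8 KV cache handled by INC on Gaudi
)

outputs = llm.generate(["Hello from Gaudi"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)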
1 change: 1 addition & 0 deletions vllm/platforms/hpu.py
@@ -13,6 +13,7 @@ class HpuPlatform(Platform):
     device_name: str = "hpu"
     device_type: str = "hpu"
     dispatch_key: str = "HPU"
+    supported_quantization: list[str] = ["inc"]
 
     @classmethod
     def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
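
As context for the new supported_quantization attribute, here is a hedged sketch of the kind of check such a per-platform list enables; the function name and error message are assumptions for illustration, not the actual vLLM implementation.

from typing import Optional

def verify_quantization(supported: list[str], method: Optional[str]) -> None:
    # Reject quantization schemes the current platform does not advertise.
    if method is not None and method not in supported:
        raise ValueError(
            f"{method!r} quantization is not supported on this platform; "
            f"supported schemes: {supported}")

verify_quantization(["inc"], "inc")    # passes: HPU now advertises "inc"
# verify_quantization(["inc"], "awq")  # would raise ValueError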
