GPTQ Support #421

Closed
wants to merge 362 commits
Changes from 6 commits
362 commits
29061ed
[Misc] Add an env var VLLM_LOGGING_PREFIX, if set, it will be prepend…
sfc-gh-zhwang Oct 23, 2024
831540c
[Model] Support E5-V (#9576)
DarkLight1337 Oct 23, 2024
51c24c9
[Build] Fix `FetchContent` multiple build issue (#9596)
ProExpertProg Oct 23, 2024
2394962
[Hardware][XPU] using current_platform.is_xpu (#9605)
MengqingCao Oct 23, 2024
3ff57eb
[Model] Initialize Florence-2 language backbone support (#9555)
Isotr0py Oct 23, 2024
c18e1a3
[VLM] Enable overriding whether post layernorm is used in vision enco…
DarkLight1337 Oct 23, 2024
31a08f5
[Model] Add min_pixels / max_pixels to Qwen2VL as mm_processor_kwargs…
alex-jw-brooks Oct 23, 2024
e7116c0
[Bugfix] Fix `_init_vision_model` in NVLM_D model (#9611)
DarkLight1337 Oct 23, 2024
dbdd3b5
[misc] comment to avoid future confusion about baichuan (#9620)
youkaichao Oct 23, 2024
e5ac6a4
[Bugfix] Fix divide by zero when serving Mamba models (#9617)
tlrmchlsmth Oct 23, 2024
fd0e2cf
[Misc] Separate total and output tokens in benchmark_throughput.py (#…
mgoin Oct 23, 2024
9013e24
[torch.compile] Adding torch compile annotations to some models (#9614)
CRZbulabula Oct 23, 2024
150b779
[Frontend] Enable Online Multi-image Support for MLlama (#9393)
alex-jw-brooks Oct 23, 2024
fc6c274
[Model] Add Qwen2-Audio model support (#9248)
faychu Oct 23, 2024
d1fbc94
gptq hpu support added
maktukmak Oct 23, 2024
b548d7a
[CI/Build] Add bot to close stale issues and PRs (#9436)
russellb Oct 23, 2024
bb01f29
[Bugfix][Model] Fix Mllama SDPA illegal memory access for batched mul…
mgoin Oct 24, 2024
b7df53c
[Bugfix] Use "vision_model" prefix for MllamaVisionModel (#9628)
mgoin Oct 24, 2024
33bab41
[Bugfix]: Make chat content text allow type content (#9358)
vrdn-23 Oct 24, 2024
056a68c
[XPU] avoid triton import for xpu (#9440)
yma11 Oct 24, 2024
836e8ef
[Bugfix] Fix PP for ChatGLM and Molmo (#9422)
DarkLight1337 Oct 24, 2024
3770071
[V1][Bugfix] Clean up requests when aborted (#9629)
WoosukKwon Oct 24, 2024
4fdc581
[core] simplify seq group code (#9569)
youkaichao Oct 24, 2024
8a02cd0
[torch.compile] Adding torch compile annotations to some models (#9639)
CRZbulabula Oct 24, 2024
295a061
[Kernel] add kernel for FATReLU (#9610)
jeejeelee Oct 24, 2024
ad6f780
[torch.compile] expanding support and fix allgather compilation (#9637)
CRZbulabula Oct 24, 2024
b979143
[Doc] Move additional tips/notes to the top (#9647)
DarkLight1337 Oct 24, 2024
f584549
[Bugfix]Disable the post_norm layer of the vision encoder for LLaVA m…
litianjian Oct 24, 2024
de662d3
Increase operation per run limit for "Close inactive issues and PRs" …
hmellor Oct 24, 2024
d27cfbf
[torch.compile] Adding torch compile annotations to some models (#9641)
CRZbulabula Oct 24, 2024
c866e00
[CI/Build] Fix VLM test failures when using transformers v4.46 (#9666)
DarkLight1337 Oct 24, 2024
722d46e
[Model] Compute Llava Next Max Tokens / Dummy Data From Gridpoints (#…
alex-jw-brooks Oct 24, 2024
e26d37a
[Log][Bugfix] Fix default value check for `image_url.detail` (#9663)
mgoin Oct 24, 2024
5944909
[Performance][Kernel] Fused_moe Performance Improvement (#9384)
charlifu Oct 24, 2024
c91ed47
[Bugfix] Remove xformers requirement for Pixtral (#9597)
mgoin Oct 24, 2024
9f7b4ba
[ci/Build] Skip Chameleon for transformers 4.46.0 on broadcast test #…
khluu Oct 25, 2024
a6f3721
[Model] add a lora module for granite 3.0 MoE models (#9673)
willmj Oct 25, 2024
9645b9f
[V1] Support sliding window attention (#9679)
WoosukKwon Oct 25, 2024
f603353
Update README_GAUDI about fp8 calibration procedure (#423)
afierka-intel Oct 25, 2024
a5136ec
Set vllm-hpu-extension to 341a77f (#428)
madamczykhabana Oct 25, 2024
a926d14
Create scorecard.yml
rozhukov Oct 25, 2024
5b7f685
Contiguous PA (#424)
mfylcek Oct 25, 2024
e3ae2eb
Revert "Contiguous PA" (#432)
madamczykhabana Oct 25, 2024
93609a2
Enable Dynamic MoE for Mixtral on 1.19.0 (#425)
tpawlows Oct 25, 2024
ca0d922
[Bugfix] Fix compressed_tensors_moe bad config.strategy (#9677)
mgoin Oct 25, 2024
228cfbd
[Doc] Improve quickstart documentation (#9256)
rafvasq Oct 25, 2024
6567e13
[Bugfix] Fix crash with llama 3.2 vision models and guided decoding (…
tjohnson31415 Oct 25, 2024
067e77f
[Bugfix] Steaming continuous_usage_stats default to False (#9709)
samos123 Oct 26, 2024
5cbdccd
[Hardware][openvino] is_openvino --> current_platform.is_openvino (#9…
MengqingCao Oct 26, 2024
55137e8
Fix: MI100 Support By Bypassing Custom Paged Attention (#9560)
MErkinSag Oct 26, 2024
07e981f
[Frontend] Bad words sampling parameter (#9717)
Alvant Oct 26, 2024
6650e6a
[Model] Add classification Task with Qwen2ForSequenceClassification …
kakao-kevin-us Oct 26, 2024
67a6882
[Misc] SpecDecodeWorker supports profiling (#9719)
Abatom Oct 27, 2024
8549c82
[core] cudagraph output with tensor weak reference (#9724)
youkaichao Oct 27, 2024
3cb07a3
[Misc] Upgrade to pytorch 2.5 (#9588)
bnellnm Oct 27, 2024
e130c40
Fix cache management in "Close inactive issues and PRs" actions workf…
hmellor Oct 27, 2024
34a9941
[Bugfix] Fix load config when using bools (#9533)
madt2709 Oct 27, 2024
4e2d95e
[Hardware][ROCM] using current_platform.is_rocm (#9642)
wangshuai09 Oct 28, 2024
32176fe
[torch.compile] support moe models (#9632)
youkaichao Oct 28, 2024
feb92fb
Fix beam search eos (#9627)
robertgshaw2-neuralmagic Oct 28, 2024
2adb440
[Bugfix] Fix ray instance detect issue (#9439)
yma11 Oct 28, 2024
3a55e77
Support long contexts with LoRA (#418)
SanjuCSudhakaran Oct 28, 2024
4fd5c4c
Add HPU specific changes to benchmark_latency.py (#436)
kdamaszk Oct 28, 2024
3e06110
Merge remote-tracking branch 'upstream/main' into HEAD
kzawora-intel Oct 28, 2024
96e0d6f
Rebase fix
kzawora-intel Oct 28, 2024
ebebbbb
fix ci fails
kzawora-intel Oct 28, 2024
4c0caa5
fix ci again
kzawora-intel Oct 28, 2024
72a2856
formatting
kzawora-intel Oct 28, 2024
8b0e4f2
[CI/Build] Adopt Mergify for auto-labeling PRs (#9259)
russellb Oct 28, 2024
2a38e6f
sarkar/Add htrandom generator for hpu (#246)
ssarkar2 Oct 28, 2024
5f8d807
[Model][VLM] Add multi-video support for LLaVA-Onevision (#8905)
litianjian Oct 28, 2024
aa0addb
Adding "torch compile" annotations to moe models (#9758)
CRZbulabula Oct 28, 2024
97b61bf
[misc] avoid circular import (#9765)
youkaichao Oct 28, 2024
76ed534
[torch.compile] add deepseek v2 compile (#9775)
youkaichao Oct 28, 2024
c5d7fb9
[Doc] fix third-party model example (#9771)
russellb Oct 29, 2024
7a4df5f
[Model][LoRA]LoRA support added for Qwen (#9622)
jeejeelee Oct 29, 2024
e74f2d4
[Doc] Specify async engine args in docs (#9726)
DarkLight1337 Oct 29, 2024
eae3d48
[Bugfix] Use temporary directory in registry (#9721)
DarkLight1337 Oct 29, 2024
3e135ae
Fix one_hot bug in torch compile mode (#427)
yuwenzho Oct 29, 2024
3203bd9
HPU: offload logits processing to CPU (#358)
madamczykhabana Oct 29, 2024
2fa54e2
Lora layers (#435)
rsshaik1 Oct 29, 2024
1dcdb37
initial works on enabling automatic prefix caching (#162)
huijjj Oct 29, 2024
ef7865b
[Frontend] re-enable multi-modality input in the new beam search impl…
FerdinandZhong Oct 29, 2024
09500f7
[Model] Add BNB quantization support for Mllama (#9720)
Isotr0py Oct 29, 2024
78e947a
Multi step scheduling (#441)
tzielinski-habana Oct 29, 2024
622b7ab
[Hardware] using current_platform.seed_everything (#9785)
wangshuai09 Oct 29, 2024
74fc2d7
[Misc] Add metrics for request queue time, forward time, and execute …
Abatom Oct 29, 2024
08600dd
Fix the log to correct guide user to install modelscope (#9793)
tastelikefeet Oct 29, 2024
0f43387
[Bugfix] Use host argument to bind to interface (#9798)
svenseeberg Oct 29, 2024
0ce7798
[Misc]: Typo fix: Renaming classes (casualLM -> causalLM) (#9801)
yannicks1 Oct 29, 2024
ac3d748
[Model] Add LlamaEmbeddingModel as an embedding Implementation of Ll…
jsato8094 Oct 29, 2024
ab6f981
[CI][Bugfix] Skip chameleon for transformers 4.46.1 (#9808)
mgoin Oct 29, 2024
7585ec9
[CI/Build] mergify: fix rules for ci/build label (#9804)
russellb Oct 29, 2024
0ad216f
[MISC] Set label value to timestamp over 0, to keep track of recent h…
coolkp Oct 29, 2024
67bdf8e
[Bugfix][Frontend] Guard against bad token ids (#9634)
joerunde Oct 29, 2024
882a1ad
[Model] tool calling support for ibm-granite/granite-20b-functioncall…
wseaton Oct 29, 2024
8d77241
[Docs] Add notes about Snowflake Meetup (#9814)
simon-mo Oct 29, 2024
bc73e98
[Bugfix] Fix prefix strings for quantized VLMs (#9772)
mgoin Oct 29, 2024
1ab6f6b
[core][distributed] fix custom allreduce in pytorch 2.5 (#9815)
youkaichao Oct 30, 2024
64cb1cd
Update README.md (#9819)
LiuXiaoxuanPKU Oct 30, 2024
226688b
[Bugfix][VLM] Make apply_fp8_linear work with >2D input (#9812)
mgoin Oct 30, 2024
62fac4b
[ci/build] Pin CI dependencies version with pip-compile (#9810)
khluu Oct 30, 2024
04a3ae0
[Bugfix] Fix multi nodes TP+PP for XPU (#8884)
yma11 Oct 30, 2024
7b0365e
[Doc] Add the DCO to CONTRIBUTING.md (#9803)
russellb Oct 30, 2024
ff5ed6e
[torch.compile] rework compile control with piecewise cudagraph (#9715)
youkaichao Oct 30, 2024
6aa6020
[Misc] Specify minimum pynvml version (#9827)
jeejeelee Oct 30, 2024
211fe91
[TPU] Correctly profile peak memory usage & Upgrade PyTorch XLA (#9438)
WoosukKwon Oct 30, 2024
a821717
Add fp8 test to jenkins CI (#429)
afierka-intel Oct 30, 2024
79dc102
Enable FusedSDPA prefill by default (#447)
kzawora-intel Oct 30, 2024
2f7f963
Contiguous PA (#433)
mfylcek Oct 30, 2024
94858b5
Fix default value for FSDPA (#448)
madamczykhabana Oct 30, 2024
d3257b2
Fix performance of top_p and top_k calculations (#449)
kdamaszk Oct 30, 2024
cc98f1e
[CI/Build] VLM Test Consolidation (#9372)
alex-jw-brooks Oct 30, 2024
81f09cf
[Model] Support math-shepherd-mistral-7b-prm model (#9697)
Went-Liang Oct 30, 2024
9ff4511
[Misc] Add chunked-prefill support on FlashInfer. (#9781)
elfiegg Oct 30, 2024
3b3f1e7
[Bugfix][core] replace heartbeat with pid check (#9818)
joerunde Oct 30, 2024
4272c16
row vs column paralel fix
maktukmak Oct 30, 2024
33d2577
[Doc] link bug for multistep guided decoding (#9843)
joerunde Oct 30, 2024
c787f2d
[Neuron] Update Dockerfile.neuron to fix build failure (#9822)
hbikki Oct 30, 2024
c2cd1a2
[doc] update pp support (#9853)
youkaichao Oct 30, 2024
00d91c8
[CI/Build] Simplify exception trace in api server tests (#9787)
CRZbulabula Oct 30, 2024
64384bb
[torch.compile] upgrade tests (#9858)
youkaichao Oct 30, 2024
abbfb61
[Misc][OpenAI] deprecate max_tokens in favor of new max_completion_to…
gcalmettes Oct 31, 2024
890ca36
Revert "[Bugfix] Use host argument to bind to interface (#9798)" (#9852)
khluu Oct 31, 2024
d087bf8
[Model] Support quantization of Qwen2VisionTransformer (#9817)
mgoin Oct 31, 2024
3ea2dc2
[Misc] Remove deprecated arg for cuda graph capture (#9864)
ywang96 Oct 31, 2024
5608e61
[Doc] Update Qwen documentation (#9869)
jeejeelee Oct 31, 2024
d42c2a2
Reduce block fragmentation (#426)
yangw1234 Oct 31, 2024
16b8f7a
[CI/Build] Add Model Tests for Qwen2-VL (#9846)
alex-jw-brooks Oct 31, 2024
6643aa6
Create scorecard.yml (#431)
rozhukov Oct 31, 2024
77f7ef2
[CI/Build] Adding a forced docker system prune to clean up space (#9849)
Alexei-V-Ivanov-AMD Oct 31, 2024
55650c8
[Bugfix] Fix `illegal memory access` error with chunked prefill, pref…
sasha0552 Oct 31, 2024
9fb12f7
[BugFix][Kernel] Fix Illegal memory access in causal_conv1d in H100 (…
mzusman Oct 31, 2024
b63c64d
[ci/build] Configure dependabot to update pip dependencies (#9811)
khluu Oct 31, 2024
031a799
[Bugfix][Frontend] Reject guided decoding in multistep mode (#9892)
joerunde Nov 1, 2024
96e0c9c
[torch.compile] directly register custom op (#9896)
youkaichao Nov 1, 2024
37a4947
[Bugfix] Fix layer skip logic with bitsandbytes (#9887)
mgoin Nov 1, 2024
566cd27
[torch.compile] rework test plans (#9866)
youkaichao Nov 1, 2024
93a76dd
[Model] Support bitsandbytes for MiniCPMV (#9891)
mgoin Nov 1, 2024
2b5bf20
[torch.compile] Adding torch compile annotations to some models (#9876)
CRZbulabula Nov 1, 2024
d3aa2a8
[Doc] Update multi-input support (#9906)
DarkLight1337 Nov 1, 2024
06386a6
[Frontend] Chat-based Embeddings API (#9759)
DarkLight1337 Nov 1, 2024
30a2e80
[CI/Build] Add Model Tests for PixtralHF (#9813)
mgoin Nov 1, 2024
ba0d892
[Frontend] Use a proper chat template for VLM2Vec (#9912)
DarkLight1337 Nov 1, 2024
1dd4cb2
[Bugfix] Fix edge cases for MistralTokenizer (#9625)
tjohnson31415 Nov 1, 2024
4581d2c
[Core] Refactor: Clean up unused argument in Scheduler._preempt (#9696)
andrejonasson Nov 1, 2024
aff1fd8
[torch.compile] use interpreter with stable api from pytorch (#9889)
youkaichao Nov 1, 2024
598b6d7
[Bugfix/Core] Flashinfer k_scale and v_scale (#9861)
pavanimajety Nov 1, 2024
48a90dc
g_idx check added
maktukmak Nov 1, 2024
18bd758
[1/N] pass the complete config from engine to executor (#9933)
youkaichao Nov 1, 2024
27cd36e
[Bugfix] PicklingError on RayTaskError (#9934)
GeneDer Nov 1, 2024
d151fde
[ci/build] Bump the patch-update group with 10 updates (#9897)
dependabot[bot] Nov 1, 2024
6c0b7f5
[Core][VLM] Add precise multi-modal placeholder tracking (#8346)
petersalas Nov 1, 2024
d522034
[ci/build] Have dependabot ignore pinned dependencies (#9935)
khluu Nov 1, 2024
a78dd33
[Encoder Decoder] Add flash_attn kernel support for encoder-decoder m…
sroy745 Nov 2, 2024
af7380d
[torch.compile] fix cpu broken code (#9947)
youkaichao Nov 2, 2024
eed92f1
[Docs] Update Granite 3.0 models in supported models table (#9930)
njhill Nov 2, 2024
1d4cfe2
[Doc] Updated tpu-installation.rst with more details (#9926)
mikegre-google Nov 2, 2024
e893795
[2/N] executor pass the complete config to worker/modelrunner (#9938)
youkaichao Nov 2, 2024
d6459b4
[V1] Fix `EngineArgs` refactor on V1 (#9954)
robertgshaw2-neuralmagic Nov 2, 2024
74b529c
[bugfix] fix chatglm dummy_data_for_glmv (#9955)
youkaichao Nov 2, 2024
cea808f
[3/N] model runner pass the whole config to model (#9958)
youkaichao Nov 2, 2024
1b73ab2
[CI/Build] Quoting around > (#9956)
nokados Nov 2, 2024
ae5279a
[torch.compile] Adding torch compile to vision-language models (#9946)
CRZbulabula Nov 2, 2024
3bb4bef
[bugfix] fix tsts (#9959)
youkaichao Nov 2, 2024
1f1b6d6
[V1] Support per-request seed (#9945)
njhill Nov 3, 2024
5459772
[Model] Add support for H2OVL-Mississippi models (#9747)
cooleel Nov 4, 2024
91c9ebb
[V1] Fix Configs (#9971)
robertgshaw2-neuralmagic Nov 4, 2024
c49f040
[Bugfix] Fix MiniCPMV and Mllama BNB bug (#9917)
jeejeelee Nov 4, 2024
0cc72b9
Enable HPUGraphs for lora long-contexts tests
SanjuCSudhakaran Nov 4, 2024
b67feb1
[Bugfix]Using the correct type hints (#9885)
gshtras Nov 4, 2024
24ba4d4
[CI] Add Llama2 to torch compile tests (#446)
anko-intel Nov 4, 2024
4dbcbbe
[Misc] Compute query_start_loc/seq_start_loc on CPU (#9447)
zhengy001 Nov 4, 2024
ea4aded
[Bugfix] Fix E2EL mean and median stats (#9984)
daitran2k1 Nov 4, 2024
1bb808a
Enable HPUGraphs for lora long-contexts tests (#454)
vivekgoe Nov 4, 2024
ccb5376
[Bugfix][OpenVINO] Fix circular reference #9939 (#9974)
MengqingCao Nov 4, 2024
ac6b8f1
[Frontend] Multi-Modality Support for Loading Local Image Files (#9915)
chaunceyjiang Nov 4, 2024
8d72bb2
[4/N] make quant config first-class citizen (#9978)
youkaichao Nov 4, 2024
fb2716d
[Misc]Reduce BNB static variable (#9987)
jeejeelee Nov 4, 2024
603a661
[Model] factoring out MambaMixer out of Jamba (#8993)
mzusman Nov 4, 2024
1b8e7d4
exllama state removed
maktukmak Nov 4, 2024
c305f09
removed custom ops check
maktukmak Nov 4, 2024
2ea889a
format fixes
maktukmak Nov 4, 2024
1c45f4c
[CI] Basic Integration Test For TPU (#9968)
robertgshaw2-neuralmagic Nov 4, 2024
5208dc7
[Bugfix][CI/Build][Hardware][AMD] Shard ID parameters in AMD tests ru…
hissu-hyvarinen Nov 4, 2024
6e056bc
[Doc] Update VLM doc about loading from local files (#9999)
ywang96 Nov 4, 2024
04cef2c
[Bugfix] Fix `MQLLMEngine` hanging (#9973)
robertgshaw2-neuralmagic Nov 4, 2024
9a5664d
[Misc] Refactor benchmark_throughput.py (#9779)
lk-chen Nov 4, 2024
ac04a97
[Frontend] Add max_tokens prometheus metric (#9881)
tomeras91 Nov 4, 2024
d93478b
[Bugfix] Upgrade to pytorch 2.5.1 (#10001)
bnellnm Nov 4, 2024
2094062
[4.5/N] bugfix for quant config in speculative decode (#10007)
youkaichao Nov 4, 2024
8f0a9ca
[Bugfix] Respect modules_to_not_convert within awq_marlin (#9895)
mgoin Nov 4, 2024
04bbf38
[Core] Use os.sched_yield in ShmRingBuffer instead of time.sleep (#9994)
tlrmchlsmth Nov 5, 2024
bbc3619
[Core] Make encoder-decoder inputs a nested structure to be more comp…
DarkLight1337 Nov 5, 2024
ad23318
[Bugfix] Fixup Mamba (#10004)
tlrmchlsmth Nov 5, 2024
ac12d53
Fix SchedulerConfig params (#459)
ldurejko Nov 5, 2024
653e56c
Tensor parallelism for multi-step scheduling (#457)
tzielinski-habana Nov 5, 2024
7a83b1a
[BugFix] Lazy import ray (#10021)
GeneDer Nov 5, 2024
93dee88
[Misc] vllm CLI flags should be ordered for better user readability (…
chaunceyjiang Nov 5, 2024
1033c3e
Set tokenizers version to <0.20.2 (#460)
madamczykhabana Nov 5, 2024
5e56d88
Merge remote-tracking branch 'origin/habana_main' into private/kzawor…
kzawora-intel Nov 5, 2024
18f00d7
Merge remote-tracking branch 'upstream/main' into private/kzawora/oct…
kzawora-intel Nov 5, 2024
d397ba5
fix hpu execution
kzawora-intel Nov 5, 2024
4c0647f
format.sh
kzawora-intel Nov 5, 2024
c41788f
fix type checks
kzawora-intel Nov 5, 2024
5952d81
[Frontend] Fix tcp port reservation for api server (#10012)
russellb Nov 5, 2024
cd34029
Refactor TPU requirements file and pin build dependencies (#10010)
richardsliu Nov 5, 2024
09d3550
[Misc] Add logging for CUDA memory (#10027)
yangalan123 Nov 5, 2024
731aec5
[CI/Build] Limit github CI jobs based on files changed (#9928)
russellb Nov 5, 2024
a53046b
[Model] Support quantization of PixtralHFTransformer for PixtralHF (#…
mgoin Nov 5, 2024
d2e8033
[Feature] Update benchmark_throughput.py to support image input (#9851)
lk-chen Nov 5, 2024
b9c64c0
[Misc] Modify BNB parameter name (#9997)
jeejeelee Nov 5, 2024
0246246
[CI] Prune tests/models/decoder_only/language/* tests (#9940)
mgoin Nov 5, 2024
235366f
[CI] Prune back the number of tests in tests/kernels/* (#9932)
mgoin Nov 5, 2024
ca9844b
[bugfix] fix weak ref in piecewise cudagraph and tractable test (#10048)
youkaichao Nov 5, 2024
43300bd
[Bugfix] Properly propagate trust_remote_code settings (#10047)
zifeitong Nov 6, 2024
966e316
[Bugfix] Fix pickle of input when async output processing is on (#9931)
wallashss Nov 6, 2024
0c63c34
[Bugfix][SpecDecode] kv corruption with bonus tokens in spec decode (…
llsj14 Nov 6, 2024
c4cacba
[v1] reduce graph capture time for piecewise cudagraph (#10059)
youkaichao Nov 6, 2024
82bfc38
[Misc] Sort the list of embedding models (#10037)
DarkLight1337 Nov 6, 2024
ffc0f2b
[Model][OpenVINO] Fix regressions from #8346 (#10045)
petersalas Nov 6, 2024
2bcbae7
[Bugfix] Fix edge-case crash when using chat with the Mistral Tekken …
tjohnson31415 Nov 6, 2024
ea928f6
[Bugfix] Gpt-j-6B patch kv_scale to k_scale path (#10063)
arakowsk-amd Nov 6, 2024
9d59b75
[Bugfix] Remove CustomChatCompletionContentPartParam multimodal input…
zifeitong Nov 6, 2024
4089985
[V1] Integrate Piecewise CUDA graphs (#10058)
WoosukKwon Nov 6, 2024
4be3a45
[distributed] add function to create ipc buffers directly (#10064)
youkaichao Nov 6, 2024
21063c1
[CI/Build] drop support for Python 3.8 EOL (#8464)
aarnphm Nov 6, 2024
a5fda50
[CI/Build] Fix large_gpu_mark reason (#10070)
Isotr0py Nov 6, 2024
a02a50e
[Hardware][Intel-Gaudi] Add Intel Gaudi (HPU) inference backend (#6143)
kzawora-intel Nov 6, 2024
6a585a2
[Hotfix] Fix ruff errors (#10073)
WoosukKwon Nov 6, 2024
c3c0e90
[BugFix][Habana_main][Multistep]Fix multistep deepcopy overhead (#452)
xuechendi Nov 6, 2024
dc5cdfb
Set vllm-hpu-extension to 0063520 (#455)
madamczykhabana Nov 6, 2024
7578f3b
Oct 28 rebase (#439)
kzawora-intel Nov 6, 2024
07a6441
Revert "Oct 28 rebase" (#466)
kzawora-intel Nov 6, 2024
5812cb6
Oct 28 rebase - attempt 2 (#467)
kzawora-intel Nov 6, 2024
40882f3
Merge commit 'a5fda50a10641e47c0c290907f30ef2add6d4e7a' into HEAD
kzawora-intel Nov 6, 2024
8e62377
format.sh
kzawora-intel Nov 6, 2024
5eb7f3d
Nov 6 rebase (sans vllm-project#6143) (#468)
kzawora-intel Nov 6, 2024
0a17a2e
Fix missed conflict (#469)
kzawora-intel Nov 6, 2024
b91403a
Merge commit 'a02a50e' into HEAD
kzawora-intel Nov 6, 2024
843ae37
Merge commit '6a585a2' into HEAD
kzawora-intel Nov 6, 2024
60b981e
Align fork with HPU upstream code (#465)
michalkuligowski Nov 6, 2024
3c39626
The output tensor from sampling is the input_tokens to the (#471)
tzielinski-habana Nov 6, 2024
66a67fc
gptq hpu support added
maktukmak Oct 23, 2024
f9cf700
row vs column paralel fix
maktukmak Oct 30, 2024
cbcba5d
g_idx check added
maktukmak Nov 1, 2024
a6ab053
exllama state removed
maktukmak Nov 4, 2024
9b323b5
removed custom ops check
maktukmak Nov 4, 2024
7077a99
format fixes
maktukmak Nov 4, 2024
4ef6b7e
Merge branch 'gptq_hpu' of https://github.com/maktukmak/vllm-fork int…
maktukmak Nov 6, 2024
32 changes: 30 additions & 2 deletions vllm/_core_ext.py
@@ -131,10 +131,38 @@ def is_ieee_754(self) -> bool:
not self._finite_values_only

def __str__(self) -> str:
raise NotImplementedError
"""
naming generally follows: https://github.com/jax-ml/ml_dtypes

Reviewer:

1. Start with a capital letter.
2. This cannot be merged as is, because this class, ScalarType, is widely used and changing it will make it difficult to upstream. Use a derived class.

Author:

I updated this class from the original repo because it blocked me from testing the feature. The class is defined here: https://github.com/vllm-project/vllm/blob/fb2716d64117aaa6c36b97b09765aa10a89e2fe5/vllm/scalar_type.py#L19

Let me know if there is a better way.

michalkuligowski (Nov 6, 2024):

You are right. vLLM was rebased and these methods do have those definitions now (it is now under scalar_type.py). Please rebase.

for floating point types (leading f) the scheme is:
`float<size_bits>_e<exponent_bits>m<mantissa_bits>[flags]`
flags:
- no-flags: means it follows IEEE 754 conventions
- f: means finite values only (no infinities)
- n: means nans are supported (non-standard encoding)
for integer types the scheme is:
`[u]int<size_bits>[b<bias>]`
- if bias is not present it means its zero
"""
if self.is_floating_point():
ret = "float" + str(self.size_bits) + "_e" + str(
self.exponent) + "m" + str(self.mantissa)

if not self.is_ieee_754():
if self._finite_values_only:
ret = ret + "f"
if self.nan_repr != NanRepr.NONE:
ret = ret + "n"

return ret
else:
ret = ("int" if self.is_signed() else "uint") + str(
self.size_bits)
if self.has_bias():
ret = ret + "b" + str(self.bias)
return ret

def __repr__(self) -> str:
raise NotImplementedError
return "ScalarType." + self.__str__()

# __len__ needs to be defined (and has to throw TypeError) for pytorch's
# opcheck to work.
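For reference, the naming scheme the new docstring describes yields names of the following shape (a standalone illustration inferred from the rules above, not output captured from vLLM):

# Inferred examples of the naming scheme, not taken from the PR:
# - unsigned 4-bit integer with bias 8              -> "uint4b8"
# - signed 8-bit integer, zero bias                 -> "int8"
# - fp8 with 4 exponent / 3 mantissa bits, IEEE 754 -> "float8_e4m3"
# - same, finite-only with non-standard NaNs        -> "float8_e4m3fn"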
15 changes: 14 additions & 1 deletion vllm/_custom_ops.py
@@ -12,6 +12,10 @@

logger = init_logger(__name__)

if current_platform.is_hpu():
import habana_frameworks.torch.core as htcore
convert_from_uint4 = torch.ops.hpu.convert_from_uint4

if not current_platform.is_tpu() and not current_platform.is_hpu():
try:
import vllm._C
@@ -266,7 +270,16 @@ def awq_gemm(input: torch.Tensor, qweight: torch.Tensor, qzeros: torch.Tensor,
return torch.ops._C.awq_gemm(input, qweight, qzeros, scales, split_k_iters)


# gptq
def gptq_hpu_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
b_gptq_qzeros: torch.Tensor, b_gptq_scales: torch.Tensor,
b_g_idx: torch.Tensor, use_exllama: bool,
bit: int) -> torch.Tensor:

weight = convert_from_uint4(b_q_weight, b_gptq_scales, b_gptq_qzeros,
a.dtype)
return torch.matmul(a, weight)


def gptq_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
b_gptq_qzeros: torch.Tensor, b_gptq_scales: torch.Tensor,
b_g_idx: torch.Tensor, use_exllama: bool,
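Conceptually, gptq_hpu_gemm above dequantizes the packed GPTQ weights via the HPU op and then runs a plain matmul. The sketch below is a CPU-only reference of that idea, mirroring the unpack helpers added in gptq_hpu.py further down; it is not the exact semantics of torch.ops.hpu.convert_from_uint4, and the zero-point and group handling here are assumptions:

import torch

def gptq_dequant_reference(qweight, qzeros, scales, bits=4, group_size=128):
    # For 4-bit: qweight (K // 8, N) int32, qzeros (K // group_size, N // 8) int32,
    # scales (K // group_size, N) float -- the "CUDA old" GPTQ layout.
    shifts = torch.arange(0, 32, bits, dtype=torch.int32)
    w = torch.bitwise_right_shift(qweight.unsqueeze(1), shifts.view(1, -1, 1))
    w = torch.bitwise_and(w, (1 << bits) - 1).reshape(-1, qweight.shape[1])
    z = torch.bitwise_right_shift(qzeros.unsqueeze(2), shifts.view(1, 1, -1))
    z = torch.bitwise_and(z + 1, (1 << bits) - 1).reshape(qzeros.shape[0], -1)
    group = torch.arange(w.shape[0]) // group_size
    return (w.float() - z[group].float()) * scales[group].float()

def gptq_matmul_reference(a, qweight, qzeros, scales, bits=4, group_size=128):
    weight = gptq_dequant_reference(qweight, qzeros, scales, bits, group_size)
    return a @ weight.to(a.dtype)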
4 changes: 2 additions & 2 deletions vllm/model_executor/layers/linear.py
@@ -26,8 +26,8 @@
"CompressedTensorsLinearMethod", "AWQMarlinLinearMethod",
"AWQLinearMethod", "GPTQMarlinLinearMethod", "Fp8LinearMethod",
"MarlinLinearMethod", "QQQLinearMethod", "GPTQMarlin24LinearMethod",
"TPUInt8LinearMethod", "GPTQLinearMethod", "FBGEMMFp8LinearMethod",
"ModelOptFp8LinearMethod", "IPEXAWQLinearMethod"
"TPUInt8LinearMethod", "GPTQLinearMethod", "GPTQHPULinearMethod",
"FBGEMMFp8LinearMethod", "ModelOptFp8LinearMethod", "IPEXAWQLinearMethod"
]


2 changes: 2 additions & 0 deletions vllm/model_executor/layers/quantization/__init__.py
@@ -17,6 +17,7 @@
from vllm.model_executor.layers.quantization.fp8 import Fp8Config
from vllm.model_executor.layers.quantization.gguf import GGUFConfig
from vllm.model_executor.layers.quantization.gptq import GPTQConfig
from vllm.model_executor.layers.quantization.gptq_hpu import GPTQHPUConfig
from vllm.model_executor.layers.quantization.gptq_marlin import (
GPTQMarlinConfig)
from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
@@ -46,6 +47,7 @@
"gptq_marlin": GPTQMarlinConfig,
"awq_marlin": AWQMarlinConfig,
"gptq": GPTQConfig,
"gptq_hpu": GPTQHPUConfig,
"compressed-tensors": CompressedTensorsConfig,
"bitsandbytes": BitsAndBytesConfig,
"inc": INCConfig,
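With "gptq_hpu" registered above, the backend can be selected explicitly through the usual quantization argument. A minimal sketch, assuming an HPU build of vLLM and a placeholder model path:

from vllm import LLM, SamplingParams

# "gptq_hpu" matches the key added to the quantization-method registry above;
# the model path is a placeholder for any GPTQ-quantized checkpoint.
llm = LLM(model="/path/to/gptq-quantized-model",
          quantization="gptq_hpu",
          dtype="bfloat16")  # GPTQHPUConfig only advertises bfloat16 activations
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)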
291 changes: 291 additions & 0 deletions vllm/model_executor/layers/quantization/gptq_hpu.py
@@ -0,0 +1,291 @@
from fractions import Fraction
from typing import Any, Dict, List, Optional

import torch
from torch.nn.parameter import Parameter

from vllm import _custom_ops as ops
from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.parameter import (ChannelQuantScaleParameter,
GroupQuantScaleParameter,
PackedColumnParameter,
PackedvLLMParameter,
RowvLLMParameter)


class GPTQHPUConfig(QuantizationConfig):
"""Config class for GPTQ.

Reference: https://arxiv.org/abs/2210.17323
"""

def __init__(
self,
weight_bits: int,
group_size: int,
desc_act: bool,
lm_head_quantized: bool,
) -> None:
self.weight_bits = weight_bits
self.group_size = group_size
self.desc_act = desc_act
self.lm_head_quantized = lm_head_quantized
self.pack_factor = Fraction(32, self.weight_bits)
if self.weight_bits not in [2, 3, 4, 8]:
raise ValueError(
"Currently, only 2/3/4/8-bit weight quantization is "
f"supported for GPTQ, but got {self.weight_bits} bits.")

def __repr__(self) -> str:
return (f"GPTQHPUConfig(weight_bits={self.weight_bits}, "
f"group_size={self.group_size}, "
f"desc_act={self.desc_act}),"
f"lm_head_quantized={self.lm_head_quantized}")

@classmethod
def get_name(cls) -> str:
return "gptq_hpu"

@classmethod
def get_supported_act_dtypes(cls) -> List[torch.dtype]:
return [torch.bfloat16]

@classmethod
# Need to figure it out
def get_min_capability(cls) -> int:
return 0

@classmethod
def get_config_filenames(cls) -> List[str]:
return ["quantize_config.json"]

@classmethod
def from_config(cls, config: Dict[str, Any]) -> "GPTQHPUConfig":
weight_bits = cls.get_from_keys(config, ["bits"])
group_size = cls.get_from_keys(config, ["group_size"])
desc_act = cls.get_from_keys(config, ["desc_act"])
lm_head_quantized = cls.get_from_keys_or(config, ["lm_head"],
default=False)
return cls(weight_bits, group_size, desc_act, lm_head_quantized)
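# For illustration only (not part of this PR): from_config above would accept a
# quantize_config.json along these lines, with the values being assumptions:
# {
#     "bits": 4,
#     "group_size": 128,
#     "desc_act": false,
#     "lm_head": false
# }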

@classmethod
def override_quantization_method(cls, hf_quant_cfg,
user_quant) -> Optional[str]:

is_valid_user_quant = user_quant == "gptq_hpu"

if is_valid_user_quant:
return cls.get_name()

return None

def get_quant_method(self, layer: torch.nn.Module,
prefix: str) -> Optional["GPTQHPULinearMethod"]:
if (isinstance(layer, LinearBase) or
(isinstance(layer, ParallelLMHead) and self.lm_head_quantized)):
return GPTQHPULinearMethod(self)
return None

def get_scaled_act_names(self) -> List[str]:
return []


class GPTQHPULinearMethod(LinearMethodBase):
"""Linear method for GPTQ.

Args:
quant_config: The GPTQ quantization config.
"""

def __init__(self, quant_config: GPTQHPUConfig):
self.quant_config = quant_config

def create_weights(
self,
layer: torch.nn.Module,
input_size_per_partition: int,
output_partition_sizes: List[int],
input_size: int,
output_size: int,
params_dtype: torch.dtype,
**extra_weight_attrs,
):
del output_size # Unused.
weight_loader = extra_weight_attrs.get("weight_loader")
if input_size_per_partition % self.quant_config.group_size != 0:
raise ValueError(
"The input size is not aligned with the quantized "
"weight shape. This can be caused by too large "
"tensor parallel size.")
output_size_per_partition = sum(output_partition_sizes)
if (output_size_per_partition % self.quant_config.pack_factor.numerator
!= 0):
raise ValueError(
"The output size is not aligned with the quantized "
"weight shape. This can be caused by too large "
"tensor parallel size.")

if self.quant_config.group_size != -1:
group_size = self.quant_config.group_size
else:
group_size = input_size
scale_and_zero_size = input_size // group_size
scale_and_zero_input_dim = None

qweight = PackedvLLMParameter(
data=torch.empty(
input_size_per_partition // self.quant_config.pack_factor,
output_size_per_partition,
dtype=torch.int32,
),
input_dim=0,
output_dim=1,
packed_dim=0,
packed_factor=self.quant_config.pack_factor,
weight_loader=weight_loader)

g_idx = RowvLLMParameter(data=torch.tensor(
[
i // self.quant_config.group_size
for i in range(input_size_per_partition)
],
dtype=torch.int32,
),
input_dim=0,
weight_loader=weight_loader)
qzeros_args = {
"data":
torch.empty(
scale_and_zero_size,
output_size_per_partition // self.quant_config.pack_factor,
dtype=torch.int32,
),
"weight_loader":
weight_loader
}
weight_scale_args = {
"data":
torch.empty(
scale_and_zero_size,
output_size_per_partition,
dtype=params_dtype,
),
"weight_loader":
weight_loader
}
if scale_and_zero_input_dim is None:
scales = ChannelQuantScaleParameter(output_dim=1,
**weight_scale_args)
qzeros = PackedColumnParameter(
output_dim=1,
packed_dim=1,
packed_factor=self.quant_config.pack_factor,
**qzeros_args)

else:
scales = GroupQuantScaleParameter(output_dim=1,
input_dim=0,
**weight_scale_args)
qzeros = PackedvLLMParameter(
input_dim=0,
output_dim=1,
packed_dim=1,
packed_factor=self.quant_config.pack_factor,
**qzeros_args)

layer.register_parameter("qweight", qweight)
layer.register_parameter("g_idx", g_idx)
layer.register_parameter("qzeros", qzeros)
layer.register_parameter("scales", scales)

def process_weights_after_loading(self, layer: torch.nn.Module) -> None:

self.wf = torch.tensor(list(range(0, 32,
self.quant_config.weight_bits)),
dtype=torch.int32).unsqueeze(0)
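# wf above holds the bit offset of each packed element inside an int32 word,
# e.g. [0, 4, 8, ..., 28] for 4-bit weights; the pack/unpack helpers below
# shift by these offsets to interleave or extract eight 4-bit values per word.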
weight = self.unpack_weight_from_cuda_old_format(layer)
layer.qweight.data = self.pack_tensor(weight).to('hpu')

zeros = self.unpack_zeros_from_cuda_old_format(layer).cpu()
layer.qzeros.data = self.pack_tensor(zeros).to('hpu')

# TODO: Support group indexing and remove the check
columns = layer.qweight.shape[0]
if self.quant_config.group_size > 0:
g_idx_trivial = [
i // self.quant_config.group_size for i in range(columns)
]
else:
g_idx_trivial = [0] * columns
g_idx_trivial = torch.tensor(g_idx_trivial, dtype=torch.int32)
assert torch.equal(
layer.g_idx,
g_idx_trivial), "Non-trivial tensor g_idx is not supported"

# for torch.compile
layer.qweight = Parameter(layer.qweight.data, requires_grad=False)
layer.qzeros = Parameter(layer.qzeros.data, requires_grad=False)
layer.g_idx = Parameter(layer.g_idx.data, requires_grad=False)
layer.scales = Parameter(layer.scales.data, requires_grad=False)

def apply(self,
layer: torch.nn.Module,
x: torch.Tensor,
bias: Optional[torch.Tensor] = None) -> torch.Tensor:

out_shape = x.shape[:-1]
if hasattr(layer, 'output_size_per_partition'):
out_shape += (layer.output_size_per_partition, )
else:
out_shape += (layer.output_size, )

reshaped_x = x.reshape(-1, x.shape[-1])

output = ops.gptq_hpu_gemm(reshaped_x, layer.qweight, layer.qzeros,
layer.scales, layer.g_idx, None,
self.quant_config.weight_bits)
if bias is not None:
output.add_(bias)
return output.reshape(out_shape)

def pack_tensor(self, input, bits=4):
normal = input.to(torch.int32)
q = torch.sum(torch.bitwise_left_shift(
normal.reshape(normal.shape[0], -1, (32 // bits)),
self.wf.unsqueeze(0)),
dim=-1).to(torch.int32)

return q

def unpack_zeros_from_cuda_old_format(self, layer):

bits = self.quant_config.weight_bits
zeros = torch.bitwise_right_shift(
torch.unsqueeze(layer.qzeros.to('cpu'),
2).expand(-1, -1, 32 // bits),
self.wf.unsqueeze(0),
).to(torch.int16 if bits == 8 else torch.int8)

zeros = zeros + 1
zeros = torch.bitwise_and(zeros, (2**bits) - 1).to(
layer.scales.dtype) # NOTE: It appears that casting here
# after the `zeros = zeros + 1` is important.
zeros = zeros.reshape(-1, zeros.shape[1] * zeros.shape[2])
return zeros

def unpack_weight_from_cuda_old_format(self, layer):

qweight = layer.qweight.cpu()
bits = self.quant_config.weight_bits

weight = torch.bitwise_right_shift(
torch.unsqueeze(qweight, 1).expand(-1, 32 // bits, -1),
self.wf.unsqueeze(-1),
).to(torch.int16 if bits == 8 else torch.int8)
weight = torch.bitwise_and(weight, (2**bits) - 1)
weight = weight.reshape(
(weight.shape[0] * weight.shape[1], weight.shape[2]))
return weight
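As a sanity check on the packing scheme used by pack_tensor and the unpack helpers above, the same bit layout can be reproduced with a few lines of standalone PyTorch (an illustrative round trip; tensor sizes are arbitrary):

import torch

bits = 4
wf = torch.arange(0, 32, bits, dtype=torch.int32).unsqueeze(0)   # [0, 4, ..., 28]

# Eight 4-bit values per int32 word, as in unpack_weight_from_cuda_old_format.
values = torch.randint(0, 2**bits, (16, 8), dtype=torch.int32)   # (rows, 32 // bits)

# Pack: shift each value to its bit offset and sum into one int32 per row,
# mirroring pack_tensor above.
packed = torch.sum(torch.bitwise_left_shift(values, wf), dim=-1).to(torch.int32)

# Unpack: shift right by the same offsets and mask off the low `bits` bits.
unpacked = torch.bitwise_and(
    torch.bitwise_right_shift(packed.unsqueeze(-1), wf), (1 << bits) - 1)

assert torch.equal(unpacked, values)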