[NPU] trial of HQQ q4_0 #12534

Draft · wants to merge 6 commits into main
Changes from 1 commit
10 changes: 6 additions & 4 deletions python/llm/src/ipex_llm/transformers/npu_models/convert.py
@@ -109,11 +109,13 @@ def replace_with_QuantizedLinear(layer, qtype, device, modules_to_not_convert,
                                             enable_scale_search=enable_scale_search,
                                             imatrix=imatrix)
        if qtype == "sym_int4_rtn" and os.environ.get("IPEX_LLM_NPU_QUANTIZATION_HQQ", "0") != "0":
-           from .quantize import update_scale_inverse_median
+           from .quantize import update_scale_hqq_v2
            # scale search by hqq
-           qweights, scale = update_scale_inverse_median(layer.weight.data.to(torch.float32),
-                                                         (1.0 / scale.to(torch.float32)),
-                                                         [-8, 7])
+           print("====original scale is :", scale)
+           qweights, scale = update_scale_hqq_v2(layer.weight.data.to(torch.float32),
+                                                 scale.to(torch.float32),
+                                                 [-8, 7])
+           print("====updated scale is :", scale)
        zero = None
        # split scale to scale & zero
        if qtype == "asym_int4_rtn":
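For reference, a minimal sketch (not part of the diff) of how this experimental path would be enabled. Only the IPEX_LLM_NPU_QUANTIZATION_HQQ check comes from the change above; the model-loading lines are illustrative assumptions about the usual ipex-llm NPU flow.

```python
import os

# Any value other than "0" turns on the HQQ-based scale search for
# sym_int4_rtn layers (see the os.environ.get check in the hunk above).
os.environ["IPEX_LLM_NPU_QUANTIZATION_HQQ"] = "1"

# Hypothetical usage; the exact loading API follows the ipex-llm NPU examples
# and is not part of this PR.
# from ipex_llm.transformers.npu_model import AutoModelForCausalLM
# model = AutoModelForCausalLM.from_pretrained(model_path, load_in_low_bit="sym_int4")
```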
47 changes: 46 additions & 1 deletion python/llm/src/ipex_llm/transformers/npu_models/quantize.py
@@ -119,10 +119,12 @@ def update_scale_hqq(x: Tensor, iscale: Tensor, min_max: list):
        W_ = (x - W_e).clone()
        W_mask = torch.abs(W_) < z_val
        W_[W_mask] = z_val
-       iscale, _ = torch.median(W_q / W_q, axis=1, keepdim=True)
+       iscale, _ = torch.median(W_q / W_, axis=1, keepdim=True)
Contributor (review comment on this line): use mean instead of median?

        beta *= kappa

        current_error = float(torch.abs(x - W_r).mean())
        print(i, current_error)
        print(iscale, torch.isinf(iscale).any(), torch.isnan(iscale).any())
        if current_error < best_error:
            best_error = current_error
        else:
@@ -141,6 +143,49 @@ def update_scale_hqq(x: Tensor, iscale: Tensor, min_max: list):
    return qweights.view(torch.uint8), scale_b.to(torch.float16)
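To make the reviewer's question concrete, here is an illustrative comparison (not part of the diff) of the two per-row re-estimates, with toy tensors standing in for the W_ and W_q defined in the loop above:

```python
import torch

# Toy stand-ins for W_ (shrinkage-corrected weights) and W_q (quantized
# weights) from the loop above.
W_ = torch.randn(4, 8)
W_q = torch.round(W_ * 4).clamp(-8, 7)

# Median-based re-estimate of the inverse scale, as on the changed line:
iscale_med, _ = torch.median(W_q / W_, axis=1, keepdim=True)

# Mean-based alternative suggested in the comment; this is essentially what
# update_scale_hqq_v2 below does, expressed as a scale rather than an inverse
# scale and skipping positions where W_q == 0:
mask = W_q != 0
ratio = torch.where(mask, W_ / W_q, torch.zeros_like(W_))
scale_mean = ratio.sum(dim=1, keepdim=True) / mask.sum(dim=1, keepdim=True)
```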


def update_scale_hqq_v2(x: Tensor, scale: Tensor, min_max: list):
    scale = scale.unsqueeze(1)
    opt_params: dict = {"lp_norm": 0.7, "beta": 1e1, "kappa": 1.01, "iters": 20}
    lp_norm, beta, kappa, iters = (
        opt_params["lp_norm"],
        opt_params["beta"],
        opt_params["kappa"],
        opt_params["iters"],
    )

    best_error = 1e4
    for i in range(iters):
        W_q = c_round(x / scale).clamp(min_max[0], min_max[1])
        W_q_mask = W_q != 0  # m, n
        sum_row = torch.sum(W_q_mask.int(), axis=1, keepdim=True)  # m, 1
        W_r = W_q * scale
        W_e = shrink_lp_op(x - W_r, beta, lp_norm)
        W_ = (x - W_e).clone()
        tmp = W_ / W_q
        tmp[W_q == 0] = 0
        tmp = torch.sum(tmp, axis=1, keepdim=True)  # m, 1
        scale = tmp / sum_row  # m, 1
        beta *= kappa

        current_error = float(torch.abs(x - W_r).mean())
        print(i, current_error)
        if current_error < best_error:
            best_error = current_error
        else:
            break

    scale_b = scale
    qweights = (c_round(x / scale)).clamp(min_max[0], min_max[1]).to(torch.int8)  # m * n
    qweights = qweights.reshape(x.shape[0], -1, 2)  # m * n/2 * 2
    low_bit, high_bit = qweights.split(1, dim=-1)
    high_bit = high_bit.squeeze().view(torch.int8)
    low_bit = low_bit.squeeze().view(torch.int8)
    high_bit = high_bit << 4
    low_bit = low_bit & 0x0f
    qweights = high_bit | low_bit

    return qweights.view(torch.uint8), scale_b.to(torch.float16)
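A minimal usage sketch (not part of the PR) that round-trips the q4 packing produced above. It assumes this branch is installed so update_scale_hqq_v2 is importable; the unpack_q4 helper and the starting scale are purely illustrative, mirroring the packing order above (even column in the low nibble, odd column in the high nibble).

```python
import torch
from ipex_llm.transformers.npu_models.quantize import update_scale_hqq_v2

def unpack_q4(packed: torch.Tensor, scale: torch.Tensor, n: int) -> torch.Tensor:
    b = packed.view(torch.int8)
    low = (b << 4) >> 4   # sign-extend the low nibble back to [-8, 7]
    high = b >> 4         # arithmetic shift keeps the high nibble's sign
    w_q = torch.stack([low, high], dim=-1).reshape(packed.shape[0], n)
    return w_q.to(torch.float32) * scale.to(torch.float32)  # scale is (m, 1)

w = torch.randn(64, 128)
# Illustrative per-row RTN-style starting scale; the real caller passes the
# scale produced earlier in convert.py.
scale0 = w.abs().amax(dim=1) / 7.0
packed, scale = update_scale_hqq_v2(w, scale0, [-8, 7])
w_rec = unpack_q4(packed, scale, w.shape[1])
print("mean abs reconstruction error:", (w - w_rec).abs().mean().item())
```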


# re-estimate the scale based on the inverse median: Only tested with axis==0
def update_scale_inverse_median(