load quantized model & save npu compiled model
cyita committed Jan 3, 2025
1 parent 31b4279 commit 1fb7793
Showing 1 changed file with 25 additions and 10 deletions.
python/llm/src/ipex_llm/transformers/npu_model.py
@@ -387,6 +387,7 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs)
         intra_pp = kwargs.pop("intra_pp", None)
         transpose_value_cache = kwargs.pop("transpose_value_cache", True)
         modules_to_not_convert = kwargs.pop("modules_to_not_convert", [])
+        save_directory = kwargs.pop('save_directory', None)
 
         from transformers.models.auto.configuration_auto import AutoConfig
         from transformers.modeling_utils import no_init_weights, get_state_dict_dtype
@@ -658,16 +659,30 @@ def load_low_bit(cls, pretrained_model_name_or_path: str, *model_args, **kwargs)
             param.requires_grad_(False)
 
         if optimize_model and not pipeline:
-            from ipex_llm.transformers.npu_models.convert_mp import optimize_llm
-            optimize_llm(
-                llm,
-                max_context_len=max_context_len,
-                max_prompt_len=max_prompt_len,
-                inter_pp=inter_pp,
-                intra_pp=intra_pp,
-                transpose_value_cache=transpose_value_cache,
-                group_size=quantization_group_size
-            )
+            if model.config.model_type in ["qwen2", "llama", "minicpm"]:
+                from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process
+                optimize_llm_single_process(
+                    llm,
+                    kv_len=max_context_len,
+                    max_prompt_len=max_prompt_len,
+                    transpose_value_cache=transpose_value_cache,
+                    group_size=quantization_group_size,
+                    qtype=qtype,
+                    save_directory=save_directory,
+                    fuse_layers=None,
+                    has_llm=hasattr(model, "llm")
+                )
+            else:
+                from ipex_llm.transformers.npu_models.convert_mp import optimize_llm
+                optimize_llm(
+                    llm,
+                    max_context_len=max_context_len,
+                    max_prompt_len=max_prompt_len,
+                    inter_pp=inter_pp,
+                    intra_pp=intra_pp,
+                    transpose_value_cache=transpose_value_cache,
+                    group_size=quantization_group_size
+                )
         elif optimize_model and pipeline:
             from ipex_llm.transformers.npu_pipeline_model.convert_pipeline \
                 import convert_llm
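For reference, a minimal sketch of how the new save_directory option might be used when calling load_low_bit. The paths and the other keyword arguments shown are illustrative assumptions, not part of this commit; only save_directory is introduced here.

# Minimal usage sketch (assumed): loading a saved low-bit model with
# optimize_model=True routes qwen2/llama/minicpm models through
# optimize_llm_single_process, which receives save_directory so the
# NPU-compiled model can be saved there and reused on later loads.
from ipex_llm.transformers.npu_model import AutoModelForCausalLM

model = AutoModelForCausalLM.load_low_bit(
    "./Qwen2-7B-Instruct-int4",                # hypothetical saved low-bit model path
    optimize_model=True,
    max_context_len=1024,                      # illustrative values
    max_prompt_len=512,
    save_directory="./Qwen2-7B-Instruct-npu",  # new kwarg added in this commit
)

For other model types, the call falls through to the existing optimize_llm path, which does not take save_directory.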
