Commit

Merge pull request #246 from Jaylyn-Barbee/main

Update AIPC_Inference.md to reflect changes in intel-npu-acceleration-library
kinfey authored Jan 16, 2025
2 parents 0aa5a39 + 277453d commit b899f6f
Showing 2 changed files with 22 additions and 29 deletions.
code/03.Inference/AIPC/AIPC_NPU_DEMO.ipynb (13 changes: 6 additions & 7 deletions)
@@ -15,7 +15,8 @@
"outputs": [],
"source": [
"from transformers import AutoTokenizer, pipeline,TextStreamer\n",
"import intel_npu_acceleration_library as npu_lib\n",
"from intel_npu_acceleration_library import NPUModelForCausalLM, int4\n",
"from intel_npu_acceleration_library.compiler import CompilerConfig\n",
"\n",
"\n",
"import warnings"
@@ -84,12 +85,10 @@
}
],
"source": [
"model = npu_lib.NPUModelForCausalLM.from_pretrained(\n",
" model_id,\n",
" torch_dtype=\"auto\",\n",
" dtype=npu_lib.int4,\n",
" trust_remote_code=True\n",
" )\n",
"compiler_conf = CompilerConfig(dtype=int4)\n",
"model = NPUModelForCausalLM.from_pretrained(\n",
" model_id, use_cache=True, config=compiler_conf, attn_implementation='sdpa'\n",
").eval()\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
"\n",
md/03.Inference/AIPC_Inference.md (38 changes: 16 additions & 22 deletions)
@@ -46,51 +46,45 @@ Install the Python Library with pip
Using Intel NPU acceleration, this library does not affect the traditional coding process. You only need to use it to quantize the original Phi-3 model to a lower precision, such as FP16, INT8, or INT4, for example:

```python

from transformers import AutoTokenizer, pipeline,TextStreamer
-import intel_npu_acceleration_library as npu_lib
+from intel_npu_acceleration_library import NPUModelForCausalLM, int4
+from intel_npu_acceleration_library.compiler import CompilerConfig
import warnings

model_id = "microsoft/Phi-3-mini-4k-instruct"

-model = npu_lib.NPUModelForCausalLM.from_pretrained(
-    model_id,
-    torch_dtype="auto",
-    dtype=npu_lib.int4,
-    trust_remote_code=True
-)
+compiler_conf = CompilerConfig(dtype=int4)
+model = NPUModelForCausalLM.from_pretrained(
+    model_id, use_cache=True, config=compiler_conf, attn_implementation="sdpa"
+).eval()

tokenizer = AutoTokenizer.from_pretrained(model_id)

text_streamer = TextStreamer(tokenizer, skip_prompt=True)

```
After the quantization is successful, continue execution to call the NPU to run the Phi-3 model.

```python

generation_args = {
"max_new_tokens": 1024,
"return_full_text": False,
"temperature": 0.3,
"do_sample": False,
"streamer": text_streamer,
}
"max_new_tokens": 1024,
"return_full_text": False,
"temperature": 0.3,
"do_sample": False,
"streamer": text_streamer,
}

pipe = pipeline(
"text-generation",
model=model,
tokenizer=tokenizer,
"text-generation",
model=model,
tokenizer=tokenizer,
)

query = "<|system|>You are a helpful AI assistant.<|end|><|user|>Can you introduce yourself?<|end|><|assistant|>"

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    pipe(query, **generation_args)
```

While the code is executing, we can view the running status of the NPU through Task Manager.
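For reference, here is the consolidated snippet as it reads after this change, assembled only from the added and unchanged lines in the diff above (a minimal sketch; the model id, quantization dtype, and generation settings are exactly those shown in the diff):

```python
from transformers import AutoTokenizer, pipeline, TextStreamer
from intel_npu_acceleration_library import NPUModelForCausalLM, int4
from intel_npu_acceleration_library.compiler import CompilerConfig
import warnings

model_id = "microsoft/Phi-3-mini-4k-instruct"

# Quantize the model to INT4 and compile it for the Intel NPU.
compiler_conf = CompilerConfig(dtype=int4)
model = NPUModelForCausalLM.from_pretrained(
    model_id, use_cache=True, config=compiler_conf, attn_implementation="sdpa"
).eval()

tokenizer = AutoTokenizer.from_pretrained(model_id)
text_streamer = TextStreamer(tokenizer, skip_prompt=True)

generation_args = {
    "max_new_tokens": 1024,
    "return_full_text": False,
    "temperature": 0.3,
    "do_sample": False,
    "streamer": text_streamer,
}

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# Phi-3 chat prompt format, as used in the updated document.
query = "<|system|>You are a helpful AI assistant.<|end|><|user|>Can you introduce yourself?<|end|><|assistant|>"

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    pipe(query, **generation_args)
```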
