Add NPU Engine #31

Open

wants to merge 11 commits into main
11 changes: 11 additions & 0 deletions README.md
@@ -35,6 +35,7 @@ Run local LLMs on iGPU, APU and CPU (AMD, Intel, and Qualcomm (Coming Soon)). E
* Onnxruntime CPU Models [Link](./docs/model/onnxruntime_cpu_models.md)
* Ipex-LLM Models [Link](./docs/model/ipex_models.md)
* OpenVINO-LLM Models [Link](./docs/model/openvino_models.md)
* NPU-LLM Models [Link](./docs/model/npu_models.md)

## Getting Started

@@ -56,12 +57,14 @@ Run local LLMs on iGPU, APU and CPU (AMD, Intel, and Qualcomm (Coming Soon)). E
- **CUDA:** `$env:ELLM_TARGET_DEVICE='cuda'; pip install -e .[cuda]`
- **IPEX:** `$env:ELLM_TARGET_DEVICE='ipex'; python setup.py develop`
- **OpenVINO:** `$env:ELLM_TARGET_DEVICE='openvino'; pip install -e .[openvino]`
- **NPU:** `$env:ELLM_TARGET_DEVICE='npu'; pip install -e .[npu]`
- **With Web UI**:
- **DirectML:** `$env:ELLM_TARGET_DEVICE='directml'; pip install -e .[directml,webui]`
- **CPU:** `$env:ELLM_TARGET_DEVICE='cpu'; pip install -e .[cpu,webui]`
- **CUDA:** `$env:ELLM_TARGET_DEVICE='cuda'; pip install -e .[cuda,webui]`
- **IPEX:** `$env:ELLM_TARGET_DEVICE='ipex'; python setup.py develop; pip install -r requirements-webui.txt`
- **OpenVINO:** `$env:ELLM_TARGET_DEVICE='openvino'; pip install -e .[openvino,webui]`
- **NPU:** `$env:ELLM_TARGET_DEVICE='npu'; pip install -e .[npu,webui]`

- **Linux**

@@ -77,12 +80,14 @@ Run local LLMs on iGPU, APU and CPU (AMD, Intel, and Qualcomm (Coming Soon)). E
- **CUDA:** `ELLM_TARGET_DEVICE='cuda' pip install -e .[cuda]`
- **IPEX:** `ELLM_TARGET_DEVICE='ipex' python setup.py develop`
- **OpenVINO:** `ELLM_TARGET_DEVICE='openvino' pip install -e .[openvino]`
- **NPU:** `ELLM_TARGET_DEVICE='npu' pip install -e .[npu]`
- **With Web UI**:
- **DirectML:** `ELLM_TARGET_DEVICE='directml' pip install -e .[directml,webui]`
- **CPU:** `ELLM_TARGET_DEVICE='cpu' pip install -e .[cpu,webui]`
- **CUDA:** `ELLM_TARGET_DEVICE='cuda' pip install -e .[cuda,webui]`
- **IPEX:** `ELLM_TARGET_DEVICE='ipex' python setup.py develop; pip install -r requirements-webui.txt`
- **OpenVINO:** `ELLM_TARGET_DEVICE='openvino' pip install -e .[openvino,webui]`
- **NPU:** `ELLM_TARGET_DEVICE='npu' pip install -e .[npu,webui]`

### Launch OpenAI API Compatible Server

@@ -142,6 +147,9 @@ It is an interface that allows you to download and deploy OpenAI API compatible

# OpenVINO
ellm_server --model_path '.\meta-llama_Meta-Llama-3.1-8B-Instruct\' --backend 'openvino' --device 'gpu' --port 5555 --served_model_name 'meta-llama_Meta/Llama-3.1-8B-Instruct'

# NPU
ellm_server --model_path 'microsoft/Phi-3-mini-4k-instruct' --backend 'npu' --device 'npu' --port 5555 --served_model_name 'microsoft/Phi-3-mini-4k-instruct'
```
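
Once a server is up, any OpenAI-compatible client can talk to it. The following is a minimal sketch, assuming the standard `/v1/chat/completions` route on port 5555 as in the examples above; the `model` field must match `--served_model_name`:

```python
# Minimal client sketch for the server started above. Assumes it listens on
# localhost:5555 and serves the OpenAI-style /v1/chat/completions route;
# adjust the model name to whatever was passed as --served_model_name.
import requests

resp = requests.post(
    "http://localhost:5555/v1/chat/completions",
    json={
        "model": "microsoft/Phi-3-mini-4k-instruct",
        "messages": [{"role": "user", "content": "Say hello from the NPU."}],
        "max_tokens": 64,
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```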

## Prebuilt OpenAI API Compatible Windows Executable (Alpha)
@@ -161,6 +169,9 @@ _Powershell/Terminal Usage (Use it like `ellm_server`)_:

# OpenVINO
.\ellm_api_server.exe --model_path '.\meta-llama_Meta-Llama-3.1-8B-Instruct\' --backend 'openvino' --device 'gpu' --port 5555 --served_model_name 'meta-llama_Meta/Llama-3.1-8B-Instruct'

# NPU
.\ellm_api_server.exe --model_path 'microsoft/Phi-3-mini-4k-instruct' --backend 'npu' --device 'npu' --port 5555 --served_model_name 'microsoft/Phi-3-mini-4k-instruct'
```

## Acknowledgements
15 changes: 15 additions & 0 deletions docs/model/npu_models.md
@@ -0,0 +1,15 @@
# Model Powered by NPU-LLM

## Verified Models
Verified models can be found in the EmbeddedLLM NPU-LLM model collections:
* EmbeddedLLM NPU-LLM Model collections: [link](https://huggingface.co/collections/EmbeddedLLM/npu-llm-66d692817e6c9509bb8ead58)

| Model | Model Link |
| --- | --- |
| Phi-3-mini-4k-instruct | [link](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) |
| Phi-3-mini-128k-instruct | [link](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct) |
| Phi-3-medium-4k-instruct | [link](https://huggingface.co/microsoft/Phi-3-medium-4k-instruct) |
| Phi-3-medium-128k-instruct | [link](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct) |

## Contribution
We welcome contributions to the verified model list.
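
For reference, the engine added in this PR loads these checkpoints through `intel_npu_acceleration_library`. A stripped-down sketch of that loading path, outside the server, might look like the following (the keyword arguments mirror `npu_engine.py`, including int4 quantization):

```python
# Sketch of loading a verified model directly on the NPU, mirroring npu_engine.py.
import intel_npu_acceleration_library as npu_lib
from transformers import AutoTokenizer

model_id = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = npu_lib.NPUModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    dtype=npu_lib.int4,       # int4 weights on the NPU, as in the engine
    trust_remote_code=True,
    export=False,
)

inputs = tokenizer("What is an NPU?", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```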
3 changes: 3 additions & 0 deletions requirements-npu.txt
@@ -0,0 +1,3 @@
intel-npu-acceleration-library
torch>=2.4
transformers>=4.42
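
A quick way to confirm an environment satisfies these pins (assuming the import names match the package names above):

```python
# Sanity check for the NPU backend's dependencies.
import intel_npu_acceleration_library  # noqa: F401
import torch
import transformers

print("torch:", torch.__version__)                # expect >= 2.4
print("transformers:", transformers.__version__)  # expect >= 4.42
```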
9 changes: 9 additions & 0 deletions setup.py
@@ -54,6 +54,10 @@ def _is_openvino() -> bool:
return ELLM_TARGET_DEVICE == "openvino"


def _is_npu() -> bool:
return ELLM_TARGET_DEVICE == "npu"


class ELLMInstallCommand(install):
def run(self):
install.run(self)
@@ -186,6 +190,8 @@ def get_requirements() -> List[str]:
requirements = _read_requirements("requirements-ipex.txt")
elif _is_openvino():
requirements = _read_requirements("requirements-openvino.txt")
elif _is_npu():
requirements = _read_requirements("requirements-npu.txt")
else:
raise ValueError("Unsupported platform, please use CUDA, ROCm, Neuron, or CPU.")
return requirements
@@ -204,6 +210,8 @@ def get_ellm_version() -> str:
version += "+ipex"
elif _is_openvino():
version += "+openvino"
elif _is_npu():
version += "+npu"
else:
raise RuntimeError("Unknown runtime environment")

@@ -256,6 +264,7 @@ def get_ellm_version() -> str:
"cuda": ["onnxruntime-genai-cuda==0.3.0rc2"],
"ipex": [],
"openvino": [],
"npu": [],
},
dependency_links=dependency_links,
entry_points={
268 changes: 268 additions & 0 deletions src/embeddedllm/backend/npu_engine.py
@@ -0,0 +1,268 @@
import contextlib
Contributor

Can you rename npu_engine.py to intel_npu_engine.py, as this is NPU code for Intel only?

Do DM me on WhatsApp to discuss this if you think otherwise.

import time
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import AsyncIterator, List, Optional

from loguru import logger
from PIL import Image
from transformers import (
AutoConfig,
PreTrainedTokenizer,
PreTrainedTokenizerFast,
TextIteratorStreamer,
)

from threading import Thread

import intel_npu_acceleration_library as npu_lib

from embeddedllm.inputs import PromptInputs
from embeddedllm.protocol import CompletionOutput, RequestOutput
from embeddedllm.sampling_params import SamplingParams
from embeddedllm.backend.base_engine import BaseLLMEngine, _get_and_verify_max_len

RECORD_TIMING = True


class NPUEngine(BaseLLMEngine):
def __init__(self, model_path: str, vision: bool, device: str = "npu"):
self.model_path = model_path
self.model_config: AutoConfig = AutoConfig.from_pretrained(
self.model_path, trust_remote_code=True
)
self.device = device

# model_config is used to determine the model's maximum context length
self.max_model_len = _get_and_verify_max_len(
hf_config=self.model_config,
max_model_len=None,
disable_sliding_window=False,
sliding_window_len=self.get_hf_config_sliding_window(),
)

logger.info("Model Context Length: " + str(self.max_model_len))

try:
logger.info("Attempt to load fast tokenizer")
self.tokenizer = PreTrainedTokenizerFast.from_pretrained(self.model_path)
except Exception:
logger.info("Attempt to load slower tokenizer")
self.tokenizer = PreTrainedTokenizer.from_pretrained(self.model_path)

self.model = npu_lib.NPUModelForCausalLM.from_pretrained(
self.model_path,
torch_dtype="auto",
dtype=npu_lib.int4,
trust_remote_code=True,
export=False
)

logger.info("Model loaded")
self.tokenizer_stream = TextIteratorStreamer(
self.tokenizer, skip_prompt=True, skip_special_tokens=True
)
logger.info("Tokenizer created")

self.vision = vision

# if self.vision:
# self.onnx_processor = self.model.create_multimodal_processor()
# self.processor = AutoImageProcessor.from_pretrained(
# self.model_path, trust_remote_code=True
# )
# print(dir(self.processor))

async def generate_vision(
self,
inputs: PromptInputs,
sampling_params: SamplingParams,
request_id: str,
stream: bool = True,
) -> AsyncIterator[RequestOutput]:
raise NotImplementedError(f"generate_vision yet to be implemented.")

async def generate(
self,
inputs: PromptInputs,
sampling_params: SamplingParams,
request_id: str,
stream: bool = True,
) -> AsyncIterator[RequestOutput]:
"""Generate outputs for a request.

This method is a coroutine. It adds the
request into the waiting queue of the LLMEngine and streams the outputs
from the LLMEngine to the caller.

"""

prompt_text = inputs["prompt"]
input_token_length = None
input_tokens = None # for text only use case
# logger.debug("inputs: " + prompt_text)

input_tokens = self.tokenizer.encode(prompt_text, return_tensors="pt")
# logger.debug(f"input_tokens: {input_tokens}")
input_token_length = len(input_tokens[0])

max_tokens = sampling_params.max_tokens

assert input_token_length is not None

if input_token_length + max_tokens > self.max_model_len:
raise ValueError("Exceed Context Length")

generation_options = {
name: getattr(sampling_params, name)
for name in [
"do_sample",
# "max_length",
"max_new_tokens",
"min_length",
"top_p",
"top_k",
"temperature",
"repetition_penalty",
]
if hasattr(sampling_params, name)
}
generation_options["max_length"] = self.max_model_len
generation_options["input_ids"] = input_tokens.clone()
# generation_options["input_ids"] = input_tokens.clone().to(self.device)
generation_options["max_new_tokens"] = max_tokens
logger.debug(f"generation_options: {generation_options}")

token_list: List[int] = []
output_text: str = ""
if stream:
generation_options["streamer"] = self.tokenizer_stream
if RECORD_TIMING:
started_timestamp = time.time()
first_token_timestamp = 0
first = True
new_tokens = []
try:
thread = Thread(target=self.model.generate, kwargs=generation_options)
started_timestamp = time.time()
first_token_timestamp = None
thread.start()
output_text = ""
first = True
for new_text in self.tokenizer_stream:
if new_text == "":
continue
if RECORD_TIMING:
if first:
first_token_timestamp = time.time()
first = False
# logger.debug(f"new text: {new_text}")
output_text += new_text
token_list = self.tokenizer.encode(output_text, return_tensors="pt")

output = RequestOutput(
request_id=request_id,
prompt=prompt_text,
prompt_token_ids=input_tokens[0],
finished=False,
outputs=[
CompletionOutput(
index=0,
text=output_text,
token_ids=token_list[0],
cumulative_logprob=-1.0,
)
],
)
yield output
# logits = generator.get_output("logits")
# print(logits)
if RECORD_TIMING:
new_tokens = token_list[0]

yield RequestOutput(
request_id=request_id,
prompt=prompt_text,
prompt_token_ids=input_tokens[0],
finished=True,
outputs=[
CompletionOutput(
index=0,
text=output_text,
token_ids=token_list[0],
cumulative_logprob=-1.0,
finish_reason="stop",
)
],
)
if RECORD_TIMING:
prompt_time = first_token_timestamp - started_timestamp
run_time = time.time() - first_token_timestamp
logger.info(
f"Prompt length: {len(input_tokens[0])}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens[0])/prompt_time:.2f} tps, New tokens per second: {len(new_tokens)/run_time:.2f} tps"
)

except Exception as e:
logger.error(str(e))

error_output = RequestOutput(
prompt=prompt_text,
prompt_token_ids=input_tokens[0],
finished=True,
request_id=request_id,
outputs=[
CompletionOutput(
index=0,
text=output_text,
token_ids=token_list,
cumulative_logprob=-1.0,
finish_reason="error",
stop_reason=str(e),
)
],
)
yield error_output
else:
try:
token_list = self.model.generate(**generation_options)[0]

output_text = self.tokenizer.decode(
token_list[input_token_length:], skip_special_tokens=True
)

yield RequestOutput(
request_id=request_id,
prompt=prompt_text,
prompt_token_ids=input_tokens[0],
finished=True,
outputs=[
CompletionOutput(
index=0,
text=output_text,
token_ids=token_list,
cumulative_logprob=-1.0,
finish_reason="stop",
)
],
)

except Exception as e:
logger.error(str(e))

error_output = RequestOutput(
prompt=prompt_text,
prompt_token_ids=input_tokens[0],
finished=True,
request_id=request_id,
outputs=[
CompletionOutput(
index=0,
text=output_text,
token_ids=token_list,
cumulative_logprob=-1.0,
finish_reason="error",
stop_reason=str(e),
)
],
)
yield error_output
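
For orientation, this is a hedged sketch of driving `NPUEngine` directly, without the server. The argument names follow the code above; the `SamplingParams` constructor signature shown here is an assumption:

```python
# Hypothetical direct usage of NPUEngine; argument names follow npu_engine.py,
# but the SamplingParams signature shown here is assumed, not documented.
import asyncio

from embeddedllm.backend.npu_engine import NPUEngine
from embeddedllm.sampling_params import SamplingParams


async def main():
    engine = NPUEngine("microsoft/Phi-3-mini-4k-instruct", vision=False, device="npu")
    params = SamplingParams(max_tokens=64, temperature=0.7, top_p=0.9)
    final_text = ""
    async for out in engine.generate(
        {"prompt": "Explain what an NPU is in one sentence."},
        params,
        request_id="demo-1",
        stream=True,
    ):
        final_text = out.outputs[0].text  # each yield carries the text generated so far
    print(final_text)


asyncio.run(main())
```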