diff --git a/.gitignore b/.gitignore
index a25e14a..9a79f19 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,4 +11,8 @@ scripts/*.ps1
scripts/*.sh
**/dist
**/build
-*.log
\ No newline at end of file
+*.log
+benchmark/
+modelTest/
+nc_workspace/
+debug_openai_history.txt
\ No newline at end of file
diff --git a/README.md b/README.md
index 4e4c883..b96bedd 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@ Run local LLMs on iGPU, APU and CPU (AMD , Intel, and Qualcomm (Coming Soon)). E
| Model architectures | Gemma<br/>Llama \*<br/>Mistral +<br/>Phi | | |
| Platform | Linux<br/>Windows | | |
| Architecture | x86<br/>x64 | Arm64 | |
-| Hardware Acceleration | CUDA<br/>DirectML<br/>IpexLLM | QNN<br/>ROCm | OpenVINO |
+| Hardware Acceleration | CUDA<br/>DirectML<br/>IpexLLM<br/>OpenVINO | QNN<br/>ROCm | |
\* The Llama model architecture supports similar model families such as CodeLlama, Vicuna, Yi, and more.
@@ -33,22 +33,12 @@ Run local LLMs on iGPU, APU and CPU (AMD , Intel, and Qualcomm (Coming Soon)). E
- [Acknowledgements](#acknowledgements)
## Supported Models (Quick Start)
+ * Onnxruntime DirectML Models [Link](./docs/model/onnxruntime_directml_models.md)
+ * Onnxruntime CPU Models [Link](./docs/model/onnxruntime_cpu_models.md)
+ * Ipex-LLM Models [Link](./docs/model/ipex_models.md)
+ * OpenVINO-LLM Models [Link](./docs/model/openvino_models.md)
+ * NPU-LLM Models [Link](./docs/model/npu_models.md)
-| Models | Parameters | Context Length | Link |
-| --- | --- | --- | --- |
-| Gemma-2b-Instruct v1 | 2B | 8192 | [EmbeddedLLM/gemma-2b-it-onnx](https://huggingface.co/EmbeddedLLM/gemma-2b-it-onnx) |
-| Llama-2-7b-chat | 7B | 4096 | [EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-7b-chat-int4-onnx-directml) |
-| Llama-2-13b-chat | 13B | 4096 | [EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml](https://huggingface.co/EmbeddedLLM/llama-2-13b-chat-int4-onnx-directml) |
-| Llama-3-8b-chat | 8B | 8192 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) |
-| Mistral-7b-v0.3-instruct | 7B | 32768 | [EmbeddedLLM/mistral-7b-instruct-v0.3-onnx](https://huggingface.co/EmbeddedLLM/mistral-7b-instruct-v0.3-onnx) |
-| Phi-3-mini-4k-instruct-062024 | 3.8B | 4096 | [EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx](https://huggingface.co/EmbeddedLLM/Phi-3-mini-4k-instruct-062024-onnx/tree/main/onnx/directml/Phi-3-mini-4k-instruct-062024-int4) |
-| Phi3-mini-4k-instruct | 3.8B | 4096 | [microsoft/Phi-3-mini-4k-instruct-onnx](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-onnx) |
-| Phi3-mini-128k-instruct | 3.8B | 128k | [microsoft/Phi-3-mini-128k-instruct-onnx](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct-onnx) |
-| Phi3-medium-4k-instruct | 17B | 4096 | [microsoft/Phi-3-medium-4k-instruct-onnx-directml](https://huggingface.co/microsoft/Phi-3-medium-4k-instruct-onnx-directml) |
-| Phi3-medium-128k-instruct | 17B | 128k | [microsoft/Phi-3-medium-128k-instruct-onnx-directml](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct-onnx-directml) |
-| Openchat-3.6-8b | 8B | 8192 | [EmbeddedLLM/openchat-3.6-8b-20240522-onnx](https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx) |
-| Yi-1.5-6b-chat | 6B | 32k | [EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-onnx](https://huggingface.co/EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-onnx) |
-| Phi-3-vision-128k-instruct | | 128k | [EmbeddedLLM/Phi-3-vision-128k-instruct-onnx](https://huggingface.co/EmbeddedLLM/Phi-3-vision-128k-instruct-onnx/tree/main/onnx/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4) |
## Getting Started
@@ -70,12 +60,14 @@ Run local LLMs on iGPU, APU and CPU (AMD , Intel, and Qualcomm (Coming Soon)). E
- **CUDA:** `$env:ELLM_TARGET_DEVICE='cuda'; pip install -e .[cuda]`
- **IPEX:** `$env:ELLM_TARGET_DEVICE='ipex'; python setup.py develop`
- **OpenVINO:** `$env:ELLM_TARGET_DEVICE='openvino'; pip install -e .[openvino]`
+ - **NPU:** `$env:ELLM_TARGET_DEVICE='npu'; pip install -e .[npu]`
- **With Web UI**:
- **DirectML:** `$env:ELLM_TARGET_DEVICE='directml'; pip install -e .[directml,webui]`
- **CPU:** `$env:ELLM_TARGET_DEVICE='cpu'; pip install -e .[cpu,webui]`
- **CUDA:** `$env:ELLM_TARGET_DEVICE='cuda'; pip install -e .[cuda,webui]`
- **IPEX:** `$env:ELLM_TARGET_DEVICE='ipex'; python setup.py develop; pip install -r requirements-webui.txt`
- **OpenVINO:** `$env:ELLM_TARGET_DEVICE='openvino'; pip install -e .[openvino,webui]`
+ - **NPU:** `$env:ELLM_TARGET_DEVICE='npu'; pip install -e .[npu,webui]`
- **Linux**
@@ -91,12 +83,14 @@ Run local LLMs on iGPU, APU and CPU (AMD , Intel, and Qualcomm (Coming Soon)). E
- **CUDA:** `ELLM_TARGET_DEVICE='cuda' pip install -e .[cuda]`
- **IPEX:** `ELLM_TARGET_DEVICE='ipex' python setup.py develop`
- **OpenVINO:** `ELLM_TARGET_DEVICE='openvino' pip install -e .[openvino]`
+ - **NPU:** `ELLM_TARGET_DEVICE='npu' pip install -e .[npu]`
- **With Web UI**:
- **DirectML:** `ELLM_TARGET_DEVICE='directml' pip install -e .[directml,webui]`
- **CPU:** `ELLM_TARGET_DEVICE='cpu' pip install -e .[cpu,webui]`
- **CUDA:** `ELLM_TARGET_DEVICE='cuda' pip install -e .[cuda,webui]`
- **IPEX:** `ELLM_TARGET_DEVICE='ipex' python setup.py develop; pip install -r requirements-webui.txt`
- **OpenVINO:** `ELLM_TARGET_DEVICE='openvino' pip install -e .[openvino,webui]`
+ - **NPU:** `ELLM_TARGET_DEVICE='npu' pip install -e .[npu,webui]`
### Launch OpenAI API Compatible Server
@@ -121,7 +115,7 @@ Run local LLMs on iGPU, APU and CPU (AMD , Intel, and Qualcomm (Coming Soon)). E
### Launch Chatbot Web UI
-1. `ellm_chatbot --port 7788 --host localhost --server_port --server_host localhost`. **Note:** To find out more of the supported arguments. `ellm_chatbot --help`.
+1. `ellm_chatbot --port 7788 --host localhost --server_port --server_host localhost --model_name `. **Note:** Run `ellm_chatbot --help` to see all supported arguments.
![asset/ellm_chatbot_vid.webp](asset/ellm_chatbot_vid.webp)
@@ -156,6 +150,9 @@ It is an interface that allows you to download and deploy OpenAI API compatible
# OpenVINO
ellm_server --model_path '.\meta-llama_Meta-Llama-3.1-8B-Instruct\' --backend 'openvino' --device 'gpu' --port 5555 --served_model_name 'meta-llama_Meta/Llama-3.1-8B-Instruct'
+
+ # NPU
+ ellm_server --model_path 'microsoft/Phi-3-mini-4k-instruct' --backend 'npu' --device 'npu' --port 5555 --served_model_name 'microsoft/Phi-3-mini-4k-instruct'
```
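+
+Once a server is running, it can be queried with any OpenAI-compatible client. Below is a minimal sketch in Python (assuming the `openai` package v1+ is installed, the server above is listening on port 5555, and the placeholder API key is accepted):
+
+```python
+from openai import OpenAI
+
+# Point the client at the local EmbeddedLLM server instead of api.openai.com.
+client = OpenAI(base_url="http://localhost:5555/v1", api_key="EMPTY")
+
+response = client.chat.completions.create(
+    model="microsoft/Phi-3-mini-4k-instruct",  # must match --served_model_name
+    messages=[{"role": "user", "content": "Hello!"}],
+    max_tokens=64,
+)
+print(response.choices[0].message.content)
+```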
## Prebuilt OpenAI API Compatible Windows Executable (Alpha)
@@ -168,13 +165,16 @@ _Powershell/Terminal Usage (Use it like `ellm_server`)_:
.\ellm_api_server.exe --model_path
# DirectML
-.\ellm_api_server.exe --model_path 'EmbeddedLLM_Phi-3-mini-4k-instruct-062024-onnx\onnx\directml\Phi-3-mini-4k-instruct-062024-int4' --port 5555
+.\ellm_api_server.exe --model_path 'EmbeddedLLM/Phi-3-mini-4k-instruct-onnx-directml' --port 5555
# IPEX-LLM
.\ellm_api_server.exe --model_path '.\meta-llama_Meta-Llama-3.1-8B-Instruct\' --backend 'ipex' --device 'xpu' --port 5555 --served_model_name 'meta-llama_Meta/Llama-3.1-8B-Instruct'
# OpenVINO
.\ellm_api_server.exe --model_path '.\meta-llama_Meta-Llama-3.1-8B-Instruct\' --backend 'openvino' --device 'gpu' --port 5555 --served_model_name 'meta-llama_Meta/Llama-3.1-8B-Instruct'
+
+# NPU
+.\ellm_api_server.exe --model_path 'microsoft/Phi-3-mini-4k-instruct' --backend 'npu' --device 'npu' --port 5555 --served_model_name 'microsoft/Phi-3-mini-4k-instruct'
```
## Acknowledgements
diff --git a/docs/model/npu_models.md b/docs/model/npu_models.md
new file mode 100644
index 0000000..c1d2b06
--- /dev/null
+++ b/docs/model/npu_models.md
@@ -0,0 +1,15 @@
+# Model Powered by NPU-LLM
+
+## Verified Models
+Verified models can be found in the EmbeddedLLM NPU-LLM model collection.
+* EmbeddedLLM NPU-LLM Model collections: [link](https://huggingface.co/collections/EmbeddedLLM/npu-llm-66d692817e6c9509bb8ead58)
+
+| Model | Model Link |
+| --- | --- |
+| Phi-3-mini-4k-instruct | [link](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) |
+| Phi-3-mini-128k-instruct | [link](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct) |
+| Phi-3-medium-4k-instruct | [link](https://huggingface.co/microsoft/Phi-3-medium-4k-instruct) |
+| Phi-3-medium-128k-instruct | [link](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct) |
+
+## Contribution
+We welcome contributions to the verified model list.
\ No newline at end of file
diff --git a/requirements-npu.txt b/requirements-npu.txt
new file mode 100644
index 0000000..dbcb8cf
--- /dev/null
+++ b/requirements-npu.txt
@@ -0,0 +1,3 @@
+intel-npu-acceleration-library
+torch>=2.4
+transformers>=4.42
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 4520ee6..50ce2f9 100644
--- a/setup.py
+++ b/setup.py
@@ -54,6 +54,10 @@ def _is_openvino() -> bool:
return ELLM_TARGET_DEVICE == "openvino"
+def _is_npu() -> bool:
+ return ELLM_TARGET_DEVICE == "npu"
+
+
class ELLMInstallCommand(install):
def run(self):
install.run(self)
@@ -198,6 +202,8 @@ def get_requirements() -> List[str]:
requirements = _read_requirements("requirements-ipex.txt")
elif _is_openvino():
requirements = _read_requirements("requirements-openvino.txt")
+ elif _is_npu():
+ requirements = _read_requirements("requirements-npu.txt")
else:
raise ValueError("Unsupported platform, please use CUDA, ROCm, Neuron, or CPU.")
return requirements
@@ -216,6 +222,8 @@ def get_ellm_version() -> str:
version += "+ipex"
elif _is_openvino():
version += "+openvino"
+ elif _is_npu():
+ version += "+npu"
else:
raise RuntimeError("Unknown runtime environment")
@@ -268,6 +276,7 @@ def get_ellm_version() -> str:
"cuda": ["onnxruntime-genai-cuda==0.3.0rc2"],
"ipex": [],
"openvino": [],
+ "npu": [],
},
dependency_links=dependency_links,
entry_points={
diff --git a/src/embeddedllm/backend/intel_npu_engine.py b/src/embeddedllm/backend/intel_npu_engine.py
new file mode 100644
index 0000000..c245e43
--- /dev/null
+++ b/src/embeddedllm/backend/intel_npu_engine.py
@@ -0,0 +1,268 @@
+import contextlib
+import time
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from typing import AsyncIterator, List, Optional
+
+from loguru import logger
+from PIL import Image
+from transformers import (
+ AutoConfig,
+ PreTrainedTokenizer,
+ PreTrainedTokenizerFast,
+ TextIteratorStreamer,
+)
+
+from threading import Thread
+
+import intel_npu_acceleration_library as npu_lib
+
+from embeddedllm.inputs import PromptInputs
+from embeddedllm.protocol import CompletionOutput, RequestOutput
+from embeddedllm.sampling_params import SamplingParams
+from embeddedllm.backend.base_engine import BaseLLMEngine, _get_and_verify_max_len
+
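+# When True, log time-to-first-token and tokens-per-second statistics for each streamed request.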
+RECORD_TIMING = True
+
+
+class NPUEngine(BaseLLMEngine):
+ def __init__(self, model_path: str, vision: bool, device: str = "npu"):
+ self.model_path = model_path
+ self.model_config: AutoConfig = AutoConfig.from_pretrained(
+ self.model_path, trust_remote_code=True
+ )
+ self.device = device
+
+ # model_config is to find out the max length of the model
+ self.max_model_len = _get_and_verify_max_len(
+ hf_config=self.model_config,
+ max_model_len=None,
+ disable_sliding_window=False,
+ sliding_window_len=self.get_hf_config_sliding_window(),
+ )
+
+ logger.info("Model Context Length: " + str(self.max_model_len))
+
+ try:
+ logger.info("Attempt to load fast tokenizer")
+ self.tokenizer = PreTrainedTokenizerFast.from_pretrained(self.model_path)
+ except Exception:
+ logger.info("Attempt to load slower tokenizer")
+ self.tokenizer = PreTrainedTokenizer.from_pretrained(self.model_path)
+
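+        # Load the causal LM through intel-npu-acceleration-library, requesting int4 weight quantization for the NPU.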
+ self.model = npu_lib.NPUModelForCausalLM.from_pretrained(
+ self.model_path,
+ torch_dtype="auto",
+ dtype=npu_lib.int4,
+ trust_remote_code=True,
+ export=False
+ )
+
+ logger.info("Model loaded")
+ self.tokenizer_stream = TextIteratorStreamer(
+ self.tokenizer, skip_prompt=True, skip_special_tokens=True
+ )
+ logger.info("Tokenizer created")
+
+ self.vision = vision
+
+ # if self.vision:
+ # self.onnx_processor = self.model.create_multimodal_processor()
+ # self.processor = AutoImageProcessor.from_pretrained(
+ # self.model_path, trust_remote_code=True
+ # )
+ # print(dir(self.processor))
+
+ async def generate_vision(
+ self,
+ inputs: PromptInputs,
+ sampling_params: SamplingParams,
+ request_id: str,
+ stream: bool = True,
+ ) -> AsyncIterator[RequestOutput]:
+        raise NotImplementedError("generate_vision is not implemented yet.")
+
+ async def generate(
+ self,
+ inputs: PromptInputs,
+ sampling_params: SamplingParams,
+ request_id: str,
+ stream: bool = True,
+ ) -> AsyncIterator[RequestOutput]:
+ """Generate outputs for a request.
+
+ Generate outputs for a request. This method is a coroutine. It adds the
+ request into the waiting queue of the LLMEngine and streams the outputs
+ from the LLMEngine to the caller.
+
+ """
+
+ prompt_text = inputs["prompt"]
+ input_token_length = None
+ input_tokens = None # for text only use case
+ # logger.debug("inputs: " + prompt_text)
+
+ input_tokens = self.tokenizer.encode(prompt_text, return_tensors="pt")
+ # logger.debug(f"input_tokens: {input_tokens}")
+ input_token_length = len(input_tokens[0])
+
+ max_tokens = sampling_params.max_tokens
+
+ assert input_token_length is not None
+
+ if input_token_length + max_tokens > self.max_model_len:
+ raise ValueError("Exceed Context Length")
+
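+        # Build HF generate() kwargs from whichever sampling parameters are present on sampling_params.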
+ generation_options = {
+ name: getattr(sampling_params, name)
+ for name in [
+ "do_sample",
+ # "max_length",
+ "max_new_tokens",
+ "min_length",
+ "top_p",
+ "top_k",
+ "temperature",
+ "repetition_penalty",
+ ]
+ if hasattr(sampling_params, name)
+ }
+ generation_options["max_length"] = self.max_model_len
+ generation_options["input_ids"] = input_tokens.clone()
+ # generation_options["input_ids"] = input_tokens.clone().to(self.device)
+ generation_options["max_new_tokens"] = max_tokens
+        logger.debug(f"generation_options: {generation_options}")
+
+ token_list: List[int] = []
+ output_text: str = ""
+ if stream:
+ generation_options["streamer"] = self.tokenizer_stream
+            if RECORD_TIMING:
+                started_timestamp = time.time()
+                first_token_timestamp = 0
+                first = True
+                new_tokens = []
+            try:
+                # Generation runs on a worker thread; tokens are consumed from the streamer below.
+                thread = Thread(target=self.model.generate, kwargs=generation_options)
+                thread.start()
+ for new_text in self.tokenizer_stream:
+ if new_text == "":
+ continue
+ if RECORD_TIMING:
+ if first:
+ first_token_timestamp = time.time()
+ first = False
+ # logger.debug(f"new text: {new_text}")
+ output_text += new_text
+ token_list = self.tokenizer.encode(output_text, return_tensors="pt")
+
+ output = RequestOutput(
+ request_id=request_id,
+ prompt=prompt_text,
+ prompt_token_ids=input_tokens[0],
+ finished=False,
+ outputs=[
+ CompletionOutput(
+ index=0,
+ text=output_text,
+ token_ids=token_list[0],
+ cumulative_logprob=-1.0,
+ )
+ ],
+ )
+ yield output
+ # logits = generator.get_output("logits")
+ # print(logits)
+ if RECORD_TIMING:
+ new_tokens = token_list[0]
+
+ yield RequestOutput(
+ request_id=request_id,
+ prompt=prompt_text,
+ prompt_token_ids=input_tokens[0],
+ finished=True,
+ outputs=[
+ CompletionOutput(
+ index=0,
+ text=output_text,
+ token_ids=token_list[0],
+ cumulative_logprob=-1.0,
+ finish_reason="stop",
+ )
+ ],
+ )
+ if RECORD_TIMING:
+ prompt_time = first_token_timestamp - started_timestamp
+ run_time = time.time() - first_token_timestamp
+ logger.info(
+ f"Prompt length: {len(input_tokens[0])}, New tokens: {len(new_tokens)}, Time to first: {(prompt_time):.2f}s, Prompt tokens per second: {len(input_tokens[0])/prompt_time:.2f} tps, New tokens per second: {len(new_tokens)/run_time:.2f} tps"
+ )
+
+ except Exception as e:
+ logger.error(str(e))
+
+                error_output = RequestOutput(
+                    prompt=prompt_text,
+                    prompt_token_ids=input_tokens[0],
+ finished=True,
+ request_id=request_id,
+ outputs=[
+ CompletionOutput(
+ index=0,
+ text=output_text,
+ token_ids=token_list,
+ cumulative_logprob=-1.0,
+ finish_reason="error",
+ stop_reason=str(e),
+ )
+ ],
+ )
+ yield error_output
+ else:
+ try:
+ token_list = self.model.generate(**generation_options)[0]
+
+ output_text = self.tokenizer.decode(
+ token_list[input_token_length:], skip_special_tokens=True
+ )
+
+ yield RequestOutput(
+ request_id=request_id,
+ prompt=prompt_text,
+ prompt_token_ids=input_tokens[0],
+ finished=True,
+ outputs=[
+ CompletionOutput(
+ index=0,
+ text=output_text,
+ token_ids=token_list,
+ cumulative_logprob=-1.0,
+ finish_reason="stop",
+ )
+ ],
+ )
+
+ except Exception as e:
+ logger.error(str(e))
+
+ error_output = RequestOutput(
+ prompt=prompt_text,
+ prompt_token_ids=input_tokens[0],
+ finished=True,
+ request_id=request_id,
+ outputs=[
+ CompletionOutput(
+ index=0,
+ text=output_text,
+ token_ids=token_list,
+ cumulative_logprob=-1.0,
+ finish_reason="error",
+ stop_reason=str(e),
+ )
+ ],
+ )
+ yield error_output
\ No newline at end of file
diff --git a/src/embeddedllm/engine.py b/src/embeddedllm/engine.py
index e2c5a9d..b341472 100644
--- a/src/embeddedllm/engine.py
+++ b/src/embeddedllm/engine.py
@@ -56,6 +56,22 @@ def __init__(self, model_path: str, vision: bool, device: str = "xpu", backend:
self.engine = OnnxruntimeEngine(self.model_path, self.vision, self.device)
logger.info(f"Initializing onnxruntime backend ({backend.upper()}): OnnxruntimeEngine")
+
+        elif self.backend == "npu":
+            assert self.device == "npu", "To run `npu` backend, `device` must be `npu`."
+            processor = get_processor_type()
+            if processor == "Intel":
+                from embeddedllm.backend.intel_npu_engine import NPUEngine
+
+                self.engine = NPUEngine(self.model_path, self.vision, self.device)
+                logger.info("Initializing Intel NPU backend: NPUEngine")
+
+            elif processor == "AMD":
+                raise SystemError("NPU backend is not supported on AMD platforms yet.")
+
+            else:
+                raise SystemError("Unknown processor type; the `npu` backend currently supports Intel NPUs only.")
+
elif self.backend == "cpu":
assert self.device == "cpu", f"To run `cpu` backend, `device` must be `cpu`."
processor = get_processor_type()
@@ -80,7 +96,7 @@ def __init__(self, model_path: str, vision: bool, device: str = "xpu", backend:
else:
raise ValueError(
- f"EmbeddedLLMEngine only supports `cpu`, `ipex`, `cuda`, `openvino` and `directml`."
+ f"EmbeddedLLMEngine only supports `cpu`, `npu`, `ipex`, `cuda`, `openvino` and `directml`."
)
self.tokenizer = self.engine.tokenizer
diff --git a/src/embeddedllm/entrypoints/modelui.py b/src/embeddedllm/entrypoints/modelui.py
index 9c82355..81cb681 100644
--- a/src/embeddedllm/entrypoints/modelui.py
+++ b/src/embeddedllm/entrypoints/modelui.py
@@ -20,7 +20,7 @@ def get_embeddedllm_backend():
version = importlib.metadata.version("embeddedllm")
# Use regex to extract the backend
- match = re.search(r"\+(directml|cpu|cuda|ipex|openvino)$", version)
+ match = re.search(r"\+(directml|npu|cpu|cuda|ipex|openvino)$", version)
if match:
backend = match.group(1)
@@ -260,6 +260,41 @@ class ModelCard(BaseModel):
),
}
+npu_model_dict_list = {
+ "microsoft/Phi-3-mini-4k-instruct": ModelCard(
+ hf_url="https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/tree/main/",
+ repo_id="microsoft/Phi-3-mini-4k-instruct",
+ model_name="Phi-3-mini-4k-instruct",
+ subfolder=".",
+ repo_type="model",
+ context_length=4096,
+ ),
+ "microsoft/Phi-3-mini-128k-instruct": ModelCard(
+ hf_url="https://huggingface.co/microsoft/Phi-3-mini-128k-instruct/tree/main",
+ repo_id="microsoft/Phi-3-mini-128k-instruct",
+ model_name="Phi-3-mini-128k-instruct",
+ subfolder=".",
+ repo_type="model",
+ context_length=131072,
+ ),
+ "microsoft/Phi-3-medium-4k-instruct": ModelCard(
+ hf_url="https://huggingface.co/microsoft/Phi-3-medium-4k-instruct/tree/main",
+ repo_id="microsoft/Phi-3-medium-4k-instruct",
+ model_name="Phi-3-medium-4k-instruct",
+ subfolder=".",
+ repo_type="model",
+ context_length=4096,
+ ),
+ "microsoft/Phi-3-medium-128k-instruct": ModelCard(
+ hf_url="https://huggingface.co/microsoft/Phi-3-medium-128k-instruct/tree/main",
+ repo_id="microsoft/Phi-3-medium-128k-instruct",
+ model_name="Phi-3-medium-128k-instruct",
+ subfolder=".",
+ repo_type="model",
+ context_length=131072,
+ ),
+}
+
ipex_model_dict_list = {
"microsoft/Phi-3-mini-4k-instruct": ModelCard(
hf_url="https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/tree/main/",
@@ -507,6 +542,11 @@ def compute_memory_size(repo_id, path_in_repo, repo_type: str = "model"):
repo_id=v.repo_id, path_in_repo=v.subfolder, repo_type=v.repo_type
)
+for k, v in npu_model_dict_list.items():
+ v.size = compute_memory_size(
+ repo_id=v.repo_id, path_in_repo=v.subfolder, repo_type=v.repo_type
+ )
+
for k, v in ipex_model_dict_list.items():
v.size = compute_memory_size(
repo_id=v.repo_id, path_in_repo=v.subfolder, repo_type=v.repo_type
@@ -603,6 +643,9 @@ def update_model_list(engine_type):
if engine_type == "DirectML":
models = sorted(list(dml_model_dict_list.keys()))
models_pandas = convert_to_dataframe(dml_model_dict_list)
+ elif backend == "npu":
+ models = sorted(list(npu_model_dict_list.keys()))
+ models_pandas = convert_to_dataframe(npu_model_dict_list)
elif backend == "ipex":
models = sorted(list(ipex_model_dict_list.keys()))
models_pandas = convert_to_dataframe(ipex_model_dict_list)
@@ -631,6 +674,8 @@ def deploy_model(engine_type, model_name, port_number):
if engine_type == "DirectML":
llm_model_card = dml_model_dict_list[model_name]
+ elif backend == "npu":
+ llm_model_card = npu_model_dict_list[model_name]
elif backend == "ipex":
llm_model_card = ipex_model_dict_list[model_name]
elif backend == "openvino":
@@ -654,7 +699,9 @@ def deploy_model(engine_type, model_name, port_number):
model_path = llm_model_card.repo_id
print("Model path:", model_path)
- if engine_type == "Ipex":
+ if engine_type == "NPU":
+ device = "npu"
+ elif engine_type == "Ipex":
device = "xpu"
elif engine_type == "OpenVino":
device = "gpu"
@@ -718,6 +765,8 @@ def download_model(engine_type, model_name):
if engine_type == "DirectML":
llm_model_card = dml_model_dict_list[model_name]
+ elif backend == "npu":
+ llm_model_card = npu_model_dict_list[model_name]
elif backend == "ipex":
llm_model_card = ipex_model_dict_list[model_name]
elif backend == "openvino":
@@ -771,6 +820,8 @@ def main():
if backend == "directml":
default_value = "DirectML"
+ elif backend == "npu":
+ default_value = "NPU"
elif backend == "ipex":
default_value = "Ipex"
elif backend == "openvino":