diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml
index c2674b914f485..9858d00cfb5c1 100644
--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@@ -50,6 +50,5 @@ jobs:
         mypy vllm/transformers_utils --config-file pyproject.toml
         mypy vllm/usage --config-file pyproject.toml
         mypy vllm/worker --config-file pyproject.toml
-        mypy vllm/hpu --config-file pyproject.toml
diff --git a/format.sh b/format.sh
index fbfc27a68bb3d..5ad6d6f2938bb 100755
--- a/format.sh
+++ b/format.sh
@@ -113,7 +113,6 @@ mypy vllm/spec_decode --config-file pyproject.toml
 mypy vllm/transformers_utils --config-file pyproject.toml
 mypy vllm/usage --config-file pyproject.toml
 mypy vllm/worker --config-file pyproject.toml
-mypy vllm/hpu --config-file pyproject.toml
 
 
 # If git diff returns a file that is in the skip list, the file may be checked anyway:
diff --git a/requirements-hpu.txt b/requirements-hpu.txt
index e0f03c8464c7b..d451200aa1144 100644
--- a/requirements-hpu.txt
+++ b/requirements-hpu.txt
@@ -6,3 +6,4 @@ ray == 2.32.0
 triton
 pandas
 tabulate
+vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@30ee2d1
diff --git a/tests/lora/test_lora_hpu.py b/tests/lora/test_lora_hpu.py
index 01b6472745e1c..c8ecaef1a6316 100644
--- a/tests/lora/test_lora_hpu.py
+++ b/tests/lora/test_lora_hpu.py
@@ -1,7 +1,7 @@
 import pytest
 import torch
+from vllm_hpu_extension.ops import LoraMask
 
-from vllm.hpu.ops import LoraMask
 from vllm.lora.layers import _apply_lora, _apply_lora_packed_nslice
 
 from .utils import DummyLoRAManager
diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py
index 56b71a431aca7..b7b8072de3fe5 100644
--- a/vllm/attention/backends/habana_attn.py
+++ b/vllm/attention/backends/habana_attn.py
@@ -7,14 +7,14 @@
 from typing import Any, Dict, List, Optional, Tuple, Type
 
 import torch
+import vllm_hpu_extension.ops as ops
+from vllm_hpu_extension import cache_ops
+from vllm_hpu_extension.utils import Matmul, Softmax, VLLMKVCache
 
-import vllm.hpu.ops as ops
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionMetadata, AttentionType)
 from vllm.attention.ops.habana_paged_attn import (HabanaPagedAttention,
                                                   HabanaPagedAttentionMetadata)
-from vllm.hpu import cache_ops
-from vllm.hpu.utils import Matmul, Softmax, VLLMKVCache
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
diff --git a/vllm/attention/ops/habana_paged_attn.py b/vllm/attention/ops/habana_paged_attn.py
index cab8d7abe95fd..49a3e3f774d58 100644
--- a/vllm/attention/ops/habana_paged_attn.py
+++ b/vllm/attention/ops/habana_paged_attn.py
@@ -6,8 +6,7 @@
 from typing import Dict, List, Optional, Tuple
 
 import torch
-
-from vllm.hpu import cache_ops, ops
+from vllm_hpu_extension import cache_ops, ops
 
 # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
 _PARTITION_SIZE = 512
diff --git a/vllm/hpu/__init__.py b/vllm/hpu/__init__.py
deleted file mode 100644
index b8e4d3aac98a7..0000000000000
--- a/vllm/hpu/__init__.py
+++ /dev/null
@@ -1,6 +0,0 @@
-###############################################################################
-# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-###############################################################################
diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py
deleted file mode 100644
index 9042924f68b3d..0000000000000
--- a/vllm/hpu/cache_ops.py
+++ /dev/null
@@ -1,107 +0,0 @@
-###############################################################################
-# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-###############################################################################
-
-import math
-
-import habana_frameworks.torch as htorch
-import torch
-
-
-def reshape_and_cache(key,
-                      value,
-                      key_cache,
-                      value_cache,
-                      slot_mapping,
-                      dtype,
-                      is_prompt=False):
-    num_blocks = key_cache.size(0)
-    block_size = key_cache.size(1)
-    slot_mapping = slot_mapping.flatten()
-    indices = torch.div(slot_mapping, block_size, rounding_mode="floor")
-    offsets = torch.fmod(slot_mapping, block_size)
-    num_slots_requested = slot_mapping.size(0)
-    num_slots_available = num_blocks * block_size
-    # NOTE(kzawora): HPU PT bridge crashes with
-    # RuntimeError: Invalid inputs for scatter_nd_onnx
-    # on index_put when num_slots_requested > num_slots_available.
-    # This case might occur when we have little kv cache blocks and
-    # lots of padding, or are doing warmup.
-    # This loop is a workaround for this issue. Please remove it
-    # once key_cache.index_put_(indices, offsets), key) works.
-    num_kv_cache_passes = math.ceil(num_slots_requested / num_slots_available)
-    for i in range(num_kv_cache_passes):
-        start_idx = i * num_slots_available
-        end_idx = (i + 1) * num_slots_available
-        key_cache.index_put_(
-            (indices[start_idx:end_idx], offsets[start_idx:end_idx]),
-            key[start_idx:end_idx])
-        value_cache.index_put_(
-            (indices[start_idx:end_idx], offsets[start_idx:end_idx]),
-            value[start_idx:end_idx])
-
-
-def prepare_to_cache(cache, slot_mapping):
-    num_blocks = cache.size(0)
-    block_size = cache.size(1)
-    slot_mapping = slot_mapping.flatten()
-    indices = torch.div(slot_mapping, block_size, rounding_mode="floor")
-    offsets = torch.fmod(slot_mapping, block_size)
-    num_slots_requested = slot_mapping.size(0)
-    num_slots_available = num_blocks * block_size
-    # NOTE(kzawora): HPU PT bridge crashes with
-    # RuntimeError: Invalid inputs for scatter_nd_onnx
-    # on index_put when num_slots_requested > num_slots_available.
-    # This case might occur when we have little kv cache blocks and
-    # lots of padding, or are doing warmup.
-    # This loop is a workaround for this issue. Please remove it
-    # once key_cache.index_put_(indices, offsets), key) works.
-    num_kv_cache_passes = math.ceil(num_slots_requested / num_slots_available)
-
-    return num_kv_cache_passes, num_slots_available, indices, offsets
-
-
-def insert_or_update_cache(input, cache, num_kv_cache_passes,
-                           num_slots_available, block_indices, block_offsets):
-    for i in range(num_kv_cache_passes):
-        start_idx = i * num_slots_available
-        end_idx = (i + 1) * num_slots_available
-        cache.index_put_((block_indices[start_idx:end_idx],
-                          block_offsets[start_idx:end_idx]),
-                         input[start_idx:end_idx])
-
-
-def swap_blocks(src, dst, block_mapping):
-    index_src = torch.zeros((1, ), dtype=torch.int32, device=src.device)
-    index_dst = torch.zeros((1, ), dtype=torch.int32, device=dst.device)
-    for src_idx, dst_idx in block_mapping.items():
-        index_src[0] = src_idx
-        index_dst[0] = dst_idx
-        dst.index_put_([index_dst], src.index_select(0, index_src))
-    if dst.device.type == 'hpu':
-        htorch.core.mark_step()
-        torch.hpu.synchronize()
-
-
-def copy_blocks(key_caches, value_caches, block_mapping):
-    index_src = torch.zeros((1, ),
-                            dtype=torch.int32,
-                            device=key_caches[0].device)
-    index_dst = torch.zeros((1, ),
-                            dtype=torch.int32,
-                            device=key_caches[0].device)
-    for src, dsts in block_mapping.items():
-        index_src[0] = src
-        for dst in dsts:
-            index_dst[0] = dst
-            for key_cache in key_caches:
-                key_cache.index_copy_(0, index_dst,
-                                      key_cache.index_select(0, index_src))
-            for value_cache in value_caches:
-                value_cache.index_copy_(0, index_dst,
-                                        value_cache.index_select(0, index_src))
-        if key_caches[0].device.type == 'hpu':
-            htorch.core.mark_step()
diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py
deleted file mode 100644
index 939d195a12b08..0000000000000
--- a/vllm/hpu/ops.py
+++ /dev/null
@@ -1,293 +0,0 @@
-###############################################################################
-# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-###############################################################################
-from typing import Optional
-
-import habana_frameworks.torch as htorch
-import torch
-import torch.nn.functional as F
-
-from vllm.logger import init_logger
-
-logger = init_logger(__name__)
-HPUFusedRMSNorm = None
-try:
-    from habana_frameworks.torch.hpex.normalization import FusedRMSNorm
-    HPUFusedRMSNorm = FusedRMSNorm
-except ImportError:
-    logger.warning("Could not import HPU FusedRMSNorm kernel. "
-                   "vLLM will use forward_native implementation of RMSNorm.")
-HPUFusedSDPA = None
-try:
-    from habana_frameworks.torch.hpex.kernels import FusedSDPA
-    HPUFusedSDPA = FusedSDPA
-except ImportError:
-    logger.warning("Could not import HPU FusedSDPA kernel. "
-                   "vLLM will use native implementation.")
-
-
-def batch2block(tensor, block_mapping):
-    shape = tuple(tensor.shape)
-    return (block_mapping @ tensor.view(shape[0], -1)).view(-1, *shape[1:])
-
-
-def block2batch(tensor, block_mapping):
-    shape = tuple(tensor.shape)
-    return (block_mapping.t() @ tensor.view(shape[0], -1)).view(-1, *shape[1:])
-
-
-def block_softmax(batch_size, attn, block_mapping):
-    # We're using global maximum to decrease the exponent as
-    # it's fast to compute and performs reasonably well.
-    # This is by no means a final solution and needs to
-    # be properly addressed in the future.
-    #
-    # Additionally there's a bug where 'max' is not parallelized
-    # across TPC cores, so we need to split the tensor manually
-    # instead of simply doing attn_max = attn.max()
-
-    tail_dims = tuple(range(1, attn.dim()))
-    attn_max = attn.amax(tail_dims).amax()
-    attn.sub_(attn_max)
-    attn = attn.exp_()
-    sums = attn.sum(dim=-1).unsqueeze(-1)
-    sums = block2batch(sums, block_mapping)
-    sums = batch2block(sums, block_mapping)
-    sums.add_(1.0e-12)
-    attn.div_(sums)
-    return attn
-
-
-def flat_pa(query, key_cache, value_cache, block_list, block_mapping,
-            block_bias, scale, matmul_qk_op, matmul_av_op, keys_fetch_func,
-            values_fetch_func):
-    batch_size = query.size(0)
-    q_heads = query.size(1)
-    kv_heads = key_cache.size(2)
-
-    query = batch2block(scale * query, block_mapping).unsqueeze(-2)
-    key = keys_fetch_func(key_cache, block_list).transpose(1, 2)
-    value = values_fetch_func(value_cache, block_list).transpose(1, 2)
-    block_bias = block_bias.view(key.size(0), 1, 1, -1)
-
-    if kv_heads != q_heads:
-        block_bias = block_bias.unsqueeze(1)
-        query = query.unflatten(1, (kv_heads, -1))
-        key = key.unflatten(1, (kv_heads, 1))
-        value = value.unflatten(1, (kv_heads, 1))
-        key = key.transpose(3, 4)
-    else:
-        key = key.transpose(2, 3)
-
-    attn = matmul_qk_op(query, key) + block_bias
-    attn = block_softmax(batch_size, attn, block_mapping)
-    attn = matmul_av_op(attn, value)
-    attn = block2batch(attn, block_mapping)
-    attn = attn.squeeze(-2)
-    if kv_heads != q_heads:
-        attn = attn.flatten(1, 2)
-    return attn
-
-
-def silu_and_mul(x: torch.Tensor) -> torch.Tensor:
-    d = x.shape[-1] // 2
-    return F.silu(x[..., :d]) * x[..., d:]
-
-
-#TODO: remove after fusedsdpa fix for query_head != kv_head
-def repeat_kv(kv: torch.Tensor, n_rep: int) -> torch.Tensor:
-    """
-    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep).
-    The kv go from (batch, num_key_value_heads, seqlen, head_dim) to
-    (batch, num_attention_heads, seqlen, head_dim)
-    """
-    batch, num_key_value_heads, slen, head_dim = kv.shape
-    if n_rep == 1:
-        return kv
-    kv = kv[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen,
-                                     head_dim)
-    return kv.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
-
-
-def prompt_attention(
-    query: torch.Tensor,
-    key: torch.Tensor,
-    value: torch.Tensor,
-    attn_bias: Optional[torch.Tensor] = None,
-    p: float = 0.0,
-    scale: Optional[float] = None,
-    matmul_qk_op=torch.matmul,
-    softmax_op=torch.softmax,
-    matmul_av_op=torch.matmul,
-    valid_seq_lengths: Optional[torch.Tensor] = None,
-) -> torch.Tensor:
-    query = query.transpose(1, 2)
-    key = key.transpose(1, 2)
-    value = value.transpose(1, 2)
-    query_heads = query.size(1)
-    kv_heads = key.size(1)
-    if attn_bias is not None or HPUFusedSDPA is None:
-        if query_heads != kv_heads:
-            query = query.unflatten(1, (kv_heads, -1))
-            key = key.unflatten(1, (kv_heads, 1))
-            value = value.unflatten(1, (kv_heads, 1))
-            if attn_bias is not None:
-                attn_bias = attn_bias.unsqueeze(2)
-        attn_weights = matmul_qk_op(query * scale, key.transpose(-1, -2))
-        if attn_bias is not None:
-            attn_weights.add_(attn_bias)
-        attn_weights = softmax_op(attn_weights, dim=-1)
-        attn_weights = matmul_av_op(attn_weights, value)
-        if query_heads != kv_heads:
-            attn_weights = attn_weights.flatten(1, 2)
-    else:
-        #TODO: remove after fusedsdpa fix for query_heads != kv_heads
-        if query_heads != kv_heads:
-            key = repeat_kv(key, int(query_heads // kv_heads))
-            value = repeat_kv(value, int(query_heads // kv_heads))
-        softmax_mode = 'fast'
-        recompute_mode = True
-        attn_weights = FusedSDPA.apply(query, key, value, None, 0.0, True,
                                       scale, softmax_mode, recompute_mode,
-                                       valid_seq_lengths, 'right')
-    attn_weights = attn_weights.transpose(1, 2)
-    return attn_weights
-
-
-class LoraMask:
-    lora_mask = None
-
-    @staticmethod
-    def setLoraMask(mask):
-        LoraMask.lora_mask = mask
-
-    @staticmethod
-    def getLoraMask():
-        return LoraMask.lora_mask
-
-
-def dispatch_bgmv_linear(
-    y: torch.Tensor,
-    x: torch.Tensor,
-    wa_t_all: torch.Tensor,
-    wb_t_all: torch.Tensor,
-    indices: torch.LongTensor,
-    layer_idx: int,
-    scale: float,
-):
-    """
-    `wa_t_all` and `wb_t_all` contains all LoRA A and LoRA B weight matrices
-    stacked at dimension 0 into single tensors, assuming same rank. `wa` is the
-    reshaped and transposed version of `wa_t_all` of shape
-    (h_in, max_loras * lora_rank) and `wb` is the transposed and reshaped
-    version of `wb_t_all` of shape (max_loras * lora_rank, h_out).
-
-    Matmul input `x` with `wa`. Multiply `x` with a mask to zero-out inputs of
-    inactive LoRA indices. Matmul masked output with `wb` and scale it to get
-    the final output.
-    """
-
-    assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}'
-    mask = LoraMask.getLoraMask()
-
-    wa = wa_t_all[:, 0, :, :]
-    wb = wb_t_all[:, 0, :, :].transpose(1, 2)
-    wa = wa.reshape(wa.shape[0] * wa.shape[1], wa.shape[2]).transpose(0, 1)
-    wb = wb.reshape(wb.shape[0] * wb.shape[1], wb.shape[2])
-
-    out = x @ wa
-    assert (out.shape == mask.shape)
-    out = out * mask
-    out = out @ wb
-    y += out * scale
-
-
-def dispatch_bgmv_embedding(
-    y: torch.Tensor,
-    x: torch.Tensor,
-    wb_t_all: torch.Tensor,
-    indices: torch.LongTensor,
-    layer_idx: int,
-    scale: float,
-):
-    """
-    `wb_t_all` contains all LoRA-B weight matrices stacked at dimension 0 into
-    a single tensor, assuming same rank. `wb` is the transposed and reshaped
-    version of `wb_t_all` of shape (num_loras * lora_rank, embedding_dim).
-
-    Output of LoRA-A embedding (tensor x) is repeated max_loras times to match
-    the shape of `wb`. Multiply `x` with a mask to zero-out inputs of inactive
-    LoRA indices. Matmul masked output with `wb` and scale it to get the final
-    output.
-    """
-
-    assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}'
-    max_loras = wb_t_all.size(0)
-
-    x = x.repeat(1, max_loras)
-    x = x * LoraMask.getLoraMask()
-    wb = wb_t_all[:, 0, :, :].transpose(1, 2)
-    wb = wb.reshape(wb.shape[0] * wb.shape[1], wb.shape[2])
-    out = x @ wb
-    y += out * scale
-
-
-class MoeMatmul(torch.nn.Module):
-
-    def __init__(self):
-        super().__init__()
-
-    def set_weight(self, w):
-        self.weight = w
-
-    def calc(self, state, expert_id, w):
-        self.weight = w[expert_id].transpose(0, 1)
-        return self.forward(state)
-
-    def forward(self, state):
-        return torch.matmul(state, self.weight)
-
-
-class StaticFusedMOE(torch.nn.Module):
-
-    def __init__(self, num_total_experts):
-        super().__init__()
-        self.w13_list = torch.nn.ModuleList(
-            [MoeMatmul() for _ in range(num_total_experts)])
-        self.w2_list = torch.nn.ModuleList(
-            [MoeMatmul() for _ in range(num_total_experts)])
-        self.num_total_experts = num_total_experts
-
-    def forward(self, hidden_states, w1, w2, score, topk):
-        B, D = hidden_states.shape
-        routing_weights = F.softmax(score, dim=1, dtype=torch.float32)
-        routing_weights, selected_experts = torch.topk(routing_weights,
-                                                       topk,
-                                                       dim=-1)
-        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
-        routing_weights = routing_weights.to(hidden_states.dtype)
-        final_hidden_states = torch.zeros((1, B, D),
-                                          dtype=hidden_states.dtype,
-                                          device=hidden_states.device)
-        padded_weights = torch.zeros((B, self.num_total_experts),
-                                     dtype=hidden_states.dtype,
-                                     device=hidden_states.device)
-        padded_weights.scatter_(-1, selected_experts, routing_weights)
-        padded_weights = padded_weights.reshape(-1, B, self.num_total_experts)
-        padded_weights = padded_weights.permute(2, 0, 1).unsqueeze(-1)
-        htorch.core.mark_step()
-
-        for expert_idx in range(self.num_total_experts):
-            padded_weight = padded_weights[expert_idx]
-            current_state_static = hidden_states.reshape(-1, D)
-            w_output = self.w13_list[expert_idx].calc(current_state_static,
-                                                      expert_idx, w1)
-            w_output = silu_and_mul(w_output)
-            w_output = self.w2_list[expert_idx].calc(w_output, expert_idx, w2)
-            current_hidden_states_static = w_output * padded_weight
-            final_hidden_states += current_hidden_states_static
-
-        return final_hidden_states.view(-1, D)
diff --git a/vllm/hpu/rotary_embed.py b/vllm/hpu/rotary_embed.py
deleted file mode 100644
index 1857253f47f1b..0000000000000
--- a/vllm/hpu/rotary_embed.py
+++ /dev/null
@@ -1,123 +0,0 @@
-###############################################################################
-# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-###############################################################################
-
-import torch
-import torch.nn as nn
-
-from vllm.logger import init_logger
-from vllm.utils import is_hpu
-
-logger = init_logger(__name__)
-
-if is_hpu():
-    try:
-        from habana_frameworks.torch.hpex.kernels import (
-            RotaryPosEmbeddingHelperV1 as FusedRoPE)
-    except ImportError:
-        logger.warning("Could not import HPU FusedRoPE kernel. "
" - "vLLM will use forward_native implementation of RoPE.") - FusedRoPE = None -else: - FusedRoPE = None - - -class HpuRotaryEmbedding(nn.Module): - - def __init__(self, - head_size, - rotary_dim, - max_position_embeddings=2048, - base=10000, - is_neox_style=None, - device='hpu', - RoPEFallback=None): - super().__init__() - - self.head_size = head_size - self.dim = rotary_dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base**( - torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. - self._set_cos_sin_cache(seq_len=max_position_embeddings, - device=self.inv_freq.device, - dtype=torch.get_default_dtype()) - if FusedRoPE is None: - assert RoPEFallback is not None, ( - "HPU FusedRoPE kernel could not be imported, and " - "fallback RoPE implementation was not provided!") - self.fallback_impl = RoPEFallback(head_size, - rotary_dim, - max_position_embeddings, - base, - is_neox_style, - dtype=torch.get_default_dtype()) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, - device=device, - dtype=self.inv_freq.dtype) - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order - # to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", - emb.cos().to(dtype), - persistent=False) - self.register_buffer("sin_cached", - emb.sin().to(dtype), - persistent=False) - - def forward(self, positions: torch.Tensor, query: torch.Tensor, - key: torch.Tensor): - if FusedRoPE is None: - return self.fallback_impl(positions, query, key) - if query.dim() == 2: - query = query.unsqueeze(0) - if key.dim() == 2: - key = key.unsqueeze(0) - if positions.dim() == 1: - positions = positions.unsqueeze(0) - seq_len = key.shape[-2] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, - device=query.device, - dtype=query.dtype) - - cos, sin = self.cos_cached[:seq_len].to( - dtype=query.dtype), self.sin_cached[:seq_len].to(dtype=query.dtype) - query = query.reshape( - (query.shape[0], query.shape[1], query.shape[2] // self.head_size, - self.head_size)) - key = key.reshape((key.shape[0], key.shape[1], - key.shape[2] // self.head_size, self.head_size)) - query_rot = query[..., :self.dim] - key_rot = key[..., :self.dim] - if self.dim < self.head_size: - query_pass = query[..., self.dim:] - key_pass = key[..., self.dim:] - - if len(positions[0]) == 1: - cos = self.cos_cached[positions].unsqueeze(2).to(dtype=query.dtype) - sin = self.sin_cached[positions].unsqueeze(2).to(dtype=query.dtype) - else: - cos = cos[positions].unsqueeze(2) - sin = sin[positions].unsqueeze(2) - query, key = FusedRoPE.apply(query_rot, cos, sin, - 0), FusedRoPE.apply(key_rot, cos, sin, 0) - if self.dim < self.head_size: - query = torch.cat((query, query_pass), dim=-1) - key = torch.cat((key, key_pass), dim=-1) - return query.reshape( - (query.shape[0], query.shape[1], - query.shape[2] * query.shape[3])), key.reshape( - (key.shape[0], key.shape[1], key.shape[2] * key.shape[3])) diff --git a/vllm/hpu/utils.py b/vllm/hpu/utils.py deleted file mode 100644 index 13204b83d5742..0000000000000 --- a/vllm/hpu/utils.py +++ /dev/null @@ -1,61 +0,0 @@ -############################################################################### -# Copyright (C) 2024 Habana Labs, Ltd. 
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-###############################################################################
-
-from functools import wraps
-
-import habana_frameworks.torch as htorch
-import torch
-
-from vllm.hpu.cache_ops import insert_or_update_cache
-
-
-def with_mark_steps(fn):
-
-    @wraps(fn)
-    def wrapped(*args, **kwargs):
-        htorch.core.mark_step()
-        result = fn(*args, **kwargs)
-        del args
-        del kwargs
-        htorch.core.mark_step()
-        return result
-
-    return wrapped
-
-
-class Matmul(torch.nn.Module):
-
-    def __init__(self):
-        super(Matmul, self).__init__()
-
-    def forward(self, x, y):
-        return torch.matmul(x, y)
-
-
-class Softmax(torch.nn.Module):
-
-    def __init__(self):
-        super().__init__()
-
-    def forward(self, x, dim=None, inv_head=None):
-        return torch.softmax(x, dim)
-
-
-class VLLMKVCache(torch.nn.Module):
-
-    def __init__(self):
-        super(VLLMKVCache, self).__init__()
-
-    def forward(self, input, cache, num_kv_cache_passes, num_slots_available,
-                block_indices, block_offset):
-        insert_or_update_cache(input, cache, num_kv_cache_passes,
-                               num_slots_available, block_indices,
-                               block_offset)
-        return cache
-
-    def fetch_from_cache(self, cache, blocks):
-        return cache.index_select(0, blocks)
diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index aa01e9fb77af2..59b7432b6e6eb 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -30,7 +30,8 @@
 from vllm.utils import is_hpu
 
 if is_hpu():
-    from vllm.hpu.ops import dispatch_bgmv_embedding, dispatch_bgmv_linear
+    from vllm_hpu_extension.ops import (dispatch_bgmv_embedding,
+                                        dispatch_bgmv_linear)
 
 if TYPE_CHECKING:
     pass
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index cf0d5f98f1b01..bda8a0622ef31 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -202,7 +202,7 @@ def __init__(
         self.num_expert_group = num_expert_group
         self.topk_group = topk_group
         if is_hpu():
-            from vllm.hpu.ops import StaticFusedMOE
+            from vllm_hpu_extension.ops import StaticFusedMOE
             self.hpu_static_fused_moe = StaticFusedMOE(self.num_experts)
 
         if quant_config is None:
diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py
index c12668c14887d..9ef532e61a7c0 100644
--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -75,7 +75,7 @@ def forward_hpu(
         x: torch.Tensor,
         residual: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
-        from vllm.hpu.ops import HPUFusedRMSNorm
+        from vllm_hpu_extension.ops import HPUFusedRMSNorm
         if HPUFusedRMSNorm is None:
             return self.forward_native(x, residual)
         if residual is not None:
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index 4e3c840bede60..2581e3a74dc72 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -31,7 +31,7 @@
 from vllm.utils import is_hpu, is_tpu
 
 if is_hpu():
-    from vllm.hpu.rotary_embed import HpuRotaryEmbedding
+    from vllm_hpu_extension.rotary_embed import HpuRotaryEmbedding
 
 
 def _rotate_neox(x: torch.Tensor) -> torch.Tensor:
diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py
index 4d65f8f68e725..7002e2878bd7d 100644
--- a/vllm/worker/habana_model_runner.py
+++ b/vllm/worker/habana_model_runner.py
@@ -19,13 +19,13 @@
 import habana_frameworks.torch as htorch
 import habana_frameworks.torch.internal.bridge_config as bc
 import torch
+from vllm_hpu_extension.ops import LoraMask as LoraMask
 
 from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, MultiModalConfig, ParallelConfig,
                          SchedulerConfig)
 from vllm.distributed.parallel_state import get_world_group
-from vllm.hpu.ops import LoraMask as LoraMask
 from vllm.logger import init_logger
 from vllm.lora.layers import LoRAMapping
 from vllm.lora.request import LoRARequest
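Note for downstream code (not part of the patch itself): every helper that previously lived under `vllm.hpu` is now imported from the external `vllm-hpu-extension` package pinned in `requirements-hpu.txt`. The snippet below is a minimal, illustrative migration shim; the try/except fallback is an assumption for code that must also run against pre-migration vLLM trees, and only module paths and symbols visible in the diff above are used.

```python
# Illustrative migration shim (assumption, not part of this patch): prefer the
# external vllm-hpu-extension package, falling back to the bundled vllm.hpu
# modules that older vLLM trees still ship.
try:
    from vllm_hpu_extension.ops import LoraMask, dispatch_bgmv_linear
    from vllm_hpu_extension.utils import Matmul, Softmax, VLLMKVCache
except ImportError:  # pre-migration vLLM still provides vllm/hpu
    from vllm.hpu.ops import LoraMask, dispatch_bgmv_linear
    from vllm.hpu.utils import Matmul, Softmax, VLLMKVCache
```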