diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml
index c2674b914f485..9858d00cfb5c1 100644
--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@@ -50,6 +50,5 @@ jobs:
         mypy vllm/transformers_utils --config-file pyproject.toml
         mypy vllm/usage --config-file pyproject.toml
         mypy vllm/worker --config-file pyproject.toml
-        mypy vllm/hpu --config-file pyproject.toml
diff --git a/format.sh b/format.sh
index fbfc27a68bb3d..5ad6d6f2938bb 100755
--- a/format.sh
+++ b/format.sh
@@ -113,7 +113,6 @@ mypy vllm/spec_decode --config-file pyproject.toml
 mypy vllm/transformers_utils --config-file pyproject.toml
 mypy vllm/usage --config-file pyproject.toml
 mypy vllm/worker --config-file pyproject.toml
-mypy vllm/hpu --config-file pyproject.toml
 
 
 # If git diff returns a file that is in the skip list, the file may be checked anyway:
diff --git a/requirements-hpu.txt b/requirements-hpu.txt
index e0f03c8464c7b..d451200aa1144 100644
--- a/requirements-hpu.txt
+++ b/requirements-hpu.txt
@@ -6,3 +6,4 @@ ray == 2.32.0
 triton
 pandas
 tabulate
+vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@30ee2d1
diff --git a/tests/lora/test_lora_hpu.py b/tests/lora/test_lora_hpu.py
index 01b6472745e1c..c8ecaef1a6316 100644
--- a/tests/lora/test_lora_hpu.py
+++ b/tests/lora/test_lora_hpu.py
@@ -1,7 +1,7 @@
 import pytest
 import torch
+from vllm_hpu_extension.ops import LoraMask
 
-from vllm.hpu.ops import LoraMask
 from vllm.lora.layers import _apply_lora, _apply_lora_packed_nslice
 
 from .utils import DummyLoRAManager
diff --git a/vllm/attention/backends/habana_attn.py b/vllm/attention/backends/habana_attn.py
index 56b71a431aca7..b7b8072de3fe5 100644
--- a/vllm/attention/backends/habana_attn.py
+++ b/vllm/attention/backends/habana_attn.py
@@ -7,14 +7,14 @@
 from typing import Any, Dict, List, Optional, Tuple, Type
 
 import torch
+import vllm_hpu_extension.ops as ops
+from vllm_hpu_extension import cache_ops
+from vllm_hpu_extension.utils import Matmul, Softmax, VLLMKVCache
 
-import vllm.hpu.ops as ops
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionMetadata, AttentionType)
 from vllm.attention.ops.habana_paged_attn import (HabanaPagedAttention,
                                                   HabanaPagedAttentionMetadata)
-from vllm.hpu import cache_ops
-from vllm.hpu.utils import Matmul, Softmax, VLLMKVCache
 from vllm.logger import init_logger
 
 logger = init_logger(__name__)
diff --git a/vllm/attention/ops/habana_paged_attn.py b/vllm/attention/ops/habana_paged_attn.py
index cab8d7abe95fd..49a3e3f774d58 100644
--- a/vllm/attention/ops/habana_paged_attn.py
+++ b/vllm/attention/ops/habana_paged_attn.py
@@ -6,8 +6,7 @@
 from typing import Dict, List, Optional, Tuple
 
 import torch
-
-from vllm.hpu import cache_ops, ops
+from vllm_hpu_extension import cache_ops, ops
 
 # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
 _PARTITION_SIZE = 512
diff --git a/vllm/hpu/__init__.py b/vllm/hpu/__init__.py
deleted file mode 100644
index b8e4d3aac98a7..0000000000000
--- a/vllm/hpu/__init__.py
+++ /dev/null
@@ -1,6 +0,0 @@
-###############################################################################
-# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-###############################################################################
diff --git a/vllm/hpu/cache_ops.py b/vllm/hpu/cache_ops.py
deleted file mode 100644
index 9042924f68b3d..0000000000000
--- a/vllm/hpu/cache_ops.py
+++ /dev/null
@@ -1,107 +0,0 @@
-###############################################################################
-# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-###############################################################################
-
-import math
-
-import habana_frameworks.torch as htorch
-import torch
-
-
-def reshape_and_cache(key,
-                      value,
-                      key_cache,
-                      value_cache,
-                      slot_mapping,
-                      dtype,
-                      is_prompt=False):
-    num_blocks = key_cache.size(0)
-    block_size = key_cache.size(1)
-    slot_mapping = slot_mapping.flatten()
-    indices = torch.div(slot_mapping, block_size, rounding_mode="floor")
-    offsets = torch.fmod(slot_mapping, block_size)
-    num_slots_requested = slot_mapping.size(0)
-    num_slots_available = num_blocks * block_size
-    # NOTE(kzawora): HPU PT bridge crashes with
-    # RuntimeError: Invalid inputs for scatter_nd_onnx
-    # on index_put when num_slots_requested > num_slots_available.
-    # This case might occur when we have little kv cache blocks and
-    # lots of padding, or are doing warmup.
-    # This loop is a workaround for this issue. Please remove it
-    # once key_cache.index_put_(indices, offsets), key) works.
-    num_kv_cache_passes = math.ceil(num_slots_requested / num_slots_available)
-    for i in range(num_kv_cache_passes):
-        start_idx = i * num_slots_available
-        end_idx = (i + 1) * num_slots_available
-        key_cache.index_put_(
-            (indices[start_idx:end_idx], offsets[start_idx:end_idx]),
-            key[start_idx:end_idx])
-        value_cache.index_put_(
-            (indices[start_idx:end_idx], offsets[start_idx:end_idx]),
-            value[start_idx:end_idx])
-
-
-def prepare_to_cache(cache, slot_mapping):
-    num_blocks = cache.size(0)
-    block_size = cache.size(1)
-    slot_mapping = slot_mapping.flatten()
-    indices = torch.div(slot_mapping, block_size, rounding_mode="floor")
-    offsets = torch.fmod(slot_mapping, block_size)
-    num_slots_requested = slot_mapping.size(0)
-    num_slots_available = num_blocks * block_size
-    # NOTE(kzawora): HPU PT bridge crashes with
-    # RuntimeError: Invalid inputs for scatter_nd_onnx
-    # on index_put when num_slots_requested > num_slots_available.
-    # This case might occur when we have little kv cache blocks and
-    # lots of padding, or are doing warmup.
-    # This loop is a workaround for this issue. Please remove it
-    # once key_cache.index_put_(indices, offsets), key) works.
-    num_kv_cache_passes = math.ceil(num_slots_requested / num_slots_available)
-
-    return num_kv_cache_passes, num_slots_available, indices, offsets
-
-
-def insert_or_update_cache(input, cache, num_kv_cache_passes,
-                           num_slots_available, block_indices, block_offsets):
-    for i in range(num_kv_cache_passes):
-        start_idx = i * num_slots_available
-        end_idx = (i + 1) * num_slots_available
-        cache.index_put_((block_indices[start_idx:end_idx],
-                          block_offsets[start_idx:end_idx]),
-                         input[start_idx:end_idx])
-
-
-def swap_blocks(src, dst, block_mapping):
-    index_src = torch.zeros((1, ), dtype=torch.int32, device=src.device)
-    index_dst = torch.zeros((1, ), dtype=torch.int32, device=dst.device)
-    for src_idx, dst_idx in block_mapping.items():
-        index_src[0] = src_idx
-        index_dst[0] = dst_idx
-        dst.index_put_([index_dst], src.index_select(0, index_src))
-    if dst.device.type == 'hpu':
-        htorch.core.mark_step()
-        torch.hpu.synchronize()
-
-
-def copy_blocks(key_caches, value_caches, block_mapping):
-    index_src = torch.zeros((1, ),
-                            dtype=torch.int32,
-                            device=key_caches[0].device)
-    index_dst = torch.zeros((1, ),
-                            dtype=torch.int32,
-                            device=key_caches[0].device)
-    for src, dsts in block_mapping.items():
-        index_src[0] = src
-        for dst in dsts:
-            index_dst[0] = dst
-            for key_cache in key_caches:
-                key_cache.index_copy_(0, index_dst,
-                                      key_cache.index_select(0, index_src))
-            for value_cache in value_caches:
-                value_cache.index_copy_(0, index_dst,
-                                        value_cache.index_select(0, index_src))
-        if key_caches[0].device.type == 'hpu':
-            htorch.core.mark_step()
diff --git a/vllm/hpu/ops.py b/vllm/hpu/ops.py
deleted file mode 100644
index 939d195a12b08..0000000000000
--- a/vllm/hpu/ops.py
+++ /dev/null
@@ -1,293 +0,0 @@
-###############################################################################
-# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-###############################################################################
-from typing import Optional
-
-import habana_frameworks.torch as htorch
-import torch
-import torch.nn.functional as F
-
-from vllm.logger import init_logger
-
-logger = init_logger(__name__)
-HPUFusedRMSNorm = None
-try:
-    from habana_frameworks.torch.hpex.normalization import FusedRMSNorm
-    HPUFusedRMSNorm = FusedRMSNorm
-except ImportError:
-    logger.warning("Could not import HPU FusedRMSNorm kernel. "
-                   "vLLM will use forward_native implementation of RMSNorm.")
-HPUFusedSDPA = None
-try:
-    from habana_frameworks.torch.hpex.kernels import FusedSDPA
-    HPUFusedSDPA = FusedSDPA
-except ImportError:
-    logger.warning("Could not import HPU FusedSDPA kernel. "
-                   "vLLM will use native implementation.")
-
-
-def batch2block(tensor, block_mapping):
-    shape = tuple(tensor.shape)
-    return (block_mapping @ tensor.view(shape[0], -1)).view(-1, *shape[1:])
-
-
-def block2batch(tensor, block_mapping):
-    shape = tuple(tensor.shape)
-    return (block_mapping.t() @ tensor.view(shape[0], -1)).view(-1, *shape[1:])
-
-
-def block_softmax(batch_size, attn, block_mapping):
-    # We're using global maximum to decrease the exponent as
-    # it's fast to compute and performs reasonably well.
-    # This is by no means a final solution and needs to
-    # be properly addressed in the future.
-    #
-    # Additionally there's a bug where 'max' is not parallelized
-    # across TPC cores, so we need to split the tensor manually
-    # instead of simply doing attn_max = attn.max()
-
-    tail_dims = tuple(range(1, attn.dim()))
-    attn_max = attn.amax(tail_dims).amax()
-    attn.sub_(attn_max)
-    attn = attn.exp_()
-    sums = attn.sum(dim=-1).unsqueeze(-1)
-    sums = block2batch(sums, block_mapping)
-    sums = batch2block(sums, block_mapping)
-    sums.add_(1.0e-12)
-    attn.div_(sums)
-    return attn
-
-
-def flat_pa(query, key_cache, value_cache, block_list, block_mapping,
-            block_bias, scale, matmul_qk_op, matmul_av_op, keys_fetch_func,
-            values_fetch_func):
-    batch_size = query.size(0)
-    q_heads = query.size(1)
-    kv_heads = key_cache.size(2)
-
-    query = batch2block(scale * query, block_mapping).unsqueeze(-2)
-    key = keys_fetch_func(key_cache, block_list).transpose(1, 2)
-    value = values_fetch_func(value_cache, block_list).transpose(1, 2)
-    block_bias = block_bias.view(key.size(0), 1, 1, -1)
-
-    if kv_heads != q_heads:
-        block_bias = block_bias.unsqueeze(1)
-        query = query.unflatten(1, (kv_heads, -1))
-        key = key.unflatten(1, (kv_heads, 1))
-        value = value.unflatten(1, (kv_heads, 1))
-        key = key.transpose(3, 4)
-    else:
-        key = key.transpose(2, 3)
-
-    attn = matmul_qk_op(query, key) + block_bias
-    attn = block_softmax(batch_size, attn, block_mapping)
-    attn = matmul_av_op(attn, value)
-    attn = block2batch(attn, block_mapping)
-    attn = attn.squeeze(-2)
-    if kv_heads != q_heads:
-        attn = attn.flatten(1, 2)
-    return attn
-
-
-def silu_and_mul(x: torch.Tensor) -> torch.Tensor:
-    d = x.shape[-1] // 2
-    return F.silu(x[..., :d]) * x[..., d:]
-
-
-#TODO: remove after fusedsdpa fix for query_head != kv_head
-def repeat_kv(kv: torch.Tensor, n_rep: int) -> torch.Tensor:
-    """
-    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep).
-    The kv go from (batch, num_key_value_heads, seqlen, head_dim) to
-    (batch, num_attention_heads, seqlen, head_dim)
-    """
-    batch, num_key_value_heads, slen, head_dim = kv.shape
-    if n_rep == 1:
-        return kv
-    kv = kv[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen,
-                                     head_dim)
-    return kv.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
-
-
-def prompt_attention(
-    query: torch.Tensor,
-    key: torch.Tensor,
-    value: torch.Tensor,
-    attn_bias: Optional[torch.Tensor] = None,
-    p: float = 0.0,
-    scale: Optional[float] = None,
-    matmul_qk_op=torch.matmul,
-    softmax_op=torch.softmax,
-    matmul_av_op=torch.matmul,
-    valid_seq_lengths: Optional[torch.Tensor] = None,
-) -> torch.Tensor:
-    query = query.transpose(1, 2)
-    key = key.transpose(1, 2)
-    value = value.transpose(1, 2)
-    query_heads = query.size(1)
-    kv_heads = key.size(1)
-    if attn_bias is not None or HPUFusedSDPA is None:
-        if query_heads != kv_heads:
-            query = query.unflatten(1, (kv_heads, -1))
-            key = key.unflatten(1, (kv_heads, 1))
-            value = value.unflatten(1, (kv_heads, 1))
-            if attn_bias is not None:
-                attn_bias = attn_bias.unsqueeze(2)
-        attn_weights = matmul_qk_op(query * scale, key.transpose(-1, -2))
-        if attn_bias is not None:
-            attn_weights.add_(attn_bias)
-        attn_weights = softmax_op(attn_weights, dim=-1)
-        attn_weights = matmul_av_op(attn_weights, value)
-        if query_heads != kv_heads:
-            attn_weights = attn_weights.flatten(1, 2)
-    else:
-        #TODO: remove after fusedsdpa fix for query_heads != kv_heads
-        if query_heads != kv_heads:
-            key = repeat_kv(key, int(query_heads // kv_heads))
-            value = repeat_kv(value, int(query_heads // kv_heads))
-        softmax_mode = 'fast'
-        recompute_mode = True
-        attn_weights = FusedSDPA.apply(query, key, value, None, 0.0, True,
                                       scale, softmax_mode, recompute_mode,
-                                       valid_seq_lengths, 'right')
-    attn_weights = attn_weights.transpose(1, 2)
-    return attn_weights
-
-
-class LoraMask:
-    lora_mask = None
-
-    @staticmethod
-    def setLoraMask(mask):
-        LoraMask.lora_mask = mask
-
-    @staticmethod
-    def getLoraMask():
-        return LoraMask.lora_mask
-
-
-def dispatch_bgmv_linear(
-    y: torch.Tensor,
-    x: torch.Tensor,
-    wa_t_all: torch.Tensor,
-    wb_t_all: torch.Tensor,
-    indices: torch.LongTensor,
-    layer_idx: int,
-    scale: float,
-):
-    """
-    `wa_t_all` and `wb_t_all` contains all LoRA A and LoRA B weight matrices
-    stacked at dimension 0 into single tensors, assuming same rank. `wa` is the
-    reshaped and transposed version of `wa_t_all` of shape
-    (h_in, max_loras * lora_rank) and `wb` is the transposed and reshaped
-    version of `wb_t_all` of shape (max_loras * lora_rank, h_out).
-
-    Matmul input `x` with `wa`. Multiply `x` with a mask to zero-out inputs of
-    inactive LoRA indices. Matmul masked output with `wb` and scale it to get
-    the final output.
-    """
-
-    assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}'
-    mask = LoraMask.getLoraMask()
-
-    wa = wa_t_all[:, 0, :, :]
-    wb = wb_t_all[:, 0, :, :].transpose(1, 2)
-    wa = wa.reshape(wa.shape[0] * wa.shape[1], wa.shape[2]).transpose(0, 1)
-    wb = wb.reshape(wb.shape[0] * wb.shape[1], wb.shape[2])
-
-    out = x @ wa
-    assert (out.shape == mask.shape)
-    out = out * mask
-    out = out @ wb
-    y += out * scale
-
-
-def dispatch_bgmv_embedding(
-    y: torch.Tensor,
-    x: torch.Tensor,
-    wb_t_all: torch.Tensor,
-    indices: torch.LongTensor,
-    layer_idx: int,
-    scale: float,
-):
-    """
-    `wb_t_all` contains all LoRA-B weight matrices stacked at dimension 0 into
-    a single tensor, assuming same rank. `wb` is the transposed and reshaped
-    version of `wb_t_all` of shape (num_loras * lora_rank, embedding_dim).
-
-    Output of LoRA-A embedding (tensor x) is repeated max_loras times to match
-    the shape of `wb`. Multiply `x` with a mask to zero-out inputs of inactive
-    LoRA indices. Matmul masked output with `wb` and scale it to get the final
-    output.
-    """
-
-    assert layer_idx == 0, f'layer_idx should be 0, but got {layer_idx}'
-    max_loras = wb_t_all.size(0)
-
-    x = x.repeat(1, max_loras)
-    x = x * LoraMask.getLoraMask()
-    wb = wb_t_all[:, 0, :, :].transpose(1, 2)
-    wb = wb.reshape(wb.shape[0] * wb.shape[1], wb.shape[2])
-    out = x @ wb
-    y += out * scale
-
-
-class MoeMatmul(torch.nn.Module):
-
-    def __init__(self):
-        super().__init__()
-
-    def set_weight(self, w):
-        self.weight = w
-
-    def calc(self, state, expert_id, w):
-        self.weight = w[expert_id].transpose(0, 1)
-        return self.forward(state)
-
-    def forward(self, state):
-        return torch.matmul(state, self.weight)
-
-
-class StaticFusedMOE(torch.nn.Module):
-
-    def __init__(self, num_total_experts):
-        super().__init__()
-        self.w13_list = torch.nn.ModuleList(
-            [MoeMatmul() for _ in range(num_total_experts)])
-        self.w2_list = torch.nn.ModuleList(
-            [MoeMatmul() for _ in range(num_total_experts)])
-        self.num_total_experts = num_total_experts
-
-    def forward(self, hidden_states, w1, w2, score, topk):
-        B, D = hidden_states.shape
-        routing_weights = F.softmax(score, dim=1, dtype=torch.float32)
-        routing_weights, selected_experts = torch.topk(routing_weights,
-                                                       topk,
-                                                       dim=-1)
-        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
-        routing_weights = routing_weights.to(hidden_states.dtype)
-        final_hidden_states = torch.zeros((1, B, D),
-                                          dtype=hidden_states.dtype,
-                                          device=hidden_states.device)
-        padded_weights = torch.zeros((B, self.num_total_experts),
-                                     dtype=hidden_states.dtype,
-                                     device=hidden_states.device)
-        padded_weights.scatter_(-1, selected_experts, routing_weights)
-        padded_weights = padded_weights.reshape(-1, B, self.num_total_experts)
-        padded_weights = padded_weights.permute(2, 0, 1).unsqueeze(-1)
-        htorch.core.mark_step()
-
-        for expert_idx in range(self.num_total_experts):
-            padded_weight = padded_weights[expert_idx]
-            current_state_static = hidden_states.reshape(-1, D)
-            w_output = self.w13_list[expert_idx].calc(current_state_static,
-                                                      expert_idx, w1)
-            w_output = silu_and_mul(w_output)
-            w_output = self.w2_list[expert_idx].calc(w_output, expert_idx, w2)
-            current_hidden_states_static = w_output * padded_weight
-            final_hidden_states += current_hidden_states_static
-
-        return final_hidden_states.view(-1, D)
diff --git a/vllm/hpu/rotary_embed.py b/vllm/hpu/rotary_embed.py
deleted file mode 100644
index 1857253f47f1b..0000000000000
--- a/vllm/hpu/rotary_embed.py
+++ /dev/null
@@ -1,123 +0,0 @@
-###############################################################################
-# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-###############################################################################
-
-import torch
-import torch.nn as nn
-
-from vllm.logger import init_logger
-from vllm.utils import is_hpu
-
-logger = init_logger(__name__)
-
-if is_hpu():
-    try:
-        from habana_frameworks.torch.hpex.kernels import (
-            RotaryPosEmbeddingHelperV1 as FusedRoPE)
-    except ImportError:
-        logger.warning("Could not import HPU FusedRoPE kernel. "
" - "vLLM will use forward_native implementation of RoPE.") - FusedRoPE = None -else: - FusedRoPE = None - - -class HpuRotaryEmbedding(nn.Module): - - def __init__(self, - head_size, - rotary_dim, - max_position_embeddings=2048, - base=10000, - is_neox_style=None, - device='hpu', - RoPEFallback=None): - super().__init__() - - self.head_size = head_size - self.dim = rotary_dim - self.max_position_embeddings = max_position_embeddings - self.base = base - inv_freq = 1.0 / (self.base**( - torch.arange(0, self.dim, 2).float().to(device) / self.dim)) - self.register_buffer("inv_freq", inv_freq, persistent=False) - - # Build here to make `torch.jit.trace` work. - self._set_cos_sin_cache(seq_len=max_position_embeddings, - device=self.inv_freq.device, - dtype=torch.get_default_dtype()) - if FusedRoPE is None: - assert RoPEFallback is not None, ( - "HPU FusedRoPE kernel could not be imported, and " - "fallback RoPE implementation was not provided!") - self.fallback_impl = RoPEFallback(head_size, - rotary_dim, - max_position_embeddings, - base, - is_neox_style, - dtype=torch.get_default_dtype()) - - def _set_cos_sin_cache(self, seq_len, device, dtype): - self.max_seq_len_cached = seq_len - t = torch.arange(self.max_seq_len_cached, - device=device, - dtype=self.inv_freq.dtype) - - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order - # to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - self.register_buffer("cos_cached", - emb.cos().to(dtype), - persistent=False) - self.register_buffer("sin_cached", - emb.sin().to(dtype), - persistent=False) - - def forward(self, positions: torch.Tensor, query: torch.Tensor, - key: torch.Tensor): - if FusedRoPE is None: - return self.fallback_impl(positions, query, key) - if query.dim() == 2: - query = query.unsqueeze(0) - if key.dim() == 2: - key = key.unsqueeze(0) - if positions.dim() == 1: - positions = positions.unsqueeze(0) - seq_len = key.shape[-2] - if seq_len > self.max_seq_len_cached: - self._set_cos_sin_cache(seq_len=seq_len, - device=query.device, - dtype=query.dtype) - - cos, sin = self.cos_cached[:seq_len].to( - dtype=query.dtype), self.sin_cached[:seq_len].to(dtype=query.dtype) - query = query.reshape( - (query.shape[0], query.shape[1], query.shape[2] // self.head_size, - self.head_size)) - key = key.reshape((key.shape[0], key.shape[1], - key.shape[2] // self.head_size, self.head_size)) - query_rot = query[..., :self.dim] - key_rot = key[..., :self.dim] - if self.dim < self.head_size: - query_pass = query[..., self.dim:] - key_pass = key[..., self.dim:] - - if len(positions[0]) == 1: - cos = self.cos_cached[positions].unsqueeze(2).to(dtype=query.dtype) - sin = self.sin_cached[positions].unsqueeze(2).to(dtype=query.dtype) - else: - cos = cos[positions].unsqueeze(2) - sin = sin[positions].unsqueeze(2) - query, key = FusedRoPE.apply(query_rot, cos, sin, - 0), FusedRoPE.apply(key_rot, cos, sin, 0) - if self.dim < self.head_size: - query = torch.cat((query, query_pass), dim=-1) - key = torch.cat((key, key_pass), dim=-1) - return query.reshape( - (query.shape[0], query.shape[1], - query.shape[2] * query.shape[3])), key.reshape( - (key.shape[0], key.shape[1], key.shape[2] * key.shape[3])) diff --git a/vllm/hpu/utils.py b/vllm/hpu/utils.py deleted file mode 100644 index 13204b83d5742..0000000000000 --- a/vllm/hpu/utils.py +++ /dev/null @@ -1,61 +0,0 @@ -############################################################################### -# Copyright (C) 2024 Habana Labs, Ltd. 
-#
-# This source code is licensed under the BSD license found in the
-# LICENSE file in the root directory of this source tree.
-###############################################################################
-
-from functools import wraps
-
-import habana_frameworks.torch as htorch
-import torch
-
-from vllm.hpu.cache_ops import insert_or_update_cache
-
-
-def with_mark_steps(fn):
-
-    @wraps(fn)
-    def wrapped(*args, **kwargs):
-        htorch.core.mark_step()
-        result = fn(*args, **kwargs)
-        del args
-        del kwargs
-        htorch.core.mark_step()
-        return result
-
-    return wrapped
-
-
-class Matmul(torch.nn.Module):
-
-    def __init__(self):
-        super(Matmul, self).__init__()
-
-    def forward(self, x, y):
-        return torch.matmul(x, y)
-
-
-class Softmax(torch.nn.Module):
-
-    def __init__(self):
-        super().__init__()
-
-    def forward(self, x, dim=None, inv_head=None):
-        return torch.softmax(x, dim)
-
-
-class VLLMKVCache(torch.nn.Module):
-
-    def __init__(self):
-        super(VLLMKVCache, self).__init__()
-
-    def forward(self, input, cache, num_kv_cache_passes, num_slots_available,
-                block_indices, block_offset):
-        insert_or_update_cache(input, cache, num_kv_cache_passes,
-                               num_slots_available, block_indices,
-                               block_offset)
-        return cache
-
-    def fetch_from_cache(self, cache, blocks):
-        return cache.index_select(0, blocks)
diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index aa01e9fb77af2..59b7432b6e6eb 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -30,7 +30,8 @@
 from vllm.utils import is_hpu
 
 if is_hpu():
-    from vllm.hpu.ops import dispatch_bgmv_embedding, dispatch_bgmv_linear
+    from vllm_hpu_extension.ops import (dispatch_bgmv_embedding,
+                                        dispatch_bgmv_linear)
 
 if TYPE_CHECKING:
     pass
diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index cf0d5f98f1b01..bda8a0622ef31 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -202,7 +202,7 @@ def __init__(
         self.num_expert_group = num_expert_group
         self.topk_group = topk_group
         if is_hpu():
-            from vllm.hpu.ops import StaticFusedMOE
+            from vllm_hpu_extension.ops import StaticFusedMOE
             self.hpu_static_fused_moe = StaticFusedMOE(self.num_experts)
 
         if quant_config is None:
diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py
index c12668c14887d..9ef532e61a7c0 100644
--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -75,7 +75,7 @@ def forward_hpu(
         x: torch.Tensor,
         residual: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
-        from vllm.hpu.ops import HPUFusedRMSNorm
+        from vllm_hpu_extension.ops import HPUFusedRMSNorm
         if HPUFusedRMSNorm is None:
             return self.forward_native(x, residual)
         if residual is not None:
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index 4e3c840bede60..2581e3a74dc72 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -31,7 +31,7 @@
 from vllm.utils import is_hpu, is_tpu
 
 if is_hpu():
-    from vllm.hpu.rotary_embed import HpuRotaryEmbedding
+    from vllm_hpu_extension.rotary_embed import HpuRotaryEmbedding
 
 
 def _rotate_neox(x: torch.Tensor) -> torch.Tensor:
diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py
index 4d65f8f68e725..7002e2878bd7d 100644
--- a/vllm/worker/habana_model_runner.py
+++ b/vllm/worker/habana_model_runner.py
@@ -19,13 +19,13 @@
 import habana_frameworks.torch as htorch
 import habana_frameworks.torch.internal.bridge_config as bc
 import torch
+from vllm_hpu_extension.ops import LoraMask as LoraMask
 
 from vllm.attention import AttentionMetadata, get_attn_backend
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, MultiModalConfig, ParallelConfig,
                          SchedulerConfig)
 from vllm.distributed.parallel_state import get_world_group
-from vllm.hpu.ops import LoraMask as LoraMask
 from vllm.logger import init_logger
 from vllm.lora.layers import LoRAMapping
 from vllm.lora.request import LoRARequest
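Note for downstream code (not part of the patch itself): every helper that previously lived under `vllm.hpu` is now imported from the external `vllm-hpu-extension` package pinned in `requirements-hpu.txt`. The snippet below is a minimal, illustrative migration shim; the try/except fallback is an assumption for code that must also run against pre-migration vLLM trees, and only module paths and symbols visible in the diff above are used.

```python
# Illustrative migration shim (assumption, not part of this patch): prefer the
# external vllm-hpu-extension package, falling back to the bundled vllm.hpu
# modules that older vLLM trees still ship.
try:
    from vllm_hpu_extension.ops import LoraMask, dispatch_bgmv_linear
    from vllm_hpu_extension.utils import Matmul, Softmax, VLLMKVCache
except ImportError:  # pre-migration vLLM still provides vllm/hpu
    from vllm.hpu.ops import LoraMask, dispatch_bgmv_linear
    from vllm.hpu.utils import Matmul, Softmax, VLLMKVCache
```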