diff --git a/vllm/attention/backends/hpu_attn.py b/vllm/attention/backends/hpu_attn.py
index dfdddbb67d122..3301cf244afe1 100644
--- a/vllm/attention/backends/hpu_attn.py
+++ b/vllm/attention/backends/hpu_attn.py
@@ -130,9 +130,6 @@ def __init__(
         self.prefill_use_fusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA',
                                                '1').lower() in ['1', 'true'] \
                                                and not is_fake_hpu()
-        if self.prefill_use_fusedsdpa:
-            assert alibi_slopes is None, \
-                'Prefill with FusedSDPA not supported with alibi slopes!'
 
         suppored_head_sizes = HPUPagedAttention.get_supported_head_sizes()
         if head_size not in suppored_head_sizes:
@@ -196,7 +193,8 @@ def forward(
             kv_shape = (batch_size, seq_len_kv, self.num_kv_heads,
                         self.head_size)
             if attn_metadata is None or attn_metadata.block_list is None:
-                if not self.prefill_use_fusedsdpa:
+                if (not self.prefill_use_fusedsdpa
+                        or self.alibi_slopes is not None):
                     # TODO: move this outside of model
                     assert attn_metadata.attn_bias is not None, \
                         'attn_bias must be set before calling model.forward'
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index b7561a1cf7ee4..856f237d19e0f 100755
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -172,9 +172,6 @@ class HpuModelAdapter:
 
     def __init__(self, model, block_size, dtype, enforce_eager):
         self.model = model
-        self.prefill_use_fusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA',
-                                               '1').lower() in ['1', 'true'] \
-                                               and not is_fake_hpu()
         self.block_size = block_size
         self.dtype = dtype
         if not is_fake_hpu() and not htorch.utils.internal.is_lazy(
@@ -212,8 +209,7 @@ def _compile_region(self, model, name, module):
 
     def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device,
                        dtype):
-        if (attn_metadata is None or self.prefill_use_fusedsdpa
-                or not attn_metadata.is_prompt):
+        if (attn_metadata is None or not attn_metadata.is_prompt):
            return attn_metadata
        prefill_metadata = attn_metadata