Remove assert for alibi in case of FusedSDPA. #587

Open · wants to merge 1 commit into habana_main
6 changes: 2 additions & 4 deletions vllm/attention/backends/hpu_attn.py
@@ -130,9 +130,6 @@ def __init__(
         self.prefill_use_fusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA',
                                                '1').lower() in ['1', 'true'] \
                                                and not is_fake_hpu()
-        if self.prefill_use_fusedsdpa:
-            assert alibi_slopes is None, \
-                'Prefill with FusedSDPA not supported with alibi slopes!'
 
         suppored_head_sizes = HPUPagedAttention.get_supported_head_sizes()
         if head_size not in suppored_head_sizes:
@@ -196,7 +193,8 @@ def forward(
             kv_shape = (batch_size, seq_len_kv, self.num_kv_heads,
                         self.head_size)
             if attn_metadata is None or attn_metadata.block_list is None:
-                if not self.prefill_use_fusedsdpa:
+                if (not self.prefill_use_fusedsdpa
+                        or self.alibi_slopes is not None):
                     # TODO: move this outside of model
                     assert attn_metadata.attn_bias is not None, \
                         'attn_bias must be set before calling model.forward'
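Taken together with the removed assert above, the effect is: FusedSDPA still skips the precomputed mask for plain causal prefill, but any ALiBi slopes force the attn_bias path again. A minimal sketch of the resulting gate, using names from the diff (the standalone helper itself is hypothetical):

```python
from typing import Optional, Sequence


def needs_precomputed_attn_bias(prefill_use_fusedsdpa: bool,
                                alibi_slopes: Optional[Sequence[float]]) -> bool:
    """attn_bias must be supplied unless FusedSDPA handles masking itself.

    FusedSDPA can generate its own causal mask, but it has no notion of
    ALiBi, so the presence of slopes re-enables the precomputed-bias path.
    """
    return not prefill_use_fusedsdpa or alibi_slopes is not None


# Before this PR, FusedSDPA + ALiBi was rejected by an assert in __init__.
# After it, the combination is allowed and takes the bias path instead.
assert needs_precomputed_attn_bias(True, [0.5, 0.25]) is True
assert needs_precomputed_attn_bias(True, None) is False
assert needs_precomputed_attn_bias(False, None) is True
```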
6 changes: 1 addition & 5 deletions vllm/worker/hpu_model_runner.py
@@ -172,9 +172,6 @@ class HpuModelAdapter:
 
     def __init__(self, model, block_size, dtype, enforce_eager):
         self.model = model
-        self.prefill_use_fusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA',
-                                               '1').lower() in ['1', 'true'] \
-                                               and not is_fake_hpu()
         self.block_size = block_size
         self.dtype = dtype
         if not is_fake_hpu() and not htorch.utils.internal.is_lazy(
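After this hunk, HpuModelAdapter no longer reads VLLM_PROMPT_USE_FUSEDSDPA at all; only HPUAttentionImpl in hpu_attn.py keeps the flag. For reference, the deleted lines used the truthy-string convention where '1' or 'true' (case-insensitive) enables the feature; a standalone sketch of that convention, with a hypothetical helper name:

```python
import os


def env_flag(name: str, default: str = '1') -> bool:
    """Parse an env var the way the deleted lines did: '1' or 'true' enables."""
    return os.getenv(name, default).lower() in ('1', 'true')


# Matches the default-on behavior of VLLM_PROMPT_USE_FUSEDSDPA.
use_fusedsdpa = env_flag('VLLM_PROMPT_USE_FUSEDSDPA')
```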
@@ -212,8 +209,7 @@ def _compile_region(self, model, name, module):
 
     def _set_attn_bias(self, attn_metadata, batch_size, seq_len, device,
                        dtype):
-        if (attn_metadata is None or self.prefill_use_fusedsdpa
-                or not attn_metadata.is_prompt):
+        if (attn_metadata is None or not attn_metadata.is_prompt):

Review comment on the changed line above: "Why did you remove prefill_use_fusedsdpa here?"

             return attn_metadata
 
         prefill_metadata = attn_metadata
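Background for the question above: ALiBi adds a per-head linear bias to the attention scores, and FusedSDPA's built-in causal masking has no hook for such a bias, so with ALiBi the mask must arrive through attn_metadata.attn_bias. A generic illustration of how such a bias is typically built, not this repository's implementation:

```python
import torch


def build_alibi_bias(slopes: torch.Tensor, seq_len: int) -> torch.Tensor:
    """Per-head ALiBi bias plus causal mask, shape (num_heads, seq_len, seq_len)."""
    pos = torch.arange(seq_len)
    rel = pos[None, :] - pos[:, None]               # rel[i, j] = j - i, <= 0 for allowed keys
    bias = slopes[:, None, None] * rel[None, :, :]  # linear distance penalty per head
    causal = torch.full((seq_len, seq_len), float('-inf')).triu(1)  # -inf above diagonal
    return bias + causal


# Example: two heads with geometric slopes, as in the ALiBi paper.
mask = build_alibi_bias(torch.tensor([0.5, 0.25]), seq_len=4)
```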