Enable FusedSDPA prefill by default (#447)

This removers the need to pass VLLM_PROMPT_USE_FUSEDSDPA environment variable in order to enable FusedSDPA attention. Fallback attention can still be used if VLLM_PROMPT_USE_FUSEDSDPA=0 is provided.
HabanaAI · Oct 30, 2024 · ec7eec7 · ec7eec7
1 parent 049d9dc
commit ec7eec7
Showing 1 changed file with 2 additions and 1 deletion.
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
@@ -282,7 +282,8 @@ class HpuModelAdapter():
     def __init__(self, model, block_size, dtype, enforce_eager):
         self.model = model
         self.prefill_use_fusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA',
-                                               '0').lower() in ['1', 'true']
+                                               '1').lower() in ['1', 'true'] \
+                                                and not is_fake_hpu()
         self.block_size = block_size
         self.dtype = dtype
         if not is_fake_hpu() and not htorch.utils.internal.is_lazy(