From ec7eec7e3789dd8a5529c0fe54eac55f78a6c925 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Wed, 30 Oct 2024 12:57:29 +0100 Subject: [PATCH] Enable FusedSDPA prefill by default (#447) This removes the need to pass the VLLM_PROMPT_USE_FUSEDSDPA environment variable in order to enable FusedSDPA attention. Fallback attention can still be used if VLLM_PROMPT_USE_FUSEDSDPA=0 is provided. --- vllm/worker/hpu_model_runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 559ed33548dea..a92d952c308fd 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -282,7 +282,8 @@ class HpuModelAdapter(): def __init__(self, model, block_size, dtype, enforce_eager): self.model = model self.prefill_use_fusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA', - '0').lower() in ['1', 'true'] + '1').lower() in ['1', 'true'] \ + and not is_fake_hpu() self.block_size = block_size self.dtype = dtype if not is_fake_hpu() and not htorch.utils.internal.is_lazy(