From d81f829e170d40f303e1a287d7278da8166a50d9 Mon Sep 17 00:00:00 2001
From: RafLit
Date: Wed, 18 Dec 2024 13:27:43 +0100
Subject: [PATCH] TC llama recompile fix - no_grad to inference_mode (#640)

During warmup, inference_mode is used, but at runtime it is overwritten by
no_grad; this causes recompilations due to a dispatch key mismatch in
torch.compile. This switches from no_grad to the inference_mode of the base
class.

---------

Co-authored-by: Rafal Litka
---
 vllm/platforms/hpu.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py
index ee83187fff797..314cd98212e9c 100644
--- a/vllm/platforms/hpu.py
+++ b/vllm/platforms/hpu.py
@@ -1,7 +1,5 @@
 from typing import TYPE_CHECKING, Optional
 
-import torch
-
 from .interface import Platform, PlatformEnum, _Backend
 
 if TYPE_CHECKING:
@@ -24,10 +22,6 @@ def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
     def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
         return True
 
-    @staticmethod
-    def inference_mode():
-        return torch.no_grad()
-
     @classmethod
     def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
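
Note (not part of the applied diff): a minimal sketch of the recompilation the
message describes, assuming stock PyTorch 2.x; CPU is enough to illustrate it.
Warming a compiled function up under torch.inference_mode() and then calling
it under torch.no_grad() hands it tensors with a different dispatch key set,
so torch.compile's guards fail and the function is compiled again.

    # Sketch only: illustrates the dispatch-key-driven recompilation,
    # not HPU-specific code from this patch.
    import torch

    torch._logging.set_logs(recompiles=True)  # log the reason for each recompile

    @torch.compile
    def forward(x):
        # Stand-in for a model forward pass.
        return x * 2 + 1

    # Warmup path: inputs created and run under inference_mode.
    with torch.inference_mode():
        forward(torch.ones(4))

    # Pre-patch runtime path: the platform's inference_mode() returned
    # torch.no_grad(), so the same call now sees non-inference tensors and
    # a different dispatch key set, triggering a recompile.
    with torch.no_grad():
        forward(torch.ones(4))

With the override removed, warmup and runtime both run under the base class's
inference_mode, so the guards compiled during warmup keep matching at runtime.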