From d81f829e170d40f303e1a287d7278da8166a50d9 Mon Sep 17 00:00:00 2001
From: RafLit
Date: Wed, 18 Dec 2024 13:27:43 +0100
Subject: [PATCH] TC llama recompile fix - no_grad to inference_mode (#640)

During warmup, inference_mode is used, but at runtime it is overwritten by
no_grad; this causes recompilations due to a dispatch key mismatch in
torch.compile. This switches from no_grad to the inference_mode of the base
class.

---------

Co-authored-by: Rafal Litka
---
 vllm/platforms/hpu.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/vllm/platforms/hpu.py b/vllm/platforms/hpu.py
index ee83187fff797..314cd98212e9c 100644
--- a/vllm/platforms/hpu.py
+++ b/vllm/platforms/hpu.py
@@ -1,7 +1,5 @@
 from typing import TYPE_CHECKING, Optional
 
-import torch
-
 from .interface import Platform, PlatformEnum, _Backend
 
 if TYPE_CHECKING:
@@ -24,10 +22,6 @@ def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
     def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
         return True
 
-    @staticmethod
-    def inference_mode():
-        return torch.no_grad()
-
     @classmethod
     def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
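
Note (not part of the applied diff): a minimal sketch of the recompilation the
message describes, assuming stock PyTorch 2.x; CPU is enough to illustrate it.
Warming a compiled function up under torch.inference_mode() and then calling
it under torch.no_grad() hands it tensors with a different dispatch key set,
so torch.compile's guards fail and the function is compiled again.

    # Sketch only: illustrates the dispatch-key-driven recompilation,
    # not HPU-specific code from this patch.
    import torch

    torch._logging.set_logs(recompiles=True)  # log the reason for each recompile

    @torch.compile
    def forward(x):
        # Stand-in for a model forward pass.
        return x * 2 + 1

    # Warmup path: inputs created and run under inference_mode.
    with torch.inference_mode():
        forward(torch.ones(4))

    # Pre-patch runtime path: the platform's inference_mode() returned
    # torch.no_grad(), so the same call now sees non-inference tensors and
    # a different dispatch key set, triggering a recompile.
    with torch.no_grad():
        forward(torch.ones(4))

With the override removed, warmup and runtime both run under the base class's
inference_mode, so the guards compiled during warmup keep matching at runtime.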