TC llama recompile fix - no_grad to inference_mode (#640)
During warmup, inference_mode is used, but at runtime it is overwritten by no_grad; this causes recompilations due to a dispatch key mismatch in torch.compile.
This switches from the no_grad override to the inference_mode provided by the base class.

---------

Co-authored-by: Rafal Litka <[email protected]>
RafLit and Rafal Litka authored Dec 18, 2024
1 parent da61ecf commit d81f829
Showing 1 changed file with 0 additions and 6 deletions.
6 changes: 0 additions & 6 deletions vllm/platforms/hpu.py
@@ -1,7 +1,5 @@
 from typing import TYPE_CHECKING, Optional
 
-import torch
-
 from .interface import Platform, PlatformEnum, _Backend
 
 if TYPE_CHECKING:
@@ -24,10 +22,6 @@ def get_default_attn_backend(cls, selected_backend: _Backend) -> _Backend:
     def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
         return True
 
-    @staticmethod
-    def inference_mode():
-        return torch.no_grad()
-
     @classmethod
     def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
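
For context, a minimal illustrative sketch of the behaviour described in the commit message (a toy example, not vllm code; it assumes the base class's inference_mode() wraps torch.inference_mode()). With the HPU-specific no_grad override removed, warmup and runtime run under the same mode, so torch.compile sees a consistent dispatch-key state:

    import torch

    # Toy example of the pattern the commit relies on; not taken from vllm.
    @torch.compile
    def scale_and_shift(x: torch.Tensor) -> torch.Tensor:
        return x * 2 + 1

    x = torch.randn(4)

    with torch.inference_mode():       # warmup pass
        scale_and_shift(x)

    with torch.inference_mode():       # runtime pass uses the same mode,
        scale_and_shift(x)             # so no dispatch-key mismatch or recompile

If the second block used torch.no_grad() instead, the autograd/inference state seen by torch.compile would differ from the warmup trace, which is the recompilation the commit avoids.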
