Refine INC shutdown code (#335)

This PR removes the debug printouts from the INC shutdown method and handles the case where the application exits before the model has been properly initialized.
HabanaAI · Sep 25, 2024 · 8c6dcae · 8c6dcae
1 parent 9111a80
commit 8c6dcae
Show file tree

Hide file tree

Showing 3 changed files with 11 additions and 11 deletions.
diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py
@@ -195,9 +195,6 @@ def check_health(self) -> None:
     def shutdown(self) -> None:
         self.driver_worker.shutdown_inc()
 
-    def __del__(self):
-        self.shutdown()
-
 
 class HabanaExecutorAsync(HabanaExecutor, ExecutorAsyncBase):
 

diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py
@@ -550,6 +550,7 @@ def __init__(
         # Lazy initialization
         self.lora_manager: LRUCacheWorkerLoRAManager = None
         self.model: torch.nn.Module = None
+        self.inc_initialized_successfully = False
 
         # Profiler stats
         self.profiler_counter_helper = HabanaProfilerCounterHelper()
@@ -632,6 +633,7 @@ def load_model(self) -> None:
                         self.model = convert(self.model, config)
                     htcore.hpu_initialize(self.model,
                                           mark_only_scales_as_const=True)
+                self.inc_initialized_successfully = True
                 logger.info("Preparing model with INC took %s",
                             m_inc.get_summary_string())
             elif not is_fake_hpu():
@@ -1938,14 +1940,18 @@ def execute_model(
         return [output]
 
     def shutdown_inc(self):
-        print('inc shutdown')
-        if (model_config := getattr(self, "model_config", None)) and \
-                         getattr(model_config, "quantization", None) == 'inc':
-            print('inc shutdown start')
+        can_finalize_inc = False
+        from contextlib import suppress
+        with suppress(AttributeError):
+            can_finalize_inc = (self.model_config.quantization == 'inc') and \
+                (self.model.model is not None) and \
+                self.inc_initialized_successfully and \
+                not getattr(self, "_is_inc_finalized", False)
+        if can_finalize_inc:
             from neural_compressor.torch.quantization import (
                 finalize_calibration)
             finalize_calibration(self.model.model)
-            print('inc shutdown')
+            self._is_inc_finalized = True
 
     def __del__(self):
         self.shutdown_inc()
diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py
@@ -320,9 +320,6 @@ def list_prompt_adapters(self) -> Set[int]:
     def shutdown_inc(self):
         self.model_runner.shutdown_inc()
 
-    def __del__(self):
-        self.shutdown_inc()
-
     @property
     def max_model_len(self) -> int:
         return self.model_config.max_model_len