From 8c6dcae75c06144081c98cb07668be87d763eb47 Mon Sep 17 00:00:00 2001
From: Konrad Zawora
Date: Wed, 25 Sep 2024 14:13:15 +0200
Subject: [PATCH] Refine INC shutdown code (#335)

This PR removes debug printouts in INC shutdown method and covers the case
where application exits before model is initialized properly.
---
 vllm/executor/habana_executor.py   |  3 ---
 vllm/worker/habana_model_runner.py | 16 +++++++++++-----
 vllm/worker/habana_worker.py       |  3 ---
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py
index 44226fc898218..e4bd54f8849b3 100644
--- a/vllm/executor/habana_executor.py
+++ b/vllm/executor/habana_executor.py
@@ -195,9 +195,6 @@ def check_health(self) -> None:
     def shutdown(self) -> None:
         self.driver_worker.shutdown_inc()
 
-    def __del__(self):
-        self.shutdown()
-
 
 class HabanaExecutorAsync(HabanaExecutor, ExecutorAsyncBase):
 
diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py
index c99500ef1296b..6940e7637dbb7 100644
--- a/vllm/worker/habana_model_runner.py
+++ b/vllm/worker/habana_model_runner.py
@@ -550,6 +550,7 @@ def __init__(
         # Lazy initialization
         self.lora_manager: LRUCacheWorkerLoRAManager = None
         self.model: torch.nn.Module = None
+        self.inc_initialized_successfully = False
 
         # Profiler stats
         self.profiler_counter_helper = HabanaProfilerCounterHelper()
@@ -632,6 +633,7 @@ def load_model(self) -> None:
                     self.model = convert(self.model, config)
                     htcore.hpu_initialize(self.model,
                                           mark_only_scales_as_const=True)
+                    self.inc_initialized_successfully = True
                     logger.info("Preparing model with INC took %s",
                                 m_inc.get_summary_string())
                 elif not is_fake_hpu():
@@ -1938,14 +1940,18 @@ def execute_model(
         return [output]
 
     def shutdown_inc(self):
-        print('inc shutdown')
-        if (model_config := getattr(self, "model_config", None)) and \
-                getattr(model_config, "quantization", None) == 'inc':
-            print('inc shutdown start')
+        can_finalize_inc = False
+        from contextlib import suppress
+        with suppress(AttributeError):
+            can_finalize_inc = (self.model_config.quantization == 'inc') and \
+                (self.model.model is not None) and \
+                self.inc_initialized_successfully and \
+                not getattr(self, "_is_inc_finalized", False)
+        if can_finalize_inc:
             from neural_compressor.torch.quantization import (
                 finalize_calibration)
             finalize_calibration(self.model.model)
-            print('inc shutdown')
+            self._is_inc_finalized = True
 
     def __del__(self):
         self.shutdown_inc()
diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py
index 8cdbba02fbb33..2e4dfeac42c3e 100644
--- a/vllm/worker/habana_worker.py
+++ b/vllm/worker/habana_worker.py
@@ -320,9 +320,6 @@ def list_prompt_adapters(self) -> Set[int]:
     def shutdown_inc(self):
         self.model_runner.shutdown_inc()
 
-    def __del__(self):
-        self.shutdown_inc()
-
     @property
     def max_model_len(self) -> int:
         return self.model_config.max_model_len