diff --git a/vllm/executor/habana_executor.py b/vllm/executor/habana_executor.py index 44226fc898218..e4bd54f8849b3 100644 --- a/vllm/executor/habana_executor.py +++ b/vllm/executor/habana_executor.py @@ -195,9 +195,6 @@ def check_health(self) -> None: def shutdown(self) -> None: self.driver_worker.shutdown_inc() - def __del__(self): - self.shutdown() - class HabanaExecutorAsync(HabanaExecutor, ExecutorAsyncBase): diff --git a/vllm/worker/habana_model_runner.py b/vllm/worker/habana_model_runner.py index c99500ef1296b..6940e7637dbb7 100644 --- a/vllm/worker/habana_model_runner.py +++ b/vllm/worker/habana_model_runner.py @@ -550,6 +550,7 @@ def __init__( # Lazy initialization self.lora_manager: LRUCacheWorkerLoRAManager = None self.model: torch.nn.Module = None + self.inc_initialized_successfully = False # Profiler stats self.profiler_counter_helper = HabanaProfilerCounterHelper() @@ -632,6 +633,7 @@ def load_model(self) -> None: self.model = convert(self.model, config) htcore.hpu_initialize(self.model, mark_only_scales_as_const=True) + self.inc_initialized_successfully = True logger.info("Preparing model with INC took %s", m_inc.get_summary_string()) elif not is_fake_hpu(): @@ -1938,14 +1940,18 @@ def execute_model( return [output] def shutdown_inc(self): - print('inc shutdown') - if (model_config := getattr(self, "model_config", None)) and \ - getattr(model_config, "quantization", None) == 'inc': - print('inc shutdown start') + can_finalize_inc = False + from contextlib import suppress + with suppress(AttributeError): + can_finalize_inc = (self.model_config.quantization == 'inc') and \ + (self.model.model is not None) and \ + self.inc_initialized_successfully and \ + not getattr(self, "_is_inc_finalized", False) + if can_finalize_inc: from neural_compressor.torch.quantization import ( finalize_calibration) finalize_calibration(self.model.model) - print('inc shutdown') + self._is_inc_finalized = True def __del__(self): self.shutdown_inc() diff --git a/vllm/worker/habana_worker.py b/vllm/worker/habana_worker.py index 8cdbba02fbb33..2e4dfeac42c3e 100644 --- a/vllm/worker/habana_worker.py +++ b/vllm/worker/habana_worker.py @@ -320,9 +320,6 @@ def list_prompt_adapters(self) -> Set[int]: def shutdown_inc(self): self.model_runner.shutdown_inc() - def __del__(self): - self.shutdown_inc() - @property def max_model_len(self) -> int: return self.model_config.max_model_len