diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index af0f010781040..89a36adcf5c1a 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1288,9 +1288,6 @@ def _advance_to_next_step(
                 seq = seq_group.seqs[0]
                 seq.append_token_id(sample.output_token, sample.logprobs)
 
-    def finish_measurements(self):
-        self.model_executor.finish_measurements()
-
     def step(self) -> List[Union[RequestOutput, EmbeddingRequestOutput]]:
         """Performs one decoding iteration and returns newly generated results.
 
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 9513252bdabe4..98d6df944da67 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -230,9 +230,6 @@ def set_tokenizer(self, tokenizer: AnyTokenizer) -> None:
         else:
             tokenizer_group.tokenizer = get_cached_tokenizer(tokenizer)
 
-    def finish_measurements(self):
-        self.llm_engine.finish_measurements()
-
     @overload  # LEGACY: single (prompt + optional token ids)
     def generate(
         self,
diff --git a/vllm/executor/hpu_executor.py b/vllm/executor/hpu_executor.py
index 34879bc4e7ef5..dda3b5e9b3a01 100644
--- a/vllm/executor/hpu_executor.py
+++ b/vllm/executor/hpu_executor.py
@@ -26,6 +26,7 @@ class HPUExecutor(ExecutorBase):
     def _init_executor(self) -> None:
        """Initialize the worker and load the model."""
         self._init_worker()
+        self.shutdown_inc = True
 
     def _get_worker_kwargs(
         self,
@@ -90,9 +91,6 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None:
         msg = f"init_cache_engine took {cache_init_m.get_summary_string()}"
         logger.info(msg)
 
-    def finish_measurements(self):
-        self.driver_worker.finish_measurements()
-
     def execute_model(
         self,
         execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
@@ -197,7 +195,9 @@ def stop_profile(self) -> None:
         self.driver_worker.stop_profile()
 
     def shutdown(self) -> None:
-        self.driver_worker.shutdown_inc()
+        if self.shutdown_inc:
+            self.driver_worker.shutdown_inc()
+            self.shutdown_inc = False
 
 
 class HPUExecutorAsync(HPUExecutor, ExecutorAsyncBase):
diff --git a/vllm/executor/ray_hpu_executor.py b/vllm/executor/ray_hpu_executor.py
index 343fa43b0eda1..2f0037eb6dffe 100644
--- a/vllm/executor/ray_hpu_executor.py
+++ b/vllm/executor/ray_hpu_executor.py
@@ -70,7 +70,12 @@ def _init_executor(self) -> None:
         self.output_decoder = msgspec.msgpack.Decoder(
             Optional[List[SamplerOutput]])
 
+        self.shutdown_inc = True
+
     def shutdown(self) -> None:
+        if self.shutdown_inc:
+            self._run_workers("shutdown_inc")
+            self.shutdown_inc = False
         if hasattr(self, "forward_dag") and self.forward_dag is not None:
             self.forward_dag.teardown()
             import ray
diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index d3fa9c287234c..1a535b9e8e9cd 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -1779,10 +1779,6 @@ def prepare_model_input(
             is_prompt=is_prompt,
             virtual_engine=virtual_engine)
 
-    def finish_measurements(self):
-        from neural_compressor.torch.quantization import finalize_calibration
-        finalize_calibration(self.model.model)
-
     def _check_config(self, batch_size, seq_len, is_prompt, warmup_mode):
         cfg = (batch_size, seq_len, is_prompt)
         seen = cfg in self.seen_configs
@@ -1993,6 +1989,3 @@ def shutdown_inc(self):
                 finalize_calibration)
             finalize_calibration(self.model.model)
             self._is_inc_finalized = True
-
-    def __del__(self):
-        self.shutdown_inc()
diff --git a/vllm/worker/hpu_worker.py b/vllm/worker/hpu_worker.py
index 59a5adf65ebc1..00154f52483cd 100644
--- a/vllm/worker/hpu_worker.py
+++ b/vllm/worker/hpu_worker.py
@@ -260,9 +260,6 @@ def _warm_up_model(self) -> None:
         # the model initialization and profiling.
         set_random_seed(self.model_config.seed)
 
-    def finish_measurements(self):
-        self.model_runner.finish_measurements()
-
     @property
     def do_metadata_broadcast(self) -> bool:
         return self.parallel_config.tensor_parallel_size > 1
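Taken together, the executor changes make shutdown idempotent: `shutdown_inc` starts as `True` at init time and flips to `False` after the first `shutdown()` call, so Intel Neural Compressor (INC) finalization runs at most once even if shutdown is triggered repeatedly, and the removed `__del__` hook no longer re-enters it during interpreter teardown. Below is a minimal standalone sketch of that guard pattern; the `Worker` and `Executor` stubs here are illustrative only, not the actual vLLM classes:

```python
class Worker:
    """Stand-in for the HPU driver worker; only the shutdown hook matters."""

    def shutdown_inc(self) -> None:
        # In vLLM this is where INC calibration would be finalized.
        print("finalizing INC (runs at most once)")


class Executor:
    """Sketch of the flag-guarded shutdown added to HPUExecutor."""

    def __init__(self) -> None:
        self.driver_worker = Worker()
        # Set once at init; cleared after the first successful shutdown.
        self.shutdown_inc = True

    def shutdown(self) -> None:
        # Only the first call reaches the worker; later calls are no-ops.
        if self.shutdown_inc:
            self.driver_worker.shutdown_inc()
            self.shutdown_inc = False


executor = Executor()
executor.shutdown()  # prints once
executor.shutdown()  # no-op: the flag is already cleared
```

Guarding with an explicit flag rather than relying on `__del__` avoids depending on garbage-collection order, where attributes such as `self.model` may already be partially torn down when the finalizer runs.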