From efe0268b51de321aae4fcb509146383c801ca7b3 Mon Sep 17 00:00:00 2001 From: Jan Kaniecki Date: Wed, 20 Nov 2024 13:29:20 +0100 Subject: [PATCH] Skip empty steps in multi-step scheduling (#526) This change allows skipping empty steps in the multi-step scenario. We are currently wasting host time on launching n-2 empty steps; this PR removes that overhead. The gain will become visible after device-time optimizations, as we are currently limited by HPU calculations inside multi-step. --- vllm/worker/hpu_model_runner.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index be4918c840d9b..03c5e62c8f11e 100755 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -2241,9 +2241,12 @@ def try_revert_dummy_output_tokens(): else: raise RuntimeError( "seq_group_metadata_list is uninitialized") - # Cache the original output token ids for i, seq_group_metadata in enumerate( seq_group_metadata_list): + # Skip empty steps + seq_group_metadata.state.current_step += ( + num_steps - 2) + # Cache the original output token ids cache_orig_output_tokens_len.append({}) for j, data in seq_group_metadata.seq_data.items(): cache_orig_output_tokens_len[i][j] = \