diff --git a/vllm/worker/hpu_enc_dec_model_runner.py b/vllm/worker/hpu_enc_dec_model_runner.py index 2b0c684d16c9f..2b8acb502822d 100644 --- a/vllm/worker/hpu_enc_dec_model_runner.py +++ b/vllm/worker/hpu_enc_dec_model_runner.py @@ -332,6 +332,15 @@ def _prepare_encoder_model_input_tensors( attn_metadata.cross_block_groups = block_groups attn_metadata.cross_block_usage = block_usage + # add padding to align with language model shapes + real_batch_size = len(seq_group_metadata_list) + batch_size_padded = self.bucketing_ctx.get_padded_batch_size( + real_batch_size, is_prompt) + batch_size_padding = batch_size_padded - real_batch_size + if batch_size_padding > 0: + encoder_seq_lens.extend(encoder_seq_lens[0] + for _ in range(batch_size_padding)) + encoder_seq_lens_tensor = self._list_to_int32_tensor(encoder_seq_lens) attn_metadata.encoder_seq_lens = encoder_seq_lens attn_metadata.encoder_seq_lens_tensor = encoder_seq_lens_tensor