Skip to content

Commit

Permalink
Limit number of dummy cross attention blocks
Browse files: browse the repository at this point in the history
  • Loading branch information
kdamaszk committed Jan 7, 2025
1 parent 9d6917f commit 137eeec
Showing 1 changed file with 3 additions and 1 deletion.
4 changes: 3 additions & 1 deletion vllm/worker/hpu_enc_dec_model_runner.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -426,7 +426,6 @@ def create_dummy_seq_group_metadata(self,
num_images = mm_counts["image"]
max_mm_tokens = self.mm_registry.get_max_multimodal_tokens(
self.model_config) * num_images
num_cross_blocks = math.ceil(max_mm_tokens / self.block_size)
seq_len = max(seq_len, 1)
if is_prompt:
input_len = seq_len
Expand All @@ -437,6 +436,9 @@ def create_dummy_seq_group_metadata(self,
input_len = seq_len - 1
output_len = 1
block_tables = {group_id: [_PAD_BLOCK_ID] * num_blocks}
# limit cross blocks to the number of available blocks
num_cross_blocks = min(self.bucketing_ctx.num_hpu_blocks,
max_mm_tokens) // self.block_size
cross_block_table = [_PAD_BLOCK_ID] * num_cross_blocks
prompt_token_ids = [0] * input_len
output_token_ids = [1] * output_len
Expand Down

0 comments on commit 137eeec

Please sign in to comment.