
Commit 911f14b
fix an accuracy issue caused by selected_token_index
Signed-off-by: Chendi.Xue <[email protected]>
xuechendi committed Jan 7, 2025
1 parent fade386 commit 911f14b
Showing 1 changed file with 8 additions and 3 deletions.
vllm/worker/hpu_model_runner.py (8 additions & 3 deletions)
@@ -479,8 +479,10 @@ def forward(self, *args, **kwargs):
             LoraMask.setLoraMask(kwargs.pop('lora_mask'))
         if self.layer_names is not None:
             self._prepare_cos_sin(kwargs['positions'])
-        print("Warming up HPU Graph - input_ids: ", input_ids.shape,
-              "seq_lens_tensor: ", kwargs['attn_metadata'].seq_lens_tensor)
+        if kwargs['attn_metadata'].is_prompt:
+            print("Warming up HPU Graph - input_ids: ", input_ids,
+                  "seq_lens_tensor: ", kwargs['attn_metadata'].seq_lens_tensor,
+                  "selected_token_indices: ", selected_token_indices)
         hidden_states = self.model(*args, **kwargs)
         hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
         hidden_states = hidden_states.index_select(0, selected_token_indices)
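
Editor's note: `selected_token_indices` is what the adapter uses just below the changed print to gather the rows of `hidden_states` that feed sampling, so logging it alongside the prompt inputs makes the warmup trace directly comparable to that gather. A minimal sketch of the gather follows; the shapes and index values are illustrative, not taken from the commit:

    import torch

    # Two prompts of lengths 3 and 5, each padded to max_len = 8.
    # Flattened, the batch has 2 * 8 = 16 rows of hidden states.
    hidden_states = torch.randn(2, 8, 16)  # [batch, padded_seq_len, hidden]
    hidden_states = hidden_states.view(-1, hidden_states.shape[-1])  # [16, 16]

    # The last real token of prompt 0 sits at flat index 2; the last real
    # token of prompt 1 starts after prompt 0's padding, at 8 + 4 = 12.
    selected_token_indices = torch.tensor([2, 12])
    sampled = hidden_states.index_select(0, selected_token_indices)  # [2, 16]
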
@@ -1613,7 +1615,10 @@ def prepare_input_tensors(

         # FIXME: We need to adjust selected_token_indices to accommodate
         # for padding
-        max_len = input_tokens.size(1)
+        if self.enable_merged_prefill:
+            max_len = slot_mapping.size(1)
+        else:
+            max_len = input_tokens.size(1)
         paddings = [max_len - q for q in query_lens]
         paddings = [0] + paddings[:-1]
         paddings = list(itertools.accumulate(paddings))
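
Editor's note: this hunk is the actual fix. With merged prefill, all prompts are packed into a single padded sequence, so the padded width that determines each prompt's offset is `slot_mapping.size(1)` rather than `input_tokens.size(1)`; using the wrong width skews the cumulative offsets and leaves `selected_token_indices` pointing at padding instead of each prompt's last real token, which is presumably the accuracy issue named in the commit title. Below is a worked example of the offset arithmetic with illustrative values; the final `+=` onto `selected_token_indices` happens below the visible hunk and is an assumption here:

    import itertools
    import torch

    # Illustrative values: two prompts of lengths 3 and 5 padded to max_len = 8.
    query_lens = [3, 5]
    max_len = 8

    paddings = [max_len - q for q in query_lens]     # [5, 3] padding per prompt
    paddings = [0] + paddings[:-1]                   # [0, 5] each prompt is shifted
                                                     # by its predecessors' padding
    paddings = list(itertools.accumulate(paddings))  # [0, 5] cumulative offsets

    # Assumed downstream use (not shown in this hunk): shift the unpadded
    # index of each prompt's last token so it survives padding.
    selected_token_indices = torch.tensor([2, 7])    # last tokens before padding
    selected_token_indices += torch.tensor(paddings)
    print(selected_token_indices)                    # tensor([ 2, 12])
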
