From 3af4b6ce54ccdfc91516b335c5331045d78c99a2 Mon Sep 17 00:00:00 2001
From: Sanju C Sudhakaran
Date: Tue, 22 Oct 2024 19:03:58 +0530
Subject: [PATCH] Remove redundant set_active_loras call during warmup (#413)

CUDA uses `capture` for warmup runs and `execute_model` for actual runs,
and each phase calls `set_active_loras` exactly once. HPU uses
`execute_model` for both warmup and actual runs, and `execute_model`
already calls `set_active_loras` internally, so the extra call in the
warmup path is redundant. It is also incorrect: it causes out-of-bound
slicing in the decode phase, as reported in
https://github.com/HabanaAI/vllm-fork/issues/405.

This PR removes the special handling of `set_active_loras` from warmup
runs, resolving https://github.com/HabanaAI/vllm-fork/issues/405.
---
 vllm/worker/hpu_model_runner.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 888a9a9da942c..f2875194e93a0 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -1354,12 +1354,6 @@ def warmup_scenario(self,
         ]
         self.profiler.start('internal', scenario_name)
         times = 3 if use_graphs or is_pt_profiler_run else 1
-        if self.lora_config and not is_lora_profile_run:
-            lora_mapping = LoRAMapping(
-                **dict(index_mapping=[0] * batch_size * seq_len,
-                       prompt_mapping=[0] * batch_size * seq_len,
-                       is_prefill=is_prompt))
-            self.set_active_loras(set(), lora_mapping)
         if is_prompt:
             seqs = [
                 self.create_dummy_seq_group_metadata(
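For reviewers, a standalone sketch of the failure mode (hypothetical
sizes and variable names, not code from this repository): in the decode
phase each sequence contributes a single token, so the mapping the
removed block always built with `batch_size * seq_len` entries overruns
buffers laid out per decode token.

```python
# Standalone illustration of the size mismatch behind the out-of-bound
# slicing in issue #405; the values are hypothetical, not vllm code.

batch_size, seq_len = 2, 128

# The removed warmup block sized the LoRA index mapping as if every
# position were a prompt token, regardless of phase:
warmup_index_mapping = [0] * batch_size * seq_len   # 256 entries

# A decode step processes one token per sequence, so decode-phase
# buffers hold only batch_size entries:
decode_buffer = [0] * batch_size                    # 2 entries

# Indexing a decode-sized buffer with the oversized mapping walks past
# its end -- the reported out-of-bound slicing:
assert len(warmup_index_mapping) > len(decode_buffer)

# execute_model instead derives the mapping from the actual (dummy)
# sequence-group metadata, which is why the warmup-time call can be
# dropped safely.
```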