From 3af4b6ce54ccdfc91516b335c5331045d78c99a2 Mon Sep 17 00:00:00 2001
From: Sanju C Sudhakaran
Date: Tue, 22 Oct 2024 19:03:58 +0530
Subject: [PATCH] Remove redundant set_active_loras call during warmup (#413)

CUDA uses `capture` for warmup runs and `execute_model` for actual runs,
and each phase calls `set_active_loras` exactly once. HPU uses
`execute_model` for both warmup and actual runs, and `execute_model`
already calls `set_active_loras` internally, so the extra call in the
warmup path is redundant. It is also incorrect: it causes out-of-bound
slicing in the decode phase, as reported in
https://github.com/HabanaAI/vllm-fork/issues/405.

This PR removes the special handling of `set_active_loras` from warmup
runs, resolving https://github.com/HabanaAI/vllm-fork/issues/405.
---
 vllm/worker/hpu_model_runner.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py
index 888a9a9da942c..f2875194e93a0 100644
--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
@@ -1354,12 +1354,6 @@ def warmup_scenario(self,
         ]
         self.profiler.start('internal', scenario_name)
         times = 3 if use_graphs or is_pt_profiler_run else 1
-        if self.lora_config and not is_lora_profile_run:
-            lora_mapping = LoRAMapping(
-                **dict(index_mapping=[0] * batch_size * seq_len,
-                       prompt_mapping=[0] * batch_size * seq_len,
-                       is_prefill=is_prompt))
-            self.set_active_loras(set(), lora_mapping)
         if is_prompt:
             seqs = [
                 self.create_dummy_seq_group_metadata(
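For reviewers, a standalone sketch of the failure mode (hypothetical
sizes and variable names, not code from this repository): in the decode
phase each sequence contributes a single token, so the mapping the
removed block always built with `batch_size * seq_len` entries overruns
buffers laid out per decode token.

```python
# Standalone illustration of the size mismatch behind the out-of-bound
# slicing in issue #405; the values are hypothetical, not vllm code.

batch_size, seq_len = 2, 128

# The removed warmup block sized the LoRA index mapping as if every
# position were a prompt token, regardless of phase:
warmup_index_mapping = [0] * batch_size * seq_len   # 256 entries

# A decode step processes one token per sequence, so decode-phase
# buffers hold only batch_size entries:
decode_buffer = [0] * batch_size                    # 2 entries

# Indexing a decode-sized buffer with the oversized mapping walks past
# its end -- the reported out-of-bound slicing:
assert len(warmup_index_mapping) > len(decode_buffer)

# execute_model instead derives the mapping from the actual (dummy)
# sequence-group metadata, which is why the warmup-time call can be
# dropped safely.
```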