From 2003cc35135319b240230e686f26f13524403ee0 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 6 Nov 2024 17:49:19 +0800 Subject: [PATCH 001/183] [Model][LoRA]LoRA support added for LlamaEmbeddingModel (#10071) Signed-off-by: Jee Jee Li --- docs/source/models/supported_models.rst | 2 +- vllm/model_executor/models/llama.py | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 55835d945b00c..87f45cf695c8d 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -333,7 +333,7 @@ Text Embedding * - :code:`MistralModel` - Mistral-based - :code:`intfloat/e5-mistral-7b-instruct`, etc. - - + - ✅︎ - ✅︎ .. important:: diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 6c0a8b5ef8451..d768a57b7ef8a 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -627,7 +627,7 @@ def permute(w: torch.Tensor, n_heads: int): return name, loaded_weight -class LlamaEmbeddingModel(nn.Module, SupportsPP): +class LlamaEmbeddingModel(nn.Module, SupportsLoRA, SupportsPP): """ A model that uses Llama with additional embedding functionalities. @@ -638,6 +638,19 @@ class LlamaEmbeddingModel(nn.Module, SupportsPP): model: An instance of LlamaModel used for forward operations. _pooler: An instance of Pooler used for pooling operations. """ + packed_modules_mapping = { + "qkv_proj": ["q_proj", "k_proj", "v_proj"], + "gate_up_proj": ["gate_proj", "up_proj"] + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", "o_proj", "gate_up_proj", "down_proj", "embed_tokens" + ] + embedding_modules = { + "embed_tokens": "input_embeddings", + } + embedding_padding_modules = [] def __init__( self, @@ -679,3 +692,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): def load_kv_cache_scales(self, quantization_param_path: str) -> None: self.model.load_kv_cache_scales(quantization_param_path) + + # LRUCacheWorkerLoRAManager instantiation requires model config. + @property + def config(self): + return self.model.config From a5bba7d234b4e0d82e6a64de82a8497760ed44cf Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Wed, 6 Nov 2024 19:41:17 +0800 Subject: [PATCH 002/183] [Model] Add Idefics3 support (#9767) Signed-off-by: Jee Jee Li Signed-off-by: B-201 Co-authored-by: B-201 --- docs/source/models/supported_models.rst | 6 + examples/offline_inference_vision_language.py | 17 + ...e_inference_vision_language_multi_image.py | 25 + .../vision_language/test_models.py | 16 + vllm/entrypoints/chat_utils.py | 2 + .../models/idefics2_vision_model.py | 25 +- vllm/model_executor/models/idefics3.py | 632 ++++++++++++++++++ vllm/model_executor/models/registry.py | 1 + 8 files changed, 723 insertions(+), 1 deletion(-) create mode 100644 vllm/model_executor/models/idefics3.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 87f45cf695c8d..cdcea70c6cb7d 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -446,6 +446,12 @@ Text Generation - :code:`h2oai/h2ovl-mississippi-800m`, :code:`h2oai/h2ovl-mississippi-2b`, etc. - - ✅︎ + * - :code:`Idefics3ForConditionalGeneration` + - Idefics3 + - T + I + - :code:`HuggingFaceM4/Idefics3-8B-Llama3` etc. 
+ - + - * - :code:`InternVLChatModel` - InternVL2 - T + I\ :sup:`E+` diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 4fd002caf1763..8d17ce3754515 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -377,6 +377,22 @@ def run_glm4v(question: str, modality: str): return llm, prompt, stop_token_ids +# Idefics3-8B-Llama3 +def run_idefics3(question: str, modality: str): + assert modality == "image" + model_name = "HuggingFaceM4/Idefics3-8B-Llama3" + + llm = LLM(model=model_name, + max_model_len=8192, + max_num_seqs=2, + enforce_eager=True) + prompt = ( + f"<|begin_of_text|>User:{question}\nAssistant:" + ) + stop_token_ids = None + return llm, prompt, stop_token_ids + + model_example_map = { "llava": run_llava, "llava-next": run_llava_next, @@ -397,6 +413,7 @@ def run_glm4v(question: str, modality: str): "mllama": run_mllama, "molmo": run_molmo, "glm4v": run_glm4v, + "idefics3": run_idefics3, } diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py index d99684078ff3d..7e883568995a4 100644 --- a/examples/offline_inference_vision_language_multi_image.py +++ b/examples/offline_inference_vision_language_multi_image.py @@ -290,6 +290,30 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData: ) +def load_idefics3(question, image_urls: List[str]) -> ModelRequestData: + model_name = "HuggingFaceM4/Idefics3-8B-Llama3" + + # The configuration below has been confirmed to launch on a single L40 GPU. + llm = LLM( + model=model_name, + max_model_len=8192, + max_num_seqs=16, + enforce_eager=True, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + placeholders = "\n".join(f"Image-{i}: \n" + for i, _ in enumerate(image_urls, start=1)) + prompt = f"<|begin_of_text|>User:{placeholders}\n{question}\nAssistant:" # noqa: E501 + return ModelRequestData( + llm=llm, + prompt=prompt, + stop_token_ids=None, + image_data=[fetch_image(url) for url in image_urls], + chat_template=None, + ) + + model_example_map = { "phi3_v": load_phi3v, "h2ovl_chat": load_h2onvl, @@ -298,6 +322,7 @@ def load_mllama(question, image_urls: List[str]) -> ModelRequestData: "qwen2_vl": load_qwen2_vl, "qwen_vl_chat": load_qwenvl_chat, "mllama": load_mllama, + "idefics3": load_idefics3, } diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index cfd2d61f2b633..3dbfaafb781af 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -327,6 +327,22 @@ vllm_output_post_proc=model_utils.qwen_vllm_to_hf_output, prompt_path_encoder=model_utils.qwen_prompt_path_encoder, ), + "idefics3": VLMTestInfo( + models=["HuggingFaceM4/Idefics3-8B-Llama3"], + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}\nAssistant:", # noqa: E501 + img_idx_to_prompt=lambda idx: "", + max_model_len=8192, + max_num_seqs=2, + auto_cls=AutoModelForVision2Seq, + marks=[ + pytest.mark.skipif( + transformers.__version__ < "4.46.0", + reason="Model introduced in HF >= 4.46.0" + ), + large_gpu_mark(min_gb=48), + ], + ), ### Tensor parallel / multi-gpu broadcast tests "broadcast-chameleon": VLMTestInfo( models=["facebook/chameleon-7b"], diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 
0ada0aaacda24..ed4e4399d5514 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -187,6 +187,8 @@ def _placeholder_str(self, modality: ModalityStr, return "<|vision_start|><|image_pad|><|vision_end|>" if model_type == "molmo": return "" + if model_type == "idefics3": + return "" raise TypeError(f"Unknown {modality} model type: {model_type}") elif modality == "audio": diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index 53869b8fa6bd8..b21bc2a3f9ce1 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -15,7 +15,7 @@ # limitations under the License. """PyTorch Idefics2 model.""" -from typing import Optional +from typing import Iterable, Optional, Tuple import torch from torch import nn @@ -29,6 +29,7 @@ QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.model_loader.weight_utils import default_weight_loader class Idefics2VisionEmbeddings(nn.Module): @@ -329,3 +330,25 @@ def forward( encoder_outputs = self.encoder(hidden_states) last_hidden_state = self.post_layernorm(encoder_outputs) return last_hidden_state + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py new file mode 100644 index 0000000000000..e4c98f22fb16f --- /dev/null +++ b/vllm/model_executor/models/idefics3.py @@ -0,0 +1,632 @@ +# Copyright 2024 the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Inference-only Idefics3 model compatible with HuggingFace weights.""" + +import math +from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, + TypedDict, Union) + +import torch +import torch.utils.checkpoint +from PIL import Image +from torch import nn +# Temporary solution for transformers below 4.46.0. 
+from transformers import PretrainedConfig as Idefics3Config + +from vllm.attention import AttentionMetadata +from vllm.config import CacheConfig, MultiModalConfig +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, + InputContext, token_inputs) +from vllm.logger import init_logger +from vllm.model_executor.layers.linear import ReplicatedLinear +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs +from vllm.multimodal.image import cached_get_image_processor +from vllm.sequence import IntermediateTensors, SequenceData +from vllm.transformers_utils.processor import cached_get_processor +from vllm.utils import is_list_of + +# yapf: disable +from .idefics2_vision_model import ( + Idefics2VisionTransformer as Idefics3VisionTransformer) +# yapf: enable +from .interfaces import SupportsMultiModal +from .llama import LlamaModel +from .utils import AutoWeightsLoader, flatten_bn, merge_multimodal_embeddings + +logger = init_logger(__name__) + + +class Idefics3ImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + data: torch.Tensor + """ + Shape: `(batch_size * num_images, num_channels, height, width)` + """ + rows: List[int] + cols: List[int] + pixel_attention_mask: Optional[torch.BoolTensor] + + +class Idefics3ImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds"] + data: torch.Tensor + """ + Shape: `(batch_size * num_images, image_feature_size, hidden_size)` + `hidden_size` must match the hidden size of language model backbone. 
+ """ + + +ImageInputs = Union[Idefics3ImagePixelInputs, Idefics3ImageEmbeddingInputs] + + +def input_mapper_for_idefics3( + ctx: InputContext, + data: object, +): + model_config = ctx.model_config + image_processor = cached_get_image_processor( + model_config.model, trust_remote_code=model_config.trust_remote_code) + if image_processor is None: + raise RuntimeError("No HuggingFace processor is available " + "to process the image object") + + if isinstance(data, Image.Image): + images = [[data]] + elif is_list_of(data, Image.Image): + images = [data] + else: + raise TypeError(f"Invalid image type: {type(data)}") + + try: + batch_data = image_processor(images, + return_tensors="pt", + return_row_col_info=True).data + except Exception: + logger.error("Failed to process image (%s)", data) + raise + + return MultiModalInputs(batch_data) + + +def _resize_output_size(height: int, + width: int, + max_len: Optional[int] = None, + min_len: Optional[int] = 1, + max_size: Optional[int] = None) -> Tuple[int, int]: + # Set default value for max_len if not provided + max_len = max(height, width) if max_len is None else max_len + aspect_ratio = width / height + + # Handle the maximum size constraint + if max_size is not None: + max_len = min(max_len, max_size) + + # Adjust dimensions according to the aspect ratio + if width >= height: + width = max_len + height = int(width / aspect_ratio) + else: + height = max_len + width = int(height * aspect_ratio) + + # Ensure both width and height are even (if needed) + height += 1 if height % 2 != 0 else 0 + width += 1 if width % 2 != 0 else 0 + + # Ensure dimensions are not smaller than the minimum length + height = max(height, min_len) + width = max(width, min_len) + + return height, width + + +def _get_resize_output_image_size( + image_size: Tuple[int, int], + resolution_max_side: int, + max_image_size: int = 1820, +) -> Tuple[int, int]: + if resolution_max_side > max_image_size: + raise ValueError( + "`resolution_max_side` cannot be larger than `max_image_size`") + + height, width = image_size + + # Find the output size, when rescaling the longest edge to max_len and + # preserving the aspect ratio + height, width = _resize_output_size(height, + width, + max_len=resolution_max_side) + + return height, width + + +def _prompt_split_image(image_seq_len: int, image_rows: int, image_cols: int, + fake_token_around_image: str, image_token: str, + global_img_token: str) -> str: + """ + Prompt with expanded image tokens for when the image is split + into patches. 
+ """ + text_split_images = "" + for n_h in range(image_rows): + for n_w in range(image_cols): + text_split_images += (fake_token_around_image + + f"" + + image_token * image_seq_len) + text_split_images += "\n" + + text_split_images += "\n" + _prompt_single_image( + image_seq_len=image_seq_len, + fake_token_around_image=fake_token_around_image, + image_token=image_token, + global_img_token=global_img_token) + return text_split_images + + +def _prompt_single_image(image_seq_len: int, fake_token_around_image: str, + image_token: str, global_img_token: str): + """Prompt with expanded image tokens for a single image.""" + return (fake_token_around_image + global_img_token + + image_token * image_seq_len + fake_token_around_image) + + +def _get_image_prompt_string(image_rows: int, image_cols: int, + image_seq_len: int, fake_token_around_image: str, + image_token: str, global_img_token: str): + if image_rows == 0 and image_cols == 0: + return _prompt_single_image( + image_seq_len=image_seq_len, + fake_token_around_image=fake_token_around_image, + image_token=image_token, + global_img_token=global_img_token, + ) + return _prompt_split_image(image_seq_len, image_rows, image_cols, + fake_token_around_image, image_token, + global_img_token) + + +def input_processor_for_idefics3(ctx: InputContext, inputs: DecoderOnlyInputs): + multi_modal_data = inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + return inputs + + model_config = ctx.model_config + processor = cached_get_processor(model_config.model) + image_processor = processor.image_processor + tokenizer = processor.tokenizer + size = image_processor.size['longest_edge'] + max_image_size = image_processor.max_image_size['longest_edge'] + + image_data = multi_modal_data["image"] + if isinstance(image_data, Image.Image): + image_list = [image_data] + elif is_list_of(image_data, Image.Image): + image_list = image_data + else: + raise TypeError(f"Invalid image type: {type(image_data)}") + + image_rows = [] + image_cols = [] + for image in image_list: + height, width = _get_resize_output_image_size(image.size, size) + + rows = math.ceil(height / max_image_size) + cols = math.ceil(width / max_image_size) + image_rows.append(rows) + image_cols.append(cols) + image_rows = [image_rows] + image_cols = [image_cols] + + n_images_in_text = [] + + text = inputs.get("prompt") + if text is not None: + if isinstance(text, str): + text = [text] + elif not isinstance(text, list) and not isinstance(text[0], str): + raise ValueError("Invalid input text. 
Please provide a string, " + "or a list of strings") + + fake_image_token = processor.fake_image_token.content + image_token = processor.image_token.content + global_img_token = processor.global_image_tag + + prompt_strings = [] + for sample, sample_rows, sample_cols in zip(text, image_rows, + image_cols): + n_images_in_text.append(sample.count(image_token)) + + # Replace the image token with fake tokens around the expanded + # image token sequence of length `image_seq_len` + image_prompt_strings = [] + for n_rows, n_cols in zip(sample_rows, sample_cols): + image_prompt_string = _get_image_prompt_string( + n_rows, + n_cols, + processor.image_seq_len, + image_token=image_token, + fake_token_around_image=fake_image_token, + global_img_token=global_img_token, + ) + image_prompt_strings.append(image_prompt_string) + + split_sample = sample.split(image_token) + if len(split_sample) == 0: + raise ValueError( + "The image token should be present in the text.") + + # Place in the image prompt strings where the image tokens are + sample = split_sample[0] + for i, image_prompt_string in enumerate(image_prompt_strings): + sample += image_prompt_string + split_sample[i + 1] + prompt_strings.append(sample) + + prompt_token_ids = tokenizer(text=prompt_strings[0]).input_ids + + return token_inputs( + prompt_token_ids=prompt_token_ids, + prompt=prompt_strings[0], + multi_modal_data=multi_modal_data, + ) + + +def get_max_idefics3_image_tokens(ctx: InputContext, + *, + num_crops: Optional[int] = None): + model_config = ctx.model_config + processor = cached_get_processor(model_config.model) + image_seq_len = processor.image_seq_len + image_processor = processor.image_processor + + size = image_processor.size['longest_edge'] + max_image_size = image_processor.max_image_size['longest_edge'] + resized_height, resized_width = size, size + + grid_h = resized_height // max_image_size + grid_w = resized_width // max_image_size + + return (grid_h * grid_w + 1) * image_seq_len + + +def dummy_data_for_idefics3(ctx: InputContext, seq_len: int, + mm_counts: Mapping[str, int]) -> DummyData: + hf_config = ctx.get_hf_config() + num_images = mm_counts["image"] + + processor = cached_get_processor(ctx.model_config.model) + image_seq_len = processor.image_seq_len + max_llm_image_tokens = 17 * image_seq_len * num_images + + seq_data = SequenceData.from_prompt_token_counts( + (hf_config.image_token_id, max_llm_image_tokens), (0, seq_len)) + + width = height = hf_config.vision_config.image_size + image = Image.new("RGB", (width, height), color=0) + mm_data = {"image": [image] if num_images == 1 else [image] * num_images} + + return DummyData(seq_data, mm_data) + + +class Idefics3SimpleMLP(nn.Module): + + def __init__(self, config): + super().__init__() + input_size = config.vision_config.hidden_size * (config.scale_factor** + 2) + output_size = config.text_config.hidden_size + self.proj = ReplicatedLinear(input_size, output_size, bias=False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out, _ = self.proj(x) + return out + + +class Idefics3Connector(nn.Module): + + def __init__(self, config): + super().__init__() + self.scale_factor = config.scale_factor + self.modality_projection = Idefics3SimpleMLP(config) + + def pixel_shuffle(self, + x: torch.Tensor, + scale_factor: int = 2) -> torch.Tensor: + bsz, seq, embed_dim = x.size() + height = width = int(seq**0.5) + x = x.view(bsz, height, width, embed_dim) + x = x.view(bsz, height, int(width / scale_factor), + embed_dim * scale_factor) + x = x.permute(0, 2, 1, 3) + x = 
x.reshape( + bsz, + int(width / scale_factor), + int(height / scale_factor), + embed_dim * (scale_factor**2), + ) + x = x.permute(0, 2, 1, 3) + x = x.reshape(bsz, int(seq / (scale_factor**2)), + embed_dim * (scale_factor**2)) + return x + + def forward(self, image_hidden_states: torch.Tensor) -> torch.Tensor: + image_hidden_states = self.pixel_shuffle(image_hidden_states, + self.scale_factor) + image_hidden_states = self.modality_projection(image_hidden_states) + return image_hidden_states + + +class Idefics3Model(nn.Module): + + def __init__( + self, + config: Idefics3Config, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.config = config + self.padding_idx = self.config.text_config.pad_token_id + self.vocab_size = self.config.text_config.vocab_size + + self.vision_model = Idefics3VisionTransformer(config.vision_config, + quant_config) + self.connector = Idefics3Connector(config) + self.text_model = LlamaModel(config.text_config, cache_config, + quant_config) + + self.image_seq_len = int( + ((config.vision_config.image_size // + config.vision_config.patch_size)**2) / (config.scale_factor**2)) + self.image_token_id = self.config.image_token_id + + def _validate_pixel_values( + self, data: Union[torch.Tensor, List[torch.Tensor]] + ) -> Union[torch.Tensor, List[torch.Tensor]]: + + h = w = self.config.vision_config.image_size + expected_dims = (3, h, w) + + def _validate_shape(d: torch.Tensor): + actual_dims = tuple(d.shape[1:]) + + if actual_dims != expected_dims: + expected_expr = ("num_patches", *map(str, expected_dims)) + raise ValueError( + "The expected shape of pixel values per image per batch " + f"is {expected_expr}. You supplied {tuple(d.shape)}.") + + for d in data: + _validate_shape(d) + + return data + + def _parse_and_validate_image_input( + self, **kwargs: object) -> Optional[ImageInputs]: + pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) + rows = kwargs.pop("rows", None) + cols = kwargs.pop("cols", None) + pixel_attention_mask = kwargs.pop("pixel_attention_mask", None) + + if pixel_values is None and image_embeds is None: + return None + + if image_embeds is not None: + if not isinstance(image_embeds, (torch.Tensor, list)): + raise ValueError("Incorrect type of image embeddings. " + f"Got type: {type(image_embeds)}") + + return Idefics3ImageEmbeddingInputs( + type="image_embeds", + data=flatten_bn(image_embeds, concat=True), + ) + + if pixel_values is not None: + if not isinstance(pixel_values, (torch.Tensor, list)): + raise ValueError("Incorrect type of pixel values. 
" + f"Got type: {type(pixel_values)}") + + return Idefics3ImagePixelInputs(type="pixel_values", + data=self._validate_pixel_values( + flatten_bn(pixel_values, + concat=True)), + rows=rows, + cols=cols, + pixel_attention_mask=flatten_bn( + pixel_attention_mask, + concat=True)) + + raise AssertionError("This line should be unreachable.") + + def _image_pixels_to_features( + self, + pixel_values: torch.Tensor, + pixel_attention_mask: Optional[torch.BoolTensor] = None, + ) -> torch.Tensor: + # NOTE: we skip the step to select the vision feature layer since + # this is already done inside the vision tower + batch_size, num_images, num_channels, height, width = pixel_values.shape + pixel_values = pixel_values.to( + dtype=self.vision_model.embeddings.patch_embedding.weight.dtype + ) # fp16 compatibility + pixel_values = pixel_values.view(batch_size * num_images, + *pixel_values.shape[2:]) + + # Remove padding images - padding images are full 0. + nb_values_per_image = pixel_values.shape[1:].numel() + real_images_inds = (pixel_values == 0.0).sum( + dim=(-1, -2, -3)) != nb_values_per_image + pixel_values = pixel_values[real_images_inds].contiguous() + + # Handle the vision attention mask + if pixel_attention_mask is None: + pixel_attention_mask = torch.ones( + size=(pixel_values.size(0), pixel_values.size(2), + pixel_values.size(3)), + dtype=torch.bool, + device=pixel_values.device, + ) + else: + # Remove padding images from the mask + pixel_attention_mask = pixel_attention_mask.view( + batch_size * num_images, *pixel_attention_mask.shape[2:]) + pixel_attention_mask = pixel_attention_mask[ + real_images_inds].contiguous() + + patch_size = self.config.vision_config.patch_size + patches_subgrid = pixel_attention_mask.unfold(dimension=1, + size=patch_size, + step=patch_size) + patches_subgrid = patches_subgrid.unfold(dimension=2, + size=patch_size, + step=patch_size) + patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() + + # Get sequence from the vision encoder + image_hidden_states = self.vision_model( + pixel_values=pixel_values, + patch_attention_mask=patch_attention_mask, + ) + + return image_hidden_states + + def _process_image_pixels( + self, inputs: Idefics3ImagePixelInputs) -> torch.Tensor: + assert self.vision_model is not None + + pixel_values = inputs["data"] + pixel_attention_mask = inputs["pixel_attention_mask"] + + return self._image_pixels_to_features(pixel_values, + pixel_attention_mask) + + def _process_image_input(self, image_input: ImageInputs) -> torch.Tensor: + if image_input["type"] == "image_embeds": + return image_input["data"] + + assert self.vision_model is not None + image_features = self._process_image_pixels(image_input) + return self.connector(image_features) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + if intermediate_tensors is not None: + input_ids = None + inputs_embeds = None + else: + # always pass the input via `inputs_embeds` + # to make sure the computation graph is consistent + image_input = self._parse_and_validate_image_input(**kwargs) + + if image_input is not None: + vision_embeddings = self._process_image_input(image_input) + inputs_embeds = self.text_model.get_input_embeddings(input_ids) + + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, vision_embeddings, + self.image_token_id) + 
else: + inputs_embeds = self.text_model.get_input_embeddings(input_ids) + input_ids = None + + hidden_states = self.text_model( + input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors, + inputs_embeds=inputs_embeds, + ) + return hidden_states + + +@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_idefics3) +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_idefics3_image_tokens) +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_idefics3) +@INPUT_REGISTRY.register_input_processor(input_processor_for_idefics3) +class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal): + + def __init__( + self, + config: Idefics3Config, + multimodal_config: MultiModalConfig, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + ) -> None: + super().__init__() + + self.config = config + self.multimodal_config = multimodal_config + + self.model = Idefics3Model(config, cache_config, quant_config) + self.image_token_id = self.config.image_token_id + + self.lm_head = ParallelLMHead( + config.text_config.vocab_size, + config.text_config.hidden_size, + quant_config=quant_config, + ) + if self.config.text_config.tie_word_embeddings: + self.lm_head.weight = self.model.text_model.wte.weight + self.logits_processor = LogitsProcessor(config.text_config.vocab_size) + self.sampler = Sampler() + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + **kwargs: object, + ) -> Union[torch.Tensor, IntermediateTensors]: + hidden_states = self.model( + input_ids, + positions, + kv_caches, + attn_metadata, + intermediate_tensors, + **kwargs, + ) + return hidden_states + + def compute_logits(self, hidden_states: torch.Tensor, + sampling_metadata: SamplingMetadata) -> torch.Tensor: + logits = self.logits_processor(self.lm_head, hidden_states, + sampling_metadata) + return logits + + def sample( + self, + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + loader = AutoWeightsLoader(self) + loader.load_weights(weights) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 792c6cec34ae0..32750602b988c 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -120,6 +120,7 @@ "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"), "H2OVLChatModel": ("h2ovl", "H2OVLChatModel"), "InternVLChatModel": ("internvl", "InternVLChatModel"), + "Idefics3ForConditionalGeneration":("idefics3","Idefics3ForConditionalGeneration"), "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"), "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), # noqa: E501 From 406d4cc480bbc01d41f34b83102548bae229671a Mon Sep 17 00:00:00 2001 From: Eric Date: Wed, 6 Nov 2024 22:13:15 +0800 Subject: [PATCH 003/183] [Model][LoRA]LoRA support added for Qwen2VLForConditionalGeneration (#10022) Signed-off-by: ericperfect --- docs/source/models/supported_models.rst | 2 +- vllm/model_executor/models/qwen2_vl.py | 32 +++++++++++++++++++++---- 2 files changed, 29 insertions(+), 5 
deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index cdcea70c6cb7d..5a474043078db 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -540,7 +540,7 @@ Text Generation - Qwen2-VL - T + I\ :sup:`E+` + V\ :sup:`+` - :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc. - - + - ✅︎ - ✅︎ * - :code:`UltravoxModel` - Ultravox diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index e30b84e8dd44c..fad9137d0dcc5 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -40,7 +40,7 @@ from vllm.attention import AttentionMetadata from vllm.attention.selector import _Backend -from vllm.config import CacheConfig, MultiModalConfig +from vllm.config import CacheConfig, LoRAConfig, MultiModalConfig from vllm.distributed import get_pp_group, parallel_state from vllm.distributed import utils as dist_utils from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, @@ -65,7 +65,7 @@ from vllm.transformers_utils.config import uses_mrope from vllm.transformers_utils.processor import cached_get_processor -from .interfaces import SupportsMultiModal, SupportsPP +from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import (PPMissingLayer, get_vit_attn_backend, is_pp_missing_parameter, make_empty_intermediate_tensors_factory) @@ -927,13 +927,37 @@ def input_processor_for_qwen2_vl( @INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen2_vl) @INPUT_REGISTRY.register_input_processor(input_processor_for_qwen2_vl) class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, - SupportsPP): + SupportsLoRA, SupportsPP): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + # LoRA specific attributes + # TODO Support LoRA for the visual encoder in the future. 
+ supported_lora_modules = [ + "qkv_proj", + "o_proj", + "gate_up_proj", + "down_proj", + ] + embedding_modules = {} + embedding_padding_modules = [] def __init__(self, config: Qwen2VLConfig, multimodal_config: MultiModalConfig, cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None) -> None: + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None) -> None: + super().__init__() assert not cache_config.enable_prefix_caching, \ From 399c7986088ed66184e69ac6ae2b28003b642711 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 6 Nov 2024 09:27:06 -0500 Subject: [PATCH 004/183] Remove ScaledActivation for AWQ (#10057) Signed-off-by: mgoin --- vllm/model_executor/layers/activation.py | 37 ++----------------- .../layers/quantization/aqlm.py | 3 -- .../model_executor/layers/quantization/awq.py | 3 -- .../layers/quantization/awq_marlin.py | 3 -- .../layers/quantization/base_config.py | 8 ---- .../layers/quantization/bitsandbytes.py | 3 -- .../compressed_tensors/compressed_tensors.py | 3 -- .../layers/quantization/deepspeedfp.py | 3 -- .../layers/quantization/experts_int8.py | 3 -- .../layers/quantization/fbgemm_fp8.py | 3 -- .../model_executor/layers/quantization/fp8.py | 3 -- .../layers/quantization/gguf.py | 3 -- .../layers/quantization/gptq.py | 3 -- .../layers/quantization/gptq_marlin.py | 3 -- .../layers/quantization/gptq_marlin_24.py | 3 -- .../layers/quantization/ipex_quant.py | 6 --- .../layers/quantization/marlin.py | 3 -- .../layers/quantization/modelopt.py | 3 -- .../layers/quantization/neuron_quant.py | 3 -- .../model_executor/layers/quantization/qqq.py | 3 -- .../layers/quantization/tpu_int8.py | 3 -- vllm/model_executor/models/bart.py | 8 ++-- vllm/model_executor/models/bloom.py | 2 +- vllm/model_executor/models/falcon.py | 2 +- vllm/model_executor/models/gpt2.py | 3 +- vllm/model_executor/models/gpt_bigcode.py | 3 +- vllm/model_executor/models/gpt_j.py | 3 +- vllm/model_executor/models/gpt_neox.py | 3 +- vllm/model_executor/models/mpt.py | 2 +- vllm/model_executor/models/opt.py | 3 +- vllm/model_executor/models/persimmon.py | 2 +- vllm/model_executor/models/phi.py | 2 +- vllm/model_executor/models/qwen.py | 2 +- vllm/model_executor/models/starcoder2.py | 3 +- 34 files changed, 19 insertions(+), 124 deletions(-) diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index e347ca80ff765..34d65ed51ef3f 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -9,7 +9,6 @@ from vllm.distributed import (divide, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.custom_op import CustomOp -from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.utils import set_weight_attrs from vllm.utils import LazyDict @@ -277,28 +276,14 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): }) -def get_act_fn( - act_fn_name: str, - quant_config: Optional[QuantizationConfig] = None, - intermediate_size: Optional[int] = None, - input_is_parallel: bool = True, - params_dtype: Optional[torch.dtype] = None, -) -> nn.Module: +def get_act_fn(act_fn_name: str) -> nn.Module: """Get an activation function by name.""" act_fn_name = act_fn_name.lower() if act_fn_name not in _ACTIVATION_REGISTRY: raise ValueError( f"Activation function {act_fn_name!r} is not supported.") - act_fn = _ACTIVATION_REGISTRY[act_fn_name] - if (quant_config is not 
None - and act_fn_name in quant_config.get_scaled_act_names()): - if intermediate_size is None: - raise ValueError("intermediate_size must be specified for scaled " - "activation functions.") - return ScaledActivation(act_fn, intermediate_size, input_is_parallel, - params_dtype) - return act_fn + return _ACTIVATION_REGISTRY[act_fn_name] _ACTIVATION_AND_MUL_REGISTRY = LazyDict({ @@ -307,25 +292,11 @@ def get_act_fn( }) -def get_act_and_mul_fn( - act_fn_name: str, - quant_config: Optional[QuantizationConfig] = None, - intermediate_size: Optional[int] = None, - input_is_parallel: bool = True, - params_dtype: Optional[torch.dtype] = None, -) -> nn.Module: +def get_act_and_mul_fn(act_fn_name: str) -> nn.Module: """Get an activation-and-mul (i.e. SiluAndMul) function by name.""" act_fn_name = act_fn_name.lower() if act_fn_name not in _ACTIVATION_AND_MUL_REGISTRY: raise ValueError( f"Activation function {act_fn_name!r} is not supported.") - act_fn = _ACTIVATION_AND_MUL_REGISTRY[act_fn_name] - if (quant_config is not None - and act_fn_name in quant_config.get_scaled_act_names()): - if intermediate_size is None: - raise ValueError("intermediate_size must be specified for scaled " - "activation functions.") - return ScaledActivation(act_fn, intermediate_size, input_is_parallel, - params_dtype) - return act_fn + return _ACTIVATION_AND_MUL_REGISTRY[act_fn_name] diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py index c88ca340ebcc5..72c89fe2b0e48 100644 --- a/vllm/model_executor/layers/quantization/aqlm.py +++ b/vllm/model_executor/layers/quantization/aqlm.py @@ -213,9 +213,6 @@ def get_quant_method(self, layer: torch.nn.Module, return AQLMLinearMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - class AQLMLinearMethod(LinearMethodBase): """Linear method for AQLM. diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 38dd1f2e10fcd..d83528e9ec79c 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -77,9 +77,6 @@ def get_quant_method(self, layer: torch.nn.Module, return AWQLinearMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return ["gelu", "gelu_fast", "gelu_new", "gelu_pytorch_tanh"] - def is_layer_skipped_awq(prefix: str, modules_to_not_convert: List[str]): return any(module_name in prefix for module_name in modules_to_not_convert) diff --git a/vllm/model_executor/layers/quantization/awq_marlin.py b/vllm/model_executor/layers/quantization/awq_marlin.py index ea69bee45f8d9..4d1a837d11585 100644 --- a/vllm/model_executor/layers/quantization/awq_marlin.py +++ b/vllm/model_executor/layers/quantization/awq_marlin.py @@ -127,9 +127,6 @@ def get_quant_method(self, layer: torch.nn.Module, return AWQMoEMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - @classmethod def is_awq_marlin_compatible(cls, quant_config: Dict[str, Any]): # Extract data from quant config. diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py index 75fa8249cd3c2..6dfac8aad5358 100644 --- a/vllm/model_executor/layers/quantization/base_config.py +++ b/vllm/model_executor/layers/quantization/base_config.py @@ -133,11 +133,3 @@ def get_quant_method(self, layer: torch.nn.Module, method. 
""" raise NotImplementedError - - @abstractmethod - def get_scaled_act_names(self) -> List[str]: - """Returns the activation function names that should be post-scaled. - - For now, this is only used by AWQ. - """ - raise NotImplementedError diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py index 78965d7b9495c..39965ac9115c2 100644 --- a/vllm/model_executor/layers/quantization/bitsandbytes.py +++ b/vllm/model_executor/layers/quantization/bitsandbytes.py @@ -114,9 +114,6 @@ def get_quant_method(self, layer: torch.nn.Module, return BitsAndBytesLinearMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - def is_layer_skipped_bnb(prefix: str, llm_int8_skip_modules: List[str]): # Split the prefix into its dot-separated components diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py index ecc345f116c37..4f5758a42dbbc 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py @@ -45,9 +45,6 @@ def __init__(self, def get_linear_method(self) -> "CompressedTensorsLinearMethod": return CompressedTensorsLinearMethod(self) - def get_scaled_act_names(self) -> List[str]: - return [] - def get_supported_act_dtypes(cls) -> List[torch.dtype]: return [torch.float16, torch.bfloat16] diff --git a/vllm/model_executor/layers/quantization/deepspeedfp.py b/vllm/model_executor/layers/quantization/deepspeedfp.py index 29484801dc380..36598b3e2990f 100644 --- a/vllm/model_executor/layers/quantization/deepspeedfp.py +++ b/vllm/model_executor/layers/quantization/deepspeedfp.py @@ -50,9 +50,6 @@ def from_config(cls, config: Dict[str, Any]) -> "DeepSpeedFPConfig": def get_linear_method(self) -> "DeepSpeedFPLinearMethod": return DeepSpeedFPLinearMethod(self) - def get_scaled_act_names(self) -> List[str]: - return [] - @classmethod def get_supported_act_dtypes(cls) -> List[torch.dtype]: return [torch.half, torch.bfloat16] diff --git a/vllm/model_executor/layers/quantization/experts_int8.py b/vllm/model_executor/layers/quantization/experts_int8.py index 116a4ea0aed89..97297970d9317 100644 --- a/vllm/model_executor/layers/quantization/experts_int8.py +++ b/vllm/model_executor/layers/quantization/experts_int8.py @@ -45,9 +45,6 @@ def get_quant_method(self, layer: torch.nn.Module, return ExpertsInt8MoEMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - class ExpertsInt8MoEMethod(FusedMoEMethodBase): diff --git a/vllm/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py index 825d01d1b3551..7b71e13b50ccc 100644 --- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py +++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py @@ -64,9 +64,6 @@ def get_quant_method(self, layer: torch.nn.Module, return FBGEMMFp8LinearMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - class FBGEMMFp8LinearMethod(LinearMethodBase): diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index d34579b7099bb..978e727bc7cb3 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -92,9 +92,6 @@ def get_quant_method(self, layer: torch.nn.Module, return Fp8KVCacheMethod(self) 
return None - def get_scaled_act_names(self) -> List[str]: - return [] - class Fp8LinearMethod(LinearMethodBase): """Linear method for FP8. diff --git a/vllm/model_executor/layers/quantization/gguf.py b/vllm/model_executor/layers/quantization/gguf.py index d73b9f6d92832..24138662eb25c 100644 --- a/vllm/model_executor/layers/quantization/gguf.py +++ b/vllm/model_executor/layers/quantization/gguf.py @@ -48,9 +48,6 @@ def get_quant_method(self, layer: torch.nn.Module, return GGUFEmbeddingMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - def _fuse_mul_mat(x: torch.Tensor, qweight: torch.Tensor, qweight_type: int) -> torch.Tensor: diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py index 1cfadb4f42ca8..0aa605e62454e 100644 --- a/vllm/model_executor/layers/quantization/gptq.py +++ b/vllm/model_executor/layers/quantization/gptq.py @@ -80,9 +80,6 @@ def get_quant_method(self, layer: torch.nn.Module, return GPTQLinearMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - class ExllamaState(Enum): diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py b/vllm/model_executor/layers/quantization/gptq_marlin.py index b97dd108d6785..1f72e3afbbce5 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -125,9 +125,6 @@ def get_quant_method( return GPTQMarlinMoEMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - @classmethod def is_gptq_marlin_compatible(cls, quant_config: Dict[str, Any]): # Extract data from quant config. diff --git a/vllm/model_executor/layers/quantization/gptq_marlin_24.py b/vllm/model_executor/layers/quantization/gptq_marlin_24.py index 0971aedba4c3c..07552c0f13348 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin_24.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin_24.py @@ -127,9 +127,6 @@ def get_quant_method(self, layer: torch.nn.Module, return GPTQMarlin24LinearMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - class GPTQMarlin24LinearMethod(LinearMethodBase): """Linear method for Marlin24. diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py index e54052632e468..43f4502f7455c 100644 --- a/vllm/model_executor/layers/quantization/ipex_quant.py +++ b/vllm/model_executor/layers/quantization/ipex_quant.py @@ -93,12 +93,6 @@ def get_quant_method(self, layer: torch.nn.Module, return self.quant_method(self) return None - def get_scaled_act_names(self) -> List[str]: - if self.method == "awq": - return ["gelu", "gelu_fast", "gelu_new", "gelu_pytorch_tanh"] - else: - return [] - class IPEXAWQLinearMethod(AWQLinearMethod): """AWQ linear method using IPEX for the CPU backend. diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py index 8f1b5370b4538..20212e672eab0 100644 --- a/vllm/model_executor/layers/quantization/marlin.py +++ b/vllm/model_executor/layers/quantization/marlin.py @@ -110,9 +110,6 @@ def get_quant_method(self, layer: torch.nn.Module, return MarlinLinearMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - class MarlinLinearMethod(LinearMethodBase): """Linear method for Marlin. 
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index 9694f2b8208e2..a1b3eeb43cbee 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -68,9 +68,6 @@ def get_quant_method(self, layer: torch.nn.Module, return ModelOptFp8KVCacheMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - class ModelOptFp8KVCacheMethod(BaseKVCacheMethod): """ diff --git a/vllm/model_executor/layers/quantization/neuron_quant.py b/vllm/model_executor/layers/quantization/neuron_quant.py index 2624981f6a614..2d5cdfa165775 100644 --- a/vllm/model_executor/layers/quantization/neuron_quant.py +++ b/vllm/model_executor/layers/quantization/neuron_quant.py @@ -57,9 +57,6 @@ def get_quant_method(self, layer: Module, prefix: str) -> Optional[Any]: "Neuron Quantization is only supported through" " transformers_neuronx.") - def get_scaled_act_names(self) -> List[str]: - return [] - def get_quantization_config(self): from transformers_neuronx.config import QuantizationConfig return QuantizationConfig(quant_dtype=self.quant_dtype, diff --git a/vllm/model_executor/layers/quantization/qqq.py b/vllm/model_executor/layers/quantization/qqq.py index 5bc3737520865..2ccd082029610 100644 --- a/vllm/model_executor/layers/quantization/qqq.py +++ b/vllm/model_executor/layers/quantization/qqq.py @@ -112,9 +112,6 @@ def get_quant_method(self, layer: torch.nn.Module, return QQQLinearMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - class QQQLinearMethod(LinearMethodBase): """Linear method for QQQ. diff --git a/vllm/model_executor/layers/quantization/tpu_int8.py b/vllm/model_executor/layers/quantization/tpu_int8.py index be8235b468f68..605c3a38644ac 100644 --- a/vllm/model_executor/layers/quantization/tpu_int8.py +++ b/vllm/model_executor/layers/quantization/tpu_int8.py @@ -50,9 +50,6 @@ def get_quant_method(self, layer: Module, return TPUInt8LinearMethod(self) return None - def get_scaled_act_names(self) -> List[str]: - return [] - class TPUInt8LinearMethod(LinearMethodBase): """Int8 Linear method for TPU Quant. 
""" diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index 0543ca978b7dd..85de1a8115b8b 100644 --- a/vllm/model_executor/models/bart.py +++ b/vllm/model_executor/models/bart.py @@ -393,8 +393,7 @@ def __init__( cache_config=cache_config, quant_config=quant_config) self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) - self.activation_fn = get_act_fn(config.activation_function, - quant_config) + self.activation_fn = get_act_fn(config.activation_function) ffn_hidden_size = self.embed_dim ffn_intermediate_size = config.encoder_ffn_dim @@ -405,7 +404,7 @@ def __init__( bias=ffn_has_bias, quant_config=quant_config, ) - self.act = get_act_fn("gelu", quant_config, ffn_intermediate_size) + self.act = get_act_fn("gelu") self.fc2 = RowParallelLinear( ffn_intermediate_size, ffn_hidden_size, @@ -473,8 +472,7 @@ def __init__( config=config, cache_config=cache_config, quant_config=quant_config) - self.activation_fn = get_act_fn(config.activation_function, - quant_config) + self.activation_fn = get_act_fn(config.activation_function) self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) ''' diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 83ff39a30fbe3..b2c109a21d4cf 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -146,7 +146,7 @@ def __init__( 4 * hidden_size, quant_config=quant_config, ) - self.gelu_impl = get_act_fn("gelu", quant_config, 4 * hidden_size) + self.gelu_impl = get_act_fn("gelu") self.dense_4h_to_h = RowParallelLinear( 4 * hidden_size, hidden_size, diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index ad07fc3b3776e..6f8a7a7015c79 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -212,7 +212,7 @@ def __init__( bias=config.bias, skip_bias_add=True, quant_config=quant_config) - self.act = get_act_fn("gelu", quant_config, 4 * hidden_size) + self.act = get_act_fn("gelu") self.reduce_row_parallel_results = not (config.new_decoder_architecture or config.parallel_attn) self.dense_4h_to_h = RowParallelLinear( diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index a06200c4b7e08..8147037ed2a32 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -123,8 +123,7 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.c_proj", ) - self.act = get_act_fn(config.activation_function, quant_config, - intermediate_size) + self.act = get_act_fn(config.activation_function) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states, _ = self.c_fc(hidden_states) diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index 7612ea641d95c..9f44fa76abcba 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -135,8 +135,7 @@ def __init__( bias=True, quant_config=quant_config, ) - self.act = get_act_fn(config.activation_function, quant_config, - intermediate_size) + self.act = get_act_fn(config.activation_function) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states, _ = self.c_fc(hidden_states) diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index b28a6081b868f..6fcccdfb112d8 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -130,8 +130,7 @@ def __init__( hidden_size, quant_config=quant_config, ) - 
self.act = get_act_fn(config.activation_function, quant_config, - intermediate_size) + self.act = get_act_fn(config.activation_function) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states, _ = self.fc_in(hidden_states) diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 931052c7cccf0..d3f86558ecc7e 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -128,8 +128,7 @@ def __init__( config.hidden_size, quant_config=quant_config, ) - self.act = get_act_fn(config.hidden_act, quant_config, - config.intermediate_size) + self.act = get_act_fn(config.hidden_act) def forward(self, hidden_states): hidden_states, _ = self.dense_h_to_4h(hidden_states) diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index fdd8af79b5470..7f0658f4cb2b0 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -153,7 +153,7 @@ def __init__( bias=not config.no_bias, quant_config=quant_config, ) - self.act = get_act_fn("gelu", quant_config, intermediate_size) + self.act = get_act_fn("gelu") self.down_proj = RowParallelLinear( intermediate_size, hidden_size, diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 7a76e4a0906db..d140f4237b1ca 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -147,8 +147,7 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.fc1", ) - self.activation_fn = get_act_fn(config.activation_function, - quant_config, config.ffn_dim) + self.activation_fn = get_act_fn(config.activation_function) self.fc2 = RowParallelLinear( config.ffn_dim, self.embed_dim, diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index bd4a9f698bacd..112bf6f3ed1af 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -60,7 +60,7 @@ def __init__(self, self.dense_4h_to_h = RowParallelLinear(config.intermediate_size, config.hidden_size, quant_config=quant_config) - self.act = get_act_fn(config.hidden_act, quant_config) + self.act = get_act_fn(config.hidden_act) def forward(self, hidden_states) -> torch.Tensor: hidden_states, _ = self.dense_h_to_4h(hidden_states) diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 492122450b237..d308f4913314c 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -152,7 +152,7 @@ def __init__(self, config.hidden_size, quant_config=quant_config, ) - self.act = get_act_fn(config.hidden_act, quant_config, n_inner) + self.act = get_act_fn(config.hidden_act) def forward(self, hidden_states): hidden_states, _ = self.fc1(hidden_states) diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 3a0e33e8a3eff..4044ddbbcca3d 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -203,7 +203,7 @@ def __init__( intermediate_size, bias=True, quant_config=quant_config) - self.act_fn = get_act_fn("gelu", quant_config, intermediate_size) + self.act_fn = get_act_fn("gelu") self.c_proj = RowParallelLinear( intermediate_size, hidden_size, diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index b24c5dadb2b2b..a5e4155fb4d2c 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -139,8 +139,7 @@ def __init__(self, 
bias=config.use_bias, quant_config=quant_config, ) - self.act = get_act_fn(config.hidden_act, quant_config, - config.intermediate_size) + self.act = get_act_fn(config.hidden_act) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states, _ = self.c_fc(hidden_states) From 098f94de42859f8251fe920f87adb88336129c53 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Wed, 6 Nov 2024 09:31:01 -0500 Subject: [PATCH 005/183] [CI/Build] Drop Python 3.8 support (#10038) Signed-off-by: Russell Bryant Signed-off-by: DarkLight1337 Co-authored-by: DarkLight1337 --- .readthedocs.yaml | 2 +- CMakeLists.txt | 2 +- docs/source/getting_started/amd-installation.rst | 2 -- docs/source/getting_started/installation.rst | 2 +- docs/source/getting_started/neuron-installation.rst | 2 +- docs/source/getting_started/quickstart.rst | 2 +- setup.py | 12 ++++-------- vllm/distributed/parallel_state.py | 5 ++--- 8 files changed, 11 insertions(+), 18 deletions(-) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 34735700a224e..284196bc2d279 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -6,7 +6,7 @@ version: 2 build: os: ubuntu-22.04 tools: - python: '3.9' + python: "3.12" sphinx: configuration: docs/source/conf.py diff --git a/CMakeLists.txt b/CMakeLists.txt index c372ba98befbf..25c0865a90a67 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,7 +31,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) # Supported python versions. These versions will be searched in order, the # first match will be selected. These should be kept in sync with setup.py. # -set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11" "3.12") +set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12") # Supported NVIDIA architectures. set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst index 301337aebcf4c..ece5d785e0c65 100644 --- a/docs/source/getting_started/amd-installation.rst +++ b/docs/source/getting_started/amd-installation.rst @@ -13,8 +13,6 @@ Requirements * GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100) * ROCm 6.2 -Note: PyTorch 2.5+/ROCm6.2 dropped the support for python 3.8. - Installation options: #. :ref:`Build from source with docker ` diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index 61871cdf41125..efc050dd1bfb2 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -66,7 +66,7 @@ If you want to access the wheels for previous commits, you can specify the commi $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl -Note that the wheels are built with Python 3.8 ABI (see `PEP 425 `_ for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (``1.0.0.dev``) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. +Note that the wheels are built with Python 3.9 ABI (see `PEP 425 `_ for more details about ABI), so **they are compatible with Python 3.9 and later**. The version string in the wheel file name (``1.0.0.dev``) is just a placeholder to have a unified URL for the wheels. 
The actual versions of wheels are contained in the wheel metadata. Another way to access the latest code is to use the docker images: diff --git a/docs/source/getting_started/neuron-installation.rst b/docs/source/getting_started/neuron-installation.rst index ec99fc013057b..025ba6ef7ebd8 100644 --- a/docs/source/getting_started/neuron-installation.rst +++ b/docs/source/getting_started/neuron-installation.rst @@ -11,7 +11,7 @@ Requirements ------------ * OS: Linux -* Python: 3.8 -- 3.11 +* Python: 3.9 -- 3.11 * Accelerator: NeuronCore_v2 (in trn1/inf2 instances) * Pytorch 2.0.1/2.1.1 * AWS Neuron SDK 2.16/2.17 (Verified on python 3.8) diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst index 00b762ccc2ccb..0c0491c860563 100644 --- a/docs/source/getting_started/quickstart.rst +++ b/docs/source/getting_started/quickstart.rst @@ -12,7 +12,7 @@ This guide will help you quickly get started with vLLM to: Prerequisites -------------- - OS: Linux -- Python: 3.8 - 3.12 +- Python: 3.9 -- 3.12 - GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) Installation diff --git a/setup.py b/setup.py index 4a20e49235ac8..d2438ae74c455 100644 --- a/setup.py +++ b/setup.py @@ -55,12 +55,6 @@ def is_ninja_available() -> bool: return which("ninja") is not None -def remove_prefix(text, prefix): - if text.startswith(prefix): - return text[len(prefix):] - return text - - class CMakeExtension(Extension): def __init__(self, name: str, cmake_lists_dir: str = '.', **kwa) -> None: @@ -197,8 +191,10 @@ def build_extensions(self) -> None: os.makedirs(self.build_temp) targets = [] - target_name = lambda s: remove_prefix(remove_prefix(s, "vllm."), - "vllm_flash_attn.") + + def target_name(s: str) -> str: + return s.removeprefix("vllm.").removeprefix("vllm_flash_attn.") + # Build all the extensions for ext in self.extensions: self.configure(ext) diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index efa3525910a5e..0d15403264eee 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -89,12 +89,11 @@ def _get_unique_name(name: str) -> str: return newname -_groups: Dict[str, Callable[[], "GroupCoordinator"]] = {} +_groups: Dict[str, Callable[[], Optional["GroupCoordinator"]]] = {} def _register_group(group: "GroupCoordinator") -> None: - # looks like Python 3.8 does not understand `ReferenceType` - _groups[group.unique_name] = weakref.ref(group) # type: ignore + _groups[group.unique_name] = weakref.ref(group) if supports_custom_op(): From 87bd7e0515eebd9344272a3136d7bd662c607438 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Wed, 6 Nov 2024 13:15:42 -0500 Subject: [PATCH 006/183] [CI/Build] change conflict PR comment from mergify (#10080) Signed-off-by: Russell Bryant --- .github/mergify.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/mergify.yml b/.github/mergify.yml index 1ce5039a061b2..ca4bd7ee2b87f 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -46,7 +46,9 @@ pull_request_rules: comment: message: | This pull request has merge conflicts that must be resolved before it can be - merged. @{{author}} please rebase it. https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork + merged. Please rebase the PR, @{{author}}. 
+ + https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork - name: remove 'needs-rebase' label when conflict is resolved conditions: From d58268c56a8ee0eb01c30e7ab7c07c934e1791c2 Mon Sep 17 00:00:00 2001 From: Joe Runde Date: Wed, 6 Nov 2024 12:57:35 -0700 Subject: [PATCH 007/183] [V1] Make v1 more testable (#9888) Signed-off-by: Joe Runde --- Dockerfile | 3 ++ pyproject.toml | 1 + tests/conftest.py | 18 ++++++++ .../entrypoints/llm/test_prompt_validation.py | 9 ++++ tests/kernels/test_attention_selector.py | 2 + tests/kernels/test_encoder_decoder_attn.py | 4 +- vllm/attention/selector.py | 43 ++++++++++++++----- vllm/engine/multiprocessing/engine.py | 18 ++++---- vllm/entrypoints/llm.py | 26 +++++++---- vllm/model_executor/layers/sampler.py | 9 ++++ vllm/model_executor/models/arctic.py | 4 +- vllm/model_executor/models/baichuan.py | 4 +- vllm/model_executor/models/bart.py | 4 +- vllm/model_executor/models/blip2.py | 4 +- vllm/model_executor/models/bloom.py | 4 +- vllm/model_executor/models/chameleon.py | 4 +- vllm/model_executor/models/chatglm.py | 4 +- vllm/model_executor/models/commandr.py | 4 +- vllm/model_executor/models/dbrx.py | 4 +- vllm/model_executor/models/deepseek.py | 4 +- vllm/model_executor/models/deepseek_v2.py | 4 +- vllm/model_executor/models/exaone.py | 4 +- vllm/model_executor/models/falcon.py | 4 +- vllm/model_executor/models/florence2.py | 4 +- vllm/model_executor/models/gemma.py | 4 +- vllm/model_executor/models/gemma2.py | 4 +- vllm/model_executor/models/gpt2.py | 4 +- vllm/model_executor/models/gpt_bigcode.py | 4 +- vllm/model_executor/models/gpt_j.py | 4 +- vllm/model_executor/models/gpt_neox.py | 4 +- vllm/model_executor/models/granite.py | 4 +- vllm/model_executor/models/granitemoe.py | 4 +- vllm/model_executor/models/internlm2.py | 4 +- vllm/model_executor/models/internvl.py | 4 +- vllm/model_executor/models/jais.py | 4 +- vllm/model_executor/models/jamba.py | 4 +- vllm/model_executor/models/llama.py | 4 +- vllm/model_executor/models/llava.py | 4 +- vllm/model_executor/models/llava_next.py | 4 +- .../model_executor/models/llava_next_video.py | 4 +- vllm/model_executor/models/llava_onevision.py | 4 +- vllm/model_executor/models/mamba.py | 4 +- vllm/model_executor/models/minicpm.py | 4 +- vllm/model_executor/models/minicpmv.py | 4 +- vllm/model_executor/models/mixtral.py | 4 +- vllm/model_executor/models/mixtral_quant.py | 4 +- vllm/model_executor/models/mllama.py | 4 +- vllm/model_executor/models/mlp_speculator.py | 4 +- vllm/model_executor/models/molmo.py | 4 +- vllm/model_executor/models/mpt.py | 4 +- vllm/model_executor/models/nemotron.py | 4 +- vllm/model_executor/models/olmo.py | 4 +- vllm/model_executor/models/olmoe.py | 4 +- vllm/model_executor/models/opt.py | 4 +- vllm/model_executor/models/orion.py | 4 +- vllm/model_executor/models/persimmon.py | 4 +- vllm/model_executor/models/phi.py | 4 +- vllm/model_executor/models/phi3_small.py | 4 +- vllm/model_executor/models/phi3v.py | 4 +- vllm/model_executor/models/phimoe.py | 4 +- vllm/model_executor/models/pixtral.py | 4 +- vllm/model_executor/models/qwen.py | 4 +- vllm/model_executor/models/qwen2.py | 4 +- vllm/model_executor/models/qwen2_audio.py | 4 +- vllm/model_executor/models/qwen2_moe.py | 4 +- vllm/model_executor/models/qwen2_vl.py | 4 +- vllm/model_executor/models/solar.py | 4 +- vllm/model_executor/models/stablelm.py | 4 +- vllm/model_executor/models/starcoder2.py | 4 +- vllm/model_executor/models/ultravox.py | 4 +- 
vllm/model_executor/models/xverse.py | 4 +- vllm/v1/attention/backends/flash_attn.py | 12 +++--- vllm/v1/engine/llm_engine.py | 6 +++ vllm/v1/tokenizer/detokenizer.py | 8 ++-- vllm/v1/worker/gpu_model_runner.py | 5 +-- 75 files changed, 243 insertions(+), 165 deletions(-) diff --git a/Dockerfile b/Dockerfile index 343364da2ebf5..4c0f5aebe859d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -191,6 +191,9 @@ ADD . /vllm-workspace/ RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-dev.txt +# Copy in the v1 package for testing (it isn't distributed yet) +COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1 + # doc requires source code # we hide them inside `test_docs/` , so that this source code # will not be imported by other tests diff --git a/pyproject.toml b/pyproject.toml index 3562569647391..1aebc543a733a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -97,4 +97,5 @@ markers = [ "skip_global_cleanup", "core_model: run this model test in each PR instead of just daily", "distributed_2_gpus: run this test only in distributed tests for 2 GPUs", + "skip_v1: do not run this test with v1", ] diff --git a/tests/conftest.py b/tests/conftest.py index f9dfabc82639b..6cf791dc62ce5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,6 +5,7 @@ from enum import Enum from typing import (Any, Callable, Dict, List, Optional, Tuple, Type, TypedDict, TypeVar, Union) +from unittest.mock import patch import numpy as np import pytest @@ -108,6 +109,23 @@ def prompts(self, prompts: _VideoAssetPrompts) -> List[str]: """Singleton instance of :class:`_VideoAssets`.""" +@pytest.fixture(params=[True, False]) +def run_with_both_engines(request): + # Automatically runs tests twice, once with V1 and once without + use_v1 = request.param + # Tests decorated with `@skip_v1` are only run without v1 + skip_v1 = request.node.get_closest_marker("skip_v1") + + if use_v1: + if skip_v1: + pytest.skip("Skipping test on vllm V1") + with patch('vllm.envs.VLLM_USE_V1', True): + yield + else: + with patch('vllm.envs.VLLM_USE_V1', False): + yield + + @pytest.fixture(autouse=True) def init_test_http_connection(): # pytest_asyncio may use a different event loop per test diff --git a/tests/entrypoints/llm/test_prompt_validation.py b/tests/entrypoints/llm/test_prompt_validation.py index 675a980ab3f3f..ee7010a238114 100644 --- a/tests/entrypoints/llm/test_prompt_validation.py +++ b/tests/entrypoints/llm/test_prompt_validation.py @@ -3,12 +3,21 @@ from vllm import LLM +@pytest.fixture(autouse=True) +def v1(run_with_both_engines): + # Simple autouse wrapper to run both engines for each test + # This can be promoted up to conftest.py to run for every + # test in a package + pass + + def test_empty_prompt(): llm = LLM(model="gpt2", enforce_eager=True) with pytest.raises(ValueError, match='Prompt cannot be empty'): llm.generate([""]) +@pytest.mark.skip_v1 def test_out_of_vocab_token(): llm = LLM(model="gpt2", enforce_eager=True) with pytest.raises(ValueError, match='out of vocabulary'): diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py index 3fe9ca0b0450f..169ce040d370c 100644 --- a/tests/kernels/test_attention_selector.py +++ b/tests/kernels/test_attention_selector.py @@ -44,6 +44,8 @@ def test_env(name: str, device: str, monkeypatch): def test_flash_attn(monkeypatch): """Test FlashAttn validation.""" + # TODO: When testing for v1, pipe in `use_v1` as an argument to + # which_attn_to_use override_backend_env_variable(monkeypatch, 
STR_FLASH_ATTN_VAL) diff --git a/tests/kernels/test_encoder_decoder_attn.py b/tests/kernels/test_encoder_decoder_attn.py index a1dd5eeeaa398..3d3724c50421d 100644 --- a/tests/kernels/test_encoder_decoder_attn.py +++ b/tests/kernels/test_encoder_decoder_attn.py @@ -16,7 +16,7 @@ from vllm.attention import (Attention, AttentionBackend, AttentionMetadata, AttentionType) from vllm.attention.backends.utils import STR_NOT_IMPL_ENC_DEC_ROCM_HIP -from vllm.attention.selector import (_Backend, get_attn_backend, +from vllm.attention.selector import (_Backend, _cached_get_attn_backend, global_force_attn_backend_context_manager) from vllm.forward_context import set_forward_context from vllm.platforms import current_platform @@ -774,7 +774,7 @@ def set_reset_environment(attn_backend): default_dtype = torch.get_default_dtype() if attn_backend.name == 'FLASH_ATTN': torch.set_default_dtype(torch.bfloat16) - get_attn_backend.cache_clear() + _cached_get_attn_backend.cache_clear() yield # Reset the torch datatype to what it was before the test # so as not to impact the remaining tests. diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py index 991602da2853a..664707e9dc65d 100644 --- a/vllm/attention/selector.py +++ b/vllm/attention/selector.py @@ -89,7 +89,6 @@ def get_global_forced_attn_backend() -> Optional[_Backend]: return forced_attn_backend -@lru_cache(maxsize=None) def get_attn_backend( head_size: int, dtype: torch.dtype, @@ -99,6 +98,31 @@ def get_attn_backend( is_blocksparse: bool = False, ) -> Type[AttentionBackend]: """Selects which attention backend to use and lazily imports it.""" + # Accessing envs.* behind an @lru_cache decorator can cause the wrong + # value to be returned from the cache if the value changes between calls. + # To avoid this, we read envs.VLLM_USE_V1 here and pass it explicitly to the + # private function. + return _cached_get_attn_backend( + head_size=head_size, + dtype=dtype, + kv_cache_dtype=kv_cache_dtype, + block_size=block_size, + is_attention_free=is_attention_free, + is_blocksparse=is_blocksparse, + use_v1=envs.VLLM_USE_V1, + ) + + +@lru_cache(maxsize=None) +def _cached_get_attn_backend( + head_size: int, + dtype: torch.dtype, + kv_cache_dtype: Optional[str], + block_size: int, + is_attention_free: bool, + is_blocksparse: bool = False, + use_v1: bool = False, +) -> Type[AttentionBackend]: if is_blocksparse: logger.info("Using BlocksparseFlashAttention backend.") from vllm.attention.backends.blocksparse_attn import ( @@ -106,7 +130,7 @@ def get_attn_backend( return BlocksparseFlashAttentionBackend backend = which_attn_to_use(head_size, dtype, kv_cache_dtype, block_size, - is_attention_free) + is_attention_free, use_v1) if backend == _Backend.FLASH_ATTN: logger.info("Using Flash Attention backend.") from vllm.attention.backends.flash_attn import ( # noqa: F401 @@ -162,13 +186,12 @@ def get_attn_backend( raise ValueError("Invalid attention backend.") -def which_attn_to_use( - head_size: int, - dtype: torch.dtype, - kv_cache_dtype: Optional[str], - block_size: int, - is_attention_free: bool, -) -> _Backend: +def which_attn_to_use(head_size: int, + dtype: torch.dtype, + kv_cache_dtype: Optional[str], + block_size: int, + is_attention_free: bool, + use_v1: bool = False) -> _Backend: """Returns which flash attention backend to use.""" # Default case. 
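A minimal, self-contained sketch of the caching pitfall described in the selector.py comment above (reading envs.* inside an @lru_cache-decorated function freezes the first value seen); the names settings, pick_backend and _pick_backend_cached are hypothetical stand-ins and this snippet is not part of the patch:

from functools import lru_cache
from types import SimpleNamespace

settings = SimpleNamespace(USE_V1=False)  # stand-in for vllm.envs

@lru_cache(maxsize=None)
def _pick_backend_cached(head_size: int, use_v1: bool) -> str:
    # The flag is part of the cache key, so a later change produces a new entry.
    return "FLASH_ATTN_VLLM_V1" if use_v1 else "FLASH_ATTN"

def pick_backend(head_size: int) -> str:
    # Read the mutable global here, outside the cached helper, and pass it in;
    # reading it inside the cached body would return the first value forever.
    return _pick_backend_cached(head_size, settings.USE_V1)

print(pick_backend(128))  # FLASH_ATTN
settings.USE_V1 = True    # e.g. a test patches the flag at runtime
print(pick_backend(128))  # FLASH_ATTN_VLLM_V1, not a stale cache hit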
selected_backend = _Backend.FLASH_ATTN @@ -228,7 +251,7 @@ def which_attn_to_use( if current_platform.is_hpu(): return _Backend.HPU_ATTN - if envs.VLLM_USE_V1: + if use_v1: return _Backend.FLASH_ATTN_VLLM_V1 # FlashAttn in NVIDIA GPUs. diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index e1dcb82829d76..889845ee67312 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -6,7 +6,9 @@ import cloudpickle import zmq +import vllm.envs from vllm import AsyncEngineArgs, SamplingParams +from vllm.engine.llm_engine import LLMEngine # yapf conflicts with isort for this block # yapf: disable from vllm.engine.multiprocessing import (ENGINE_DEAD_ERROR, IPC_DATA_EXT, @@ -17,17 +19,11 @@ RPCStartupRequest, RPCStartupResponse, RPCUProfileRequest) # yapf: enable -from vllm.envs import VLLM_USE_V1 from vllm.executor.gpu_executor import GPUExecutor from vllm.logger import init_logger from vllm.outputs import RequestOutput from vllm.usage.usage_lib import UsageContext -if VLLM_USE_V1: - from vllm.v1.engine.llm_engine import LLMEngine -else: - from vllm.engine.llm_engine import LLMEngine - logger = init_logger(__name__) POLLING_TIMEOUT_MS = 10000 @@ -117,11 +113,17 @@ def from_engine_args(cls, engine_args: AsyncEngineArgs, load_general_plugins() engine_config = engine_args.create_engine_config() + if vllm.envs.VLLM_USE_V1: + # Lazy import: the v1 package isn't distributed + from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine + engine_class = V1LLMEngine + else: + engine_class = LLMEngine - executor_class = LLMEngine._get_executor_cls(engine_config) + executor_class = engine_class._get_executor_cls(engine_config) use_async_sockets = (engine_config.model_config.use_async_output_proc - and not VLLM_USE_V1) + and not vllm.envs.VLLM_USE_V1) return cls(ipc_path=ipc_path, use_async_sockets=use_async_sockets, diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index b18974c5a0c57..d8b60a5e01471 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -1,7 +1,7 @@ import itertools import warnings from contextlib import contextmanager -from typing import (Any, ClassVar, Dict, List, Optional, Sequence, Tuple, +from typing import (Any, ClassVar, Dict, List, Optional, Sequence, Tuple, Type, Union, cast, overload) from tqdm import tqdm @@ -10,6 +10,7 @@ from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, BeamSearchSequence, get_beam_search_score) from vllm.engine.arg_utils import EngineArgs, TaskOption +from vllm.engine.llm_engine import LLMEngine from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, apply_hf_chat_template, apply_mistral_chat_template, @@ -31,11 +32,6 @@ from vllm.usage.usage_lib import UsageContext from vllm.utils import Counter, deprecate_args, deprecate_kwargs, is_list_of -if envs.VLLM_USE_V1: - from vllm.v1.engine.llm_engine import LLMEngine # type: ignore -else: - from vllm.engine.llm_engine import LLMEngine # type: ignore - logger = init_logger(__name__) @@ -206,10 +202,21 @@ def __init__( pooling_returned_token_ids=pooling_returned_token_ids, **kwargs, ) - self.llm_engine = LLMEngine.from_engine_args( + # Logic to switch between engines is done at runtime instead of import + # to avoid import order issues + self.engine_class = self.get_engine_class() + self.llm_engine = self.engine_class.from_engine_args( engine_args, usage_context=UsageContext.LLM_CLASS) self.request_counter = Counter() + @staticmethod + def get_engine_class() -> Type[LLMEngine]: + 
if envs.VLLM_USE_V1: + # Lazy import: the v1 package isn't distributed + from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine + return V1LLMEngine # type: ignore + return LLMEngine + def get_tokenizer(self) -> AnyTokenizer: return self.llm_engine.get_tokenizer_group(TokenizerGroup).tokenizer @@ -394,7 +401,7 @@ def generate( priority=priority) outputs = self._run_engine(use_tqdm=use_tqdm) - return LLMEngine.validate_outputs(outputs, RequestOutput) + return self.engine_class.validate_outputs(outputs, RequestOutput) def beam_search( self, @@ -769,7 +776,8 @@ def encode( ) outputs = self._run_engine(use_tqdm=use_tqdm) - return LLMEngine.validate_outputs(outputs, EmbeddingRequestOutput) + return self.engine_class.validate_outputs(outputs, + EmbeddingRequestOutput) def start_profile(self) -> None: self.llm_engine.start_profile() diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index f86c6ec362ebe..c10efefea5471 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -30,6 +30,15 @@ else: flashinfer_top_k_top_p_sampling = None + +def get_sampler() -> torch.nn.Module: + if envs.VLLM_USE_V1: + # Lazy import: the v1 package isn't distributed + from vllm.v1.sample.sampler import Sampler as V1Sampler + return V1Sampler() + return Sampler() + + # (num_token_ids, num_parent_ids) per sequence group. SampleResultType = List[Tuple[List[int], List[int]]] diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index 5b712ba83c25a..4fec314a70aa4 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -23,7 +23,7 @@ from vllm.model_executor.layers.quantization.deepspeedfp import ( DeepSpeedFPConfig, DeepSpeedFPParameter) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -436,7 +436,7 @@ def __init__(self, self.unpadded_vocab_size = config.vocab_size self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 1fbf4135add7a..cce182da4820f 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -37,7 +37,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -352,7 +352,7 @@ def __init__( if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() 
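A minimal sketch of the call-time selection pattern that get_sampler() and get_engine_class() introduce in this patch, using hypothetical stand-ins (flags, LegacySampler, V1Sampler) rather than the real vLLM classes:

from types import SimpleNamespace

flags = SimpleNamespace(USE_V1=False)  # stand-in for vllm.envs.VLLM_USE_V1

class LegacySampler:
    name = "legacy"

class V1Sampler:
    name = "v1"

def get_sampler():
    # Decide at call time rather than import time. In the patch the V1 class
    # is imported inside this branch (a lazy import) because the v1 package
    # is not distributed yet; both classes are local here to keep the sketch
    # self-contained and runnable.
    if flags.USE_V1:
        return V1Sampler()
    return LegacySampler()

print(get_sampler().name)  # legacy
flags.USE_V1 = True        # e.g. the run_with_both_engines fixture flips this
print(get_sampler().name)  # v1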
self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index 85de1a8115b8b..fd600adceb21c 100644 --- a/vllm/model_executor/models/bart.py +++ b/vllm/model_executor/models/bart.py @@ -34,7 +34,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -838,7 +838,7 @@ def __init__(self, self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() def forward( self, diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index db1f92649bd49..efd24e7cf40f6 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -13,7 +13,7 @@ InputContext, token_inputs) from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.utils import consecutive_placeholder_ranges @@ -525,7 +525,7 @@ def sampler(self): if hasattr(self.language_model, "sampler"): return self.language_model.sampler - return Sampler() + return get_sampler() def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: h = w = self.config.vision_config.image_size diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index b2c109a21d4cf..c2440ee75d588 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -33,7 +33,7 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -298,7 +298,7 @@ def __init__( self.config.hidden_size) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 9f6c6786c0fa4..58841f177ec22 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -21,7 +21,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler 
import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler
import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -946,7 +946,7 @@ def __init__( logit_scale = getattr(config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, logit_scale) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 881b86564e811..032fa82ab93cd 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -24,7 +24,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -616,7 +616,7 @@ def __init__( self.transformer.embedding.weight) self.lm_head = self.transformer.output_layer self.logits_processor = LogitsProcessor(config.padded_vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() def forward(self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 835682ca3b379..718f26bed443f 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -37,7 +37,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -355,7 +355,7 @@ def __init__( cache_config, quant_config, lora_config=lora_config) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 3e60eee2d8fe2..ae43383155ffc 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -14,7 +14,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -373,7 +373,7 @@ def __init__( ) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = 
( self.transformer.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index d278ea5b6a991..53a1c7cfbfef4 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -41,7 +41,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -399,7 +399,7 @@ def __init__( if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 834be78bce87b..95bbf4fb59c6a 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -42,7 +42,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -496,7 +496,7 @@ def __init__( config.hidden_size, quant_config=quant_config) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 23efe0359cb4a..a8d591b921cd6 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -42,7 +42,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( get_compressed_tensors_cache_scale) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -478,7 +478,7 @@ def __init__( self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, logit_scale) - self.sampler = Sampler() + self.sampler = get_sampler() else: self.lm_head = PPMissingLayer() diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 6f8a7a7015c79..daf49521637b0 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -38,7 +38,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from 
vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -426,7 +426,7 @@ def __init__( quant_config=quant_config, ) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py index 6840ac8b9e303..184bee5f65671 100644 --- a/vllm/model_executor/models/florence2.py +++ b/vllm/model_executor/models/florence2.py @@ -10,7 +10,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.bart import (BartDecoder, BartEncoder, BartParallelLMHead, @@ -112,7 +112,7 @@ def __init__(self, self.logits_processor = LogitsProcessor(self.vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() def forward( self, diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index fc3f5cb20afb0..1cc3ea679c553 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -33,7 +33,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -393,7 +393,7 @@ def __init__( quant_config, prefix=maybe_prefix(prefix, "model")) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index c365880109ef8..16e0d6b30713a 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -33,7 +33,7 @@ from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -414,7 +414,7 @@ def __init__( self.model = Gemma2Model(config, cache_config, quant_config) 
self.logits_processor = LogitsProcessor( config.vocab_size, soft_cap=config.final_logit_softcapping) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 8147037ed2a32..7f81bbff94932 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -33,7 +33,7 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -259,7 +259,7 @@ def __init__( self.lm_head = ParallelLMHead(self.config.vocab_size, self.config.hidden_size) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index 9f44fa76abcba..4be8e4199f04d 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -33,7 +33,7 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -285,7 +285,7 @@ def __init__( self.unpadded_vocab_size += lora_config.lora_extra_vocab_size self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 6fcccdfb112d8..834b4aff2e4ba 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -32,7 +32,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -247,7 +247,7 @@ def __init__( quant_config=quant_config, ) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index d3f86558ecc7e..1903156d7efe1 100644 --- 
a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -32,7 +32,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -260,7 +260,7 @@ def __init__( if self.config.tie_word_embeddings: self.embed_out.weight = self.gpt_neox.embed_in.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.gpt_neox.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index bee48f377e0f5..8a75b9cb1d55d 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -42,7 +42,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( get_compressed_tensors_cache_scale) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -411,7 +411,7 @@ def __init__( self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, scale=logit_scale) - self.sampler = Sampler() + self.sampler = get_sampler() else: self.lm_head = PPMissingLayer() diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 691a6e77c46c4..b4da986efabe3 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -39,7 +39,7 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -371,7 +371,7 @@ def __init__( scale=1 / self.config.logits_scaling) - self.sampler = Sampler() + self.sampler = get_sampler() def forward( self, diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index afefb6cd9fa96..7ddb1e2a1ab10 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -20,7 +20,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, 
VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -338,7 +338,7 @@ def __init__( if self.config.tie_word_embeddings: self.output.weight = self.model.tok_embeddings.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index d2ec0ff6e74c6..bb9d38889a175 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -21,7 +21,7 @@ InputContext, token_inputs) from vllm.model_executor.layers.quantization import (AWQConfig, QuantizationConfig) -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.models.intern_vit import (InternVisionModel, InternVisionPatchModel) from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -467,7 +467,7 @@ def sampler(self): if hasattr(self.language_model, "sampler"): return self.language_model.sampler - return Sampler() + return get_sampler() def _init_vision_model( self, diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index 301893f74cb87..23fdca09493b7 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -34,7 +34,7 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -308,7 +308,7 @@ def __init__( config.mup_width_scale) self.logits_processor = LogitsProcessor(vocab_size=config.vocab_size, scale=self.output_logits_scale) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 81d88a47c1941..9b18a1b68f9d3 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -17,7 +17,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -383,7 +383,7 @@ def __init__( self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() def forward(self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index d768a57b7ef8a..9e8a403b2f1fc 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ 
-42,7 +42,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( get_compressed_tensors_cache_scale) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -536,7 +536,7 @@ def __init__( self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, logit_scale) - self.sampler = Sampler() + self.sampler = get_sampler() else: self.lm_head = PPMissingLayer() self.make_empty_intermediate_tensors = ( diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 7fbd59ebd98fd..bdd67b12a06d8 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -14,7 +14,7 @@ InputContext) from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.sequence import IntermediateTensors @@ -302,7 +302,7 @@ def sampler(self): if hasattr(self.language_model, "sampler"): return self.language_model.sampler - return Sampler() + return get_sampler() def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor: h = w = self.config.vision_config.image_size diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 7a2c95594ddcd..37b8baa8c6be0 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -16,7 +16,7 @@ InputContext) from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY @@ -327,7 +327,7 @@ def sampler(self): if hasattr(self.language_model, "sampler"): return self.language_model.sampler - return Sampler() + return get_sampler() def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor: expected_dims = (2, ) diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index b755e2347f6ed..69bfc80a4372c 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -15,7 +15,7 @@ InputContext, token_inputs) from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY @@ -289,7 +289,7 @@ def sampler(self): if 
hasattr(self.language_model, "sampler"): return self.language_model.sampler - return Sampler() + return get_sampler() def _validate_video_pixel_values( self, data: Union[torch.Tensor, List[torch.Tensor]] diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index f410d64577a77..26ece8190e7de 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -19,7 +19,7 @@ InputContext, token_inputs) from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.utils import (cached_get_tokenizer, @@ -437,7 +437,7 @@ def sampler(self): if hasattr(self.language_model, "sampler"): return self.language_model.sampler - return Sampler() + return get_sampler() def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor: expected_dims = (2, ) diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index aac4b7aa2661d..91161957642f9 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -13,7 +13,7 @@ from vllm.model_executor.layers.mamba.mamba_mixer import MambaMixer from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -169,7 +169,7 @@ def __init__( self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() def forward(self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index acf03cd8cb8ad..7704431a4d90a 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -43,7 +43,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -496,7 +496,7 @@ def __init__( self.logits_processor = LogitsProcessor(unpadded_vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 5acd3f65896c7..4ffe33bb6ce41 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -41,7 +41,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from 
vllm.model_executor.layers.resampler import (BaseResampler, Resampler2, get_2d_sincos_pos_embed) -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.utils import set_default_torch_dtype from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -420,7 +420,7 @@ def __init__( quant_config=quant_config, prefix="llm.lm_head") self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.llm.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index e9b9c4d838faa..f5c28e7d74811 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -38,7 +38,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -366,7 +366,7 @@ def __init__( self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 9647d69be8a0a..007c4e2eabc90 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -40,7 +40,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -366,7 +366,7 @@ def __init__( if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 5fa8d19b97fe8..d442ffe3c1fb1 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -44,7 +44,7 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from 
vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -1141,7 +1141,7 @@ def __init__(self, ) self.logits_processor = LogitsProcessor(config.output_hidden_states, config.text_config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() def compute_logits( self, diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index ae218d749fc0b..fde44265414c5 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -6,7 +6,7 @@ from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -137,7 +137,7 @@ def __init__(self, config: MLPSpeculatorConfig, **kwargs) -> None: self.config = config self.logits_processor = LogitsProcessor(config.vocab_size, config.vocab_size, 1.0) - self.sampler = Sampler() + self.sampler = get_sampler() def generate_proposals( self, diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 785b53670542f..3a50923de3741 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -33,7 +33,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -1053,7 +1053,7 @@ def __init__( self.logits_processor = LogitsProcessor(config.embedding_size or config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 7f0658f4cb2b0..b3977812cb273 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -16,7 +16,7 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -281,7 +281,7 @@ def __init__( self.transformer = MPTModel(config, cache_config, quant_config) self.lm_head = self.transformer.wte self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) diff --git 
a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index b649064536dc2..8d128a42b14b8 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -36,7 +36,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -441,7 +441,7 @@ def __init__( self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, logit_scale) - self.sampler = Sampler() + self.sampler = get_sampler() else: self.lm_head = PPMissingLayer() self.make_empty_intermediate_tensors = ( diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index dd3f58289a227..545d86eebb5ec 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -37,7 +37,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -309,7 +309,7 @@ def __init__(self, quant_config=quant_config, ) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 374cbb8df1fcd..de30b5270e7e8 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -28,7 +28,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -323,7 +323,7 @@ def __init__( config.hidden_size, quant_config=quant_config) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index d140f4237b1ca..a453376d02552 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -33,7 +33,7 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from 
vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -362,7 +362,7 @@ def __init__( self.lm_head = ParallelLMHead(config.vocab_size, config.word_embed_proj_dim) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index a338a93c2dd9a..d6ec1fb602f05 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -20,7 +20,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -284,7 +284,7 @@ def __init__( if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index 112bf6f3ed1af..11e7c8abd4888 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -36,7 +36,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -279,7 +279,7 @@ def __init__(self, config.hidden_size, bias=False) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index d308f4913314c..4dae6e323654b 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -51,7 +51,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import 
default_weight_loader @@ -300,7 +300,7 @@ def __init__( bias=True, quant_config=quant_config) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py index 3a7afc606bb9a..92bf0e61448e5 100644 --- a/vllm/model_executor/models/phi3_small.py +++ b/vllm/model_executor/models/phi3_small.py @@ -15,7 +15,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -386,7 +386,7 @@ def __init__( if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 1c41891ced416..a84d6b317b479 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -32,7 +32,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.model_executor.models.clip import CLIPVisionModel @@ -570,7 +570,7 @@ def sampler(self): if hasattr(self.language_model, "sampler"): return self.language_model.sampler - return Sampler() + return get_sampler() def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor: expected_dims = (2, ) diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 59843ae3dfd59..19e2621ead996 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -38,7 +38,7 @@ from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -562,7 +562,7 @@ def __init__( ) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 6e9092432467a..facf1969b9479 100644 --- 
a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -25,7 +25,7 @@ QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.utils import merge_multimodal_embeddings from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -190,7 +190,7 @@ def sampler(self): if hasattr(self.language_model, "sampler"): return self.language_model.sampler - return Sampler() + return get_sampler() def forward( self, diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 4044ddbbcca3d..c91c2caa3d519 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -36,7 +36,7 @@ from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.resampler import Resampler2, get_abs_pos from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -884,7 +884,7 @@ def __init__( if self.config.tie_word_embeddings: self.lm_head.weight = self.transformer.wte.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 49b3de1304cca..1e99c1b13b31f 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -39,7 +39,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -444,7 +444,7 @@ def __init__( prefix, "lm_head")) self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 556c09400ee83..54a7085f69ba9 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -36,7 +36,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import 
( default_weight_loader, maybe_remap_kv_scale_name) @@ -295,7 +295,7 @@ def __init__(self, self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.text_config.vocab_size, logit_scale) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 98bb48a274e49..c8c48c0894c36 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -44,7 +44,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -393,7 +393,7 @@ def __init__( if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index fad9137d0dcc5..af263262bd239 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -52,7 +52,7 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.qwen2 import Qwen2Model @@ -990,7 +990,7 @@ def __init__(self, self.lm_head = PPMissingLayer() self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 1b233ac7427dd..931e48a44f631 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -42,7 +42,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.utils import ( get_compressed_tensors_cache_scale) from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( @@ -449,7 +449,7 @@ def __init__( self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, logit_scale) - self.sampler = Sampler() + self.sampler = get_sampler() else: self.lm_head = PPMissingLayer() diff --git 
a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 34389b645a7c1..4cb55506bb237 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -34,7 +34,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -261,7 +261,7 @@ def __init__( if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index a5e4155fb4d2c..0b0e3f21065b4 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -34,7 +34,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -269,7 +269,7 @@ def __init__(self, ) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 749750fc9c16e..3a343986a9345 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -21,7 +21,7 @@ from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.loader import DefaultModelLoader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalInputs, @@ -379,7 +379,7 @@ def sampler(self): if hasattr(self.language_model, "sampler"): return self.language_model.sampler - return Sampler() + return get_sampler() def _audio_features_to_embeddings( self, input_features: torch.Tensor) -> torch.Tensor: diff --git a/vllm/model_executor/models/xverse.py b/vllm/model_executor/models/xverse.py index e559988ada753..1d08b382b0b00 100644 --- a/vllm/model_executor/models/xverse.py +++ b/vllm/model_executor/models/xverse.py @@ -37,7 +37,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from 
vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.model_executor.layers.sampler import Sampler, SamplerOutput +from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -334,7 +334,7 @@ def __init__( if self.config.tie_word_embeddings: self.lm_head.weight = self.model.embed_tokens.weight self.logits_processor = LogitsProcessor(config.vocab_size) - self.sampler = Sampler() + self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index 906f06777a136..e73a1e60b2730 100644 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -136,7 +136,7 @@ def forward( "key/v_scale is not supported in FlashAttention.") output = torch.empty_like(query) - torch.ops.vllm.unified_flash_attention( + torch.ops.vllm.unified_v1_flash_attention( output, query, key, @@ -156,7 +156,7 @@ def forward( return output -def unified_flash_attention( +def unified_v1_flash_attention( output: torch.Tensor, query: torch.Tensor, key: torch.Tensor, @@ -222,7 +222,7 @@ def unified_flash_attention( output[:num_actual_tokens].copy_(attn_output) -def unified_flash_attention_fake( +def unified_v1_flash_attention_fake( output: torch.Tensor, query: torch.Tensor, key: torch.Tensor, @@ -243,8 +243,8 @@ def unified_flash_attention_fake( direct_register_custom_op( - op_name="unified_flash_attention", - op_func=unified_flash_attention, + op_name="unified_v1_flash_attention", + op_func=unified_v1_flash_attention, mutates_args=["kv_cache", "output"], - fake_impl=unified_flash_attention_fake, + fake_impl=unified_v1_flash_attention_fake, ) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 64cc18149d6c5..5f5720480abdc 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -155,6 +155,12 @@ def __init__( # GPU and CPU blocks, which are profiled in the distributed executor. self.scheduler = Scheduler(scheduler_config, cache_config, lora_config) + def __del__(self): + # Small hack- implicit clean up of resources on garbage collect + # TODO: this should probably be explicitly invoked when we're done with + # the engine + self.terminate_detokenizer() + def _initialize_kv_caches(self) -> None: num_gpu_blocks, _ = self.model_executor.determine_num_available_blocks( ) diff --git a/vllm/v1/tokenizer/detokenizer.py b/vllm/v1/tokenizer/detokenizer.py index 4bbcf4717981e..e485fcc3522d9 100644 --- a/vllm/v1/tokenizer/detokenizer.py +++ b/vllm/v1/tokenizer/detokenizer.py @@ -73,7 +73,7 @@ def recv(self) -> Optional[DetokenizerOutputs]: return None def terminate(self) -> None: - self.push_socket.send(b"", flags=zmq.NOBLOCK) + self.detokenizer.kill() self.detokenizer.join() @@ -108,10 +108,10 @@ def run(self): self.push_socket.bind(f"tcp://*:{self.push_port}") while True: + if self.pull_socket.poll(timeout=1000) == 0: + # Nothing to read + continue message = self.pull_socket.recv() - if message == b"": - # Terminate signal. 
- break inputs = self.msgpack_decoder.decode(message) for req_id in inputs.free_req_ids: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 63bf7c2e605a2..e6383b59cf7a3 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2,7 +2,6 @@ import time from dataclasses import dataclass from typing import TYPE_CHECKING, Dict, List, Optional, Set -from unittest.mock import patch import numpy as np import torch @@ -26,7 +25,6 @@ FlashAttentionMetadata) from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.sample.metadata import SamplingMetadata -from vllm.v1.sample.sampler import Sampler if TYPE_CHECKING: from vllm.v1.core.scheduler import SchedulerOutput @@ -418,8 +416,7 @@ def load_model(self) -> None: logger.info("Starting to load model %s...", self.model_config.model) with DeviceMemoryProfiler() as m: # noqa: SIM117 - with patch("vllm.model_executor.layers.sampler.Sampler", Sampler): - self.model = get_model(vllm_config=self.vllm_config) + self.model = get_model(vllm_config=self.vllm_config) self.model_memory_usage = m.consumed_memory logger.info("Loading model weights took %.4f GB", From 74f2f8a0f1d4a2afb27d7be87ed2ff12c8319eee Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Wed, 6 Nov 2024 17:25:23 -0500 Subject: [PATCH 008/183] [CI/Build] Always run the ruff workflow (#10092) Signed-off-by: Russell Bryant --- .github/workflows/ruff.yml | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index edf98ce2fcab0..1a6beca0b87c0 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -15,12 +15,17 @@ on: pull_request: branches: - main - paths: - - "**/*.py" - - pyproject.toml - - requirements-lint.txt - - .github/workflows/matchers/ruff.json - - .github/workflows/ruff.yml + # This workflow is only relevant when one of the following files changes. + # However, we have github configured to expect and require this workflow + # to run and pass before github will auto-merge a pull request. Until github + # allows a more flexible auto-merge policy, we can just run this on every PR. + # It doesn't take that long to run, anyway.
+ #paths: + # - "**/*.py" + # - pyproject.toml + # - requirements-lint.txt + # - .github/workflows/matchers/ruff.json + # - .github/workflows/ruff.yml jobs: ruff: From 719c1ca468537d2be2616ddc3163236af7f5bd62 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 6 Nov 2024 16:42:09 -0800 Subject: [PATCH 009/183] [core][distributed] add stateless_init_process_group (#10072) Signed-off-by: youkaichao --- .buildkite/test-pipeline.yaml | 2 +- tests/distributed/test_utils.py | 75 ++++++++++++++++++++++++++++++++- vllm/distributed/utils.py | 73 ++++++++++++++++++++++++++++++++ 3 files changed, 147 insertions(+), 3 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 3e940549862ea..705e81d15ad65 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -120,6 +120,7 @@ steps: - tests/spec_decode/e2e/test_integration_dist_tp4 - tests/compile commands: + - pytest -v -s distributed/test_utils.py - pytest -v -s compile/test_basic_correctness.py - pytest -v -s distributed/test_pynccl.py - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py @@ -431,7 +432,6 @@ steps: - pip install -e ./plugins/vllm_add_dummy_model - pytest -v -s distributed/test_distributed_oot.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py - label: Multi-step Tests (4 GPUs) # 36min working_dir: "/vllm-workspace/tests" diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py index a51a9909f6f41..3c7facc12c59a 100644 --- a/tests/distributed/test_utils.py +++ b/tests/distributed/test_utils.py @@ -1,9 +1,15 @@ +import pytest import ray +import torch +import torch.distributed as dist import vllm.envs as envs +from vllm.distributed.utils import stateless_init_process_group from vllm.utils import (cuda_device_count_stateless, update_environment_variables) +from ..utils import multi_gpu_test + @ray.remote class _CUDADeviceCountStatelessTestActor: @@ -24,10 +30,75 @@ def test_cuda_device_count_stateless(): CUDA_VISIBLE_DEVICES is changed.""" actor = _CUDADeviceCountStatelessTestActor.options( # type: ignore num_gpus=2).remote() - assert sorted(ray.get( - actor.get_cuda_visible_devices.remote()).split(",")) == ["0", "1"] + assert len( + sorted(ray.get( + actor.get_cuda_visible_devices.remote()).split(","))) == 2 assert ray.get(actor.get_count.remote()) == 2 ray.get(actor.set_cuda_visible_devices.remote("0")) assert ray.get(actor.get_count.remote()) == 1 ray.get(actor.set_cuda_visible_devices.remote("")) assert ray.get(actor.get_count.remote()) == 0 + + +def cpu_worker(rank, WORLD_SIZE): + pg1 = stateless_init_process_group(init_method="tcp://127.0.0.1:29500", + rank=rank, + world_size=WORLD_SIZE, + backend="gloo") + if rank <= 2: + pg2 = stateless_init_process_group(init_method="tcp://127.0.0.1:29501", + rank=rank, + world_size=3, + backend="gloo") + data = torch.tensor([rank]) + dist.all_reduce(data, op=dist.ReduceOp.SUM, group=pg1) + if rank <= 2: + dist.all_reduce(data, op=dist.ReduceOp.SUM, group=pg2) + item = data[0].item() + print(f"rank: {rank}, item: {item}") + if rank == 3: + assert item == 6 + else: + assert item == 18 + + +def gpu_worker(rank, WORLD_SIZE): + pg1 = stateless_init_process_group(init_method="tcp://127.0.0.1:29502", + rank=rank, + world_size=WORLD_SIZE, + backend="nccl") + if rank <= 2: + pg2 = stateless_init_process_group(init_method="tcp://127.0.0.1:29503", + rank=rank, + world_size=3, + backend="nccl") + torch.cuda.set_device(rank) + 
data = torch.tensor([rank]).cuda() + dist.all_reduce(data, op=dist.ReduceOp.SUM, group=pg1) + if rank <= 2: + dist.all_reduce(data, op=dist.ReduceOp.SUM, group=pg2) + item = data[0].item() + print(f"rank: {rank}, item: {item}") + if rank == 3: + assert item == 6 + else: + assert item == 18 + + +@multi_gpu_test(num_gpus=4) +@pytest.mark.parametrize("worker", [cpu_worker, gpu_worker]) +def test_stateless_init_process_group(worker): + WORLD_SIZE = 4 + from multiprocessing import get_context + ctx = get_context("fork") + processes = [] + for i in range(WORLD_SIZE): + rank = i + processes.append(ctx.Process(target=worker, args=(rank, WORLD_SIZE))) + for p in processes: + p.start() + for p in processes: + p.join() + for p in processes: + assert not p.exitcode + print("All processes finished.") diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index 8c94ef8cb10ce..d24ce898707fc 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -5,6 +5,11 @@ from typing import Sequence, Tuple import torch +from torch.distributed import ProcessGroup +from torch.distributed.distributed_c10d import (Backend, PrefixStore, + _get_default_timeout, + is_nccl_available) +from torch.distributed.rendezvous import rendezvous import vllm.envs as envs from vllm.logger import init_logger @@ -84,3 +89,71 @@ def get_pp_indices(num_hidden_layers: int, pp_rank: int, end_layer = num_hidden_layers return (start_layer, end_layer) + + +def stateless_init_process_group(init_method: str, rank: int, world_size: int, + backend: str) -> ProcessGroup: + """A replacement for `torch.distributed.init_process_group` that does not + pollute the global state. + + If process A and process B call `torch.distributed.init_process_group` + to form a group, and we then want to form another group with processes A, B, C, + and D, that is not possible in PyTorch, because processes A and B have already + formed a group, and processes C and D cannot join that group. This + function works around that limitation. + + `torch.distributed.init_process_group` is a global call, while this function + is a stateless call. It returns a `ProcessGroup` object that can be used + for collective communication. With this function, processes A and B + can call `stateless_init_process_group` to form a group, and then processes A, B, + C, and D can call `stateless_init_process_group` to form another group. + """ # noqa + + backend = Backend(backend) # it is basically a string + timeout = _get_default_timeout(backend) + + store, rank, world_size = next( + rendezvous(init_method, rank, world_size, timeout=timeout)) + store.set_timeout(timeout) + + group_rank = rank + group_size = world_size + + # Use a PrefixStore to avoid accidental overrides of keys used by + # different systems (e.g. RPC) in case the store is multi-tenant.
+ prefix_store = PrefixStore(init_method, store) + + pg_options = ProcessGroup.Options(backend=backend, timeout=timeout) + + pg: ProcessGroup = ProcessGroup( + prefix_store, + group_rank, + group_size, + pg_options, + ) + + if backend == "gloo": + from torch.distributed.distributed_c10d import ProcessGroupGloo + backend_class = ProcessGroupGloo(prefix_store, + group_rank, + group_size, + timeout=timeout) + backend_type = ProcessGroup.BackendType.GLOO + device = torch.device("cpu") + elif backend == "nccl": + assert is_nccl_available() + from torch.distributed.distributed_c10d import ProcessGroupNCCL + + backend_options = ProcessGroupNCCL.Options() + backend_options._timeout = timeout + + backend_class = ProcessGroupNCCL(prefix_store, group_rank, group_size, + backend_options) + backend_type = ProcessGroup.BackendType.NCCL + device = torch.device("cuda") + + backend_class._set_sequence_number_for_group() + + pg._register_backend(device, backend_type, backend_class) + + return pg From 4ab32566449558f2b5dbfbe44aeb6417e02e2e88 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Wed, 6 Nov 2024 19:54:13 -0500 Subject: [PATCH 010/183] [Bugfix] Fix FP8 torch._scaled_mm fallback for torch>2.5 with CUDA<12.4 (#10095) Signed-off-by: mgoin --- vllm/model_executor/layers/quantization/utils/w8a8_utils.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index 445117ac99a34..ec73533126ab6 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -7,8 +7,7 @@ # Input scaling factors are no longer optional in _scaled_mm starting # from pytorch 2.5. 
Allocating a dummy tensor to pass as input_scale -TORCH_DEVICE_IDENTITY = torch.ones(1).cuda() \ - if current_platform.is_rocm() else None +TORCH_DEVICE_IDENTITY = torch.ones(1, dtype=torch.float32) def cutlass_fp8_supported() -> bool: @@ -166,8 +165,7 @@ def apply_fp8_linear( # Making sure the dummy tensor is on the same device as the weight global TORCH_DEVICE_IDENTITY - if (TORCH_DEVICE_IDENTITY is not None - and TORCH_DEVICE_IDENTITY.device != weight.device): + if TORCH_DEVICE_IDENTITY.device != weight.device: TORCH_DEVICE_IDENTITY = TORCH_DEVICE_IDENTITY.to(weight.device) # GEMM From d3859f18915a1e3c50ee88bcbb0af4f4fe754b4e Mon Sep 17 00:00:00 2001 From: Yan Ma Date: Thu, 7 Nov 2024 09:29:03 +0800 Subject: [PATCH 011/183] [Misc][XPU] Upgrade to Pytorch 2.5 for xpu backend (#9823) Signed-off-by: Kunshang Ji Signed-off-by: yan ma Co-authored-by: Kunshang Ji --- Dockerfile.xpu | 12 +++++++++- requirements-xpu.txt | 8 +++---- vllm/_ipex_ops.py | 33 +++++++------------------ vllm/attention/backends/ipex_attn.py | 36 +++++++++++++++------------- 4 files changed, 43 insertions(+), 46 deletions(-) diff --git a/Dockerfile.xpu b/Dockerfile.xpu index 0ecb46df6256c..63bc682770422 100644 --- a/Dockerfile.xpu +++ b/Dockerfile.xpu @@ -30,9 +30,19 @@ COPY requirements-common.txt /workspace/vllm/requirements-common.txt RUN --mount=type=cache,target=/root/.cache/pip \ pip install --no-cache-dir \ - --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \ -r requirements-xpu.txt +RUN git clone https://github.com/intel/pti-gpu && \ + cd pti-gpu/sdk && \ + git checkout 6c491f07a777ed872c2654ca9942f1d0dde0a082 && \ + mkdir build && \ + cd build && \ + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/icpx_toolchain.cmake -DBUILD_TESTING=OFF .. && \ + make -j && \ + cmake --install . --config Release --prefix "/usr/local" + +ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/" + COPY . . 
ARG GIT_REPO_CHECK RUN --mount=type=bind,source=.git,target=.git \ diff --git a/requirements-xpu.txt b/requirements-xpu.txt index eb76a33dab5c2..e41295792283f 100644 --- a/requirements-xpu.txt +++ b/requirements-xpu.txt @@ -8,9 +8,9 @@ packaging setuptools-scm>=8 wheel jinja2 -# Following pkgs retrieved from https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ -torch == 2.3.1+cxx11.abi -intel-extension-for-pytorch == 2.3.110+xpu -oneccl_bind_pt == 2.3.100+xpu + +torch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/torch-2.5.0a0%2Bgite84e33f-cp310-cp310-linux_x86_64.whl +intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.5.10%2Bgit9d489a8-cp310-cp310-linux_x86_64.whl +oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/oneccl_bind_pt-2.5.0%2Bxpu-cp310-cp310-linux_x86_64.whl triton-xpu == 3.0.0b1 diff --git a/vllm/_ipex_ops.py b/vllm/_ipex_ops.py index 31fcc4c3256a8..28b804f765a3a 100644 --- a/vllm/_ipex_ops.py +++ b/vllm/_ipex_ops.py @@ -74,20 +74,12 @@ def paged_attention_v1( assert kv_cache_dtype == "auto" num_heads = out.size(1) num_queries_per_tokens = num_heads // num_kv_heads - head_mapping = torch.arange( - 0, - num_kv_heads, - device=query.device, - dtype=torch.int32, - ).view(num_kv_heads, - 1).repeat_interleave(num_queries_per_tokens).flatten() - # todo: ipex will refactor namespace - torch.xpu.paged_attention_v1( # type: ignore + ipex.llm.modules.PagedAttention.single_query_kv_attention( out, query.contiguous(), key_cache.view_as(value_cache), value_cache, - head_mapping, + num_queries_per_tokens, scale, block_tables, context_lens, @@ -124,26 +116,15 @@ def paged_attention_v2( assert kv_cache_dtype == "auto" num_heads = out.size(1) num_queries_per_tokens = num_heads // num_kv_heads - head_mapping = torch.arange( - 0, - num_kv_heads, - dtype=torch.int32, - device=query.device, - ).view(num_kv_heads, - 1).repeat_interleave(num_queries_per_tokens).flatten() - # todo: ipex will refactor namespace - torch.xpu.paged_attention_v2( # type: ignore + ipex.llm.modules.PagedAttention.single_query_kv_attention( out, - exp_sum, - max_logits, - tmp_out, query.contiguous(), key_cache.view_as(value_cache), value_cache, - head_mapping, + num_queries_per_tokens, + scale, block_tables, context_lens, - scale, block_size, max_context_len, alibi_slopes, @@ -202,6 +183,7 @@ def varlen_attention( is_causal: bool, return_softmax: bool, gen_: torch.Generator, + logits_soft_cap: float, ) -> None: ipex.llm.functional.varlen_attention(query.contiguous(), key.contiguous(), @@ -210,7 +192,8 @@ def varlen_attention( max_seqlen_q, max_seqlen_k, pdropout, softmax_scale, zero_tensors, is_causal, - return_softmax, gen_) + return_softmax, gen_, + logits_soft_cap) @staticmethod def reshape_and_cache( diff --git a/vllm/attention/backends/ipex_attn.py b/vllm/attention/backends/ipex_attn.py index 1eb5fe10d76db..87bdb1e0e6565 100644 --- a/vllm/attention/backends/ipex_attn.py +++ b/vllm/attention/backends/ipex_attn.py @@ -119,8 +119,6 @@ def __init__( if blocksparse_params is not None: raise ValueError( "IPEX backend does not support block-sparse attention.") - if logits_soft_cap is not None: - raise ValueError("IPEX backend does not support logits_soft_cap.") self.num_heads = num_heads self.head_size = head_size self.scale = float(scale) @@ -135,6 +133,9 @@ def __init__( self.num_queries_per_kv = self.num_heads // self.num_kv_heads self.need_mask = 
(self.alibi_slopes is not None or self.sliding_window is not None) + if logits_soft_cap is None: + logits_soft_cap = 0 + self.logits_soft_cap = logits_soft_cap supported_head_sizes = PagedAttention.get_supported_head_sizes() if head_size not in supported_head_sizes: @@ -239,20 +240,23 @@ def forward( (num_tokens, self.num_heads, self.head_size), dtype=query.dtype, device=query.device) - ipex_ops.varlen_attention(query, - key, - value, - output, - attn_metadata.seqlen_q, - attn_metadata.seqlen_q, - attn_metadata.max_seqlen, - attn_metadata.max_seqlen, - pdropout=0.0, - softmax_scale=self.scale, - zero_tensors=False, - is_causal=True, - return_softmax=False, - gen_=None) + ipex_ops.varlen_attention( + query, + key, + value, + output, + attn_metadata.seqlen_q, + attn_metadata.seqlen_q, + attn_metadata.max_seqlen, + attn_metadata.max_seqlen, + pdropout=0.0, + softmax_scale=self.scale, + zero_tensors=False, + is_causal=True, + return_softmax=False, + gen_=None, + logits_soft_cap=self.logits_soft_cap, + ) else: # prefix-enabled attention raise RuntimeError( From 29862b884bb5c59a35a9bcf62913c233d8b82471 Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 7 Nov 2024 04:07:51 +0000 Subject: [PATCH 012/183] [Frontend] Adjust try/except blocks in API impl (#10056) Signed-off-by: Nick Hill --- vllm/entrypoints/openai/serving_completion.py | 8 ++------ vllm/entrypoints/openai/serving_embedding.py | 8 +++----- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 570232be38379..db31b1153d97e 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -189,13 +189,7 @@ async def create_completion( try: async for i, res in result_generator: final_res_batch[i] = res - except asyncio.CancelledError: - return self.create_error_response("Client disconnected") - except ValueError as e: - # TODO: Use a vllm-specific Validation Error - return self.create_error_response(str(e)) - try: for i, final_res in enumerate(final_res_batch): assert final_res is not None @@ -217,6 +211,8 @@ async def create_completion( tokenizer, request_metadata, ) + except asyncio.CancelledError: + return self.create_error_response("Client disconnected") except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index 917856cd2b2dd..bbe7db8f13231 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -205,12 +205,8 @@ async def create_embedding( try: async for i, res in result_generator: final_res_batch[i] = res - except asyncio.CancelledError: - return self.create_error_response("Client disconnected") - try: - for final_res in final_res_batch: - assert final_res is not None + assert all(final_res is not None for final_res in final_res_batch) final_res_batch_checked = cast(List[EmbeddingRequestOutput], final_res_batch) @@ -218,6 +214,8 @@ async def create_embedding( response = request_output_to_embedding_response( final_res_batch_checked, request_id, created_time, model_name, encoding_format) + except asyncio.CancelledError: + return self.create_error_response("Client disconnected") except ValueError as e: # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) From a4b3e0c1e999d214c6355b16a1c68250e6c030e2 Mon Sep 17 00:00:00 2001 From: 
"Li, Jiang" Date: Thu, 7 Nov 2024 12:43:08 +0800 Subject: [PATCH 013/183] [Hardware][CPU] Update torch 2.5 (#9911) Signed-off-by: jiang1.li --- .buildkite/run-cpu-test.sh | 2 +- Dockerfile.cpu | 2 +- cmake/cpu_extension.cmake | 1 + csrc/cpu/attention.cpp | 10 +++ csrc/cpu/cpu_types_x86.hpp | 78 +++++++++++-------- csrc/cpu/dnnl_helper.hpp | 6 ++ csrc/cpu/quant.cpp | 7 ++ .../getting_started/cpu-installation.rst | 6 +- requirements-cpu.txt | 2 +- .../decoder_only/language/test_models.py | 3 +- vllm/executor/cpu_executor.py | 5 -- .../layers/quantization/ipex_quant.py | 2 +- 12 files changed, 76 insertions(+), 48 deletions(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index c331a9c49c0d0..2dbeee8562971 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -46,7 +46,7 @@ docker exec cpu-test bash -c " docker exec cpu-test bash -c " export VLLM_CPU_KVCACHE_SPACE=10 export VLLM_CPU_OMP_THREADS_BIND=48-92 - python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m & + python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 python3 benchmarks/benchmark_serving.py \ --backend vllm \ diff --git a/Dockerfile.cpu b/Dockerfile.cpu index f1a21d6bd13fc..287b4958da4e5 100644 --- a/Dockerfile.cpu +++ b/Dockerfile.cpu @@ -22,7 +22,7 @@ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/li RUN echo 'ulimit -c 0' >> ~/.bashrc -RUN pip install intel_extension_for_pytorch==2.4.0 +RUN pip install intel_extension_for_pytorch==2.5.0 WORKDIR /workspace diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 7237d246ddf55..776a0bb11ae64 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -18,6 +18,7 @@ include_directories("${CMAKE_SOURCE_DIR}/csrc") # list(APPEND CXX_COMPILE_FLAGS "-fopenmp" + "-mf16c" "-DVLLM_CPU_EXTENSION") execute_process(COMMAND cat /proc/cpuinfo diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index abb4e3bea14bb..e3953c7c45719 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -22,6 +22,16 @@ struct KernelVecType { using v_load_vec_type = vec_op::FP32Vec16; }; +template <> +struct KernelVecType { + using q_load_vec_type = vec_op::FP16Vec8; + using q_vec_type = vec_op::FP32Vec16; + using k_load_vec_type = vec_op::FP16Vec16; + using k_vec_type = vec_op::FP32Vec16; + using qk_acc_vec_type = vec_op::FP32Vec16; + using v_load_vec_type = vec_op::FP16Vec16; +}; + #ifdef __AVX512BF16__ template <> struct KernelVecType { diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp index a325153b470cc..12d5757b495be 100644 --- a/csrc/cpu/cpu_types_x86.hpp +++ b/csrc/cpu/cpu_types_x86.hpp @@ -11,10 +11,10 @@ static_assert(false, "AVX2 must be supported for the current implementation."); namespace vec_op { -// FIXME: FP16 is not fully supported in Torch-CPU #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ - AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) + AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ + AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) 
\ AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) @@ -50,37 +50,37 @@ template struct Vec { struct FP32Vec8; struct FP32Vec16; -#ifdef __AVX512FP16__ struct FP16Vec8 : public Vec { constexpr static int VEC_ELEM_NUM = 8; - __m128h reg; + __m128i reg; - explicit FP16Vec8(_Float16 v) : reg(_mm_set1_ph(v)) {} + explicit FP16Vec8(const void *ptr) + : reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {} - explicit FP16Vec8(const void *ptr) : reg(_mm_loadu_ph(ptr)) {} + explicit FP16Vec8(const FP32Vec8 &); - explicit FP16Vec8(__m128h data) : reg(data) {} + void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; } +}; - FP16Vec8 operator*(const FP16Vec8 &b) const { - return FP16Vec8(_mm_mul_ph(reg, b.reg)); - } +struct FP16Vec16 : public Vec { + constexpr static int VEC_ELEM_NUM = 16; - FP16Vec8 operator+(const FP16Vec8 &b) const { - return FP16Vec8(_mm_add_ph(reg, b.reg)); - } + __m256i reg; - FP16Vec8 operator-(const FP16Vec8 &b) const { - return FP16Vec8(_mm_sub_ph(reg, b.reg)); - } + explicit FP16Vec16(const void *ptr) + : reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {} - FP16Vec8 operator/(const FP16Vec8 &b) const { - return FP16Vec8(_mm_div_ph(reg, b.reg)); - } + explicit FP16Vec16(const FP32Vec16 &); - void save(void *ptr) const { _mm_storeu_ph(ptr, reg); } + void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; } + + void save(void* ptr, const int elem_num) const { + constexpr uint32_t M = 0xFFFFFFFF; + __mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num)); + _mm256_mask_storeu_epi16(ptr, mask, reg); + } }; -#endif struct BF16Vec8 : public Vec { constexpr static int VEC_ELEM_NUM = 8; @@ -202,9 +202,7 @@ struct FP32Vec8 : public Vec { explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {} -#ifdef __AVX512FP16__ - explicit FP32Vec8(__m128h v) : reg(_mm256_cvtph_ps(_mm_castph_si128(v))) {} -#endif + explicit FP32Vec8(const FP16Vec8 &v) : reg(_mm256_cvtph_ps(v.reg)) {} explicit FP32Vec8(const BF16Vec8 &v) : reg(_mm256_castsi256_ps( @@ -323,6 +321,10 @@ struct FP32Vec16 : public Vec { : reg(_mm512_castsi512_ps( _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {} + explicit FP32Vec16(const FP16Vec16 &v) : reg(_mm512_cvtph_ps(v.reg)) {} + + explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} explicit FP32Vec16(const INT32Vec16 &v) @@ -534,24 +536,34 @@ template using vec_t = typename VecType::vec_type; template <> struct VecType { using vec_type = FP32Vec8; }; -#ifdef __AVX512FP16__ -template <> struct VecType { using vec_type = FP16Vec16; }; -#endif +template <> struct VecType { using vec_type = FP16Vec8; }; template <> struct VecType { using vec_type = BF16Vec8; }; template void storeFP32(float v, T *ptr) { *ptr = v; } -#ifdef __AVX512FP16__ -template <> inline void storeFP32(float v, c10::Half *ptr) { - *reinterpret_cast<_Float16 *>(ptr) = v; -} -#endif - inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) { acc = acc + a * b; } +template <> inline void storeFP32(float v, c10::Half *ptr) { + *reinterpret_cast(ptr) = + _cvtss_sh(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); +} + +inline FP16Vec8::FP16Vec8(const FP32Vec8 &v) + : reg(_mm256_cvtps_ph(v.reg, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} + +#ifdef __AVX512F__ +inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) + : reg(_mm512_cvtps_ph(v.reg, + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {} +#else +inline FP16Vec16::FP16Vec16(const FP32Vec16 &v) + : 
reg(_mm256_insertf128_si256(_mm256_castsi128_si256(FP16Vec8(FP32Vec8(v.reg_low)).reg), FP16Vec8(FP32Vec8(v.reg_low)).reg, 1)) {} +#endif + #ifdef __AVX512BF16__ template <> inline void storeFP32(float v, c10::BFloat16 *ptr) { *reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v); diff --git a/csrc/cpu/dnnl_helper.hpp b/csrc/cpu/dnnl_helper.hpp index 024ad4ae43da8..8b5011dc065f0 100644 --- a/csrc/cpu/dnnl_helper.hpp +++ b/csrc/cpu/dnnl_helper.hpp @@ -2,6 +2,7 @@ #define DNNL_HELPER_HPP #include +#include #include "oneapi/dnnl/dnnl.hpp" @@ -32,6 +33,11 @@ struct DNNLType { static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16; }; +template <> +struct DNNLType { + static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f16; +}; + template constexpr inline dnnl::memory::data_type get_dnnl_type() { return DNNLType>::type; diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp index b493fd793818a..f42fa2361a2db 100644 --- a/csrc/cpu/quant.cpp +++ b/csrc/cpu/quant.cpp @@ -23,6 +23,13 @@ struct KernelVecType { using cvt_vec_type = vec_op::FP32Vec16; }; +template <> +struct KernelVecType { + using load_vec_type = vec_op::FP16Vec16; + using azp_adj_load_vec_type = vec_op::INT32Vec16; + using cvt_vec_type = vec_op::FP32Vec16; +}; + #ifdef __AVX512F__ template void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output, diff --git a/docs/source/getting_started/cpu-installation.rst b/docs/source/getting_started/cpu-installation.rst index d12aeebbbc184..69530fd778c55 100644 --- a/docs/source/getting_started/cpu-installation.rst +++ b/docs/source/getting_started/cpu-installation.rst @@ -3,13 +3,13 @@ Installation with CPU ======================== -vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32 and BF16. vLLM CPU backend supports the following vLLM features: +vLLM initially supports basic model inferencing and serving on x86 CPU platform, with data types FP32, FP16 and BF16. vLLM CPU backend supports the following vLLM features: - Tensor Parallel (``-tp = N``) - Quantization (``INT8 W8A8, AWQ``) .. note:: - FP16 data type and more advanced features on `chunked-prefill`, `prefix-caching` and `FP8 KV cache` are under development and will be available soon. + More advanced features on `chunked-prefill`, `prefix-caching` and `FP8 KV cache` are under development and will be available soon. Table of contents: @@ -72,8 +72,6 @@ Build from source $ VLLM_TARGET_DEVICE=cpu python setup.py install .. note:: - - BF16 is the default data type in the current CPU backend (that means the backend will cast FP16 to BF16), and is compatible will all CPUs with AVX512 ISA support. - - AVX512_BF16 is an extension ISA provides native BF16 data type conversion and vector product instructions, will brings some performance improvement compared with pure AVX512. The CPU backend build script will check the host CPU flags to determine whether to enable AVX512_BF16. - If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable VLLM_CPU_AVX512BF16=1 before the building. 
diff --git a/requirements-cpu.txt b/requirements-cpu.txt index 27ca8ca5dbc58..749b03a0603d8 100644 --- a/requirements-cpu.txt +++ b/requirements-cpu.txt @@ -2,5 +2,5 @@ -r requirements-common.txt # Dependencies for x86_64 CPUs -torch == 2.4.0+cpu; platform_machine != "ppc64le" +torch == 2.5.1+cpu; platform_machine != "ppc64le" torchvision; platform_machine != "ppc64le" # required for the image processor of phi3v, this must be updated alongside torch diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index 05117666f8c3f..d705909c24bf8 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -32,8 +32,7 @@ "openbmb/MiniCPM3-4B", ] -# TODO: remove this after CPU float16 support ready -target_dtype = "float" if current_platform.is_cpu() else "half" +target_dtype = "half" @pytest.mark.parametrize("model", MODELS) diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index ab3ebb4e43d18..4ceb5a837dd7f 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -2,8 +2,6 @@ from functools import partial from typing import Any, Awaitable, List, Optional, Set, Tuple, Union -import torch - import vllm.envs as envs from vllm.config import (CacheConfig, ModelConfig, ParallelConfig, SchedulerConfig) @@ -316,9 +314,6 @@ async def check_health_async(self) -> None: def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig: - if config.dtype == torch.float16: - logger.warning("float16 is not supported on CPU, casting to bfloat16.") - config.dtype = torch.bfloat16 # Reminder: Please update docs/source/serving/compatibility_matrix.rst # If the feature combo become valid if not config.enforce_eager: diff --git a/vllm/model_executor/layers/quantization/ipex_quant.py b/vllm/model_executor/layers/quantization/ipex_quant.py index 43f4502f7455c..330c2ad195d78 100644 --- a/vllm/model_executor/layers/quantization/ipex_quant.py +++ b/vllm/model_executor/layers/quantization/ipex_quant.py @@ -54,7 +54,7 @@ def get_name(cls) -> str: @classmethod def get_supported_act_dtypes(cls) -> List[torch.dtype]: - return [torch.bfloat16] + return [torch.bfloat16, torch.float16] @classmethod def get_min_capability(cls) -> int: From e7b84c394d221d0c528584511f56ef3359630706 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 6 Nov 2024 21:06:41 -0800 Subject: [PATCH 014/183] [doc] add back Python 3.8 ABI (#10100) Signed-off-by: youkaichao --- docs/source/getting_started/installation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst index efc050dd1bfb2..f02626bda4c64 100644 --- a/docs/source/getting_started/installation.rst +++ b/docs/source/getting_started/installation.rst @@ -66,7 +66,7 @@ If you want to access the wheels for previous commits, you can specify the commi $ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch $ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl -Note that the wheels are built with Python 3.9 ABI (see `PEP 425 `_ for more details about ABI), so **they are compatible with Python 3.9 and later**. The version string in the wheel file name (``1.0.0.dev``) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. 
+Note that the wheels are built with Python 3.8 ABI (see `PEP 425 `_ for more details about ABI), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (``1.0.0.dev``) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata. Although we don't support Python 3.8 any more (because PyTorch 2.5 dropped support for Python 3.8), the wheels are still built with Python 3.8 ABI to keep the same wheel name as before. Another way to access the latest code is to use the docker images: From 1fa020c539485e398d10ca9be376c1d0d87ae19b Mon Sep 17 00:00:00 2001 From: Nick Hill Date: Thu, 7 Nov 2024 05:06:57 +0000 Subject: [PATCH 015/183] [V1][BugFix] Fix Generator construction in greedy + seed case (#10097) Signed-off-by: Nick Hill --- vllm/v1/worker/gpu_model_runner.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index e6383b59cf7a3..9bb49a21453d0 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -146,7 +146,7 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: for req_data in scheduler_output.scheduled_new_reqs: req_id = req_data.req_id sampling_params = req_data.sampling_params - if sampling_params.seed is not None: + if sampling_params.sampling_type == SamplingType.RANDOM_SEED: generator = torch.Generator(device=self.device) generator.manual_seed(sampling_params.seed) else: @@ -382,7 +382,8 @@ def execute_model( # Rewind the generator state as if the token was not sampled. generator = self.input_batch.generators.get(i) if generator is not None: - generator.set_offset(generator.get_offset() - 1) + # This relies on cuda-specific torch-internal impl details + generator.set_offset(generator.get_offset() - 4) if sampler_output.logprob_token_ids is None: logprob_token_ids = None From db7db4aab9fd23e818d89ca9037099d30c071a5a Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 7 Nov 2024 14:00:21 +0800 Subject: [PATCH 016/183] [Misc] Consolidate ModelConfig code related to HF config (#10104) Signed-off-by: DarkLight1337 --- docs/source/serving/compatibility_matrix.rst | 2 +- tests/test_config.py | 38 ++++++++++++++++++++ vllm/config.py | 14 ++++---- vllm/inputs/preprocess.py | 2 +- vllm/transformers_utils/config.py | 9 +++++ vllm/utils.py | 4 --- vllm/worker/cpu_model_runner.py | 9 +---- vllm/worker/cpu_worker.py | 5 +-- vllm/worker/model_runner.py | 23 +++++------- vllm/worker/worker.py | 5 +-- 10 files changed, 68 insertions(+), 43 deletions(-) diff --git a/docs/source/serving/compatibility_matrix.rst b/docs/source/serving/compatibility_matrix.rst index cab19e4ec5b6c..f629b3ca78318 100644 --- a/docs/source/serving/compatibility_matrix.rst +++ b/docs/source/serving/compatibility_matrix.rst @@ -359,7 +359,7 @@ Feature x Hardware - ✅ - ✅ - ✅ - - `✗ `__ + - ✅ - ✗ * - :abbr:`logP (Logprobs)` - ✅ diff --git a/tests/test_config.py b/tests/test_config.py index 69918b67607d9..5211049bf0011 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -165,3 +165,41 @@ def test_rope_customization(): assert getattr(longchat_model_config.hf_config, "rope_scaling", None) == TEST_ROPE_SCALING assert longchat_model_config.max_model_len == 4096 + + +@pytest.mark.parametrize(("model_id", "is_encoder_decoder"), [ + ("facebook/opt-125m", False), + ("facebook/bart-base", True), + ("meta-llama/Llama-3.2-1B", False), + ("meta-llama/Llama-3.2-11B-Vision", 
True), +]) +def test_is_encoder_decoder(model_id, is_encoder_decoder): + config = ModelConfig( + model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + dtype="float16", + seed=0, + ) + + assert config.is_encoder_decoder == is_encoder_decoder + + +@pytest.mark.parametrize(("model_id", "uses_mrope"), [ + ("facebook/opt-125m", False), + ("Qwen/Qwen2-VL-2B-Instruct", True), +]) +def test_uses_mrope(model_id, uses_mrope): + config = ModelConfig( + model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + dtype="float16", + seed=0, + ) + + assert config.uses_mrope == uses_mrope diff --git a/vllm/config.py b/vllm/config.py index 91bbbfec4b7b3..c7fad3a261858 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -15,7 +15,8 @@ from vllm.tracing import is_otel_available, otel_import_error_traceback from vllm.transformers_utils.config import (ConfigFormat, get_config, get_hf_image_processor_config, - get_hf_text_config) + get_hf_text_config, + is_encoder_decoder, uses_mrope) from vllm.utils import (GiB_bytes, cuda_device_count_stateless, get_cpu_memory, print_warning_once) @@ -667,12 +668,13 @@ def get_multimodal_config(self) -> "MultiModalConfig": return self.multimodal_config @property - def is_encoder_decoder_model(self) -> bool: + def is_encoder_decoder(self) -> bool: """Extract the HF encoder/decoder model flag.""" - return getattr( - self.hf_config, "is_encoder_decoder", - False) or (hasattr(self.hf_config, "text_config") and getattr( - self.hf_config.text_config, "is_encoder_decoder", False)) + return is_encoder_decoder(self.hf_config) + + @property + def uses_mrope(self) -> bool: + return uses_mrope(self.hf_config) @property def is_multimodal_model(self) -> bool: diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index a5c787a56b5a9..509b0448b9e51 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -580,4 +580,4 @@ async def preprocess_async( ) def is_encoder_decoder_model(self): - return self.model_config.is_encoder_decoder_model + return self.model_config.is_encoder_decoder diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 1a5870aa4f84c..415d8bf7cc2bb 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -129,6 +129,15 @@ def uses_mrope(config: PretrainedConfig) -> bool: return "mrope_section" in rope_scaling +def is_encoder_decoder(config: PretrainedConfig) -> bool: + """Detect if the model with this config is used as an encoder/decoder.""" + text_config = getattr(config, "text_config", None) + if text_config is not None: + return is_encoder_decoder(text_config) + + return getattr(config, "is_encoder_decoder", False) + + def get_config( model: Union[str, Path], trust_remote_code: bool, diff --git a/vllm/utils.py b/vllm/utils.py index d78130873d3dc..13d7f6d475346 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -88,9 +88,6 @@ "currently supported with encoder/" "decoder models.") -STR_NOT_IMPL_ENC_DEC_CPU = ("CPU is not currently supported with " - "encoder/decoder models.") - # Efficiently import all enc/dec error strings # rather than having to import all of the above STR_NOT_IMPL_ENC_DEC_ERR_STRS = { @@ -105,7 +102,6 @@ "STR_NOT_IMPL_ENC_DEC_SPEC_DEC": STR_NOT_IMPL_ENC_DEC_SPEC_DEC, "STR_NOT_IMPL_ENC_DEC_BACKEND": STR_NOT_IMPL_ENC_DEC_BACKEND, "STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER": STR_NOT_IMPL_ENC_DEC_PROMPT_ADAPTER, - "STR_NOT_IMPL_ENC_DEC_CPU": STR_NOT_IMPL_ENC_DEC_CPU } # Constants 
related to forcing the attention backend selection diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index fdd72a452f2ad..26a15ed645c43 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -18,7 +18,6 @@ MultiModalInputs, MultiModalPlaceholderMap) from vllm.sequence import (IntermediateTensors, SequenceData, SequenceGroupMetadata) -from vllm.transformers_utils.config import uses_mrope from vllm.utils import make_tensor_with_pad from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase, @@ -163,7 +162,7 @@ def _compute_multi_modal_input(self, seq_group: SequenceGroupMetadata, # special processing for mrope position deltas. mrope_positions = None - if self.runner.model_is_mrope: + if self.runner.model_config.uses_mrope: image_grid_thw = mm_kwargs.get("image_grid_thw", None) video_grid_thw = mm_kwargs.get("video_grid_thw", None) assert image_grid_thw is not None or video_grid_thw is not None, ( @@ -446,12 +445,6 @@ def __init__( # Lazy initialization. self.model: nn.Module # Set after init_Model - @property - def model_is_mrope(self) -> bool: - """Detect if the model has "mrope" rope_scaling type. - mrope requires keep "rope_deltas" between prompt and decoding phases.""" - return uses_mrope(self.model_config.hf_config) - def load_model(self) -> None: self.model = get_model(vllm_config=self.vllm_config) diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 3778707ae07e8..2914f520d823c 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -151,7 +151,7 @@ def __init__( self.local_omp_cpuid = omp_cpuids.split("|")[rank] ModelRunnerClass: Type[CPUModelRunner] = CPUModelRunner - if self._is_encoder_decoder_model(): + if self.model_config.is_encoder_decoder: ModelRunnerClass = CPUEncoderDecoderModelRunner self.model_runner: CPUModelRunner = ModelRunnerClass( vllm_config=vllm_config, @@ -188,9 +188,6 @@ def stop_profile(self): raise RuntimeError("Profiler is not enabled.") self.profiler.stop() - def _is_encoder_decoder_model(self): - return self.model_config.is_encoder_decoder_model - def init_device(self) -> None: if self.local_omp_cpuid != "all": ret = torch.ops._C_utils.init_cpu_threads_env(self.local_omp_cpuid) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 1e8ea4e8e79cf..a1ec2e85be7b8 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -47,7 +47,6 @@ LRUCacheWorkerPromptAdapterManager) from vllm.sampling_params import SamplingParams from vllm.sequence import IntermediateTensors, SequenceGroupMetadata -from vllm.transformers_utils.config import uses_mrope from vllm.utils import (DeviceMemoryProfiler, GiB_bytes, PyObjectCache, async_tensor_h2d, flatten_2d_lists, is_pin_memory_available, supports_dynamo, @@ -493,7 +492,7 @@ def _compute_lens(self, inter_data: InterDataForSeqGroup, seq_idx: int, context_len = seq_data.get_num_computed_tokens() seq_len = min(seq_len, context_len + token_chunk_size) elif self.runner.scheduler_config.is_multi_step or \ - self.runner.model_config.is_encoder_decoder_model: + self.runner.model_config.is_encoder_decoder: context_len = seq_len - 1 else: context_len = seq_data.get_num_computed_tokens() @@ -666,7 +665,7 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, inter_data.multi_modal_placeholder_maps = placeholder_maps # special processing for mrope position deltas. 
- if self.runner.model_is_mrope: + if self.runner.model_config.uses_mrope: image_grid_thw = mm_kwargs.get("image_grid_thw", None) video_grid_thw = mm_kwargs.get("video_grid_thw", None) assert image_grid_thw is not None or video_grid_thw is not None, ( @@ -711,7 +710,7 @@ def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): encoder_seq_len = 0 - if self.runner.model_config.is_encoder_decoder_model: + if self.runner.model_config.is_encoder_decoder: encoder_seq_len = seq_group_metadata.encoder_seq_data.get_len() inter_data = self.init_cached_inter_data( @@ -837,7 +836,7 @@ def build(self) -> ModelInputForGPU: if not inter_data.is_prompt: max_decode_seq_len = max(max_decode_seq_len, max(inter_data.seq_lens)) - if self.runner.model_config.is_encoder_decoder_model: + if self.runner.model_config.is_encoder_decoder: max_encoder_seq_len = max(max_encoder_seq_len, inter_data.encoder_seq_len) @@ -1375,12 +1374,6 @@ def list_prompt_adapters(self) -> Set[int]: raise RuntimeError("PromptAdapter is not enabled.") return self.prompt_adapter_manager.list_adapters() - @property - def model_is_mrope(self) -> bool: - """Detect if the model has "mrope" rope_scaling type. - mrope requires keep "rope_deltas" between prompt and decoding phases.""" - return uses_mrope(self.model_config.hf_config) - @torch.inference_mode() def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: """Cuda graph capture a model. @@ -1411,7 +1404,7 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: max_batch_size = self.max_batchsize_to_capture input_tokens = torch.zeros(max_batch_size, dtype=torch.long).cuda() input_positions = torch.zeros(max_batch_size, dtype=torch.long).cuda() - if self.model_is_mrope: + if self.model_config.uses_mrope: input_positions = torch.tile(input_positions, (3, 1)) # Prepare dummy previous_hidden_states only if needed by the model. # This is used by draft models such as EAGLE. @@ -1447,7 +1440,7 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: self.attn_state.graph_capture_get_metadata_for_batch( batch_size, is_encoder_decoder_model=self.model_config. - is_encoder_decoder_model)) + is_encoder_decoder)) if self.lora_config: lora_mapping = LoRAMapping( @@ -1466,7 +1459,7 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: graph_runner = CUDAGraphRunner( self.model, self.attn_backend.get_name(), self.attn_state.graph_clone(batch_size), - self.model_config.is_encoder_decoder_model) + self.model_config.is_encoder_decoder) capture_inputs = { "input_ids": @@ -1497,7 +1490,7 @@ def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: self.model.get_seqlen_agnostic_capture_inputs( batch_size) }) - if self.model_config.is_encoder_decoder_model: + if self.model_config.is_encoder_decoder: # add the additional inputs to capture for # encoder-decoder models. 
self._update_inputs_to_capture_for_enc_dec_model( diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 8928936b4f9fc..d8c8011a585d8 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -77,7 +77,7 @@ def __init__( ModelRunnerClass = model_runner_cls elif model_config.task == "embedding": ModelRunnerClass = EmbeddingModelRunner - elif self._is_encoder_decoder_model(): + elif self.model_config.is_encoder_decoder: ModelRunnerClass = EncoderDecoderModelRunner self.model_runner: GPUModelRunnerBase = ModelRunnerClass( vllm_config=self.vllm_config, @@ -119,9 +119,6 @@ def stop_profile(self): raise RuntimeError("Profiler is not enabled.") self.profiler.stop() - def _is_encoder_decoder_model(self): - return self.model_config.is_encoder_decoder_model - def init_device(self) -> None: if self.device_config.device.type == "cuda": # torch.distributed.all_reduce does not free the input tensor until From 104d729656fe746d1b91a0528e51e5efc8d14b4a Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Thu, 7 Nov 2024 01:54:46 -0500 Subject: [PATCH 017/183] [CI/Build] re-add codespell to CI (#10083) Signed-off-by: Russell Bryant --- .github/workflows/codespell.yml | 45 +++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 .github/workflows/codespell.yml diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml new file mode 100644 index 0000000000000..dfb087ff66913 --- /dev/null +++ b/.github/workflows/codespell.yml @@ -0,0 +1,45 @@ +name: codespell + +on: + # Trigger the workflow on push or pull request, + # but only for the main branch + push: + branches: + - main + paths: + - "**/*.py" + - "**/*.md" + - "**/*.rst" + - pyproject.toml + - requirements-lint.txt + - .github/workflows/codespell.yml + pull_request: + branches: + - main + paths: + - "**/*.py" + - "**/*.md" + - "**/*.rst" + - pyproject.toml + - requirements-lint.txt + - .github/workflows/codespell.yml + +jobs: + codespell: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.12"] + steps: + - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements-lint.txt + - name: Spelling check with codespell + run: | + codespell --toml pyproject.toml From d7263a1bb837648bec67d99ed35db56c58832d3f Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Thu, 7 Nov 2024 02:50:35 -0500 Subject: [PATCH 018/183] Doc: Improve benchmark documentation (#9927) Signed-off-by: Rafael Vasquez --- docs/source/dev/profiling/profiling_index.rst | 5 +-- docs/source/index.rst | 4 +-- docs/source/performance/benchmarks.rst | 33 +++++++++++++++++++ .../performance_benchmark/benchmarks.rst | 23 ------------- 4 files changed, 38 insertions(+), 27 deletions(-) create mode 100644 docs/source/performance/benchmarks.rst delete mode 100644 docs/source/performance_benchmark/benchmarks.rst diff --git a/docs/source/dev/profiling/profiling_index.rst b/docs/source/dev/profiling/profiling_index.rst index 9e8b2f1817567..a422b1fcda521 100644 --- a/docs/source/dev/profiling/profiling_index.rst +++ b/docs/source/dev/profiling/profiling_index.rst @@ -1,5 +1,6 @@ -Profiling vLLM -================================= +============== +Profiling vLLM +============== We support tracing vLLM workers using the 
``torch.profiler`` module. You can enable tracing by setting the ``VLLM_TORCH_PROFILER_DIR`` environment variable to the directory where you want to save the traces: ``VLLM_TORCH_PROFILER_DIR=/mnt/traces/`` diff --git a/docs/source/index.rst b/docs/source/index.rst index 51add1fd4d0ab..38dad25e18c02 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -126,9 +126,9 @@ Documentation .. toctree:: :maxdepth: 1 - :caption: Performance benchmarks + :caption: Performance - performance_benchmark/benchmarks + performance/benchmarks .. toctree:: :maxdepth: 2 diff --git a/docs/source/performance/benchmarks.rst b/docs/source/performance/benchmarks.rst new file mode 100644 index 0000000000000..6d4d7b544cb5d --- /dev/null +++ b/docs/source/performance/benchmarks.rst @@ -0,0 +1,33 @@ +.. _benchmarks: + +================ +Benchmark Suites +================ + +vLLM contains two sets of benchmarks: + ++ :ref:`Performance benchmarks ` ++ :ref:`Nightly benchmarks ` + + +.. _performance_benchmarks: + +Performance Benchmarks +---------------------- + +The performance benchmarks are used for development to confirm whether new changes improve performance under various workloads. They are triggered on every commit with both the ``perf-benchmarks`` and ``ready`` labels, and when a PR is merged into vLLM. + +The latest performance results are hosted on the public `vLLM Performance Dashboard `_. + +More information on the performance benchmarks and their parameters can be found `here `__. + +.. _nightly_benchmarks: + +Nightly Benchmarks +------------------ + +These compare vLLM's performance against alternatives (``tgi``, ``trt-llm``, and ``lmdeploy``) when there are major updates of vLLM (e.g., bumping up to a new version). They are primarily intended for consumers to evaluate when to choose vLLM over other options and are triggered on every commit with both the ``perf-benchmarks`` and ``nightly-benchmarks`` labels. + +The latest nightly benchmark results are shared in major release blog posts such as `vLLM v0.6.0 `_. + +More information on the nightly benchmarks and their parameters can be found `here `__. \ No newline at end of file diff --git a/docs/source/performance_benchmark/benchmarks.rst b/docs/source/performance_benchmark/benchmarks.rst deleted file mode 100644 index e5c8d6a55de63..0000000000000 --- a/docs/source/performance_benchmark/benchmarks.rst +++ /dev/null @@ -1,23 +0,0 @@ -.. _benchmarks: - -Benchmark suites of vLLM -======================== - - - -vLLM contains two sets of benchmarks: - -+ **Performance benchmarks**: benchmark vLLM's performance under various workloads at a high frequency (when a pull request (PR for short) of vLLM is being merged). See `vLLM performance dashboard `_ for the latest performance results. - -+ **Nightly benchmarks**: compare vLLM's performance against alternatives (tgi, trt-llm, and lmdeploy) when there are major updates of vLLM (e.g., bumping up to a new version). The latest results are available in the `vLLM GitHub README `_. - - -Trigger a benchmark -------------------- - -The performance benchmarks and nightly benchmarks can be triggered by submitting a PR to vLLM, and label the PR with `perf-benchmarks` and `nightly-benchmarks`. - - -.. note:: - - Please refer to `vLLM performance benchmark descriptions `_ and `vLLM nightly benchmark descriptions `_ for detailed descriptions on benchmark environment, workload and metrics. 
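As a companion to the profiling page touched at the top of this patch, here is a hedged sketch of how the documented ``VLLM_TORCH_PROFILER_DIR`` switch is typically exercised. The ``start_profile``/``stop_profile`` calls are assumed to be exposed on the ``LLM`` entrypoint (mirroring the worker methods of the same name), and the trace directory and model name are placeholders::

    import os

    # Must be set before the engine is created so the workers pick it up.
    os.environ["VLLM_TORCH_PROFILER_DIR"] = "/mnt/traces/"

    from vllm import LLM, SamplingParams

    llm = LLM(model="facebook/opt-125m")
    llm.start_profile()  # assumed pass-through to the workers' start_profile()
    llm.generate(["Hello, my name is"], SamplingParams(max_tokens=8))
    llm.stop_profile()   # torch.profiler traces are written under /mnt/traces/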
From 6192e9b8fef8492c3e52bd65c7d954a1ef9b40c8 Mon Sep 17 00:00:00 2001 From: Hanzhi Zhou Date: Wed, 6 Nov 2024 23:50:47 -0800 Subject: [PATCH 019/183] [Core][Distributed] Refactor ipc buffer init in CustomAllreduce (#10030) Signed-off-by: Hanzhi Zhou --- csrc/custom_all_reduce.cu | 119 +++++++-------- csrc/custom_all_reduce.cuh | 87 +++++------ csrc/custom_all_reduce_test.cu | 24 +-- csrc/ops.h | 22 ++- csrc/torch_bindings.cpp | 21 +-- tests/distributed/test_custom_all_reduce.py | 4 +- tools/profiler/visualize_layerwise_profile.py | 32 ++-- vllm/_custom_ops.py | 29 ++-- .../device_communicators/custom_all_reduce.py | 140 +++++++----------- 9 files changed, 218 insertions(+), 260 deletions(-) diff --git a/csrc/custom_all_reduce.cu b/csrc/custom_all_reduce.cu index 9b82bec44c3c6..123278bfed71d 100644 --- a/csrc/custom_all_reduce.cu +++ b/csrc/custom_all_reduce.cu @@ -5,32 +5,29 @@ #include "custom_all_reduce.cuh" -// fake pointer type, must match fptr_t type in ops.h +// Fake pointer type, must match fptr_t type in ops.h. +// We use this type alias to indicate when pointers are passed in as int64_t. using fptr_t = int64_t; static_assert(sizeof(void*) == sizeof(fptr_t)); -fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data, - const std::vector& handles, - const std::vector& offsets, int64_t rank, +fptr_t init_custom_ar(const std::vector& fake_ipc_ptrs, + torch::Tensor& rank_data, int64_t rank, bool full_nvlink) { - int world_size = offsets.size(); + int world_size = fake_ipc_ptrs.size(); if (world_size > 8) throw std::invalid_argument("world size > 8 is not supported"); if (world_size % 2 != 0) throw std::invalid_argument("Odd num gpus is not supported for now"); - if (world_size != handles.size()) - throw std::invalid_argument( - "handles length should equal to offsets length"); if (rank < 0 || rank >= world_size) throw std::invalid_argument("invalid rank passed in"); - cudaIpcMemHandle_t ipc_handles[8]; + vllm::Signal* ipc_ptrs[8]; for (int i = 0; i < world_size; i++) { - std::memcpy(&ipc_handles[i], handles[i].data(), sizeof(cudaIpcMemHandle_t)); + ipc_ptrs[i] = reinterpret_cast(fake_ipc_ptrs[i]); } - return (fptr_t) new vllm::CustomAllreduce( - reinterpret_cast(meta.data_ptr()), rank_data.data_ptr(), - rank_data.numel(), ipc_handles, offsets, rank, full_nvlink); + return (fptr_t) new vllm::CustomAllreduce(ipc_ptrs, rank_data.data_ptr(), + rank_data.numel(), rank, world_size, + full_nvlink); } /** @@ -55,26 +52,48 @@ bool _is_weak_contiguous(torch::Tensor& t) { t.numel() * t.element_size()); } -void _all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, - cudaStream_t stream) { +/** + * Performs an out-of-place allreduce and stores result in out. + * + * If _reg_buffer is null, assumes inp.data_ptr() is already IPC-registered. + * Otherwise, _reg_buffer is assumed to be IPC-registered and inp is first + * copied into _reg_buffer. 
+ */ +void all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, + fptr_t _reg_buffer, int64_t reg_buffer_sz_bytes) { auto fa = reinterpret_cast(_fa); + const at::cuda::OptionalCUDAGuard device_guard(device_of(inp)); + auto stream = c10::cuda::getCurrentCUDAStream().stream(); + + TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type()); + TORCH_CHECK_EQ(inp.numel(), out.numel()); TORCH_CHECK(_is_weak_contiguous(out)); + TORCH_CHECK(_is_weak_contiguous(inp)); + auto input_size = inp.numel() * inp.element_size(); + auto reg_buffer = reinterpret_cast(_reg_buffer); + if (reg_buffer) { + TORCH_CHECK_LE(input_size, reg_buffer_sz_bytes); + AT_CUDA_CHECK(cudaMemcpyAsync(reg_buffer, inp.data_ptr(), input_size, + cudaMemcpyDeviceToDevice, stream)); + } else { + reg_buffer = inp.data_ptr(); + } switch (out.scalar_type()) { case at::ScalarType::Float: { - fa->allreduce(stream, reinterpret_cast(inp.data_ptr()), + fa->allreduce(stream, reinterpret_cast(reg_buffer), reinterpret_cast(out.data_ptr()), out.numel()); break; } case at::ScalarType::Half: { - fa->allreduce(stream, reinterpret_cast(inp.data_ptr()), + fa->allreduce(stream, reinterpret_cast(reg_buffer), reinterpret_cast(out.data_ptr()), out.numel()); break; } #if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) case at::ScalarType::BFloat16: { fa->allreduce( - stream, reinterpret_cast(inp.data_ptr()), + stream, reinterpret_cast(reg_buffer), reinterpret_cast(out.data_ptr()), out.numel()); break; } @@ -85,57 +104,41 @@ void _all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, } } -void all_reduce_reg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out) { - const at::cuda::OptionalCUDAGuard device_guard(device_of(inp)); - auto stream = c10::cuda::getCurrentCUDAStream().stream(); - TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type()); - TORCH_CHECK_EQ(inp.numel(), out.numel()); - _all_reduce(_fa, inp, out, stream); -} - -void all_reduce_unreg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& reg_buffer, - torch::Tensor& out) { - const at::cuda::OptionalCUDAGuard device_guard(device_of(inp)); - auto stream = c10::cuda::getCurrentCUDAStream().stream(); - - auto input_size = inp.numel() * inp.element_size(); - TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type()); - TORCH_CHECK_EQ(inp.numel(), out.numel()); - TORCH_CHECK(input_size <= reg_buffer.numel() * reg_buffer.element_size(), - "registered buffer is too small to contain the input"); - AT_CUDA_CHECK(cudaMemcpyAsync(reg_buffer.data_ptr(), inp.data_ptr(), - input_size, cudaMemcpyDeviceToDevice, stream)); - _all_reduce(_fa, reg_buffer, out, stream); -} - void dispose(fptr_t _fa) { - auto fa = reinterpret_cast(_fa); - delete fa; + delete reinterpret_cast(_fa); } int64_t meta_size() { return sizeof(vllm::Signal); } -void register_buffer(fptr_t _fa, torch::Tensor& t, - const std::vector& handles, - const std::vector& offsets) { +void register_buffer(fptr_t _fa, const std::vector& fake_ipc_ptrs) { auto fa = reinterpret_cast(_fa); - fa->register_buffer(handles, offsets, t.data_ptr()); + TORCH_CHECK(fake_ipc_ptrs.size() == fa->world_size_); + void* ipc_ptrs[8]; + for (int i = 0; i < fake_ipc_ptrs.size(); i++) { + ipc_ptrs[i] = reinterpret_cast(fake_ipc_ptrs[i]); + } + fa->register_buffer(ipc_ptrs); } -std::tuple> get_graph_buffer_ipc_meta( - fptr_t _fa) { +// Use vector to represent byte data for python binding compatibility. 
+std::tuple, std::vector> +get_graph_buffer_ipc_meta(fptr_t _fa) { auto fa = reinterpret_cast(_fa); - auto [handle_bytes, offsets] = fa->get_graph_buffer_ipc_meta(); - auto options = - torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU); - auto handles = - torch::empty({static_cast(handle_bytes.size())}, options); - std::memcpy(handles.data_ptr(), handle_bytes.data(), handle_bytes.size()); - return {handles, std::move(offsets)}; + auto [handle, offsets] = fa->get_graph_buffer_ipc_meta(); + std::vector bytes(handle.begin(), handle.end()); + return std::make_tuple(bytes, offsets); } -void register_graph_buffers(fptr_t _fa, const std::vector& handles, +// Use vector to represent byte data for python binding compatibility. +void register_graph_buffers(fptr_t _fa, + const std::vector>& handles, const std::vector>& offsets) { auto fa = reinterpret_cast(_fa); - fa->register_graph_buffers(handles, offsets); + std::vector bytes; + bytes.reserve(handles.size()); + for (int i = 0; i < handles.size(); i++) { + bytes.emplace_back(handles[i].begin(), handles[i].end()); + } + bytes.reserve(handles.size()); + fa->register_graph_buffers(bytes, offsets); } diff --git a/csrc/custom_all_reduce.cuh b/csrc/custom_all_reduce.cuh index a2f7e43300002..6be4d4f2b2eb8 100644 --- a/csrc/custom_all_reduce.cuh +++ b/csrc/custom_all_reduce.cuh @@ -285,46 +285,52 @@ class CustomAllreduce { int world_size_; bool full_nvlink_; - // below are device pointers RankSignals sg_; + // Stores an map from a pointer to its peer pointters from all ranks. std::unordered_map buffers_; Signal* self_sg_; - // stores the registered device pointers from all ranks + // Stores rank data from all ranks. This is mainly for cuda graph purposes. + // For cuda graph to work, all kernel arguments must be fixed during graph + // capture time. However, the peer pointers are not known during graph capture + // time. Therefore, during capture, we increment the rank data pointer and use + // that as the argument to the kernel. The kernel arguments are stored in + // graph_unreg_buffers_. The actual peer pointers will be filled in at the + // memory pointed to by the pointers in graph_unreg_buffers_ when + // the IPC handles are exchanged between ranks. + // + // The overall process looks like this: + // 1. Graph capture. + // 2. Each rank obtains the IPC handles for each addresses used during cuda + // graph capture using get_graph_buffer_ipc_meta. + // 3. (In Python) all gather the IPC handles. + // 4. Obtain the peer pointers by opening the IPC handles, and store them in + // the rank data array at corresponding positions. RankData *d_rank_data_base_, *d_rank_data_end_; std::vector graph_unreg_buffers_; // a map from IPC handles to opened IPC pointers std::map ipc_handles_; /** - * meta is a pointer to device metadata and temporary buffer for allreduce. + * Signals are an array of ipc-enabled buffers from all ranks. + * For each of the buffer, the layout is as follows: + * | -- sizeof(Signal) -- | ------ a few MB ----- | + * The first section is for allreduce synchronization, and the second section + * is for storing the intermediate results required by some allreduce algos. * - * There's a total of sizeof(Signal) of prefix before the actual data, - * so meta + 1 points to actual temporary buffer. - * - * note: this class does not own any device memory. Any required buffers - * are passed in from the constructor + * Note: this class does not own any device memory. Any required buffers + * are passed in from the constructor. 
*/ - CustomAllreduce(Signal* meta, void* rank_data, size_t rank_data_sz, - const cudaIpcMemHandle_t* handles, - const std::vector& offsets, int rank, - bool full_nvlink = true) + CustomAllreduce(Signal** signals, void* rank_data, size_t rank_data_sz, + int rank, int world_size, bool full_nvlink = true) : rank_(rank), - world_size_(offsets.size()), + world_size_(world_size), full_nvlink_(full_nvlink), - self_sg_(meta), + self_sg_(signals[rank]), d_rank_data_base_(reinterpret_cast(rank_data)), d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData)) { for (int i = 0; i < world_size_; i++) { - Signal* rank_sg; - if (i != rank_) { - char* handle = open_ipc_handle(&handles[i]); - handle += offsets[i]; - rank_sg = (Signal*)handle; - } else { - rank_sg = self_sg_; - } - sg_.signals[i] = rank_sg; + sg_.signals[i] = signals[i]; } } @@ -341,11 +347,10 @@ class CustomAllreduce { return it->second; } - std::pair, std::vector> - get_graph_buffer_ipc_meta() { + std::pair> get_graph_buffer_ipc_meta() { auto num_buffers = graph_unreg_buffers_.size(); auto handle_sz = sizeof(cudaIpcMemHandle_t); - std::vector handles(handle_sz * num_buffers, 0); + std::string handles(handle_sz * num_buffers, static_cast(0)); std::vector offsets(num_buffers); for (int i = 0; i < num_buffers; i++) { auto ptr = graph_unreg_buffers_[i]; @@ -370,26 +375,22 @@ class CustomAllreduce { std::to_string(d_rank_data_base_ + num - d_rank_data_end_)); } - void register_buffer(const std::vector& handles, - const std::vector& offsets, void* self) { + /** + * Register already-shared IPC pointers. + */ + void register_buffer(void** ptrs) { check_rank_data_capacity(); RankData data; for (int i = 0; i < world_size_; i++) { - if (i != rank_) { - char* handle = open_ipc_handle(handles[i].data()); - handle += offsets[i]; - data.ptrs[i] = handle; - } else { - data.ptrs[i] = self; - } + data.ptrs[i] = ptrs[i]; } auto d_data = d_rank_data_base_++; CUDACHECK( cudaMemcpy(d_data, &data, sizeof(RankData), cudaMemcpyHostToDevice)); - buffers_[self] = d_data; + buffers_[ptrs[rank_]] = d_data; } - // note: when registering graph buffers, we intentionally choose to not + // Note: when registering graph buffers, we intentionally choose to not // deduplicate the addresses. That means if the allocator reuses some // addresses, they will be registered again. This is to account for the remote // possibility of different allocation patterns between ranks. For example, @@ -424,11 +425,13 @@ class CustomAllreduce { } /** - * This is the result after careful grid search. Using 36 blocks give the best - * or close to the best runtime on the devices I tried: A100, A10, A30, T4, - * V100. You'll notice that NCCL kernels also only take a small amount of SMs. - * Not quite sure the underlying reason, but my guess is that too many SMs - * will cause contention on NVLink bus. + * Performs allreduce, assuming input has already been registered. + * + * Block and grid default configs are results after careful grid search. Using + * 36 blocks give the best or close to the best runtime on the devices I + * tried: A100, A10, A30, T4, V100. You'll notice that NCCL kernels also only + * take a small amount of SMs. Not quite sure the underlying reason, but my + * guess is that too many SMs will cause contention on NVLink bus. 
*/ template void allreduce(cudaStream_t stream, T* input, T* output, int size, diff --git a/csrc/custom_all_reduce_test.cu b/csrc/custom_all_reduce_test.cu index 376687e91cfda..b59ea40d980f4 100644 --- a/csrc/custom_all_reduce_test.cu +++ b/csrc/custom_all_reduce_test.cu @@ -135,24 +135,26 @@ void run(int myRank, int nRanks, ncclComm_t& comm, int threads, int block_limit, void* rank_data; size_t rank_data_sz = 16 * 1024 * 1024; CUDACHECK(cudaMalloc(&rank_data, rank_data_sz)); - std::vector offsets(nRanks, 0); - vllm::CustomAllreduce fa(buffer, rank_data, rank_data_sz, data_handles, - offsets, myRank); + vllm::Signal* ipc_ptrs[8]; + for (int i = 0; i < nRanks; i++) { + if (i == myRank) + ipc_ptrs[i] = buffer; + else + CUDACHECK(cudaIpcOpenMemHandle((void**)&ipc_ptrs[i], data_handles[i], + cudaIpcMemLazyEnablePeerAccess)); + } + vllm::CustomAllreduce fa(ipc_ptrs, rank_data, rank_data_sz, myRank, nRanks); auto* self_data = reinterpret_cast(reinterpret_cast(buffer) + sizeof(vllm::Signal) + data_size * sizeof(T)); // hack buffer registration { - std::vector handles; - handles.reserve(nRanks); + void* data[8]; for (int i = 0; i < nRanks; i++) { - char* begin = (char*)&data_handles[i]; - char* end = (char*)&data_handles[i + 1]; - handles.emplace_back(begin, end); + data[i] = + ((char*)ipc_ptrs[i]) + sizeof(vllm::Signal) + data_size * sizeof(T); } - std::vector offsets(nRanks, - sizeof(vllm::Signal) + data_size * sizeof(T)); - fa.register_buffer(handles, offsets, self_data); + fa.register_buffer(data); } double* ground_truth; diff --git a/csrc/ops.h b/csrc/ops.h index c50eb39a3dacc..e0775ee1891df 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -199,20 +199,16 @@ void causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight, #ifndef USE_ROCM using fptr_t = int64_t; -fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data, - const std::vector& handles, - const std::vector& offsets, int64_t rank, - bool full_nvlink); -void all_reduce_reg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out); -void all_reduce_unreg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& reg_buffer, - torch::Tensor& out); +fptr_t init_custom_ar(const std::vector& fake_ipc_ptrs, + torch::Tensor& rank_data, int64_t rank, bool full_nvlink); +void all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out, + fptr_t reg_buffer, int64_t reg_buffer_sz_bytes); void dispose(fptr_t _fa); int64_t meta_size(); -void register_buffer(fptr_t _fa, torch::Tensor& t, - const std::vector& handles, - const std::vector& offsets); -std::tuple> get_graph_buffer_ipc_meta( - fptr_t _fa); -void register_graph_buffers(fptr_t _fa, const std::vector& handles, +void register_buffer(fptr_t _fa, const std::vector& fake_ipc_ptrs); +std::tuple, std::vector> +get_graph_buffer_ipc_meta(fptr_t _fa); +void register_graph_buffers(fptr_t _fa, + const std::vector>& handles, const std::vector>& offsets); #endif diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index b8185c24d5628..971a45d50ffa4 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -411,27 +411,18 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) { TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) { // Custom all-reduce kernels custom_ar.def( - "init_custom_ar(Tensor meta, Tensor rank_data, " - "str[] handles, int[] offsets, int rank, " - "bool full_nvlink) -> int"); + "init_custom_ar(int[] ipc_tensors, Tensor rank_data, " + "int rank, bool full_nvlink) -> int"); custom_ar.impl("init_custom_ar", 
torch::kCUDA, &init_custom_ar); - - custom_ar.def("all_reduce_reg(int fa, Tensor inp, Tensor! out) -> ()"); - custom_ar.impl("all_reduce_reg", torch::kCUDA, &all_reduce_reg); - custom_ar.def( - "all_reduce_unreg(int fa, Tensor inp, Tensor reg_buffer, Tensor! out) -> " - "()"); - custom_ar.impl("all_reduce_unreg", torch::kCUDA, &all_reduce_unreg); + "all_reduce(int fa, Tensor inp, Tensor! out, int reg_buffer, " + "int reg_buffer_sz_bytes) -> ()"); + custom_ar.impl("all_reduce", torch::kCUDA, &all_reduce); custom_ar.def("dispose", &dispose); custom_ar.def("meta_size", &meta_size); - custom_ar.def( - "register_buffer(int fa, Tensor t, str[] handles, " - "int[] offsets) -> ()"); - custom_ar.impl("register_buffer", torch::kCUDA, ®ister_buffer); - + custom_ar.def("register_buffer", ®ister_buffer); custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta); custom_ar.def("register_graph_buffers", ®ister_graph_buffers); } diff --git a/tests/distributed/test_custom_all_reduce.py b/tests/distributed/test_custom_all_reduce.py index 95435e753058a..86ca1948ef94a 100644 --- a/tests/distributed/test_custom_all_reduce.py +++ b/tests/distributed/test_custom_all_reduce.py @@ -95,13 +95,13 @@ def eager_allreduce(tp_size, pp_size, rank, distributed_init_port): inp = torch.ones(sz, dtype=torch.float32, device=device) out = inp for _ in range(num_communication): - out = fa.all_reduce_unreg(out) + out = fa.all_reduce(out, registered=False) torch.testing.assert_close(out, inp * (tp_size**num_communication)) inp = torch.ones(sz * 4, dtype=torch.bfloat16, device=device) out = inp for _ in range(num_communication): - out = fa.all_reduce_unreg(out) + out = fa.all_reduce(out, registered=False) torch.testing.assert_close(out, inp * (tp_size**num_communication)) diff --git a/tools/profiler/visualize_layerwise_profile.py b/tools/profiler/visualize_layerwise_profile.py index efd6beee865c2..adc44474aa4c1 100644 --- a/tools/profiler/visualize_layerwise_profile.py +++ b/tools/profiler/visualize_layerwise_profile.py @@ -196,8 +196,8 @@ def is_cross_device_reduce_1stage(op_name: str): def is_cross_device_reduce_2stage(op_name: str): return "cross_device_reduce_2stage" in op_name - def is_custom_ar_all_reduce_unreg(op_name: str): - return "_C_custom_ar::all_reduce_unreg" in op_name + def is_custom_ar_all_reduce(op_name: str): + return "_C_custom_ar::all_reduce" in op_name def is_reduce_kernel(op_name: str): return "reduce_kernel" in op_name @@ -246,9 +246,9 @@ def is_reduce_kernel(op_name: str): filter(lambda x: is_cross_device_reduce_2stage(x), ops)) ops = list(filter(lambda x: x not in cross_device_reduce_2stage_ops, ops)) - custom_ar_all_reduce_unreg_ops = list( - filter(lambda x: is_custom_ar_all_reduce_unreg(x), ops)) - ops = list(filter(lambda x: x not in custom_ar_all_reduce_unreg_ops, ops)) + custom_ar_all_reduce_ops = list( + filter(lambda x: is_custom_ar_all_reduce(x), ops)) + ops = list(filter(lambda x: x not in custom_ar_all_reduce_ops, ops)) reduce_kernel_ops = list(filter(lambda x: is_reduce_kernel(x), ops)) ops = list(filter(lambda x: x not in reduce_kernel_ops, ops)) @@ -289,21 +289,21 @@ def is_reduce_kernel(op_name: str): if len(cross_device_reduce_2stage_ops): trace_df['cross_device_reduce_2stage_ops'] = trace_df[ cross_device_reduce_2stage_ops].agg("sum", axis=1) - if len(custom_ar_all_reduce_unreg_ops): - trace_df['custom_ar_all_reduce_unreg_ops'] = trace_df[ - custom_ar_all_reduce_unreg_ops].agg("sum", axis=1) + if len(custom_ar_all_reduce_ops): + trace_df['custom_ar_all_reduce_ops'] = 
trace_df[ + custom_ar_all_reduce_ops].agg("sum", axis=1) if len(reduce_kernel_ops): trace_df['reduce_kernel_ops'] = trace_df[reduce_kernel_ops].agg("sum", axis=1) - trace_df.drop( - attention_ops + quant_ops + gemm_ops + rms_norm_ops + vocab_embed_ops + - mem_ops + elementwise_ops + nccl_all_reduce_ops + nccl_gather_ops + - nccl_broadcast_ops + nccl_other_ops + cross_device_reduce_1stage_ops + - cross_device_reduce_2stage_ops + custom_ar_all_reduce_unreg_ops + - reduce_kernel_ops, - axis=1, - inplace=True) + trace_df.drop(attention_ops + quant_ops + gemm_ops + rms_norm_ops + + vocab_embed_ops + mem_ops + elementwise_ops + + nccl_all_reduce_ops + nccl_gather_ops + nccl_broadcast_ops + + nccl_other_ops + cross_device_reduce_1stage_ops + + cross_device_reduce_2stage_ops + custom_ar_all_reduce_ops + + reduce_kernel_ops, + axis=1, + inplace=True) return trace_df diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 682e08db99fa9..767d45ede7e87 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -912,20 +912,16 @@ def get_max_shared_memory_per_block_device_attribute(device: int) -> int: # custom ar -def init_custom_ar(meta: torch.Tensor, rank_data: torch.Tensor, - handles: List[str], offsets: List[int], rank: int, - full_nvlink: bool) -> int: - return torch.ops._C_custom_ar.init_custom_ar(meta, rank_data, handles, - offsets, rank, full_nvlink) +def init_custom_ar(ipc_tensors: List[torch.Tensor], rank_data: torch.Tensor, + rank: int, full_nvlink: bool) -> int: + return torch.ops._C_custom_ar.init_custom_ar(ipc_tensors, rank_data, rank, + full_nvlink) -def all_reduce_reg(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None: - torch.ops._C_custom_ar.all_reduce_reg(fa, inp, out) - - -def all_reduce_unreg(fa: int, inp: torch.Tensor, reg_buffer: torch.Tensor, - out: torch.Tensor) -> None: - torch.ops._C_custom_ar.all_reduce_unreg(fa, inp, reg_buffer, out) +def all_reduce(fa: int, inp: torch.Tensor, out: torch.Tensor, reg_buffer: int, + reg_buffer_sz_bytes: int) -> None: + torch.ops._C_custom_ar.all_reduce(fa, inp, out, reg_buffer, + reg_buffer_sz_bytes) def dispose(fa: int) -> None: @@ -936,16 +932,15 @@ def meta_size() -> int: return torch.ops._C_custom_ar.meta_size() -def register_buffer(fa: int, t: torch.Tensor, handles: List[str], - offsets: List[int]) -> None: - return torch.ops._C_custom_ar.register_buffer(fa, t, handles, offsets) +def register_buffer(fa: int, ipc_tensors: List[int]) -> None: + return torch.ops._C_custom_ar.register_buffer(fa, ipc_tensors) -def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[str], List[int]]: +def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[int], List[int]]: return torch.ops._C_custom_ar.get_graph_buffer_ipc_meta(fa) -def register_graph_buffers(fa: int, handles: List[str], +def register_graph_buffers(fa: int, handles: List[List[int]], offsets: List[List[int]]) -> None: torch.ops._C_custom_ar.register_graph_buffers(fa, handles, offsets) diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index 3b5d92561cf25..62929dc0feaaf 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -1,6 +1,6 @@ import ctypes from contextlib import contextmanager -from typing import Any, List, Optional, Union +from typing import List, Optional, Union import torch import torch.distributed as dist @@ -147,18 +147,14 @@ def __init__(self, return self.disabled = False - # buffers memory are owned by 
this Python class and passed to C++ - # meta data composes of two parts: meta data for synchronization - # (256 bytes) and a temporary buffer for storing intermediate - # allreduce results. - self.meta = torch.zeros(ops.meta_size() + max_size, - dtype=torch.uint8, - device=self.device) + # Buffers memory are owned by this Python class and passed to C++. + # Meta data composes of two parts: meta data for synchronization and a + # temporary buffer for storing intermediate allreduce results. + self.meta_ptrs = self.create_shared_buffer(ops.meta_size() + max_size, + group=group) # This is a pre-registered IPC buffer. In eager mode, input tensors # are first copied into this buffer before allreduce is performed - self.buffer = torch.empty(max_size, - dtype=torch.uint8, - device=self.device) + self.buffer_ptrs = self.create_shared_buffer(max_size, group=group) # This is a buffer for storing the tuples of pointers pointing to # IPC buffers from all ranks. Each registered tuple has size of # 8*world_size bytes where world_size is at most 8. Allocating 8MB @@ -170,16 +166,19 @@ def __init__(self, self.max_size = max_size self.rank = rank self.world_size = world_size - handles, offsets = self._get_ipc_meta(self.meta) self.full_nvlink = full_nvlink - self._ptr = ops.init_custom_ar(self.meta, self.rank_data, handles, - offsets, rank, self.full_nvlink) - self.register_buffer(self.buffer) + self._ptr = ops.init_custom_ar(self.meta_ptrs, self.rank_data, rank, + self.full_nvlink) + ops.register_buffer(self._ptr, self.buffer_ptrs) @staticmethod def create_shared_buffer( size_in_bytes: int, group: Optional[ProcessGroup] = None) -> List[int]: + """ + Creates a shared buffer and returns a list of pointers + representing the buffer on all processes in the group. + """ lib = CudaRTLibrary() pointer = lib.cudaMalloc(size_in_bytes) handle = lib.cudaIpcGetMemHandle(pointer) @@ -220,60 +219,24 @@ def capture(self): if not self.disabled: self.register_graph_buffers() - def _get_ipc_meta(self, inp: torch.Tensor): - data = inp.untyped_storage()._share_cuda_() - handle = data[1] - # https://github.com/pytorch/pytorch/pull/130890 changes - # the binary format of the ipc handle - # it starts from pytorch 2.5 - if len(handle) > 64: - assert len(handle) == 66 - # only support SHAREABLE_HANDLE_VERSION = 1 - assert int(handle[0]) == 1 - # only support SHAREABLE_CUDA_MALLOC = 'c' - assert handle[1] == ord("c") - handle = handle[2:] - # TODO: support expandable segment - shard_data = ( - handle, # ipc handle to base ptr - data[3], # offset of base ptr - ) - return self._gather_ipc_meta(shard_data) - - def _gather_ipc_meta(self, shard_data): - # Note: don't use `[[None]] * self.world_size` here - # because it will create a list of the same reference - all_data: List[Optional[Any]] = [[None] - for i in range(self.world_size)] - all_data[self.rank][0] = shard_data - - ranks = dist.get_process_group_ranks(group=self.group) - ranks.sort() + def register_graph_buffers(self): + handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr) + logger.info("Registering %d cuda graph addresses", len(offset)) + # We cannot directly use `dist.all_gather_object` here + # because it is incompatible with `gloo` backend under inference mode. + # see https://github.com/pytorch/pytorch/issues/126032 for details. 
+ all_data = [[None, None] + for _ in range(dist.get_world_size(group=self.group))] + all_data[self.rank] = [handle, offset] + ranks = sorted(dist.get_process_group_ranks(group=self.group)) for i, rank in enumerate(ranks): dist.broadcast_object_list(all_data[i], src=rank, group=self.group, device="cpu") - - # we cannot directly use `dist.all_gather_object` here - # because it is incompatible with `gloo` backend under inference mode. - # see https://github.com/pytorch/pytorch/issues/126032 for details. - - handles = [] - offsets = [] - for i in range(len(all_data)): - handles.append(all_data[i][0][0]) # type: ignore - offsets.append(all_data[i][0][1]) # type: ignore - return handles, offsets - - def register_buffer(self, inp: torch.Tensor): - handles, offsets = self._get_ipc_meta(inp) - ops.register_buffer(self._ptr, inp, handles, offsets) - - def register_graph_buffers(self): - handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr) - handles, offsets = self._gather_ipc_meta((bytes(handle), offset)) - logger.info("Registering %d cuda graph addresses", len(offset)) + # Unpack list of tuples to tuple of lists. + handles = [d[0] for d in all_data] # type: ignore + offsets = [d[1] for d in all_data] # type: ignore ops.register_graph_buffers(self._ptr, handles, offsets) def should_custom_ar(self, inp: torch.Tensor): @@ -291,45 +254,50 @@ def should_custom_ar(self, inp: torch.Tensor): return inp_size < self.max_size return False - # all reduce, assuming inp tensor is IPC registered with register_buffer, - # or, in the context of cuda graphs, register_graph_buffers - def all_reduce_reg(self, inp: torch.Tensor, out: torch.Tensor = None): - if out is None: - out = torch.empty_like(inp) - ops.all_reduce_reg(self._ptr, inp, out) - return out - - # all reduce, assuming inp tensor is NOT IPC registered - def all_reduce_unreg(self, inp: torch.Tensor, out: torch.Tensor = None): + def all_reduce(self, + inp: torch.Tensor, + *, + out: torch.Tensor = None, + registered: bool = False): + """Performs an out-of-place all reduce. + + If registered is True, this assumes inp's pointer is already + IPC-registered. Otherwise, inp is first copied into a pre-registered + buffer. + """ if out is None: out = torch.empty_like(inp) - ops.all_reduce_unreg(self._ptr, inp, self.buffer, out) + if registered: + ops.all_reduce(self._ptr, inp, out, 0, 0) + else: + ops.all_reduce(self._ptr, inp, out, self.buffer_ptrs[self.rank], + self.max_size) return out def custom_all_reduce(self, input: torch.Tensor) -> Optional[torch.Tensor]: - # when custom allreduce is disabled, this will be None + """The main allreduce API that provides support for cuda graph.""" + # When custom allreduce is disabled, this will be None. if self.disabled or not self.should_custom_ar(input): return None if self._IS_CAPTURING: if torch.cuda.is_current_stream_capturing(): - return self.all_reduce_reg(input) + return self.all_reduce(input, registered=True) else: - # if warm up, mimic the allocation pattern - # since custom allreduce is out-of-place + # If warm up, mimic the allocation pattern since custom + # allreduce is out-of-place. 
return torch.empty_like(input) else: - # note: outside of cuda graph context, - # custom allreduce incurs a cost of cudaMemcpy, which should - # be small(<=1% of overall latency) compared to the performance - # gains of using custom kernels - return self.all_reduce_unreg(input) - - return None + # Note: outside of cuda graph context, custom allreduce incurs a + # cost of cudaMemcpy, which should be small (<=1% of overall + # latency) compared to the performance gain of using custom kernels + return self.all_reduce(input, registered=False) def close(self): if not self.disabled and self._ptr: ops.dispose(self._ptr) self._ptr = 0 + self.free_shared_buffer(self.meta_ptrs) + self.free_shared_buffer(self.buffer_ptrs) def __del__(self): self.close() From e036e527a08fbf00ba725b12c9ebff6cd9bfab52 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Thu, 7 Nov 2024 02:54:16 -0500 Subject: [PATCH 020/183] [CI/Build] Improve mypy + python version matrix (#10041) Signed-off-by: Russell Bryant --- .github/workflows/mypy.yaml | 2 +- pyproject.toml | 4 +--- tools/mypy.sh | 5 +++-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index 28d2e5fb8dbd9..fbee6bb03fc8e 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -43,4 +43,4 @@ jobs: - name: Mypy run: | echo "::add-matcher::.github/workflows/matchers/mypy.json" - tools/mypy.sh 1 + tools/mypy.sh 1 ${{ matrix.python-version }} diff --git a/pyproject.toml b/pyproject.toml index 1aebc543a733a..bae8645502dea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,14 +55,12 @@ ignore = [ ] [tool.mypy] -python_version = "3.9" - ignore_missing_imports = true check_untyped_defs = true follow_imports = "silent" # After fixing type errors resulting from follow_imports: "skip" -> "silent", -# move the directory here and remove it from format.sh and mypy.yaml +# move the directory here and remove it from tools/mypy.sh files = [ "vllm/*.py", "vllm/adapter_commons", diff --git a/tools/mypy.sh b/tools/mypy.sh index 14b0976a27da5..7e8f7d402cdd5 100755 --- a/tools/mypy.sh +++ b/tools/mypy.sh @@ -1,6 +1,7 @@ #!/bin/bash CI=${1:-0} +PYTHON_VERSION=${2:-3.9} if [ $CI -eq 1 ]; then set -e @@ -9,10 +10,10 @@ fi run_mypy() { echo "Running mypy on $1" if [ $CI -eq 1 ] && [ -z "$1" ]; then - mypy "$@" + mypy --python-version "${PYTHON_VERSION}" "$@" return fi - mypy --follow-imports skip "$@" + mypy --follow-imports skip --python-version "${PYTHON_VERSION}" "$@" } run_mypy # Note that this is less strict than CI From aa9078fa035abfac54179cbdca8b741e49c8cd0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=A1via=20B=C3=A9o?= <119421251+flaviabeo@users.noreply.github.com> Date: Thu, 7 Nov 2024 05:42:40 -0300 Subject: [PATCH 021/183] Adds method to read the pooling types from model's files (#9506) Signed-off-by: Flavia Beo Signed-off-by: Max de Bayser Co-authored-by: Max de Bayser --- examples/fp8/quantizer/quantize.py | 4 +- tests/engine/test_arg_utils.py | 7 + .../test_model_load_with_params.py | 50 ++++++ tests/test_config.py | 72 ++++++++ tests/utils.py | 14 +- vllm/config.py | 28 ++- vllm/engine/arg_utils.py | 3 +- vllm/model_executor/layers/pooler.py | 14 +- vllm/transformers_utils/config.py | 170 ++++++++++++++++-- .../tokenizer_group/__init__.py | 5 + 10 files changed, 342 insertions(+), 25 deletions(-) create mode 100644 tests/model_executor/test_model_load_with_params.py diff --git a/examples/fp8/quantizer/quantize.py b/examples/fp8/quantizer/quantize.py index 
15f1a06b1219b..d75cc8b3d1cf7 100644 --- a/examples/fp8/quantizer/quantize.py +++ b/examples/fp8/quantizer/quantize.py @@ -230,7 +230,7 @@ def calibrate_loop(): def main(args): if not torch.cuda.is_available(): - raise EnvironmentError("GPU is required for inference.") + raise OSError("GPU is required for inference.") random.seed(RAND_SEED) np.random.seed(RAND_SEED) @@ -314,7 +314,7 @@ def main(args): # Workaround for wo quantization if args.qformat in ["int8_wo", "int4_wo", "full_prec"]: - with open(f"{export_path}/config.json", 'r') as f: + with open(f"{export_path}/config.json") as f: tensorrt_llm_config = json.load(f) if args.qformat == "int8_wo": tensorrt_llm_config["quantization"]["quant_algo"] = 'W8A16' diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index f7dc167fea6e4..e92e2588d01cb 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -30,6 +30,13 @@ def test_limit_mm_per_prompt_parser(arg, expected): assert args.limit_mm_per_prompt == expected +def test_valid_pooling_config(): + parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) + args = parser.parse_args(["--pooling-type=MEAN"]) + engine_args = EngineArgs.from_cli_args(args=args) + assert engine_args.pooling_type == 'MEAN' + + @pytest.mark.parametrize( ("arg"), [ diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py new file mode 100644 index 0000000000000..7e5e2780d3916 --- /dev/null +++ b/tests/model_executor/test_model_load_with_params.py @@ -0,0 +1,50 @@ +import os + +import pytest + +from vllm.model_executor.layers.pooler import PoolingType +from vllm.model_executor.models.bert import BertEmbeddingModel +from vllm.platforms import current_platform + +MAX_MODEL_LEN = 128 +MODEL_NAME = os.environ.get("MODEL_NAME", "BAAI/bge-base-en-v1.5") +REVISION = os.environ.get("REVISION", "main") + + +@pytest.mark.skipif(current_platform.is_rocm(), + reason="Xformers backend is not supported on ROCm.") +def test_model_loading_with_params(vllm_runner): + """ + Test parameter weight loading with tp>1. 
+ """ + with vllm_runner(model_name=MODEL_NAME, + revision=REVISION, + dtype="float16", + max_model_len=MAX_MODEL_LEN) as model: + output = model.encode("Write a short story about a robot that" + " dreams for the first time.\n") + + model_config = model.model.llm_engine.model_config + + model_tokenizer = model.model.llm_engine.tokenizer + + # asserts on the bert model config file + assert model_config.encoder_config["max_seq_length"] == 512 + assert model_config.encoder_config["do_lower_case"] + + # asserts on the pooling config files + assert model_config.pooler_config.pooling_type == PoolingType.CLS.name + assert model_config.pooler_config.pooling_norm + + # asserts on the tokenizer loaded + assert model_tokenizer.tokenizer_id == "BAAI/bge-base-en-v1.5" + assert model_tokenizer.tokenizer_config["do_lower_case"] + assert model_tokenizer.tokenizer.model_max_length == 512 + + model = model.model.llm_engine.model_executor\ + .driver_worker.model_runner.model + assert isinstance(model, BertEmbeddingModel) + assert model._pooler.pooling_type == PoolingType.CLS + assert model._pooler.normalize + # assert output + assert output diff --git a/tests/test_config.py b/tests/test_config.py index 5211049bf0011..66bdb883657c5 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,6 +1,8 @@ import pytest from vllm.config import ModelConfig +from vllm.model_executor.layers.pooler import PoolingType +from vllm.platforms import current_platform @pytest.mark.parametrize(("model_id", "expected_task"), [ @@ -102,6 +104,76 @@ def test_get_sliding_window(): assert mistral_model_config.get_sliding_window() == TEST_SLIDING_WINDOW +@pytest.mark.skipif(current_platform.is_rocm(), + reason="Xformers backend is not supported on ROCm.") +def test_get_pooling_config(): + model_id = "sentence-transformers/all-MiniLM-L12-v2" + minilm_model_config = ModelConfig( + model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + revision=None, + ) + + minilm_pooling_config = minilm_model_config._init_pooler_config( + pooling_type=None, + pooling_norm=None, + pooling_returned_token_ids=None, + pooling_softmax=None, + pooling_step_tag_id=None) + + assert minilm_pooling_config.pooling_norm + assert minilm_pooling_config.pooling_type == PoolingType.MEAN.name + + +@pytest.mark.skipif(current_platform.is_rocm(), + reason="Xformers backend is not supported on ROCm.") +def test_get_pooling_config_from_args(): + model_id = "sentence-transformers/all-MiniLM-L12-v2" + minilm_model_config = ModelConfig(model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + revision=None) + + minilm_pooling_config = minilm_model_config._init_pooler_config( + pooling_type='CLS', + pooling_norm=True, + pooling_returned_token_ids=None, + pooling_softmax=None, + pooling_step_tag_id=None) + + assert minilm_pooling_config.pooling_norm + assert minilm_pooling_config.pooling_type == PoolingType.CLS.name + + +@pytest.mark.skipif(current_platform.is_rocm(), + reason="Xformers backend is not supported on ROCm.") +def test_get_bert_tokenization_sentence_transformer_config(): + bge_model_config = ModelConfig( + model="BAAI/bge-base-en-v1.5", + task="auto", + tokenizer="BAAI/bge-base-en-v1.5", + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + revision=None, + ) + + bert_bge_model_config = bge_model_config._get_encoder_config() + + assert bert_bge_model_config["max_seq_length"] == 512 + 
assert bert_bge_model_config["do_lower_case"] + + def test_rope_customization(): TEST_ROPE_SCALING = {"rope_type": "dynamic", "factor": 2.0} TEST_ROPE_THETA = 16_000_000.0 diff --git a/tests/utils.py b/tests/utils.py index 00c7dabe16a7b..a893667e144a6 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -15,6 +15,7 @@ import pytest import requests import torch +import torch.nn.functional as F from openai.types.completion import Completion from typing_extensions import ParamSpec @@ -515,13 +516,14 @@ def compare_all_settings(model: str, ref_result = copy.deepcopy(ref_result) compare_result = copy.deepcopy(compare_result) if "embedding" in ref_result and method == "encode": - ref_embedding = torch.tensor(ref_result["embedding"]) - compare_embedding = torch.tensor( - compare_result["embedding"]) - mse = ((ref_embedding - compare_embedding)**2).mean() - assert mse < 1e-6, ( + sim = F.cosine_similarity( + torch.tensor(ref_result["embedding"]), + torch.tensor(compare_result["embedding"]), + dim=0, + ) + assert sim >= 0.999, ( f"Embedding for {model=} are not the same.\n" - f"mse={mse}\n") + f"cosine_similarity={sim}\n") del ref_result["embedding"] del compare_result["embedding"] assert ref_result == compare_result, ( diff --git a/vllm/config.py b/vllm/config.py index c7fad3a261858..e844a46bf06e6 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -13,10 +13,10 @@ from vllm.model_executor.models import ModelRegistry from vllm.platforms import current_platform from vllm.tracing import is_otel_available, otel_import_error_traceback -from vllm.transformers_utils.config import (ConfigFormat, get_config, - get_hf_image_processor_config, - get_hf_text_config, - is_encoder_decoder, uses_mrope) +from vllm.transformers_utils.config import ( + ConfigFormat, get_config, get_hf_image_processor_config, + get_hf_text_config, get_pooling_config, + get_sentence_transformer_tokenizer_config, is_encoder_decoder, uses_mrope) from vllm.utils import (GiB_bytes, cuda_device_count_stateless, get_cpu_memory, print_warning_once) @@ -197,6 +197,7 @@ def __init__( code_revision, rope_scaling, rope_theta, config_format) self.hf_text_config = get_hf_text_config(self.hf_config) + self.encoder_config = self._get_encoder_config() self.hf_image_processor_config = get_hf_image_processor_config( self.model, revision) self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype) @@ -229,7 +230,8 @@ def __init__( max_model_len=max_model_len, disable_sliding_window=self.disable_sliding_window, sliding_window_len=self.get_hf_config_sliding_window(), - spec_target_max_model_len=spec_target_max_model_len) + spec_target_max_model_len=spec_target_max_model_len, + encoder_config=self.encoder_config) self.served_model_name = get_served_model_name(model, served_model_name) self.multimodal_config = self._init_multimodal_config( @@ -273,6 +275,10 @@ def _init_multimodal_config( return None + def _get_encoder_config(self): + return get_sentence_transformer_tokenizer_config( + self.model, self.revision) + def _init_pooler_config( self, pooling_type: Optional[str] = None, @@ -282,6 +288,14 @@ def _init_pooler_config( pooling_returned_token_ids: Optional[List[int]] = None ) -> Optional["PoolerConfig"]: if self.task == "embedding": + pooling_config = get_pooling_config(self.model, self.revision) + if pooling_config is not None: + # override if user does not + # specifies pooling_type and/or pooling_norm + if pooling_type is None: + pooling_type = pooling_config["pooling_type"] + if pooling_norm is None: + pooling_norm = pooling_config["normalize"] 
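As an aside on the precedence implemented just above: options passed explicitly by the user keep priority, and only fields left unset fall back to the pooling config discovered in the model repository. A minimal standalone sketch of that resolution order (the function name and dict shape below are illustrative only, not part of this patch):

def resolve_pooling_defaults(pooling_type, pooling_norm, repo_config):
    # Explicit user settings win; unset fields fall back to the
    # repository's sentence-transformers pooling config, if any.
    if repo_config is not None:
        if pooling_type is None:
            pooling_type = repo_config["pooling_type"]
        if pooling_norm is None:
            pooling_norm = repo_config["normalize"]
    return pooling_type, pooling_norm

assert resolve_pooling_defaults(
    None, None, {"pooling_type": "MEAN", "normalize": True}) == ("MEAN", True)
assert resolve_pooling_defaults(
    "CLS", None, {"pooling_type": "MEAN", "normalize": True}) == ("CLS", True)
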
return PoolerConfig( pooling_type=pooling_type, pooling_norm=pooling_norm, @@ -1795,6 +1809,7 @@ def _get_and_verify_max_len( disable_sliding_window: bool, sliding_window_len: Optional[Union[int, List[Optional[int]]]], spec_target_max_model_len: Optional[int] = None, + encoder_config: Optional[Any] = None, ) -> int: """Get and verify the model's maximum length.""" derived_max_model_len = float("inf") @@ -1877,6 +1892,9 @@ def _get_and_verify_max_len( "original_max_position_embeddings"] derived_max_model_len *= scaling_factor + if encoder_config and "max_seq_length" in encoder_config: + derived_max_model_len = encoder_config["max_seq_length"] + # If the user specified a max length, make sure it is smaller than the # derived length from the HF model config. if max_model_len is None: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b556c0eed3776..8c5b442e9f624 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -16,6 +16,7 @@ VllmConfig) from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger +from vllm.model_executor.layers.pooler import PoolingType from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.platforms import current_platform from vllm.transformers_utils.config import ( @@ -863,7 +864,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument( '--pooling-type', - choices=['LAST', 'ALL', 'CLS', 'STEP'], + choices=[pt.name for pt in PoolingType], default=None, help='Used to configure the pooling method in the embedding model.' ) diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 1c9772b41cbef..024badbc17b96 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -16,6 +16,7 @@ class PoolingType(IntEnum): ALL = 1 CLS = 2 STEP = 3 + MEAN = 4 class Pooler(nn.Module): @@ -27,7 +28,7 @@ class Pooler(nn.Module): 3. Returns structured results as `PoolerOutput`. Attributes: - pooling_type: The type of pooling to use (LAST, ALL, CLS). + pooling_type: The type of pooling to use. normalize: Whether to normalize the pooled data. 
""" @@ -97,6 +98,17 @@ def forward( for prompt_len in prompt_lens: pooled_data.append(hidden_states[offset:offset + prompt_len]) offset += prompt_len + elif self.pooling_type == PoolingType.MEAN: + # Calculate mean pooling + cumsum = torch.cumsum(hidden_states, dim=0) + start_indices = torch.cat([ + torch.tensor([0], device=hidden_states.device), + torch.cumsum(prompt_lens[:-1], dim=0) + ]) + end_indices = torch.cumsum(prompt_lens, dim=0) + pooled_data = ( + cumsum[end_indices - 1] - cumsum[start_indices] + + hidden_states[start_indices]) / prompt_lens.unsqueeze(1) elif self.pooling_type == PoolingType.STEP: if self.returned_token_ids is not None and len( self.returned_token_ids) > 0: diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 415d8bf7cc2bb..6b38ee31c2657 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -6,6 +6,9 @@ import huggingface_hub from huggingface_hub import (file_exists, hf_hub_download, try_to_load_from_cache) +from huggingface_hub.utils import (EntryNotFoundError, LocalEntryNotFoundError, + RepositoryNotFoundError, + RevisionNotFoundError) from transformers import GenerationConfig, PretrainedConfig from transformers.models.auto.image_processing_auto import ( get_image_processor_config) @@ -213,7 +216,7 @@ def get_config( raise e elif config_format == ConfigFormat.MISTRAL: - config = load_params_config(model, revision) + config = load_params_config(model, revision, token=kwargs.get("token")) else: raise ValueError(f"Unsupported config format: {config_format}") @@ -243,6 +246,158 @@ def get_config( return config +def get_hf_file_to_dict(file_name: str, + model: Union[str, Path], + revision: Optional[str] = 'main', + token: Optional[str] = None): + """ + Downloads a file from the Hugging Face Hub and returns + its contents as a dictionary. + + Parameters: + - file_name (str): The name of the file to download. + - model (str): The name of the model on the Hugging Face Hub. + - revision (str): The specific version of the model. + - token (str): The Hugging Face authentication token. + + Returns: + - config_dict (dict): A dictionary containing + the contents of the downloaded file. + """ + file_path = Path(model) / file_name + + if file_or_path_exists(model=model, + config_name=file_name, + revision=revision, + token=token): + + if not file_path.is_file(): + try: + hf_hub_file = hf_hub_download(model, + file_name, + revision=revision) + except (RepositoryNotFoundError, RevisionNotFoundError, + EntryNotFoundError, LocalEntryNotFoundError) as e: + logger.debug("File or repository not found in hf_hub_download", + e) + return None + file_path = Path(hf_hub_file) + + with open(file_path) as file: + return json.load(file) + return None + + +def get_pooling_config(model: str, + revision: Optional[str] = 'main', + token: Optional[str] = None): + """ + This function gets the pooling and normalize + config from the model - only applies to + sentence-transformers models. + + Args: + model (str): The name of the Hugging Face model. + revision (str, optional): The specific version + of the model to use. Defaults to 'main'. + + Returns: + dict: A dictionary containing the pooling + type and whether normalization is used. 
+ """ + + modules_file_name = "modules.json" + modules_dict = get_hf_file_to_dict(modules_file_name, model, revision, + token) + + if modules_dict is None: + return None + + pooling = next((item for item in modules_dict + if item["type"] == "sentence_transformers.models.Pooling"), + None) + normalize = bool( + next((item for item in modules_dict + if item["type"] == "sentence_transformers.models.Normalize"), + False)) + + if pooling: + + pooling_file_name = "{}/config.json".format(pooling["path"]) + pooling_dict = get_hf_file_to_dict(pooling_file_name, model, revision, + token) + pooling_type_name = next( + (item for item, val in pooling_dict.items() if val is True), None) + + if pooling_type_name is not None: + pooling_type_name = get_pooling_config_name(pooling_type_name) + + return {"pooling_type": pooling_type_name, "normalize": normalize} + + return None + + +def get_pooling_config_name(pooling_name: str) -> Union[str, None]: + if "pooling_mode_" in pooling_name: + pooling_name = pooling_name.replace("pooling_mode_", "") + + if "_" in pooling_name: + pooling_name = pooling_name.split("_")[0] + + if "lasttoken" in pooling_name: + pooling_name = "last" + + supported_pooling_types = ['LAST', 'ALL', 'CLS', 'STEP', 'MEAN'] + pooling_type_name = pooling_name.upper() + + try: + if pooling_type_name in supported_pooling_types: + return pooling_type_name + except NotImplementedError as e: + logger.debug("Pooling type not supported", e) + return None + return None + + +def get_sentence_transformer_tokenizer_config(model: str, + revision: Optional[str] = 'main', + token: Optional[str] = None): + """ + Returns the tokenization configuration dictionary for a + given Sentence Transformer BERT model. + + Parameters: + - model (str): The name of the Sentence Transformer + BERT model. + - revision (str, optional): The revision of the m + odel to use. Defaults to 'main'. + - token (str): A Hugging Face access token. + + Returns: + - dict: A dictionary containing the configuration parameters + for the Sentence Transformer BERT model. 
+ """ + for config_name in [ + "sentence_bert_config.json", + "sentence_roberta_config.json", + "sentence_distilbert_config.json", + "sentence_camembert_config.json", + "sentence_albert_config.json", + "sentence_xlm-roberta_config.json", + "sentence_xlnet_config.json", + ]: + encoder_dict = get_hf_file_to_dict(config_name, model, revision, token) + if encoder_dict: + break + + if not encoder_dict: + return None + + if all(k in encoder_dict for k in ("max_seq_length", "do_lower_case")): + return encoder_dict + return None + + def maybe_register_config_serialize_by_value(trust_remote_code: bool) -> None: """Try to register HF model configuration class to serialize by value @@ -305,20 +460,15 @@ def _reduce_modelconfig(mc: ModelConfig): exc_info=e) -def load_params_config(model, revision) -> PretrainedConfig: +def load_params_config(model: Union[str, Path], + revision: Optional[str], + token: Optional[str] = None) -> PretrainedConfig: # This function loads a params.json config which # should be used when loading models in mistral format config_file_name = "params.json" - config_path = Path(model) / config_file_name - - if not config_path.is_file(): - config_path = Path( - hf_hub_download(model, config_file_name, revision=revision)) - - with open(config_path) as file: - config_dict = json.load(file) + config_dict = get_hf_file_to_dict(config_file_name, model, revision, token) config_mapping = { "dim": "hidden_size", diff --git a/vllm/transformers_utils/tokenizer_group/__init__.py b/vllm/transformers_utils/tokenizer_group/__init__.py index 9a4149251d747..6a114b513f382 100644 --- a/vllm/transformers_utils/tokenizer_group/__init__.py +++ b/vllm/transformers_utils/tokenizer_group/__init__.py @@ -25,6 +25,11 @@ def init_tokenizer_from_configs(model_config: ModelConfig, trust_remote_code=model_config.trust_remote_code, revision=model_config.tokenizer_revision) + if (model_config.encoder_config is not None + and "do_lower_case" in model_config.encoder_config): + init_kwargs["do_lower_case"] = model_config.encoder_config[ + "do_lower_case"] + return get_tokenizer_group(parallel_config.tokenizer_pool_config, **init_kwargs) From 0dfba97b42032987fd6bd3d304ac22dd314c89b1 Mon Sep 17 00:00:00 2001 From: Lei Yang Date: Thu, 7 Nov 2024 17:07:19 +0800 Subject: [PATCH 022/183] [Frontend] Fix multiple values for keyword argument error (#10075) (#10076) Signed-off-by: Lei --- vllm/entrypoints/openai/serving_engine.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index e7aeac8f8c018..e31dc2ced61fb 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -443,29 +443,28 @@ async def _preprocess_chat( tokenizer, ) + _chat_template_kwargs: Dict[str, Any] = dict( + chat_template=chat_template, + add_generation_prompt=add_generation_prompt, + continue_final_message=continue_final_message, + tools=tool_dicts, + documents=documents, + ) + _chat_template_kwargs.update(chat_template_kwargs or {}) + request_prompt: Union[str, List[int]] is_mistral_tokenizer = isinstance(tokenizer, MistralTokenizer) if is_mistral_tokenizer: request_prompt = apply_mistral_chat_template( tokenizer, messages=messages, - chat_template=chat_template, - add_generation_prompt=add_generation_prompt, - continue_final_message=continue_final_message, - tools=tool_dicts, - documents=documents, - **(chat_template_kwargs or {}), + **_chat_template_kwargs, ) else: request_prompt 
= apply_hf_chat_template( tokenizer, conversation=conversation, - chat_template=chat_template, - add_generation_prompt=add_generation_prompt, - continue_final_message=continue_final_message, - tools=tool_dicts, - documents=documents, - **(chat_template_kwargs or {}), + **_chat_template_kwargs, ) mm_data = await mm_data_future From a6f332d0d9ac3e795949da7703f203b6b1a42797 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Thu, 7 Nov 2024 18:42:50 +0800 Subject: [PATCH 023/183] [Hardware][CPU][bugfix] Fix half dtype support on AVX2-only target (#10108) Signed-off-by: jiang1.li --- cmake/cpu_extension.cmake | 2 +- csrc/cpu/cpu_types_x86.hpp | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake index 776a0bb11ae64..5912c5c02ede7 100644 --- a/cmake/cpu_extension.cmake +++ b/cmake/cpu_extension.cmake @@ -93,7 +93,7 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED) FetchContent_Declare( oneDNN GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git - GIT_TAG v3.5.3 + GIT_TAG v3.6 GIT_PROGRESS TRUE GIT_SHALLOW TRUE ) diff --git a/csrc/cpu/cpu_types_x86.hpp b/csrc/cpu/cpu_types_x86.hpp index 12d5757b495be..4bb4eb0f491ac 100644 --- a/csrc/cpu/cpu_types_x86.hpp +++ b/csrc/cpu/cpu_types_x86.hpp @@ -432,6 +432,16 @@ struct FP32Vec16 : public Vec { explicit FP32Vec16(const FP32Vec8 &data) : reg_low(data.reg), reg_high(data.reg) {} + explicit FP32Vec16(const FP16Vec16 &v) { + __m128i low = _mm256_extractf128_si256(v.reg, 0); + __m128i high = _mm256_extractf128_si256(v.reg, 1); + + reg_low = _mm256_cvtph_ps(low); + reg_high = _mm256_cvtph_ps(high); + } + + explicit FP32Vec16(const FP16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} + explicit FP32Vec16(const BF16Vec16 &v) { __m128i low = _mm256_extractf128_si256(v.reg, 0); __m128i high = _mm256_extractf128_si256(v.reg, 1); From 999df95b4eefb920cd3539a7fa3a21b2911f3650 Mon Sep 17 00:00:00 2001 From: Jiahao Li Date: Thu, 7 Nov 2024 18:50:44 +0800 Subject: [PATCH 024/183] [Bugfix] Make image processor respect `mm_processor_kwargs` for Qwen2-VL (#10112) Signed-off-by: Jiahao Li --- vllm/model_executor/models/qwen2_vl.py | 33 ++++++++++++++++++-------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index af263262bd239..0e820cf123139 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -22,8 +22,8 @@ # limitations under the License. 
"""Inference-only Qwen2-VL model compatible with HuggingFace weights.""" from functools import partial -from typing import (Any, Callable, Iterable, List, Literal, Mapping, Optional, - Tuple, Type, TypedDict, Union) +from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, + Optional, Tuple, Type, TypedDict, Union) import torch import torch.nn as nn @@ -558,6 +558,17 @@ def forward( # === Vision input helpers === # +def get_mm_processor_kwargs( + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None) -> Dict[str, int]: + mm_processor_kwargs = {} + if min_pixels: + mm_processor_kwargs["min_pixels"] = min_pixels + if max_pixels: + mm_processor_kwargs["max_pixels"] = max_pixels + return mm_processor_kwargs + + def mm_input_mapper_for_qwen2_vl( ctx: InputContext, data: MultiModalData[object], @@ -575,12 +586,8 @@ def mm_input_mapper_for_qwen2_vl( model_config = ctx.model_config # Handle mm processor kwargs; we pass these at creation time # because preprocess() in transformers doesn't expose them - mm_processor_kwargs = {} - if min_pixels: - mm_processor_kwargs["min_pixels"] = min_pixels - if max_pixels: - mm_processor_kwargs["max_pixels"] = max_pixels - + mm_processor_kwargs = get_mm_processor_kwargs(min_pixels=min_pixels, + max_pixels=max_pixels) image_processor = cached_get_image_processor( model_config.model, trust_remote_code=model_config.trust_remote_code, @@ -683,7 +690,10 @@ def get_max_qwen2_vl_mm_tokens(ctx: InputContext, *, min_pixels=None, max_pixels=None) -> int: - image_processor = cached_get_image_processor(ctx.model_config.model) + mm_processor_kwargs = get_mm_processor_kwargs(min_pixels=min_pixels, + max_pixels=max_pixels) + image_processor = cached_get_image_processor(ctx.model_config.model, + **mm_processor_kwargs) max_resized_height, max_resized_width, max_llm_image_tokens = \ _get_max_image_info(image_processor, data_type_key=data_type_key, mm_count=1, min_pixels=min_pixels, @@ -705,7 +715,10 @@ def dummy_data_for_qwen2_vl( min_pixels: Optional[int] = None, max_pixels: Optional[int] = None ) -> Tuple[SequenceData, Optional[MultiModalDataDict]]: - image_processor = cached_get_image_processor(ctx.model_config.model) + mm_processor_kwargs = get_mm_processor_kwargs(min_pixels=min_pixels, + max_pixels=max_pixels) + image_processor = cached_get_image_processor(ctx.model_config.model, + **mm_processor_kwargs) num_images = mm_counts["image"] max_resized_height, max_resized_width, max_llm_image_tokens = \ From a62bc0109c3864b9dc770dc637e3acd332c730ea Mon Sep 17 00:00:00 2001 From: Atlas <163425173+spliii@users.noreply.github.com> Date: Thu, 7 Nov 2024 19:20:30 +0800 Subject: [PATCH 025/183] [Misc] Add Gamma-Distribution Request Generation Support for Serving Benchmark. (#10105) Signed-off-by: Mozhou Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> --- benchmarks/benchmark_serving.py | 57 ++++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 5 deletions(-) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index ff06622628219..bdb8ea8e2a5dc 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -297,8 +297,33 @@ def sample_random_requests( async def get_request( input_requests: List[Tuple[str, int, int]], request_rate: float, + burstiness: float = 1.0, ) -> AsyncGenerator[Tuple[str, int, int], None]: + """ + Asynchronously generates requests at a specified rate + with OPTIONAL burstiness. 
+ + Args: + input_requests: + A list of input requests, each represented as a tuple. + request_rate: + The rate at which requests are generated (requests/s). + burstiness (optional): + The burstiness factor of the request generation. + Only takes effect when request_rate is not inf. + Default value is 1, which follows a Poisson process. + Otherwise, the request intervals follow a gamma distribution. + A lower burstiness value (0 < burstiness < 1) results + in more bursty requests, while a higher burstiness value + (burstiness > 1) results in a more uniform arrival of requests. + """ input_requests = iter(input_requests) + + # Calculate scale parameter theta to maintain the desired request_rate. + assert burstiness > 0, ( + f"A positive burstiness factor is expected, but given {burstiness}.") + theta = 1.0 / (request_rate * burstiness) + for request in input_requests: yield request @@ -306,8 +331,9 @@ async def get_request( # If the request rate is infinity, then we don't need to wait. continue - # Sample the request interval from the exponential distribution. - interval = np.random.exponential(1.0 / request_rate) + # Sample the request interval from the gamma distribution. + # If burstiness is 1, it follows exponential distribution. + interval = np.random.gamma(shape=burstiness, scale=theta) # The next request will be sent after the interval. await asyncio.sleep(interval) @@ -426,6 +452,7 @@ async def benchmark( logprobs: Optional[int], best_of: int, request_rate: float, + burstiness: float, disable_tqdm: bool, profile: bool, selected_percentile_metrics: List[str], @@ -480,7 +507,13 @@ async def benchmark( if profile_output.success: print("Profiler started") + if burstiness == 1.0: + distribution = "Poisson process" + else: + distribution = "Gamma distribution" + print(f"Traffic request rate: {request_rate}") + print(f"Burstiness factor: {burstiness} ({distribution})") print(f"Maximum request concurrency: {max_concurrency}") pbar = None if disable_tqdm else tqdm(total=len(input_requests)) @@ -502,7 +535,7 @@ async def limited_request_func(request_func_input, pbar): benchmark_start_time = time.perf_counter() tasks: List[asyncio.Task] = [] - async for request in get_request(input_requests, request_rate): + async for request in get_request(input_requests, request_rate, burstiness): prompt, prompt_len, output_len, mm_content = request request_func_input = RequestFuncInput(model=model_id, prompt=prompt, @@ -769,6 +802,7 @@ def main(args: argparse.Namespace): logprobs=args.logprobs, best_of=args.best_of, request_rate=args.request_rate, + burstiness=args.burstiness, disable_tqdm=args.disable_tqdm, profile=args.profile, selected_percentile_metrics=args.percentile_metrics.split(","), @@ -807,6 +841,7 @@ def main(args: argparse.Namespace): # Traffic result_json["request_rate"] = ( args.request_rate if args.request_rate < float("inf") else "inf") + result_json["burstiness"] = args.burstiness result_json["max_concurrency"] = args.max_concurrency # Merge with benchmark result @@ -922,8 +957,20 @@ def main(args: argparse.Namespace): default=float("inf"), help="Number of requests per second. If this is inf, " "then all the requests are sent at time 0. " - "Otherwise, we use Poisson process to synthesize " - "the request arrival times.", + "Otherwise, we use Poisson process or gamma distribution " + "to synthesize the request arrival times.", + ) + parser.add_argument( + "--burstiness", + type=float, + default=1.0, + help="Burstiness factor of the request generation. 
" + "Only take effect when request_rate is not inf. " + "Default value is 1, which follows Poisson process. " + "Otherwise, the request intervals follow a gamma distribution. " + "A lower burstiness value (0 < burstiness < 1) results in more " + "bursty requests. A higher burstiness value (burstiness > 1) " + "results in a more uniform arrival of requests.", ) parser.add_argument("--seed", type=int, default=0) parser.add_argument( From ae62fd17c0023f7ec363c1141787b8c017937c44 Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Thu, 7 Nov 2024 12:09:02 -0300 Subject: [PATCH 026/183] [Frontend] Tool calling parser for Granite 3.0 models (#9027) Signed-off-by: Max de Bayser --- .../serving/openai_compatible_server.md | 44 ++-- examples/tool_chat_template_granite.jinja | 40 ++++ tests/tool_use/conftest.py | 6 + tests/tool_use/utils.py | 37 +-- .../openai/tool_parsers/__init__.py | 5 +- .../tool_parsers/granite_tool_parser.py | 215 ++++++++++++++++++ 6 files changed, 314 insertions(+), 33 deletions(-) create mode 100644 examples/tool_chat_template_granite.jinja create mode 100644 vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 0b5f75caf2475..a196f8b1e574e 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -160,14 +160,7 @@ this, unless explicitly specified. :func: create_parser_for_docs :prog: vllm serve ``` -## Tool Calling in the Chat Completion API -### Named Function Calling -vLLM supports only named function calling in the chat completion API by default. It does so using Outlines, so this is -enabled by default, and will work with any supported model. You are guaranteed a validly-parsable function call - not a -high-quality one. -To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and -specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request. ### Config file @@ -196,12 +189,22 @@ The order of priorities is `command line > config file values > defaults`. --- ## Tool calling in the chat completion API -vLLM supports only named function calling in the chat completion API. The `tool_choice` options `auto` and `required` are **not yet supported** but on the roadmap. + +vLLM supports named function calling and `auto` tool choice in the chat completion API. The `tool_choice` options `required` is **not yet supported** but on the roadmap. It is the callers responsibility to prompt the model with the tool information, vLLM will not automatically manipulate the prompt. + +### Named Function Calling +vLLM supports named function calling in the chat completion API by default. It does so using Outlines, so this is +enabled by default, and will work with any supported model. You are guaranteed a validly-parsable function call - not a +high-quality one. + vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter. +To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and +specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request. + ### Automatic Function Calling To enable this feature, you should set the following flags: @@ -275,6 +278,21 @@ it works better with vLLM. 
Recommended flags: `--tool-call-parser llama3_json --chat-template examples/tool_chat_template_llama3_json.jinja` +#### IBM Granite + +Supported models: +* `ibm-granite/granite-3.0-8b-instruct` + +Recommended flags: `--tool-call-parser granite --chat-template examples/tool_chat_template_granite.jinja` + +`examples/tool_chat_template_granite.jinja`: this is a modified chat template from the original on Huggingface. Parallel function calls are supported. + +* `ibm-granite/granite-20b-functioncalling` + +Recommended flags: `--tool-call-parser granite-20b-fc --chat-template examples/tool_chat_template_granite_20b_fc.jinja` + +`examples/tool_chat_template_granite_20b_fc.jinja`: this is a modified chat template from the original on Huggingface, which is not vLLM compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported. + #### InternLM Models (`internlm`) @@ -297,16 +315,6 @@ AI21's Jamba-1.5 models are supported. Flags: `--tool-call-parser jamba` -#### IBM Granite (`granite-20b-fc`) - -Supported models: -* `ibm-granite/granite-20b-functioncalling` - -Flags: `--tool-call-parser granite-20b-fc --chat-template examples/tool_chat_template_granite_20b_fc.jinja` - -The example chat template deviates slightly from the original on Huggingface, which is not vLLM compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported. - - ### How to write a tool parser plugin A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py. 
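To make the plugin section above more concrete, here is a hedged, minimal sketch of what such a file can look like, modeled on the GraniteToolParser introduced by this patch; the "example" registry name, the class name, and the toy JSON-list output format are illustrative assumptions, not part of vLLM:

import json
from typing import Sequence, Union

from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              DeltaMessage,
                                              ExtractedToolCallInformation,
                                              FunctionCall, ToolCall)
from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
    ToolParser, ToolParserManager)


@ToolParserManager.register_module("example")
class ExampleToolParser(ToolParser):
    """Toy parser: assumes the model emits a bare JSON list of
    {"name": ..., "arguments": {...}} objects and nothing else."""

    def extract_tool_calls(
            self, model_output: str,
            request: ChatCompletionRequest) -> ExtractedToolCallInformation:
        try:
            calls = json.loads(model_output)
            tool_calls = [
                ToolCall(type="function",
                         function=FunctionCall(
                             name=call["name"],
                             arguments=json.dumps(call["arguments"])))
                for call in calls
            ]
            return ExtractedToolCallInformation(tools_called=True,
                                                tool_calls=tool_calls,
                                                content=None)
        except Exception:
            # Not tool-call JSON: return the text as ordinary content.
            return ExtractedToolCallInformation(tools_called=False,
                                                tool_calls=[],
                                                content=model_output)

    def extract_tool_calls_streaming(
            self, previous_text: str, current_text: str, delta_text: str,
            previous_token_ids: Sequence[int],
            current_token_ids: Sequence[int],
            delta_token_ids: Sequence[int],
            request: ChatCompletionRequest) -> Union[DeltaMessage, None]:
        # A full implementation would emit DeltaToolCall chunks, as the
        # granite parser below does; here the delta is passed through.
        return DeltaMessage(content=delta_text)

Registered this way, the parser becomes selectable by name via --tool-call-parser example once the file is loaded as a plugin.
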
diff --git a/examples/tool_chat_template_granite.jinja b/examples/tool_chat_template_granite.jinja new file mode 100644 index 0000000000000..2cc19e77188dc --- /dev/null +++ b/examples/tool_chat_template_granite.jinja @@ -0,0 +1,40 @@ +{%- if tools %} + {{- '<|start_of_role|>available_tools<|end_of_role|> +' }} + {%- for tool in tools %} + {{- tool | tojson(indent=4) }} + {%- if not loop.last %} + {{- ' + +' }} + {%- endif %} + {%- endfor %} + {{- '<|end_of_text|> +' }} +{%- endif %} + +{%- for message in messages %} + {%- if message['role'] == 'system' %} + {{- '<|start_of_role|>system<|end_of_role|>' + message['content'] + '<|end_of_text|> +' }} + {%- elif message['role'] == 'user' %} + {{- '<|start_of_role|>user<|end_of_role|>' + message['content'] + '<|end_of_text|> +' }} + {%- elif message['role'] == 'assistant_tool_call' or (message['role'] == 'assistant' and message.tool_calls is defined) %} + {{- '<|start_of_role|>assistant<|end_of_role|>' }} + {% for tc in message.tool_calls %} + {{- '<|tool_call|> ' + {'name': tc.function.name, 'arguments': tc.function.arguments}|tojson }} + {% endfor %} + {{- '<|end_of_text|> +' }} + {%- elif message['role'] == 'assistant' %} + {{- '<|start_of_role|>assistant<|end_of_role|>' + message['content'] + '<|end_of_text|> +' }} + {%- elif message['role'] == 'tool_response' or message['role'] == 'tool' %} + {{- '<|start_of_role|>tool_response<|end_of_role|>' + message['content'] + '<|end_of_text|> +' }} + {%- endif %} + {%- if loop.last and add_generation_prompt %} + {{- '<|start_of_role|>assistant<|end_of_role|>' }} + {%- endif %} +{%- endfor %} diff --git a/tests/tool_use/conftest.py b/tests/tool_use/conftest.py index ab6a29eba1b3f..294acf202a232 100644 --- a/tests/tool_use/conftest.py +++ b/tests/tool_use/conftest.py @@ -3,6 +3,7 @@ from huggingface_hub import snapshot_download from tests.utils import RemoteOpenAIServer +from vllm.platforms import current_platform from .utils import ARGS, CONFIGS, ServerConfig @@ -11,6 +12,11 @@ @pytest.fixture(scope="session", params=CONFIGS.keys()) def server_config(request): config = CONFIGS[request.param] + + if current_platform.is_rocm() and not config.get("supports_rocm", True): + pytest.skip("The {} model can't be tested on the ROCm platform".format( + config["model"])) + # download model and tokenizer using transformers snapshot_download(config["model"]) yield CONFIGS[request.param] diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py index d9ee0b1d54b0a..576555b368afe 100644 --- a/tests/tool_use/utils.py +++ b/tests/tool_use/utils.py @@ -13,6 +13,7 @@ class ServerConfig(TypedDict, total=False): arguments: List[str] system_prompt: Optional[str] supports_parallel: Optional[bool] + supports_rocm: Optional[bool] def patch_system_prompt(messages: List[Dict[str, Any]], @@ -36,7 +37,7 @@ def ensure_system_prompt(messages: List[Dict[str, Any]], # universal args for all models go here. also good if you need to test locally # and change type or KV cache quantization or something. -ARGS: List[str] = ["--enable-auto-tool-choice", "--max-model-len", "8096"] +ARGS: List[str] = ["--enable-auto-tool-choice", "--max-model-len", "1024"] CONFIGS: Dict[str, ServerConfig] = { "hermes": { @@ -88,18 +89,28 @@ def ensure_system_prompt(messages: List[Dict[str, Any]], "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT " "to the user's question - just respond to it normally." 
}, - ## FIXME: temporary disabled due to lack of hardware specification - ## for individual runs - #"granite20b": { - # "model": - # "ibm-granite/granite-20b-functioncalling", - # "arguments": [ - # "--tool-call-parser", "granite-20b-fc", "--chat-template", - # str(VLLM_PATH / "examples/tool_chat_template_granite_20b_fc.jinja") - # ], - # "supports_parallel": - # False, - #}, + "granite20b": { + "model": + "mbayser/granite-20b-functioncalling-FP8-KV", + "arguments": [ + "--tool-call-parser", "granite-20b-fc", "--chat-template", + str(VLLM_PATH / + "examples/tool_chat_template_granite_20b_fc.jinja"), + "--max_num_seqs", "1", "--enforce-eager", "--cpu-offload-gb", "20" + ], + "supports_parallel": + False, + "supports_rocm": + False, + }, + "granite8b": { + "model": + "ibm-granite/granite-3.0-8b-instruct", + "arguments": [ + "--tool-call-parser", "granite", "--chat-template", + str(VLLM_PATH / "examples/tool_chat_template_granite.jinja") + ], + }, "internlm": { "model": "internlm/internlm2_5-7b-chat", diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py index 1b299ce655570..2187862e8380b 100644 --- a/vllm/entrypoints/openai/tool_parsers/__init__.py +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -1,5 +1,6 @@ from .abstract_tool_parser import ToolParser, ToolParserManager from .granite_20b_fc_tool_parser import Granite20bFCToolParser +from .granite_tool_parser import GraniteToolParser from .hermes_tool_parser import Hermes2ProToolParser from .internlm2_tool_parser import Internlm2ToolParser from .jamba_tool_parser import JambaToolParser @@ -8,6 +9,6 @@ __all__ = [ "ToolParser", "ToolParserManager", "Granite20bFCToolParser", - "Hermes2ProToolParser", "MistralToolParser", "Internlm2ToolParser", - "Llama3JsonToolParser", "JambaToolParser" + "GraniteToolParser", "Hermes2ProToolParser", "MistralToolParser", + "Internlm2ToolParser", "Llama3JsonToolParser", "JambaToolParser" ] diff --git a/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py new file mode 100644 index 0000000000000..b5854ca39ab47 --- /dev/null +++ b/vllm/entrypoints/openai/tool_parsers/granite_tool_parser.py @@ -0,0 +1,215 @@ +import json +from typing import Dict, Sequence, Union + +import partial_json_parser +from partial_json_parser.core.options import Allow + +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaFunctionCall, DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, ToolCall) +from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( + ToolParser, ToolParserManager) +from vllm.entrypoints.openai.tool_parsers.utils import (consume_space, + find_common_prefix, + is_complete_json, + partial_json_loads) +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.utils import random_uuid + +logger = init_logger(__name__) + + +@ToolParserManager.register_module("granite") +class GraniteToolParser(ToolParser): + """ + Tool call parser for the granite 3.0 models. Intended + for use with the examples/tool_chat_template_granite.jinja + template. 
+ + Used when --enable-auto-tool-choice --tool-call-parser granite + are all set + """ + + def __init__(self, tokenizer: AnyTokenizer): + super().__init__(tokenizer) + + def extract_tool_calls( + self, model_output: str, + request: ChatCompletionRequest) -> ExtractedToolCallInformation: + stripped = model_output.strip() + if not stripped or stripped[0] != '[': + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + try: + raw_function_calls = json.loads(stripped) + if not isinstance(raw_function_calls, list): + raise Exception( + f"Expected dict or list, got {type(raw_function_calls)}") + + logger.debug("Extracted %d tool calls", len(raw_function_calls)) + tool_calls = [ + ToolCall( + type="function", + function=FunctionCall( + name=function_call["name"], + # function call args are JSON but as a string + arguments=json.dumps(function_call["arguments"]), + ), + ) for function_call in raw_function_calls + ] + + return ExtractedToolCallInformation( + tools_called=True, + tool_calls=tool_calls, + content=None, + ) + + except Exception as e: + logger.error("Error in extracting tool call from response %s", e) + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + request: ChatCompletionRequest, + ) -> Union[DeltaMessage, None]: + + start_idx = consume_space(0, current_text) + if not current_text or current_text[start_idx] != '[': + return DeltaMessage(content=delta_text) + + # bit mask flags for partial JSON parsing. If the name hasn't been + # sent yet, don't allow sending + # an incomplete string since OpenAI only ever (as far as I have + # seen) allows sending the entire tool/ function name at once. + flags = Allow.ALL if self.current_tool_name_sent \ + else Allow.ALL & ~Allow.STR + try: + tool_call_arr = None + is_complete = None + try: + tool_calls, end_idx = partial_json_loads( + current_text[start_idx:], flags) + if type(tool_calls) is list: + tool_call_arr = tool_calls + else: + return DeltaMessage(content=delta_text) + + is_complete = [True] * len(tool_calls) + if not is_complete_json( + current_text[start_idx:start_idx + end_idx]): + is_complete[-1] = False + except partial_json_parser.core.exceptions.MalformedJSON: + logger.debug('not enough tokens to parse into JSON yet') + return None + + # case -- if no tokens have been streamed for the tool, e.g. + # only the array brackets, stream nothing + if not tool_call_arr: + return None + + # select as the current tool call the one we're on the state at + current_tool_call: Dict = tool_call_arr[self.current_tool_id] + + delta = None + # case: we are starting a new tool in the array + # -> array has > 0 length AND length has moved past cursor + if len(tool_call_arr) > self.current_tool_id + 1: + + # if we're moving on to a new call, first make sure we + # haven't missed anything in the previous one that was + # auto-generated due to JSON completions, but wasn't + # streamed to the client yet. 
+ if self.current_tool_id >= 0: + cur_arguments = current_tool_call.get("arguments") + if cur_arguments: + cur_args_json = json.dumps(cur_arguments) + sent = len( + self.streamed_args_for_tool[self.current_tool_id]) + argument_diff = cur_args_json[sent:] + + logger.debug("got arguments diff: %s", argument_diff) + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=argument_diff). + model_dump(exclude_none=True)) + ]) + self.streamed_args_for_tool[ + self.current_tool_id] += argument_diff + + # re-set stuff pertaining to progress in the current tool + self.current_tool_id = len(tool_call_arr) - 1 + self.current_tool_name_sent = False + self.streamed_args_for_tool.append("") + logger.debug("starting on new tool %d", self.current_tool_id) + return delta + + # if the current tool name hasn't been sent, send if available + # - otherwise send nothing + elif not self.current_tool_name_sent: + function_name = current_tool_call.get("name") + if function_name: + + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + type="function", + id=f"chatcmpl-tool-{random_uuid()}", + function=DeltaFunctionCall( + name=function_name).model_dump( + exclude_none=True)) + ]) + self.current_tool_name_sent = True + + # now we know we're on the same tool call and we're streaming + # arguments + else: + cur_arguments = current_tool_call.get("arguments") + + if cur_arguments: + sent = len( + self.streamed_args_for_tool[self.current_tool_id]) + cur_args_json = json.dumps(cur_arguments) + prev_arguments = self.prev_tool_call_arr[ + self.current_tool_id].get("arguments") + + argument_diff = None + if is_complete[self.current_tool_id]: + argument_diff = cur_args_json[sent:] + elif prev_arguments: + prev_args_json = json.dumps(prev_arguments) + if cur_args_json != prev_args_json: + prefix = find_common_prefix( + prev_args_json, cur_args_json) + argument_diff = prefix[sent:] + + if argument_diff is not None: + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=argument_diff). 
+ model_dump(exclude_none=True)) + ]) + self.streamed_args_for_tool[ + self.current_tool_id] += argument_diff + + self.prev_tool_call_arr = tool_call_arr + return delta + + except Exception as e: + logger.error("Error trying to handle streaming tool call: %s", e) + logger.debug( + "Skipping chunk as a result of tool streaming extraction " + "error") + return None From 9d43afcc538645625ea5fc2bca01d3697dd0595c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicol=C3=B2=20Lucchesi?= Date: Thu, 7 Nov 2024 17:15:14 +0100 Subject: [PATCH 027/183] [Feature] [Spec decode]: Combine chunked prefill with speculative decoding (#9291) Signed-off-by: NickLucche --- tests/spec_decode/e2e/test_compatibility.py | 34 ------ .../e2e/test_multistep_correctness.py | 105 ++++++++++++++++- .../spec_decode/e2e/test_ngram_correctness.py | 36 +++++- tests/spec_decode/test_ngram_worker.py | 9 +- tests/spec_decode/test_scorer.py | 31 ++++- tests/spec_decode/test_spec_decode_worker.py | 82 +++++++++++++ tests/spec_decode/utils.py | 71 +++++++++-- vllm/attention/backends/flash_attn.py | 10 +- vllm/attention/backends/rocm_flash_attn.py | 6 + vllm/attention/backends/xformers.py | 7 ++ vllm/config.py | 14 +-- vllm/core/scheduler.py | 1 + vllm/engine/output_processor/multi_step.py | 8 +- vllm/spec_decode/batch_expansion.py | 61 +++++----- vllm/spec_decode/mqa_scorer.py | 31 +++-- vllm/spec_decode/spec_decode_worker.py | 110 +++++++++++++----- vllm/spec_decode/top1_proposer.py | 8 +- 17 files changed, 477 insertions(+), 147 deletions(-) diff --git a/tests/spec_decode/e2e/test_compatibility.py b/tests/spec_decode/e2e/test_compatibility.py index 629074188a6c1..af8397c235f48 100644 --- a/tests/spec_decode/e2e/test_compatibility.py +++ b/tests/spec_decode/e2e/test_compatibility.py @@ -5,40 +5,6 @@ from .conftest import get_output_from_llm_generator -@pytest.mark.parametrize("common_llm_kwargs", [{ - "model": "JackFram/llama-68m", - "speculative_model": "JackFram/llama-68m", - "num_speculative_tokens": 5, -}]) -@pytest.mark.parametrize("per_test_common_llm_kwargs", [ - { - "enable_chunked_prefill": True, - }, -]) -@pytest.mark.parametrize("test_llm_kwargs", [{}]) -@pytest.mark.parametrize("seed", [1]) -def test_spec_decode_xfail_chunked_prefill(test_llm_generator): - """Verify that speculative decoding with chunked prefill fails. - """ - output_len = 128 - temperature = 0.0 - - prompts = [ - "Hello, my name is", - ] - - sampling_params = SamplingParams( - max_tokens=output_len, - ignore_eos=True, - temperature=temperature, - ) - - with pytest.raises(ValueError, - match="Speculative decoding and chunked prefill"): - get_output_from_llm_generator(test_llm_generator, prompts, - sampling_params) - - @pytest.mark.parametrize("common_llm_kwargs", [{ "model": "meta-llama/Llama-2-7b-chat-hf", "speculative_model": "JackFram/llama-68m", diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py index 5f240d42d9e09..a13cca41f99e5 100644 --- a/tests/spec_decode/e2e/test_multistep_correctness.py +++ b/tests/spec_decode/e2e/test_multistep_correctness.py @@ -62,6 +62,16 @@ { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, + "enable_chunked_prefill": False, + }, + { + # Chunked prefill enabled with small value + # to make sure we get mixed batches. 
+ "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4 }, { # Verify the detokenizer assertions in the test work when spec @@ -141,6 +151,14 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator, { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, + "enable_chunked_prefill": False, + }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4, }, ]) @pytest.mark.parametrize( @@ -204,6 +222,14 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1( { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, + "enable_chunked_prefill": False, + }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4 }, ]) @pytest.mark.parametrize( @@ -255,6 +281,14 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs( { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, + "enable_chunked_prefill": False, + }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4 }, ]) @pytest.mark.parametrize("max_output_len", [ @@ -300,6 +334,14 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len( { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, + "enable_chunked_prefill": False, + }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4 }, ]) @pytest.mark.parametrize("batch_size", [1]) @@ -347,6 +389,14 @@ def test_spec_decode_e2e_greedy_correctness_real_model_bs1( { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, + "enable_chunked_prefill": False, + }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4 }, ]) @pytest.mark.parametrize("batch_size", [32]) @@ -397,6 +447,14 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs( { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, + "enable_chunked_prefill": False, + }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4 }, ]) @pytest.mark.parametrize( @@ -454,6 +512,14 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption( { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, + "enable_chunked_prefill": False, + }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4 }, ]) @pytest.mark.parametrize("batch_size", [2]) @@ -503,6 +569,15 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs, # Artificially limit the draft model max model len; this forces vLLM # to skip speculation once the sequences grow beyond 32-k tokens. 
"speculative_max_model_len": 32, + "enable_chunked_prefill": False, + }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4, + "speculative_max_model_len": 32, }, ]) @pytest.mark.parametrize("batch_size", [8]) @@ -551,6 +626,15 @@ def test_skip_speculation(vllm_runner, common_llm_kwargs, "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": 5, "speculative_disable_by_batch_size": 2, + "enable_chunked_prefill": False, + }, + { + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "speculative_disable_by_batch_size": 2, + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4, }, ]) @pytest.mark.parametrize("batch_size", [8]) @@ -590,10 +674,17 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs, { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": k, + "enable_chunked_prefill": False, } # Try a range of common k, as well as large speculation. for k in [1, 2, 3, 4, 5, 6, 7, 8, 9, 63] - ]) + ] + [{ + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": k, + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4, + } for k in [1, 2, 3, 4, 5, 6, 7, 8, 9, 63]]) @pytest.mark.parametrize("batch_size", [2]) @pytest.mark.parametrize( "output_len", @@ -636,11 +727,19 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, { "speculative_model": "JackFram/llama-68m", "num_speculative_tokens": k, - "spec_decoding_acceptance_method": "typical_acceptance_sampler" + "spec_decoding_acceptance_method": "typical_acceptance_sampler", + "enable_chunked_prefill": False } # Try a range of common k. 
for k in [1, 2, 3] - ]) + ] + [{ + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": k, + "spec_decoding_acceptance_method": "typical_acceptance_sampler", + "enable_chunked_prefill": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4 + } for k in [1, 2, 3]]) @pytest.mark.parametrize("batch_size", [1, 32]) @pytest.mark.parametrize( "output_len", diff --git a/tests/spec_decode/e2e/test_ngram_correctness.py b/tests/spec_decode/e2e/test_ngram_correctness.py index 31bedad480283..e53d169a8fcc3 100644 --- a/tests/spec_decode/e2e/test_ngram_correctness.py +++ b/tests/spec_decode/e2e/test_ngram_correctness.py @@ -50,18 +50,33 @@ "num_speculative_tokens": 5, "ngram_prompt_lookup_max": 3, }, + { + "speculative_model": "[ngram]", + "num_speculative_tokens": 5, + "ngram_prompt_lookup_max": 3, + }, ]) @pytest.mark.parametrize("output_len", [ 256, ]) @pytest.mark.parametrize("batch_size", [1, 32]) +@pytest.mark.parametrize("prefill_chunk_size", [-1, 4]) @pytest.mark.parametrize("seed", [1]) def test_ngram_e2e_greedy_correctness(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, baseline_llm_kwargs, test_llm_kwargs, batch_size: int, output_len: int, - seed: int): + prefill_chunk_size: int, seed: int): """Verify greedy equality on a tiny model with different batch size.""" + if prefill_chunk_size > 0: + common_llm_kwargs.update( + **{ + "enable_chunked_prefill": True, + "max_num_batched_tokens": prefill_chunk_size, + "max_num_seqs": prefill_chunk_size + }) + else: + common_llm_kwargs["enable_chunked_prefill"] = False run_equality_correctness_test(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs, @@ -151,6 +166,16 @@ def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs, "speculative_model": "[ngram]", "num_speculative_tokens": 5, "ngram_prompt_lookup_max": 3, + "enable_chunked_prefill": False, + }, + { + "speculative_model": "[ngram]", + "num_speculative_tokens": 5, + "ngram_prompt_lookup_max": 3, + "enable_chunked_prefill": True, + "speculative_disable_mqa_scorer": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4 }, ]) @pytest.mark.parametrize( @@ -251,6 +276,15 @@ def test_ngram_different_k(vllm_runner, common_llm_kwargs, "num_speculative_tokens": 5, "ngram_prompt_lookup_max": 3, "speculative_disable_by_batch_size": 4 + }, { + "speculative_model": "[ngram]", + "num_speculative_tokens": 5, + "ngram_prompt_lookup_max": 3, + "speculative_disable_by_batch_size": 4, + "enable_chunked_prefill": True, + "speculative_disable_mqa_scorer": True, + "max_num_batched_tokens": 4, + "max_num_seqs": 4 }]) @pytest.mark.parametrize("batch_size", [1, 5]) @pytest.mark.parametrize( diff --git a/tests/spec_decode/test_ngram_worker.py b/tests/spec_decode/test_ngram_worker.py index 3995f87898afb..f66e957186604 100644 --- a/tests/spec_decode/test_ngram_worker.py +++ b/tests/spec_decode/test_ngram_worker.py @@ -118,7 +118,8 @@ def test_ngram_algo_correctness_for_batches_not_match_all(): num_gpu_blocks, block_size, final_prompt_lens=final_prompt_lens) - + for sg in seq_group_metadata_list: + sg.is_prompt = False proposals = proposer.get_spec_proposals( execute_model_req=ExecuteModelRequest( seq_group_metadata_list=seq_group_metadata_list, @@ -147,7 +148,7 @@ def test_ngram_algo_correctness_for_batches_not_match_all(): def test_ngram_algo_correctness_for_batches_match_all(): """Verify our ngram algo find the right candidate in the prompt - For the scenario find candidate in all batchs + For the scenario find candidate in all batches """ block_size = 32 @@ -192,6 
+193,10 @@ def test_ngram_algo_correctness_for_batches_match_all(): block_size, final_prompt_lens=final_prompt_lens) + # Normally drafter is run on decode requests only; here we check the output + # of the ngram worker as it is the sole proposer that has no forward. + for sg in seq_group_metadata_list: + sg.is_prompt = False proposals = proposer.get_spec_proposals( execute_model_req=ExecuteModelRequest( seq_group_metadata_list=seq_group_metadata_list, diff --git a/tests/spec_decode/test_scorer.py b/tests/spec_decode/test_scorer.py index e579c8b38db91..0b1509d8b7785 100644 --- a/tests/spec_decode/test_scorer.py +++ b/tests/spec_decode/test_scorer.py @@ -46,12 +46,14 @@ def assert_score_equal(score1: SpeculativeScores, @pytest.mark.parametrize('max_propose_len', [1, 3, 5]) @pytest.mark.parametrize('mixed_propose_len', [True]) @pytest.mark.parametrize('device', ['cuda']) +@pytest.mark.parametrize('prefill_chunking', [False, True]) def test_scorer(model_name: str, batch_size: int, max_propose_len: int, - mixed_propose_len: bool, device: str) -> None: + mixed_propose_len: bool, device: str, + prefill_chunking: bool) -> None: """ Compare the batch expansion scorer and mqa scorer return the same score. We test for both queries with the same propose length and different - propose length. + propose length, as well as mixed prefill-decode batches. """ seed = 0 block_size = 32 @@ -67,16 +69,37 @@ def test_scorer(model_name: str, batch_size: int, max_propose_len: int, if not mixed_propose_len: propose_lens = [max_propose_len] * batch_size else: - non_zero_cnt = random.randint(0, batch_size) + # There must be at least 1 decode request, otherwise + # we have nothing to score (`_run_no_spec`). + non_zero_cnt = random.randint(1, batch_size) propose_lens = [max_propose_len ] * non_zero_cnt + [0] * (batch_size - non_zero_cnt) random.shuffle(propose_lens) - proposals = create_proposal(propose_lens, vocab_size, device) seq_group_metadatalist, _, _ = create_batch(batch_size, max_propose_len, block_size=block_size, num_gpu_blocks=num_gpu_blocks) + + if mixed_propose_len and prefill_chunking and (n_prefills := + batch_size - non_zero_cnt): + prefill, _, _ = create_batch(n_prefills, + None, + prefill_chunk_size=4, + block_size=block_size, + num_gpu_blocks=num_gpu_blocks, + seq_ids=list( + range(batch_size, + batch_size + n_prefills))) + # re-order to guarantee prefill|decode order + target_group_metadatalist = [ + seq_group_metadatalist[i] for i, p in enumerate(propose_lens) + if p > 0 + ] + seq_group_metadatalist = prefill + target_group_metadatalist + propose_lens = [0] * n_prefills + [p for p in propose_lens if p > 0] + + proposals = create_proposal(propose_lens, vocab_size, device) requests = ExecuteModelRequest(seq_group_metadatalist, num_lookahead_slots=max_propose_len) diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py index e0b7b7d47f1f1..8df143104c279 100644 --- a/tests/spec_decode/test_spec_decode_worker.py +++ b/tests/spec_decode/test_spec_decode_worker.py @@ -10,6 +10,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.utils import set_random_seed from vllm.sequence import ExecuteModelRequest, SequenceOutput +from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer from vllm.spec_decode.interfaces import SpeculativeProposals from vllm.spec_decode.metrics import (AsyncMetricsCollector, SpecDecodeWorkerMetrics) @@ -819,3 +820,84 @@ def test_handle_finished_requests(): # and 'request-3' are removed 
from seq_with_bonus_token_in_last_step. assert worker._seq_with_bonus_token_in_last_step == \ {4,5,10} + + +@pytest.mark.parametrize('k', [3]) +@pytest.mark.parametrize('batch_size', [2, 32]) +@pytest.mark.parametrize("batch_composition", + ["prefill_only", "decode_only", "mixed"]) +@torch.inference_mode() +def test_chunked_prefill_flow(k: int, batch_size: int, batch_composition: str): + """ + Verify SpecDecodeWorker calls match the expected flow. + """ + vocab_size = 32_000 + draft_worker = mock_worker(cls=MultiStepWorker) + target_worker = mock_worker() + metrics_collector = MagicMock(spec=AsyncMetricsCollector) + worker = SpecDecodeWorker(draft_worker, + target_worker, + mock_spec_decode_sampler("rejection_sampler"), + disable_logprobs=False, + metrics_collector=metrics_collector) + exception_secret = 'artificial stop' + worker.scorer = mock_worker(BatchExpansionTop1Scorer) + worker.scorer.score_proposals.side_effect = ValueError(exception_secret) + + # Create batch with combination of terminal/non-terminal prefill chunks + # and decodes (different seq_ids). + decodes, _, _ = create_batch(batch_size, k) + # Pre-chunking here, get 'batch_size' chunks. + prefill, _, _ = create_batch(batch_size, + k, + prefill_chunk_size=4, + seq_ids=list(range(batch_size, + batch_size * 2))) + + if batch_composition == "prefill_only": + n_prefills = batch_size + elif batch_composition == "decode_only": + n_prefills = 0 + else: + n_prefills = random.randint(1, batch_size - 1) + n_decodes = batch_size - n_prefills + + prefill = random.sample(prefill, n_prefills) + decodes = random.sample(decodes, n_decodes) + target_group_metadata_list = prefill + decodes + execute_model_req = ExecuteModelRequest( + seq_group_metadata_list=target_group_metadata_list, + num_lookahead_slots=k) + + target_token_ids = torch.randint(low=0, + high=vocab_size, + size=(1, batch_size * (k + 1)), + dtype=torch.int64, + device='cuda') + target_token_probs = torch.rand(1, + batch_size * (k + 1), + vocab_size, + dtype=torch.float32, + device='cuda') + target_token_logprobs = torch.rand(1, + batch_size * (k + 1), + vocab_size, + dtype=torch.float32, + device='cuda') + target_output = create_sampler_output_list(target_token_ids, + target_token_probs, + target_token_logprobs) + + target_worker.execute_model.return_value = [target_output[0]] + + if not len(decodes): + worker.execute_model(execute_model_req=execute_model_req) + # no spec run (prefill only) + draft_worker.execute_model.assert_called_once_with(execute_model_req) + target_worker.execute_model.assert_called_once_with(execute_model_req) + else: + # Decode-only run OR mixed batch, scorer call fails (it's mocked) + with pytest.raises(ValueError, match=exception_secret): + worker.execute_model(execute_model_req=execute_model_req) + # but first draft still counted + assert draft_worker.get_spec_proposals.call_count == 1 diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py index e5cb0530f9961..a4bfa6b2f384b 100644 --- a/tests/spec_decode/utils.py +++ b/tests/spec_decode/utils.py @@ -146,6 +146,41 @@ def create_seq_group_metadata_from_prompts( return seq_grou_metadata_list +def create_chunked_seq_group_metadata_from_prompt( + prompt: List[int], + num_gpu_blocks: int, + chunk_size: int, + block_size: int, + seq_id: Optional[int] = None) -> List[SequenceGroupMetadata]: + + if seq_id is None: + seq_id = 0 + + free_gpu_blocks = list(range(num_gpu_blocks)) + + block_allocations = [ + free_gpu_blocks.pop() + for _ in range(round_up_to_next_block(len(prompt), block_size)) + ] 
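As an aside on the chunking helper being added here, a quick standalone sketch (not part of the patch; the 10-token prompt and chunk size of 4 are made up) of how a prompt is cut into chunks and which chunk ends up with do_sample=True:

    # A 10-token prompt with chunk_size=4 is split at offsets [0, 4, 8] into chunks of
    # 4, 4 and 2 tokens; only the final chunk samples, mirroring the condition
    # `idx + chunk_size >= len(prompt)` used below in
    # create_chunked_seq_group_metadata_from_prompt.
    prompt_len, chunk_size = 10, 4
    chunks = [(idx, min(chunk_size, prompt_len - idx))
              for idx in range(0, prompt_len, chunk_size)]
    assert chunks == [(0, 4), (4, 4), (8, 2)]
    assert [idx + chunk_size >= prompt_len for idx, _ in chunks] == [False, False, True]
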
+ + seq_group_metadata_list = [] + for i, idx in enumerate(range(0, len(prompt), chunk_size)): + chunk_ids = prompt[idx:idx + chunk_size] + data = SequenceData.from_seqs(prompt) + data.update_num_computed_tokens(idx) + seq_data = {i: data} + seq_group_metadata_list.append( + SequenceGroupMetadata( + request_id=str(seq_id), + is_prompt=True, + do_sample=idx + chunk_size >= len(prompt), # terminal chunk + seq_data=seq_data, + sampling_params=SamplingParams(temperature=0.0), + block_tables={i: block_allocations}, + token_chunk_size=len(chunk_ids))) + return seq_group_metadata_list + + def assert_logprobs_dict_allclose( actual_logprobs: List[Dict[int, Logprob]], expected_logprobs: List[Dict[int, Logprob]]) -> None: @@ -198,7 +233,8 @@ def create_batch(batch_size, prev_output_token_len: int = 10, seq_ids: Optional[List[int]] = None, num_gpu_blocks: Optional[int] = None, - block_size: Optional[int] = None): + block_size: Optional[int] = None, + prefill_chunk_size: Optional[int] = None): if block_size is None: block_size = 8 @@ -213,15 +249,28 @@ def create_batch(batch_size, prompt_lens = prompt_len prompts = [[next(iterator) for _ in range(p_len)] for p_len in prompt_lens] - prev_output_tokens = [[ - next(iterator) for _ in range(prev_output_token_len) - ] for _ in range(batch_size)] - final_prompt_lens = [ - len(prompt) + len(prev_output_token) + k + 1 - for prompt, prev_output_token in zip(prompts, prev_output_tokens) - ] - seq_group_metadata_list = create_seq_group_metadata_from_prompts( - prompts, num_gpu_blocks, block_size, final_prompt_lens, - prev_output_tokens, seq_ids) + if prefill_chunk_size: + # Create a batch of chunked prompts. + if not seq_ids: + seq_ids = list(range(len(prompts))) + seq_group_metadata_list = [] + for p, sid in zip(prompts, seq_ids): + seq_group_metadata_list += \ + create_chunked_seq_group_metadata_from_prompt( + p, num_gpu_blocks, prefill_chunk_size, block_size, sid) + seq_group_metadata_list = seq_group_metadata_list[:batch_size] + prev_output_tokens = [] + else: + prev_output_tokens = [[ + next(iterator) for _ in range(prev_output_token_len) + ] for _ in range(batch_size)] + final_prompt_lens = [ + len(prompt) + len(prev_output_token) + k + 1 + for prompt, prev_output_token in zip(prompts, prev_output_tokens) + ] + + seq_group_metadata_list = create_seq_group_metadata_from_prompts( + prompts, num_gpu_blocks, block_size, final_prompt_lens, + prev_output_tokens, seq_ids) return seq_group_metadata_list, prompts, prev_output_tokens diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py index 26da0d89def29..314822b695722 100644 --- a/vllm/attention/backends/flash_attn.py +++ b/vllm/attention/backends/flash_attn.py @@ -276,7 +276,11 @@ def decode_metadata(self) -> Optional["FlashAttentionMetadata"]: max_query_len=self.max_query_len, max_prefill_seq_len=0, max_decode_seq_len=self.max_decode_seq_len, - query_start_loc=self.query_start_loc[self.num_prefills:] + # Batch may be composed of prefill|decodes, adjust query start + # indices to refer to the start of decodes. E.g. + # in tokens:[3 prefills|6 decodes], query_start_loc=[3,9] => [0,6]. + query_start_loc=(self.query_start_loc[self.num_prefills:] - + self.query_start_loc[self.num_prefills]) if self.query_start_loc is not None else None, seq_start_loc=self.seq_start_loc[self.num_prefills:] if self.seq_start_loc is not None else None, @@ -903,7 +907,9 @@ def unified_flash_attention( # Decoding run. 
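The query_start_loc re-basing added to the decode metadata above is easiest to see with concrete numbers. A standalone sketch (not part of the patch) that reuses the [3 prefills | 6 decodes] example from the comment, assuming one prefill sequence contributing 3 query tokens and one speculative decode sequence contributing 6:

    import torch

    # Cumulative query offsets for a batch laid out as [3 prefill tokens | 6 decode tokens].
    query_start_loc = torch.tensor([0, 3, 9])
    num_prefills = 1
    decode_loc = query_start_loc[num_prefills:] - query_start_loc[num_prefills]
    assert decode_loc.tolist() == [0, 6]  # decode offsets now start at zero
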
# Use flash_attn_varlen_func kernel for speculative decoding # because different queries might have different lengths. + assert decode_meta.max_decode_query_len is not None + # use only for actual varlen decoding if decode_meta.max_decode_query_len > 1: assert attn_type == AttentionType.DECODER, ( "Only decoder-only models support max_decode_query_len > 1") @@ -949,8 +955,6 @@ def unified_flash_attention( assert prefill_output is not None return prefill_output.view(num_prefill_query_tokens, hidden_size) - # Chunked prefill does not work with speculative decoding. - # Therefore, the query length for decode should be 1 in chunked prefill. assert decode_meta is not None decode_output = decode_output.squeeze(1) output = torch.cat([prefill_output, decode_output], dim=0) diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index b129d0d992f2f..2bae370eaa90f 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -192,6 +192,12 @@ def decode_metadata(self) -> Optional["ROCmFlashAttentionMetadata"]: block_tables=self.block_tables[self.num_prefills:], use_cuda_graph=self.use_cuda_graph, ) + # Batch may be composed of prefill|decodes, adjust query start indices + # to refer to the start of decodes when the two are split apart. + # E.g. in tokens:[3 prefills|6 decodes], query_start_loc=[3,9] => [0,6]. + if self._cached_decode_metadata.query_start_loc is not None: + qs = self._cached_decode_metadata.query_start_loc + self._cached_decode_metadata.query_start_loc = qs - qs[0] return self._cached_decode_metadata def advance_step(self, diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py index 4725413baade7..83d03606524dc 100644 --- a/vllm/attention/backends/xformers.py +++ b/vllm/attention/backends/xformers.py @@ -272,6 +272,13 @@ def decode_metadata(self) -> Optional["XFormersMetadata"]: max_encoder_seq_len=self.max_encoder_seq_len, cross_slot_mapping=self.cross_slot_mapping, cross_block_tables=self.cross_block_tables) + + # Batch may be composed of prefill|decodes, adjust query start indices + # to refer to the start of decodes when the two are split apart. + # E.g. in tokens:[3 prefills|6 decodes], query_start_loc=[3,9] => [0,6]. + if self._cached_decode_metadata.query_start_loc is not None: + qs = self._cached_decode_metadata.query_start_loc + self._cached_decode_metadata.query_start_loc = qs - qs[0] return self._cached_decode_metadata diff --git a/vllm/config.py b/vllm/config.py index e844a46bf06e6..9721925987cab 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -192,7 +192,6 @@ def __init__( self.max_logprobs = max_logprobs self.disable_sliding_window = disable_sliding_window self.skip_tokenizer_init = skip_tokenizer_init - self.hf_config = get_config(self.model, trust_remote_code, revision, code_revision, rope_scaling, rope_theta, config_format) @@ -1317,13 +1316,6 @@ def maybe_create_spec_config( "speculative decoding is > 1, but got " f"{speculative_disable_by_batch_size=}") - # Reminder: Please update docs/source/serving/compatibility_matrix.rst - # If the feature combo become valid - if enable_chunked_prefill: - raise ValueError( - "Speculative decoding and chunked prefill are " - f"currently mutually exclusive ({enable_chunked_prefill=}).") - # TODO: The user should be able to specify revision/max model len # for the draft model. It is not currently supported. 
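With the mutual-exclusion check above removed, chunked prefill and speculative decoding can now be enabled together (hidden-state based draft models excepted, per the check added a few hunks below). A minimal sketch of a combined configuration; the speculative settings mirror the ngram test parameters earlier in this series, while the target model name is reused from those tests purely for illustration:

    from vllm import LLM

    llm = LLM(
        model="JackFram/llama-68m",       # illustrative target model choice
        speculative_model="[ngram]",
        num_speculative_tokens=5,
        ngram_prompt_lookup_max=3,
        enable_chunked_prefill=True,
        max_num_batched_tokens=4,
        max_num_seqs=4,
    )
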
draft_revision = None @@ -1390,6 +1382,12 @@ def maybe_create_spec_config( f"num_speculative_tokens={n_predict}, but " f"{num_speculative_tokens=} was provided.") + if enable_chunked_prefill and draft_hf_config.model_type in ( + "medusa", "mlp_speculator", "eagle"): + raise ValueError( + "Chunked prefill and hidden-state based draft models are " + "not compatible.") + draft_model_config.max_model_len = ( SpeculativeConfig._maybe_override_draft_max_model_len( speculative_max_model_len, diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py index e56d5cddce424..af4671ec29be9 100644 --- a/vllm/core/scheduler.py +++ b/vllm/core/scheduler.py @@ -1147,6 +1147,7 @@ def _schedule_chunked_prefill(self) -> SchedulerOutputs: # Update swapped requests. self.swapped.extend(running_scheduled.swapped_out) + # Put prefills first due to Attention backend ordering assumption. return SchedulerOutputs( scheduled_seq_groups=(prefills.seq_groups + running_scheduled.prefill_seq_groups + diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py index 223790806ab18..7a6ebb430541f 100644 --- a/vllm/engine/output_processor/multi_step.py +++ b/vllm/engine/output_processor/multi_step.py @@ -134,10 +134,12 @@ def process_outputs(self, sample for sample in samples if sample.output_token != VLLM_INVALID_TOKEN_ID ] - assert valid_samples - self._process_seq_outputs(seq, valid_samples, - sequence_group.sampling_params) + # When both spec-decode and pre-fill chunking are enabled, we + # don't have guaranteed samples here (e.g. all -1s). + if valid_samples: + self._process_seq_outputs(seq, valid_samples, + sequence_group.sampling_params) def _process_decode_and_stop(self, seq: Sequence, sampling_params: SamplingParams) -> None: diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 59e71cc8deb48..6a7929d9d8f9c 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -90,7 +90,7 @@ def score_proposals( else: # Batch has a mix of spec decode enabled and disabled seq groups contracted = self._contract_batch( - contracted_bs=len(execute_model_req.seq_group_metadata_list), + execute_model_req.seq_group_metadata_list, target_sampler_output=target_sampler_output, proposals=proposals, num_scoring_tokens=num_scoring_tokens, @@ -126,7 +126,7 @@ def _expand_batch( split_batch_by_proposal_len( seq_group_metadata_list, proposal_lens_list) - target_seq_group_metadata_list = self._create_scoring_model_input( + spec_expanded_seqs = self._create_scoring_model_input( seq_group_metadata_list=spec_seqs, proposal_token_ids=proposal_token_ids_list, # NOTE: We determine the seq ids in the expanded batch using the @@ -135,16 +135,19 @@ def _expand_batch( seq_ids=get_all_seq_ids(seq_group_metadata_list)), ) - num_scoring_tokens = len(target_seq_group_metadata_list) - target_seq_group_metadata_list.extend(non_spec_seqs) + num_scoring_tokens = len(spec_expanded_seqs) + # Batch speculative and non-speculative (e.g. chunked prefill) requests + # but make sure order is prefill|decode due to backend requirement. 
+ target_seq_group_metadata_list = non_spec_seqs + spec_expanded_seqs return (spec_indices, non_spec_indices, target_seq_group_metadata_list, num_scoring_tokens) def _contract_batch( - self, contracted_bs: int, target_sampler_output: SamplerOutput, - proposals: SpeculativeProposals, num_scoring_tokens: int, - non_spec_indices: List[int], spec_indices: List[int], k: int + self, contracted_seq_group_metadata_list: List[SequenceGroupMetadata], + target_sampler_output: SamplerOutput, proposals: SpeculativeProposals, + num_scoring_tokens: int, non_spec_indices: List[int], + spec_indices: List[int], k: int ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: """Contract the expanded batch back into its original size. @@ -154,6 +157,7 @@ def _contract_batch( contracted_bs is the original batch size, and the batch size that the target_sampler_output will be contracted to. """ + contracted_bs = len(contracted_seq_group_metadata_list) (target_token_ids, target_probs, target_logprobs, target_hidden_states, non_spec_target_token_ids, non_spec_target_probs, non_spec_target_logprobs, @@ -166,8 +170,8 @@ def _contract_batch( # The number of tokens in the expanded batch used for speculation is # equal to the total expanded batch size minus the number of samples for - # non-speculative sequences. - non_spec_expanded_bs = len(non_spec_target_token_ids) + # non-speculative sequences, prefill chunks with no out tokens included + non_spec_expanded_bs = len(non_spec_indices) spec_expanded_bs = expanded_batch_size - non_spec_expanded_bs target_token_ids = target_token_ids.reshape(spec_expanded_bs, k + 1) @@ -191,7 +195,12 @@ def _contract_batch( else: all_hidden_states = None - if non_spec_indices: + # Rule out prefills that produce no tokens. + non_spec_indices = [ + idx for idx in non_spec_indices + if contracted_seq_group_metadata_list[idx].do_sample + ] + if len(non_spec_indices): all_tokens[non_spec_indices, :1] = \ non_spec_target_token_ids.unsqueeze(1) all_probs[non_spec_indices, :1, :] = \ @@ -290,9 +299,6 @@ def _create_target_seq_group_metadata( This function creates K+1 target SequenceGroupMetadata to take advantage of the bonus token. """ - assert not input_seq_group_metadata.is_prompt, ( - "Speculating on " - "prompts not yet supported") assert len(input_seq_group_metadata.seq_data) == 1, ( "Beam search " "not supported in speculative decoding") @@ -390,27 +396,22 @@ def _split_scoring_output( # and non spec sequences) and should be removed in the future. It can be # done by supporting per-sequence proposal lens. # - # First samples are from speculative scoring, latter samples are non- - # speculative samples. - split_sizes = (num_scoring_tokens, - sampler_output.sampled_token_ids.numel() - - num_scoring_tokens) - (spec_probs, non_spec_probs - ) = sampler_output.sampled_token_probs.split(split_sizes) - (spec_sampled_tokens, non_spec_sampled_tokens + # First samples are non-speculative, latter samples are from speculative + # scoring (prefill|decode order). 
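Because the expanded batch now places non-speculative samples first, the flat sampler output is split in that same order. A standalone sketch (values made up) of the reordered split implemented just below:

    import torch

    # 2 non-speculative samples (e.g. terminal prefill chunks) followed by
    # 6 speculative scoring tokens, flattened into one tensor.
    sampled_token_ids = torch.arange(8)
    num_scoring_tokens = 6
    split_sizes = (sampled_token_ids.numel() - num_scoring_tokens, num_scoring_tokens)
    non_spec, spec = sampled_token_ids.split(split_sizes)
    assert non_spec.tolist() == [0, 1]
    assert spec.tolist() == [2, 3, 4, 5, 6, 7]
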
+ split_sizes = (sampler_output.sampled_token_ids.numel() - + num_scoring_tokens, num_scoring_tokens) + (non_spec_probs, + spec_probs) = sampler_output.sampled_token_probs.split(split_sizes) + (non_spec_sampled_tokens, spec_sampled_tokens ) = sampler_output.sampled_token_ids.flatten().split(split_sizes) - ( - spec_logprobs, - non_spec_logprobs, - ) = sampler_output.logprobs.split(split_sizes) + (non_spec_logprobs, + spec_logprobs) = sampler_output.logprobs.split(split_sizes) if sampler_output.hidden_states is not None: - ( - spec_hidden_states, - non_spec_hidden_states, - ) = sampler_output.hidden_states.split(split_sizes) + (non_spec_hidden_states, spec_hidden_states + ) = sampler_output.hidden_states.split(split_sizes) else: - spec_hidden_states, non_spec_hidden_states = None, None + non_spec_hidden_states, spec_hidden_states = None, None return (spec_sampled_tokens, spec_probs, spec_logprobs, spec_hidden_states, non_spec_sampled_tokens, non_spec_probs, diff --git a/vllm/spec_decode/mqa_scorer.py b/vllm/spec_decode/mqa_scorer.py index f35a8a0ab8be3..cbf793e2043e3 100644 --- a/vllm/spec_decode/mqa_scorer.py +++ b/vllm/spec_decode/mqa_scorer.py @@ -21,6 +21,11 @@ def score_proposals( all_proposal_lengths = proposals.proposal_lens.tolist() for i, seq_group_metadata in enumerate( execute_model_req.seq_group_metadata_list): + if all_proposal_lengths[i] == 0: + # Keep prompt seqs untouched (keep computed_tokens for chunks). + target_seq_group_metadata_list.append(seq_group_metadata) + continue + seq_data_dict = seq_group_metadata.seq_data assert len(seq_data_dict) == 1 seq_id = next(iter(seq_data_dict.keys())) @@ -40,8 +45,7 @@ def score_proposals( new_seq_data.update_num_computed_tokens( len(prompt_token_ids) + len(output_token_ids) - 1) - # Ensure that the new sequence has at least one token - # because we only use mqa scorer in the decoding stage. + # Ensure that the new decode sequence has at least one token. assert len(output_token_ids) >= 1 new_seq_data_dict = {target_seq_id: new_seq_data} @@ -54,7 +58,6 @@ def score_proposals( target_seq_id: seq_group_metadata.block_tables[seq_id], }, lora_request=None, - token_chunk_size=1, ) target_seq_group_metadata_list.append(new_seq_group_metadata) @@ -77,6 +80,7 @@ def score_proposals( all_probs = target_probs.reshape(bs, k + 1, self._vocab_size) all_logprobs = target_logprobs.reshape(bs, k + 1, self._vocab_size) else: + # We either have decodes with different lens or prefill+decodes. all_tokens = target_token_ids.new_full(size=(bs, k + 1), fill_value=-1) all_probs = target_probs.new_zeros(*all_tokens.shape, @@ -85,15 +89,18 @@ def score_proposals( fill_value=-float("inf")) target_token_ids = target_token_ids.flatten() start_loc = 0 - for i, proposed_len in enumerate(all_proposal_lengths): - output_len = proposed_len + 1 - end_loc = start_loc + output_len - all_tokens[ - i, :output_len] = target_token_ids[start_loc:end_loc] - all_probs[i, :output_len] = target_probs[start_loc:end_loc] - all_logprobs[ - i, :output_len] = target_logprobs[start_loc:end_loc] - start_loc = end_loc + for i, (proposed_len, seq_meta) in enumerate( + zip(all_proposal_lengths, target_seq_group_metadata_list)): + # Skip chunks with no output tokens. 
+ if seq_meta.do_sample: + output_len = proposed_len + 1 + end_loc = start_loc + output_len + all_tokens[ + i, :output_len] = target_token_ids[start_loc:end_loc] + all_probs[i, :output_len] = target_probs[start_loc:end_loc] + all_logprobs[ + i, :output_len] = target_logprobs[start_loc:end_loc] + start_loc = end_loc hidden_states = None if target_sampler_output.hidden_states is not None: diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py index eb3c2e88e668c..b57742c2ebfdd 100644 --- a/vllm/spec_decode/spec_decode_worker.py +++ b/vllm/spec_decode/spec_decode_worker.py @@ -418,7 +418,10 @@ def execute_model( # none of the requests in the batch have spec decoding enabled. # In any of these cases, the proposer and scorer workers # are called normally. - no_spec = num_lookahead_slots == 0 or disable_all_speculation or all( + # We expect `num_speculative_tokens` to be None for prefills. + no_spec = all( + sgm.is_prompt for sgm in execute_model_req.seq_group_metadata_list + ) or num_lookahead_slots == 0 or disable_all_speculation or all( sgm.num_speculative_tokens == 0 for sgm in execute_model_req.seq_group_metadata_list) @@ -484,7 +487,7 @@ def _maybe_disable_speculative_tokens( def _serialize_sampler_output_no_logprobs( self, execute_model_req: ExecuteModelRequest, - sampler_output: SamplerOutput) -> SamplerOutput: + sampler_output: SamplerOutput) -> List[SamplerOutput]: """ Creates and returns a `SamplerOutput` with only the token IDs being serialized to CPU and populated in `CompletionSequenceGroupOutput`. @@ -514,41 +517,56 @@ def _serialize_sampler_output_no_logprobs( if any(seq_output_prompt_logprobs) else \ sampler_output.sampled_token_ids).tolist() - seq_data_entries = ( + seq_data_entries = [ (seq_id, seq_data) for sg in \ execute_model_req.seq_group_metadata_list \ for seq_id, seq_data in sg.seq_data.items() - ) + if sg.do_sample # ignore empty token sequences + ] completion_seq_group_output_list: List[ CompletionSequenceGroupOutput] = [] - for index, ((seq_id, seq_data), needs_prompt_logprobs) in \ - enumerate(zip(seq_data_entries, seq_output_prompt_logprobs)): - if needs_prompt_logprobs: - prompt_token_ids = seq_data.get_prompt_token_ids() - prompt_logprobs = [ - create_logprobs_output( - token_id=p_token_id, + output_index = 0 + # Make sure the non-terminal prefill chunks are still aligned with + # their own empty output. + for seq_group_meta in execute_model_req.seq_group_metadata_list: + # Since we can get chunks here, we dont always have a sampled token + # (only on last chunk) but we still have to provide an output. + if not seq_group_meta.do_sample: + completion_seq_group_output_list.append( + CompletionSequenceGroupOutput(samples=[], + prompt_logprobs=None)) + else: + # Sequence with output. 
+ seq_id, seq_data = seq_data_entries[output_index] + needs_prompt_logprobs = seq_output_prompt_logprobs[ + output_index] + if needs_prompt_logprobs: + prompt_token_ids = seq_data.get_prompt_token_ids() + prompt_logprobs = [ + create_logprobs_output( + token_id=p_token_id, + token_id_logprob_rank=-1, + token_id_logprob=0.0, + topk_token_ids=[], + topk_logprobs=[], + ) + # no prompt logprobs for the first token + for p_token_id in prompt_token_ids[1:] + ] + else: + prompt_logprobs = None + completion_seq_group_output_list.append( + create_sequence_group_output( + token_id=sampled_token_ids_list[output_index][0], token_id_logprob_rank=-1, token_id_logprob=0.0, + seq_id=seq_id, topk_token_ids=[], topk_logprobs=[], - ) - # no prompt logprobs for the first token - for p_token_id in prompt_token_ids[1:] - ] - else: - prompt_logprobs = None - - completion_seq_group_output_list.append( - create_sequence_group_output( - token_id=sampled_token_ids_list[index][0], - token_id_logprob_rank=-1, - token_id_logprob=0.0, - seq_id=seq_id, - topk_token_ids=[], - topk_logprobs=[], - prompt_logprobs=prompt_logprobs)) - return SamplerOutput(outputs=completion_seq_group_output_list) + prompt_logprobs=prompt_logprobs)) + output_index += 1 + + return [SamplerOutput(outputs=completion_seq_group_output_list)] @nvtx_range("spec_decode_worker._run_no_spec") def _run_no_spec(self, execute_model_req: ExecuteModelRequest, @@ -568,6 +586,9 @@ def _run_no_spec(self, execute_model_req: ExecuteModelRequest, hidden_states = sampler_output.hidden_states if hidden_states is not None: # remove hidden_states for prompt tokens + # TODO Enable `return_hidden_states`: prefill chunks hidden states + # are pruned by the logits processor. Also, they should be arranged + # back into full-prefill latent. Address it to enable MLPSpeculator. if any(seq.is_prompt for seq in execute_model_req.seq_group_metadata_list): hidden_states = hidden_states[ @@ -593,14 +614,14 @@ def _run_no_spec(self, execute_model_req: ExecuteModelRequest, sampler_output_to_return = (self._serialize_sampler_output_no_logprobs( execute_model_req=execute_model_req, sampler_output=sampler_output) if self._disable_logprobs else - sampler_output) + [sampler_output]) # Clear device tensors from sampler output. This reduces communication # overhead when the engine runs in a different process than the workers. sampler_output.sampled_token_probs = None sampler_output.sampled_token_ids = None sampler_output.logprobs = None - return [sampler_output_to_return] + return sampler_output_to_return def _run_non_driver_rank(self) -> bool: """Run proposer and verifier model in non-driver workers. This is used @@ -644,9 +665,15 @@ def _run_speculative_decoding_step( This invokes the proposer worker to get k speculative tokens for each sequence, then scores each speculative token using the scoring worker. + When `enable_chunked_prefill` is set, scorer will batch decodes and + prefills, while proposer will sync its KV-cache by running an extra + forward on prefills. + Returns a list of SamplerOutput, each containing a single token per sequence. """ + # With prefill chunking, expect requests to have prompts first + # so that backend gets prefill|decode. 
assert num_lookahead_slots == execute_model_req.num_lookahead_slots # Pass last hidden states from target model to proposer @@ -671,6 +698,25 @@ def _run_speculative_decoding_step( proposals, ) + _, (non_spec_seqs, non_spec_indices) = split_batch_by_proposal_len( + execute_model_req.seq_group_metadata_list, proposals.proposal_lens) + # With prefill chunking enabled, `non_spec_seqs` contains prefills too: + # discard decodes that have already been processed by proposer. + non_spec_indices = [ + idx for idx in non_spec_indices + if execute_model_req.seq_group_metadata_list[idx].is_prompt + ] + if len(non_spec_indices): + all_hidden_states = proposal_scores.hidden_states + # TODO fix `return_hidden_states`, same as in `_run_no_spec` + if all_hidden_states is not None: + prefill_hidden_states = all_hidden_states[non_spec_indices] + execute_model_req.previous_hidden_states = \ + prepare_prefill_hidden_states(prefill_hidden_states) + # Sync proposer KV cache for prefills. + prefill_req = execute_model_req.clone(non_spec_seqs) + self.proposer_worker.execute_model(prefill_req) + with Timer() as verification_timer: accepted_token_ids, target_logprobs = self._verify_tokens( execute_model_req.seq_group_metadata_list, proposal_scores, @@ -769,7 +815,6 @@ def _verify_tokens( self.previous_hidden_states = HiddenStates( hidden_states, seq_group_metadata_list, second_last_token_hidden_states) - return accepted_token_ids, logprobs def _create_output_sampler_list( @@ -819,6 +864,8 @@ def _create_output_sampler_list( accepted_token_ids_by_step = accepted_token_ids_by_step.tolist() # Construct the output on a per-step, per-sequence basis. + # Non-terminal prefill chunks will end up here as rows with just -1s + # i.e mixed-batch [[-1, 1576], [-1, 29884], [-1, -1], [-1, -1]] sampler_output_list: List[SamplerOutput] = [] for step_index in range(num_steps): if all(token_id == -1 @@ -861,7 +908,6 @@ def _create_output_sampler_list( # This is periodic because the rejection sampler emits metrics # periodically. self._maybe_log_stage_times(*stage_times) - return sampler_output_list def _maybe_log_stage_times(self, average_time_per_proposal_tok_ms: float, diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py index f6a52a516075d..5a7999a258b2d 100644 --- a/vllm/spec_decode/top1_proposer.py +++ b/vllm/spec_decode/top1_proposer.py @@ -109,7 +109,6 @@ def get_spec_proposals( proposal_probs=proposal_probs, proposal_lens=proposal_lens, no_proposals=maybe_sampler_output is None) - return proposals def _split_by_proposal_len( @@ -127,9 +126,10 @@ def _split_by_proposal_len( nonzero_proposal_len_seqs: List[SequenceGroupMetadata] = [] nonzero_proposal_len_indices: List[int] = [] for i, seq_group_metadata in enumerate(seq_group_metadata_list): - # The speculative decoding for this request has been disabled - # (e.g. due to high traffic). - if seq_group_metadata.num_speculative_tokens == 0: + # The speculative decoding for this request has either been disabled + # (e.g. due to high traffic) or this is a prompt request. 
+ if (seq_group_metadata.is_prompt + or seq_group_metadata.num_speculative_tokens == 0): proposal_lens.append(0) continue From de0e61a3239abff67c789138187a98465b806f76 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Thu, 7 Nov 2024 11:43:16 -0500 Subject: [PATCH 028/183] [CI/Build] Always run mypy (#10122) Signed-off-by: Russell Bryant --- .github/workflows/mypy.yaml | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index fbee6bb03fc8e..354849b249b59 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -14,11 +14,16 @@ on: pull_request: branches: - main - paths: - - '**/*.py' - - '.github/workflows/mypy.yaml' - - 'tools/mypy.sh' - - 'pyproject.toml' + # This workflow is only relevant when one of the following files changes. + # However, we have github configured to expect and require this workflow + # to run and pass before github with auto-merge a pull request. Until github + # allows more flexible auto-merge policy, we can just run this on every PR. + # It doesn't take that long to run, anyway. + #paths: + # - '**/*.py' + # - '.github/workflows/mypy.yaml' + # - 'tools/mypy.sh' + # - 'pyproject.toml' jobs: mypy: From 3be5b26a7651b57aeb2cbdfc6aee81152ba68da5 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Thu, 7 Nov 2024 13:17:29 -0500 Subject: [PATCH 029/183] [CI/Build] Add shell script linting using shellcheck (#7925) Signed-off-by: Russell Bryant --- .../run-lm-eval-gsm-hf-baseline.sh | 6 +- .../run-lm-eval-gsm-vllm-baseline.sh | 6 +- .buildkite/lm-eval-harness/run-tests.sh | 2 +- .../scripts/launch-server.sh | 63 ++++++++----------- .../scripts/nightly-annotate.sh | 12 ++-- .../scripts/run-nightly-benchmarks.sh | 30 +++++---- .../scripts/run-performance-benchmarks.sh | 19 +++--- .../scripts/wait-for-image.sh | 4 +- .buildkite/run-amd-test.sh | 34 +++++----- .buildkite/run-benchmarks.sh | 2 + .buildkite/run-cpu-test-ppc64le.sh | 4 +- .buildkite/run-cpu-test.sh | 2 + .buildkite/run-multi-node-test.sh | 27 ++++---- .buildkite/run-neuron-test.sh | 8 ++- .buildkite/run-openvino-test.sh | 2 + .buildkite/run-tpu-test.sh | 4 +- .buildkite/run-xpu-test.sh | 2 + .github/workflows/scripts/cuda-install.sh | 8 +-- .github/workflows/scripts/pytorch-install.sh | 2 +- .github/workflows/shellcheck.yml | 37 +++++++++++ .gitignore | 1 + .shellcheckrc | 9 +++ benchmarks/launch_tgi_server.sh | 8 +-- examples/run_cluster.sh | 4 +- format.sh | 10 ++- .../run_model_weight_loading_test.sh | 2 +- tools/mypy.sh | 4 +- tools/shellcheck.sh | 21 +++++++ 28 files changed, 204 insertions(+), 129 deletions(-) create mode 100644 .github/workflows/shellcheck.yml create mode 100644 .shellcheckrc create mode 100755 tools/shellcheck.sh diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh index b2e910e1ba8a7..a67fc89d54e60 100644 --- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh @@ -41,6 +41,6 @@ while getopts "m:b:l:f:" OPT; do done lm_eval --model hf \ - --model_args pretrained=$MODEL,parallelize=True \ - --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \ - --batch_size $BATCH_SIZE + --model_args "pretrained=$MODEL,parallelize=True" \ + --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \ + --batch_size "$BATCH_SIZE" diff --git a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh 
b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh index 4d32b49a4fac3..65be3c5d93b20 100644 --- a/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh +++ b/.buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh @@ -46,6 +46,6 @@ while getopts "m:b:l:f:t:" OPT; do done lm_eval --model vllm \ - --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend="ray",trust_remote_code=true,max_model_len=4096 \ - --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \ - --batch_size $BATCH_SIZE + --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,distributed_executor_backend=ray,trust_remote_code=true,max_model_len=4096" \ + --tasks gsm8k --num_fewshot "$FEWSHOT" --limit "$LIMIT" \ + --batch_size "$BATCH_SIZE" diff --git a/.buildkite/lm-eval-harness/run-tests.sh b/.buildkite/lm-eval-harness/run-tests.sh index b4fdde6dab425..26f33b744289a 100644 --- a/.buildkite/lm-eval-harness/run-tests.sh +++ b/.buildkite/lm-eval-harness/run-tests.sh @@ -30,7 +30,7 @@ while getopts "c:t:" OPT; do done # Parse list of configs. -IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG +IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG" for MODEL_CONFIG in "${MODEL_CONFIGS[@]}" do diff --git a/.buildkite/nightly-benchmarks/scripts/launch-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-server.sh index e9d7d6a8d760a..fb5063db86942 100644 --- a/.buildkite/nightly-benchmarks/scripts/launch-server.sh +++ b/.buildkite/nightly-benchmarks/scripts/launch-server.sh @@ -50,31 +50,30 @@ launch_trt_server() { git clone https://github.com/triton-inference-server/tensorrtllm_backend.git git lfs install cd tensorrtllm_backend - git checkout $trt_llm_version - tensorrtllm_backend_dir=$(pwd) + git checkout "$trt_llm_version" git submodule update --init --recursive # build trtllm engine cd /tensorrtllm_backend - cd ./tensorrt_llm/examples/${model_type} + cd "./tensorrt_llm/examples/${model_type}" python3 convert_checkpoint.py \ - --model_dir ${model_path} \ - --dtype ${model_dtype} \ - --tp_size ${model_tp_size} \ - --output_dir ${trt_model_path} + --model_dir "${model_path}" \ + --dtype "${model_dtype}" \ + --tp_size "${model_tp_size}" \ + --output_dir "${trt_model_path}" trtllm-build \ - --checkpoint_dir ${trt_model_path} \ + --checkpoint_dir "${trt_model_path}" \ --use_fused_mlp \ --reduce_fusion disable \ --workers 8 \ - --gpt_attention_plugin ${model_dtype} \ - --gemm_plugin ${model_dtype} \ - --tp_size ${model_tp_size} \ - --max_batch_size ${max_batch_size} \ - --max_input_len ${max_input_len} \ - --max_seq_len ${max_seq_len} \ - --max_num_tokens ${max_num_tokens} \ - --output_dir ${trt_engine_path} + --gpt_attention_plugin "${model_dtype}" \ + --gemm_plugin "${model_dtype}" \ + --tp_size "${model_tp_size}" \ + --max_batch_size "${max_batch_size}" \ + --max_input_len "${max_input_len}" \ + --max_seq_len "${max_seq_len}" \ + --max_num_tokens "${max_num_tokens}" \ + --output_dir "${trt_engine_path}" # handle triton protobuf files and launch triton server cd /tensorrtllm_backend @@ -82,15 +81,15 @@ launch_trt_server() { cp -r all_models/inflight_batcher_llm/* triton_model_repo/ cd triton_model_repo rm -rf ./tensorrt_llm/1/* - cp -r ${trt_engine_path}/* ./tensorrt_llm/1 + cp -r "${trt_engine_path}"/* ./tensorrt_llm/1 python3 ../tools/fill_template.py -i tensorrt_llm/config.pbtxt 
triton_backend:tensorrtllm,engine_dir:/tensorrtllm_backend/triton_model_repo/tensorrt_llm/1,decoupled_mode:true,batching_strategy:inflight_fused_batching,batch_scheduler_policy:guaranteed_no_evict,exclude_input_in_output:true,triton_max_batch_size:2048,max_queue_delay_microseconds:0,max_beam_width:1,max_queue_size:2048,enable_kv_cache_reuse:false - python3 ../tools/fill_template.py -i preprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5 - python3 ../tools/fill_template.py -i postprocessing/config.pbtxt triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false - python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:$max_batch_size - python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:"False",bls_instance_count:1 + python3 ../tools/fill_template.py -i preprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,preprocessing_instance_count:5" + python3 ../tools/fill_template.py -i postprocessing/config.pbtxt "triton_max_batch_size:2048,tokenizer_dir:$model_path,postprocessing_instance_count:5,skip_special_tokens:false" + python3 ../tools/fill_template.py -i ensemble/config.pbtxt triton_max_batch_size:"$max_batch_size" + python3 ../tools/fill_template.py -i tensorrt_llm_bls/config.pbtxt "triton_max_batch_size:$max_batch_size,decoupled_mode:true,accumulate_tokens:False,bls_instance_count:1" cd /tensorrtllm_backend python3 scripts/launch_triton_server.py \ - --world_size=${model_tp_size} \ + --world_size="${model_tp_size}" \ --model_repo=/tensorrtllm_backend/triton_model_repo & } @@ -98,10 +97,7 @@ launch_trt_server() { launch_tgi_server() { model=$(echo "$common_params" | jq -r '.model') tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" | jq -r '.num_prompts') server_args=$(json2args "$server_params") if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then @@ -129,10 +125,7 @@ launch_tgi_server() { launch_lmdeploy_server() { model=$(echo "$common_params" | jq -r '.model') tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" | jq -r '.num_prompts') server_args=$(json2args "$server_params") server_command="lmdeploy serve api_server $model \ @@ -149,10 +142,7 @@ launch_sglang_server() { model=$(echo "$common_params" | jq -r '.model') tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" | jq -r '.num_prompts') server_args=$(json2args "$server_params") if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then @@ -185,10 +175,7 @@ launch_vllm_server() { model=$(echo "$common_params" | jq -r '.model') tp=$(echo "$common_params" | jq -r '.tp') - dataset_name=$(echo "$common_params" | jq -r '.dataset_name') - dataset_path=$(echo "$common_params" | jq -r '.dataset_path') port=$(echo "$common_params" | jq -r '.port') - num_prompts=$(echo "$common_params" 
| jq -r '.num_prompts') server_args=$(json2args "$server_params") if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then @@ -217,19 +204,19 @@ launch_vllm_server() { main() { - if [[ $CURRENT_LLM_SERVING_ENGINE == "trt" ]]; then + if [[ "$CURRENT_LLM_SERVING_ENGINE" == "trt" ]]; then launch_trt_server fi - if [[ $CURRENT_LLM_SERVING_ENGINE == "tgi" ]]; then + if [[ "$CURRENT_LLM_SERVING_ENGINE" == "tgi" ]]; then launch_tgi_server fi - if [[ $CURRENT_LLM_SERVING_ENGINE == "lmdeploy" ]]; then + if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then launch_lmdeploy_server fi - if [[ $CURRENT_LLM_SERVING_ENGINE == "sglang" ]]; then + if [[ "$CURRENT_LLM_SERVING_ENGINE" == "sglang" ]]; then launch_sglang_server fi diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh index c6a1bbdeb7d48..686f70dbece6c 100644 --- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh +++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh @@ -16,10 +16,10 @@ main() { fi # initial annotation - description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md" + #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md" # download results - cd $VLLM_SOURCE_CODE_LOC/benchmarks + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" mkdir -p results/ /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/ ls @@ -30,15 +30,15 @@ main() { /workspace/buildkite-agent artifact upload "results.zip" # upload benchmarking scripts - cd $VLLM_SOURCE_CODE_LOC/ + cd "$VLLM_SOURCE_CODE_LOC/" zip -r nightly-benchmarks.zip .buildkite/ benchmarks/ /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip" - cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/ + cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" # upload benchmarking pipeline /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml" - cd $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/ + cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md @@ -75,4 +75,4 @@ main() { # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md } -main "$@" \ No newline at end of file +main "$@" diff --git a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh index dd8c15e0700eb..3f38cf5137535 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh @@ -12,7 +12,7 @@ check_gpus() { echo "Need at least 1 GPU to run benchmarking." exit 1 fi - declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') + declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')" echo "GPU type is $gpu_type" } @@ -102,7 +102,7 @@ kill_gpu_processes() { pkill -f text-generation pkill -f lmdeploy - while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do + while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do sleep 1 done } @@ -119,8 +119,8 @@ wait_for_server() { ensure_installed() { # Ensure that the given command is installed by apt-get local cmd=$1 - if ! 
which $cmd >/dev/null; then - apt-get update && apt-get install -y $cmd + if ! which "$cmd" >/dev/null; then + apt-get update && apt-get install -y "$cmd" fi } @@ -173,13 +173,11 @@ run_serving_tests() { echo "Reuse previous server for test case $test_name" else kill_gpu_processes - bash $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh \ + bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \ "$server_params" "$common_params" fi - wait_for_server - - if [ $? -eq 0 ]; then + if wait_for_server; then echo "" echo "$CURRENT_LLM_SERVING_ENGINE server is up and running." else @@ -190,13 +188,13 @@ run_serving_tests() { # prepare tokenizer # this is required for lmdeploy. - cd $VLLM_SOURCE_CODE_LOC/benchmarks + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" rm -rf /tokenizer_cache mkdir /tokenizer_cache python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \ --model "$model" \ --cachedir /tokenizer_cache - cd $VLLM_SOURCE_CODE_LOC/benchmarks + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" # change model name for lmdeploy (it will not follow standard hf name) @@ -307,11 +305,11 @@ run_serving_tests() { prepare_dataset() { # download sharegpt dataset - cd $VLLM_SOURCE_CODE_LOC/benchmarks + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json # duplicate sonnet by 4x, to allow benchmarking with input length 2048 - cd $VLLM_SOURCE_CODE_LOC/benchmarks + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" echo "" > sonnet_4x.txt for _ in {1..4} do @@ -339,17 +337,17 @@ main() { prepare_dataset - cd $VLLM_SOURCE_CODE_LOC/benchmarks + cd "$VLLM_SOURCE_CODE_LOC/benchmarks" declare -g RESULTS_FOLDER=results/ mkdir -p $RESULTS_FOLDER - BENCHMARK_ROOT=$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/ + BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/" # run the test - run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json + run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json" # upload benchmark results to buildkite python3 -m pip install tabulate pandas - python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py + python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py" upload_to_buildkite } diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh index a0b9a409b758d..d397b05cdff23 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh @@ -17,7 +17,7 @@ check_gpus() { echo "Need at least 1 GPU to run benchmarking." 
exit 1 fi - declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') + declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}') echo "GPU type is $gpu_type" } @@ -93,7 +93,7 @@ kill_gpu_processes() { # wait until GPU memory usage smaller than 1GB - while [ $(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1) -ge 1000 ]; do + while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do sleep 1 done @@ -117,7 +117,7 @@ upload_to_buildkite() { fi # Use the determined command to annotate and upload artifacts - $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" <$RESULTS_FOLDER/benchmark_results.md + $BUILDKITE_AGENT_COMMAND annotate --style "info" --context "$BUILDKITE_LABEL-benchmark-results" < "$RESULTS_FOLDER/benchmark_results.md" $BUILDKITE_AGENT_COMMAND artifact upload "$RESULTS_FOLDER/*" } @@ -150,7 +150,7 @@ run_latency_tests() { # check if there is enough GPU to run the test tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size') if [[ $gpu_count -lt $tp ]]; then - echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname." + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." continue fi @@ -206,9 +206,9 @@ run_throughput_tests() { throughput_args=$(json2args "$throughput_params") # check if there is enough GPU to run the test - tp=$(echo $throughput_params | jq -r '.tensor_parallel_size') + tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size') if [[ $gpu_count -lt $tp ]]; then - echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname." + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." continue fi @@ -270,7 +270,7 @@ run_serving_tests() { # check if there is enough GPU to run the test tp=$(echo "$server_params" | jq -r '.tensor_parallel_size') if [[ $gpu_count -lt $tp ]]; then - echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname." + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." continue fi @@ -278,7 +278,7 @@ run_serving_tests() { server_model=$(echo "$server_params" | jq -r '.model') client_model=$(echo "$client_params" | jq -r '.model') if [[ $server_model != "$client_model" ]]; then - echo "Server model and client model must be the same. Skip testcase $testname." + echo "Server model and client model must be the same. Skip testcase $test_name." continue fi @@ -293,8 +293,7 @@ run_serving_tests() { server_pid=$! # wait until the server is alive - wait_for_server - if [ $? -eq 0 ]; then + if wait_for_server; then echo "" echo "vllm server is up and running." 
else diff --git a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh index f16862907def1..19f7160e68a4d 100644 --- a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh +++ b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh @@ -6,7 +6,7 @@ TIMEOUT_SECONDS=10 retries=0 while [ $retries -lt 1000 ]; do - if [ $(curl -s --max-time $TIMEOUT_SECONDS -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then + if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then exit 0 fi @@ -16,4 +16,4 @@ while [ $retries -lt 1000 ]; do sleep 5 done -exit 1 \ No newline at end of file +exit 1 diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh index 860272e71fd84..902e162720b89 100755 --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -1,3 +1,5 @@ +#!/bin/bash + # This script runs test inside the corresponding ROCm docker container. set -o pipefail @@ -57,17 +59,17 @@ done echo "--- Pulling container" image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}" container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" -docker pull ${image_name} +docker pull "${image_name}" remove_docker_container() { - docker rm -f ${container_name} || docker image rm -f ${image_name} || true + docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true } trap remove_docker_container EXIT echo "--- Running container" HF_CACHE="$(realpath ~)/huggingface" -mkdir -p ${HF_CACHE} +mkdir -p "${HF_CACHE}" HF_MOUNT="/root/.cache/huggingface" commands=$@ @@ -118,25 +120,25 @@ if [[ $commands == *"--shard-id="* ]]; then --network host \ --shm-size=16gb \ --rm \ - -e HIP_VISIBLE_DEVICES=${GPU} \ + -e HIP_VISIBLE_DEVICES="${GPU}" \ -e HF_TOKEN \ - -v ${HF_CACHE}:${HF_MOUNT} \ - -e HF_HOME=${HF_MOUNT} \ - --name ${container_name}_${GPU} \ - ${image_name} \ + -v "${HF_CACHE}:${HF_MOUNT}" \ + -e "HF_HOME=${HF_MOUNT}" \ + --name "${container_name}_${GPU}" \ + "${image_name}" \ /bin/bash -c "${commands_gpu}" \ |& while read -r line; do echo ">>Shard $GPU: $line"; done & PIDS+=($!) done #wait for all processes to finish and collect exit codes - for pid in ${PIDS[@]}; do - wait ${pid} + for pid in "${PIDS[@]}"; do + wait "${pid}" STATUS+=($?) done - for st in ${STATUS[@]}; do + for st in "${STATUS[@]}"; do if [[ ${st} -ne 0 ]]; then echo "One of the processes failed with $st" - exit ${st} + exit "${st}" fi done else @@ -147,9 +149,9 @@ else --rm \ -e HIP_VISIBLE_DEVICES=0 \ -e HF_TOKEN \ - -v ${HF_CACHE}:${HF_MOUNT} \ - -e HF_HOME=${HF_MOUNT} \ - --name ${container_name} \ - ${image_name} \ + -v "${HF_CACHE}:${HF_MOUNT}" \ + -e "HF_HOME=${HF_MOUNT}" \ + --name "${container_name}" \ + "${image_name}" \ /bin/bash -c "${commands}" fi diff --git a/.buildkite/run-benchmarks.sh b/.buildkite/run-benchmarks.sh index cbf6dda677c53..1641c1faa9d6a 100644 --- a/.buildkite/run-benchmarks.sh +++ b/.buildkite/run-benchmarks.sh @@ -1,3 +1,5 @@ +#!/bin/bash + # This script is run by buildkite to run the benchmarks and upload the results to buildkite set -ex diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh index fd60f5b6afeca..a63c95e51002f 100755 --- a/.buildkite/run-cpu-test-ppc64le.sh +++ b/.buildkite/run-cpu-test-ppc64le.sh @@ -1,3 +1,5 @@ +#!/bin/bash + # This script build the CPU docker image and run the offline inference inside the container. 
# It serves a sanity check for compilation and basic model usage. set -ex @@ -13,7 +15,7 @@ remove_docker_container # Run the image, setting --shm-size=4g for tensor parallel. source /etc/environment #docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test -docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN=$HF_TOKEN --name cpu-test cpu-test +docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN="$HF_TOKEN" --name cpu-test cpu-test # Run basic model test docker exec cpu-test bash -c " diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 2dbeee8562971..064d7c77ab570 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -1,3 +1,5 @@ +#!/bin/bash + # This script build the CPU docker image and run the offline inference inside the container. # It serves a sanity check for compilation and basic model usage. set -ex diff --git a/.buildkite/run-multi-node-test.sh b/.buildkite/run-multi-node-test.sh index 7ac4dcc4c786d..530bf90a855fe 100755 --- a/.buildkite/run-multi-node-test.sh +++ b/.buildkite/run-multi-node-test.sh @@ -14,7 +14,7 @@ DOCKER_IMAGE=$4 shift 4 COMMANDS=("$@") -if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then +if [ ${#COMMANDS[@]} -ne "$NUM_NODES" ]; then echo "The number of commands must be equal to the number of nodes." echo "Number of nodes: $NUM_NODES" echo "Number of commands: ${#COMMANDS[@]}" @@ -23,7 +23,7 @@ fi echo "List of commands" for command in "${COMMANDS[@]}"; do - echo $command + echo "$command" done start_network() { @@ -36,7 +36,7 @@ start_nodes() { for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu)) GPU_DEVICES+=$(($DEVICE_NUM)) - if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then + if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then GPU_DEVICES+=',' fi done @@ -49,17 +49,20 @@ start_nodes() { # 3. map the huggingface cache directory to the container # 3. 
assign ip addresses to the containers (head node: 192.168.10.10, worker nodes: # starting from 192.168.10.11) - docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null" + docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN \ + -v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \ + --network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \ + /bin/bash -c "tail -f /dev/null" # organize containers into a ray cluster - if [ $node -eq 0 ]; then + if [ "$node" -eq 0 ]; then # start the ray head node - docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block" + docker exec -d "node$node" /bin/bash -c "ray start --head --port=6379 --block" # wait for the head node to be ready sleep 10 else # start the ray worker nodes, and connect them to the head node - docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block" + docker exec -d "node$node" /bin/bash -c "ray start --address=192.168.10.10:6379 --block" fi done @@ -79,22 +82,22 @@ run_nodes() { for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu)) GPU_DEVICES+=$(($DEVICE_NUM)) - if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then + if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then GPU_DEVICES+=',' fi done GPU_DEVICES+='"' echo "Running node$node with GPU devices: $GPU_DEVICES" - if [ $node -ne 0 ]; then - docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" + if [ "$node" -ne 0 ]; then + docker exec -d "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" else - docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" + docker exec "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" fi done } cleanup() { for node in $(seq 0 $(($NUM_NODES-1))); do - docker stop node$node + docker stop "node$node" done docker network rm docker-net } diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh index 252c0f7fecd12..9259391aaed49 100644 --- a/.buildkite/run-neuron-test.sh +++ b/.buildkite/run-neuron-test.sh @@ -1,3 +1,5 @@ +#!/bin/bash + # This script build the Neuron docker image and run the API server inside the container. # It serves a sanity check for compilation and basic model usage. set -e @@ -12,10 +14,10 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then current_time=$(date +%s) if [ $((current_time - last_build)) -gt 86400 ]; then docker system prune -f - echo $current_time > /tmp/neuron-docker-build-timestamp + echo "$current_time" > /tmp/neuron-docker-build-timestamp fi else - echo $(date +%s) > /tmp/neuron-docker-build-timestamp + date "+%s" > /tmp/neuron-docker-build-timestamp fi docker build -t neuron -f Dockerfile.neuron . @@ -34,7 +36,7 @@ wait_for_server_to_start() { timeout=300 counter=0 - while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do + while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do sleep 1 counter=$((counter + 1)) if [ $counter -ge $timeout ]; then diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh index 35ad5c0ddde77..6b12f424fd828 100755 --- a/.buildkite/run-openvino-test.sh +++ b/.buildkite/run-openvino-test.sh @@ -1,3 +1,5 @@ +#!/bin/bash + # This script build the OpenVINO docker image and run the offline inference inside the container. 
# It serves a sanity check for compilation and basic model usage. set -ex diff --git a/.buildkite/run-tpu-test.sh b/.buildkite/run-tpu-test.sh index 988d5aef5fb8c..770dad6ffa3a1 100644 --- a/.buildkite/run-tpu-test.sh +++ b/.buildkite/run-tpu-test.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e # Build the docker image. @@ -12,4 +14,4 @@ remove_docker_container # For HF_TOKEN. source /etc/environment # Run a simple end-to-end example. -docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py" +docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py" diff --git a/.buildkite/run-xpu-test.sh b/.buildkite/run-xpu-test.sh index 6ffa66d5ef3d6..faeac8e2ded36 100644 --- a/.buildkite/run-xpu-test.sh +++ b/.buildkite/run-xpu-test.sh @@ -1,3 +1,5 @@ +#!/bin/bash + # This script build the CPU docker image and run the offline inference inside the container. # It serves a sanity check for compilation and basic model usage. set -ex diff --git a/.github/workflows/scripts/cuda-install.sh b/.github/workflows/scripts/cuda-install.sh index 312c6e82f33a3..3d0b7a1fe0402 100644 --- a/.github/workflows/scripts/cuda-install.sh +++ b/.github/workflows/scripts/cuda-install.sh @@ -1,16 +1,16 @@ #!/bin/bash # Replace '.' with '-' ex: 11.8 -> 11-8 -cuda_version=$(echo $1 | tr "." "-") +cuda_version=$(echo "$1" | tr "." "-") # Removes '-' and '.' 
ex: ubuntu-20.04 -> ubuntu2004 -OS=$(echo $2 | tr -d ".\-") +OS=$(echo "$2" | tr -d ".\-") # Installs CUDA -wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb +wget -nv "https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb" sudo dpkg -i cuda-keyring_1.1-1_all.deb rm cuda-keyring_1.1-1_all.deb sudo apt -qq update -sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version} +sudo apt -y install "cuda-${cuda_version}" "cuda-nvcc-${cuda_version}" "cuda-libraries-dev-${cuda_version}" sudo apt clean # Test nvcc diff --git a/.github/workflows/scripts/pytorch-install.sh b/.github/workflows/scripts/pytorch-install.sh index dfc1851d7692c..e3cda7dad2d17 100644 --- a/.github/workflows/scripts/pytorch-install.sh +++ b/.github/workflows/scripts/pytorch-install.sh @@ -6,7 +6,7 @@ cuda_version=$3 # Install torch $python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya -$python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./} +$python_executable -m pip install torch=="${pytorch_version}+cu${cuda_version//./}" --extra-index-url "https://download.pytorch.org/whl/cu${cuda_version//./}" # Print version information $python_executable --version diff --git a/.github/workflows/shellcheck.yml b/.github/workflows/shellcheck.yml new file mode 100644 index 0000000000000..ac43b20c31390 --- /dev/null +++ b/.github/workflows/shellcheck.yml @@ -0,0 +1,37 @@ +name: Lint shell scripts +on: + push: + branches: + - "main" + paths: + - '**/*.sh' + - '.github/workflows/shellcheck.yml' + pull_request: + branches: + - "main" + paths: + - '**/*.sh' + - '.github/workflows/shellcheck.yml' + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + shellcheck: + runs-on: ubuntu-latest + steps: + - name: "Checkout" + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + with: + fetch-depth: 0 + + - name: "Check shell scripts" + run: | + tools/shellcheck.sh diff --git a/.gitignore b/.gitignore index 1ea6e3419db2a..ceef6a5fba456 100644 --- a/.gitignore +++ b/.gitignore @@ -202,3 +202,4 @@ benchmarks/*.json # Linting actionlint +shellcheck*/ diff --git a/.shellcheckrc b/.shellcheckrc new file mode 100644 index 0000000000000..f3b6eedf8d907 --- /dev/null +++ b/.shellcheckrc @@ -0,0 +1,9 @@ +# rules currently disabled: +# +# SC1091 (info): Not following: was not specified as input (see shellcheck -x) +# SC2004 (style): $/${} is unnecessary on arithmetic variables. +# SC2129 (style): Consider using { cmd1; cmd2; } >> file instead of individual redirects. +# SC2155 (warning): Declare and assign separately to avoid masking return values. +# SC2164 (warning): Use 'cd ... || exit' or 'cd ... || return' in case cd fails. 
+# +disable=SC1091,SC2004,SC2129,SC2155,SC2164 diff --git a/benchmarks/launch_tgi_server.sh b/benchmarks/launch_tgi_server.sh index 8c5cd454fbbee..ba7383d88dc49 100755 --- a/benchmarks/launch_tgi_server.sh +++ b/benchmarks/launch_tgi_server.sh @@ -4,13 +4,13 @@ PORT=8000 MODEL=$1 TOKENS=$2 -docker run -e HF_TOKEN=$HF_TOKEN --gpus all --shm-size 1g -p $PORT:80 \ - -v $PWD/data:/data \ +docker run -e "HF_TOKEN=$HF_TOKEN" --gpus all --shm-size 1g -p $PORT:80 \ + -v "$PWD/data:/data" \ ghcr.io/huggingface/text-generation-inference:2.2.0 \ - --model-id $MODEL \ + --model-id "$MODEL" \ --sharded false \ --max-input-length 1024 \ --max-total-tokens 2048 \ --max-best-of 5 \ --max-concurrent-requests 5000 \ - --max-batch-total-tokens $TOKENS + --max-batch-total-tokens "$TOKENS" diff --git a/examples/run_cluster.sh b/examples/run_cluster.sh index 8e4aa59e1766d..7b4b40b4b7e23 100644 --- a/examples/run_cluster.sh +++ b/examples/run_cluster.sh @@ -14,7 +14,7 @@ PATH_TO_HF_HOME="$4" shift 4 # Additional arguments are passed directly to the Docker command -ADDITIONAL_ARGS="$@" +ADDITIONAL_ARGS=("$@") # Validate node type if [ "${NODE_TYPE}" != "--head" ] && [ "${NODE_TYPE}" != "--worker" ]; then @@ -45,5 +45,5 @@ docker run \ --shm-size 10.24g \ --gpus all \ -v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \ - ${ADDITIONAL_ARGS} \ + "${ADDITIONAL_ARGS[@]}" \ "${DOCKER_IMAGE}" -c "${RAY_START_CMD}" diff --git a/format.sh b/format.sh index be6ee0ce46dcb..d06ee62351a21 100755 --- a/format.sh +++ b/format.sh @@ -44,14 +44,14 @@ CLANGFORMAT_VERSION=$(clang-format --version | awk '{print $3}') # # params: tool name, tool version, required version tool_version_check() { - if [[ $2 != $3 ]]; then + if [[ "$2" != "$3" ]]; then echo "❓❓Wrong $1 version installed: $3 is required, not $2." exit 1 fi } -tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-lint.txt | cut -d'=' -f3)" -tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-lint.txt | cut -d'=' -f3)" +tool_version_check "yapf" "$YAPF_VERSION" "$(grep yapf requirements-lint.txt | cut -d'=' -f3)" +tool_version_check "ruff" "$RUFF_VERSION" "$(grep "ruff==" requirements-lint.txt | cut -d'=' -f3)" tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-lint.txt | cut -d'=' -f3)" tool_version_check "isort" "$ISORT_VERSION" "$(grep isort requirements-lint.txt | cut -d'=' -f3)" tool_version_check "codespell" "$CODESPELL_VERSION" "$(grep codespell requirements-lint.txt | cut -d'=' -f3)" @@ -294,6 +294,10 @@ echo 'vLLM actionlint:' tools/actionlint.sh -color echo 'vLLM actionlint: Done' +echo 'vLLM shellcheck:' +tools/shellcheck.sh +echo 'vLLM shellcheck: Done' + if ! 
git diff --quiet &>/dev/null; then echo echo "🔍🔍There are files changed by the format checker or by you that are not added and committed:" diff --git a/tests/weight_loading/run_model_weight_loading_test.sh b/tests/weight_loading/run_model_weight_loading_test.sh index e80c1d6c5849c..a4d0c44c22b51 100755 --- a/tests/weight_loading/run_model_weight_loading_test.sh +++ b/tests/weight_loading/run_model_weight_loading_test.sh @@ -14,7 +14,7 @@ while getopts "c:" OPT; do done -IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG +IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG" for MODEL_CONFIG in "${MODEL_CONFIGS[@]}" do diff --git a/tools/mypy.sh b/tools/mypy.sh index 7e8f7d402cdd5..e984e739d70cf 100755 --- a/tools/mypy.sh +++ b/tools/mypy.sh @@ -3,13 +3,13 @@ CI=${1:-0} PYTHON_VERSION=${2:-3.9} -if [ $CI -eq 1 ]; then +if [ "$CI" -eq 1 ]; then set -e fi run_mypy() { echo "Running mypy on $1" - if [ $CI -eq 1 ] && [ -z "$1" ]; then + if [ "$CI" -eq 1 ] && [ -z "$1" ]; then mypy --python-version "${PYTHON_VERSION}" "$@" return fi diff --git a/tools/shellcheck.sh b/tools/shellcheck.sh new file mode 100755 index 0000000000000..e850742a07900 --- /dev/null +++ b/tools/shellcheck.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +scversion="stable" + +if [ -d "shellcheck-${scversion}" ]; then + export PATH="$PATH:$(pwd)/shellcheck-${scversion}" +fi + +if ! [ -x "$(command -v shellcheck)" ]; then + if [ "$(uname -s)" != "Linux" ] || [ "$(uname -m)" != "x86_64" ]; then + echo "Please install shellcheck: https://github.com/koalaman/shellcheck?tab=readme-ov-file#installing" + exit 1 + fi + + # automatic local install if linux x86_64 + wget -qO- "https://github.com/koalaman/shellcheck/releases/download/${scversion?}/shellcheck-${scversion?}.linux.x86_64.tar.xz" | tar -xJv + export PATH="$PATH:$(pwd)/shellcheck-${scversion}" +fi + +# TODO - fix warnings in .buildkite/run-amd-test.sh +find . -name "*.sh" -not -path "./.deps/*" -not -path "./.buildkite/run-amd-test.sh" -exec shellcheck {} + From a2f1f3b0896be5e0fcd01727257438ba629e48af Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Thu, 7 Nov 2024 13:26:28 -0500 Subject: [PATCH 030/183] [CI/Build] Automate PR body text cleanup (#10082) Signed-off-by: Russell Bryant --- .github/scripts/cleanup_pr_body.sh | 33 +++++++++++++++++++++++++++ .github/workflows/cleanup_pr_body.yml | 23 +++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100755 .github/scripts/cleanup_pr_body.sh create mode 100644 .github/workflows/cleanup_pr_body.yml diff --git a/.github/scripts/cleanup_pr_body.sh b/.github/scripts/cleanup_pr_body.sh new file mode 100755 index 0000000000000..3b2da7b9f8966 --- /dev/null +++ b/.github/scripts/cleanup_pr_body.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +set -eu + +# ensure 1 argument is passed +if [ "$#" -ne 1 ]; then + echo "Usage: $0 " + exit 1 +fi + +PR_NUMBER=$1 +OLD=/tmp/orig_pr_body.txt +NEW=/tmp/new_pr_body.txt + +gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}" +cp "${OLD}" "${NEW}" + +# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**" +sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE\*\*/,$d' "${NEW}" + +# Remove "FIX #xxxx (*link existing issues this PR will resolve*)" +sed -i '/FIX #xxxx.*$/d' "${NEW}" + +# Remove "FILL IN THE PR DESCRIPTION HERE" +sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}" + +# Run this only if ${NEW} is different than ${OLD} +if ! 
cmp -s "${OLD}" "${NEW}"; then + echo "Updating PR body" + gh pr edit --body-file "${NEW}" "${PR_NUMBER}" +else + echo "No changes needed" +fi diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml new file mode 100644 index 0000000000000..b516c45c41dfc --- /dev/null +++ b/.github/workflows/cleanup_pr_body.yml @@ -0,0 +1,23 @@ +name: Cleanup PR Body + +on: + pull_request: + types: [opened, edited, synchronize] + +jobs: + update-description: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + + - name: Set up Python + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + with: + python-version: '3.12' + + - name: Update PR description + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}" From 97b8475bebf4598fb4847997323267be46457465 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 7 Nov 2024 18:55:35 +0000 Subject: [PATCH 031/183] Bump actions/setup-python from 5.2.0 to 5.3.0 (#9745) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/clang-format.yml | 2 +- .github/workflows/codespell.yml | 2 +- .github/workflows/mypy.yaml | 2 +- .github/workflows/publish.yml | 2 +- .github/workflows/ruff.yml | 2 +- .github/workflows/yapf.yml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml index 167c115d8956f..ea0c567e1b942 100644 --- a/.github/workflows/clang-format.yml +++ b/.github/workflows/clang-format.yml @@ -31,7 +31,7 @@ jobs: steps: - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml index dfb087ff66913..7d2fdc436790d 100644 --- a/.github/workflows/codespell.yml +++ b/.github/workflows/codespell.yml @@ -33,7 +33,7 @@ jobs: steps: - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index 354849b249b59..6f28b476343e9 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -34,7 +34,7 @@ jobs: steps: - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 578c3fbd4e816..6a9c566334d20 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -68,7 +68,7 @@ jobs: 
bash -x .github/workflows/scripts/env.sh - name: Set up Python - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 with: python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index 1a6beca0b87c0..ffc13a7c7fe59 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -36,7 +36,7 @@ jobs: steps: - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml index 4221c139ccf79..ac12b03084f20 100644 --- a/.github/workflows/yapf.yml +++ b/.github/workflows/yapf.yml @@ -25,7 +25,7 @@ jobs: steps: - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 with: python-version: ${{ matrix.python-version }} - name: Install dependencies From 28b2877d303caa6b2febc9d0b425f17828634a4c Mon Sep 17 00:00:00 2001 From: litianjian <45817262+litianjian@users.noreply.github.com> Date: Fri, 8 Nov 2024 04:25:59 +0800 Subject: [PATCH 032/183] Online video support for VLMs (#10020) Signed-off-by: DarkLight1337 Co-authored-by: litianjian Co-authored-by: DarkLight1337 --- docs/source/conf.py | 1 + requirements-test.in | 6 +- requirements-test.txt | 57 ++- setup.py | 3 +- tests/entrypoints/openai/test_video.py | 345 ++++++++++++++++++ vllm/assets/video.py | 4 +- vllm/entrypoints/chat_utils.py | 69 +++- vllm/envs.py | 12 +- vllm/model_executor/models/llava_onevision.py | 5 +- vllm/multimodal/base.py | 3 + vllm/multimodal/utils.py | 121 +++++- vllm/multimodal/video.py | 3 +- 12 files changed, 598 insertions(+), 31 deletions(-) create mode 100644 tests/entrypoints/openai/test_video.py diff --git a/docs/source/conf.py b/docs/source/conf.py index c7b638473a931..96ad9a4c26b09 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -116,6 +116,7 @@ def setup(app): "soundfile", "gguf", "lark", + "decord", ] for mock_target in autodoc_mock_imports: diff --git a/requirements-test.in b/requirements-test.in index 560c005fd6157..1b4b9ba78ed9c 100644 --- a/requirements-test.in +++ b/requirements-test.in @@ -8,6 +8,7 @@ pytest-shard # testing utils awscli +decord # required for video tests einops # required for MPT, qwen-vl and Mamba httpx librosa # required for audio tests @@ -15,12 +16,13 @@ opencv-python # required for video tests peft requests ray[adag]==2.35 -sentence-transformers # required for embedding -soundfile # required for audio test +sentence-transformers # required for embedding tests +soundfile # required for audio tests timm # required for internvl test torch==2.5.1 transformers_stream_generator # required for qwen-vl test matplotlib # required for qwen-vl test +mistral_common[opencv] >= 1.4.4 # required for pixtral test datamodel_code_generator # required for minicpm3 test lm-eval[api]==0.4.4 # required for model evaluation test diff --git a/requirements-test.txt b/requirements-test.txt index 518e81021cbcb..fb322fcc72dc2 100644 --- 
a/requirements-test.txt +++ b/requirements-test.txt @@ -1,8 +1,8 @@ # -# This file is autogenerated by pip-compile with Python 3.12 +# This file is autogenerated by pip-compile with Python 3.9 # by the following command: # -# pip-compile --output-file=requirements-test.txt requirements-test.in +# pip-compile requirements-test.in # absl-py==2.1.0 # via rouge-score @@ -28,6 +28,10 @@ anyio==4.6.2.post1 # via httpx argcomplete==3.5.1 # via datamodel-code-generator +async-timeout==4.0.3 + # via + # aiohttp + # redis attrs==24.2.0 # via # aiohttp @@ -90,6 +94,8 @@ datasets==3.0.2 # lm-eval decorator==5.1.1 # via librosa +decord==0.6.0 + # via -r requirements-test.in dill==0.3.8 # via # datasets @@ -106,6 +112,10 @@ email-validator==2.2.0 # via pydantic evaluate==0.4.3 # via lm-eval +exceptiongroup==1.2.2 + # via + # anyio + # pytest fastrlock==0.8.2 # via cupy-cuda12x filelock==3.16.1 @@ -156,6 +166,8 @@ idna==3.10 # httpx # requests # yarl +importlib-resources==6.4.5 + # via matplotlib inflect==5.6.2 # via datamodel-code-generator iniconfig==2.0.0 @@ -178,7 +190,9 @@ joblib==1.4.2 jsonlines==4.0.0 # via lm-eval jsonschema==4.23.0 - # via ray + # via + # mistral-common + # ray jsonschema-specifications==2024.10.1 # via jsonschema kiwisolver==1.4.7 @@ -204,6 +218,10 @@ mbstrdecoder==1.1.3 # dataproperty # pytablewriter # typepy +mistral-common[opencv]==1.4.4 + # via + # -r requirements-test.in + # mistral-common more-itertools==10.5.0 # via lm-eval mpmath==1.3.0 @@ -238,12 +256,15 @@ numpy==1.26.4 # contourpy # cupy-cuda12x # datasets + # decord # evaluate # librosa # matplotlib + # mistral-common # numba # numexpr # opencv-python + # opencv-python-headless # pandas # peft # rouge-score @@ -288,6 +309,8 @@ nvidia-nvtx-cu12==12.4.127 # via torch opencv-python==4.10.0.84 # via -r requirements-test.in +opencv-python-headless==4.10.0.84 + # via mistral-common packaging==24.1 # via # accelerate @@ -317,9 +340,10 @@ peft==0.13.2 # via # -r requirements-test.in # lm-eval -pillow==11.0.0 +pillow==10.4.0 # via # matplotlib + # mistral-common # sentence-transformers # torchvision platformdirs==4.3.6 @@ -354,7 +378,9 @@ pybind11==2.13.6 pycparser==2.22 # via cffi pydantic[email]==2.9.2 - # via datamodel-code-generator + # via + # datamodel-code-generator + # mistral-common pydantic-core==2.23.4 # via pydantic pyparsing==3.2.0 @@ -420,6 +446,7 @@ requests==2.32.3 # evaluate # huggingface-hub # lm-eval + # mistral-common # pooch # ray # tiktoken @@ -456,6 +483,8 @@ scipy==1.13.1 # sentence-transformers sentence-transformers==3.2.1 # via -r requirements-test.in +sentencepiece==0.2.0 + # via mistral-common six==1.16.0 # via # python-dateutil @@ -486,12 +515,20 @@ tensorizer==2.9.0 # via -r requirements-test.in threadpoolctl==3.5.0 # via scikit-learn -tiktoken==0.8.0 - # via lm-eval +tiktoken==0.7.0 + # via + # lm-eval + # mistral-common timm==1.0.11 # via -r requirements-test.in tokenizers==0.20.1 # via transformers +toml==0.10.2 + # via datamodel-code-generator +tomli==2.0.2 + # via + # black + # pytest torch==2.5.1 # via # -r requirements-test.in @@ -535,8 +572,12 @@ typepy[datetime]==1.3.2 # tabledata typing-extensions==4.12.2 # via + # anyio + # black # huggingface-hub # librosa + # mistral-common + # multidict # pydantic # pydantic-core # torch @@ -554,6 +595,8 @@ xxhash==3.5.0 # evaluate yarl==1.17.1 # via aiohttp +zipp==3.20.2 + # via importlib-resources zstandard==0.23.0 # via lm-eval diff --git a/setup.py b/setup.py index d2438ae74c455..b936589869e76 100644 --- a/setup.py +++ b/setup.py @@ -554,7 
+554,8 @@ def _read_requirements(filename: str) -> List[str]: ext_modules=ext_modules, extras_require={ "tensorizer": ["tensorizer>=2.9.0"], - "audio": ["librosa", "soundfile"] # Required for audio processing + "audio": ["librosa", "soundfile"], # Required for audio processing + "video": ["decord"] # Required for video processing }, cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {}, package_data=package_data, diff --git a/tests/entrypoints/openai/test_video.py b/tests/entrypoints/openai/test_video.py new file mode 100644 index 0000000000000..294b250362699 --- /dev/null +++ b/tests/entrypoints/openai/test_video.py @@ -0,0 +1,345 @@ +from typing import Dict, List + +import openai +import pytest +import pytest_asyncio + +from vllm.multimodal.utils import encode_video_base64, fetch_video + +from ...utils import RemoteOpenAIServer + +MODEL_NAME = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf" +MAXIMUM_VIDEOS = 4 + +TEST_VIDEO_URLS = [ + "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4", + "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ElephantsDream.mp4", + "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerBlazes.mp4", + "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4", +] + + +@pytest.fixture(scope="module") +def server(): + args = [ + "--task", + "generate", + "--dtype", + "bfloat16", + "--max-model-len", + "32768", + "--max-num-seqs", + "2", + "--enforce-eager", + "--trust-remote-code", + "--limit-mm-per-prompt", + f"video={MAXIMUM_VIDEOS}", + ] + + with RemoteOpenAIServer(MODEL_NAME, args) as remote_server: + yield remote_server + + +@pytest_asyncio.fixture +async def client(server): + async with server.get_async_client() as async_client: + yield async_client + + +@pytest.fixture(scope="session") +def base64_encoded_video() -> Dict[str, str]: + return { + video_url: encode_video_base64(fetch_video(video_url)) + for video_url in TEST_VIDEO_URLS + } + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) +async def test_single_chat_session_video(client: openai.AsyncOpenAI, + model_name: str, video_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "video_url", + "video_url": { + "url": video_url + } + }, + { + "type": "text", + "text": "What's in this video?" 
+ }, + ], + }] + + # test single completion + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + logprobs=True, + top_logprobs=5) + assert len(chat_completion.choices) == 1 + + choice = chat_completion.choices[0] + assert choice.finish_reason == "length" + assert chat_completion.usage == openai.types.CompletionUsage( + completion_tokens=10, prompt_tokens=6299, total_tokens=6309) + + message = choice.message + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 10 + assert message.role == "assistant" + messages.append({"role": "assistant", "content": message.content}) + + # test multi-turn dialogue + messages.append({"role": "user", "content": "express your result in json"}) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + ) + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) +async def test_single_chat_session_video_beamsearch(client: openai.AsyncOpenAI, + model_name: str, + video_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "video_url", + "video_url": { + "url": video_url + } + }, + { + "type": "text", + "text": "What's in this video?" + }, + ], + }] + + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + n=2, + max_completion_tokens=10, + logprobs=True, + top_logprobs=5, + extra_body=dict(use_beam_search=True)) + assert len(chat_completion.choices) == 2 + assert chat_completion.choices[ + 0].message.content != chat_completion.choices[1].message.content + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) +async def test_single_chat_session_video_base64encoded( + client: openai.AsyncOpenAI, model_name: str, video_url: str, + base64_encoded_video: Dict[str, str]): + + messages = [{ + "role": + "user", + "content": [ + { + "type": "video_url", + "video_url": { + "url": + f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" + } + }, + { + "type": "text", + "text": "What's in this video?" 
+ }, + ], + }] + + # test single completion + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + logprobs=True, + top_logprobs=5) + assert len(chat_completion.choices) == 1 + + choice = chat_completion.choices[0] + assert choice.finish_reason == "length" + assert chat_completion.usage == openai.types.CompletionUsage( + completion_tokens=10, prompt_tokens=6299, total_tokens=6309) + + message = choice.message + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 10 + assert message.role == "assistant" + messages.append({"role": "assistant", "content": message.content}) + + # test multi-turn dialogue + messages.append({"role": "user", "content": "express your result in json"}) + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + ) + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 0 + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) +async def test_single_chat_session_video_base64encoded_beamsearch( + client: openai.AsyncOpenAI, model_name: str, video_url: str, + base64_encoded_video: Dict[str, str]): + + messages = [{ + "role": + "user", + "content": [ + { + "type": "video_url", + "video_url": { + "url": + f"data:video/jpeg;base64,{base64_encoded_video[video_url]}" + } + }, + { + "type": "text", + "text": "What's in this video?" + }, + ], + }] + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + n=2, + max_completion_tokens=10, + extra_body=dict(use_beam_search=True)) + assert len(chat_completion.choices) == 2 + assert chat_completion.choices[ + 0].message.content != chat_completion.choices[1].message.content + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS) +async def test_chat_streaming_video(client: openai.AsyncOpenAI, + model_name: str, video_url: str): + messages = [{ + "role": + "user", + "content": [ + { + "type": "video_url", + "video_url": { + "url": video_url + } + }, + { + "type": "text", + "text": "What's in this video?" 
+ }, + ], + }] + + # test single completion + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0, + ) + output = chat_completion.choices[0].message.content + stop_reason = chat_completion.choices[0].finish_reason + + # test streaming + stream = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0, + stream=True, + ) + chunks: List[str] = [] + finish_reason_count = 0 + async for chunk in stream: + delta = chunk.choices[0].delta + if delta.role: + assert delta.role == "assistant" + if delta.content: + chunks.append(delta.content) + if chunk.choices[0].finish_reason is not None: + finish_reason_count += 1 + # finish reason should only return in last block + assert finish_reason_count == 1 + assert chunk.choices[0].finish_reason == stop_reason + assert delta.content + assert "".join(chunks) == output + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +@pytest.mark.parametrize( + "video_urls", + [TEST_VIDEO_URLS[:i] for i in range(2, len(TEST_VIDEO_URLS))]) +async def test_multi_video_input(client: openai.AsyncOpenAI, model_name: str, + video_urls: List[str]): + + messages = [{ + "role": + "user", + "content": [ + *({ + "type": "video_url", + "video_url": { + "url": video_url + } + } for video_url in video_urls), + { + "type": "text", + "text": "What's in this video?" + }, + ], + }] + + if len(video_urls) > MAXIMUM_VIDEOS: + with pytest.raises(openai.BadRequestError): # test multi-video input + await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0, + ) + + # the server should still work afterwards + completion = await client.completions.create( + model=model_name, + prompt=[0, 0, 0, 0, 0], + max_tokens=5, + temperature=0.0, + ) + completion = completion.choices[0].text + assert completion is not None and len(completion) >= 0 + else: + chat_completion = await client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=10, + temperature=0.0, + ) + message = chat_completion.choices[0].message + assert message.content is not None and len(message.content) >= 0 diff --git a/vllm/assets/video.py b/vllm/assets/video.py index 05e031affabae..e4dcab10466db 100644 --- a/vllm/assets/video.py +++ b/vllm/assets/video.py @@ -35,7 +35,7 @@ def download_video_asset(filename: str) -> str: def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray: - cv2 = try_import_video_packages() + cv2, _ = try_import_video_packages() cap = cv2.VideoCapture(path) if not cap.isOpened(): @@ -59,7 +59,7 @@ def video_to_ndarrays(path: str, num_frames: int = -1) -> npt.NDArray: def video_to_pil_images_list(path: str, num_frames: int = -1) -> List[Image.Image]: - cv2 = try_import_video_packages() + cv2, _ = try_import_video_packages() frames = video_to_ndarrays(path, num_frames) return [ Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index ed4e4399d5514..3ca460c47c3bd 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -30,7 +30,9 @@ from vllm.multimodal import MultiModalDataDict from vllm.multimodal.utils import (async_get_and_parse_audio, async_get_and_parse_image, - get_and_parse_audio, get_and_parse_image) + async_get_and_parse_video, + get_and_parse_audio, get_and_parse_image, + get_and_parse_video) from 
vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer from vllm.utils import print_warning_once @@ -51,6 +53,20 @@ class ChatCompletionContentPartAudioParam(TypedDict, total=False): """The type of the content part.""" +class VideoURL(TypedDict, total=False): + url: Required[str] + """ + Either a URL of the video or a data URL with base64 encoded video data. + """ + + +class ChatCompletionContentPartVideoParam(TypedDict, total=False): + video_url: Required[VideoURL] + + type: Required[Literal["video_url"]] + """The type of the content part.""" + + class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False): """A simpler version of the param that only accepts a plain image_url. This is supported by OpenAI API, although it is not documented. @@ -74,11 +90,23 @@ class CustomChatCompletionContentSimpleAudioParam(TypedDict, total=False): audio_url: Required[str] +class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False): + """A simpler version of the param that only accepts a plain audio_url. + + Example: + { + "video_url": "https://example.com/video.mp4" + } + """ + video_url: Required[str] + + ChatCompletionContentPartParam: TypeAlias = Union[ OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam, - ChatCompletionContentPartRefusalParam, + ChatCompletionContentPartVideoParam, ChatCompletionContentPartRefusalParam, CustomChatCompletionContentSimpleImageParam, - CustomChatCompletionContentSimpleAudioParam, str] + CustomChatCompletionContentSimpleAudioParam, + CustomChatCompletionContentSimpleVideoParam, str] class CustomChatCompletionMessageParam(TypedDict, total=False): @@ -201,6 +229,9 @@ def _placeholder_str(self, modality: ModalityStr, elif modality == "video": if model_type == "qwen2_vl": return "<|vision_start|><|video_pad|><|vision_end|>" + if model_type.startswith("llava"): + return self._cached_token_str(self._tokenizer, + hf_config.video_token_index) raise TypeError(f"Unknown {modality} model type: {model_type}") else: raise TypeError(f"Unknown modality: {modality}") @@ -291,6 +322,10 @@ def parse_image(self, image_url: str) -> None: def parse_audio(self, audio_url: str) -> None: raise NotImplementedError + @abstractmethod + def parse_video(self, video_url: str) -> None: + raise NotImplementedError + class MultiModalContentParser(BaseMultiModalContentParser): @@ -313,6 +348,12 @@ def parse_audio(self, audio_url: str) -> None: placeholder = self._tracker.add("audio", audio) self._add_placeholder(placeholder) + def parse_video(self, video_url: str) -> None: + video = get_and_parse_video(video_url) + + placeholder = self._tracker.add("video", video) + self._add_placeholder(placeholder) + class AsyncMultiModalContentParser(BaseMultiModalContentParser): @@ -336,6 +377,12 @@ def parse_audio(self, audio_url: str) -> None: placeholder = self._tracker.add("audio", audio_coro) self._add_placeholder(placeholder) + def parse_video(self, video_url: str) -> None: + video = async_get_and_parse_video(video_url) + + placeholder = self._tracker.add("video", video) + self._add_placeholder(placeholder) + def validate_chat_template(chat_template: Optional[Union[Path, str]]): """Raises if the provided chat template appears invalid.""" @@ -416,6 +463,7 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int], _ImageParser = partial(cast, ChatCompletionContentPartImageParam) _AudioParser = partial(cast, ChatCompletionContentPartAudioParam) _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam) +_VideoParser = 
partial(cast, ChatCompletionContentPartVideoParam) MODEL_KEEP_MULTI_MODAL_CONTENT = {'mllama'} # Define a mapping from part types to their corresponding parsing functions. @@ -428,6 +476,8 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int], lambda part: _AudioParser(part).get("audio_url", {}).get("url", ""), "refusal": lambda part: _RefusalParser(part).get("refusal", ""), + "video_url": + lambda part: _VideoParser(part).get("video_url", {}).get("url", ""), } @@ -472,7 +522,10 @@ def _parse_chat_message_content_mm_part( audio_params = cast(CustomChatCompletionContentSimpleAudioParam, part) return "audio_url", audio_params.get("audio_url", "") - + if part.get("video_url") is not None: + video_params = cast(CustomChatCompletionContentSimpleVideoParam, + part) + return "video_url", video_params.get("video_url", "") # Raise an error if no 'type' or direct URL is found. raise ValueError("Missing 'type' field in multimodal part.") @@ -482,7 +535,7 @@ def _parse_chat_message_content_mm_part( VALID_MESSAGE_CONTENT_MM_PART_TYPES = ("text", "refusal", "image_url", - "audio_url") + "audio_url", "video_url") def _parse_chat_message_content_parts( @@ -542,7 +595,7 @@ def _parse_chat_message_content_part( # Handle structured dictionary parts part_type, content = _parse_chat_message_content_mm_part(part) - # if part_type is text/refusal/image_url/audio_url but + # if part_type is text/refusal/image_url/audio_url/video_url but # content is empty, log a warning and skip if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and not content: logger.warning( @@ -561,6 +614,10 @@ def _parse_chat_message_content_part( mm_parser.parse_audio(content) return {'type': 'audio'} if wrap_dicts else None + if part_type == "video_url": + mm_parser.parse_video(content) + return {'type': 'video'} if wrap_dicts else None + raise NotImplementedError(f"Unknown part type: {part_type}") diff --git a/vllm/envs.py b/vllm/envs.py index b4a263d1e086e..9e596a699e466 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -49,7 +49,8 @@ VLLM_WORKER_MULTIPROC_METHOD: str = "fork" VLLM_ASSETS_CACHE: str = os.path.join(VLLM_CACHE_ROOT, "assets") VLLM_IMAGE_FETCH_TIMEOUT: int = 5 - VLLM_AUDIO_FETCH_TIMEOUT: int = 5 + VLLM_VIDEO_FETCH_TIMEOUT: int = 15 + VLLM_AUDIO_FETCH_TIMEOUT: int = 10 VLLM_TARGET_DEVICE: str = "cuda" MAX_JOBS: Optional[str] = None NVCC_THREADS: Optional[str] = None @@ -376,10 +377,15 @@ def get_default_config_root(): "VLLM_IMAGE_FETCH_TIMEOUT": lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")), + # Timeout for fetching videos when serving multimodal models + # Default is 15 seconds + "VLLM_VIDEO_FETCH_TIMEOUT": + lambda: int(os.getenv("VLLM_VIDEO_FETCH_TIMEOUT", "15")), + # Timeout for fetching audio when serving multimodal models - # Default is 5 seconds + # Default is 10 seconds "VLLM_AUDIO_FETCH_TIMEOUT": - lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "5")), + lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")), # Path to the XLA persistent cache directory. # Only used for XLA devices such as TPUs. 
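A minimal sketch (not part of the diff above) of how a deployment might override the new fetch timeouts, assuming the variables are set before the engine fetches any media so the lazy lookups in envs.py pick them up:

    import os

    # Hypothetical values for large remote media; the defaults added by this
    # patch are 15 seconds for video and 10 seconds for audio.
    os.environ["VLLM_VIDEO_FETCH_TIMEOUT"] = "60"
    os.environ["VLLM_AUDIO_FETCH_TIMEOUT"] = "30"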
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 26ece8190e7de..ad5d551ee0834 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -341,7 +341,7 @@ def input_processor_when_multimodal_input_video(ctx: InputContext, get_llava_onevision_video_tokens(ctx, num_frames)) tokenizer = cached_get_tokenizer(model_config.tokenizer) - new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens( + new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens( tokenizer, inputs.get("prompt"), inputs["prompt_token_ids"], @@ -350,7 +350,8 @@ def input_processor_when_multimodal_input_video(ctx: InputContext, ) return token_inputs(prompt_token_ids=new_token_ids, prompt=new_prompt, - multi_modal_data=multi_modal_data) + multi_modal_data=multi_modal_data, + multi_modal_placeholders={"video": ranges}) else: raise TypeError(f"Invalid video type: {type(video_data)}") diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 5ff6f93fb25b4..26c94cf2d0b20 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -136,6 +136,9 @@ class MultiModalDataBuiltins(TypedDict, total=False): audio: MultiModalData[Tuple[np.ndarray, Union[int, float]]] """The input audio item(s) and corresponding sampling rate(s).""" + video: MultiModalData[Tuple[np.ndarray]] + """The input video(s).""" + MultiModalDataDict = Union[MultiModalDataBuiltins, Mapping[str, MultiModalData[object]]] diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 283c23c94d330..0c666b8cc2e69 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -8,8 +8,8 @@ import numpy.typing as npt from PIL import Image +import vllm.envs as envs from vllm.connections import global_http_connection -from vllm.envs import VLLM_AUDIO_FETCH_TIMEOUT, VLLM_IMAGE_FETCH_TIMEOUT from vllm.logger import init_logger from vllm.multimodal.base import MultiModalDataDict, PlaceholderRange from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer @@ -80,7 +80,9 @@ def fetch_image(image_url: str, """ if image_url.startswith('http'): image_raw = global_http_connection.get_bytes( - image_url, timeout=VLLM_IMAGE_FETCH_TIMEOUT) + image_url, + timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT, + ) image = _load_image_from_bytes(image_raw) elif image_url.startswith('data:image'): @@ -105,7 +107,9 @@ async def async_fetch_image(image_url: str, """ if image_url.startswith('http'): image_raw = await global_http_connection.async_get_bytes( - image_url, timeout=VLLM_IMAGE_FETCH_TIMEOUT) + image_url, + timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT, + ) image = _load_image_from_bytes(image_raw) elif image_url.startswith('data:image'): @@ -119,6 +123,85 @@ async def async_fetch_image(image_url: str, return image.convert(image_mode) +def _load_video_frames_from_bytes(b: bytes): + frame = Image.open(BytesIO(b)) + return np.array(frame) + + +def load_video_frames_from_base64(frame: Union[bytes, str]): + """Load frame from base64 format.""" + return _load_video_frames_from_bytes(base64.b64decode(frame)) + + +def _load_video_from_bytes(b: bytes, num_frames: int = 32): + _, decord = try_import_video_packages() + + video_path = BytesIO(b) + vr = decord.VideoReader(video_path, num_threads=1) + total_frame_num = len(vr) + + if total_frame_num > num_frames: + uniform_sampled_frames = np.linspace(0, + total_frame_num - 1, + num_frames, + dtype=int) + frame_idx = uniform_sampled_frames.tolist() + else: + frame_idx = [i for i in 
range(0, total_frame_num)] + frames = vr.get_batch(frame_idx).asnumpy() + + return frames + + +def _load_video_from_data_url(video_url: str): + # Only split once and assume the second part is the base64 encoded image + frames_base64 = video_url.split(",")[1:] + return np.stack([ + load_video_frames_from_base64(frame_base64) + for frame_base64 in frames_base64 + ]) + + +def fetch_video(video_url: str, *, num_frames: int = 32) -> npt.NDArray: + """ + Load video from a HTTP or base64 data URL. + """ + if video_url.startswith('http') or video_url.startswith('https'): + video_raw = global_http_connection.get_bytes( + video_url, + timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT, + ) + video = _load_video_from_bytes(video_raw, num_frames) + elif video_url.startswith('data:video'): + video = _load_video_from_data_url(video_url) + else: + raise ValueError("Invalid 'video_url': A valid 'video_url' must start " + "with either 'data:video' or 'http'.") + return video + + +async def async_fetch_video(video_url: str, + *, + num_frames: int = 32) -> npt.NDArray: + """ + Asynchronously load video from a HTTP or base64 data URL. + + By default, the image is converted into RGB format. + """ + if video_url.startswith('http') or video_url.startswith('https'): + video_raw = await global_http_connection.async_get_bytes( + video_url, + timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT, + ) + video = _load_video_from_bytes(video_raw, num_frames) + elif video_url.startswith('data:video'): + video = _load_video_from_data_url(video_url) + else: + raise ValueError("Invalid 'video_url': A valid 'video_url' must start " + "with either 'data:video' or 'http'.") + return video + + def try_import_audio_packages() -> Tuple[Any, Any]: try: import librosa @@ -137,7 +220,9 @@ def fetch_audio(audio_url: str) -> Tuple[np.ndarray, Union[int, float]]: if audio_url.startswith("http"): audio_bytes = global_http_connection.get_bytes( - audio_url, timeout=VLLM_AUDIO_FETCH_TIMEOUT) + audio_url, + timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT, + ) elif audio_url.startswith("data:audio"): _, audio_base64 = audio_url.split(",", 1) audio_bytes = base64.b64decode(audio_base64) @@ -157,7 +242,9 @@ async def async_fetch_audio( if audio_url.startswith("http"): audio_bytes = await global_http_connection.async_get_bytes( - audio_url, timeout=VLLM_AUDIO_FETCH_TIMEOUT) + audio_url, + timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT, + ) elif audio_url.startswith("data:audio"): _, audio_base64 = audio_url.split(",", 1) audio_bytes = base64.b64decode(audio_base64) @@ -182,6 +269,11 @@ def get_and_parse_image( return {"image": image} +def get_and_parse_video(video_url: str) -> MultiModalDataDict: + video = fetch_video(video_url) + return {"video": video} + + async def async_get_and_parse_audio(audio_url: str) -> MultiModalDataDict: audio, sr = await async_fetch_audio(audio_url) return {"audio": (audio, sr)} @@ -196,6 +288,11 @@ async def async_get_and_parse_image( return {"image": image} +async def async_get_and_parse_video(video_url: str) -> MultiModalDataDict: + video = await async_fetch_video(video_url) + return {"video": video} + + def encode_audio_base64( audio: np.ndarray, sampling_rate: int, @@ -246,14 +343,15 @@ def rescale_image_size(image: Image.Image, def try_import_video_packages() -> Any: try: import cv2 + import decord except ImportError: raise ImportError( "Please install vllm[video] for video support.") from None - return cv2 + return cv2, decord def resize_video(frames: npt.NDArray, size: Tuple[int, int]) -> npt.NDArray: - cv2 = try_import_video_packages() + cv2, _ = 
try_import_video_packages() num_frames, _, _, channels = frames.shape new_height, new_width = size @@ -284,6 +382,15 @@ def sample_frames_from_video(frames: npt.NDArray, return sampled_frames +def encode_video_base64(frames: npt.NDArray): + base64_frames = [] + frames_list = [frames[i] for i in range(frames.shape[0])] + for frame in frames_list: + img_base64 = encode_image_base64(Image.fromarray(frame)) + base64_frames.append(img_base64) + return ",".join(base64_frames) + + # Utilities for input processors _T = TypeVar("_T", str, int) diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 6c2c6720f4276..40a92fed28c87 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -7,6 +7,7 @@ from vllm.logger import init_logger from vllm.transformers_utils.processor import get_video_processor from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.utils import is_list_of from .base import MultiModalData, MultiModalInputs from .image import ImagePlugin @@ -60,7 +61,7 @@ def _default_input_mapper( if isinstance(data, list) and len(data) == 1: data = data[0] - if isinstance(data, np.ndarray): + if isinstance(data, np.ndarray) or is_list_of(data, np.ndarray): video_processor = self._get_hf_video_processor( model_config, mm_processor_kwargs, From 93bff421bc012cc96b6eb91db459faf1b731f123 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 7 Nov 2024 21:44:58 +0000 Subject: [PATCH 033/183] Bump actions/checkout from 4.2.1 to 4.2.2 (#9746) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/actionlint.yml | 2 +- .github/workflows/clang-format.yml | 2 +- .github/workflows/cleanup_pr_body.yml | 2 +- .github/workflows/codespell.yml | 2 +- .github/workflows/mypy.yaml | 2 +- .github/workflows/publish.yml | 4 ++-- .github/workflows/ruff.yml | 2 +- .github/workflows/shellcheck.yml | 2 +- .github/workflows/yapf.yml | 2 +- 9 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml index 5eddf6b7c649b..0226cf0ca00e9 100644 --- a/.github/workflows/actionlint.yml +++ b/.github/workflows/actionlint.yml @@ -30,7 +30,7 @@ jobs: runs-on: ubuntu-latest steps: - name: "Checkout" - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 0 diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml index ea0c567e1b942..68149d2dc019f 100644 --- a/.github/workflows/clang-format.yml +++ b/.github/workflows/clang-format.yml @@ -29,7 +29,7 @@ jobs: matrix: python-version: ["3.11"] steps: - - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 with: diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml index b516c45c41dfc..7cf7242e130c8 100644 --- a/.github/workflows/cleanup_pr_body.yml +++ b/.github/workflows/cleanup_pr_body.yml @@ -10,7 +10,7 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Set up Python uses: 
actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml index 7d2fdc436790d..68887adaae54b 100644 --- a/.github/workflows/codespell.yml +++ b/.github/workflows/codespell.yml @@ -31,7 +31,7 @@ jobs: matrix: python-version: ["3.12"] steps: - - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 with: diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml index 6f28b476343e9..73eeacf1fa562 100644 --- a/.github/workflows/mypy.yaml +++ b/.github/workflows/mypy.yaml @@ -32,7 +32,7 @@ jobs: matrix: python-version: ["3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 with: diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 6a9c566334d20..c1051d10a4860 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -21,7 +21,7 @@ jobs: upload_url: ${{ steps.create_release.outputs.upload_url }} steps: - name: Checkout - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Extract branch info shell: bash @@ -54,7 +54,7 @@ jobs: steps: - name: Checkout - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Setup ccache uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14 diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml index ffc13a7c7fe59..7266cc378cfb0 100644 --- a/.github/workflows/ruff.yml +++ b/.github/workflows/ruff.yml @@ -34,7 +34,7 @@ jobs: matrix: python-version: ["3.12"] steps: - - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 with: diff --git a/.github/workflows/shellcheck.yml b/.github/workflows/shellcheck.yml index ac43b20c31390..4b1587e373e17 100644 --- a/.github/workflows/shellcheck.yml +++ b/.github/workflows/shellcheck.yml @@ -28,7 +28,7 @@ jobs: runs-on: ubuntu-latest steps: - name: "Checkout" - uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7 + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 with: fetch-depth: 0 diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml index ac12b03084f20..ff441f94435ad 100644 --- a/.github/workflows/yapf.yml +++ b/.github/workflows/yapf.yml @@ -23,7 +23,7 @@ jobs: matrix: python-version: ["3.12"] steps: - - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1 + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 with: From 073a4727282b00f3626d5fdf720bd19589db7b48 Mon Sep 17 00:00:00 2001 From: Jiangtao 
Hu Date: Thu, 7 Nov 2024 16:14:01 -0800 Subject: [PATCH 034/183] [Misc] report relevant env vars in collect_env.py tool (#9293) --- collect_env.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/collect_env.py b/collect_env.py index 80403d576d78f..254c19b19a5ac 100644 --- a/collect_env.py +++ b/collect_env.py @@ -1,17 +1,19 @@ # ruff: noqa # code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py -# Unlike the rest of the PyTorch this file must be python2 compliant. -# This script outputs relevant system environment info -# Run it with `python collect_env.py` or `python -m torch.utils.collect_env` import datetime import locale import os import re import subprocess import sys +# Unlike the rest of the PyTorch this file must be python2 compliant. +# This script outputs relevant system environment info +# Run it with `python collect_env.py` or `python -m torch.utils.collect_env` from collections import namedtuple +from vllm.envs import environment_variables + try: import torch TORCH_AVAILABLE = True @@ -52,6 +54,7 @@ 'vllm_version', # vllm specific field 'vllm_build_flags', # vllm specific field 'gpu_topo', # vllm specific field + 'env_vars', ]) DEFAULT_CONDA_PATTERNS = { @@ -512,6 +515,22 @@ def is_xnnpack_available(): else: return "N/A" +def get_env_vars(): + env_vars = '' + secret_terms=('secret', 'token', 'api', 'access', 'password') + report_prefix = ("TORCH", "NCCL", "PYTORCH", + "CUDA", "CUBLAS", "CUDNN", + "OMP_", "MKL_", + "NVIDIA") + for k, v in os.environ.items(): + if any(term in k.lower() for term in secret_terms): + continue + if k in environment_variables: + env_vars = env_vars + "{}={}".format(k, v) + "\n" + if k.startswith(report_prefix): + env_vars = env_vars + "{}={}".format(k, v) + "\n" + + return env_vars def get_env_info(): run_lambda = run @@ -583,6 +602,7 @@ def get_version_or_na(cfg, prefix): vllm_version=vllm_version, vllm_build_flags=vllm_build_flags, gpu_topo=gpu_topo, + env_vars=get_env_vars(), ) @@ -631,6 +651,8 @@ def get_version_or_na(cfg, prefix): {vllm_build_flags} GPU Topology: {gpu_topo} + +{env_vars} """.strip() From 42b4f46b71572e21582fd12c498ec3b0b78ada7b Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 7 Nov 2024 17:08:24 -0800 Subject: [PATCH 035/183] [V1] Add all_token_ids attribute to Request (#10135) Signed-off-by: Woosuk Kwon --- vllm/v1/core/scheduler.py | 2 +- vllm/v1/engine/llm_engine.py | 2 +- vllm/v1/request.py | 29 ++++++++++++++-- vllm/v1/utils.py | 64 ++++++++++++++++++++++++++++++++++++ 4 files changed, 92 insertions(+), 5 deletions(-) create mode 100644 vllm/v1/utils.py diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 41659ff62747d..6017905642172 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -246,7 +246,7 @@ def update_from_output( # NOTE(woosuk): Currently, we assume that each request # generates at most one token at each step. token_id = sampled_token_ids[req_index] - request.output_token_ids.append(token_id) + request.append_output_token_ids(token_id) sampled.append((request, 1)) # TODO: Update the KV cache manager for prefix caching. 
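A minimal usage sketch (not part of the diff above), assuming a request object and a sampled token_id as in the scheduler loop; the ConstantList guard it relies on is added to vllm/v1/utils.py later in this patch:

    # Appends go through the Request so that output_token_ids and
    # all_token_ids stay in sync.
    request.append_output_token_ids(token_id)

    # The exposed views are read-only; a direct append would raise
    # "Cannot append to a constant list".
    assert request.num_output_tokens == len(request.output_token_ids)
    assert request.num_tokens == len(request.all_token_ids)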
diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 5f5720480abdc..b538c2c7d63bc 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -324,7 +324,7 @@ def send_to_detokenizer(self, sampled: List[Tuple[Request, int]]) -> None: ) for req, num_tokens in sampled: inputs.req_ids.append(req.request_id) - if len(req.output_token_ids) == num_tokens: + if req.num_output_tokens == num_tokens: # The request is first detokenized. inputs.prompt_token_ids.append(req.prompt_token_ids) else: diff --git a/vllm/v1/request.py b/vllm/v1/request.py index be7d4d165d280..087067cdac56f 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -4,6 +4,7 @@ from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingParams from vllm.sequence import RequestMetrics +from vllm.v1.utils import ConstantList if TYPE_CHECKING: from vllm.inputs import DecoderOnlyInputs @@ -40,17 +41,39 @@ def __init__( self.prompt = inputs.get("prompt") self.prompt_token_ids = inputs["prompt_token_ids"] self.num_prompt_tokens = len(self.prompt_token_ids) - self.output_token_ids: List[int] = [] + self._output_token_ids: List[int] = [] + self._all_token_ids: List[int] = self.prompt_token_ids.copy() self.output_text = "" self.num_computed_tokens = 0 + @property + def output_token_ids(self) -> ConstantList[int]: + # Prevent directly appending to the output_token_ids since + # all_token_ids should also be updated simultaneously. + return ConstantList(self._output_token_ids) + + @property + def all_token_ids(self) -> ConstantList[int]: + # Prevent directly appending to the all_token_ids since + # output_token_ids should also be updated simultaneously + return ConstantList(self._all_token_ids) + + def append_output_token_ids( + self, + token_ids: Union[int, List[int]], + ) -> None: + if isinstance(token_ids, int): + token_ids = [token_ids] + self._output_token_ids.extend(token_ids) + self._all_token_ids.extend(token_ids) + @property def num_tokens(self) -> int: - return self.num_prompt_tokens + len(self.output_token_ids) + return len(self._all_token_ids) @property def num_output_tokens(self) -> int: - return len(self.output_token_ids) + return len(self._output_token_ids) def is_finished(self) -> bool: return RequestStatus.is_finished(self.status) diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py new file mode 100644 index 0000000000000..4b26749712e32 --- /dev/null +++ b/vllm/v1/utils.py @@ -0,0 +1,64 @@ +from typing import Generic, List, TypeVar, overload + +T = TypeVar("T") + + +class ConstantList(Generic[T]): + + def __init__(self, x: List[T]) -> None: + self._x = x + + def append(self, item): + raise Exception("Cannot append to a constant list") + + def extend(self, item): + raise Exception("Cannot extend a constant list") + + def insert(self, item): + raise Exception("Cannot insert into a constant list") + + def pop(self, item): + raise Exception("Cannot pop from a constant list") + + def remove(self, item): + raise Exception("Cannot remove from a constant list") + + def clear(self): + raise Exception("Cannot clear a constant list") + + def index(self, item): + return self._x.index(item) + + @overload + def __getitem__(self, item) -> T: + ... + + @overload + def __getitem__(self, s: slice, /) -> List[T]: + ... + + def __getitem__(self, item): + return self._x[item] + + @overload + def __setitem__(self, item, value): + ... + + @overload + def __setitem__(self, s: slice, value, /): + ... 
+ + def __setitem__(self, item, value): + raise Exception("Cannot set item in a constant list") + + def __delitem__(self, item): + raise Exception("Cannot delete item from a constant list") + + def __iter__(self): + return iter(self._x) + + def __contains__(self, item): + return item in self._x + + def __len__(self): + return len(self._x) From 201fc07730ec96dd88b758064f148a424f4b251b Mon Sep 17 00:00:00 2001 From: Cody Yu Date: Thu, 7 Nov 2024 17:34:44 -0800 Subject: [PATCH 036/183] [V1] Prefix caching (take 2) (#9972) Signed-off-by: Cody Yu --- benchmarks/benchmark_prefix_caching.py | 9 +- tests/v1/core/test_prefix_caching.py | 219 ++++++++++++++ vllm/v1/core/kv_cache_manager.py | 382 ++++++++++++++++++++++--- vllm/v1/core/kv_cache_utils.py | 194 +++++++++++++ vllm/v1/core/scheduler.py | 32 ++- vllm/v1/engine/llm_engine.py | 1 + 6 files changed, 771 insertions(+), 66 deletions(-) create mode 100644 tests/v1/core/test_prefix_caching.py create mode 100644 vllm/v1/core/kv_cache_utils.py diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py index 1aac029992dbf..6d33096ca1d11 100644 --- a/benchmarks/benchmark_prefix_caching.py +++ b/benchmarks/benchmark_prefix_caching.py @@ -118,7 +118,7 @@ def main(args): random.seed(args.seed) if args.dataset_path is not None: print(f"Start to sample {args.num_prompts} prompts" - "from {args.dataset_path}") + f"from {args.dataset_path}") filtered_datasets = sample_requests( dataset_path=args.dataset_path, num_requests=args.num_prompts, @@ -142,13 +142,6 @@ def main(args): repeat_count=args.repeat_count, sort=args.sort) - print("------warm up------") - test_prefix( - llm=llm, - prompts=prompts, - sampling_params=sampling_params, - ) - print("------start generating------") test_prefix( llm=llm, diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py new file mode 100644 index 0000000000000..e5a3b62258dd8 --- /dev/null +++ b/tests/v1/core/test_prefix_caching.py @@ -0,0 +1,219 @@ +"""Compare the with and without prefix caching.""" +from vllm.inputs import DecoderOnlyInputs +from vllm.sampling_params import SamplingParams +from vllm.v1.core.kv_cache_manager import KVCacheManager, Request +from vllm.v1.core.kv_cache_utils import hash_block_tokens + + +def make_request(request_id, prompt_token_ids): + return Request( + request_id=request_id, + inputs=DecoderOnlyInputs(prompt_token_ids=prompt_token_ids), + sampling_params=SamplingParams(max_tokens=17), + eos_token_id=100, + arrival_time=0, + lora_request=None, + ) + + +def test_prefill(): + manager = KVCacheManager( + block_size=16, + num_gpu_blocks=10, + sliding_window=False, + enable_caching=True, + num_preallocate_tokens=16, + ) + + # Complete 3 blocks (48 tokens) + common_token_ids = [i for i in range(3) for _ in range(16)] + + # Fully cache miss + # Incomplete 1 block (7 tokens) + unique_token_ids = [3] * 7 + req0 = make_request("0", common_token_ids + unique_token_ids) + computed_blocks = manager.get_computed_blocks(req0) + assert not computed_blocks + blocks = manager.allocate_slots(req0, 55, computed_blocks) + assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4] + + # Check full block metadata + parent_block_hash = None + for block_id in (0, 1, 2): + block_hash = hash_block_tokens(parent_block_hash, + manager.block_pool[block_id].token_ids) + assert manager.block_pool[block_id].block_hash == block_hash + assert manager.block_pool[block_id].ref_cnt == 1 + assert manager.block_pool[block_id].num_hashed_tokens == 16 * ( + block_id + 1) + 
assert manager.block_pool[block_id].token_ids == tuple([block_id] * 16) + parent_block_hash = block_hash + + # Check partial/preallocated block metadata + for block_id in (3, 4): + assert manager.block_pool[block_id].block_hash is None + assert manager.block_pool[block_id].ref_cnt == 1 + assert manager.block_pool[block_id].num_hashed_tokens == 0 + if block_id == 3: + assert manager.block_pool[block_id].token_ids == [3] * 7 + else: + assert not manager.block_pool[block_id].token_ids + + # Cache hit in the common prefix when the original block is still in use. + # Incomplete 1 block (5 tokens) + unique_token_ids = [3] * 5 + req1 = make_request("1", common_token_ids + unique_token_ids) + computed_blocks = manager.get_computed_blocks(req1) + assert [b.block_id for b in computed_blocks] == [0, 1, 2] + num_new_tokens = 53 - 3 * 16 + blocks = manager.allocate_slots(req1, num_new_tokens, computed_blocks) + assert [b.block_id for b in blocks] == [5, 6] + for block in computed_blocks: + assert block.ref_cnt == 2 + + # At this point, we should have 3 free blocks left. + assert manager.free_block_queue.num_free_blocks == 3 + + manager.free(req0) + manager.free(req1) + + # All blocks should be available. + assert manager.free_block_queue.num_free_blocks == 10 + # The order should be + # [unallocated (7, 8)] + # [unique_req0 (4, 3)] + # [unique_req1 (6, 5)] + # [common (2, 1, 0)] + assert [ + b.block_id for b in manager.free_block_queue.get_all_free_blocks() + ] == [7, 8, 9, 4, 3, 6, 5, 2, 1, 0] + + # Cache hit in the common prefix when the original block is already free. + # Incomplete 1 block (6 tokens) + unique_token_ids = [3] * 6 + req2 = make_request("2", common_token_ids + unique_token_ids) + computed_block = manager.get_computed_blocks(req2) + assert [b.block_id for b in computed_block] == [0, 1, 2] + num_new_tokens = 53 - 3 * 16 + blocks = manager.allocate_slots(req2, num_new_tokens, computed_blocks) + assert [b.block_id for b in blocks] == [7, 8] + + # Although we only have 5 free blocks, we have 8 blocks in + # the free block queue due to lazy removal. + assert manager.free_block_queue.num_free_blocks == 5 + assert all([ + b.ref_cnt == 0 for b in manager.free_block_queue.get_all_free_blocks() + ]) + assert len([b + for b in manager.free_block_queue.get_all_free_blocks()]) == 5 + + manager.free(req2) + + # Cache miss and eviction. + req3 = make_request("3", [99] * (16 * 9)) + computed_blocks = manager.get_computed_blocks(req3) + assert not computed_blocks + blocks = manager.allocate_slots(req2, 16 * 9, computed_blocks) + # This block ID order also checks the eviction order. + assert [b.block_id for b in blocks] == [9, 4, 3, 6, 5, 8, 7, 2, 1, 0] + assert manager.free_block_queue.num_free_blocks == 0 + assert manager.free_block_queue.free_list_head is None + assert manager.free_block_queue.free_list_tail is None + + +def test_decode(): + manager = KVCacheManager( + block_size=16, + num_gpu_blocks=10, + sliding_window=False, + enable_caching=True, + num_preallocate_tokens=16, + ) + + # Complete 3 blocks (48 tokens) + common_token_ids = [i for i in range(3) for _ in range(16)] + + # Fully cache miss + # Incomplete 1 block (7 tokens) + unique_token_ids = [3] * 7 + req0 = make_request("0", common_token_ids + unique_token_ids) + computed_blocks = manager.get_computed_blocks(req0) + assert not computed_blocks + blocks = manager.allocate_slots(req0, 55, computed_blocks) + assert [b.block_id for b in blocks] == [0, 1, 2, 3, 4] + + # Append slots without allocating a new block. 
+ req0.num_computed_tokens = 55 + for _ in range(4): + req0.append_output_token_ids(8) + new_blocks = manager.append_slots(req0, 4) + assert new_blocks is not None and len(new_blocks) == 0 + assert len(manager.block_pool[3].token_ids) == 11 + + # Append slots without allocating a new block, but start using the + # preallocated block. + req0.num_computed_tokens = 59 + # 6 tokens to fill the previous block, and 10 tokens to fill + # the preallocated block. + for _ in range(5 + 10): + req0.append_output_token_ids(7) + new_blocks = manager.append_slots(req0, 15) + assert new_blocks is not None and len(new_blocks) == 0 + assert len(manager.block_pool[3].token_ids) == 16 + assert len(manager.block_pool[4].token_ids) == 10 + + # Append slots with allocating a new block. + req0.num_computed_tokens = 74 + # 6 tokens to fill the previous block, and 10 tokens to fill + # the preallocated block. + for _ in range(6 + 11): + req0.append_output_token_ids(12) + new_blocks = manager.append_slots(req0, 17) + # Plus one preallocated block. + assert new_blocks is not None and len(new_blocks) == 2 + assert len(manager.block_pool[4].token_ids) == 16 + assert len(manager.block_pool[5].token_ids) == 11 + assert len(manager.block_pool[6].token_ids) == 0 + + +def test_evict(): + manager = KVCacheManager( + block_size=16, + num_gpu_blocks=10, + sliding_window=False, + enable_caching=True, + num_preallocate_tokens=16, + ) + + last_token_id = 5 * 16 + 7 + req0 = make_request("0", list(range(last_token_id))) + computed_blocks = manager.get_computed_blocks(req0) + assert not computed_blocks + blocks = manager.allocate_slots(req0, 5 * 16 + 7, computed_blocks) + assert len(blocks) == 7 # 5 full + 1 partial + 1 preallocated + + # 3 blocks. + req1 = make_request("1", list(range(last_token_id, + last_token_id + 3 * 16))) + computed_blocks = manager.get_computed_blocks(req1) + assert not computed_blocks + blocks = manager.allocate_slots(req1, 3 * 16, computed_blocks) + assert len(blocks) == 3 # 3 full blocks + last_token_id += 3 * 16 + + assert manager.free_block_queue.num_free_blocks == 0 + + manager.free(req0) + manager.free(req1) + assert manager.free_block_queue.num_free_blocks == 10 + assert [ + b.block_id for b in manager.free_block_queue.get_all_free_blocks() + ] == [6, 5, 4, 3, 2, 1, 0, 9, 8, 7] + + # Touch the first 2 blocks. 
+ req2 = make_request("2", list(range(2 * 16 + 3))) + computed_blocks = manager.get_computed_blocks(req2) + assert [b.block_id for b in computed_blocks] == [0, 1] + blocks = manager.allocate_slots(req2, 3, computed_blocks) + assert [b.block_id for b in blocks] == [6, 5] + assert manager.free_block_queue.num_free_blocks == 6 diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 9b735a8be10d7..82094fb65dd1a 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -1,9 +1,11 @@ +from collections import defaultdict from typing import Dict, List, Optional -import numpy as np - from vllm.logger import init_logger from vllm.utils import cdiv +from vllm.v1.core.kv_cache_utils import (BlockHashType, FreeKVCacheBlockQueue, + KVCacheBlock, hash_block_tokens, + hash_request_tokens) from vllm.v1.request import Request logger = init_logger(__name__) @@ -36,73 +38,359 @@ def __init__( self.num_preallocate_tokens = num_preallocate_tokens self.num_preallocate_blocks = cdiv(num_preallocate_tokens, block_size) - self.free_block_ids = list(range(num_gpu_blocks)) - self.req_to_block_ids: Dict[str, List[int]] = {} - self.ref_cnts = np.zeros(num_gpu_blocks, dtype=np.int32) + # A Block pool of all kv-cache blocks. + self.block_pool: List[KVCacheBlock] = [ + KVCacheBlock(idx) for idx in range(num_gpu_blocks) + ] + # Free block queue that constructs and manipulates a doubly linked + # list of free blocks (including eviction candidates when caching is + # enabled). + self.free_block_queue = FreeKVCacheBlockQueue(self.block_pool) + + # {block_hash: {block ID: block}}. A cached block is + # a full block with a block hash that can be used for prefix caching. + # The cached block may be used by running requests or in the + # free_block_queue that could potentially be evicted. + # NOTE: We currently don't de-duplicate the blocks in the cache, + # meaning that if a block becomes full and is cached, we don't check + # if there is already an identical block in the cache. This is because + # we want to make sure the allocated block IDs won't change so that + # block tables are append-only. + self.cached_block_hash_to_block: Dict[BlockHashType, Dict[ + int, KVCacheBlock]] = defaultdict(dict) + + # Mapping from request ID to blocks to track the blocks allocated + # for each request, so that we can free the blocks when the request + # is finished. + self.req_to_blocks: Dict[str, List[KVCacheBlock]] = {} - def get_computed_blocks(self, request: Request) -> List[int]: + def get_computed_blocks(self, request: Request) -> List[KVCacheBlock]: + """Get the computed (cached) blocks for the request. + Note that the computed blocks must be full. + + Args: + request: The request to get the computed blocks. + + Returns: + A list of blocks that are computed for the request. + """ if not self.enable_caching: - # No prefix caching. + # Prefix caching is disabled. return [] - # TODO(woosuk): Implement hash-based caching. - return [] + + computed_blocks = [] + block_hashes = hash_request_tokens(self.block_size, + request.all_token_ids) + + for block_hash in block_hashes: + # block_hashes is a chain of block hashes. If a block hash is not + # in the cached_block_hash_to_id, the following block hashes are + # not computed yet for sure. 
+ if cached_block := self._get_cached_block(block_hash): + computed_blocks.append(cached_block) + else: + break + + return computed_blocks def append_slots( self, request: Request, num_tokens: int, - ) -> Optional[List[int]]: + ) -> Optional[List[KVCacheBlock]]: + """Append slots to the block table of the request. + We first append slots to already allocated blocks. If the allocated + blocks are not enough, we allocate new blocks. + + Args: + request: The request to append slots. + num_tokens: The number of tokens to append. + + Returns: + A list of new blocks if new blocks are allocated, or None + if new blocks are required but cannot be allocated. + """ num_required_blocks = cdiv(request.num_computed_tokens + num_tokens, self.block_size) - req_block_ids = self.req_to_block_ids[request.request_id] - if num_required_blocks <= len(req_block_ids): - # No new block is needed. - return [] + req_blocks = self.req_to_blocks[request.request_id] - num_new_blocks = num_required_blocks - len(req_block_ids) - num_free_blocks = len(self.free_block_ids) - if num_new_blocks > num_free_blocks: - # Cannot allocate new blocks. + num_new_blocks = num_required_blocks - len(req_blocks) + if num_new_blocks > self.free_block_queue.num_free_blocks: + # Need to allocate new blocks due to insufficient pre-allocated + # slots, but we cannot allocate new blocks due to the limit. return None - # Allocate new blocks. + # When caching is enabled, assign token IDs to already allocated blocks. + new_token_ids = None + parent_block = None + if self.enable_caching: + # Figure out the token IDs to add to the blocks. + new_token_ids = request.all_token_ids[ + request.num_computed_tokens:request.num_computed_tokens + + num_tokens] + + # Find the last full block index. + # TODO: This may be optimized by calculating the computed tokens. + last_full_block_idx = len(req_blocks) - 1 + while (last_full_block_idx >= 0 + and req_blocks[last_full_block_idx].block_hash is None): + last_full_block_idx -= 1 + + parent_block = (req_blocks[last_full_block_idx] + if last_full_block_idx >= 0 else None) + token_id_idx = self._add_token_ids_to_blocks( + blocks=req_blocks[last_full_block_idx + 1:], + token_ids=new_token_ids, + parent_block=parent_block) + + new_token_ids = new_token_ids[token_id_idx:] + parent_block = req_blocks[-1] + + # No new block is needed. When caching is enabled, we make sure + # token_id_idx is equal to len(new_token_ids), meaning that all tokens + # are added to allocated blocks. + if num_required_blocks <= len(req_blocks): + assert not self.enable_caching or token_id_idx == num_tokens, \ + f"{token_id_idx=} != {num_tokens=}" + return [] + + # Allocate new blocks considering preallocated blocks, and + # add token IDs to them if caching is enabled. num_new_blocks = min(num_new_blocks + self.num_preallocate_blocks, - num_free_blocks) - new_block_ids = self._get_new_blocks(num_new_blocks) - req_block_ids.extend(new_block_ids) - self.ref_cnts[new_block_ids] += 1 - return new_block_ids + self.free_block_queue.num_free_blocks) + new_blocks = self._get_new_blocks(num_new_blocks, new_token_ids, + parent_block) + req_blocks.extend(new_blocks) + return new_blocks def allocate_slots( self, request: Request, num_tokens: int, - computed_block_ids: List[int], - ) -> Optional[List[int]]: + computed_blocks: List[KVCacheBlock], + ) -> Optional[List[KVCacheBlock]]: + """Allocate slots for a new request. + + Args: + request: The request to allocate slots. + num_tokens: The number of tokens to allocate. 
Note that this does + not include the tokens that have already been computed. + computed_blocks: The blocks that have already been computed. + + Returns: + A list of new allocated blocks. + """ + if num_tokens == 0: + raise ValueError( + f"num_tokens must be greater than 0, got {num_tokens}") + + # If a computed block of a request is an eviction candidate (in the + # free queue and ref_cnt == 0), it cannot be counted as a free block + # when allocating this request. + num_evictable_computed_blocks = len( + [blk for blk in computed_blocks if blk.ref_cnt == 0]) + num_required_blocks = cdiv(num_tokens, self.block_size) - num_free_blocks = len(self.free_block_ids) - if num_required_blocks > num_free_blocks: + if (num_required_blocks > self.free_block_queue.num_free_blocks - + num_evictable_computed_blocks): # Cannot allocate new blocks. return None - num_new_blocks = min(num_required_blocks + self.num_preallocate_blocks, - num_free_blocks) - new_block_ids = self._get_new_blocks(num_new_blocks) - block_ids = computed_block_ids + new_block_ids - self.req_to_block_ids[request.request_id] = block_ids - self.ref_cnts[block_ids] += 1 - return new_block_ids + # Determine the number of new blocks to allocate considering + # preallocated blocks. + num_new_blocks = min( + num_required_blocks + self.num_preallocate_blocks, + self.free_block_queue.num_free_blocks - + num_evictable_computed_blocks) + + num_computed_tokens = len(computed_blocks) * self.block_size + + # When caching is enabled, get the new token IDs and the parent block + # ID to generate cache keys. + new_token_ids = None + parent_block = None + if self.enable_caching: + # Touch the computed blocks to make sure they won't be evicted. + self._touch(computed_blocks) + + # Get the token IDs for the blocks being allocated for hashing. + new_token_ids = request.all_token_ids[ + num_computed_tokens:num_computed_tokens + num_tokens] + if not new_token_ids: + raise RuntimeError( + "Failed to infer the token IDs for allocation. " + f"#all_tokens={len(request.all_token_ids)} < " + f"#computed_tokens={num_computed_tokens}") + + # Get the parent block ID to construct the block chain. + parent_block = computed_blocks[-1] if computed_blocks else None + + new_blocks = self._get_new_blocks(num_new_blocks, new_token_ids, + parent_block) + + # Concatenate the computed block IDs and the new block IDs. + self.req_to_blocks[request.request_id] = computed_blocks + new_blocks + return new_blocks def free(self, request: Request) -> None: - block_ids = self.req_to_block_ids.pop(request.request_id) - self.ref_cnts[block_ids] -= 1 - for block_id in block_ids: - ref_cnt = self.ref_cnts[block_id] - if ref_cnt == 0: - self.free_block_ids.append(block_id) - - def _get_new_blocks(self, num_blocks: int) -> List[int]: - assert num_blocks <= len(self.free_block_ids) - new_block_ids = self.free_block_ids[-num_blocks:] - self.free_block_ids = self.free_block_ids[:-num_blocks] - return new_block_ids + """Free the blocks allocated for the request. + When caching is enabled, we free the blocks in reverse order so that + the tail blocks are evicted first. + + Args: + request: The request to free the blocks. + """ + blocks = self.req_to_blocks.pop(request.request_id) + if self.enable_caching: + # Free blocks in reverse order so that the tail blocks are + # freed first. 
+ blocks = reversed(blocks) + + for block in blocks: + block.ref_cnt -= 1 + if block.ref_cnt == 0: + self.free_block_queue.append(block) + + def _get_new_blocks( + self, + num_blocks: int, + token_ids: Optional[List[int]] = None, + parent_block: Optional[int] = None) -> List[KVCacheBlock]: + """Get new blocks from the free block pool, and add token IDs to + allocated blocks if caching is enabled. + Note that we do not check block cache in this function. + + Args: + num_blocks: The number of blocks to allocate. + token_ids: The token IDs in the blocks. None if caching is disabled. + parent_block: The parent block. Used to include block chain + in the block hash. + + Returns: + A list of new block. + """ + if num_blocks > self.free_block_queue.num_free_blocks: + raise ValueError( + f"Cannot get {num_blocks} free blocks from the pool") + + # First allocate blocks. + ret: List[KVCacheBlock] = [] + idx = 0 + while idx < num_blocks: + curr_block = self.free_block_queue.popleft() + assert curr_block.ref_cnt == 0 + + # Evict blocks from the cache. + if self.enable_caching: + block_hash = curr_block.block_hash + if (block_hash is not None + and block_hash in self.cached_block_hash_to_block): + if len(self.cached_block_hash_to_block[block_hash]) == 1: + del self.cached_block_hash_to_block[block_hash] + else: + del self.cached_block_hash_to_block[block_hash][ + curr_block.block_id] + curr_block.reset() + + curr_block.ref_cnt = 1 + ret.append(curr_block) + idx += 1 + + # Then assign token IDs to the allocated blocks. + if self.enable_caching: + assert token_ids is not None + token_id_idx = self._add_token_ids_to_blocks( + blocks=ret, token_ids=token_ids, parent_block=parent_block) + assert token_id_idx == len(token_ids) + + return ret + + def _cache_full_block(self, + block: KVCacheBlock, + parent_block: Optional[KVCacheBlock] = None) -> None: + """Cache a full block for prefix caching. + + Args: + block: The block to cache. + parent_block: The parent block. None if this is the first block. + """ + parent_block_hash = (parent_block.block_hash + if parent_block is not None else None) + assert len(block.token_ids) == self.block_size + block.token_ids = tuple(block.token_ids) + block_hash = hash_block_tokens(parent_block_hash, block.token_ids) + block.block_hash = block_hash + block.num_hashed_tokens = self.block_size + ( + parent_block.num_hashed_tokens if parent_block is not None else 0) + self.cached_block_hash_to_block[block_hash][block.block_id] = block + + def _get_cached_block(self, + block_hash: BlockHashType) -> Optional[KVCacheBlock]: + """Get a cached block by the block hash, or None if cache miss. + If there are duplicated blocks, we return the first block in the cache. + + Args: + block_hash: The hash value of the block. + + Returns: + The cached block if it exists, or None. + """ + if block_hash in self.cached_block_hash_to_block: + first_block_id = list( + self.cached_block_hash_to_block[block_hash].keys())[0] + return self.cached_block_hash_to_block[block_hash][first_block_id] + return None + + def _touch(self, blocks: List[KVCacheBlock]) -> None: + """Touch a block increases its reference count by 1, and may remove + the block from the free queue. This is used when a block is hit by + another request with the same prefix. + + Args: + blocks: A list of blocks to touch. + """ + for block in blocks: + # ref_cnt=0 means this block is in the free list (i.e. eviction + # candidate), so remove it. 
+ if block.ref_cnt == 0: + self.free_block_queue.remove(block) + block.ref_cnt += 1 + + def _add_token_ids_to_blocks( + self, + blocks: List[KVCacheBlock], + token_ids: List[int], + parent_block: Optional[KVCacheBlock] = None) -> int: + """Add token IDs to a list of allocated blocks. + If a block becomes full after adding token IDs, cache it. + Return the token ID index that has not been added to the blocks + if the blocks are not enough to hold all the token IDs. + + Args: + blocks: A list of blocks to add token IDs. + token_ids: A list of token IDs to add. + parent_block: The parent block. None if this is the + first block. + + Returns: + The starting token ID index that has not been added to the blocks + due to insufficient given blocks. + """ + token_id_start = 0 + for curr_block in blocks: + # If all token IDs are added, then the rest of the blocks are + # preallocated blocks, so we only need to update the + # parent_block_id. FIXME + if token_id_start == len(token_ids): + continue + + # Add token IDs to the empty slots in the block. + empty_slots = self.block_size - len(curr_block.token_ids) + token_id_end = min(token_id_start + empty_slots, len(token_ids)) + curr_block.token_ids.extend(token_ids[token_id_start:token_id_end]) + # Cache the block if it becomes full. + if len(curr_block.token_ids) == self.block_size: + self._cache_full_block(curr_block, parent_block) + parent_block = curr_block + token_id_start = token_id_end + return token_id_start diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py new file mode 100644 index 0000000000000..33dbfb7377bfd --- /dev/null +++ b/vllm/v1/core/kv_cache_utils.py @@ -0,0 +1,194 @@ +"""KV-Cache Utilities.""" +from dataclasses import dataclass, field +from typing import List, Optional, Tuple, Union + +from vllm.logger import init_logger + +logger = init_logger(__name__) + +BlockHashType = Tuple[int, Tuple[int]] + + +@dataclass +class KVCacheBlock: + """KV-cache block metadata.""" + # Block ID, ranging from 0 to num_gpu_blocks - 1. + block_id: int + # Reference count. + ref_cnt: int = 0 + # Token IDs in the block. When the block is full, the type of token_ids + # should be Tuple[int] for fast matching. + token_ids: Union[List[int], Tuple[int]] = field(default_factory=list) + # The hash of the block composed of (block hash, tuple of token IDs). + # It is only available when the block is full. + block_hash: Optional[BlockHashType] = None + # The number of hashed tokens. More hashed tokens means the block + # is closer to the end of a prompt and more likely to be evicted. + num_hashed_tokens: int = 0 + + # Used to construct a doubly linked list for free blocks. + # These two attributes should only be manipulated by FreeKVCacheBlockQueue. + prev_free_block: Optional["KVCacheBlock"] = None + next_free_block: Optional["KVCacheBlock"] = None + + def reset(self): + """Reset the block metadata.""" + self.ref_cnt = 0 + self.token_ids = [] + self.block_hash = None + self.num_hashed_tokens = 0 + + +class FreeKVCacheBlockQueue: + """This class organizes a list of KVCacheBlock objects to a doubly linked + list of free blocks. We implement this class instead of using Python + builtin deque to support removing a block in the middle of the queue + in O(1) time. To close the performance gap to the builtin deque which is + implemented in C++, this class does not allocate any Python objects when + manipulating the linked list. Instead, this class manipulates the + prev_free_block and next_free_block attributes of the given blocks. 
+ + The queue is ordered by block ID in the beginning. When a block is allocated + and then freed, it will be appended back with the eviction order: + 1. The least recent used block is at the front (LRU). + 2. If two blocks have the same last accessed time (allocated by the + same sequence), the one with more hash tokens (the tail of a block + chain) is at the front. + Note that we maintain this order by reversing the block order when free + blocks of a request. This operation is outside of this class. + + Args: + blocks: A list of KVCacheBlock objects. + """ + + def __init__(self, blocks: List[KVCacheBlock]) -> None: + self.num_free_blocks = len(blocks) + + # Initialize the doubly linked list of free blocks. + self.free_list_head = blocks[0] + self.free_list_tail = blocks[-1] + for i in range(self.num_free_blocks): + if i > 0: + blocks[i].prev_free_block = blocks[i - 1] + if i < self.num_free_blocks - 1: + blocks[i].next_free_block = blocks[i + 1] + + def popleft(self) -> KVCacheBlock: + """Pop the first free block and reduce num_free_blocks by 1. + + Returns: + The first free block. + """ + if not self.free_list_head: + raise ValueError("No free blocks available") + + block = self.free_list_head + self.remove(block) + return block + + def remove(self, block: KVCacheBlock) -> None: + """Remove a block in the free list and reduce num_free_blocks by 1. + + Args: + block: The block to remove. + """ + if block.prev_free_block is not None: + # Link the previous block to the next block. + block.prev_free_block.next_free_block = block.next_free_block + if block.next_free_block is not None: + # Link the next block to the previous block. + block.next_free_block.prev_free_block = block.prev_free_block + + if block == self.free_list_head: + # Update the head if the block is the head. + self.free_list_head = block.next_free_block + if block == self.free_list_tail: + # Update the tail if the block is the tail. + self.free_list_tail = block.prev_free_block + + # Remove the block from the linked list. + block.prev_free_block = block.next_free_block = None + self.num_free_blocks -= 1 + + def append(self, block: KVCacheBlock) -> None: + """Put a block back into the free list and increase + num_free_blocks by 1. + + Args: + block: The block to append. + """ + if self.free_list_tail is not None: + # Link the last block to the new block. + self.free_list_tail.next_free_block = block + block.prev_free_block = self.free_list_tail + self.free_list_tail = block + else: + # The free list is empty. + assert self.free_list_head is None + self.free_list_head = self.free_list_tail = block + + block.next_free_block = None + self.num_free_blocks += 1 + + def get_all_free_blocks(self) -> List[KVCacheBlock]: + """Get all free blocks in the free list. Mainly used for testing. + + Returns: + A list of free blocks. + """ + ret = [] + curr_block = self.free_list_head + while curr_block is not None: + ret.append(curr_block) + curr_block = curr_block.next_free_block + return ret + + +def hash_block_tokens(parent_block_hash: Optional[int], + curr_block_token_ids: Tuple[int]) -> BlockHashType: + """Computes a hash value corresponding to the contents of a block and + the contents of the preceding block(s). The hash value is used for + prefix caching. We use LRU cache for this function to avoid recomputing + hash values for the same block contents. + + TODO: Support arbitrary metadata so that we could support more + features such as LoRA adapter. + + Args: + parent_block_hash: The hash of the parent block. 
None + if this is the first block. + curr_block_token_ids: A tuple of token ids in the current + block. The current block is assumed to be full. + + Returns: + The hash value of the block and the token ids in the block. + The entire tuple is used as the hash key of the block. + """ + return (hash( + (parent_block_hash, *curr_block_token_ids)), curr_block_token_ids) + + +def hash_request_tokens(block_size: int, + token_ids: List[int]) -> List[BlockHashType]: + """Computes hash values of a chain of blocks given a sequence of + token IDs. The hash value is used for prefix caching. + + Args: + block_size: The size of each block. + token_ids: A sequence of token ids in the request. + + Returns: + The list of computed hash values. + """ + ret = [] + parent_block_hash = None + for start in range(0, len(token_ids), block_size): + end = start + block_size + block_token_ids = tuple(token_ids[start:end]) + # Do not hash the block if it is not full. + if len(block_token_ids) < block_size: + break + block_hash = hash_block_tokens(parent_block_hash, block_token_ids) + ret.append(block_hash) + parent_block_hash = block_hash + return ret diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index 6017905642172..a60f8b8138ecf 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -34,7 +34,7 @@ def __init__( block_size=self.cache_config.block_size, num_gpu_blocks=num_gpu_blocks, sliding_window=self.cache_config.sliding_window, - enable_caching=True) + enable_caching=self.cache_config.enable_prefix_caching) self.block_size = self.cache_config.block_size # Scheduling constraints. @@ -91,9 +91,9 @@ def schedule(self) -> "SchedulerOutput": assert num_new_tokens > 0 while True: - new_block_ids = self.kv_cache_manager.append_slots( + new_blocks = self.kv_cache_manager.append_slots( request, num_new_tokens) - if new_block_ids is None: + if new_blocks is None: # The request cannot be scheduled. # Preempt the lowest-priority request. preempted_req = self.running.pop() @@ -110,7 +110,9 @@ def schedule(self) -> "SchedulerOutput": # The request can be scheduled. scheduled_running_reqs.append(request) - req_to_new_block_ids[request.request_id] = new_block_ids + req_to_new_block_ids[request.request_id] = [ + b.block_id for b in new_blocks + ] num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens req_index += 1 @@ -126,22 +128,29 @@ def schedule(self) -> "SchedulerOutput": request = self.waiting[0] # Get already-cached tokens. - computed_block_ids = self.kv_cache_manager.get_computed_blocks( + computed_blocks = self.kv_cache_manager.get_computed_blocks( request) # NOTE(woosuk): Since incomplete blocks are not eligible for # sharing, `num_computed_tokens` is always a multiple of # `block_size`. - num_computed_tokens = len(computed_block_ids) * self.block_size + num_computed_tokens = len(computed_blocks) * self.block_size # Number of tokens to be scheduled. # We use `request.num_tokens` instead of # `request.num_prompt_tokens` to consider the resumed requests, # which have output tokens. num_new_tokens = request.num_tokens - num_computed_tokens + if num_new_tokens == 0: + # The happens when prompt length is divisible by the block + # size and all blocks are cached. Now we force to recompute + # the last token. 
+ num_computed_tokens -= 1 + num_new_tokens = 1 + computed_blocks.pop() num_new_tokens = min(num_new_tokens, token_budget) assert num_new_tokens > 0 - new_block_ids = self.kv_cache_manager.allocate_slots( - request, num_new_tokens, computed_block_ids) - if new_block_ids is None: + new_blocks = self.kv_cache_manager.allocate_slots( + request, num_new_tokens, computed_blocks) + if new_blocks is None: # The request cannot be scheduled. break request.num_computed_tokens = num_computed_tokens @@ -156,8 +165,9 @@ def schedule(self) -> "SchedulerOutput": raise RuntimeError( f"Invalid request status: {request.status}") - req_to_new_block_ids[request.request_id] = ( - computed_block_ids + new_block_ids) + req_to_new_block_ids[request.request_id] = [ + b.block_id for b in computed_blocks + new_blocks + ] num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens request.status = RequestStatus.RUNNING diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index b538c2c7d63bc..cd3f5c75d0d14 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -65,6 +65,7 @@ def __init__( elif usage_context == UsageContext.OPENAI_API_SERVER: scheduler_config.max_num_seqs = 1024 scheduler_config.max_num_batched_tokens = 2048 + cache_config.enable_prefix_caching = True logger.info( "Initializing an LLM engine (v%s) with config: " From 6bb52b0f97c11d30fa38290926372148e231f408 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Thu, 7 Nov 2024 23:10:20 -0500 Subject: [PATCH 037/183] [CI/Build] Give PR cleanup job PR write access (#10139) Signed-off-by: Russell Bryant --- .github/workflows/cleanup_pr_body.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml index 7cf7242e130c8..37d93a1277974 100644 --- a/.github/workflows/cleanup_pr_body.yml +++ b/.github/workflows/cleanup_pr_body.yml @@ -1,8 +1,11 @@ name: Cleanup PR Body on: - pull_request: - types: [opened, edited, synchronize] + pull_request_target: + types: [opened, reopened, edited] + +permissions: + pull-requests: write jobs: update-description: From 40d0e7411dbeb276befd33c4485115ac3d4d7f2a Mon Sep 17 00:00:00 2001 From: whyiug Date: Fri, 8 Nov 2024 12:44:58 +0800 Subject: [PATCH 038/183] [Doc] Update FAQ links in spec_decode.rst (#9662) Signed-off-by: whyiug --- docs/source/models/spec_decode.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/models/spec_decode.rst b/docs/source/models/spec_decode.rst index b02c80aebec69..d57ffec53215d 100644 --- a/docs/source/models/spec_decode.rst +++ b/docs/source/models/spec_decode.rst @@ -182,7 +182,7 @@ speculative decoding, breaking down the guarantees into three key areas: 3. **vLLM Logprob Stability** - vLLM does not currently guarantee stable token log probabilities (logprobs). This can result in different outputs for the same request across runs. For more details, see the FAQ section - titled *Can the output of a prompt vary across runs in vLLM?* in the `FAQs <../serving/faq.rst>`_. + titled *Can the output of a prompt vary across runs in vLLM?* in the `FAQs <../serving/faq>`_. **Conclusion** @@ -197,7 +197,7 @@ can occur due to following factors: **Mitigation Strategies** -For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the `FAQs <../serving/faq.rst>`_. 
+For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the `FAQs <../serving/faq>`_. Resources for vLLM contributors ------------------------------- From ad39bd640cdaaf2963cd07a7cc912c1dde516ed0 Mon Sep 17 00:00:00 2001 From: DearPlanet Date: Fri, 8 Nov 2024 12:58:37 +0800 Subject: [PATCH 039/183] [Bugfix] Add error handling when server cannot respond any valid tokens (#5895) --- benchmarks/backend_request_func.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index a42e70170ba28..313ba819c87cb 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -256,6 +256,7 @@ async def async_request_openai_completions( async with session.post(url=api_url, json=payload, headers=headers) as response: if response.status == 200: + first_valid_chunk_received = False async for chunk_bytes in response.content: chunk_bytes = chunk_bytes.strip() if not chunk_bytes: @@ -274,7 +275,8 @@ async def async_request_openai_completions( if data["choices"][0]["text"]: timestamp = time.perf_counter() # First token - if ttft == 0.0: + if not first_valid_chunk_received: + first_chunk_received = True ttft = time.perf_counter() - st output.ttft = ttft @@ -285,9 +287,14 @@ async def async_request_openai_completions( most_recent_timestamp = timestamp generated_text += data["choices"][0]["text"] - + if first_chunk_received: + output.success = True + else: + output.success = False + output.error = ( + "Never received a valid chunk to calculate TTFT." + "This response will be marked as failed!") output.generated_text = generated_text - output.success = True output.latency = latency else: output.error = response.reason or "" From 7371749d54db40999d896c4a7f8935bc6984c093 Mon Sep 17 00:00:00 2001 From: Mengqing Cao Date: Fri, 8 Nov 2024 13:08:51 +0800 Subject: [PATCH 040/183] [Misc] Fix ImportError causing by triton (#9493) --- vllm/executor/multiproc_gpu_executor.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py index 2dbde778e49b1..3eb14fb931925 100644 --- a/vllm/executor/multiproc_gpu_executor.py +++ b/vllm/executor/multiproc_gpu_executor.py @@ -13,12 +13,15 @@ from vllm.logger import init_logger from vllm.model_executor.layers.sampler import SamplerOutput from vllm.sequence import ExecuteModelRequest -from vllm.triton_utils import maybe_set_triton_cache_manager +from vllm.triton_utils.importing import HAS_TRITON from vllm.utils import (_run_task_with_lock, cuda_device_count_stateless, cuda_is_initialized, get_distributed_init_method, get_open_port, get_vllm_instance_id, make_async, update_environment_variables) +if HAS_TRITON: + from vllm.triton_utils import maybe_set_triton_cache_manager + logger = init_logger(__name__) @@ -59,7 +62,7 @@ def _init_executor(self) -> None: torch.set_num_threads(default_omp_num_threads) # workaround for https://github.com/vllm-project/vllm/issues/6103 - if world_size > 1: + if HAS_TRITON and world_size > 1: maybe_set_triton_cache_manager() # Multiprocessing-based executor does not support multi-node setting. 
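
Patch 040 above avoids the ImportError by probing for triton once and gating every use of it on that flag. A short, self-contained sketch of the same guard pattern follows; importlib's find_spec stands in for vLLM's HAS_TRITON flag, and maybe_tune_kernels is a hypothetical stand-in for maybe_set_triton_cache_manager:

```python
# Standalone sketch of the optional-dependency guard applied in patch 040.
# find_spec plays the role of vllm.triton_utils.importing.HAS_TRITON;
# maybe_tune_kernels below is a hypothetical helper for illustration only.
from importlib.util import find_spec

HAS_TRITON = find_spec("triton") is not None

if HAS_TRITON:
    import triton  # only imported when the package is actually present


def maybe_tune_kernels(world_size: int) -> None:
    # Skip triton-specific setup on builds (e.g. CPU-only) where the package
    # is not installed, instead of failing at import time.
    if HAS_TRITON and world_size > 1:
        print("triton", triton.__version__, "available; configuring cache")
    else:
        print("triton not available; skipping kernel cache setup")


if __name__ == "__main__":
    maybe_tune_kernels(world_size=2)
```
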
From 3a7f15a398727887137a021b8b32dc372b532087 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Fri, 8 Nov 2024 00:15:12 -0500 Subject: [PATCH 041/183] [Doc] Move CONTRIBUTING to docs site (#9924) Signed-off-by: Russell Bryant --- CONTRIBUTING.md | 59 +--------------- .../dockerfile/dockerfile.rst | 0 docs/source/contributing/overview.rst | 70 +++++++++++++++++++ .../profiling/profiling_index.rst | 0 .../input_processing_pipeline.rst | 0 .../input_processing/model_inputs_index.rst | 0 .../kernel/paged_attention.rst | 0 .../multimodal/adding_multimodal_plugin.rst | 0 .../multimodal/multimodal_index.rst | 0 docs/source/index.rst | 39 ++++++++--- 10 files changed, 100 insertions(+), 68 deletions(-) rename docs/source/{dev => contributing}/dockerfile/dockerfile.rst (100%) create mode 100644 docs/source/contributing/overview.rst rename docs/source/{dev => contributing}/profiling/profiling_index.rst (100%) rename docs/source/{dev => design}/input_processing/input_processing_pipeline.rst (100%) rename docs/source/{dev => design}/input_processing/model_inputs_index.rst (100%) rename docs/source/{dev => design}/kernel/paged_attention.rst (100%) rename docs/source/{dev => design}/multimodal/adding_multimodal_plugin.rst (100%) rename docs/source/{dev => design}/multimodal/multimodal_index.rst (100%) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b39fd75b5fb70..8beae68289997 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,60 +1,3 @@ # Contributing to vLLM -Thank you for your interest in contributing to vLLM! Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. There are several ways you can contribute to the project: - -- Identify and report any issues or bugs. -- Request or add support for a new model. -- Suggest or implement new features. -- Improve documentation or contribute a how-to guide. - -We also believe in the power of community support; thus, answering queries, offering PR reviews, and assisting others are also highly regarded and beneficial contributions. - -Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository! - -## License - -See [LICENSE](LICENSE). - -## Developing - -Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the [building from source](https://docs.vllm.ai/en/latest/getting_started/installation.html#build-from-source) documentation for details. - -## Testing - -```bash -pip install -r requirements-dev.txt - -# linting and formatting -bash format.sh -# Static type checking -mypy -# Unit tests -pytest tests/ -``` -**Note:** Currently, the repository does not pass the ``mypy`` tests. - -## Contribution Guidelines - -### DCO and Signed-off-by - -When contributing changes to this project, you must agree to the [DCO](DCO). -Commits must include a `Signed-off-by:` header which certifies agreement with -the terms of the [DCO](DCO). - -Using `-s` with `git commit` will automatically add this header. - -### Issues - -If you encounter a bug or have a feature request, please [search existing issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue) first to see if it has already been reported. 
If not, please [file a new issue](https://github.com/vllm-project/vllm/issues/new/choose), providing as much relevant information as possible. - -> [!IMPORTANT] -> If you discover a security vulnerability, please follow the instructions [here](/SECURITY.md#reporting-a-vulnerability). - -### Pull Requests & Code Reviews - -Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE.md) for detailed guide for contribution. - -### Thank You - -Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. -All of your contributions help make vLLM a great tool and community for everyone! +You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview/). diff --git a/docs/source/dev/dockerfile/dockerfile.rst b/docs/source/contributing/dockerfile/dockerfile.rst similarity index 100% rename from docs/source/dev/dockerfile/dockerfile.rst rename to docs/source/contributing/dockerfile/dockerfile.rst diff --git a/docs/source/contributing/overview.rst b/docs/source/contributing/overview.rst new file mode 100644 index 0000000000000..ac2d2b2fe4103 --- /dev/null +++ b/docs/source/contributing/overview.rst @@ -0,0 +1,70 @@ +Contributing to vLLM +===================== + +Thank you for your interest in contributing to vLLM! Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. There are several ways you can contribute to the project: + +- Identify and report any issues or bugs. +- Request or add support for a new model. +- Suggest or implement new features. +- Improve documentation or contribute a how-to guide. + +We also believe in the power of community support; thus, answering queries, offering PR reviews, and assisting others are also highly regarded and beneficial contributions. + +Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository! + +License +------- + +See `LICENSE `_. + +Developing +---------- + +Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the `building from source `_ documentation for details. + +Testing +------- + +.. code-block:: bash + + pip install -r requirements-dev.txt + + # linting and formatting + bash format.sh + # Static type checking + mypy + # Unit tests + pytest tests/ + +.. note:: Currently, the repository does not pass the ``mypy`` tests. + +Contribution Guidelines +======================= + +DCO and Signed-off-by +---------------------- + +When contributing changes to this project, you must agree to the `DCO `_. +Commits must include a ``Signed-off-by:`` header which certifies agreement with +the terms of the `DCO `_. + +Using ``-s`` with ``git commit`` will automatically add this header. + +Issues +------ + +If you encounter a bug or have a feature request, please `search existing issues `_ first to see if it has already been reported. If not, please `file a new issue `_, providing as much relevant information as possible. + +.. important:: + If you discover a security vulnerability, please follow the instructions `here `_. 
+ +Pull Requests & Code Reviews +---------------------------- + +Please check the PR checklist in the `PR template `_ for a detailed guide for contribution. + +Thank You +--------- + +Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. +All of your contributions help make vLLM a great tool and community for everyone! diff --git a/docs/source/dev/profiling/profiling_index.rst b/docs/source/contributing/profiling/profiling_index.rst similarity index 100% rename from docs/source/dev/profiling/profiling_index.rst rename to docs/source/contributing/profiling/profiling_index.rst diff --git a/docs/source/dev/input_processing/input_processing_pipeline.rst b/docs/source/design/input_processing/input_processing_pipeline.rst similarity index 100% rename from docs/source/dev/input_processing/input_processing_pipeline.rst rename to docs/source/design/input_processing/input_processing_pipeline.rst diff --git a/docs/source/dev/input_processing/model_inputs_index.rst b/docs/source/design/input_processing/model_inputs_index.rst similarity index 100% rename from docs/source/dev/input_processing/model_inputs_index.rst rename to docs/source/design/input_processing/model_inputs_index.rst diff --git a/docs/source/dev/kernel/paged_attention.rst b/docs/source/design/kernel/paged_attention.rst similarity index 100% rename from docs/source/dev/kernel/paged_attention.rst rename to docs/source/design/kernel/paged_attention.rst diff --git a/docs/source/dev/multimodal/adding_multimodal_plugin.rst b/docs/source/design/multimodal/adding_multimodal_plugin.rst similarity index 100% rename from docs/source/dev/multimodal/adding_multimodal_plugin.rst rename to docs/source/design/multimodal/adding_multimodal_plugin.rst diff --git a/docs/source/dev/multimodal/multimodal_index.rst b/docs/source/design/multimodal/multimodal_index.rst similarity index 100% rename from docs/source/dev/multimodal/multimodal_index.rst rename to docs/source/design/multimodal/multimodal_index.rst diff --git a/docs/source/index.rst b/docs/source/index.rst index 38dad25e18c02..b12e695de37b6 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -130,26 +130,45 @@ Documentation performance/benchmarks +.. Community: User community resources + +.. toctree:: + :maxdepth: 1 + :caption: Community + + community/meetups + community/sponsors + +.. API Documentation: API reference aimed at vllm library usage + .. toctree:: :maxdepth: 2 - :caption: Developer Documentation + :caption: API Documentation dev/sampling_params dev/pooling_params dev/offline_inference/offline_index dev/engine/engine_index - dev/kernel/paged_attention - dev/input_processing/model_inputs_index - dev/multimodal/multimodal_index - dev/dockerfile/dockerfile - dev/profiling/profiling_index + +.. Design: docs about vLLM internals .. toctree:: - :maxdepth: 1 - :caption: Community + :maxdepth: 2 + :caption: Design - community/meetups - community/sponsors + design/input_processing/model_inputs_index + design/kernel/paged_attention + design/multimodal/multimodal_index + +.. Contributing: contributing to the vLLM project + +.. 
toctree:: + :maxdepth: 2 + :caption: Contributing + + contributing/overview + contributing/profiling/profiling_index + contributing/dockerfile/dockerfile Indices and tables ================== From da07a9ead7a9b3c0ca0ecc1cc787faf1e1a1ccf7 Mon Sep 17 00:00:00 2001 From: Tao He Date: Fri, 8 Nov 2024 13:31:28 +0800 Subject: [PATCH 042/183] Fixes a typo about 'max_decode_seq_len' which causes crashes with cuda graph. (#9285) Signed-off-by: Tao He From aea6ad629ff92f072a11b21dcdb1105677744007 Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 8 Nov 2024 03:35:25 -0500 Subject: [PATCH 043/183] Add hf_transfer to testing image (#10096) --- Dockerfile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Dockerfile b/Dockerfile index 4c0f5aebe859d..220dbe26712ec 100644 --- a/Dockerfile +++ b/Dockerfile @@ -191,6 +191,11 @@ ADD . /vllm-workspace/ RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-dev.txt +# enable fast downloads from hf (for testing) +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install hf_transfer +ENV HF_HUB_ENABLE_HF_TRANSFER 1 + # Copy in the v1 package for testing (it isn't distributed yet) COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1 From f4c2187e2967ef4052d173b422b0249ab9532753 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 8 Nov 2024 17:07:01 +0800 Subject: [PATCH 044/183] [Misc] Fix typo in #5895 (#10145) Signed-off-by: DarkLight1337 --- benchmarks/backend_request_func.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py index 313ba819c87cb..25c8b1bbf3e22 100644 --- a/benchmarks/backend_request_func.py +++ b/benchmarks/backend_request_func.py @@ -256,7 +256,7 @@ async def async_request_openai_completions( async with session.post(url=api_url, json=payload, headers=headers) as response: if response.status == 200: - first_valid_chunk_received = False + first_chunk_received = False async for chunk_bytes in response.content: chunk_bytes = chunk_bytes.strip() if not chunk_bytes: @@ -275,7 +275,7 @@ async def async_request_openai_completions( if data["choices"][0]["text"]: timestamp = time.perf_counter() # First token - if not first_valid_chunk_received: + if not first_chunk_received: first_chunk_received = True ttft = time.perf_counter() - st output.ttft = ttft From f10797c0ce4533412d41842180ca792ad07df11c Mon Sep 17 00:00:00 2001 From: Yan Ma Date: Fri, 8 Nov 2024 17:41:03 +0800 Subject: [PATCH 045/183] [Bugfix][XPU] Fix xpu tp by introducing XpuCommunicator (#10144) Signed-off-by: yan ma --- .../device_communicators/xpu_communicator.py | 47 +++++++++++++++++++ vllm/distributed/parallel_state.py | 40 +++++++--------- 2 files changed, 65 insertions(+), 22 deletions(-) create mode 100644 vllm/distributed/device_communicators/xpu_communicator.py diff --git a/vllm/distributed/device_communicators/xpu_communicator.py b/vllm/distributed/device_communicators/xpu_communicator.py new file mode 100644 index 0000000000000..eafd3c2f67749 --- /dev/null +++ b/vllm/distributed/device_communicators/xpu_communicator.py @@ -0,0 +1,47 @@ +import torch +import torch.distributed as dist +from torch.distributed import ProcessGroup + +from vllm.platforms import current_platform + + +class XpuCommunicator: + + def __init__(self, group: ProcessGroup): + if not current_platform.is_xpu(): + self.disabled = True + return + self.disabled = False + self.group = group + self.world_size = dist.get_world_size(self.group) + + def 
all_reduce(self, x: torch.Tensor) -> torch.Tensor: + dist.all_reduce(x, group=self.group) + return x + + def gather(self, + input_: torch.Tensor, + rank_in_group: int, + dst: int = 0, + dim: int = -1): + # For xpu path, gather doesn't work properly together with ray + # cluster so we use all_gather instead for now. + input_size = input_.size() + # Allocate output tensor. + output_tensor = torch.empty((self.world_size, ) + input_size, + dtype=input_.dtype, + device=input_.device) + # All-gather. + torch.distributed.all_gather_into_tensor(output_tensor, + input_, + group=self.group) + if rank_in_group == dst: + # Reshape + output_tensor = output_tensor.movedim(0, dim) + output_tensor = output_tensor.reshape(input_size[:dim] + + (self.world_size * + input_size[dim], ) + + input_size[dim + 1:]) + else: + output_tensor = None + return output_tensor diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index 0d15403264eee..87ade377266a2 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -177,6 +177,7 @@ def __init__( use_custom_allreduce: bool, use_tpu_communicator: bool, use_hpu_communicator: bool, + use_xpu_communicator: bool, use_message_queue_broadcaster: bool = False, group_name: Optional[str] = None, ): @@ -214,6 +215,7 @@ def __init__( self.use_custom_allreduce = use_custom_allreduce self.use_tpu_communicator = use_tpu_communicator self.use_hpu_communicator = use_hpu_communicator + self.use_xpu_communicator = use_xpu_communicator # lazy import to avoid documentation build error from vllm.distributed.device_communicators.custom_all_reduce import ( @@ -248,6 +250,12 @@ def __init__( if use_hpu_communicator and self.world_size > 1: self.hpu_communicator = HpuCommunicator(group=self.device_group) + from vllm.distributed.device_communicators.xpu_communicator import ( + XpuCommunicator) + self.xpu_communicator: Optional[XpuCommunicator] + if use_xpu_communicator and self.world_size > 1: + self.xpu_communicator = XpuCommunicator(group=self.device_group) + from vllm.distributed.device_communicators.shm_broadcast import ( MessageQueue) self.mq_broadcaster: Optional[MessageQueue] = None @@ -373,6 +381,10 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor: not self.hpu_communicator.disabled: return self.hpu_communicator.all_reduce(input_) + if self.xpu_communicator is not None and \ + not self.xpu_communicator.disabled: + return self.xpu_communicator.all_reduce(input_) + if self.ca_comm is not None and \ not self.ca_comm.disabled and \ self.ca_comm.should_custom_ar(input_): @@ -459,28 +471,10 @@ def gather(self, if dim < 0: # Convert negative dim to positive. dim += input_.dim() - # For xpu path, gather doesn't work properly together with ray - # cluster so we use all_gather instead for now. - if current_platform.is_xpu(): - input_size = input_.size() - # Allocate output tensor. - output_tensor = torch.empty((world_size, ) + input_size, - dtype=input_.dtype, - device=input_.device) - # All-gather. 
- torch.distributed.all_gather_into_tensor(output_tensor, - input_, - group=self.device_group) - if self.rank_in_group == dst: - # Reshape - output_tensor = output_tensor.movedim(0, dim) - output_tensor = output_tensor.reshape(input_size[:dim] + - (world_size * - input_size[dim], ) + - input_size[dim + 1:]) - else: - output_tensor = None - return output_tensor + if self.xpu_communicator is not None and \ + not self.xpu_communicator.disabled: + return self.xpu_communicator.gather(input_, self.rank_in_group, + dst, dim) # Allocate output tensor. if self.rank_in_group == dst: gather_list = [torch.empty_like(input_) for _ in range(world_size)] @@ -896,6 +890,7 @@ def init_world_group(ranks: List[int], local_rank: int, use_custom_allreduce=False, use_tpu_communicator=False, use_hpu_communicator=False, + use_xpu_communicator=False, group_name="world", ) @@ -918,6 +913,7 @@ def init_model_parallel_group( use_custom_allreduce=use_custom_allreduce, use_tpu_communicator=True, use_hpu_communicator=True, + use_xpu_communicator=True, use_message_queue_broadcaster=use_message_queue_broadcaster, group_name=group_name, ) From 1ff4aed5bddd995c5a2847993e2fb5be88763872 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Fri, 8 Nov 2024 17:56:58 +0800 Subject: [PATCH 046/183] [Model] Expose size to Idefics3 as mm_processor_kwargs (#10146) Signed-off-by: Isotr0py <2037008807@qq.com> --- examples/offline_inference_vision_language.py | 19 +- ...e_inference_vision_language_multi_image.py | 7 + .../mm_processor_kwargs/test_idefics3.py | 187 ++++++++++++++++++ vllm/model_executor/models/idefics3.py | 80 ++++++-- 4 files changed, 270 insertions(+), 23 deletions(-) create mode 100644 tests/models/decoder_only/vision_language/mm_processor_kwargs/test_idefics3.py diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 8d17ce3754515..11af6880e1b5a 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -382,10 +382,19 @@ def run_idefics3(question: str, modality: str): assert modality == "image" model_name = "HuggingFaceM4/Idefics3-8B-Llama3" - llm = LLM(model=model_name, - max_model_len=8192, - max_num_seqs=2, - enforce_eager=True) + llm = LLM( + model=model_name, + max_model_len=8192, + max_num_seqs=2, + enforce_eager=True, + # if you are running out of memory, you can reduce the "longest_edge". + # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations + mm_processor_kwargs={ + "size": { + "longest_edge": 3 * 364 + }, + }, + ) prompt = ( f"<|begin_of_text|>User:{question}\nAssistant:" ) @@ -518,4 +527,4 @@ def main(args): default=16, help='Number of frames to extract from the video.') args = parser.parse_args() - main(args) \ No newline at end of file + main(args) diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py index 7e883568995a4..dc12df8d78211 100644 --- a/examples/offline_inference_vision_language_multi_image.py +++ b/examples/offline_inference_vision_language_multi_image.py @@ -300,6 +300,13 @@ def load_idefics3(question, image_urls: List[str]) -> ModelRequestData: max_num_seqs=16, enforce_eager=True, limit_mm_per_prompt={"image": len(image_urls)}, + # if you are running out of memory, you can reduce the "longest_edge". 
+ # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations + mm_processor_kwargs={ + "size": { + "longest_edge": 2 * 364 + }, + }, ) placeholders = "\n".join(f"Image-{i}: \n" diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_idefics3.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_idefics3.py new file mode 100644 index 0000000000000..31896bfd13e8c --- /dev/null +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_idefics3.py @@ -0,0 +1,187 @@ +"""Tests for Idefics3's multimodal preprocessing kwargs.""" +from typing import Optional + +import pytest +import torch +import transformers +from transformers import AutoImageProcessor, AutoTokenizer + +from vllm.inputs import InputContext, token_inputs +from vllm.multimodal import MultiModalRegistry + +from .....conftest import _ImageAssets +from ....utils import build_model_context + +models = ["HuggingFaceM4/Idefics3-8B-Llama3"] + + +# Wrap lazy imports to avoid initializing CUDA during test collection +@pytest.fixture() +def input_processor_for_idefics3(): + from vllm.model_executor.models.idefics3 import ( + input_processor_for_idefics3) + return input_processor_for_idefics3 + + +@pytest.fixture() +def dummy_data_for_idefics3(): + from vllm.model_executor.models.idefics3 import dummy_data_for_idefics3 + return dummy_data_for_idefics3 + + +@pytest.fixture() +def get_max_idefics3_image_tokens(): + from vllm.model_executor.models.idefics3 import ( + get_max_idefics3_image_tokens) + return get_max_idefics3_image_tokens + + +@pytest.mark.skipif(transformers.__version__ < "4.46.0", + reason="Model introduced in HF >= 4.46.0") +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("longest_edge", [None, 168, 336, 400, 2 * 336]) +def test_input_mapper_override(model: str, image_assets: _ImageAssets, + longest_edge: Optional[int]): + """Ensure that the [default] input mapper handles size properly.""" + + mm_processor_kwargs = { + "size": { + "longest_edge": longest_edge + } + } if longest_edge is not None else {} + ctx = build_model_context( + model_name=model, + tokenizer_name=model, + trust_remote_code=True, + mm_processor_kwargs=mm_processor_kwargs, + ) + + hf_processor = AutoImageProcessor.from_pretrained(model, + trust_remote_code=True, + **mm_processor_kwargs) + + mm_registry = MultiModalRegistry() + mm_registry.init_mm_limits_per_prompt(ctx.model_config) + + image = image_assets[0].pil_image + hf_result = hf_processor.preprocess( + image, + return_tensors="pt", + ) + + vllm_result = mm_registry.map_input( + ctx.model_config, + {"image": image}, + ) + + assert torch.all(hf_result["pixel_values"] == vllm_result["pixel_values"]) + + +@pytest.mark.skipif(transformers.__version__ < "4.46.0", + reason="Model introduced in HF >= 4.46.0") +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("longest_edge, expected_max_tokens", [ + (None, 2873), + (168, 169), + (336, 169), + (400, 338), + (672, 338), +]) +def test_max_tokens_override(get_max_idefics3_image_tokens, model: str, + longest_edge: Optional[int], + expected_max_tokens: int): + """Ensure get_max_idefics3_image_tokens handles mm_processor_kwargs.""" + size = {"longest_edge": longest_edge} if longest_edge is not None else None + ctx = build_model_context( + model_name=model, + tokenizer_name=model, + trust_remote_code=True, + mm_processor_kwargs=None, + ) + + actual_max_tokens = get_max_idefics3_image_tokens( + ctx=InputContext(ctx.model_config), + size=size, + ) + + 
assert expected_max_tokens == actual_max_tokens + + +@pytest.mark.skipif(transformers.__version__ < "4.46.0", + reason="Model introduced in HF >= 4.46.0") +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("longest_edge, toks_per_img, num_imgs", [ + (168, 169, 1), + (168, 169, 2), + (400, 338, 1), + (400, 338, 2), +]) +def test_dummy_data_override(dummy_data_for_idefics3, model: str, + longest_edge: int, toks_per_img: int, + num_imgs: int): + """Ensure dummy_data_for_idefics3 handles num_crops properly.""" + # Same as the previous test - don't initialize mm_processor_kwargs + # in this test and assume that the kwargs will be correctly expanded by + # the partial when calling the dummy data func. + size = {"longest_edge": longest_edge} if longest_edge is not None else None + ctx = build_model_context( + model_name=model, + tokenizer_name=model, + trust_remote_code=True, + mm_processor_kwargs=None, + ) + + dummy_data = dummy_data_for_idefics3( + ctx=ctx, + seq_len=8192, # Should be bigger than num_imgs * toks_per_img + mm_counts={"image": num_imgs}, + size=size) + sequence_data = dummy_data.seq_data + # Ensure we have the right number of placeholders per size + image_token_id = ctx.get_hf_config().image_token_id + img_tok_count = sequence_data.get_token_ids().count(image_token_id) + assert img_tok_count == toks_per_img * num_imgs + + +@pytest.mark.skipif(transformers.__version__ < "4.46.0", + reason="Model introduced in HF >= 4.46.0") +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("longest_edge,expected_toks_per_img,num_imgs", [ + (336, 169 * (1**2 + 1), 1), + (336, 169 * (1**2 + 1), 2), + (400, 169 * (2**2 + 1), 1), + (400, 169 * (2**2 + 1), 2), +]) +def test_input_processor_override(input_processor_for_idefics3, + image_assets: _ImageAssets, model: str, + longest_edge: int, + expected_toks_per_img: int, num_imgs: int): + """Ensure input_processor_for_idefics3 handles num_crops properly.""" + # Same as the previous test - don't initialize mm_processor_kwargs + # in this test and assume that the kwargs will be correctly expanded by + # the partial when calling the custom input processor. 
+ size = {"longest_edge": longest_edge} if longest_edge is not None else None + ctx = build_model_context( + model_name=model, + tokenizer_name=model, + trust_remote_code=True, + mm_processor_kwargs=None, + ) + + # Build the image str / prompt based on the number of images we pass + tokenizer = AutoTokenizer.from_pretrained(model) + placeholders = "" if num_imgs == 1 else "\n".join( + f"Image-{i}: \n" for i in range(1, num_imgs + 1)) + prompt = f"<|begin_of_text|>User:{placeholders}\n\nAssistant:" # noqa: E501 + images = [image_assets[0].pil_image.resize((336 * 4, 336 * 4))] * num_imgs + + inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt), + prompt=prompt, + multi_modal_data={"image": images}) + + processed_inputs = input_processor_for_idefics3(ctx, inputs, size=size) + + # Ensure we have the right number of placeholders per num_crops size + image_token_id = ctx.get_hf_config().image_token_id + img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id) + assert img_tok_count == expected_toks_per_img * num_imgs diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index e4c98f22fb16f..3f6d010f4e493 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -14,8 +14,8 @@ """Inference-only Idefics3 model compatible with HuggingFace weights.""" import math -from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, - TypedDict, Union) +from typing import (Dict, Iterable, List, Literal, Mapping, NamedTuple, + Optional, Tuple, TypedDict, Union) import torch import torch.utils.checkpoint @@ -23,6 +23,7 @@ from torch import nn # Temporary solution for transformers below 4.46.0. from transformers import PretrainedConfig as Idefics3Config +from transformers import ProcessorMixin as Idefics3ImageProcessor from vllm.attention import AttentionMetadata from vllm.config import CacheConfig, MultiModalConfig @@ -72,16 +73,41 @@ class Idefics3ImageEmbeddingInputs(TypedDict): """ +class Idefics3ProcessorSize(NamedTuple): + """Hashable wrapper for unhashable `size` dict of Idefics3Processor.""" + # NOTE: cached_get_processor/cached_get_image_processor uses lru_cache, + # we need to use NamedTuple instead of TypedDict to avoid hashing issues. 
+ longest_edge: int + + def __contains__(self, key: str) -> bool: + return key in self._asdict() and getattr(self, key) is not None + + def __getitem__(self, key: str) -> int: + return getattr(self, key) + + ImageInputs = Union[Idefics3ImagePixelInputs, Idefics3ImageEmbeddingInputs] +def get_mm_processor_kwargs(size: Optional[Dict[str, int]] = None) -> Dict: + mm_processor_kwargs = {} + if size: + mm_processor_kwargs["size"] = Idefics3ProcessorSize(**size) + return mm_processor_kwargs + + def input_mapper_for_idefics3( ctx: InputContext, data: object, + *, + size: Optional[Dict[str, int]] = None, ): model_config = ctx.model_config + mm_processor_kwargs = get_mm_processor_kwargs(size) image_processor = cached_get_image_processor( - model_config.model, trust_remote_code=model_config.trust_remote_code) + model_config.model, + trust_remote_code=model_config.trust_remote_code, + **mm_processor_kwargs) if image_processor is None: raise RuntimeError("No HuggingFace processor is available " "to process the image object") @@ -201,13 +227,17 @@ def _get_image_prompt_string(image_rows: int, image_cols: int, global_img_token) -def input_processor_for_idefics3(ctx: InputContext, inputs: DecoderOnlyInputs): +def input_processor_for_idefics3(ctx: InputContext, + inputs: DecoderOnlyInputs, + *, + size: Optional[Dict[str, int]] = None): multi_modal_data = inputs.get("multi_modal_data") if multi_modal_data is None or "image" not in multi_modal_data: return inputs model_config = ctx.model_config - processor = cached_get_processor(model_config.model) + mm_processor_kwargs = get_mm_processor_kwargs(size) + processor = cached_get_processor(model_config.model, **mm_processor_kwargs) image_processor = processor.image_processor tokenizer = processor.tokenizer size = image_processor.size['longest_edge'] @@ -286,32 +316,46 @@ def input_processor_for_idefics3(ctx: InputContext, inputs: DecoderOnlyInputs): ) -def get_max_idefics3_image_tokens(ctx: InputContext, - *, - num_crops: Optional[int] = None): - model_config = ctx.model_config - processor = cached_get_processor(model_config.model) - image_seq_len = processor.image_seq_len - image_processor = processor.image_processor - +def _get_max_num_image_patch(image_processor: Idefics3ImageProcessor) -> int: size = image_processor.size['longest_edge'] max_image_size = image_processor.max_image_size['longest_edge'] resized_height, resized_width = size, size grid_h = resized_height // max_image_size grid_w = resized_width // max_image_size + return (grid_h * grid_w + 1) + + +def get_max_idefics3_image_tokens(ctx: InputContext, + *, + size: Optional[Dict[str, + int]] = None) -> int: + model_config = ctx.model_config + mm_processor_kwargs = get_mm_processor_kwargs(size) + processor = cached_get_processor(model_config.model, **mm_processor_kwargs) + image_seq_len = processor.image_seq_len + image_processor = processor.image_processor + + max_num_image_patches = _get_max_num_image_patch(image_processor) - return (grid_h * grid_w + 1) * image_seq_len + return max_num_image_patches * image_seq_len -def dummy_data_for_idefics3(ctx: InputContext, seq_len: int, - mm_counts: Mapping[str, int]) -> DummyData: +def dummy_data_for_idefics3( + ctx: InputContext, + seq_len: int, + mm_counts: Mapping[str, int], + *, + size: Optional[Dict[str, int]] = None) -> DummyData: hf_config = ctx.get_hf_config() num_images = mm_counts["image"] - processor = cached_get_processor(ctx.model_config.model) + mm_processor_kwargs = get_mm_processor_kwargs(size) + processor = 
cached_get_processor(ctx.model_config.model, + **mm_processor_kwargs) + max_num_image_patches = _get_max_num_image_patch(processor.image_processor) image_seq_len = processor.image_seq_len - max_llm_image_tokens = 17 * image_seq_len * num_images + max_llm_image_tokens = max_num_image_patches * image_seq_len * num_images seq_data = SequenceData.from_prompt_token_counts( (hf_config.image_token_id, max_llm_image_tokens), (0, seq_len)) From 208ce622c712fef75623f785597dbbd698700fa6 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Fri, 8 Nov 2024 06:39:41 -0800 Subject: [PATCH 047/183] [V1]Enable APC by default only for text models (#10148) Signed-off-by: Roger Wang --- vllm/v1/engine/llm_engine.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index cd3f5c75d0d14..81dc01ae2d8e7 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -65,7 +65,10 @@ def __init__( elif usage_context == UsageContext.OPENAI_API_SERVER: scheduler_config.max_num_seqs = 1024 scheduler_config.max_num_batched_tokens = 2048 - cache_config.enable_prefix_caching = True + + # TODO (ywang96): Enable APC by default when VLM supports it. + if not model_config.is_multimodal_model: + cache_config.enable_prefix_caching = True logger.info( "Initializing an LLM engine (v%s) with config: " From b489fc3c91778d8815243f89132d36b2c6eefd5a Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 8 Nov 2024 23:30:04 +0800 Subject: [PATCH 048/183] [CI/Build] Update CPU tests to include all "standard" tests (#5481) Signed-off-by: DarkLight1337 --- .buildkite/run-cpu-test-ppc64le.sh | 21 ++++++++++------ .buildkite/run-cpu-test.sh | 25 +++++++++++++------ .buildkite/test-pipeline.yaml | 3 +-- pyproject.toml | 3 ++- requirements-test.in | 5 ---- .../audio_language/test_ultravox.py | 17 ++++++++++--- .../vision_language/test_h2ovl.py | 1 - .../vision_language/test_models.py | 11 +++----- .../vision_language/test_phi3v.py | 2 -- tests/models/utils.py | 3 +-- vllm/assets/image.py | 2 +- vllm/model_executor/models/ultravox.py | 4 +-- vllm/multimodal/utils.py | 8 +++--- vllm/worker/cpu_worker.py | 6 ++++- 14 files changed, 63 insertions(+), 48 deletions(-) diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh index a63c95e51002f..5add7ff0c15c9 100755 --- a/.buildkite/run-cpu-test-ppc64le.sh +++ b/.buildkite/run-cpu-test-ppc64le.sh @@ -19,17 +19,22 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg # Run basic model test docker exec cpu-test bash -c " - pip install pytest matplotlib einops transformers_stream_generator - pytest -v -s tests/models -m \"not vlm\" \ - --ignore=tests/models/test_embedding.py \ - --ignore=tests/models/test_oot_registration.py \ - --ignore=tests/models/test_registry.py \ - --ignore=tests/models/test_jamba.py \ - --ignore=tests/models/test_mamba.py \ - --ignore=tests/models/test_danube3_4b.py" # Mamba kernels and Danube3-4B on CPU is not supported + set -e + pip install pytest pytest-asyncio \ + decord einops librosa peft Pillow sentence-transformers soundfile \ + transformers_stream_generator matplotlib datamodel_code_generator + pip install torchvision --index-url https://download.pytorch.org/whl/cpu + # Embedding models are not supported for CPU yet + # pytest -v -s tests/models/embedding/language + pytest -v -s tests/models/encoder_decoder/language + pytest -v -s 
tests/models/decoder_only/language/test_models.py + # Chunked prefill not supported for CPU yet + # pytest -v -s tests/models/decoder_only/audio_language -m cpu_model + pytest -v -s tests/models/decoder_only/vision_language -m cpu_model" # online inference docker exec cpu-test bash -c " + set -e python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m & timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 python3 benchmarks/benchmark_serving.py \ diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 064d7c77ab570..25a448e63be27 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -20,32 +20,41 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2 # offline inference -docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" +docker exec cpu-test-avx2 bash -c " + set -e + python3 examples/offline_inference.py" # Run basic model test docker exec cpu-test bash -c " - pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator + set -e + pip install pytest pytest-asyncio \ + decord einops librosa peft Pillow sentence-transformers soundfile \ + transformers_stream_generator matplotlib datamodel_code_generator + pip install torchvision --index-url https://download.pytorch.org/whl/cpu + # Embedding models are not supported for CPU yet + # pytest -v -s tests/models/embedding/language pytest -v -s tests/models/encoder_decoder/language - pytest -v -s tests/models/decoder_only/language \ - --ignore=tests/models/test_fp8.py \ - --ignore=tests/models/decoder_only/language/test_jamba.py \ - --ignore=tests/models/decoder_only/language/test_mamba.py \ - --ignore=tests/models/decoder_only/language/test_granitemoe.py \ - --ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported + pytest -v -s tests/models/decoder_only/language/test_models.py + # Chunked prefill not supported for CPU yet + # pytest -v -s tests/models/decoder_only/audio_language -m cpu_model + pytest -v -s tests/models/decoder_only/vision_language -m cpu_model" # Run compressed-tensor test docker exec cpu-test bash -c " + set -e pytest -s -v \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token" # Run AWQ test docker exec cpu-test bash -c " + set -e pytest -s -v \ tests/quantization/test_ipex_quant.py" # online inference docker exec cpu-test bash -c " + set -e export VLLM_CPU_KVCACHE_SPACE=10 export VLLM_CPU_OMP_THREADS_BIND=48-92 python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 705e81d15ad65..2c5d74e7abcbf 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -269,7 +269,6 @@ steps: source_file_dependencies: - benchmarks/ commands: - - pip install aiohttp - bash run-benchmarks.sh - label: Quantization Test # 33min @@ -331,7 +330,7 @@ steps: commands: - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py -- label: Decoder-only Multi-Modal Models Test (Standard) +- label: Decoder-only Multi-Modal Models Test (Standard) # 26min #mirror_hardwares: [amd] 
source_file_dependencies: - vllm/ diff --git a/pyproject.toml b/pyproject.toml index bae8645502dea..1385a15d07878 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -93,7 +93,8 @@ skip_gitignore = true [tool.pytest.ini_options] markers = [ "skip_global_cleanup", - "core_model: run this model test in each PR instead of just daily", + "core_model: enable this model test in each PR instead of only nightly", + "cpu_model: enable this model test in CPU tests", "distributed_2_gpus: run this test only in distributed tests for 2 GPUs", "skip_v1: do not run this test with v1", ] diff --git a/requirements-test.in b/requirements-test.in index 1b4b9ba78ed9c..76f6de2f77c34 100644 --- a/requirements-test.in +++ b/requirements-test.in @@ -12,9 +12,7 @@ decord # required for video tests einops # required for MPT, qwen-vl and Mamba httpx librosa # required for audio tests -opencv-python # required for video tests peft -requests ray[adag]==2.35 sentence-transformers # required for embedding tests soundfile # required for audio tests @@ -29,9 +27,6 @@ lm-eval[api]==0.4.4 # required for model evaluation test # TODO: Add this after fully implementing llava(mantis) # git+https://github.com/TIGER-AI-Lab/Mantis.git # required for llava(mantis) test -# Benchmarking -aiohttp - # quantization bitsandbytes>=0.44.0 buildkite-test-collector==0.1.9 diff --git a/tests/models/decoder_only/audio_language/test_ultravox.py b/tests/models/decoder_only/audio_language/test_ultravox.py index d14e88b4e5b26..e100c6b9bb906 100644 --- a/tests/models/decoder_only/audio_language/test_ultravox.py +++ b/tests/models/decoder_only/audio_language/test_ultravox.py @@ -5,11 +5,11 @@ import pytest_asyncio from transformers import AutoModel, AutoTokenizer, BatchEncoding -from tests.utils import RemoteOpenAIServer from vllm.sequence import SampleLogprobs from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE from ....conftest import HfRunner, VllmRunner +from ....utils import RemoteOpenAIServer from ...utils import check_logprobs_close MODEL_NAME = "fixie-ai/ultravox-v0_3" @@ -39,7 +39,10 @@ def audio(request): return AudioAsset(request.param) -@pytest.fixture(params=({}, CHUNKED_PREFILL_KWARGS)) +@pytest.fixture(params=[ + pytest.param({}, marks=pytest.mark.cpu_model), + pytest.param(CHUNKED_PREFILL_KWARGS), +]) def server(request, audio_assets): args = [ "--dtype=bfloat16", "--max-model-len=4096", "--enforce-eager", @@ -185,7 +188,10 @@ def run_multi_audio_test( @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS]) +@pytest.mark.parametrize("vllm_kwargs", [ + pytest.param({}, marks=pytest.mark.cpu_model), + pytest.param(CHUNKED_PREFILL_KWARGS), +]) def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int, num_logprobs: int, vllm_kwargs: dict) -> None: @@ -207,7 +213,10 @@ def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int, @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("vllm_kwargs", [{}, CHUNKED_PREFILL_KWARGS]) +@pytest.mark.parametrize("vllm_kwargs", [ + pytest.param({}, marks=pytest.mark.cpu_model), + pytest.param(CHUNKED_PREFILL_KWARGS), +]) def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str, max_tokens: int, num_logprobs: int, vllm_kwargs: dict) -> None: diff --git 
a/tests/models/decoder_only/vision_language/test_h2ovl.py b/tests/models/decoder_only/vision_language/test_h2ovl.py index ad9aa3104750b..45a7365204403 100644 --- a/tests/models/decoder_only/vision_language/test_h2ovl.py +++ b/tests/models/decoder_only/vision_language/test_h2ovl.py @@ -14,7 +14,6 @@ "h2oai/h2ovl-mississippi-800m", # Replace with your actual model names "h2oai/h2ovl-mississippi-2b", ] -target_dtype = "bfloat16" def run_preprocessing_test( diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 3dbfaafb781af..163752e9fe06e 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -94,7 +94,7 @@ ), limit_mm_per_prompt={"image": 4}, )], - marks=[pytest.mark.core_model], + marks=[pytest.mark.core_model, pytest.mark.cpu_model], ), "paligemma": VLMTestInfo( models=["google/paligemma-3b-mix-224"], @@ -111,7 +111,8 @@ "pixel_values" ), vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output, - dtype="half" if current_platform.is_rocm() else ("half", "float"), + dtype=("half" if current_platform.is_cpu() or current_platform.is_rocm() + else ("half", "float")), marks=[pytest.mark.core_model], ), "qwen2_vl": VLMTestInfo( @@ -128,7 +129,7 @@ max_num_seqs=2, auto_cls=AutoModelForVision2Seq, vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, - marks=[pytest.mark.core_model], + marks=[pytest.mark.core_model, pytest.mark.cpu_model], image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], ), #### Extended model tests @@ -172,7 +173,6 @@ use_tokenizer_eos=True, vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output, num_logprobs=10, - dtype="bfloat16" if current_platform.is_cpu() else "half", image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], ), "glm4": VLMTestInfo( @@ -245,7 +245,6 @@ models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"], test_type=VLMTestType.CUSTOM_INPUTS, prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 - dtype="half", num_video_frames=16, max_model_len=16384, postprocess_inputs=model_utils.get_key_type_post_processor( @@ -404,7 +403,6 @@ prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 test_type=VLMTestType.CUSTOM_INPUTS, max_model_len=4096, - dtype="bfloat16" if current_platform.is_cpu() else "half", use_tokenizer_eos=True, patch_hf_runner=model_utils.internvl_patch_hf_runner, custom_test_opts=[ @@ -419,7 +417,6 @@ test_type=VLMTestType.CUSTOM_INPUTS, max_model_len=16384, max_num_seqs=2, - dtype="half", postprocess_inputs=model_utils.get_key_type_post_processor( "pixel_values" ), diff --git a/tests/models/decoder_only/vision_language/test_phi3v.py b/tests/models/decoder_only/vision_language/test_phi3v.py index b9c20ddb2d746..82eae0705c9ba 100644 --- a/tests/models/decoder_only/vision_language/test_phi3v.py +++ b/tests/models/decoder_only/vision_language/test_phi3v.py @@ -44,8 +44,6 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str, target_dtype = "half" -if current_platform.is_cpu(): - target_dtype = "bfloat16" # ROCm Triton FA can run into shared memory issues with these models, # use other backends in the meantime diff --git a/tests/models/utils.py b/tests/models/utils.py index f7802d98ad678..0eb3f61f1f047 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -5,7 +5,6 @@ from vllm.config import 
ModelConfig, TaskOption from vllm.inputs import InputContext -from vllm.platforms import current_platform from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs TokensText = Tuple[List[int], str] @@ -270,7 +269,7 @@ def build_model_context(model_name: str, if tokenizer_name is None: tokenizer_name = model_name if dtype is None: - dtype = "bfloat16" if current_platform.is_cpu() else "half" + dtype = "half" model_config = ModelConfig( model_name, diff --git a/vllm/assets/image.py b/vllm/assets/image.py index 5eec78c328903..389ecd5c869bc 100644 --- a/vllm/assets/image.py +++ b/vllm/assets/image.py @@ -27,4 +27,4 @@ def image_embeds(self) -> torch.Tensor: """ image_path = get_vllm_public_assets(filename=f"{self.name}.pt", s3_prefix=VLM_IMAGES_DIR) - return torch.load(image_path) + return torch.load(image_path, map_location="cpu") diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 3a343986a9345..411584b1a6c3c 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -134,9 +134,9 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object): if sr != feature_extractor.sampling_rate: try: import librosa - except ImportError: + except ImportError as exc: raise ImportError( - "Please install vllm[audio] for audio support.") from None + "Please install vllm[audio] for audio support.") from exc audio = librosa.resample(audio, orig_sr=sr, target_sr=feature_extractor.sampling_rate) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 0c666b8cc2e69..bee3c25dbd8dd 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -206,9 +206,9 @@ def try_import_audio_packages() -> Tuple[Any, Any]: try: import librosa import soundfile - except ImportError: + except ImportError as exc: raise ImportError( - "Please install vllm[audio] for audio support.") from None + "Please install vllm[audio] for audio support.") from exc return librosa, soundfile @@ -344,9 +344,9 @@ def try_import_video_packages() -> Any: try: import cv2 import decord - except ImportError: + except ImportError as exc: raise ImportError( - "Please install vllm[video] for video support.") from None + "Please install vllm[video] for video support.") from exc return cv2, decord diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 2914f520d823c..162e1e4be873b 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -151,7 +151,11 @@ def __init__( self.local_omp_cpuid = omp_cpuids.split("|")[rank] ModelRunnerClass: Type[CPUModelRunner] = CPUModelRunner - if self.model_config.is_encoder_decoder: + if self.model_config.task == "embedding": + raise NotImplementedError( + "Embedding models are not supported for CPU backend") + # ModelRunnerClass = CPUEmbeddingModelRunner + elif self.model_config.is_encoder_decoder: ModelRunnerClass = CPUEncoderDecoderModelRunner self.model_runner: CPUModelRunner = ModelRunnerClass( vllm_config=vllm_config, From 0535e5fe6c38a25bad71d92bb7a396f04fd1aee5 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 8 Nov 2024 16:42:27 +0100 Subject: [PATCH 049/183] Fix edge case Mistral tokenizer (#10152) --- vllm/transformers_utils/tokenizers/mistral.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index ccffdcc2a4df2..1b273c6b120ea 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ 
b/vllm/transformers_utils/tokenizers/mistral.py @@ -72,11 +72,12 @@ def __init__(self, tokenizer: PublicMistralTokenizer) -> None: self.instruct = tokenizer.instruct_tokenizer tokenizer_ = tokenizer.instruct_tokenizer.tokenizer - if isinstance(tokenizer_, Tekkenizer): + self.is_tekken = isinstance(tokenizer_, Tekkenizer) + self.is_spm = isinstance(tokenizer_, SentencePieceTokenizer) + if self.is_tekken: # Make sure special tokens will not raise tokenizer_.special_token_policy = SpecialTokenPolicy.IGNORE - - elif isinstance(tokenizer_, SentencePieceTokenizer): + elif self.is_spm: pass else: raise TypeError(f"Unsupported tokenizer: {type(tokenizer_)}") @@ -218,7 +219,7 @@ def apply_chat_template(self, return encoded.tokens def convert_tokens_to_string(self, tokens: List[str]) -> str: - if isinstance(self.tokenizer, Tekkenizer): + if self.is_tekken: tokens = [ t for t in tokens if t not in self.tokenizer._all_special_tokens @@ -270,21 +271,20 @@ def convert_ids_to_tokens( skip_special_tokens ), "skip_special_tokens=False is not supported for Mistral tokenizers." - assert isinstance(self.tokenizer, - (Tekkenizer, SentencePieceTokenizer)), type( - self.tokenizer) + assert self.is_tekken or self.is_spm, type(self.tokenizer) - if isinstance(self.tokenizer, Tekkenizer): + if self.is_tekken: # skip special tokens ids = [i for i in ids if i > self.tokenizer.num_special_tokens] tokens = [self.tokenizer.id_to_piece(id) for id in ids] - if any("�" in t for t in tokens): + if any("�" in t for t in tokens) and self.is_tekken: # if a decoded token contains the replacement character, then the # token has an incomplete UTF-8 character so we must use bytes # See: https://github.com/vllm-project/vllm/pull/8640 # https://github.com/vllm-project/vllm/pull/9625 + # if underlying tokenizeir is sentencepiece, we just add "�" tokens = [self.tokenizer.id_to_byte_piece(id) for id in ids] return tokens From f6778620a95baf925eb54694ab4666524d0d8584 Mon Sep 17 00:00:00 2001 From: sroy745 <142070531+sroy745@users.noreply.github.com> Date: Fri, 8 Nov 2024 07:56:18 -0800 Subject: [PATCH 050/183] Disable spec-decode + chunked-prefill for draft models with tensor parallelism > 1 (#10136) Signed-off-by: Sourashis Roy --- tests/spec_decode/e2e/test_compatibility.py | 46 +++++++++++++++++++++ vllm/config.py | 45 ++++++++++++++++---- 2 files changed, 83 insertions(+), 8 deletions(-) diff --git a/tests/spec_decode/e2e/test_compatibility.py b/tests/spec_decode/e2e/test_compatibility.py index af8397c235f48..a3f0464e79675 100644 --- a/tests/spec_decode/e2e/test_compatibility.py +++ b/tests/spec_decode/e2e/test_compatibility.py @@ -50,3 +50,49 @@ def test_spec_decode_xfail_spec_max_model_len(test_llm_generator): with pytest.raises(ValueError, match="cannot be larger than"): get_output_from_llm_generator(test_llm_generator, prompts, sampling_params) + + +@pytest.mark.parametrize("common_llm_kwargs", + [{ + "model": "meta-llama/Llama-2-7b-chat-hf", + "speculative_model": "JackFram/llama-68m", + "num_speculative_tokens": 5, + "enable_chunked_prefill": "True", + }]) +@pytest.mark.parametrize("per_test_common_llm_kwargs", [ + { + "tensor_parallel_size": 2, + "speculative_draft_tensor_parallel_size": 2, + }, + { + "tensor_parallel_size": 4, + "speculative_draft_tensor_parallel_size": 4, + }, + { + "tensor_parallel_size": 8, + "speculative_draft_tensor_parallel_size": 8, + }, +]) +@pytest.mark.parametrize("test_llm_kwargs", [{}]) +@pytest.mark.parametrize("seed", [1]) +def test_spec_decode_xfail_chunked_prefill_draft_model_tp_not_one( + 
test_llm_generator): + """Verify that speculative decoding fails if chunked prefill is enabled for + draft model with tensor parallelism of more than 1. + """ + output_len = 128 + temperature = 0.0 + + prompts = [ + "Hello, my name is", + ] + + sampling_params = SamplingParams( + max_tokens=output_len, + ignore_eos=True, + temperature=temperature, + ) + + with pytest.raises(ValueError, match="with tensor parallel size 1"): + get_output_from_llm_generator(test_llm_generator, prompts, + sampling_params) diff --git a/vllm/config.py b/vllm/config.py index 9721925987cab..bed58fcecb5cb 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1388,6 +1388,23 @@ def maybe_create_spec_config( "Chunked prefill and hidden-state based draft models are " "not compatible.") + speculative_draft_tensor_parallel_size = \ + SpeculativeConfig._verify_and_get_draft_model_tensor_parallel_size( + target_parallel_config, + speculative_draft_tensor_parallel_size, + draft_hf_config + ) + + if (enable_chunked_prefill and \ + speculative_draft_tensor_parallel_size != 1): + # TODO - Investigate why the error reported in + # https://github.com/vllm-project/vllm/pull/9291#issuecomment-2463266258 + # is happening and re-enable it. + raise ValueError( + "Chunked prefill and speculative decoding can be enabled " + "simultaneously only for draft models with tensor " + "parallel size 1.") + draft_model_config.max_model_len = ( SpeculativeConfig._maybe_override_draft_max_model_len( speculative_max_model_len, @@ -1466,15 +1483,16 @@ def _maybe_override_draft_max_model_len( ) @staticmethod - def create_draft_parallel_config( - target_parallel_config: ParallelConfig, - speculative_draft_tensor_parallel_size: Optional[int], - draft_hf_config: PretrainedConfig, - ) -> ParallelConfig: - """Create a parallel config for use by the draft worker. - - This is mostly a copy of the target parallel config, except the tp_size. + def _verify_and_get_draft_model_tensor_parallel_size( + target_parallel_config: ParallelConfig, + speculative_draft_tensor_parallel_size: Optional[int], + draft_hf_config: PretrainedConfig) -> int: + """ + Verifies and adjusts the tensor parallel size for a draft model + specified using speculative_draft_tensor_parallel_size. """ + # If speculative_draft_tensor_parallel_size is unset then set it + # appropriately else verify that it is set correctly. if speculative_draft_tensor_parallel_size is None: if draft_hf_config.model_type == "mlp_speculator": speculative_draft_tensor_parallel_size = 1 @@ -1490,7 +1508,18 @@ def create_draft_parallel_config( raise ValueError( f"{speculative_draft_tensor_parallel_size=} cannot be " f"other value than 1 or target model tensor_parallel_size") + return speculative_draft_tensor_parallel_size + @staticmethod + def create_draft_parallel_config( + target_parallel_config: ParallelConfig, + speculative_draft_tensor_parallel_size: int, + draft_hf_config: PretrainedConfig, + ) -> ParallelConfig: + """Create a parallel config for use by the draft worker. + + This is mostly a copy of the target parallel config, except the tp_size. + """ draft_parallel_config = ParallelConfig( pipeline_parallel_size=target_parallel_config. 
pipeline_parallel_size, From 6b30471586f6128797272db654c42c5131d3a1f1 Mon Sep 17 00:00:00 2001 From: Rafael Vasquez Date: Fri, 8 Nov 2024 12:51:04 -0500 Subject: [PATCH 051/183] [Misc] Improve Web UI (#10090) Signed-off-by: Rafael Vasquez --- docs/source/_static/custom.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js index f475be71fc448..ceeca47226cde 100644 --- a/docs/source/_static/custom.js +++ b/docs/source/_static/custom.js @@ -8,7 +8,7 @@ document.addEventListener("DOMContentLoaded", function () { script.setAttribute("version", "stable"); script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); // cmd-j or ctrl-j to open the widget. script.setAttribute("runllm-name", "vLLM"); - script.setAttribute("runllm-position", "BOTTOM_RIGHT"); + script.setAttribute("runllm-position", "BOTTOM_LEFT"); script.setAttribute("runllm-assistant-id", "207"); script.async = true; From b5815c8413b4e09ba6ccd9c41ea3f9fb2d057aa8 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Fri, 8 Nov 2024 10:23:04 -0800 Subject: [PATCH 052/183] [V1] Fix non-cudagraph op name (#10166) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 9bb49a21453d0..2469048536e49 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -411,7 +411,7 @@ def load_model(self) -> None: set_compilation_config( CompilationConfig( use_cudagraph=True, - non_cudagraph_ops=["vllm.unified_flash_attention"], + non_cudagraph_ops=["vllm.unified_v1_flash_attention"], use_inductor=True, )) From 87713c605334da837cac8367fa3e59c95153df88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luka=20Govedi=C4=8D?= Date: Fri, 8 Nov 2024 14:53:36 -0500 Subject: [PATCH 053/183] [CI/Build] Ignore .gitignored files for shellcheck (#10162) Signed-off-by: luka --- tools/shellcheck.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/shellcheck.sh b/tools/shellcheck.sh index e850742a07900..0bb6fd2eafa14 100755 --- a/tools/shellcheck.sh +++ b/tools/shellcheck.sh @@ -18,4 +18,4 @@ if ! [ -x "$(command -v shellcheck)" ]; then fi # TODO - fix warnings in .buildkite/run-amd-test.sh -find . -name "*.sh" -not -path "./.deps/*" -not -path "./.buildkite/run-amd-test.sh" -exec shellcheck {} + +find . 
-name "*.sh" -not -path "./.buildkite/run-amd-test.sh" -exec sh -c 'git check-ignore -q $1 || shellcheck $1' _ {} \; From e1b5a8217974af541abda462e75dc8ce1a7e4004 Mon Sep 17 00:00:00 2001 From: Florian Zimmermeister Date: Fri, 8 Nov 2024 21:53:24 +0100 Subject: [PATCH 054/183] Rename vllm.logging to vllm.logging_utils (#10134) --- pyproject.toml | 2 +- tests/test_logger.py | 2 +- vllm/logger.py | 2 +- vllm/logging/__init__.py | 5 ----- vllm/logging_utils/__init__.py | 5 +++++ vllm/{logging => logging_utils}/formatter.py | 0 6 files changed, 8 insertions(+), 8 deletions(-) delete mode 100644 vllm/logging/__init__.py create mode 100644 vllm/logging_utils/__init__.py rename vllm/{logging => logging_utils}/formatter.py (100%) diff --git a/pyproject.toml b/pyproject.toml index 1385a15d07878..797e7a88ab31b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,7 +68,7 @@ files = [ "vllm/entrypoints", "vllm/core", "vllm/inputs", - "vllm/logging", + "vllm/logging_utils", "vllm/multimodal", "vllm/platforms", "vllm/transformers_utils", diff --git a/tests/test_logger.py b/tests/test_logger.py index a937b0812ed0c..e3749616d4203 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -13,7 +13,7 @@ from vllm.logger import (_DATE_FORMAT, _FORMAT, _configure_vllm_root_logger, enable_trace_function_call, init_logger) -from vllm.logging import NewLineFormatter +from vllm.logging_utils import NewLineFormatter def f1(x): diff --git a/vllm/logger.py b/vllm/logger.py index d6fcda02a0fb3..80b9fcc59272d 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -24,7 +24,7 @@ DEFAULT_LOGGING_CONFIG = { "formatters": { "vllm": { - "class": "vllm.logging.NewLineFormatter", + "class": "vllm.logging_utils.NewLineFormatter", "datefmt": _DATE_FORMAT, "format": _FORMAT, }, diff --git a/vllm/logging/__init__.py b/vllm/logging/__init__.py deleted file mode 100644 index b9aec380776f3..0000000000000 --- a/vllm/logging/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from vllm.logging.formatter import NewLineFormatter - -__all__ = [ - "NewLineFormatter", -] diff --git a/vllm/logging_utils/__init__.py b/vllm/logging_utils/__init__.py new file mode 100644 index 0000000000000..576ccf78a8117 --- /dev/null +++ b/vllm/logging_utils/__init__.py @@ -0,0 +1,5 @@ +from vllm.logging_utils.formatter import NewLineFormatter + +__all__ = [ + "NewLineFormatter", +] diff --git a/vllm/logging/formatter.py b/vllm/logging_utils/formatter.py similarity index 100% rename from vllm/logging/formatter.py rename to vllm/logging_utils/formatter.py From 4f93dfe952522c3f784be6542d69be2a172b8496 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luka=20Govedi=C4=8D?= Date: Fri, 8 Nov 2024 16:20:08 -0500 Subject: [PATCH 055/183] [torch.compile] Fuse RMSNorm with quant (#9138) Signed-off-by: luka Co-authored-by: youkaichao --- CMakeLists.txt | 1 + csrc/layernorm_kernels.cu | 165 +---------------- csrc/layernorm_quant_kernels.cu | 234 ++++++++++++++++++++++++ csrc/ops.h | 10 + csrc/quantization/fp8/common.cu | 175 +----------------- csrc/quantization/fp8/common.cuh | 172 ++++++++++++++++++ csrc/torch_bindings.cpp | 31 +++- csrc/type_convert.cuh | 165 +++++++++++++++++ tests/compile/backend.py | 33 ++++ tests/compile/test_fusion.py | 92 ++++++++++ tests/kernels/test_layernorm.py | 75 +++++++- vllm/compilation/backends.py | 109 +++++++++-- vllm/compilation/config.py | 25 ++- vllm/compilation/fusion.py | 291 ++++++++++++++++++++++++++++++ vllm/compilation/inductor_pass.py | 38 ++++ vllm/compilation/reshapes.py | 85 +++++++++ vllm/envs.py | 2 + 17 files changed, 1335 
insertions(+), 368 deletions(-) create mode 100644 csrc/layernorm_quant_kernels.cu create mode 100644 csrc/quantization/fp8/common.cuh create mode 100644 csrc/type_convert.cuh create mode 100644 tests/compile/backend.py create mode 100644 tests/compile/test_fusion.py create mode 100644 vllm/compilation/fusion.py create mode 100644 vllm/compilation/inductor_pass.py create mode 100644 vllm/compilation/reshapes.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 25c0865a90a67..376565583d928 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -191,6 +191,7 @@ set(VLLM_EXT_SRC "csrc/pos_encoding_kernels.cu" "csrc/activation_kernels.cu" "csrc/layernorm_kernels.cu" + "csrc/layernorm_quant_kernels.cu" "csrc/quantization/gptq/q_gemm.cu" "csrc/quantization/compressed_tensors/int8_quant_kernels.cu" "csrc/quantization/fp8/common.cu" diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu index 7a7a25d2173d2..fb6882f3e7c3e 100644 --- a/csrc/layernorm_kernels.cu +++ b/csrc/layernorm_kernels.cu @@ -1,21 +1,13 @@ -#include -#include +#include "type_convert.cuh" +#include "dispatch_utils.h" + +#include #include -#include "dispatch_utils.h" #ifndef USE_ROCM - #include - #include - #include #include #else - #include - #include - #include #include - -using __nv_bfloat16 = __hip_bfloat16; -using __nv_bfloat162 = __hip_bfloat162; #endif namespace vllm { @@ -51,155 +43,6 @@ __global__ void rms_norm_kernel( } } -/* Converter structs for the conversion from torch types to HIP/CUDA types, - and the associated type conversions within HIP/CUDA. These helpers need - to be implemented for now because the relevant type conversion - operators/constructors are not consistently implemented by HIP/CUDA, so - a generic conversion via type casts cannot be implemented. - - Each struct should have the member static constexpr bool `exists`: - If false, the optimized kernel is not used for the corresponding torch type. - If true, the struct should be fully defined as shown in the examples below. 
- */ -template -struct _typeConvert { - static constexpr bool exists = false; -}; - -#if defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12000)) -// CUDA < 12.0 runs into issues with packed type conversion -template <> -struct _typeConvert { - static constexpr bool exists = true; - using hip_type = __half; - using packed_hip_type = __half2; - - __device__ static inline float convert(hip_type x) { return __half2float(x); } - __device__ static inline float2 convert(packed_hip_type x) { - return __half22float2(x); - } - __device__ static inline hip_type convert(float x) { - return __float2half_rn(x); - } - __device__ static inline packed_hip_type convert(float2 x) { - return __float22half2_rn(x); - } -}; - - #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 -// CUDA_ARCH < 800 does not have BF16 support -// TODO: Add in ROCm support once public headers handle bf16 maturely -template <> -struct _typeConvert { - static constexpr bool exists = true; - using hip_type = __nv_bfloat16; - using packed_hip_type = __nv_bfloat162; - - __device__ static inline float convert(hip_type x) { - return __bfloat162float(x); - } - __device__ static inline float2 convert(packed_hip_type x) { - return __bfloat1622float2(x); - } - __device__ static inline hip_type convert(float x) { - return __float2bfloat16(x); - } - __device__ static inline packed_hip_type convert(float2 x) { - return __float22bfloat162_rn(x); - } -}; - #endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 -#endif // defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= - // 12000)) - -/* Vector POD struct to generate vectorized and packed FP16/BF16 ops - for appropriate specializations of fused_add_rms_norm_kernel. - Only functions that are necessary in that kernel are implemented. - Alignment to 16 bytes is required to use 128-bit global memory ops. 
- */ -template -struct alignas(16) _f16Vec { - /* Not theoretically necessary that width is a power of 2 but should - almost always be the case for optimization purposes */ - static_assert(width > 0 && (width & (width - 1)) == 0, - "Width is not a positive power of 2!"); - using Converter = _typeConvert; - using T1 = typename Converter::hip_type; - using T2 = typename Converter::packed_hip_type; - T1 data[width]; - - __device__ _f16Vec& operator+=(const _f16Vec& other) { - if constexpr (width % 2 == 0) { -#pragma unroll - for (int i = 0; i < width; i += 2) { - T2 temp{data[i], data[i + 1]}; - temp += T2{other.data[i], other.data[i + 1]}; - data[i] = temp.x; - data[i + 1] = temp.y; - } - } else { -#pragma unroll - for (int i = 0; i < width; ++i) data[i] += other.data[i]; - } - return *this; - } - - __device__ _f16Vec& operator*=(const _f16Vec& other) { - if constexpr (width % 2 == 0) { -#pragma unroll - for (int i = 0; i < width; i += 2) { - T2 temp{data[i], data[i + 1]}; - temp *= T2{other.data[i], other.data[i + 1]}; - data[i] = temp.x; - data[i + 1] = temp.y; - } - } else { -#pragma unroll - for (int i = 0; i < width; ++i) data[i] *= other.data[i]; - } - return *this; - } - - __device__ _f16Vec& operator*=(const float scale) { - if constexpr (width % 2 == 0) { -#pragma unroll - for (int i = 0; i < width; i += 2) { - float2 temp_f = Converter::convert(T2{data[i], data[i + 1]}); - temp_f.x *= scale; - temp_f.y *= scale; - T2 temp = Converter::convert(temp_f); - data[i] = temp.x; - data[i + 1] = temp.y; - } - } else { -#pragma unroll - for (int i = 0; i < width; ++i) { - float temp = Converter::convert(data[i]) * scale; - data[i] = Converter::convert(temp); - } - } - return *this; - } - - __device__ float sum_squares() const { - float result = 0.0f; - if constexpr (width % 2 == 0) { -#pragma unroll - for (int i = 0; i < width; i += 2) { - float2 z = Converter::convert(T2{data[i], data[i + 1]}); - result += z.x * z.x + z.y * z.y; - } - } else { -#pragma unroll - for (int i = 0; i < width; ++i) { - float x = Converter::convert(data[i]); - result += x * x; - } - } - return result; - } -}; - /* Function specialization in the case of FP16/BF16 tensors. Additional optimizations we can make in this case are packed and vectorized operations, which help with the diff --git a/csrc/layernorm_quant_kernels.cu b/csrc/layernorm_quant_kernels.cu new file mode 100644 index 0000000000000..c18e2a4e4abe0 --- /dev/null +++ b/csrc/layernorm_quant_kernels.cu @@ -0,0 +1,234 @@ +/* + * This file contains the CUDA kernels for the fused quantized layernorm. + * The kernels correspond to the kernels in layernorm_kernels.cu, except they + * also produce quantized output directly. + * Currently, only static fp8 quantization is supported. + */ + +#include "type_convert.cuh" +#include "quantization/fp8/common.cuh" +#include "dispatch_utils.h" + +#include +#include + +#ifndef USE_ROCM + #include +#else + #include +#endif + +namespace vllm { + +// TODO(woosuk): Further optimize this kernel. 
+template +__global__ void rms_norm_static_fp8_quant_kernel( + FP8_TYPE* __restrict__ out, // [..., hidden_size] + const scalar_t* __restrict__ input, // [..., hidden_size] + const scalar_t* __restrict__ weight, // [hidden_size] + const float* __restrict__ scale, // [1] + const float epsilon, const int num_tokens, const int hidden_size) { + __shared__ float s_variance; + float variance = 0.0f; + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { + const float x = (float)input[blockIdx.x * hidden_size + idx]; + variance += x * x; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduceStore; + variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); + + if (threadIdx.x == 0) { + s_variance = rsqrtf(variance / hidden_size + epsilon); + } + __syncthreads(); + + // invert scale to avoid division + float const scale_inv = 1.0f / *scale; + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { + float x = (float)input[blockIdx.x * hidden_size + idx]; + float const out_norm = ((scalar_t)(x * s_variance)) * weight[idx]; + out[blockIdx.x * hidden_size + idx] = + scaled_fp8_conversion(out_norm, scale_inv); + } +} + +/* Function specialization in the case of FP16/BF16 tensors. + Additional optimizations we can make in this case are + packed and vectorized operations, which help with the + memory latency bottleneck. */ +template +__global__ std::enable_if_t<(width > 0) && _typeConvert::exists> +fused_add_rms_norm_static_fp8_quant_kernel( + FP8_TYPE* __restrict__ out, // [..., hidden_size] + scalar_t* __restrict__ input, // [..., hidden_size] + scalar_t* __restrict__ residual, // [..., hidden_size] + const scalar_t* __restrict__ weight, // [hidden_size] + const float* __restrict__ scale, // [1] + const float epsilon, const int num_tokens, const int hidden_size) { + // Sanity checks on our vector struct and type-punned pointer arithmetic + static_assert(std::is_pod_v<_f16Vec>); + static_assert(sizeof(_f16Vec) == sizeof(scalar_t) * width); + + const int vec_hidden_size = hidden_size / width; + __shared__ float s_variance; + float variance = 0.0f; + /* These and the argument pointers are all declared `restrict` as they are + not aliased in practice. 
Argument pointers should not be dereferenced + in this kernel as that would be undefined behavior */ + auto* __restrict__ input_v = + reinterpret_cast<_f16Vec*>(input); + auto* __restrict__ residual_v = + reinterpret_cast<_f16Vec*>(residual); + auto* __restrict__ weight_v = + reinterpret_cast*>(weight); + + for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) { + int id = blockIdx.x * vec_hidden_size + idx; + _f16Vec temp = input_v[id]; + temp += residual_v[id]; + variance += temp.sum_squares(); + residual_v[id] = temp; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduceStore; + variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); + + if (threadIdx.x == 0) { + s_variance = rsqrtf(variance / hidden_size + epsilon); + } + __syncthreads(); + + // invert scale to avoid division + float const scale_inv = 1.0f / *scale; + + for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) { + int id = blockIdx.x * vec_hidden_size + idx; + _f16Vec temp = residual_v[id]; + temp *= s_variance; + temp *= weight_v[idx]; +#pragma unroll + for (int i = 0; i < width; ++i) { + out[id * width + i] = + scaled_fp8_conversion(float(temp.data[i]), scale_inv); + } + } +} + +/* Generic fused_add_rms_norm_kernel + The width field is not used here but necessary for other specializations. + */ +template +__global__ std::enable_if_t<(width == 0) || !_typeConvert::exists> +fused_add_rms_norm_static_fp8_quant_kernel( + FP8_TYPE* __restrict__ out, // [..., hidden_size] + scalar_t* __restrict__ input, // [..., hidden_size] + scalar_t* __restrict__ residual, // [..., hidden_size] + const scalar_t* __restrict__ weight, // [hidden_size] + const float* __restrict__ scale, // [1] + const float epsilon, const int num_tokens, const int hidden_size) { + __shared__ float s_variance; + float variance = 0.0f; + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { + scalar_t z = input[blockIdx.x * hidden_size + idx]; + z += residual[blockIdx.x * hidden_size + idx]; + float x = (float)z; + variance += x * x; + residual[blockIdx.x * hidden_size + idx] = z; + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduceStore; + variance = BlockReduce(reduceStore).Reduce(variance, cub::Sum{}, blockDim.x); + + if (threadIdx.x == 0) { + s_variance = rsqrtf(variance / hidden_size + epsilon); + } + __syncthreads(); + + // invert scale to avoid division + float const scale_inv = 1.0f / *scale; + + for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { + float x = (float)residual[blockIdx.x * hidden_size + idx]; + float const out_norm = ((scalar_t)(x * s_variance)) * weight[idx]; + out[blockIdx.x * hidden_size + idx] = + scaled_fp8_conversion(out_norm, scale_inv); + } +} + +} // namespace vllm + +void rms_norm_static_fp8_quant(torch::Tensor& out, // [..., hidden_size] + torch::Tensor& input, // [..., hidden_size] + torch::Tensor& weight, // [hidden_size] + torch::Tensor& scale, // [1] + double epsilon) { + int hidden_size = input.size(-1); + int num_tokens = input.numel() / hidden_size; + + dim3 grid(num_tokens); + dim3 block(std::min(hidden_size, 1024)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] { + vllm::rms_norm_static_fp8_quant_kernel + <<>>( + out.data_ptr(), input.data_ptr(), + weight.data_ptr(), scale.data_ptr(), 
epsilon, + num_tokens, hidden_size); + }); +} + +#define LAUNCH_FUSED_ADD_RMS_NORM(width) \ + VLLM_DISPATCH_FLOATING_TYPES( \ + input.scalar_type(), "fused_add_rms_norm_kernel", [&] { \ + vllm::fused_add_rms_norm_static_fp8_quant_kernel \ + <<>>( \ + out.data_ptr(), input.data_ptr(), \ + residual.data_ptr(), weight.data_ptr(), \ + scale.data_ptr(), epsilon, num_tokens, hidden_size); \ + }); + +void fused_add_rms_norm_static_fp8_quant( + torch::Tensor& out, // [..., hidden_size], + torch::Tensor& input, // [..., hidden_size] + torch::Tensor& residual, // [..., hidden_size] + torch::Tensor& weight, // [hidden_size] + torch::Tensor& scale, // [1] + double epsilon) { + int hidden_size = input.size(-1); + int num_tokens = input.numel() / hidden_size; + + dim3 grid(num_tokens); + /* This kernel is memory-latency bound in many scenarios. + When num_tokens is large, a smaller block size allows + for increased block occupancy on CUs and better latency + hiding on global mem ops. */ + const int max_block_size = (num_tokens < 256) ? 1024 : 256; + dim3 block(std::min(hidden_size, max_block_size)); + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + /*If the tensor types are FP16/BF16, try to use the optimized kernel + with packed + vectorized ops. + Max optimization is achieved with a width-8 vector of FP16/BF16s + since we can load at most 128 bits at once in a global memory op. + However, this requires each tensor's data to be aligned to 16 + bytes. + */ + auto inp_ptr = reinterpret_cast(input.data_ptr()); + auto res_ptr = reinterpret_cast(residual.data_ptr()); + auto wt_ptr = reinterpret_cast(weight.data_ptr()); + bool ptrs_are_aligned = + inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0; + if (ptrs_are_aligned && hidden_size % 8 == 0) { + LAUNCH_FUSED_ADD_RMS_NORM(8); + } else { + LAUNCH_FUSED_ADD_RMS_NORM(0); + } +} diff --git a/csrc/ops.h b/csrc/ops.h index e0775ee1891df..672e608e9c47e 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -56,6 +56,16 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight, void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual, torch::Tensor& weight, double epsilon); +void rms_norm_static_fp8_quant(torch::Tensor& out, torch::Tensor& input, + torch::Tensor& weight, torch::Tensor& scale, + double epsilon); + +void fused_add_rms_norm_static_fp8_quant(torch::Tensor& out, + torch::Tensor& input, + torch::Tensor& residual, + torch::Tensor& weight, + torch::Tensor& scale, double epsilon); + void rotary_embedding(torch::Tensor& positions, torch::Tensor& query, torch::Tensor& key, int64_t head_size, torch::Tensor& cos_sin_cache, bool is_neox); diff --git a/csrc/quantization/fp8/common.cu b/csrc/quantization/fp8/common.cu index f2c609c1b68c3..e4f6615ede1ee 100644 --- a/csrc/quantization/fp8/common.cu +++ b/csrc/quantization/fp8/common.cu @@ -1,185 +1,16 @@ -#include -#include -#include - -#include - -#include "cuda_compat.h" +#include "common.cuh" #include "dispatch_utils.h" +#include + #ifndef USE_ROCM - #include #include #else - #include #include #endif -#ifndef USE_ROCM -using FP8_TYPE = c10::Float8_e4m3fn; -C10_HOST_DEVICE constexpr auto FP8_E4M3_MAX = - std::numeric_limits::max(); -#else - #include "amd/hip_float8.h" -using FP8_TYPE = c10::Float8_e4m3fnuz; -// Using the default max value from pytorch (240.0) will cause accuracy -// issue when running dynamic quantization. Here use 224.0f for rocm. 
-constexpr auto FP8_E4M3_MAX = 224.0f; -#endif - namespace vllm { -__device__ __forceinline__ float atomicMaxFloat(float* addr, float value) { - float old; - old = (value >= 0) - ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) - : __uint_as_float( - atomicMin((unsigned int*)addr, __float_as_uint(value))); - - return old; -} - -template -__device__ __forceinline__ FP8_TYPE scaled_fp8_conversion(float const val, - float const scale) { - float x = 0.0f; - if constexpr (is_scale_inverted) { - x = val * scale; - } else { - x = val / scale; - } - - float r = fmax(-FP8_E4M3_MAX, fmin(x, FP8_E4M3_MAX)); -#ifndef USE_ROCM - return static_cast(r); -#else - // Use hardware cvt instruction for fp8 on rocm - return c10::Float8_e4m3fnuz(hip_fp8(r).data, - c10::Float8_e4m3fnuz::from_bits()); -#endif -} - -// Compute the absolute maximum m of the input tensor and store -// m / float8_e4m3::max() in *scale. Each thread block performs a -// reduction tree and the memory in scale is atomically updated. -// So to get the right answer, *scale needs to be initialized to -// a value <= 0.0 and we need to wait for all thread blocks to -// finish before consuming *scale. -template -__global__ void segmented_max_reduction(float* __restrict__ scale, - const scalar_t* __restrict__ input, - int64_t num_elems) { - __shared__ float cache[1024]; - int64_t i = blockDim.x * blockIdx.x + threadIdx.x; - - // First store maximum for all values processes by - // the current thread in cache[threadIdx.x] - scalar_t tmp = 0.0; - while (i < num_elems) { - float x = static_cast(input[i]); - tmp = max(tmp, fabs(x)); - i += blockDim.x * gridDim.x; - } - cache[threadIdx.x] = tmp; - - __syncthreads(); - - // Now perform parallel reduction within the thread block - int ib = blockDim.x / 2; - while (ib != 0) { - if (threadIdx.x < ib && cache[threadIdx.x + ib] > cache[threadIdx.x]) { - cache[threadIdx.x] = cache[threadIdx.x + ib]; - } - __syncthreads(); - ib /= 2; - } - // Finally, since cache[0] contains the maximum for this thread block, - // atomically write the max to the target location - if (threadIdx.x == 0) { - atomicMaxFloat(scale, cache[0] / FP8_E4M3_MAX); - } -} - -template -struct __align__(8) vec4_t { - scalar_t x; - scalar_t y; - scalar_t z; - scalar_t w; -}; - -typedef struct __align__(4) { - FP8_TYPE x; - FP8_TYPE y; - FP8_TYPE z; - FP8_TYPE w; -} -float8x4_t; - -template -__device__ float thread_max_vec(scalar_t const* __restrict__ input, - int64_t const num_elems, int const tid, - int const step) { - // Vectorized input/output to better utilize memory bandwidth. - vec4_t const* vectorized_in = - reinterpret_cast const*>(input); - - int64_t const num_vec_elems = num_elems >> 2; - float absmax_val = 0.0f; - -#pragma unroll 4 - for (int64_t i = tid; i < num_vec_elems; i += step) { - vec4_t in_vec = vectorized_in[i]; - absmax_val = max(absmax_val, fabs(in_vec.x)); - absmax_val = max(absmax_val, fabs(in_vec.y)); - absmax_val = max(absmax_val, fabs(in_vec.z)); - absmax_val = max(absmax_val, fabs(in_vec.w)); - } - - // Handle the remaining elements if num_elems is not divisible by 4 - for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) { - absmax_val = max(absmax_val, fabs(input[i])); - } - - return absmax_val; -} - -template -__device__ void scaled_fp8_conversion_vec(FP8_TYPE* __restrict__ out, - scalar_t const* __restrict__ input, - float const scale, - int64_t const num_elems, - int const tid, int const step) { - // Vectorized input/output to better utilize memory bandwidth. 
- vec4_t const* vectorized_in = - reinterpret_cast const*>(input); - float8x4_t* vectorized_out = reinterpret_cast(out); - - int64_t const num_vec_elems = num_elems >> 2; - -#pragma unroll 4 - for (int64_t i = tid; i < num_vec_elems; i += step) { - vec4_t in_vec = vectorized_in[i]; - float8x4_t out_vec; - - out_vec.x = scaled_fp8_conversion( - static_cast(in_vec.x), scale); - out_vec.y = scaled_fp8_conversion( - static_cast(in_vec.y), scale); - out_vec.z = scaled_fp8_conversion( - static_cast(in_vec.z), scale); - out_vec.w = scaled_fp8_conversion( - static_cast(in_vec.w), scale); - vectorized_out[i] = out_vec; - } - - // Handle the remaining elements if num_elems is not divisible by 4 - for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) { - out[i] = scaled_fp8_conversion( - static_cast(input[i]), scale); - } -} - template __global__ void scaled_fp8_quant_kernel(FP8_TYPE* __restrict__ out, const scalar_t* __restrict__ input, diff --git a/csrc/quantization/fp8/common.cuh b/csrc/quantization/fp8/common.cuh new file mode 100644 index 0000000000000..d7c0297d5333f --- /dev/null +++ b/csrc/quantization/fp8/common.cuh @@ -0,0 +1,172 @@ +#pragma once + +#include + +#ifndef USE_ROCM + #include +using FP8_TYPE = c10::Float8_e4m3fn; +C10_HOST_DEVICE constexpr auto FP8_E4M3_MAX = + std::numeric_limits::max(); +#else + #include + #include "amd/hip_float8.h" +using FP8_TYPE = c10::Float8_e4m3fnuz; +// Using the default max value from pytorch (240.0) will cause accuracy +// issue when running dynamic quantization. Here use 224.0f for rocm. +constexpr auto FP8_E4M3_MAX = 224.0f; +#endif + +namespace vllm { + +__device__ __forceinline__ float atomicMaxFloat(float* addr, float value) { + float old; + old = (value >= 0) + ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) + : __uint_as_float( + atomicMin((unsigned int*)addr, __float_as_uint(value))); + + return old; +} + +template +__device__ __forceinline__ FP8_TYPE scaled_fp8_conversion(float const val, + float const scale) { + float x = 0.0f; + if constexpr (is_scale_inverted) { + x = val * scale; + } else { + x = val / scale; + } + + float r = fmax(-FP8_E4M3_MAX, fmin(x, FP8_E4M3_MAX)); +#ifndef USE_ROCM + return static_cast(r); +#else + // Use hardware cvt instruction for fp8 on rocm + return c10::Float8_e4m3fnuz(hip_fp8(r).data, + c10::Float8_e4m3fnuz::from_bits()); +#endif +} + +// Compute the absolute maximum m of the input tensor and store +// m / float8_e4m3::max() in *scale. Each thread block performs a +// reduction tree and the memory in scale is atomically updated. +// So to get the right answer, *scale needs to be initialized to +// a value <= 0.0 and we need to wait for all thread blocks to +// finish before consuming *scale. 
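+// Illustrative host-side sketch of that protocol (simplified; the launch
+// parameters here are placeholders, not part of this file):
+//   scale.zero_();                                // any value <= 0.0 works
+//   vllm::segmented_max_reduction<scalar_t>
+//       <<<grid, block, 0, stream>>>(scale_ptr, input_ptr, num_elems);
+//   // ...then launch the quant kernel on the same stream; it consumes the
+//   // finished *scale.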
+template +__global__ void segmented_max_reduction(float* __restrict__ scale, + const scalar_t* __restrict__ input, + int64_t num_elems) { + __shared__ float cache[1024]; + int64_t i = blockDim.x * blockIdx.x + threadIdx.x; + + // First store maximum for all values processes by + // the current thread in cache[threadIdx.x] + scalar_t tmp = 0.0; + while (i < num_elems) { + float x = static_cast(input[i]); + tmp = max(tmp, fabs(x)); + i += blockDim.x * gridDim.x; + } + cache[threadIdx.x] = tmp; + + __syncthreads(); + + // Now perform parallel reduction within the thread block + int ib = blockDim.x / 2; + while (ib != 0) { + if (threadIdx.x < ib && cache[threadIdx.x + ib] > cache[threadIdx.x]) { + cache[threadIdx.x] = cache[threadIdx.x + ib]; + } + __syncthreads(); + ib /= 2; + } + // Finally, since cache[0] contains the maximum for this thread block, + // atomically write the max to the target location + if (threadIdx.x == 0) { + atomicMaxFloat(scale, cache[0] / FP8_E4M3_MAX); + } +} + +template +struct __align__(8) vec4_t { + scalar_t x; + scalar_t y; + scalar_t z; + scalar_t w; +}; + +typedef struct __align__(4) { + FP8_TYPE x; + FP8_TYPE y; + FP8_TYPE z; + FP8_TYPE w; +} +float8x4_t; + +template +__device__ float thread_max_vec(scalar_t const* __restrict__ input, + int64_t const num_elems, int const tid, + int const step) { + // Vectorized input/output to better utilize memory bandwidth. + vec4_t const* vectorized_in = + reinterpret_cast const*>(input); + + int64_t const num_vec_elems = num_elems >> 2; + float absmax_val = 0.0f; + +#pragma unroll 4 + for (int64_t i = tid; i < num_vec_elems; i += step) { + vec4_t in_vec = vectorized_in[i]; + absmax_val = max(absmax_val, fabs(in_vec.x)); + absmax_val = max(absmax_val, fabs(in_vec.y)); + absmax_val = max(absmax_val, fabs(in_vec.z)); + absmax_val = max(absmax_val, fabs(in_vec.w)); + } + + // Handle the remaining elements if num_elems is not divisible by 4 + for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) { + absmax_val = max(absmax_val, fabs(input[i])); + } + + return absmax_val; +} + +template +__device__ void scaled_fp8_conversion_vec(FP8_TYPE* __restrict__ out, + scalar_t const* __restrict__ input, + float const scale, + int64_t const num_elems, + int const tid, int const step) { + // Vectorized input/output to better utilize memory bandwidth. 
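+  // The vectorized loop below converts four scalar_t values per iteration
+  // and writes them back as a single float8x4_t, so most of the tensor is
+  // covered by 4-wide accesses; the scalar loop at the end only handles the
+  // tail when num_elems is not a multiple of 4.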
+ vec4_t const* vectorized_in = + reinterpret_cast const*>(input); + float8x4_t* vectorized_out = reinterpret_cast(out); + + int64_t const num_vec_elems = num_elems >> 2; + +#pragma unroll 4 + for (int64_t i = tid; i < num_vec_elems; i += step) { + vec4_t in_vec = vectorized_in[i]; + float8x4_t out_vec; + + out_vec.x = scaled_fp8_conversion( + static_cast(in_vec.x), scale); + out_vec.y = scaled_fp8_conversion( + static_cast(in_vec.y), scale); + out_vec.z = scaled_fp8_conversion( + static_cast(in_vec.z), scale); + out_vec.w = scaled_fp8_conversion( + static_cast(in_vec.w), scale); + vectorized_out[i] = out_vec; + } + + // Handle the remaining elements if num_elems is not divisible by 4 + for (int64_t i = num_vec_elems * 4 + tid; i < num_elems; i += step) { + out[i] = scaled_fp8_conversion( + static_cast(input[i]), scale); + } +} + +} // namespace vllm \ No newline at end of file diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 971a45d50ffa4..229fd554d3eee 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -101,7 +101,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Layernorm // Apply Root Mean Square (RMS) Normalization to the input tensor. ops.def( - "rms_norm(Tensor! out, Tensor input, Tensor weight, float epsilon) -> " + "rms_norm(Tensor! result, Tensor input, Tensor weight, float epsilon) -> " "()"); ops.impl("rms_norm", torch::kCUDA, &rms_norm); @@ -111,6 +111,23 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { "float epsilon) -> ()"); ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm); + // Layernorm-quant + // Apply Root Mean Square (RMS) Normalization to the input tensor. + ops.def( + "rms_norm_static_fp8_quant(Tensor! result, Tensor input, Tensor weight, " + "Tensor scale, float epsilon) -> " + "()"); + ops.impl("rms_norm_static_fp8_quant", torch::kCUDA, + &rms_norm_static_fp8_quant); + + // In-place fused Add and RMS Normalization. + ops.def( + "fused_add_rms_norm_static_fp8_quant(Tensor! result, Tensor input, " + "Tensor! residual, Tensor weight, " + "Tensor scale, float epsilon) -> ()"); + ops.impl("fused_add_rms_norm_static_fp8_quant", torch::kCUDA, + &fused_add_rms_norm_static_fp8_quant); + // Rotary embedding // Apply GPT-NeoX or GPT-J style rotary embedding to query and key. ops.def( @@ -322,18 +339,20 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Compute FP8 quantized tensor for given scaling factor. ops.def( - "static_scaled_fp8_quant(Tensor! out, Tensor input, Tensor scale) -> ()"); + "static_scaled_fp8_quant(Tensor! result, Tensor input, Tensor scale) -> " + "()"); ops.impl("static_scaled_fp8_quant", torch::kCUDA, &static_scaled_fp8_quant); // Compute dynamic-per-tensor FP8 quantized tensor and scaling factor. ops.def( - "dynamic_scaled_fp8_quant(Tensor! out, Tensor input, Tensor! scale) -> " + "dynamic_scaled_fp8_quant(Tensor! result, Tensor input, Tensor! scale) " + "-> " "()"); ops.impl("dynamic_scaled_fp8_quant", torch::kCUDA, &dynamic_scaled_fp8_quant); // Compute dynamic-per-token FP8 quantized tensor and scaling factor. ops.def( - "dynamic_per_token_scaled_fp8_quant(Tensor! out, Tensor input, " + "dynamic_per_token_scaled_fp8_quant(Tensor! result, Tensor input, " "Tensor! scale, Tensor? scale_ub) -> " "()"); ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA, @@ -341,13 +360,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // Compute int8 quantized tensor for given scaling factor. ops.def( - "static_scaled_int8_quant(Tensor! 
out, Tensor input, Tensor scale," + "static_scaled_int8_quant(Tensor! result, Tensor input, Tensor scale," "Tensor? azp) -> ()"); ops.impl("static_scaled_int8_quant", torch::kCUDA, &static_scaled_int8_quant); // Compute int8 quantized tensor and scaling factor ops.def( - "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, " + "dynamic_scaled_int8_quant(Tensor! result, Tensor input, Tensor! scale, " "Tensor!? azp) -> ()"); ops.impl("dynamic_scaled_int8_quant", torch::kCUDA, &dynamic_scaled_int8_quant); diff --git a/csrc/type_convert.cuh b/csrc/type_convert.cuh new file mode 100644 index 0000000000000..21b9d0ae515df --- /dev/null +++ b/csrc/type_convert.cuh @@ -0,0 +1,165 @@ +#pragma once + +#include + +#ifndef USE_ROCM + #include + #include +#else + #include + #include + +using __nv_bfloat16 = __hip_bfloat16; +using __nv_bfloat162 = __hip_bfloat162; +#endif + +namespace vllm { +/* Converter structs for the conversion from torch types to HIP/CUDA types, + and the associated type conversions within HIP/CUDA. These helpers need + to be implemented for now because the relevant type conversion + operators/constructors are not consistently implemented by HIP/CUDA, so + a generic conversion via type casts cannot be implemented. + + Each struct should have the member static constexpr bool `exists`: + If false, the optimized kernel is not used for the corresponding torch type. + If true, the struct should be fully defined as shown in the examples below. + */ +template +struct _typeConvert { + static constexpr bool exists = false; +}; + +#if defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12000)) +// CUDA < 12.0 runs into issues with packed type conversion +template <> +struct _typeConvert { + static constexpr bool exists = true; + using hip_type = __half; + using packed_hip_type = __half2; + + __device__ static inline float convert(hip_type x) { return __half2float(x); } + __device__ static inline float2 convert(packed_hip_type x) { + return __half22float2(x); + } + __device__ static inline hip_type convert(float x) { + return __float2half_rn(x); + } + __device__ static inline packed_hip_type convert(float2 x) { + return __float22half2_rn(x); + } +}; + + #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +// CUDA_ARCH < 800 does not have BF16 support +// TODO: Add in ROCm support once public headers handle bf16 maturely +template <> +struct _typeConvert { + static constexpr bool exists = true; + using hip_type = __nv_bfloat16; + using packed_hip_type = __nv_bfloat162; + + __device__ static inline float convert(hip_type x) { + return __bfloat162float(x); + } + __device__ static inline float2 convert(packed_hip_type x) { + return __bfloat1622float2(x); + } + __device__ static inline hip_type convert(float x) { + return __float2bfloat16(x); + } + __device__ static inline packed_hip_type convert(float2 x) { + return __float22bfloat162_rn(x); + } +}; + #endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +#endif // defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= + // 12000)) + +/* Vector POD struct to generate vectorized and packed FP16/BF16 ops + for appropriate specializations of fused_add_rms_norm_kernel. + Only functions that are necessary in that kernel are implemented. + Alignment to 16 bytes is required to use 128-bit global memory ops. 
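+   For example, a width-8 vector of 16-bit half/bfloat16 elements occupies
+   exactly 16 bytes, so one aligned load or store moves a full 128 bits.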
+ */ +template +struct alignas(16) _f16Vec { + /* Not theoretically necessary that width is a power of 2 but should + almost always be the case for optimization purposes */ + static_assert(width > 0 && (width & (width - 1)) == 0, + "Width is not a positive power of 2!"); + using Converter = _typeConvert; + using T1 = typename Converter::hip_type; + using T2 = typename Converter::packed_hip_type; + T1 data[width]; + + __device__ _f16Vec& operator+=(const _f16Vec& other) { + if constexpr (width % 2 == 0) { +#pragma unroll + for (int i = 0; i < width; i += 2) { + T2 temp{data[i], data[i + 1]}; + temp += T2{other.data[i], other.data[i + 1]}; + data[i] = temp.x; + data[i + 1] = temp.y; + } + } else { +#pragma unroll + for (int i = 0; i < width; ++i) data[i] += other.data[i]; + } + return *this; + } + + __device__ _f16Vec& operator*=(const _f16Vec& other) { + if constexpr (width % 2 == 0) { +#pragma unroll + for (int i = 0; i < width; i += 2) { + T2 temp{data[i], data[i + 1]}; + temp *= T2{other.data[i], other.data[i + 1]}; + data[i] = temp.x; + data[i + 1] = temp.y; + } + } else { +#pragma unroll + for (int i = 0; i < width; ++i) data[i] *= other.data[i]; + } + return *this; + } + + __device__ _f16Vec& operator*=(const float scale) { + if constexpr (width % 2 == 0) { +#pragma unroll + for (int i = 0; i < width; i += 2) { + float2 temp_f = Converter::convert(T2{data[i], data[i + 1]}); + temp_f.x *= scale; + temp_f.y *= scale; + T2 temp = Converter::convert(temp_f); + data[i] = temp.x; + data[i + 1] = temp.y; + } + } else { +#pragma unroll + for (int i = 0; i < width; ++i) { + float temp = Converter::convert(data[i]) * scale; + data[i] = Converter::convert(temp); + } + } + return *this; + } + + __device__ float sum_squares() const { + float result = 0.0f; + if constexpr (width % 2 == 0) { +#pragma unroll + for (int i = 0; i < width; i += 2) { + float2 z = Converter::convert(T2{data[i], data[i + 1]}); + result += z.x * z.x + z.y * z.y; + } + } else { +#pragma unroll + for (int i = 0; i < width; ++i) { + float x = Converter::convert(data[i]); + result += x * x; + } + } + return result; + } +}; +} // namespace vllm \ No newline at end of file diff --git a/tests/compile/backend.py b/tests/compile/backend.py new file mode 100644 index 0000000000000..9d5c68274374e --- /dev/null +++ b/tests/compile/backend.py @@ -0,0 +1,33 @@ +from copy import deepcopy +from typing import Callable + +import torch + + +class TestBackend: + """ + This class provides a simple Inductor backend that can be used for testing. + It takes a list of custom passes and runs them after Inductor's passes. + It also saves the graph before and after the custom passes for inspection. 
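+
+    Typical use (see test_fusion.py in this directory):
+
+        backend = TestBackend(reshape_pass, fusion_pass)
+        compiled = torch.compile(model, backend=backend)
+        compiled(x)  # graph_pre_pass / graph_post_pass are now populated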
+ """ + + def __init__(self, *args: Callable[[torch.fx.Graph], None]): + self.custom_passes = args + from torch._inductor import config + self.current_config = config.shallow_copy_dict() + self.current_config['post_grad_custom_post_pass'] = self.post_pass + + def __call__(self, graph: torch.fx.GraphModule, example_inputs): + from torch._inductor.compile_fx import compile_fx + return compile_fx(graph, + example_inputs, + config_patches=self.current_config) + + def post_pass(self, graph: torch.fx.Graph): + self.graph_pre_pass = deepcopy(graph) + for pass_ in self.custom_passes: + pass_(graph) + + self.graph_post_pass = deepcopy(graph) + # assign by reference, will reflect the final state of the graph + self.final_graph = graph diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py new file mode 100644 index 0000000000000..e4d3defafb951 --- /dev/null +++ b/tests/compile/test_fusion.py @@ -0,0 +1,92 @@ +import pytest +import torch +from compressed_tensors.quantization import FP8_DTYPE + +import vllm.envs as envs +from vllm.compilation.config import CompilationConfig +from vllm.compilation.fusion import (FusionPass, find_auto_fn, + find_auto_fn_maybe) +from vllm.compilation.reshapes import RedundantReshapesPass +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( + apply_fp8_linear) + +from .backend import TestBackend + + +class TestModel(torch.nn.Module): + + def __init__(self, hidden_size: int, eps: float, *args, **kwargs): + super().__init__(*args, **kwargs) + self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)] + self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(4)] + self.w = [ + torch.rand(hidden_size, hidden_size).to(dtype=FP8_DTYPE).t() + for _ in range(2) + ] + + def forward(self, x): + resid = torch.relu(x) + y = self.norm[0](x) + + x2 = apply_fp8_linear(y, self.w[0], self.scale[0], self.scale[1]) + # make sure resid is used for replacement to work + y2, resid = self.norm[1](x2, resid) + + x3 = apply_fp8_linear(y2, self.w[1], self.scale[2], self.scale[3]) + y3, resid = self.norm[2](x3, resid) # use resid here + return y3 + + +# Init does pattern registration, which can only happen once +config = CompilationConfig(enable_fusion=True) +reshape_pass = RedundantReshapesPass(config) +fusion_pass = FusionPass.instance(config) + + +@pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("hidden_size", [64, 3392, 4096]) +@pytest.mark.parametrize("num_tokens", [7, 256, 533, 2048, 2049]) +@pytest.mark.parametrize("eps", [1e-5, 1e-6]) +@pytest.mark.skipif(envs.VLLM_TARGET_DEVICE != "cuda", + reason="Only test on CUDA") +def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps): + torch.set_default_device("cuda") + torch.set_default_dtype(torch.float16) + + if eps != 1e-5: + pytest.skip("Only test eps=1e-5 for now") + + # Reshape pass is needed for the fusion pass to work + backend = TestBackend(reshape_pass, fusion_pass) + model = TestModel(hidden_size, eps) + + # First dimension dynamic + x = torch.rand(num_tokens, hidden_size) + torch._dynamo.mark_dynamic(x, 0) + + result = model(x) + + model2 = torch.compile(model, backend=backend) + result2 = model2(x) + + # Check that it gives the same answer + torch.testing.assert_close(result, result2, atol=1e-3, rtol=1e-3) + + # Check substitution worked + pre_nodes = backend.graph_pre_pass.nodes + post_nodes = backend.graph_post_pass.nodes + + rms_quant = 
torch.ops._C.rms_norm_static_fp8_quant.default + add_rms_quant = torch.ops._C.fused_add_rms_norm_static_fp8_quant.default + fp8_quant = torch.ops._C.static_scaled_fp8_quant.default + + # In pre-nodes, fp8 quant should be present and fused kernels should not + assert find_auto_fn_maybe(pre_nodes, rms_quant) is None + assert find_auto_fn_maybe(pre_nodes, add_rms_quant) is None + find_auto_fn(pre_nodes, fp8_quant) + + # In post-nodes, fused kernels should be present and fp8 quant should not + find_auto_fn(post_nodes, rms_quant) + find_auto_fn(post_nodes, add_rms_quant) + assert find_auto_fn_maybe(post_nodes, fp8_quant) is None diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/test_layernorm.py index 9dfa2cbe45e94..727769e071842 100644 --- a/tests/kernels/test_layernorm.py +++ b/tests/kernels/test_layernorm.py @@ -1,13 +1,14 @@ import pytest import torch +from tests.kernels.quant_utils import FP8_DTYPE from tests.kernels.utils import opcheck from vllm.model_executor.layers.layernorm import RMSNorm from vllm.platforms import current_platform DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing -HIDDEN_SIZES = [768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, +HIDDEN_SIZES = [8, 768, 769, 770, 771, 5120, 5124, 5125, 5126, 8192, 8199] # Arbitrary values for testing ADD_RESIDUAL = [False, True] SEEDS = [0] @@ -59,3 +60,75 @@ def test_rms_norm( else: opcheck(torch.ops._C.rms_norm, (out, x, layer.weight.data, layer.variance_epsilon)) + + +@pytest.mark.parametrize("num_tokens", NUM_TOKENS) +@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) +@pytest.mark.parametrize("add_residual", ADD_RESIDUAL) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("quant_scale", [1.0, 0.01, 10.0]) +@pytest.mark.parametrize("seed", SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_fused_rms_norm_quant( + num_tokens: int, + hidden_size: int, + add_residual: bool, + dtype: torch.dtype, + quant_scale: float, + seed: int, + device: str, +) -> None: + current_platform.seed_everything(seed) + torch.set_default_device(device) + + weight = torch.empty(hidden_size, dtype=dtype).normal_(mean=1.0, std=0.1) + scale = 1 / (2 * hidden_size) + x = torch.randn(num_tokens, hidden_size, dtype=dtype) + x *= scale + if add_residual: + residual = torch.randn_like(x) * scale + residual_fused = residual.clone() + else: + residual = residual_fused = None + + out_norm = torch.empty_like(x) + out_quant = torch.empty_like(x, dtype=FP8_DTYPE) + out_quant_fused = torch.empty_like(out_quant) + + quant_scale_t = torch.tensor(quant_scale, dtype=torch.float32) + + if add_residual: + torch.ops._C.fused_add_rms_norm_static_fp8_quant( + out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6) + + # Unfused kernel is in-place so it goes second + # Also use a separate clone of x to avoid modifying the input + x_unfused = x.clone() + torch.ops._C.fused_add_rms_norm(x_unfused, residual, weight, 1e-6) + torch.ops._C.static_scaled_fp8_quant(out_quant, x_unfused, + quant_scale_t) + + torch.cuda.synchronize() + torch.testing.assert_close(residual_fused, + residual, + atol=1e-2, + rtol=1e-2) + + opcheck( + torch.ops._C.fused_add_rms_norm_static_fp8_quant, + (out_quant_fused, x, residual_fused, weight, quant_scale_t, 1e-6)) + else: + torch.ops._C.rms_norm_static_fp8_quant(out_quant_fused, x, weight, + quant_scale_t, 1e-6) + + torch.ops._C.rms_norm(out_norm, x, weight, 1e-6) + torch.ops._C.static_scaled_fp8_quant(out_quant, out_norm, + quant_scale_t) + + 
opcheck(torch.ops._C.rms_norm_static_fp8_quant, + (out_quant_fused, x, weight, quant_scale_t, 1e-6)) + + torch.testing.assert_close(out_quant_fused.to(dtype=torch.float32), + out_quant.to(dtype=torch.float32), + atol=1e-3, + rtol=1e-3) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index abd1d16accaf7..f5fff344a1f48 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -2,7 +2,8 @@ import dataclasses import operator from contextlib import ExitStack -from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union +from typing import (Any, Callable, Dict, List, Optional, Sequence, Set, Tuple, + Union) from unittest.mock import patch import torch @@ -10,11 +11,13 @@ import vllm.envs as envs from vllm.logger import init_logger -from vllm.utils import weak_ref_tensors +from vllm.utils import combine_fx_passes, weak_ref_tensors from .config import CompilationConfig from .counter import compilation_counter +from .fusion import FusionPass from .levels import CompilationLevel +from .reshapes import RedundantReshapesPass logger = init_logger(__name__) @@ -99,28 +102,74 @@ def fix_functionalization(graph: fx.Graph): user.replace_all_uses_with(replace_node) nodes_to_remove.append(user) nodes_to_remove.append(node) + elif (node.args[0] == + torch.ops._C.fused_add_rms_norm_static_fp8_quant.default): + # manual replace for fused_add_rms_norm_static_fp8_quant + # this is the most effective optimization for llama + # failing to do this will result in many unnecessary copies + + kwargs = node.kwargs + + result = kwargs['result'] + residual = kwargs['residual'] + + # Create a new call to + # torch.ops._C.fused_add_rms_norm_static_fp8_quant.default + with graph.inserting_before(node): + # just insert the call to the custom op + # NOTE: don't run dead code elimination, + # otherwise this op will be removed + graph.call_function( + torch.ops._C.fused_add_rms_norm_static_fp8_quant. 
+ default, + kwargs=kwargs) + + for user in list(node.users): + if user.op == 'call_function' and user.target == operator.getitem: # noqa + # Remove the getitem node + if user.args[1] == 1: + replace_node = result + elif user.args[1] == 2: + replace_node = residual + user.replace_all_uses_with(replace_node) + nodes_to_remove.append(user) + nodes_to_remove.append(node) elif node.args[0] == torch.ops._C.rms_norm.default: # manual replace for rms_norm kwargs = node.kwargs - input = kwargs['input'] - out = kwargs['out'] - weight = kwargs['weight'] - epsilon = kwargs['epsilon'] - # Create a new call to torch.ops._C.rotary_embedding.default - # cannot use kwargs, because we have an `out`, see https://github.com/pytorch/pytorch/blob/a00faf440888ffb724bad413f329a49e2b6388e7/torch/_inductor/lowering.py#L351 # noqa + replace_node = kwargs['result'] + # Create a new call to torch.ops._C.rms_norm.default with graph.inserting_before(node): # just insert the call to the custom op # NOTE: don't run dead code elimination, # otherwise this op will be removed - graph.call_function( - torch.ops._C.rms_norm.default, - args=(out, input, weight, epsilon), - ) + graph.call_function(torch.ops._C.rms_norm.default, + kwargs=kwargs) - replace_node = out + for user in list(node.users): + if user.op == 'call_function' and user.target == operator.getitem: # noqa + user.replace_all_uses_with(replace_node) + nodes_to_remove.append(user) + nodes_to_remove.append(node) + + elif node.args[ + 0] == torch.ops._C.rms_norm_static_fp8_quant.default: # noqa + # manual replace for rms_norm_static_fp8_quant + + kwargs = node.kwargs + + replace_node = kwargs['result'] + # Create a new call to torch.ops._C.rms_norm_static_fp8_quant.default # noqa + with graph.inserting_before(node): + # just insert the call to the custom op + # NOTE: don't run dead code elimination, + # otherwise this op will be removed + graph.call_function( + torch.ops._C.rms_norm_static_fp8_quant.default, + kwargs=kwargs) for user in list(node.users): if user.op == 'call_function' and user.target == operator.getitem: # noqa @@ -136,7 +185,7 @@ def fix_functionalization(graph: fx.Graph): input = kwargs['input'] out = kwargs['out'] - # Create a new call to torch.ops._C.rotary_embedding.default + # Create a new call to torch.ops._C.silu_and_mul.default # cannot use kwargs, because we have an `out`, see https://github.com/pytorch/pytorch/blob/a00faf440888ffb724bad413f329a49e2b6388e7/torch/_inductor/lowering.py#L351 # noqa with graph.inserting_before(node): # just insert the call to the custom op @@ -319,6 +368,13 @@ class VllmBackend: The major work of this backend is to split the graph into piecewise graphs, and pass them to the piecewise backend. + + This backend also handles custom passes and adds them to Inductor config. + The order of the post-grad post-passes is: + 1. post_grad_passes (constructor parameter) + 2. config["post_grad_custom_post_pass"] + 3. fix_functionalization + This way, all passes operate on a functionalized graph. 
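+    The passes are assembled in add_passes_to_config() below and handed to
+    Inductor through the "post_grad_custom_post_pass" hook.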
""" compilation_configs: CompilationConfig @@ -330,8 +386,10 @@ class VllmBackend: split_gm: fx.GraphModule piecewise_graphs: List[SplitItem] returned_callable: Callable + # Inductor passes to run on the graph pre-defunctionalization + post_grad_passes: Sequence[Callable] - def __init__(self, ): + def __init__(self, post_grad_passes: Sequence[Callable] = ()): global global_graph_pool if global_graph_pool is None: global_graph_pool = torch.cuda.graph_pool_handle() @@ -340,10 +398,30 @@ def __init__(self, ): # streams, it might not be safe to share a global pool. # only investigate this when we use multiple streams self.graph_pool = global_graph_pool + self.post_grad_passes = post_grad_passes # `torch.compile` is JIT compiled, so we don't need to # do anything here + def add_passes_to_config(self): + config = self.compilation_configs + passes = list(self.post_grad_passes) + + passes = passes + [RedundantReshapesPass(config)] + + if config.enable_fusion: + passes = passes + [FusionPass.instance(config)] + + inductor_config = config.inductor_compile_config + if "post_grad_custom_post_pass" in inductor_config: + passes = passes + [inductor_config["post_grad_custom_post_pass"]] + + # add the fix_functionalization pass last, so that all other + # passes operate on a functionalized graph + passes = passes + [fix_functionalization] + combined_pass = combine_fx_passes(passes) + inductor_config["post_grad_custom_post_pass"] = combined_pass + def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: compilation_counter.num_graphs_seen += 1 @@ -357,6 +435,7 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: # we get the sizes to capture for cudagraph # from compilation context self.compilation_configs = CompilationConfig.select_and_init_config() + self.add_passes_to_config() self.split_gm, self.piecewise_graphs = split_graph( graph, self.compilation_configs.non_cudagraph_ops) diff --git a/vllm/compilation/config.py b/vllm/compilation/config.py index 514f2b93ef64f..72377533140b5 100644 --- a/vllm/compilation/config.py +++ b/vllm/compilation/config.py @@ -1,4 +1,5 @@ import copy +from pathlib import Path from typing import Any, Dict, List, Optional from pydantic import BaseModel, Field, PrivateAttr @@ -50,6 +51,12 @@ class CompilationConfig(BaseModel): name because the config uses json format. If we pass the config from Python, functions can also be passed directly via Python object constructor, e.g. `CompilationConfig(inductor_passes={"a": func})` + - Custom inductor passes: + - dump_graph_stages: list of stages for which we want to dump the graph. + Each pass defines its own stages (before, after, maybe in-between). + - dump_graph_dir: directory to dump the graph. Default is . + - enable_fusion: whether to enable the custom fusion pass. + TODO better pass enabling system. 
Why we have different sizes for cudagraph and inductor: - cudagraph: a cudagraph captured for a specific size can only be used @@ -72,6 +79,10 @@ class CompilationConfig(BaseModel): cudagraph_num_of_warmups: int = 0 cudagraph_capture_sizes: Optional[List[int]] = None + dump_graph_stages: List[str] = Field(default_factory=list) + dump_graph_dir: Path = Field(default=Path(".")) + enable_fusion: bool = True + # not configurable, computed after init compile_sizes: List[int] = PrivateAttr capture_sizes: List[int] = PrivateAttr @@ -81,7 +92,7 @@ def model_post_init(self, __context: Any) -> None: if not isinstance(v, str): assert callable(v), ( f"pass {k} should be a function or a qualified name") - self.inductor_passes[k] = v + self.inductor_compile_config[k] = v continue # resolve function from qualified name @@ -91,18 +102,6 @@ def model_post_init(self, __context: Any) -> None: func = __import__(module).__dict__[func_name] self.inductor_compile_config[k] = func - from vllm.compilation.backends import fix_functionalization - from vllm.utils import combine_fx_passes - if "post_grad_custom_post_pass" in self.inductor_compile_config: - self.inductor_compile_config[ - "post_grad_custom_post_pass"] = combine_fx_passes( - fix_functionalization, - self.inductor_compile_config["post_grad_custom_post_pass"], - ) - else: - self.inductor_compile_config[ - "post_grad_custom_post_pass"] = fix_functionalization - def init_during_runtime(self): """To complete the initialization of config, we need to know the compile context, which is only available diff --git a/vllm/compilation/fusion.py b/vllm/compilation/fusion.py new file mode 100644 index 0000000000000..2a0cf0002c9dd --- /dev/null +++ b/vllm/compilation/fusion.py @@ -0,0 +1,291 @@ +import operator +from typing import Iterable, List, Optional + +import torch +from torch._higher_order_ops.auto_functionalize import auto_functionalized +from torch._inductor.pattern_matcher import (Match, PatternMatcherPass, + fwd_only, register_replacement) + +from vllm.compilation.config import CompilationConfig +from vllm.compilation.inductor_pass import InductorPass +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +def rms_pattern_static(result: torch.Tensor, result_rms: torch.Tensor, + input: torch.Tensor, weight: torch.Tensor, + scale: torch.Tensor): + at1 = auto_functionalized(torch.ops._C.rms_norm.default, + result=result_rms, + input=input, + weight=weight, + epsilon=1e-5) + at2 = auto_functionalized(torch.ops._C.static_scaled_fp8_quant.default, + result=result, + input=at1[1], + scale=scale) + + # result + return at2[1] + + +def rms_replacement_static(result: torch.Tensor, result_rms: torch.Tensor, + input: torch.Tensor, weight: torch.Tensor, + scale: torch.Tensor): + at = auto_functionalized(torch.ops._C.rms_norm_static_fp8_quant.default, + result=result, + input=input, + weight=weight, + scale=scale, + epsilon=1e-5) + + # result + return at[1] + + +def rms_pattern_residual_static(result: torch.Tensor, input: torch.Tensor, + residual: torch.Tensor, weight: torch.Tensor, + scale: torch.Tensor): + at = auto_functionalized(torch.ops._C.fused_add_rms_norm.default, + input=input, + residual=residual, + weight=weight, + epsilon=1e-5) + at1 = auto_functionalized(torch.ops._C.static_scaled_fp8_quant.default, + result=result, + input=at[1], + scale=scale) + + # result, residual + return at1[1], at[2] + + +def rms_replacement_residual_static(result: torch.Tensor, input: torch.Tensor, + residual: torch.Tensor, + weight: torch.Tensor, scale: 
torch.Tensor): + at = auto_functionalized( + torch.ops._C.fused_add_rms_norm_static_fp8_quant.default, + result=result, + input=input, + residual=residual, + weight=weight, + scale=scale, + epsilon=1e-5) + # result, residual + return at[1], at[2] + + +def empty_bf16(*args, **kwargs): + return torch.empty(*args, **kwargs, dtype=torch.bfloat16, device="cuda") + + +def empty_fp8(*args, **kwargs): + fp8 = torch.float8_e4m3fn + return torch.empty(*args, **kwargs, dtype=fp8, device="cuda") + + +def empty_fp32(*args, **kwargs): + return torch.empty(*args, **kwargs, dtype=torch.float32, device="cuda") + + +# Utilities for post-processing multi-output matches +def is_func(node: torch.fx.Node, target) -> bool: + return node.op == "call_function" and node.target == target + + +# Returns the first auto_functionalized node with the given op (if it exists) +def find_auto_fn_maybe(nodes: Iterable[torch.fx.Node], + op) -> Optional[torch.fx.Node]: + for node in nodes: + if is_func(node, auto_functionalized) and node.args[0] == op: # noqa + return node + return None + + +# Returns the first auto_functionalized node with the given op +def find_auto_fn(nodes: Iterable[torch.fx.Node], op) -> torch.fx.Node: + node = find_auto_fn_maybe(nodes, op) + assert node is not None, f"Could not find {op} in nodes {nodes}" + return node + + +# Returns the getitem node that extracts the idx-th element from node +# (if it exists) +def find_getitem_maybe(node: torch.fx.Node, + idx: int) -> Optional[torch.fx.Node]: + for user in node.users: + if is_func(user, operator.getitem) and user.args[1] == idx: + return user + return None + + +# Returns the getitem node that extracts the idx-th element from node +def find_getitem(node: torch.fx.Node, idx: int) -> torch.fx.Node: + ret = find_getitem_maybe(node, idx) + assert ret is not None, f"Could not find getitem {idx} in node {node}" + return ret + + +class FusionPass(InductorPass): + """ + This pass fuses a pre-defined set of custom ops into fused ops. + It uses the torch pattern matcher to find the patterns and replace them. + It also manually processes multi-output matches, as those are broken in + the torch pattern matcher. + + Because patterns can only be registered once, the pass is a singleton. + This will be addressed in a future version of PyTorch: + https://github.com/pytorch/pytorch/pull/139321#issuecomment-2452354980 + """ + + _instance: 'Optional[FusionPass]' = None + + @classmethod + def instance(cls, config: CompilationConfig): + """ + Get the singleton instance of the FusionPass. + If the instance exists, the config is updated but + initialization is not repeated. 
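+
+        Usage (as in tests/compile/test_fusion.py):
+            fusion_pass = FusionPass.instance(
+                CompilationConfig(enable_fusion=True))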
+ """ + if cls._instance is None: + cls._instance = FusionPass(config) + else: + cls._instance.config = config + return cls._instance + + def __init__(self, config: CompilationConfig): + assert self.__class__._instance is None, \ + "FusionPass singleton instance already exists" + super().__init__(config) + + self.matches: List[Match] = [] + self.patterns: PatternMatcherPass = PatternMatcherPass( + pass_name="fusion_pass") + + # Fuse rms_norm + static_scaled_fp8_quant into + # rms_norm_static_fp8_quant + inputs = [ + empty_fp8(5, 4), + empty_bf16(5, 4), + empty_bf16(5, 4), + empty_bf16(1, 5), + empty_fp32(1, 1) + ] + register_replacement(rms_pattern_static, rms_replacement_static, + inputs, fwd_only, self.patterns) + + # Fuse fused_add_rms_norm + static_scaled_fp8_quant into + # fused_add_rms_norm_static_fp8_quant + # Because pattern has 2 outputs, we need to manually process the match + # (see process_matches) + inputs = [ + empty_fp8(5, 4), + empty_bf16(5, 4), + empty_bf16(5, 4), + empty_bf16(1, 5), + empty_fp32(1, 1) + ] + register_replacement(rms_pattern_residual_static, + rms_replacement_residual_static, + inputs, + fwd_only, + self.patterns, + extra_check=lambda m: self.record_match(m)) + + def record_match(self, match: Match) -> bool: + # Hijack the extra_check to record the match and + # save it for post-processing. + self.matches.append(match) + + # Return False to prevent automatic replacement. + return False + + def process_matches(self, graph: torch.fx.Graph): + """ + Manually process multi-output matches and replace them with fused nodes. + This is necessary because the automatic replacement for multi-output + matches is broken: https://github.com/pytorch/pytorch/issues/137280 + """ + for match in self.matches: + # To avoid use-before-definition errors, insert replacement nodes + # after the last node in the match. + # match.nodes is not guaranteed to be sorted. + # Find the last node in the match. + for last_node_in_match in reversed(graph.nodes): + if last_node_in_match in match.nodes: + break + else: + raise ValueError("No nodes in graph") + + # Insert a new auto_functionalized node for the fused operation, + # as well as getitem nodes to extract the result and residual. + # The auto_functionalized node returns a tuple of + # (None, result, residual) - None is the function return value. + # The resulting graph looks like this: + # at = auto_functionalized(torch.ops._C.fused_add_rms_norm_static_fp8_quant.default, ...) # noqa + # result_node_new = at[1] + # residual_node_new = at[2] + with graph.inserting_after(last_node_in_match): + kwargs = match.kwargs + kwargs["epsilon"] = 1e-5 # Currently hard-coded in RMSNorm + + fused_node = graph.call_function( + auto_functionalized, + (torch.ops._C.fused_add_rms_norm_static_fp8_quant.default, + ), + kwargs=kwargs) + + graph.inserting_after(fused_node) + result_node_new = graph.call_function(operator.getitem, + (fused_node, 1)) + residual_node_new = graph.call_function( + operator.getitem, (fused_node, 2)) + + # Last part of replacement is rebinding the users of nodes in the + # match to use the new nodes. 
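+            # Concretely: users of getitem(rms_node, 2) are redirected to
+            # residual_node_new, users of getitem(quant_node, 1) to
+            # result_node_new, and the old getitem nodes fall to DCE below.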
+ + # Find the nodes in the match that we need to rebind + rms_node = find_auto_fn(match.nodes, + torch.ops._C.fused_add_rms_norm.default) + quant_node = find_auto_fn( + match.nodes, torch.ops._C.static_scaled_fp8_quant.default) + + assert len(rms_node.users) == 2 + assert len(quant_node.users) == 1 + + # meta["val"] is used by de-functionalization and has to contain the + # value of the node (tuple of tensors) that would be returned by the + # functionalized node during tracing. + + rms_tup = rms_node.meta["val"] + quant_tup = quant_node.meta["val"] + + # The result of fused_node must be a tuple with the first element + # None (the function return value) and the remaining elements + # representing the mutated inputs. + fused_tup = (None, quant_tup[1], rms_tup[1], rms_tup[2]) + fused_node.meta["val"] = fused_tup + + # Find the getitem nodes and replace their uses with the new nodes. + # The old nodes will be removed by DCE at the end of the pass. + find_getitem(rms_node, 2).replace_all_uses_with(residual_node_new) + find_getitem(quant_node, 1).replace_all_uses_with(result_node_new) + + # Finally, remove matched nodes + graph.eliminate_dead_code() + assert all(node not in graph.nodes for match in self.matches + for node in match.nodes) + + def __call__(self, graph: torch.fx.Graph): + self.dump_graph(graph, "before_fusion") + + count = self.patterns.apply(graph) + logger.info("Replaced %s patterns", count) + self.dump_graph(graph, "after_pattern_match") + + # Manually process multi-output matches (and run DCE) + self.process_matches(graph) + logger.info("Post-processed %s matches", len(self.matches)) + self.dump_graph(graph, "after_fusion") + self.matches.clear() diff --git a/vllm/compilation/inductor_pass.py b/vllm/compilation/inductor_pass.py new file mode 100644 index 0000000000000..b23351fa19759 --- /dev/null +++ b/vllm/compilation/inductor_pass.py @@ -0,0 +1,38 @@ +from abc import ABC, abstractmethod + +import torch + +from vllm.compilation.config import CompilationConfig +# yapf: disable +from vllm.distributed import get_tensor_model_parallel_rank as get_tp_rank +from vllm.distributed import ( + get_tensor_model_parallel_world_size as get_tp_world_size) +from vllm.distributed import model_parallel_is_initialized as p_is_init +# yapf: enable +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +class InductorPass(ABC): + + @abstractmethod + def __call__(self, graph: torch.fx.Graph): + raise NotImplementedError + + def __init__(self, config: CompilationConfig): + self.config = config + + def dump_graph(self, graph: torch.fx.Graph, stage: str): + if stage in self.config.dump_graph_stages: + # Make sure filename includes rank in the distributed setting + parallel = p_is_init() and get_tp_world_size() > 1 + rank = f"-{get_tp_rank()}" if parallel else "" + filepath = self.config.dump_graph_dir / f"{stage}{rank}.py" + + logger.info("Printing graph to %s", filepath) + with open(filepath, "w") as f: + src = graph.python_code(root_module="self", verbose=True).src + # Add imports so it's not full of errors + print("import torch; from torch import device", file=f) + print(src, file=f) diff --git a/vllm/compilation/reshapes.py b/vllm/compilation/reshapes.py new file mode 100644 index 0000000000000..0d284246d2576 --- /dev/null +++ b/vllm/compilation/reshapes.py @@ -0,0 +1,85 @@ +from typing import Union + +import torch.fx +from torch import SymInt + +from vllm.compilation.fusion import is_func +from vllm.compilation.inductor_pass import InductorPass +from vllm.logger import 
init_logger + +logger = init_logger(__name__) + + +class RedundantReshapesPass(InductorPass): + """ + This is an inductor pass that removes redundant reshape operations. + It is required for RMSNorm-quant fusion to work properly. + That's because apply_fp8_linear adds a reshape, which is redundant + in the 2D-case. + + Example graph: + + getitem_1: "f16[s0, 4096]" = ... + view_1: "f16[s0, 4096]" = torch.reshape(getitem_1, [-1, 4096]) + at = auto_functionalized(static_scaled_fp8_quant, input = view_1, ...) + out: "f8e4m3fn[s0, 4096]" = at[1] + + Can be replaced with: + getitem_1: "f16[s0, 4096]" = ... + at = auto_functionalized(static_scaled_fp8_quant, input = getitem_1, ...) + out: "f8e4m3fn[s0, 4096]" = at[1] + """ + + def __call__(self, graph: torch.fx.Graph): + self.dump_graph(graph, "before_reshapes") + count = 0 + # Remove no-op reshapes/views: + for node in graph.nodes: + if is_func(node, torch.ops.aten.reshape.default): + input, shape = node.args[:2] + input_shape = input.meta["val"].shape + if len(shape) != len(input_shape): + # Reshape changing rank, skip + continue + + if shape.count(-1) > 1: + # Invalid reshape args, skip + continue + + if all( + self.dims_equivalent(s, i_s) + for s, i_s in zip(shape, input_shape)): + node.replace_all_uses_with(input) + graph.erase_node(node) + count += 1 + + logger.info("Removed %s no-op reshapes", count) + + self.dump_graph(graph, "after_reshapes") + + def dims_equivalent(self, dim: Union[int, torch.fx.Node], + i_dim: Union[int, SymInt]) -> bool: + """ + This function checks if two dimensions are equivalent. + :param dim: The dimension arg to reshape + :param i_dim: The corresponding dimension in the input tensor + :return: Are the dimensions equivalent? + + There are three cases in which the dimensions are equivalent: + 1. The dimensions are equal (both integers) + 2. The reshape dimension is -1 (i.e. inferred) + 3. The dimensions both correspond to the same SymInt + + While case 2 does not guarantee the dimensions are equal, + they are equal if all other dimensions are equal. + + In case 3, the reshape dimension is a torch.fx.Node, + and its value is a SymInt. That value is equal to the + input dimension. + + """ + # Case 1 and 2 + if dim == i_dim or dim == -1: + return True + # Case 3 + return isinstance(dim, torch.fx.Node) and dim.meta["val"] == i_dim diff --git a/vllm/envs.py b/vllm/envs.py index 9e596a699e466..154246c69f165 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -68,6 +68,7 @@ VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False VLLM_SKIP_P2P_CHECK: bool = False VLLM_TORCH_COMPILE_LEVEL: int = 0 + VLLM_TORCH_COMPILE_CONFIG: Optional[str] = None VLLM_CUSTOM_OPS: List[str] = [] VLLM_DISABLED_KERNELS: List[str] = [] VLLM_USE_V1: bool = False @@ -226,6 +227,7 @@ def get_default_config_root(): # and disabled when running with Inductor (compile_level >= Inductor). 
"VLLM_CUSTOM_OPS": lambda: os.environ.get("VLLM_CUSTOM_OPS", "").replace(" ", "").split(","), + # local rank of the process in the distributed setting, used to determine # the GPU device id "LOCAL_RANK": From 10b67d865d92e376956345becafc249d4c3c0ab7 Mon Sep 17 00:00:00 2001 From: bnellnm <49004751+bnellnm@users.noreply.github.com> Date: Fri, 8 Nov 2024 17:44:18 -0500 Subject: [PATCH 056/183] [Bugfix] SymIntArrayRef expected to contain concrete integers (#10170) Signed-off-by: Bill Nell --- vllm/compilation/backends.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index f5fff344a1f48..c3c670422defa 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -329,7 +329,8 @@ def run(self, *args): self.fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t for t in args ] - return super().run(*fake_args) + with self.fake_mode: + return super().run(*fake_args) def call_module(self, target: torch.fx.node.Target, args: Tuple[torch.fx.node.Argument, From 127c07480ecea15e4c2990820c457807ff78a057 Mon Sep 17 00:00:00 2001 From: rasmith Date: Fri, 8 Nov 2024 18:59:22 -0600 Subject: [PATCH 057/183] [Kernel][Triton] Add Triton implementation for scaled_mm_triton to support fp8 and int8 SmoothQuant, symmetric case (#9857) Signed-off-by: Randall Smith --- tests/kernels/test_triton_scaled_mm.py | 106 ++++++++++ vllm/_custom_ops.py | 9 + .../compressed_tensors/triton_scaled_mm.py | 184 ++++++++++++++++++ 3 files changed, 299 insertions(+) create mode 100644 tests/kernels/test_triton_scaled_mm.py create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py diff --git a/tests/kernels/test_triton_scaled_mm.py b/tests/kernels/test_triton_scaled_mm.py new file mode 100644 index 0000000000000..8e96a2f70d751 --- /dev/null +++ b/tests/kernels/test_triton_scaled_mm.py @@ -0,0 +1,106 @@ +"""Tests for the triton_scaled_mm kernel + +Run `pytest tests/kernels/test_triton_scaled_mm.py`. 
+""" +import importlib +from typing import Optional, Type + +import pytest +import torch + +from vllm.platforms import current_platform + +device = "cuda" + + +def scaled_mm_torch(a: torch.Tensor, + b: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + out_dtype: Type[torch.dtype], + bias: Optional[torch.Tensor] = None) -> torch.Tensor: + out = torch.mm(a.to(torch.float32), b.to(torch.float32)) + out = scale_a * out + out = scale_b.T * out + out = out.to(out_dtype) + if bias is not None: + out = out + bias + + return out + + +def get_8bit_types(): + types = [torch.int8] + supports_fp8 = current_platform.has_device_capability(89) + if current_platform.is_rocm() and supports_fp8: + types.append(torch.float8_e4m3fnuz) + elif current_platform.is_cuda() and supports_fp8: + types.append(torch.float8_e4m3fn) + return types + + +@pytest.mark.parametrize("M", [1, 33, 64, 512]) +@pytest.mark.parametrize("N", [256, 971, 20486]) +@pytest.mark.parametrize("K", [128, 496, 1024]) +@pytest.mark.parametrize("out_dtype", [torch.float16, torch.bfloat16]) +@pytest.mark.parametrize("in_dtype", get_8bit_types()) +@pytest.mark.parametrize("use_scalar_scale_a", [True, False]) +@pytest.mark.parametrize("use_scalar_scale_b", [True, False]) +@pytest.mark.parametrize("use_bias", [True, False]) +def test_scaled_mm(M, N, K, in_dtype, out_dtype, use_scalar_scale_a, + use_scalar_scale_b, use_bias): + is_floating_point_type = lambda t: torch.tensor([1, 1], dtype=t + ).is_floating_point() + + current_platform.seed_everything(0) + + # NOTE: There are cases, where if the matrix is large enough, an output + # like 65504.4 can be produced, and can easily turn into inf when + # multiplied when using float16/bfloat16. This means one function, e.g., + # testing function, and another function, e.g. golden function, can + # produce a non-inf value while the other produces an inf value, and + # will cause assert_close/allclose to fail, even though if overflow + # wouldn't have occurred, the values would have been "close." + # + # So, the values here are kept small enough to avoid this situation. + if is_floating_point_type(in_dtype): + a = (0.25 * torch.rand( + (M, K), dtype=torch.float32, device=device)).to(in_dtype) + b = (0.25 * torch.rand( + (K, N), dtype=torch.float32, device=device)).to(in_dtype) + else: + a = torch.randint(-32, 32, (M, K), dtype=in_dtype, device=device) + b = torch.randint(-32, 32, (K, N), dtype=in_dtype, device=device) + + if use_scalar_scale_a: + scale_a = torch.rand((1, 1), device=device) + else: + scale_a = 0.25 * torch.rand((M, 1), device=device) + + if use_scalar_scale_b: + scale_b = torch.rand((1, 1), device=device) + else: + scale_b = 0.25 * torch.rand((N, 1), device=device) + + bias = None + if use_bias: + bias = torch.rand((N, ), device=device, dtype=out_dtype) + + triton_scaled_mm_module = importlib.import_module( + "vllm.model_executor.layers.quantization.compressed_tensors." 
+ "triton_scaled_mm") + triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm + + c_check = triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias) + + a_cpu = a.cpu() + b_cpu = b.cpu() + scale_a_cpu = scale_a.cpu() + scale_b_cpu = scale_b.cpu() + bias_cpu = None if bias is None else bias.cpu() + + c_actual = scaled_mm_torch(a_cpu, b_cpu, scale_a_cpu, scale_b_cpu, + out_dtype, bias_cpu) + + c_check_cpu = c_check.cpu() + torch.testing.assert_close(c_check_cpu, c_actual, rtol=1e-1, atol=1e-1) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 767d45ede7e87..8f331a27a20de 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -1,5 +1,6 @@ import contextlib import functools +import importlib from typing import TYPE_CHECKING, List, Optional, Tuple, Union import torch @@ -486,6 +487,14 @@ def cutlass_scaled_mm(a: torch.Tensor, m = a.shape[0] n = b.shape[1] + + if current_platform.is_rocm(): + triton_scaled_mm_module = importlib.import_module( + "vllm.model_executor.layers.quantization.compressed_tensors." + "triton_scaled_mm") + triton_scaled_mm = triton_scaled_mm_module.triton_scaled_mm + return triton_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias) + out = torch.empty((m, n), dtype=out_dtype, device=a.device) torch.ops._C.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py new file mode 100644 index 0000000000000..3ff162170f255 --- /dev/null +++ b/vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py @@ -0,0 +1,184 @@ +from typing import Optional, Type + +import torch +import triton +import triton.language as tl + + +def is_weak_contiguous(x: torch.Tensor): + strides = x.stride() + sizes = x.shape + is_not_transpose = strides[0] == 1 and (strides[1] >= max(1, sizes[0])) + is_transpose = strides[1] == 1 and (strides[0] >= max(1, sizes[1])) + return is_transpose or is_not_transpose + + +@triton.jit +def scaled_mm_kernel(a_ptr, b_ptr, scale_a_ptr, scale_b_ptr, c_ptr, bias_ptr, + M, N, K, stride_am, stride_ak, stride_bk, stride_bn, + stride_cm, stride_cn, ACCUMULATOR_DTYPE: tl.constexpr, + BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + BLOCK_SIZE_SCALE_A: tl.constexpr, + BLOCK_SIZE_SCALE_B: tl.constexpr): + pid = tl.program_id(axis=0) + + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + + pid_m = pid // num_pid_n + pid_n = pid % num_pid_n + + accumulator_dtype = ACCUMULATOR_DTYPE + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), + dtype=accumulator_dtype) + + # NOTE: Some tensor inputs are so large, they will cause int32 overflow + # so it is necessary to use tl.int64 for all the offsets, else SEGV will + # eventually occur. + + # Offsets and masks. + offsets_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64) + masks_am = offsets_am < M + + offsets_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64) + masks_bn = offsets_bn < N + + offsets_k = tl.arange(0, BLOCK_SIZE_K).to(tl.int64) + offsets_a = (stride_am * offsets_am[:, None] + + stride_ak * offsets_k[None, :]) + offsets_b = (stride_bk * offsets_k[:, None] + + stride_bn * offsets_bn[None, :]) + + # NOTE: BLOCK_SIZE_SCALE_A could be 1 or BLOCK_SIZE_M, so need to create + # appropriate offsets and masks for each case. Same goes for + # BLOCK_SIZE_SCALE_B. 
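+    # With a scalar scale (shape [1, 1]) the corresponding block size is 1,
+    # so every program loads the same single element; with a per-row scale
+    # the offsets cover exactly the rows/columns of this program's tile.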
+ offsets_scale_am = (tl.arange(0, BLOCK_SIZE_SCALE_A) + + (BLOCK_SIZE_SCALE_A > 1) * pid_m * BLOCK_SIZE_M) + masks_scale_am = offsets_scale_am < M + + offsets_scale_bn = (tl.arange(0, BLOCK_SIZE_SCALE_B) + + (BLOCK_SIZE_SCALE_B > 1) * pid_n * BLOCK_SIZE_N) + masks_scale_bn = offsets_scale_bn < N + + a_ptrs = a_ptr + offsets_a + b_ptrs = b_ptr + offsets_b + + scale_a_ptrs = scale_a_ptr + offsets_scale_am + scale_b_ptrs = scale_b_ptr + offsets_scale_bn + + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): + masks_k = offsets_k < K + masks_a = masks_am[:, None] & masks_k[None, :] + a = tl.load(a_ptrs, mask=masks_a) + + masks_b = masks_k[:, None] & masks_bn[None, :] + b = tl.load(b_ptrs, mask=masks_b) + + # Accumulate results. + accumulator = tl.dot(a, b, accumulator, out_dtype=accumulator_dtype) + + offsets_k += BLOCK_SIZE_K + a_ptrs += BLOCK_SIZE_K * stride_ak + b_ptrs += BLOCK_SIZE_K * stride_bk + + # Apply scale at end. + masks_scale_a = masks_scale_am[:, None] & (tl.arange(0, 1) < 1)[:, None] + scale_a = tl.load(scale_a_ptrs[:, None], masks_scale_a) + # Need to broadcast to the appropriate size, if scale_a is already + # (BLOCK_SIZE_M, 1) then it will broadcast to its own shape. Same goes + # for scale_b below. + scale_a = scale_a.broadcast_to((BLOCK_SIZE_M, 1)) + accumulator = scale_a * accumulator.to(tl.float32) + + masks_scale_b = masks_scale_bn[:, None] & (tl.arange(0, 1) < 1)[None, :] + scale_b = tl.load(scale_b_ptrs[:, None], masks_scale_b) + scale_b = scale_b.broadcast_to((BLOCK_SIZE_N, 1)) + accumulator = scale_b.T * accumulator.to(tl.float32) + + # Convert to output format. + c = accumulator.to(c_ptr.type.element_ty) + + # Add bias, it's already in output format, so add it after conversion. + if bias_ptr: + offsets_bias = offsets_bn + bias_ptrs = bias_ptr + offsets_bias + bias_mask = offsets_bias < N + bias = tl.load(bias_ptrs, bias_mask) + c += bias + + # Save output + offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64) + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64) + offs_cm = offs_cm.to(tl.int64) + offs_cn = offs_cn.to(tl.int64) + c_ptrs = (c_ptr + stride_cm * offs_cm[:, None] + + stride_cn * offs_cn[None, :]) + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) + + tl.store(c_ptrs, c, mask=c_mask) + + +# input - [M, K] +# weight - [K, N] +def triton_scaled_mm(input: torch.Tensor, + weight: torch.Tensor, + scale_a: torch.Tensor, + scale_b: torch.Tensor, + out_dtype: Type[torch.dtype], + bias: Optional[torch.Tensor] = None, + block_size_m: int = 32, + block_size_n: int = 32, + block_size_k: int = 32) -> torch.Tensor: + M, K = input.shape + N = weight.shape[1] + + assert N > 0 and K > 0 and M > 0 + assert weight.shape[0] == K + assert input.dtype == weight.dtype + assert scale_a.dtype == scale_b.dtype and scale_a.is_floating_point() + assert scale_a.shape == torch.Size([1, 1]) or scale_a.shape == torch.Size( + [M, 1]) + assert scale_b.shape == torch.Size([1, 1]) or scale_b.shape == torch.Size( + [N, 1]) + assert out_dtype.is_floating_point + assert bias is None or bias.is_floating_point() + assert is_weak_contiguous(input) + assert is_weak_contiguous(weight) + + grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv( + N, META['BLOCK_SIZE_N']), ) + + result = torch.empty((M, N), dtype=out_dtype, device=input.device) + + has_scalar = lambda x: x.shape[0] == 1 and x.shape[1] == 1 + + block_size_sa = 1 if has_scalar(scale_a) else block_size_m + block_size_sb = 1 if has_scalar(scale_b) else block_size_n + + 
accumulator_dtype = tl.float32 if input.is_floating_point() else tl.int32 + + # A = input, B = weight, C = result + # A = M x K, B = K x N, C = M x N + scaled_mm_kernel[grid](input, + weight, + scale_a, + scale_b, + result, + bias, + M, + N, + K, + input.stride(0), + input.stride(1), + weight.stride(0), + weight.stride(1), + result.stride(0), + result.stride(1), + accumulator_dtype, + BLOCK_SIZE_M=block_size_m, + BLOCK_SIZE_N=block_size_n, + BLOCK_SIZE_K=block_size_k, + BLOCK_SIZE_SCALE_A=block_size_sa, + BLOCK_SIZE_SCALE_B=block_size_sb) + + return result.to(out_dtype) From d7edca1dee96e6caeeadcee4914a6b00d1c99fd5 Mon Sep 17 00:00:00 2001 From: "Li, Jiang" Date: Sat, 9 Nov 2024 11:27:11 +0800 Subject: [PATCH 058/183] [CI/Build] Adding timeout in CPU CI to avoid CPU test queue blocking (#6892) Signed-off-by: DarkLight1337 Co-authored-by: DarkLight1337 --- .buildkite/run-cpu-test-ppc64le.sh | 57 ++++++++++--------- .buildkite/run-cpu-test.sh | 91 ++++++++++++++++-------------- 2 files changed, 79 insertions(+), 69 deletions(-) diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh index 5add7ff0c15c9..cd2bfd8bb5bf4 100755 --- a/.buildkite/run-cpu-test-ppc64le.sh +++ b/.buildkite/run-cpu-test-ppc64le.sh @@ -17,30 +17,35 @@ source /etc/environment #docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN="$HF_TOKEN" --name cpu-test cpu-test -# Run basic model test -docker exec cpu-test bash -c " - set -e - pip install pytest pytest-asyncio \ - decord einops librosa peft Pillow sentence-transformers soundfile \ - transformers_stream_generator matplotlib datamodel_code_generator - pip install torchvision --index-url https://download.pytorch.org/whl/cpu - # Embedding models are not supported for CPU yet - # pytest -v -s tests/models/embedding/language - pytest -v -s tests/models/encoder_decoder/language - pytest -v -s tests/models/decoder_only/language/test_models.py - # Chunked prefill not supported for CPU yet - # pytest -v -s tests/models/decoder_only/audio_language -m cpu_model - pytest -v -s tests/models/decoder_only/vision_language -m cpu_model" +function cpu_tests() { + # Run basic model test + docker exec cpu-test bash -c " + set -e + pip install pytest pytest-asyncio \ + decord einops librosa peft Pillow sentence-transformers soundfile \ + transformers_stream_generator matplotlib datamodel_code_generator + pip install torchvision --index-url https://download.pytorch.org/whl/cpu + # Embedding models are not supported for CPU yet + # pytest -v -s tests/models/embedding/language + pytest -v -s tests/models/encoder_decoder/language + pytest -v -s tests/models/decoder_only/language/test_models.py + pytest -v -s tests/models/decoder_only/audio_language -m cpu_model + pytest -v -s tests/models/decoder_only/vision_language -m cpu_model" -# online inference -docker exec cpu-test bash -c " - set -e - python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m & - timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 - python3 benchmarks/benchmark_serving.py \ - --backend vllm \ - --dataset-name random \ - --model facebook/opt-125m \ - --num-prompts 20 \ - --endpoint /v1/completions \ - --tokenizer facebook/opt-125m" + # online inference + docker exec 
cpu-test bash -c " + set -e + python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m & + timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 + python3 benchmarks/benchmark_serving.py \ + --backend vllm \ + --dataset-name random \ + --model facebook/opt-125m \ + --num-prompts 20 \ + --endpoint /v1/completions \ + --tokenizer facebook/opt-125m" +} + +# All of CPU tests are expected to be finished less than 25 mins. +export -f cpu_tests +timeout 25m bash -c "cpu_tests" diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 25a448e63be27..8d4f4d1a681f2 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -19,50 +19,55 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \ --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2 -# offline inference -docker exec cpu-test-avx2 bash -c " - set -e - python3 examples/offline_inference.py" +function cpu_tests() { + # offline inference + docker exec cpu-test-avx2 bash -c " + set -e + python3 examples/offline_inference.py" -# Run basic model test -docker exec cpu-test bash -c " - set -e - pip install pytest pytest-asyncio \ - decord einops librosa peft Pillow sentence-transformers soundfile \ - transformers_stream_generator matplotlib datamodel_code_generator - pip install torchvision --index-url https://download.pytorch.org/whl/cpu - # Embedding models are not supported for CPU yet - # pytest -v -s tests/models/embedding/language - pytest -v -s tests/models/encoder_decoder/language - pytest -v -s tests/models/decoder_only/language/test_models.py - # Chunked prefill not supported for CPU yet - # pytest -v -s tests/models/decoder_only/audio_language -m cpu_model - pytest -v -s tests/models/decoder_only/vision_language -m cpu_model" + # Run basic model test + docker exec cpu-test bash -c " + set -e + pip install pytest pytest-asyncio \ + decord einops librosa peft Pillow sentence-transformers soundfile \ + transformers_stream_generator matplotlib datamodel_code_generator + pip install torchvision --index-url https://download.pytorch.org/whl/cpu + # Embedding models are not supported for CPU yet + # pytest -v -s tests/models/embedding/language + pytest -v -s tests/models/encoder_decoder/language + pytest -v -s tests/models/decoder_only/language/test_models.py + pytest -v -s tests/models/decoder_only/audio_language -m cpu_model + pytest -v -s tests/models/decoder_only/vision_language -m cpu_model" -# Run compressed-tensor test -docker exec cpu-test bash -c " - set -e - pytest -s -v \ - tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \ - tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token" + # Run compressed-tensor test + docker exec cpu-test bash -c " + set -e + pytest -s -v \ + tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \ + tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token" -# Run AWQ test -docker exec cpu-test bash -c " - set -e - pytest -s -v \ - tests/quantization/test_ipex_quant.py" + # Run AWQ test + docker exec cpu-test bash -c " + set -e + pytest -s -v \ + tests/quantization/test_ipex_quant.py" -# online inference -docker exec cpu-test bash -c " - set -e - export 
VLLM_CPU_KVCACHE_SPACE=10 - export VLLM_CPU_OMP_THREADS_BIND=48-92 - python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & - timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 - python3 benchmarks/benchmark_serving.py \ - --backend vllm \ - --dataset-name random \ - --model facebook/opt-125m \ - --num-prompts 20 \ - --endpoint /v1/completions \ - --tokenizer facebook/opt-125m" + # online inference + docker exec cpu-test bash -c " + set -e + export VLLM_CPU_KVCACHE_SPACE=10 + export VLLM_CPU_OMP_THREADS_BIND=48-92 + python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & + timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 + python3 benchmarks/benchmark_serving.py \ + --backend vllm \ + --dataset-name random \ + --model facebook/opt-125m \ + --num-prompts 20 \ + --endpoint /v1/completions \ + --tokenizer facebook/opt-125m" +} + +# All of CPU tests are expected to be finished less than 25 mins. +export -f cpu_tests +timeout 25m bash -c "cpu_tests" From e0191a95d88c454dbb989b7457a41c93cb7f7051 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 9 Nov 2024 11:31:02 +0800 Subject: [PATCH 059/183] [0/N] Rename `MultiModalInputs` to `MultiModalKwargs` (#10040) Signed-off-by: DarkLight1337 --- .../design/multimodal/multimodal_index.rst | 2 +- .../mm_processor_kwargs/test_qwen.py | 4 +-- tests/multimodal/test_base.py | 22 ++++++------- vllm/model_executor/models/chatglm.py | 4 +-- vllm/model_executor/models/fuyu.py | 4 +-- vllm/model_executor/models/h2ovl.py | 10 +++--- vllm/model_executor/models/idefics3.py | 4 +-- vllm/model_executor/models/internvl.py | 6 ++-- vllm/model_executor/models/minicpmv.py | 4 +-- vllm/model_executor/models/mllama.py | 2 +- vllm/model_executor/models/molmo.py | 4 +-- vllm/model_executor/models/pixtral.py | 10 +++--- vllm/model_executor/models/qwen.py | 12 +++---- vllm/model_executor/models/qwen2_audio.py | 8 ++--- vllm/model_executor/models/qwen2_vl.py | 8 ++--- vllm/model_executor/models/ultravox.py | 8 ++--- vllm/multimodal/__init__.py | 19 +++++++++-- vllm/multimodal/audio.py | 4 +-- vllm/multimodal/base.py | 33 ++++++++++++++----- vllm/multimodal/image.py | 10 +++--- vllm/multimodal/registry.py | 6 ++-- vllm/multimodal/video.py | 6 ++-- vllm/spec_decode/draft_model_runner.py | 4 +-- vllm/worker/cpu_enc_dec_model_runner.py | 4 +-- vllm/worker/cpu_model_runner.py | 10 +++--- vllm/worker/embedding_model_runner.py | 4 +-- vllm/worker/enc_dec_model_runner.py | 4 +-- vllm/worker/hpu_model_runner.py | 8 ++--- vllm/worker/model_runner.py | 18 +++++----- vllm/worker/neuron_model_runner.py | 10 +++--- vllm/worker/openvino_model_runner.py | 10 +++--- vllm/worker/xpu_model_runner.py | 10 +++--- 32 files changed, 151 insertions(+), 121 deletions(-) diff --git a/docs/source/design/multimodal/multimodal_index.rst b/docs/source/design/multimodal/multimodal_index.rst index e112b43aade5e..30f543abc20c7 100644 --- a/docs/source/design/multimodal/multimodal_index.rst +++ b/docs/source/design/multimodal/multimodal_index.rst @@ -53,7 +53,7 @@ Base Classes .. autodata:: vllm.multimodal.MultiModalDataDict -.. autoclass:: vllm.multimodal.MultiModalInputs +.. 
autoclass:: vllm.multimodal.MultiModalKwargs :members: :show-inheritance: diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py index 6ae8a6a704b0a..e6ed87fc8ea08 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py @@ -6,7 +6,7 @@ from PIL.Image import Image from vllm.inputs import InputContext, token_inputs -from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal.base import MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer from .....conftest import IMAGE_ASSETS @@ -96,7 +96,7 @@ def test_input_mapper_valid_mm_data(input_mapper_for_qwen, mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data) # Ensure that we get the appropriately shaped pixel_values # for images and image embeddings, respectively. - assert isinstance(mapped_img_data, MultiModalInputs) + assert isinstance(mapped_img_data, MultiModalKwargs) assert "pixel_values" in mapped_img_data assert mapped_img_data["pixel_values"].shape == expected_shape diff --git a/tests/multimodal/test_base.py b/tests/multimodal/test_base.py index 68d05de904ba8..bfaf2cdeaa8d4 100644 --- a/tests/multimodal/test_base.py +++ b/tests/multimodal/test_base.py @@ -1,6 +1,6 @@ import torch -from vllm.multimodal.base import MultiModalInputs, NestedTensors +from vllm.multimodal.base import MultiModalKwargs, NestedTensors def assert_nested_tensors_equal(expected: NestedTensors, @@ -13,8 +13,8 @@ def assert_nested_tensors_equal(expected: NestedTensors, assert_nested_tensors_equal(expected_item, actual_item) -def assert_multimodal_inputs_equal(expected: MultiModalInputs, - actual: MultiModalInputs): +def assert_multimodal_inputs_equal(expected: MultiModalKwargs, + actual: MultiModalKwargs): assert set(expected.keys()) == set(actual.keys()) for key in expected: assert_nested_tensors_equal(expected[key], actual[key]) @@ -22,7 +22,7 @@ def assert_multimodal_inputs_equal(expected: MultiModalInputs, def test_multimodal_input_batch_single_tensor(): t = torch.rand([1, 2]) - result = MultiModalInputs.batch([{"image": t}]) + result = MultiModalKwargs.batch([{"image": t}]) assert_multimodal_inputs_equal(result, {"image": t.unsqueeze(0)}) @@ -30,7 +30,7 @@ def test_multimodal_input_batch_multiple_tensors(): a = torch.rand([1, 1, 2]) b = torch.rand([1, 1, 2]) c = torch.rand([1, 1, 2]) - result = MultiModalInputs.batch([{"image": a}, {"image": b}, {"image": c}]) + result = MultiModalKwargs.batch([{"image": a}, {"image": b}, {"image": c}]) assert_multimodal_inputs_equal(result, {"image": torch.stack([a, b, c])}) @@ -38,7 +38,7 @@ def test_multimodal_input_batch_multiple_heterogeneous_tensors(): a = torch.rand([1, 2, 2]) b = torch.rand([1, 3, 2]) c = torch.rand([1, 4, 2]) - result = MultiModalInputs.batch([{"image": a}, {"image": b}, {"image": c}]) + result = MultiModalKwargs.batch([{"image": a}, {"image": b}, {"image": c}]) assert_multimodal_inputs_equal(result, {"image": [a, b, c]}) @@ -46,7 +46,7 @@ def test_multimodal_input_batch_nested_tensors(): a = torch.rand([2, 3]) b = torch.rand([2, 3]) c = torch.rand([2, 3]) - result = MultiModalInputs.batch([{ + result = MultiModalKwargs.batch([{ "image": [a] }, { "image": [b] @@ -65,7 +65,7 @@ def test_multimodal_input_batch_heterogeneous_lists(): a = torch.rand([1, 2, 3]) b = torch.rand([1, 2, 3]) c = torch.rand([1, 2, 3]) - result = 
MultiModalInputs.batch([{"image": [a, b]}, {"image": [c]}]) + result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c]}]) assert_multimodal_inputs_equal( result, {"image": [torch.stack([a, b]), c.unsqueeze(0)]}) @@ -76,7 +76,7 @@ def test_multimodal_input_batch_multiple_batchable_lists(): b = torch.rand([1, 2, 3]) c = torch.rand([1, 2, 3]) d = torch.rand([1, 2, 3]) - result = MultiModalInputs.batch([{"image": [a, b]}, {"image": [c, d]}]) + result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c, d]}]) assert_multimodal_inputs_equal( result, {"image": torch.stack([torch.stack([a, b]), @@ -88,8 +88,8 @@ def test_multimodal_input_batch_mixed_stacking_depths(): b = torch.rand([1, 3, 3]) c = torch.rand([1, 4, 3]) - result = MultiModalInputs.batch([{"image": [a, b]}, {"image": [c]}]) + result = MultiModalKwargs.batch([{"image": [a, b]}, {"image": [c]}]) assert_multimodal_inputs_equal(result, {"image": [[a, b], c.unsqueeze(0)]}) - result = MultiModalInputs.batch([{"image": [a]}, {"image": [b, c]}]) + result = MultiModalKwargs.batch([{"image": [a]}, {"image": [b, c]}]) assert_multimodal_inputs_equal(result, {"image": [a.unsqueeze(0), [b, c]]}) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 032fa82ab93cd..eb9c3e3ae785d 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -30,7 +30,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.glm4_vision_encoder import EVA2CLIPModel from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.base import MultiModalData from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, @@ -74,7 +74,7 @@ def mm_input_mapper_for_glmv( raise pixel_values = raw_batch_data['images'] - return MultiModalInputs({'pixel_values': pixel_values}) + return MultiModalKwargs({'pixel_values': pixel_values}) def merge_glm_vision_embeddings( diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 3db82a898159b..653d5d60ea178 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -34,7 +34,7 @@ from vllm.model_executor.models.persimmon import PersimmonForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal.base import MultiModalKwargs from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges) @@ -218,7 +218,7 @@ def input_mapper_for_fuyu(ctx: InputContext, data: object): ]) # image has been processed with prompt in input processor - return MultiModalInputs({"pixel_values": data}) + return MultiModalKwargs({"pixel_values": data}) @MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_fuyu) diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index 43242fe370ba2..767171dad7c7b 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -16,7 +16,7 @@ token_inputs) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base 
import MultiModalInputs +from vllm.multimodal.base import MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer from vllm.utils import is_list_of @@ -324,12 +324,12 @@ def input_mapper( data: object, *, max_dynamic_patch: Optional[int] = None, - ) -> MultiModalInputs: + ) -> MultiModalKwargs: # NOTE: Preprocessing for the image data is done in the # 'input_processor' function during actual inference. if isinstance(data, dict): - return MultiModalInputs(data) + return MultiModalKwargs(data) # The section below is only used with dummy data during # memory profiling. @@ -347,7 +347,7 @@ def input_mapper( pixel_values = [image_pixel_values_mapper(img) for img in data] else: - return MultiModalInputs({"image_embeds": data}) + return MultiModalKwargs({"image_embeds": data}) model_config = ctx.model_config tokenizer = cached_get_tokenizer( model_config.tokenizer, @@ -359,7 +359,7 @@ def input_mapper( return_tensors="pt", )[0] - return MultiModalInputs({ + return MultiModalKwargs({ "pixel_values": pixel_values, "image_token_id": image_token_id }) diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 3f6d010f4e493..8004367f8dc08 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -36,7 +36,7 @@ from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.image import cached_get_image_processor from vllm.sequence import IntermediateTensors, SequenceData from vllm.transformers_utils.processor import cached_get_processor @@ -127,7 +127,7 @@ def input_mapper_for_idefics3( logger.error("Failed to process image (%s)", data) raise - return MultiModalInputs(batch_data) + return MultiModalKwargs(batch_data) def _resize_output_size(height: int, diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index bb9d38889a175..335b11d293acd 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -26,7 +26,7 @@ InternVisionPatchModel) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal.base import MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of @@ -346,7 +346,7 @@ def input_mapper( # we can't stack here because images may have different num_patches data = [image_pixel_values_mapper(img) for img in data] else: - return MultiModalInputs({"image_embeds": data}) + return MultiModalKwargs({"image_embeds": data}) model_config = ctx.model_config tokenizer = cached_get_tokenizer( model_config.tokenizer, @@ -355,7 +355,7 @@ def input_mapper( add_special_tokens=False, return_tensors="pt")[0] - return MultiModalInputs({ + return MultiModalKwargs({ "pixel_values": data, "image_token_id": image_token_id }) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 4ffe33bb6ce41..f8006095e2eb2 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -52,7 +52,7 @@ from vllm.model_executor.models.utils import LLMWrapper 
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal.base import MultiModalKwargs from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors, SequenceData @@ -374,7 +374,7 @@ def input_mapper_for_minicpmv(ctx: InputContext, data: object): batch_data["slice_start_id"] = data[0]["slice_start_id"] batch_data["slice_end_id"] = data[0]["slice_end_id"] - return MultiModalInputs(batch_data) + return MultiModalKwargs(batch_data) class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index d442ffe3c1fb1..18e38daadc93a 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -1162,7 +1162,7 @@ def sample( def _parse_and_validate_image_input(self, **kwargs: object): # tensor with the same shape will be batched together by - # MultiModalInputs.batch, so pixel_values here can be: + # MultiModalKwargs.batch, so pixel_values here can be: # - List[List[torch.Tensor]]: # with shape (num_tiles, 3, image_res, image_res) # - List[torch.Tensor]: diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 3a50923de3741..5f2f61cc610b3 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -37,7 +37,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, SequenceData) @@ -866,7 +866,7 @@ def image_input_mapper_for_molmo( ctx: InputContext, data: object, ): - return MultiModalInputs(data) + return MultiModalKwargs(data) def dummy_data_for_molmo(ctx: InputContext, seq_len: int, diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index facf1969b9479..de935fc420472 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -30,7 +30,7 @@ from vllm.model_executor.models.utils import merge_multimodal_embeddings from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal.base import MultiModalKwargs from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges) from vllm.sequence import IntermediateTensors, SequenceData @@ -94,8 +94,8 @@ def dummy_data_for_pixtral(ctx: InputContext, seq_len: int, def input_mapper_for_pixtral(ctx: InputContext, - data: object) -> MultiModalInputs: - """Maps the input data to its MultiModalInputs (if any). + data: object) -> MultiModalKwargs: + """Maps the input data to its MultiModalKwargs (if any). Args: ctx: Context of the loaded model. @@ -103,7 +103,7 @@ def input_mapper_for_pixtral(ctx: InputContext, to pixel_values in .forward() for a visual QWenLMHeadModel model. 
Returns: - MultiModalInputs containing the stacked normalized images tensor or + MultiModalKwargs containing the stacked normalized images tensor or image embeddings. """ # Early exit if we have provided an image to a language only Qwen model @@ -121,7 +121,7 @@ def input_mapper_for_pixtral(ctx: InputContext, dtype=torch.float16) images.append(image) - return MultiModalInputs({"images": images}) + return MultiModalKwargs({"images": images}) def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs): diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index c91c2caa3d519..1db7e2ba1cc12 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -43,7 +43,7 @@ from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal.base import MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors, SequenceData from vllm.utils import is_list_of @@ -722,8 +722,8 @@ def input_processor_for_qwen(ctx: InputContext, multi_modal_data=multi_modal_data) -def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs: - """Maps the input data to its MultiModalInputs (if any). +def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalKwargs: + """Maps the input data to its MultiModalKwargs (if any). Args: ctx: Context of the loaded model. @@ -731,7 +731,7 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs: to pixel_values in .forward() for a visual QWenLMHeadModel model. Returns: - MultiModalInputs containing the stacked normalized images tensor or + MultiModalKwargs containing the stacked normalized images tensor or image embeddings. 
""" # Early exit if we have provided an image to a language only Qwen model @@ -740,7 +740,7 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs: logger.warning( "Images were provided but this model has no visual config; " "multimodal inputs will not be forwarded to the model.") - return MultiModalInputs() + return MultiModalKwargs() model_config = ctx.model_config tokenizer = cached_get_tokenizer( @@ -784,7 +784,7 @@ def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs: data = [data] transformed_images = [transform(datum) for datum in data] pixel_values = torch.stack(transformed_images, dim=0) - return MultiModalInputs({"pixel_values": pixel_values}) + return MultiModalKwargs({"pixel_values": pixel_values}) def build_normalization_transform(image_size: int) -> transforms.Compose: diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 54a7085f69ba9..18cf45b3939f7 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -42,7 +42,7 @@ default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.models.qwen2 import Qwen2Model from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalInputs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.utils import consecutive_placeholder_ranges from vllm.sequence import IntermediateTensors, SequenceData @@ -221,13 +221,13 @@ def input_processor_for_qwen2_audio( def input_mapper_for_qwen2_audio( ctx: InputContext, multi_modal_data: Union[np.ndarray, List[np.ndarray]], -) -> MultiModalInputs: +) -> MultiModalKwargs: """Input mapper for Qwen2-Audio.""" if not isinstance(multi_modal_data, list): multi_modal_data = [multi_modal_data] if len(multi_modal_data) == 0: - return MultiModalInputs() + return MultiModalKwargs() processor = cached_get_processor(ctx.model_config.model) audio_feature_extractor = processor.feature_extractor @@ -254,7 +254,7 @@ def input_mapper_for_qwen2_audio( logger.error("Failed to process audio (%s)", multi_modal_data) raise - return MultiModalInputs(batch_data) + return MultiModalKwargs(batch_data) @INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen2_audio) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 0e820cf123139..8073c5f4b2fd2 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -57,7 +57,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.qwen2 import Qwen2Model from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict, - MultiModalInputs) + MultiModalKwargs) from vllm.multimodal.base import MultiModalData from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.utils import cached_get_tokenizer @@ -576,10 +576,10 @@ def mm_input_mapper_for_qwen2_vl( *, min_pixels: Optional[int] = None, max_pixels: Optional[int] = None, -) -> MultiModalInputs: +) -> MultiModalKwargs: """Input mapper for Qwen2-VL.""" if data_type_key == "image" and isinstance(data, dict): - return MultiModalInputs({ + return MultiModalKwargs({ "image_embeds": data.get("image_embeds"), "image_grid_thw": data.get("image_grid_thw"), }) @@ -613,7 +613,7 @@ def mm_input_mapper_for_qwen2_vl( logger.error("Failed to process image (%s)", data) raise - return MultiModalInputs(batch_data) + return 
MultiModalKwargs(batch_data) image_input_mapper_for_qwen2_vl = partial(mm_input_mapper_for_qwen2_vl, diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 411584b1a6c3c..6b7a638585ad9 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -24,7 +24,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.loader import DefaultModelLoader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalInputs, +from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs, NestedTensors) from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges, @@ -116,11 +116,11 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object): data = [data] if len(data) == 0: - return MultiModalInputs() + return MultiModalKwargs() # If the audio inputs are embeddings, no need for preprocessing if is_list_of(data, torch.Tensor, check="all"): - return MultiModalInputs({"audio_embeds": data}) + return MultiModalKwargs({"audio_embeds": data}) audio_features = [] for audio_input in data: @@ -154,7 +154,7 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object): # Remove the batch dimension because we're wrapping it in a list. audio_features.append(single_audio_features.squeeze(0)) - return MultiModalInputs({"audio_features": audio_features}) + return MultiModalKwargs({"audio_features": audio_features}) def input_processor_for_ultravox(ctx: InputContext, inputs: DecoderOnlyInputs): diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 53da2badb9b98..14911853abc73 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -1,5 +1,5 @@ from .base import (BatchedTensorInputs, MultiModalDataBuiltins, - MultiModalDataDict, MultiModalInputs, + MultiModalDataDict, MultiModalKwargs, MultiModalPlaceholderDict, MultiModalPlaceholderMap, MultiModalPlugin, NestedTensors) from .registry import MultiModalRegistry @@ -17,7 +17,7 @@ "BatchedTensorInputs", "MultiModalDataBuiltins", "MultiModalDataDict", - "MultiModalInputs", + "MultiModalKwargs", "MultiModalPlaceholderDict", "MultiModalPlaceholderMap", "MultiModalPlugin", @@ -25,3 +25,18 @@ "MULTIMODAL_REGISTRY", "MultiModalRegistry", ] + + +def __getattr__(name: str): + import warnings + + if name == "MultiModalInputs": + msg = ("MultiModalInputs has been renamed to MultiModalKwargs. 
" + "The original name will take another meaning in an upcoming " + "version.") + + warnings.warn(DeprecationWarning(msg), stacklevel=2) + + return MultiModalKwargs + + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py index 04d71826f29fa..e71ae5feec1c6 100644 --- a/vllm/multimodal/audio.py +++ b/vllm/multimodal/audio.py @@ -1,5 +1,5 @@ from vllm.inputs.registry import InputContext -from vllm.multimodal.base import MultiModalInputs, MultiModalPlugin +from vllm.multimodal.base import MultiModalKwargs, MultiModalPlugin class AudioPlugin(MultiModalPlugin): @@ -9,7 +9,7 @@ def get_data_key(self) -> str: return "audio" def _default_input_mapper(self, ctx: InputContext, data: object, - **mm_processor_kwargs) -> MultiModalInputs: + **mm_processor_kwargs) -> MultiModalKwargs: raise NotImplementedError("There is no default audio input mapper") def _default_max_multimodal_tokens(self, ctx: InputContext) -> int: diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 26c94cf2d0b20..fa514d3fcb3b7 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -30,15 +30,15 @@ BatchedTensorInputs: TypeAlias = Dict[str, NestedTensors] """ A dictionary containing nested tensors which have been batched via -:meth:`MultiModalInputs.batch`. +:meth:`MultiModalKwargs.batch`. """ -class _MultiModalInputsBase(UserDict[str, NestedTensors]): +class _MultiModalKwargsBase(UserDict[str, NestedTensors]): pass -class MultiModalInputs(_MultiModalInputsBase): +class MultiModalKwargs(_MultiModalKwargsBase): """ A dictionary that represents the keyword arguments to :meth:`~torch.nn.Module.forward`. @@ -58,7 +58,7 @@ def _try_stack(nested_tensors: NestedTensors) -> NestedTensors: if isinstance(nested_tensors, (int, float)): return torch.tensor(nested_tensors) - stacked = [MultiModalInputs._try_stack(t) for t in nested_tensors] + stacked = [MultiModalKwargs._try_stack(t) for t in nested_tensors] if not is_list_of(stacked, torch.Tensor, check="all"): # Only tensors (not lists) can be stacked. return stacked @@ -71,7 +71,7 @@ def _try_stack(nested_tensors: NestedTensors) -> NestedTensors: return torch.stack(tensors_) @staticmethod - def batch(inputs_list: List["MultiModalInputs"]) -> BatchedTensorInputs: + def batch(inputs_list: List["MultiModalKwargs"]) -> BatchedTensorInputs: """ Batch multiple inputs together into a dictionary. @@ -95,7 +95,7 @@ def batch(inputs_list: List["MultiModalInputs"]) -> BatchedTensorInputs: item_lists[k].append(v) return { - k: MultiModalInputs._try_stack(item_list) + k: MultiModalKwargs._try_stack(item_list) for k, item_list in item_lists.items() } @@ -177,7 +177,7 @@ class PlaceholderRange(TypedDict): """ MultiModalInputMapper = Callable[[InputContext, MultiModalData[object]], - MultiModalInputs] + MultiModalKwargs] """ Return a dictionary to be passed as keyword arguments to :meth:`~torch.nn.Module.forward`. This is similar in concept to tokenizers @@ -226,7 +226,7 @@ def _default_input_mapper( ctx: InputContext, data: MultiModalData[object], **mm_processor_kwargs, - ) -> MultiModalInputs: + ) -> MultiModalKwargs: """ Return a dictionary to be passed as keyword arguments to :meth:`~torch.nn.Module.forward`. 
This is similar in concept to @@ -275,7 +275,7 @@ def map_input( model_config: "ModelConfig", data: MultiModalData[object], mm_processor_kwargs: Dict[str, Any], - ) -> MultiModalInputs: + ) -> MultiModalKwargs: """ Transform the data into a dictionary of model inputs using the input mapper registered for that model. @@ -585,3 +585,18 @@ def index_map(self) -> "IndexMap": return MultiModalPlaceholderMap.IndexMap(src=src_indices, dest=dest_indices) + + +def __getattr__(name: str): + import warnings + + if name == "MultiModalInputs": + msg = ("MultiModalInputs has been renamed to MultiModalKwargs. " + "The original name will take another meaning in an upcoming " + "version.") + + warnings.warn(DeprecationWarning(msg), stacklevel=2) + + return MultiModalKwargs + + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index 3f6bb6c8338d2..589b46266b08d 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -10,7 +10,7 @@ from vllm.transformers_utils.processor import get_image_processor from vllm.utils import is_list_of -from .base import MultiModalData, MultiModalInputs, MultiModalPlugin +from .base import MultiModalData, MultiModalKwargs, MultiModalPlugin if TYPE_CHECKING: from vllm.config import ModelConfig @@ -43,12 +43,12 @@ def _default_input_mapper( ctx: InputContext, data: MultiModalData[object], **mm_processor_kwargs, - ) -> MultiModalInputs: + ) -> MultiModalKwargs: model_config = ctx.model_config # Processed by input processor if isinstance(data, BatchFeature): - return MultiModalInputs(data.data) + return MultiModalKwargs(data.data) # PIL image if isinstance(data, Image.Image) or is_list_of(data, Image.Image): @@ -78,11 +78,11 @@ def _default_input_mapper( type(image_processor).__name__) raise - return MultiModalInputs(batch_data) + return MultiModalKwargs(batch_data) # Image embedding elif isinstance(data, torch.Tensor) or is_list_of(data, torch.Tensor): - return MultiModalInputs({"image_embeds": data}) + return MultiModalKwargs({"image_embeds": data}) raise TypeError(f"Invalid image type: {type(data)}") diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index bce2f4c6abe5b..b844c9e1c2e89 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -5,7 +5,7 @@ from vllm.logger import init_logger from .audio import AudioPlugin -from .base import (MultiModalDataDict, MultiModalInputMapper, MultiModalInputs, +from .base import (MultiModalDataDict, MultiModalInputMapper, MultiModalKwargs, MultiModalPlugin, MultiModalTokensCalc, NestedTensors) from .image import ImagePlugin from .video import VideoPlugin @@ -103,7 +103,7 @@ def map_input( model_config: "ModelConfig", data: MultiModalDataDict, mm_processor_kwargs: Optional[Dict[str, Any]] = None, - ) -> MultiModalInputs: + ) -> MultiModalKwargs: """ Apply an input mapper to the data passed to the model. 
@@ -139,7 +139,7 @@ def map_input( merged_dict[input_key] = input_tensor - return MultiModalInputs(merged_dict) + return MultiModalKwargs(merged_dict) def create_input_mapper(self, model_config: "ModelConfig"): """ diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 40a92fed28c87..a518270974f92 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -9,7 +9,7 @@ from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.utils import is_list_of -from .base import MultiModalData, MultiModalInputs +from .base import MultiModalData, MultiModalKwargs from .image import ImagePlugin if TYPE_CHECKING: @@ -55,7 +55,7 @@ def _default_input_mapper( ctx: InputContext, data: MultiModalData[object], **mm_processor_kwargs, - ) -> MultiModalInputs: + ) -> MultiModalKwargs: model_config = ctx.model_config if isinstance(data, list) and len(data) == 1: @@ -79,7 +79,7 @@ def _default_input_mapper( logger.error("Failed to process video (%s)", data) raise - return MultiModalInputs(batch_data) + return MultiModalKwargs(batch_data) raise TypeError(f"Invalid video type: {type(data)}") diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index 6330ac027db74..cd4d7eb0e6e4e 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ b/vllm/spec_decode/draft_model_runner.py @@ -18,7 +18,7 @@ "CUDA and ROCm flash attention backend.") from err from vllm.logger import init_logger -from vllm.multimodal import MultiModalInputs +from vllm.multimodal import MultiModalKwargs from vllm.sequence import ExecuteModelRequest, IntermediateTensors from vllm.worker.model_runner import (ModelInputForGPUWithSamplingMetadata, ModelRunner) @@ -280,7 +280,7 @@ def execute_model( kv_caches=kv_caches, attn_metadata=model_input.attn_metadata, intermediate_tensors=intermediate_tensors, - **MultiModalInputs.as_kwargs(multi_modal_kwargs, + **MultiModalKwargs.as_kwargs(multi_modal_kwargs, device=self.device), **kwargs, ) diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py index 8ebbf6db939bc..994af7c5a455f 100644 --- a/vllm/worker/cpu_enc_dec_model_runner.py +++ b/vllm/worker/cpu_enc_dec_model_runner.py @@ -5,7 +5,7 @@ from vllm.attention import AttentionMetadata from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.multimodal import MultiModalInputs +from vllm.multimodal import MultiModalKwargs from vllm.sequence import IntermediateTensors, SequenceGroupMetadata from vllm.utils import make_tensor_with_pad from vllm.worker.cpu_model_runner import (CPUModelRunner, @@ -287,7 +287,7 @@ def execute_model( kv_caches, "attn_metadata": model_input.attn_metadata, - **MultiModalInputs.as_kwargs(model_input.multi_modal_kwargs or {}, + **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, device=self.device), "intermediate_tensors": intermediate_tensors, diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 26a15ed645c43..1590184d6f831 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -15,7 +15,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader import get_model from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalInputs, MultiModalPlaceholderMap) + MultiModalKwargs, MultiModalPlaceholderMap) from vllm.sequence import (IntermediateTensors, SequenceData, SequenceGroupMetadata) from vllm.utils import make_tensor_with_pad @@ -200,7 +200,7 @@ def 
_prepare_prompt( slot_mapping: List[int] = [] seq_lens: List[int] = [] - multi_modal_inputs_list: List[MultiModalInputs] = [] + multi_model_kwargs_list: List[MultiModalKwargs] = [] multi_modal_placeholder_maps: Dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) @@ -225,7 +225,7 @@ def _prepare_prompt( ._compute_multi_modal_input( seq_group_metadata, seq_data, computed_len, seq_group_metadata.mm_processor_kwargs) - multi_modal_inputs_list.append(mm_kwargs) + multi_model_kwargs_list.append(mm_kwargs) for modality, placeholder_map in placeholder_maps.items(): multi_modal_placeholder_maps[modality].extend( placeholder_map) @@ -297,7 +297,7 @@ def _prepare_prompt( multi_modal_placeholder_index_maps=placeholder_index_maps, ) - multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) return (input_tokens, input_positions, attn_metadata, seq_lens, multi_modal_kwargs) @@ -520,7 +520,7 @@ def execute_model( kv_caches, "attn_metadata": model_input.attn_metadata, - **MultiModalInputs.as_kwargs(model_input.multi_modal_kwargs or {}, + **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, device=self.device), "intermediate_tensors": intermediate_tensors, diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py index ff288d5ca1512..37cfcbf13d7a3 100644 --- a/vllm/worker/embedding_model_runner.py +++ b/vllm/worker/embedding_model_runner.py @@ -8,7 +8,7 @@ from vllm.forward_context import set_forward_context from vllm.logger import init_logger from vllm.model_executor.pooling_metadata import PoolingMetadata -from vllm.multimodal import MultiModalInputs +from vllm.multimodal import MultiModalKwargs from vllm.pooling_params import PoolingParams from vllm.sequence import (IntermediateTensors, PoolerOutput, SequenceData, SequenceGroupMetadata) @@ -104,7 +104,7 @@ def execute_model( kv_caches=kv_caches, attn_metadata=model_input.attn_metadata, intermediate_tensors=intermediate_tensors, - **MultiModalInputs.as_kwargs(multi_modal_kwargs, + **MultiModalKwargs.as_kwargs(multi_modal_kwargs, device=self.device)) if (self.observability_config is not None diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 90a43196084ea..008e0c9745994 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -18,7 +18,7 @@ from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader.utils import get_architecture_class_name -from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalInputs, +from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs, MultiModalRegistry) from vllm.sampling_params import SamplingParams from vllm.sequence import (IntermediateTensors, PoolerOutput, @@ -206,7 +206,7 @@ def execute_model( kv_caches=kv_caches, attn_metadata=model_input.attn_metadata, intermediate_tensors=intermediate_tensors, - **MultiModalInputs.as_kwargs(multi_modal_kwargs, + **MultiModalKwargs.as_kwargs(multi_modal_kwargs, device=self.device), **seqlen_agnostic_kwargs) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 7e9b2bd13b48a..92d6552b2f428 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -36,7 +36,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader import get_model from vllm.multimodal 
import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalInputs) + MultiModalKwargs) from vllm.sampling_params import SamplingParams from vllm.sequence import (IntermediateTensors, SequenceData, SequenceGroupMetadata) @@ -716,7 +716,7 @@ def _prepare_prompt( context_lens: List[int] = [] query_lens: List[int] = [] prefix_block_tables: List[List[int]] = [] - multi_modal_inputs_list: List[MultiModalInputs] = [] + multi_model_kwargs_list: List[MultiModalKwargs] = [] if len(seq_group_metadata_list) == 0: return PreparePromptMetadata.empty() @@ -777,7 +777,7 @@ def _prepare_prompt( mm_data = seq_group_metadata.multi_modal_data if mm_data: mm_kwargs = self.multi_modal_input_mapper(mm_data) - multi_modal_inputs_list.append(mm_kwargs) + multi_model_kwargs_list.append(mm_kwargs) if seq_group_metadata.block_tables is None: # During memory profiling, the block tables are not initialized @@ -876,7 +876,7 @@ def _prepare_prompt( multi_modal_placeholder_index_maps= None # FIXME(kzawora): mutli-modality will not work here ) - multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) return PreparePromptMetadata(input_tokens=input_tokens, input_positions=input_positions, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index a1ec2e85be7b8..e1446192ce3d6 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -38,7 +38,7 @@ from vllm.model_executor.models import supports_lora, supports_multimodal from vllm.model_executor.models.utils import set_cpu_offload_max_bytes from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalInputs, MultiModalPlaceholderMap, + MultiModalKwargs, MultiModalPlaceholderMap, MultiModalRegistry) from vllm.platforms import current_platform from vllm.prompt_adapter.layers import PromptAdapterMapping @@ -252,7 +252,7 @@ def __init__( prompt_adapter_request: Optional[PromptAdapterRequest] = None, # Multi-modal inputs. - multi_modal_inputs: Optional[MultiModalInputs] = None, + multi_model_kwargs: Optional[MultiModalKwargs] = None, multi_modal_placeholder_maps: Optional[Dict[ str, MultiModalPlaceholderMap]] = None, @@ -373,7 +373,7 @@ def __init__( prompt_adapter_prompt_mapping or []) self.prompt_adapter_request = prompt_adapter_request - self.multi_modal_inputs = multi_modal_inputs + self.multi_model_kwargs = multi_model_kwargs self.multi_modal_placeholder_maps = multi_modal_placeholder_maps self.prefix_cache_hit = prefix_cache_hit @@ -661,7 +661,7 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, mm_kwargs = self.multi_modal_input_mapper( mm_data, mm_processor_kwargs=seq_group_metadata.mm_processor_kwargs) - inter_data.multi_modal_inputs = mm_kwargs + inter_data.multi_model_kwargs = mm_kwargs inter_data.multi_modal_placeholder_maps = placeholder_maps # special processing for mrope position deltas. @@ -935,11 +935,11 @@ def build(self) -> ModelInputForGPU: ) # Multi-modal data. 
- multi_modal_inputs_list = [ - data.multi_modal_inputs for data in self.inter_data_list - if data.multi_modal_inputs is not None + multi_model_kwargs_list = [ + data.multi_model_kwargs for data in self.inter_data_list + if data.multi_model_kwargs is not None ] - multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) return self.model_input_cls( input_tokens=input_tokens_tensor, @@ -1649,7 +1649,7 @@ def execute_model( kv_caches=kv_caches, attn_metadata=model_input.attn_metadata, intermediate_tensors=intermediate_tensors, - **MultiModalInputs.as_kwargs(multi_modal_kwargs, + **MultiModalKwargs.as_kwargs(multi_modal_kwargs, device=self.device), **seqlen_agnostic_kwargs) diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index 2da22cbfc7cb5..0ed33e435aa2f 100644 --- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -13,7 +13,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader.neuron import get_neuron_model from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalInputs) + MultiModalKwargs) from vllm.sequence import IntermediateTensors, SequenceGroupMetadata from vllm.utils import is_pin_memory_available, make_tensor_with_pad from vllm.worker.model_runner_base import ModelRunnerBase, ModelRunnerInputBase @@ -122,7 +122,7 @@ def _prepare_prompt( input_block_ids: List[int] = [] seq_lens: List[int] = [] - multi_modal_inputs_list: List[MultiModalInputs] = [] + multi_model_kwargs_list: List[MultiModalKwargs] = [] for seq_group_metadata in seq_group_metadata_list: assert seq_group_metadata.is_prompt seq_ids = list(seq_group_metadata.seq_data.keys()) @@ -149,7 +149,7 @@ def _prepare_prompt( mm_data, mm_processor_kwargs=seq_group_metadata.mm_processor_kwargs, ) - multi_modal_inputs_list.append(mm_kwargs) + multi_model_kwargs_list.append(mm_kwargs) max_seq_len = max(seq_lens) assert max_seq_len > 0 @@ -167,7 +167,7 @@ def _prepare_prompt( dtype=torch.long, device=self.device) - multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) return (input_tokens, input_positions, input_block_ids, seq_lens, multi_modal_kwargs) @@ -314,7 +314,7 @@ def execute_model( input_ids=model_input.input_tokens, positions=model_input.input_positions, input_block_ids=model_input.input_block_ids, - **MultiModalInputs.as_kwargs(model_input.multi_modal_kwargs or {}, + **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, device=self.device), ) diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py index c9c87ea748081..378e1e06039b2 100644 --- a/vllm/worker/openvino_model_runner.py +++ b/vllm/worker/openvino_model_runner.py @@ -13,7 +13,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader.openvino import get_model from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalInputs, MultiModalPlaceholderMap) + MultiModalKwargs, MultiModalPlaceholderMap) from vllm.sequence import SequenceGroupMetadata from vllm.worker.model_runner_base import ModelRunnerBase @@ -102,7 +102,7 @@ def _prepare_model_input( seq_lens: List[int] = [] past_lens: List[int] = [] query_lens: List[int] = [] - multi_modal_inputs_list: List[MultiModalInputs] = [] + multi_model_kwargs_list: List[MultiModalKwargs] = [] 
multi_modal_placeholder_maps: Dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) @@ -226,7 +226,7 @@ def _prepare_model_input( mm_data, mm_processor_kwargs=seq_group_metadata. mm_processor_kwargs) - multi_modal_inputs_list.append(mm_kwargs) + multi_model_kwargs_list.append(mm_kwargs) for modality, placeholder_map in placeholder_maps.items(): multi_modal_placeholder_maps[modality].extend( @@ -275,7 +275,7 @@ def _prepare_model_input( multi_modal_placeholder_index_maps=placeholder_index_maps, ) - multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) return ModelInput( input_tokens, @@ -341,7 +341,7 @@ def execute_model( kv_caches, "attn_metadata": attn_metadata, - **MultiModalInputs.as_kwargs(multi_modal_kwargs or {}, + **MultiModalKwargs.as_kwargs(multi_modal_kwargs or {}, device=self.device), } diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index bae8b469767b2..c9e637c057979 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -18,7 +18,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader import get_model from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensorInputs, - MultiModalInputs, MultiModalPlaceholderMap, + MultiModalKwargs, MultiModalPlaceholderMap, MultiModalRegistry) from vllm.sampling_params import SamplingParams from vllm.sequence import IntermediateTensors, SequenceGroupMetadata @@ -160,7 +160,7 @@ def _prepare_prompt( input_positions: List[int] = [] slot_mapping: List[int] = [] seq_lens: List[int] = [] - multi_modal_inputs_list: List[MultiModalInputs] = [] + multi_model_kwargs_list: List[MultiModalKwargs] = [] multi_modal_placeholder_maps: Dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) @@ -192,7 +192,7 @@ def _prepare_prompt( .from_seq_group(seq_group_metadata, positions_range) mm_kwargs = self.runner.multi_modal_input_mapper(mm_data) - multi_modal_inputs_list.append(mm_kwargs) + multi_model_kwargs_list.append(mm_kwargs) for modality, placeholder_map in placeholder_maps.items(): multi_modal_placeholder_maps[modality].extend( @@ -264,7 +264,7 @@ def _prepare_prompt( block_tables=torch.tensor([], device=self.device, dtype=torch.int), ) - multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) return (input_tokens, input_positions, attn_metadata, seq_lens, multi_modal_kwargs) @@ -565,7 +565,7 @@ def execute_model( kv_caches=kv_caches, attn_metadata=model_input.attn_metadata, intermediate_tensors=intermediate_tensors, - **MultiModalInputs.as_kwargs(model_input.multi_modal_kwargs or {}, + **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, device=self.device)) # Compute the logits in the last pipeline stage. 
if not get_pp_group().is_last_rank: From f83feccd7f661d0a582f9c0cb0bc9f802f4d995e Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Fri, 8 Nov 2024 22:36:46 -0500 Subject: [PATCH 060/183] [Bugfix] Ignore GPTQ quantization of Qwen2-VL visual module (#10169) Signed-off-by: mgoin --- vllm/model_executor/models/qwen2_vl.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 8073c5f4b2fd2..8dd75c9ee7e7b 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -51,7 +51,9 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.quantization import (GPTQConfig, + GPTQMarlinConfig, + QuantizationConfig) from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -982,7 +984,7 @@ def __init__(self, self.visual = Qwen2VisionTransformer( config.vision_config, norm_eps=getattr(config, "rms_norm_eps", 1e-6), - quant_config=quant_config, + quant_config=self._maybe_ignore_quant_config(quant_config), prefix="visual", ) @@ -1008,6 +1010,14 @@ def __init__(self, make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) + def _maybe_ignore_quant_config(self, quant_config: QuantizationConfig): + # GPTQ configs do not have a list of ignored modules, however AutoGPTQ + # seems to avoid vision encoder sections for some models. 
+ # See: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4 + if isinstance(quant_config, (GPTQConfig, GPTQMarlinConfig)): + return None + return quant_config + def _validate_and_reshape_mm_tensor(self, mm_input: Union[torch.Tensor, List[torch.Tensor]], From 47672f38b58581cf2b7c33201e6ae01639c5ff51 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sat, 9 Nov 2024 12:02:59 +0800 Subject: [PATCH 061/183] [CI/Build] Fix VLM broadcast tests `tensor_parallel_size` passing (#10161) Signed-off-by: Isotr0py <2037008807@qq.com> --- tests/models/decoder_only/vision_language/test_models.py | 1 + tests/models/decoder_only/vision_language/vlm_utils/types.py | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 163752e9fe06e..1ab42f8c126f8 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -34,6 +34,7 @@ "dtype": "half", "max_tokens": 5, "tensor_parallel_size": 2, + "model_kwargs": {"device_map": "auto"}, "image_size_factors": [(.25, 0.5, 1.0)], "distributed_executor_backend": ( "ray", diff --git a/tests/models/decoder_only/vision_language/vlm_utils/types.py b/tests/models/decoder_only/vision_language/vlm_utils/types.py index fd18c7c8346f0..8459476dc2d07 100644 --- a/tests/models/decoder_only/vision_language/vlm_utils/types.py +++ b/tests/models/decoder_only/vision_language/vlm_utils/types.py @@ -158,6 +158,7 @@ def get_non_parametrized_runner_kwargs(self): "max_model_len": self.max_model_len, "max_num_seqs": self.max_num_seqs, "task": self.task, + "tensor_parallel_size": self.tensor_parallel_size, "hf_output_post_proc": self.hf_output_post_proc, "vllm_output_post_proc": self.vllm_output_post_proc, "auto_cls": self.auto_cls, From 49d2a41a860f5aeffe850fb8bbe3b268966299bb Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 9 Nov 2024 12:07:10 +0800 Subject: [PATCH 062/183] [Doc] Adjust RunLLM location (#10176) Signed-off-by: DarkLight1337 --- docs/source/_static/custom.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js index ceeca47226cde..dac40ca2cfe75 100644 --- a/docs/source/_static/custom.js +++ b/docs/source/_static/custom.js @@ -8,7 +8,7 @@ document.addEventListener("DOMContentLoaded", function () { script.setAttribute("version", "stable"); script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); // cmd-j or ctrl-j to open the widget. 
script.setAttribute("runllm-name", "vLLM"); - script.setAttribute("runllm-position", "BOTTOM_LEFT"); + script.setAttribute("runllm-position", "TOP_RIGHT"); script.setAttribute("runllm-assistant-id", "207"); script.async = true; From 1a95f10ee7d2ffa538a6d210b53bf363e039feee Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 8 Nov 2024 22:17:28 -0800 Subject: [PATCH 063/183] [5/N] pass the whole config to model (#9983) Signed-off-by: youkaichao --- vllm/model_executor/model_loader/loader.py | 100 ++---------------- .../model_executor/model_loader/tensorizer.py | 15 +-- vllm/model_executor/models/arctic.py | 16 +-- vllm/model_executor/models/baichuan.py | 37 +++---- vllm/model_executor/models/bart.py | 12 +-- vllm/model_executor/models/bert.py | 12 ++- vllm/model_executor/models/blip2.py | 20 ++-- vllm/model_executor/models/bloom.py | 10 +- vllm/model_executor/models/chameleon.py | 12 ++- vllm/model_executor/models/chatglm.py | 15 +-- vllm/model_executor/models/commandr.py | 12 ++- vllm/model_executor/models/dbrx.py | 10 +- vllm/model_executor/models/decilm.py | 18 ++-- vllm/model_executor/models/deepseek.py | 10 +- vllm/model_executor/models/deepseek_v2.py | 10 +- vllm/model_executor/models/eagle.py | 7 +- vllm/model_executor/models/exaone.py | 12 ++- vllm/model_executor/models/falcon.py | 10 +- vllm/model_executor/models/florence2.py | 10 +- vllm/model_executor/models/fuyu.py | 15 ++- vllm/model_executor/models/gemma.py | 11 +- vllm/model_executor/models/gemma2.py | 27 ++--- vllm/model_executor/models/gpt2.py | 10 +- vllm/model_executor/models/gpt_bigcode.py | 12 ++- vllm/model_executor/models/gpt_j.py | 10 +- vllm/model_executor/models/gpt_neox.py | 10 +- vllm/model_executor/models/granite.py | 12 ++- vllm/model_executor/models/granitemoe.py | 12 ++- vllm/model_executor/models/idefics3.py | 13 ++- vllm/model_executor/models/interfaces_base.py | 24 +---- vllm/model_executor/models/internlm2.py | 9 +- vllm/model_executor/models/internlm2_ve.py | 9 +- vllm/model_executor/models/internvl.py | 15 ++- vllm/model_executor/models/jais.py | 10 +- vllm/model_executor/models/jamba.py | 14 +-- vllm/model_executor/models/llama.py | 30 ++++-- vllm/model_executor/models/llava.py | 15 ++- vllm/model_executor/models/llava_next.py | 17 ++- .../model_executor/models/llava_next_video.py | 15 ++- vllm/model_executor/models/llava_onevision.py | 15 ++- vllm/model_executor/models/mamba.py | 14 +-- vllm/model_executor/models/medusa.py | 5 +- vllm/model_executor/models/minicpm.py | 12 ++- vllm/model_executor/models/minicpmv.py | 48 ++++----- vllm/model_executor/models/mixtral.py | 13 +-- vllm/model_executor/models/mixtral_quant.py | 10 +- vllm/model_executor/models/mllama.py | 15 +-- vllm/model_executor/models/molmo.py | 16 +-- vllm/model_executor/models/mpt.py | 12 ++- vllm/model_executor/models/nemotron.py | 13 +-- vllm/model_executor/models/olmo.py | 14 ++- vllm/model_executor/models/olmoe.py | 10 +- vllm/model_executor/models/opt.py | 12 ++- vllm/model_executor/models/orion.py | 10 +- vllm/model_executor/models/paligemma.py | 30 +++--- vllm/model_executor/models/persimmon.py | 14 ++- vllm/model_executor/models/phi.py | 15 +-- vllm/model_executor/models/phi3_small.py | 13 +-- vllm/model_executor/models/phi3v.py | 23 ++-- vllm/model_executor/models/phimoe.py | 13 +-- vllm/model_executor/models/pixtral.py | 20 ++-- vllm/model_executor/models/qwen.py | 31 +++--- vllm/model_executor/models/qwen2.py | 14 +-- vllm/model_executor/models/qwen2_audio.py | 21 ++-- vllm/model_executor/models/qwen2_cls.py | 20 ++-- 
vllm/model_executor/models/qwen2_moe.py | 10 +- vllm/model_executor/models/qwen2_rm.py | 19 ++-- vllm/model_executor/models/qwen2_vl.py | 19 ++-- vllm/model_executor/models/solar.py | 13 +-- vllm/model_executor/models/stablelm.py | 10 +- vllm/model_executor/models/starcoder2.py | 14 ++- vllm/model_executor/models/ultravox.py | 20 ++-- vllm/model_executor/models/utils.py | 27 ++--- vllm/model_executor/models/xverse.py | 22 ++-- vllm/plugins/__init__.py | 12 --- 75 files changed, 583 insertions(+), 654 deletions(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 464915248c9ad..8d3024534734b 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -9,8 +9,7 @@ import os from abc import ABC, abstractmethod from contextlib import contextmanager -from typing import (Any, Dict, Generator, Iterable, List, Optional, Tuple, - Type, cast) +from typing import Any, Dict, Generator, Iterable, List, Optional, Tuple, cast import gguf import huggingface_hub @@ -18,20 +17,17 @@ import torch from huggingface_hub import HfApi, hf_hub_download from torch import nn -from transformers import AutoModelForCausalLM, PretrainedConfig +from transformers import AutoModelForCausalLM from transformers.utils import SAFE_WEIGHTS_INDEX_NAME -from vllm.config import (CacheConfig, LoadConfig, LoadFormat, LoRAConfig, - ModelConfig, MultiModalConfig, ParallelConfig, - PoolerConfig, SchedulerConfig, VllmConfig) +from vllm.config import (LoadConfig, LoadFormat, ModelConfig, ParallelConfig, + VllmConfig) from vllm.distributed import (get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.envs import VLLM_USE_MODELSCOPE from vllm.logger import init_logger from vllm.model_executor.layers.linear import (ReplicatedLinear, RowParallelLinear) -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) from vllm.model_executor.model_loader.tensorizer import ( TensorizerConfig, is_vllm_tensorized, load_with_tensorizer, serialize_vllm_model, tensorizer_weights_iterator) @@ -43,8 +39,6 @@ get_gguf_extra_tensor_names, gguf_quant_weights_iterator, initialize_dummy_weights, np_cache_weights_iterator, pt_weights_iterator, safetensors_weights_iterator) -from vllm.model_executor.models import (has_inner_state, supports_lora, - supports_multimodal) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.utils import is_pin_memory_available @@ -94,85 +88,11 @@ def device_loading_context(module: torch.nn.Module, logger = init_logger(__name__) -def _get_model_initialization_kwargs( - model_class: Type[nn.Module], - lora_config: Optional[LoRAConfig], - multimodal_config: Optional[MultiModalConfig], - scheduler_config: Optional[SchedulerConfig] = None, - pooler_config: Optional[PoolerConfig] = None) -> Dict[str, Any]: - """Get extra kwargs for model initialization.""" - extra_kwargs: Dict[str, Any] = {} - - if supports_lora(model_class): - # lora_config=None is used to disable LoRA - extra_kwargs["lora_config"] = lora_config - elif lora_config: - raise ValueError( - f"Model {model_class.__name__} does not support LoRA, " - "but LoRA is enabled. Support for this model may " - "be added in the future. 
If this is important to you, " - "please open an issue on github.") - - if supports_multimodal(model_class): - assert multimodal_config is not None - - extra_kwargs["multimodal_config"] = multimodal_config - - if has_inner_state(model_class) and scheduler_config: - extra_kwargs["scheduler_config"] = scheduler_config - if pooler_config: - extra_kwargs["pooler_config"] = pooler_config - return extra_kwargs - - -def build_model(model_class: Type[nn.Module], - vllm_config: Optional[VllmConfig], - hf_config: PretrainedConfig, - cache_config: Optional[CacheConfig], - quant_config: Optional[QuantizationConfig], - *, - lora_config: Optional[LoRAConfig], - multimodal_config: Optional[MultiModalConfig], - scheduler_config: Optional[SchedulerConfig], - prefix: Optional[str] = None, - pooler_config: Optional[PoolerConfig] = None) -> nn.Module: - extra_kwargs = _get_model_initialization_kwargs(model_class, lora_config, - multimodal_config, - scheduler_config, - pooler_config) - if prefix: - extra_kwargs["prefix"] = prefix - - # TODO: unify all the module initialization code - # to only take the `VllmConfig` object as input - from vllm.plugins import set_vllm_config - set_vllm_config(vllm_config) - - return model_class(config=hf_config, - cache_config=cache_config, - quant_config=quant_config, - **extra_kwargs) - - def _initialize_model(vllm_config: VllmConfig) -> nn.Module: """Initialize a model with the given configurations.""" model_config = vllm_config.model_config - lora_config = vllm_config.lora_config - scheduler_config = vllm_config.scheduler_config - cache_config = vllm_config.cache_config model_class, _ = get_model_architecture(model_config) - - return build_model( - model_class, - vllm_config, - model_config.hf_config, - cache_config=cache_config, - quant_config=vllm_config.quant_config, - lora_config=lora_config, - multimodal_config=model_config.multimodal_config, - scheduler_config=scheduler_config, - pooler_config=model_config.pooler_config, - ) + return model_class(vllm_config=vllm_config) class BaseModelLoader(ABC): @@ -486,24 +406,18 @@ def _load_model_serialized( device_config = vllm_config.device_config model_config = vllm_config.model_config - lora_config = vllm_config.lora_config - cache_config = vllm_config.cache_config with set_default_torch_dtype(model_config.dtype): with torch.device(device_config.device): model_class = get_model_architecture(model_config)[0] - quant_config = vllm_config.quant_config - extra_kwargs = _get_model_initialization_kwargs( - model_class, lora_config, model_config.multimodal_config) - extra_kwargs["quant_config"] = quant_config - extra_kwargs["cache_config"] = cache_config tensorizer_config = copy.copy(self.tensorizer_config) tensorizer_config.model_class = model_class tensorizer_config.hf_config = model_config.hf_config tensorizer_config.dtype = model_config.dtype - model = load_with_tensorizer(tensorizer_config, **extra_kwargs) + model = load_with_tensorizer(tensorizer_config, + vllm_config=vllm_config) return model.eval() def download_model(self, model_config: ModelConfig) -> None: diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py index 437d2772e1f28..c48b287ed181a 100644 --- a/vllm/model_executor/model_loader/tensorizer.py +++ b/vllm/model_executor/model_loader/tensorizer.py @@ -17,8 +17,6 @@ from vllm.engine.arg_utils import EngineArgs from vllm.engine.llm_engine import LLMEngine from vllm.logger import init_logger -from vllm.model_executor.layers.quantization.base_config import ( - 
QuantizationConfig) from vllm.model_executor.layers.vocab_parallel_embedding import ( VocabParallelEmbedding) from vllm.utils import FlexibleArgumentParser @@ -268,8 +266,7 @@ class TensorizerAgent: in vllm/model_executor/model_loader/weight_utils.py """ - def __init__(self, tensorizer_config: TensorizerConfig, - quant_config: QuantizationConfig, **extra_kwargs): + def __init__(self, tensorizer_config: TensorizerConfig, vllm_config): if tensorizer_error_msg is not None: raise ImportError( "Tensorizer is not installed. Please install tensorizer " @@ -279,11 +276,7 @@ def __init__(self, tensorizer_config: TensorizerConfig, self.tensorizer_config = tensorizer_config self.tensorizer_args = ( self.tensorizer_config._construct_tensorizer_args()) - self.extra_kwargs = extra_kwargs - if extra_kwargs.get("quant_config") is not None: - self.quant_config = extra_kwargs["quant_config"] - else: - self.quant_config = quant_config + self.vllm_config = vllm_config self.model = self._init_model() def _init_model(self): @@ -293,9 +286,7 @@ def _init_model(self): assert self.tensorizer_config.model_class is not None with no_init_or_tensor(): return self.tensorizer_config.model_class( - config=model_args, - quant_config=self.quant_config, - **self.extra_kwargs) + vllm_config=self.vllm_config, ) def _resize_lora_embeddings(self): """Modify LoRA embedding layers to use bigger tensors diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index 4fec314a70aa4..997554f7dcccd 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -6,7 +6,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) @@ -415,14 +415,16 @@ def forward( class ArcticForCausalLM(nn.Module, SupportsPP): - def __init__(self, - config: ArcticConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - **kwargs) -> None: + def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config self.config = config - self.model = ArcticModel(config, cache_config, quant_config) + self.model = ArcticModel(config, + cache_config, + quant_config, + prefix=prefix) self.vocab_size = config.vocab_size self.lm_head = ParallelLMHead( self.vocab_size, diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index cce182da4820f..8e1dab71b1f39 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -26,7 +26,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.activation import SiluAndMul @@ -332,14 +332,15 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def __init__( self, - config: PretrainedConfig, - position_embedding: str, - cache_config: Optional[CacheConfig] = None, - quant_config: 
Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", + position_embedding: str = "ROPE", ): super().__init__() - + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config @@ -439,17 +440,14 @@ class BaichuanForCausalLM(BaiChuanBaseForCausalLM): def __init__( self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ): + config = vllm_config.model_config.hf_config if config.hidden_size == 4096: # baichuan2 7b - super().__init__(config, "ROPE", cache_config, quant_config, - lora_config) + super().__init__(vllm_config, prefix, "ROPE") else: # baichuan 13b, baichuan2 13b - super().__init__(config, "ALIBI", cache_config, quant_config, - lora_config) + super().__init__(vllm_config, prefix, "ALIBI") class BaiChuanForCausalLM(BaiChuanBaseForCausalLM): @@ -459,10 +457,7 @@ class BaiChuanForCausalLM(BaiChuanBaseForCausalLM): def __init__( self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ): - super().__init__(config, "ROPE", cache_config, quant_config, - lora_config) + super().__init__(vllm_config, prefix, "ROPE") diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index fd600adceb21c..c6da6a590cf5a 100644 --- a/vllm/model_executor/models/bart.py +++ b/vllm/model_executor/models/bart.py @@ -25,7 +25,7 @@ from transformers.utils import logging from vllm.attention import Attention, AttentionMetadata, AttentionType -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, LoRAConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -810,13 +810,13 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, class BartForConditionalGeneration(nn.Module): base_model_prefix = "model" - def __init__(self, - config: BartConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None): + def __init__(self, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config # currently all existing BART models have `tie_word_embeddings` enabled assert config.tie_word_embeddings self.config = config diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index bfed2929d57d2..2b0f45c5603f5 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -6,7 +6,7 @@ from vllm.attention import Attention, AttentionMetadata, AttentionType from vllm.attention.backends.xformers import XFormersImpl -from vllm.config import CacheConfig, PoolerConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import 
get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -384,12 +384,14 @@ class BertEmbeddingModel(nn.Module): def __init__( self, - config: BertConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - pooler_config: Optional[PoolerConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ) -> None: super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + pooler_config = vllm_config.model_config.pooler_config self.model = BertModel(config, cache_config, quant_config) self._pooler = Pooler.from_config_with_defaults( pooler_config, diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index efd24e7cf40f6..cdc30eda2ab3c 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -8,7 +8,7 @@ apply_chunking_to_forward) from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, MultiModalConfig +from vllm.config import CacheConfig, VllmConfig from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) from vllm.model_executor.layers.activation import get_act_fn @@ -483,14 +483,17 @@ def input_processor_for_blip2(ctx: InputContext, inputs: DecoderOnlyInputs): @INPUT_REGISTRY.register_input_processor(input_processor_for_blip2) class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - def __init__(self, - config: Blip2Config, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None) -> None: + def __init__( + self, + vllm_config: VllmConfig, + prefix: str = "", + ) -> None: super().__init__() - + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config @@ -513,8 +516,7 @@ def __init__(self, self.language_model = init_vllm_registered_model( config.text_config, - cache_config, - quant_config, + vllm_config=vllm_config, prefix="language_model") self.make_empty_intermediate_tensors = ( diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index c2440ee75d588..7540bc23efd88 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -24,7 +24,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.activation import get_act_fn @@ -283,11 +283,13 @@ class BloomForCausalLM(nn.Module, SupportsPP): def __init__( self, - config: BloomConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ): super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config self.transformer = BloomModel(config, cache_config, quant_config) diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py 
index 58841f177ec22..f79bad6190708 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -9,7 +9,7 @@ from transformers import ChameleonConfig, ChameleonVQVAEConfig from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig, MultiModalConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) @@ -926,12 +926,14 @@ class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal, def __init__( self, - config: ChameleonConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ) -> None: super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config self.model = ChameleonModel(config, cache_config, quant_config) diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index eb9c3e3ae785d..c14f2fcb15063 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -11,7 +11,7 @@ from torch.nn import LayerNorm from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig, LoRAConfig, MultiModalConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) @@ -595,14 +595,15 @@ class ChatGLMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, def __init__( self, - config: ChatGLMConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ): super().__init__() - + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.lora_config = lora_config self.multimodal_config = multimodal_config diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 718f26bed443f..e921fa50b099e 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -28,7 +28,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, LoRAConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -334,12 +334,14 @@ class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def __init__( self, - config: CohereConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, + vllm_config: VllmConfig, + 
prefix: str = "", ) -> None: super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config self.config = config # currently all existing command R models have `tie_word_embeddings` # enabled diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index ae43383155ffc..e3b3164cacde3 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -4,7 +4,7 @@ import torch.nn as nn from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.fused_moe import FusedMoE @@ -352,11 +352,13 @@ class DbrxForCausalLM(nn.Module, SupportsPP): def __init__( self, - config: DbrxConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ): super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config self.config = config if config.tie_word_embeddings: raise ValueError( diff --git a/vllm/model_executor/models/decilm.py b/vllm/model_executor/models/decilm.py index 8c9653463858b..3e7005efb39ca 100644 --- a/vllm/model_executor/models/decilm.py +++ b/vllm/model_executor/models/decilm.py @@ -22,13 +22,11 @@ # limitations under the License. """Inference-only DeciLM model compatible with HuggingFace weights.""" -from typing import Iterable, Optional, Tuple +from typing import Iterable, Tuple import torch -from transformers import LlamaConfig -from vllm.config import CacheConfig, LoRAConfig -from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.config import VllmConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.llama import LlamaForCausalLM @@ -55,17 +53,13 @@ class DeciLMForCausalLM(LlamaForCausalLM): def __init__( self, - config: LlamaConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ) -> None: + config = vllm_config.model_config.hf_config config.num_key_value_heads = max(config.num_key_value_heads_per_layer) delattr(config, "num_key_value_heads_per_layer") - super().__init__(config=config, - cache_config=cache_config, - quant_config=quant_config, - lora_config=lora_config) + super().__init__(vllm_config=vllm_config) def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index 53a1c7cfbfef4..c90d3d250e4c5 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -27,7 +27,7 @@ from transformers import PretrainedConfig from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) @@ -385,11 +385,13 @@ class DeepseekForCausalLM(nn.Module, SupportsPP): def __init__( self, - config: 
PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ) -> None: super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config self.model = DeepseekModel(config, cache_config, quant_config) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 95bbf4fb59c6a..0f391d8329a8e 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -28,7 +28,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) @@ -481,11 +481,13 @@ class DeepseekV2ForCausalLM(nn.Module, SupportsPP): def __init__( self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ) -> None: super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config self.model = DeepseekV2Model(config, diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py index a87e1c0228627..6bd73d20d340d 100644 --- a/vllm/model_executor/models/eagle.py +++ b/vllm/model_executor/models/eagle.py @@ -4,6 +4,7 @@ import torch.nn as nn from vllm.attention.backends.abstract import AttentionMetadata +from vllm.config import VllmConfig from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( @@ -12,7 +13,6 @@ from vllm.model_executor.models import ModelRegistry from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.configs.eagle import EAGLEConfig class EAGLE(nn.Module): @@ -34,14 +34,15 @@ class EAGLE(nn.Module): in the draft checkpoint (using key token_map). 
Also, the draft config needs to have truncated_vocab_size (=k) as an attribute.""" - def __init__(self, config: EAGLEConfig, *args, **kwargs) -> None: + def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() + config = vllm_config.model_config.hf_config self.config = config architectures = getattr(self.config.model, "architectures", []) model_cls, _ = ModelRegistry.resolve_model_cls(architectures) - self.model = model_cls(self.config.model, *args, **kwargs) + self.model = model_cls(vllm_config, prefix) self.fc = nn.Linear(config.model.hidden_size * 2, config.model.hidden_size, bias=getattr(self.config, "eagle_fc_bias", False)) diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index a8d591b921cd6..fa6dbfe35b3ad 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -29,7 +29,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, LoRAConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.activation import SiluAndMul @@ -440,12 +440,14 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def __init__( self, - config: ExaoneConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ) -> None: super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index daf49521637b0..96ae119042277 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -27,7 +27,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) @@ -403,11 +403,13 @@ class FalconForCausalLM(nn.Module, SupportsPP): def __init__( self, - config: FalconConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ): super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config self.transformer = FalconModel(config, cache_config, quant_config) diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py index 184bee5f65671..b0d970d9fb572 100644 --- a/vllm/model_executor/models/florence2.py +++ b/vllm/model_executor/models/florence2.py @@ -6,7 +6,7 @@ from transformers import PretrainedConfig from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( 
QuantizationConfig) @@ -189,11 +189,11 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): class Florence2ForConditionalGeneration(nn.Module): - def __init__(self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None): + def __init__(self, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config # TODO(Isotr0py): Add vision backbone self.language_model = Florence2LanguageForConditionalGeneration( diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 653d5d60ea178..cac10f505df67 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -22,14 +22,13 @@ import torch.nn as nn import torch.utils.checkpoint from PIL import Image -from transformers import FuyuConfig, FuyuImageProcessor +from transformers import FuyuImageProcessor from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, MultiModalConfig +from vllm.config import VllmConfig from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) from vllm.model_executor.layers.linear import ColumnParallelLinear -from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.models.persimmon import PersimmonForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -227,12 +226,12 @@ def input_mapper_for_fuyu(ctx: InputContext, data: object): @INPUT_REGISTRY.register_input_processor(input_processor_for_fuyu) class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): - def __init__(self, - config: FuyuConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None) -> None: + def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 1cc3ea679c553..4e0cbfb9cbf58 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -22,7 +22,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import GeluAndMul @@ -374,13 +374,14 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def __init__( self, - config: GemmaConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, + vllm_config: VllmConfig, prefix: str = "", ) -> None: super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config self.config = config # currently all existing Gemma 
models have `tie_word_embeddings` enabled diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 16e0d6b30713a..773d3b72ec418 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -21,7 +21,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig, PoolerConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.logger import init_logger from vllm.model_executor.layers.activation import GeluAndMul @@ -245,12 +245,13 @@ class Gemma2Model(nn.Module): def __init__( self, - config: Gemma2Config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, prefix: str = "", ) -> None: super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config self.config = config self.embed_tokens = VocabParallelEmbedding( @@ -400,11 +401,13 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def __init__( self, - config: Gemma2Config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ) -> None: + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config del lora_config # Unused. super().__init__() self.config = config @@ -470,14 +473,14 @@ class Gemma2EmbeddingModel(nn.Module, SupportsPP): def __init__( self, - pooler_config: Optional[PoolerConfig] = None, - **kwargs, + vllm_config: VllmConfig, + prefix: str = "", ) -> None: super().__init__() - self.model = Gemma2Model(**kwargs) + self.model = Gemma2Model(vllm_config, prefix) self._pooler = Pooler.from_config_with_defaults( - pooler_config, + vllm_config.model_config.pooler_config, pooling_type=PoolingType.LAST, normalize=True, softmax=False) diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index 7f81bbff94932..c3fc47db79986 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -24,7 +24,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed.parallel_state import ( get_pp_group, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.activation import get_act_fn @@ -242,11 +242,13 @@ class GPT2LMHeadModel(nn.Module, SupportsPP): def __init__( self, - config: GPT2Config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ): super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config self.transformer = GPT2Model(config, diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index 4be8e4199f04d..ea1614d966365 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -25,7 +25,7 @@ from 
vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, LoRAConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -260,12 +260,14 @@ class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def __init__( self, - config: GPTBigCodeConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ): super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 834b4aff2e4ba..58cff67c69051 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -23,7 +23,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -231,11 +231,13 @@ class GPTJForCausalLM(nn.Module, SupportsPP): def __init__( self, - config: GPTJConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ): super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config assert not config.tie_word_embeddings diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 1903156d7efe1..27b2577a8cdca 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -23,7 +23,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -244,11 +244,13 @@ class GPTNeoXForCausalLM(nn.Module, SupportsPP): def __init__( self, - config: GPTNeoXConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ): super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config self.gpt_neox = GPTNeoXModel(config, cache_config, quant_config) diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index 8a75b9cb1d55d..c3e23b7138e7f 100644 --- a/vllm/model_executor/models/granite.py +++ 
b/vllm/model_executor/models/granite.py @@ -28,7 +28,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, LoRAConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.activation import SiluAndMul @@ -372,12 +372,14 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def __init__( self, - config: GraniteConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ) -> None: super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index b4da986efabe3..73f7c106e3d39 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -28,7 +28,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, LoRAConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm @@ -335,12 +335,14 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def __init__( self, - config: GraniteMoeConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ) -> None: super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 8004367f8dc08..b676171b556a7 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -26,7 +26,7 @@ from transformers import ProcessorMixin as Idefics3ImageProcessor from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, MultiModalConfig +from vllm.config import CacheConfig, VllmConfig from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) from vllm.logger import init_logger @@ -615,13 +615,16 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal): def __init__( self, - config: Idefics3Config, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ) -> None: super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config + self.config = config self.multimodal_config = multimodal_config diff --git a/vllm/model_executor/models/interfaces_base.py 
b/vllm/model_executor/models/interfaces_base.py index 8d2d422f9891c..7bb43beff255c 100644 --- a/vllm/model_executor/models/interfaces_base.py +++ b/vllm/model_executor/models/interfaces_base.py @@ -11,9 +11,8 @@ if TYPE_CHECKING: from vllm.attention import AttentionMetadata - from vllm.config import CacheConfig + from vllm.config import VllmConfig from vllm.model_executor.layers.pooler import PoolerOutput - from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -39,10 +38,8 @@ class VllmModel(Protocol[C_co, T_co]): def __init__( self, - config: C_co, - *, - cache_config: Optional["CacheConfig"], - quant_config: Optional["QuantizationConfig"], + vllm_config: "VllmConfig", + prefix: str = "", ) -> None: ... @@ -58,20 +55,7 @@ def forward( def _check_vllm_model_init(model: Union[Type[object], object]) -> bool: model_init = model.__init__ - vllm_kws = ("cache_config", "quant_config") - missing_kws = tuple(kw for kw in vllm_kws - if not supports_kw(model_init, kw)) - - if missing_kws and (isinstance(model, type) - and issubclass(model, nn.Module)): - logger.warning( - "The model (%s) is missing " - "vLLM-specific keywords from its initializer: %s", - model, - missing_kws, - ) - - return len(missing_kws) == 0 + return supports_kw(model_init, "vllm_config") def _check_vllm_model_forward(model: Union[Type[object], object]) -> bool: diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 7ddb1e2a1ab10..cbedd0c8a0130 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -7,7 +7,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, split_tensor_along_last_dim, @@ -319,12 +319,13 @@ class InternLM2ForCausalLM(nn.Module, SupportsPP): def __init__( self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, prefix: str = "", ) -> None: super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config self.model = InternLM2Model(config, diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py index 108fc8382049d..f7bc823574034 100644 --- a/vllm/model_executor/models/internlm2_ve.py +++ b/vllm/model_executor/models/internlm2_ve.py @@ -5,7 +5,7 @@ from transformers import PretrainedConfig from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.quantization import QuantizationConfig @@ -161,11 +161,12 @@ class InternLM2VEForCausalLM(InternLM2ForCausalLM): def __init__( self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, prefix: str = "", ) -> None: + config = 
vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config super().__init__(config, cache_config, quant_config) self.model = InternLM2VEModel(config, cache_config, diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 335b11d293acd..42bccf71273b3 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -16,7 +16,7 @@ from transformers import PretrainedConfig from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, MultiModalConfig +from vllm.config import VllmConfig from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) from vllm.model_executor.layers.quantization import (AWQConfig, @@ -410,13 +410,13 @@ def dummy_data( @INPUT_REGISTRY.register_input_processor(input_pipeline.input_processor) class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): - def __init__(self, - config: PretrainedConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None) -> None: + def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config + self.config = config self.multimodal_config = multimodal_config self._patch_quant_config(config, quant_config) @@ -440,8 +440,7 @@ def __init__(self, self.language_model = init_vllm_registered_model( config.text_config, - cache_config, - quant_config, + vllm_config=vllm_config, prefix="language_model") self.mlp1 = self._init_mlp1(config) diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index 23fdca09493b7..ae3f5b01d5cce 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -26,7 +26,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -288,11 +288,13 @@ class JAISLMHeadModel(nn.Module, SupportsPP): def __init__( self, - config: JAISConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ): super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config self.transformer = JAISModel(config, cache_config, quant_config) diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 9b18a1b68f9d3..72eb1017c2868 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -7,7 +7,7 @@ from vllm.attention.backends.abstract import AttentionMetadata from vllm.attention.layer import Attention -from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig +from vllm.config import CacheConfig, LoRAConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm 
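Every model converted in this patch receives the same constructor change that the Jamba hunk just below shows: the old per-argument signature (config, cache_config, quant_config, lora_config, ...) collapses into a single vllm_config plus a prefix, each class unpacks only the sub-configs it actually uses, and the loader simply calls model_class(vllm_config=vllm_config) as in the loader.py hunk earlier in this patch. A minimal sketch of the resulting pattern, using a hypothetical MyForCausalLM rather than any class from the diffs:

    import torch.nn as nn

    from vllm.config import VllmConfig


    class MyForCausalLM(nn.Module):

        def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None:
            super().__init__()
            # Unpack only the sub-configs this model needs.
            config = vllm_config.model_config.hf_config
            cache_config = vllm_config.cache_config
            quant_config = vllm_config.quant_config
            self.config = config
            self.quant_config = quant_config
            # Layers would be built here from config, cache_config and
            # quant_config, exactly as the per-model hunks in this patch do.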
@@ -350,12 +350,14 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA): def __init__( self, - config: JambaConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - scheduler_config: Optional[SchedulerConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ) -> None: + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + scheduler_config = vllm_config.scheduler_config assert not cache_config.enable_prefix_caching, \ "Jamba currently does not support prefix caching" diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 9e8a403b2f1fc..b765912387e2e 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -28,7 +28,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig, PoolerConfig +from vllm.config import CacheConfig, LoRAConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.activation import SiluAndMul @@ -494,15 +494,15 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def __init__( self, - config: LlamaConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, + vllm_config: VllmConfig, prefix: str = "", - pooler_config: Optional[PoolerConfig] = None, ) -> None: super().__init__() - + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + pooler_config = vllm_config.model_config.pooler_config self.config = config self.lora_config = lora_config @@ -654,12 +654,22 @@ class LlamaEmbeddingModel(nn.Module, SupportsLoRA, SupportsPP): def __init__( self, - pooler_config: Optional[PoolerConfig] = None, - **kwargs, + vllm_config: VllmConfig, + prefix: str = "", ) -> None: super().__init__() - self.model = LlamaModel(**kwargs) + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + pooler_config = vllm_config.model_config.pooler_config + + self.model = LlamaModel(config, + cache_config, + quant_config, + lora_config, + prefix=maybe_prefix(prefix, "model")) self._pooler = Pooler.from_config_with_defaults( pooler_config, pooling_type=PoolingType.LAST, diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index bdd67b12a06d8..c98462537728a 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -9,7 +9,7 @@ PretrainedConfig, SiglipVisionConfig) from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, MultiModalConfig +from vllm.config import VllmConfig from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext) from vllm.model_executor.layers.activation import get_act_fn @@ -258,13 +258,13 @@ def init_vision_tower_for_llava( @INPUT_REGISTRY.register_input_processor(input_processor_for_llava) class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - def __init__(self, - config: LlavaConfig, - 
multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None) -> None: + def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config + self.config = config self.multimodal_config = multimodal_config @@ -290,8 +290,7 @@ def __init__(self, self.language_model = init_vllm_registered_model( config.text_config, - cache_config, - quant_config, + vllm_config=vllm_config, prefix="language_model") self.make_empty_intermediate_tensors = ( diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 37b8baa8c6be0..f187f8105b96a 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -11,11 +11,10 @@ from typing_extensions import NotRequired from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, MultiModalConfig, PoolerConfig +from vllm.config import VllmConfig from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext) from vllm.model_executor.layers.pooler import Pooler, PoolingType -from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -282,13 +281,12 @@ def input_processor_for_llava_next(ctx: InputContext, class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - def __init__(self, - config: LlavaNextConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - pooler_config: Optional[PoolerConfig] = None) -> None: + def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + pooler_config = vllm_config.model_config.pooler_config + multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config @@ -308,8 +306,7 @@ def __init__(self, self.language_model = init_vllm_registered_model( config.text_config, - cache_config, - quant_config, + vllm_config=vllm_config, prefix="language_model") # The same model class supports both language generation and embedding diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index 69bfc80a4372c..eceb0c0ab52df 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -10,11 +10,10 @@ SiglipVisionConfig) from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, MultiModalConfig +from vllm.config import VllmConfig from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -254,12 +253,11 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor: 
class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - def __init__(self, - config: LlavaNextVideoConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None) -> None: + def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config @@ -277,8 +275,7 @@ def __init__(self, projector_hidden_act=config.projector_hidden_act) self.language_model = init_vllm_registered_model( config.text_config, - cache_config, - quant_config, + vllm_config=vllm_config, prefix="language_model") self.make_empty_intermediate_tensors = ( diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index ad5d551ee0834..64d373ce91509 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -14,11 +14,10 @@ from typing_extensions import NotRequired from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, MultiModalConfig +from vllm.config import VllmConfig from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) from vllm.model_executor.layers.activation import get_act_fn -from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY @@ -405,12 +404,11 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor: class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - def __init__(self, - config: LlavaOnevisionConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None) -> None: + def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config @@ -424,8 +422,7 @@ def __init__(self, self.multi_modal_projector = LlavaOnevisionMultiModalProjector(config) self.language_model = init_vllm_registered_model( config.text_config, - cache_config, - quant_config, + vllm_config=vllm_config, prefix="language_model") self.image_newline = nn.Parameter( torch.empty(config.text_config.hidden_size)) diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index 91161957642f9..49e43f8cc683c 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -6,7 +6,7 @@ from transformers import MambaConfig from vllm.attention.backends.abstract import AttentionMetadata -from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig +from vllm.config import CacheConfig, LoRAConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -132,12 +132,14 @@ class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree): def __init__( self, - config: 
MambaConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - scheduler_config: Optional[SchedulerConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ) -> None: + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + scheduler_config = vllm_config.scheduler_config assert not cache_config.enable_prefix_caching, \ "Mamba does not support prefix caching" diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py index 619a5cd00d6b6..4cb1b4a929b9f 100644 --- a/vllm/model_executor/models/medusa.py +++ b/vllm/model_executor/models/medusa.py @@ -3,13 +3,13 @@ import torch import torch.nn as nn +from vllm.config import VllmConfig from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead) from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.transformers_utils.configs.medusa import MedusaConfig class ResidualBlock(nn.Module): @@ -44,7 +44,8 @@ class Medusa(nn.Module): in the draft checkpoint (using key token_map). Also, the draft config needs to have truncated_vocab_size (=k) as an attribute.""" - def __init__(self, config: MedusaConfig, **_) -> None: + def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: + config = vllm_config.model_config.hf_config super().__init__() self.config = config self.blocks = nn.ModuleList([ diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 7704431a4d90a..559d9c4dd35bf 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -29,7 +29,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, LoRAConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) @@ -463,12 +463,14 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def __init__( self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ) -> None: super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index f8006095e2eb2..9458204c5a038 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -34,7 +34,7 @@ from typing_extensions import NotRequired from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, LoRAConfig, MultiModalConfig +from vllm.config import CacheConfig, VllmConfig from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) from 
vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -385,11 +385,13 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): def __init__( self, - config: PretrainedConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ): + config = vllm_config.model_config.hf_config + multimodal_config = vllm_config.model_config.multimodal_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config super().__init__() # All MiniCPM-V models disable `tie_word_embeddings` but # `PretrainedConfig.tie_word_embeddings` defaults to True; we cannot @@ -701,12 +703,10 @@ class MiniCPMV2_0(MiniCPMVBaseModel): def __init__( self, - config: PretrainedConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ): - super().__init__(config, multimodal_config, cache_config, quant_config) + super().__init__(vllm_config) assert self.version == (2, 0) def init_llm( @@ -867,13 +867,10 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA): def __init__( self, - config: PretrainedConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ): - super().__init__(config, multimodal_config, cache_config, quant_config) + super().__init__(vllm_config) assert self.version == (2, 5) def init_llm( @@ -1017,12 +1014,10 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA): def __init__( self, - config: PretrainedConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ): - super().__init__(config, multimodal_config, cache_config, quant_config) + super().__init__(vllm_config) assert self.version == (2, 6) def init_llm( @@ -1141,12 +1136,8 @@ class MiniCPMV(MiniCPMVBaseModel, SupportsLoRA): embedding_modules = {} embedding_padding_modules = [] - def __new__(cls, - config: PretrainedConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None): + def __new__(cls, vllm_config: VllmConfig, prefix: str = ""): + config = vllm_config.model_config.hf_config if not hasattr(config, "version"): if config.hidden_size == 2304 and config.query_num == 64: version = (2, 0) @@ -1160,5 +1151,4 @@ def __new__(cls, if instance_class is None: raise ValueError( "Currently, MiniCPMV only supports versions 2.0, 2.5, and 2.6") - return instance_class(config, multimodal_config, cache_config, - quant_config) + return instance_class(vllm_config, prefix=prefix) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index f5c28e7d74811..91ec3228c0d48 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -28,7 +28,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, LoRAConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from 
vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm @@ -334,13 +334,14 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def __init__( self, - config: MixtralConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ) -> None: super().__init__() - + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 007c4e2eabc90..aeac326776392 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -29,7 +29,7 @@ from transformers import MixtralConfig from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) @@ -352,11 +352,13 @@ class MixtralForCausalLM(nn.Module, SupportsPP): def __init__( self, - config: MixtralConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ) -> None: super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config self.model = MixtralModel(config, cache_config, quant_config) diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 18e38daadc93a..14aa515570f38 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -33,7 +33,7 @@ import vllm.distributed.parallel_state as ps from vllm.attention import Attention, AttentionMetadata, AttentionType from vllm.attention.ops.paged_attn import PagedAttention -from vllm.config import CacheConfig, MultiModalConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.inputs import (INPUT_REGISTRY, DummyData, EncoderDecoderInputs, InputContext, TokenInputs, token_inputs) @@ -1108,12 +1108,15 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal): "up_proj": ("gate_up_proj", 1), } - def __init__(self, - config: config_mllama.MllamaConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None): + def __init__( + self, + vllm_config: VllmConfig, + prefix: str = "", + ) -> None: super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config self.vocab_size = config.text_config.vocab_size self.hidden_size = config.text_config.hidden_size self.max_num_tiles = config.vision_config.max_num_tiles diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 5f2f61cc610b3..cd462c4d0495e 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -3,8 +3,7 @@ from array import array from dataclasses import dataclass from functools import lru_cache, partial -from 
typing import (Any, Iterable, List, Mapping, Optional, Tuple, TypedDict, - Union) +from typing import Iterable, List, Mapping, Optional, Tuple, TypedDict, Union import torch from einops import rearrange @@ -16,7 +15,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.attention.selector import _Backend from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, MultiModalConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, split_tensor_along_last_dim, @@ -1027,13 +1026,14 @@ class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): def __init__( self, - config: PretrainedConfig, - multimodal_config: Optional[MultiModalConfig] = None, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[Mapping[str, Any]] = None, + vllm_config: VllmConfig, + prefix: str = "", ) -> None: super().__init__() - + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index b3977812cb273..672c8e9c22260 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -7,7 +7,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.activation import get_act_fn @@ -269,11 +269,13 @@ class MPTForCausalLM(nn.Module, SupportsPP): def __init__( self, - config: MPTConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - ): + vllm_config: VllmConfig, + prefix: str = "", + ) -> None: super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config self.config = config assert config.tie_word_embeddings self.quant_config = quant_config diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 8d128a42b14b8..5991cce642981 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -27,7 +27,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, LoRAConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -403,13 +403,14 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def __init__( self, - config: NemotronConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ) -> None: super().__init__() - + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = 
vllm_config.lora_config assert isinstance(config, NemotronConfig) self.config = config diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 545d86eebb5ec..6905f8521a8c3 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -28,7 +28,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -291,11 +291,15 @@ class OlmoForCausalLM(nn.Module, SupportsPP): Extremely barebones HF model wrapper. """ - def __init__(self, - config: OlmoConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None): + def __init__( + self, + vllm_config: VllmConfig, + prefix: str = "", + ) -> None: super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config self.config = config self.model = OlmoModel(config, cache_config, quant_config) if config.tie_word_embeddings: diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index de30b5270e7e8..8fa90d17003af 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -18,7 +18,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm @@ -311,11 +311,13 @@ class OlmoeForCausalLM(nn.Module, SupportsPP): def __init__( self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ) -> None: super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config self.model = OlmoeModel(config, cache_config, quant_config) diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index a453376d02552..d378956b68cfc 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -24,7 +24,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -344,11 +344,13 @@ class OPTForCausalLM(nn.Module, SupportsPP): def __init__( self, - config: OPTConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, prefix: str = "", - ): + ) -> None: + super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = 
vllm_config.quant_config super().__init__() self.config = config self.quant_config = quant_config diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index d6ec1fb602f05..b400d4e3f5228 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -11,7 +11,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -270,11 +270,13 @@ class OrionForCausalLM(nn.Module, SupportsPP): def __init__( self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ) -> None: super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config self.model = OrionModel(config, cache_config, quant_config) diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 4b6061e113cb2..69b7fe9d56847 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -6,13 +6,11 @@ from transformers import PaliGemmaConfig from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, MultiModalConfig +from vllm.config import VllmConfig from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) from vllm.logger import init_logger -from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.models.gemma import GemmaForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.utils import cached_get_tokenizer @@ -21,7 +19,8 @@ from .interfaces import SupportsMultiModal, SupportsPP from .siglip import (SiglipVisionModel, dummy_image_for_siglip, dummy_seq_data_for_siglip, get_max_siglip_image_tokens) -from .utils import AutoWeightsLoader, merge_multimodal_embeddings +from .utils import (AutoWeightsLoader, init_vllm_registered_model, + merge_multimodal_embeddings) logger = init_logger(__name__) @@ -132,13 +131,15 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor: class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - def __init__(self, - config: PaliGemmaConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None) -> None: + def __init__( + self, + vllm_config: VllmConfig, + prefix: str = "", + ) -> None: super().__init__() - + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config @@ -150,10 +151,11 @@ def __init__(self, projection_dim=config.vision_config.projection_dim) self.quant_config = quant_config - self.language_model = GemmaForCausalLM(config.text_config, - cache_config, - quant_config, - prefix="language_model") + 
config.text_config.architectures = ["GemmaForCausalLM"] + self.language_model = init_vllm_registered_model( + config.text_config, + vllm_config=vllm_config, + prefix="language_model") logit_scale = getattr(config, "logit_scale", 1.0) self.language_model.logits_processor.scale *= logit_scale diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index 11e7c8abd4888..a86e2c1b4e4a1 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -27,7 +27,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -265,11 +265,15 @@ def forward( class PersimmonForCausalLM(nn.Module, SupportsPP): - def __init__(self, - config: PersimmonConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None): + def __init__( + self, + vllm_config: VllmConfig, + prefix: str = "", + ) -> None: super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config self.config = config self.vocab_size = config.vocab_size self.model = PersimmonModel(config, diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 4dae6e323654b..fef921528b042 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -42,7 +42,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -279,13 +279,14 @@ class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def __init__( self, - config: PhiConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - ): + vllm_config: VllmConfig, + prefix: str = "", + ) -> None: super().__init__() - + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config self.config = config # lm_head use bias, cannot share word embeddings assert not config.tie_word_embeddings diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py index 92bf0e61448e5..de1b09eba6c6d 100644 --- a/vllm/model_executor/models/phi3_small.py +++ b/vllm/model_executor/models/phi3_small.py @@ -6,7 +6,7 @@ from transformers.configuration_utils import PretrainedConfig from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -365,12 +365,13 @@ class Phi3SmallForCausalLM(nn.Module, SupportsPP): def __init__( self, - config: 
PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - ): + vllm_config: VllmConfig, + prefix: str = "", + ) -> None: super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config self.model = Phi3SmallModel(config, cache_config, quant_config) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index a84d6b317b479..65131d61673a3 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -25,8 +25,7 @@ from transformers import CLIPVisionConfig, PretrainedConfig from vllm.attention import AttentionMetadata -from vllm.config import (CacheConfig, ModelConfig, MultiModalConfig, - PoolerConfig) +from vllm.config import ModelConfig, VllmConfig from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) from vllm.logger import init_logger @@ -526,14 +525,16 @@ def input_processor_for_phi3v(ctx: InputContext, @INPUT_REGISTRY.register_input_processor(input_processor_for_phi3v) class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): - def __init__(self, - config: PretrainedConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - pooler_config: Optional[PoolerConfig] = None) -> None: + def __init__( + self, + vllm_config: VllmConfig, + prefix: str = "", + ) -> None: super().__init__() - + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + pooler_config = vllm_config.model_config.pooler_config + multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config self.image_token_id = _IMAGE_TOKEN_ID @@ -552,8 +553,8 @@ def __init__(self, # The prefix is empty intentionally because default prefix of # LlamaForCausalLM is "model" - self.language_model = LlamaForCausalLM(config, cache_config, - quant_config) + self.language_model = LlamaForCausalLM(vllm_config=vllm_config, + prefix="") # The same model class supports both language generation and embedding # because the architecture name is the same diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 19e2621ead996..17d00c0ede2b2 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -28,7 +28,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, LoRAConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import (QKVParallelLinear, @@ -531,13 +531,14 @@ class PhiMoEForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def __init__( self, - config: PhiMoEConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ) -> None: super().__init__() - + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = 
vllm_config.lora_config self.config = config self.lora_config = lora_config diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index de935fc420472..93919c9c051c0 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -9,14 +9,14 @@ import torch.nn.functional as F from mistral_common.protocol.instruct.messages import ImageChunk from PIL import Image -from transformers import PixtralVisionConfig, PretrainedConfig +from transformers import PixtralVisionConfig from transformers.models.pixtral.image_processing_pixtral import ( _num_image_tokens) from transformers.models.pixtral.modeling_pixtral import ( PixtralRotaryEmbedding, apply_rotary_pos_emb, position_ids_in_meshgrid) from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, ModelConfig, MultiModalConfig +from vllm.config import ModelConfig, VllmConfig from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) from vllm.model_executor.layers.activation import get_act_and_mul_fn @@ -152,13 +152,14 @@ def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs): class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - def __init__(self, - config: PretrainedConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None) -> None: + def __init__( + self, + vllm_config: VllmConfig, + prefix: str = "", + ) -> None: super().__init__() - + config = vllm_config.model_config.hf_config + multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config @@ -174,8 +175,7 @@ def __init__(self, # init MistralForCausalLM self.language_model = init_vllm_registered_model( config.text_config, - cache_config, - quant_config, + vllm_config=vllm_config, prefix="language_model") self.vision_encoder = VisionTransformer(self.vision_args) diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 1db7e2ba1cc12..d3f10ee7c85ca 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -20,7 +20,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig, MultiModalConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) @@ -867,13 +867,14 @@ class QWenBaseModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): def __init__( self, - config: PretrainedConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - ): + vllm_config: VllmConfig, + prefix: str = "", + ) -> None: super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config self.quant_config = quant_config @@ -1064,17 +1065,13 @@ class QWenLMHeadModel(QWenBaseModel, SupportsLoRA): def __new__( cls, - config: PretrainedConfig, - multimodal_config: MultiModalConfig, - cache_config: 
Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - ): + vllm_config: VllmConfig, + prefix: str = "", + ) -> None: + config = vllm_config.model_config.hf_config # Initialize VL if hasattr(config, "visual"): - return QWenVL(config, multimodal_config, cache_config, - quant_config, lora_config) + return QWenVL(vllm_config) # Initialize LLM else: - return QWenLLM(config, multimodal_config, cache_config, - quant_config, lora_config) + return QWenLLM(vllm_config) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 1e99c1b13b31f..b0156a25ca5cf 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -29,7 +29,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm @@ -405,12 +405,14 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def __init__( self, - config: Qwen2Config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, + vllm_config: VllmConfig, prefix: str = "", ) -> None: + super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config # TODO (@robertgshaw2): see if this can be moved out if (cache_config.sliding_window is not None and hasattr(config, "max_window_layers")): @@ -423,8 +425,6 @@ def __init__( config.num_hidden_layers, )) - super().__init__() - self.config = config self.lora_config = lora_config diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 18cf45b3939f7..1057720e8c308 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -26,16 +26,14 @@ import numpy as np import torch import torch.nn as nn -from transformers import Qwen2AudioConfig, Qwen2AudioEncoder +from transformers import Qwen2AudioEncoder from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, MultiModalConfig +from vllm.config import VllmConfig from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) from vllm.logger import init_logger from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import ( @@ -266,13 +264,16 @@ def input_mapper_for_qwen2_audio( class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - def __init__(self, - config: Qwen2AudioConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None) -> None: + def __init__( + self, + vllm_config: VllmConfig, + prefix: str = "", + ) -> None: super().__init__() - + config = vllm_config.model_config.hf_config + cache_config = 
vllm_config.cache_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config diff --git a/vllm/model_executor/models/qwen2_cls.py b/vllm/model_executor/models/qwen2_cls.py index b9e3b74c477e2..25ecf76e35f22 100644 --- a/vllm/model_executor/models/qwen2_cls.py +++ b/vllm/model_executor/models/qwen2_cls.py @@ -8,14 +8,11 @@ import torch from torch import nn -from transformers import Qwen2Config from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, LoRAConfig, PoolerConfig +from vllm.config import VllmConfig from vllm.model_executor.layers.linear import RowParallelLinear from vllm.model_executor.layers.pooler import Pooler, PoolingType -from vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) from vllm.model_executor.models.qwen2 import Qwen2Model from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors, PoolerOutput @@ -48,12 +45,15 @@ class Qwen2ForSequenceClassification(nn.Module): def __init__( self, - config: Qwen2Config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - pooler_config: Optional[PoolerConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ) -> None: + super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + pooler_config = vllm_config.model_config.pooler_config # TODO (@robertgshaw2): see if this can be moved out if (cache_config.sliding_window is not None and hasattr(config, "max_window_layers")): @@ -66,8 +66,6 @@ def __init__( config.num_hidden_layers, )) - super().__init__() - self.config = config self.lora_config = lora_config diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index c8c48c0894c36..b1177f9c59063 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -30,7 +30,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) @@ -379,11 +379,13 @@ class Qwen2MoeForCausalLM(nn.Module, SupportsPP): def __init__( self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ) -> None: super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config self.model = Qwen2MoeModel(config, cache_config, quant_config) diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 0fbf305da8b94..1f9411241bdd6 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -7,14 +7,12 @@ import torch from torch import nn -from transformers import Qwen2Config from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, LoRAConfig, PoolerConfig +from vllm.config import VllmConfig from vllm.model_executor.layers.linear 
import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.pooler import Pooler, PoolingType -from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors, PoolerOutput @@ -59,12 +57,15 @@ class Qwen2ForRewardModel(nn.Module, SupportsPP): def __init__( self, - config: Qwen2Config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - pooler_config: Optional[PoolerConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ) -> None: + super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + pooler_config = vllm_config.model_config.pooler_config # TODO (@robertgshaw2): see if this can be moved out if (cache_config.sliding_window is not None and hasattr(config, "max_window_layers")): @@ -77,8 +78,6 @@ def __init__( config.num_hidden_layers, )) - super().__init__() - self.config = config self.lora_config = lora_config diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 8dd75c9ee7e7b..ab80c1494d067 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -40,7 +40,7 @@ from vllm.attention import AttentionMetadata from vllm.attention.selector import _Backend -from vllm.config import CacheConfig, LoRAConfig, MultiModalConfig +from vllm.config import VllmConfig from vllm.distributed import get_pp_group, parallel_state from vllm.distributed import utils as dist_utils from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, @@ -966,15 +966,16 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, embedding_modules = {} embedding_padding_modules = [] - def __init__(self, - config: Qwen2VLConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None) -> None: - + def __init__( + self, + vllm_config: VllmConfig, + prefix: str = "", + ) -> None: super().__init__() - + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config assert not cache_config.enable_prefix_caching, \ "Qwen2-VL currently does not support prefix caching" diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 931e48a44f631..ffabac8292dbd 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -29,7 +29,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, LoRAConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.activation import SiluAndMul @@ -411,13 +411,14 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def __init__( self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ) 
-> None: super().__init__() - + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 4cb55506bb237..975d316977c37 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -25,7 +25,7 @@ from transformers import PretrainedConfig from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -247,11 +247,13 @@ class StablelmForCausalLM(nn.Module, SupportsPP): def __init__( self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ) -> None: super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config self.model = StableLMEpochModel(config, cache_config, quant_config) diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 0b0e3f21065b4..ae61aa4e248a5 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -25,7 +25,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -245,11 +245,15 @@ def forward( class Starcoder2ForCausalLM(nn.Module, SupportsPP): - def __init__(self, - config: Starcoder2Config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None): + def __init__( + self, + vllm_config: VllmConfig, + prefix: str = "", + ) -> None: super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config self.config = config self.model = Starcoder2Model(config, cache_config, diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 6b7a638585ad9..d47f0091e0f9f 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -15,12 +15,11 @@ from transformers.models.whisper.modeling_whisper import WhisperEncoder from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, MultiModalConfig +from vllm.config import VllmConfig from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.loader import DefaultModelLoader from 
vllm.model_executor.sampling_metadata import SamplingMetadata @@ -340,12 +339,14 @@ def forward( @INPUT_REGISTRY.register_input_processor(input_processor_for_ultravox) class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP): - def __init__(self, - config: UltravoxConfig, - multimodal_config: MultiModalConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional["QuantizationConfig"] = None): + def __init__( + self, + vllm_config: VllmConfig, + prefix: str = "", + ) -> None: super().__init__() + config = vllm_config.model_config.hf_config + multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multi_modal_config = multimodal_config assert self.multi_modal_config @@ -361,10 +362,7 @@ def __init__(self, )) self.multi_modal_projector = UltravoxProjector(config) self.language_model = init_vllm_registered_model( - config.text_config, - cache_config, - quant_config, - prefix="language_model") + config.text_config, vllm_config, prefix="language_model") if config.text_model_id is not None: self.secondary_weights.append( DefaultModelLoader.Source(model_or_path=config.text_model_id, diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index fee97e8922a76..60eeceb18bcf0 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -11,11 +11,8 @@ import vllm.envs as envs from vllm.attention.selector import (_Backend, backend_name_to_enum, get_global_forced_attn_backend) -from vllm.config import (CacheConfig, LoRAConfig, MultiModalConfig, - SchedulerConfig) +from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.model_executor.model_loader.loader import build_model from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models import ModelRegistry from vllm.multimodal.base import MultiModalPlaceholderMap, NestedTensors @@ -236,12 +233,7 @@ def load_weights( def init_vllm_registered_model( hf_config: PretrainedConfig, - cache_config: Optional[CacheConfig], - quant_config: Optional[QuantizationConfig], - *, - lora_config: Optional[LoRAConfig] = None, - multimodal_config: Optional[MultiModalConfig] = None, - scheduler_config: Optional[SchedulerConfig] = None, + vllm_config: VllmConfig, prefix: str = "", ) -> nn.Module: """ @@ -249,18 +241,11 @@ def init_vllm_registered_model( based on the arguments passed to the outer vLLM model. 
""" model_class, _ = ModelRegistry.resolve_model_cls(hf_config.architectures) + import copy + copied_config = copy.deepcopy(vllm_config) + copied_config.model_config.hf_config = hf_config - return build_model( - model_class, - None, - hf_config, - cache_config, - quant_config, - lora_config=lora_config, - multimodal_config=multimodal_config, - scheduler_config=scheduler_config, - prefix=prefix, - ) + return model_class(vllm_config=copied_config, prefix=prefix) @overload diff --git a/vllm/model_executor/models/xverse.py b/vllm/model_executor/models/xverse.py index 1d08b382b0b00..7afb99176077b 100644 --- a/vllm/model_executor/models/xverse.py +++ b/vllm/model_executor/models/xverse.py @@ -27,7 +27,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.layernorm import RMSNorm @@ -225,13 +225,14 @@ class XverseModel(nn.Module): def __init__( self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, + vllm_config: VllmConfig, prefix: str = "", ) -> None: super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config self.config = config self.padding_idx = config.pad_token_id lora_vocab = (lora_config.lora_extra_vocab_size * @@ -316,13 +317,16 @@ class XverseForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def __init__( self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, + vllm_config: VllmConfig, + prefix: str = "", ) -> None: super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.config = config self.lora_config = lora_config diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 3336569f59467..8373e11cfff9f 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -61,15 +61,3 @@ def set_compilation_config(config: Optional[CompilationConfig]): def get_compilation_config() -> Optional[CompilationConfig]: return _compilation_config - - -_vllm_config: Optional[VllmConfig] = None - - -def set_vllm_config(config: Optional[VllmConfig]): - global _vllm_config - _vllm_config = config - - -def get_vllm_config() -> Optional[VllmConfig]: - return _vllm_config From 8e1529dc573c9b4697fca24944918b8d68fd5906 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Sat, 9 Nov 2024 00:26:52 -0600 Subject: [PATCH 064/183] [CI/Build] Add run-hpu-test.sh script (#10167) Signed-off-by: Chendi.Xue --- .buildkite/run-hpu-test.sh | 16 ++++++++++++++++ Dockerfile.hpu | 2 ++ 2 files changed, 18 insertions(+) create mode 100644 .buildkite/run-hpu-test.sh diff --git a/.buildkite/run-hpu-test.sh b/.buildkite/run-hpu-test.sh new file mode 100644 index 0000000000000..4505dc7a9373c --- /dev/null +++ b/.buildkite/run-hpu-test.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# This script build the CPU docker image and run the offline inference inside the container. 
+# It serves a sanity check for compilation and basic model usage. +set -ex + +# Try building the docker image +docker build -t hpu-test-env -f Dockerfile.hpu . + +# Setup cleanup +remove_docker_container() { docker rm -f hpu-test || true; } +trap remove_docker_container EXIT +remove_docker_container + +# Run the image and launch offline inference +docker run --runtime=habana --name=hpu-test --network=host -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py \ No newline at end of file diff --git a/Dockerfile.hpu b/Dockerfile.hpu index f481c8c6a57bf..d18fc016387bf 100644 --- a/Dockerfile.hpu +++ b/Dockerfile.hpu @@ -13,4 +13,6 @@ RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install WORKDIR /workspace/ +RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks + ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] From f192aeba74ebf5a6d1a0fccc9a84e8fe99f8c619 Mon Sep 17 00:00:00 2001 From: bnellnm <49004751+bnellnm@users.noreply.github.com> Date: Sat, 9 Nov 2024 03:01:27 -0500 Subject: [PATCH 065/183] [Bugfix] Enable some fp8 and quantized fullgraph tests (#10171) Signed-off-by: Bill Nell --- tests/compile/utils.py | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/tests/compile/utils.py b/tests/compile/utils.py index 95cad19126df6..222c63a342a4b 100644 --- a/tests/compile/utils.py +++ b/tests/compile/utils.py @@ -9,29 +9,26 @@ TEST_MODELS = [ ("facebook/opt-125m", {}), - # TODO: add fake implementation for compressed-tensors - # ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", { - # "dtype": torch.float16, - # "quantization": "compressed-tensors" - # }), + ("nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change", { + "dtype": torch.float16, + "quantization": "compressed-tensors" + }), ("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", { "dtype": torch.float16, "quantization": "fp8" }), - # TODO: add fake implementation for compressed-tensors - # ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", { - # "quantization": "compressed-tensors" - # }), + ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples", { + "quantization": "compressed-tensors" + }), ("meta-llama/Meta-Llama-3-8B", {}), ] -# TODO: enable in pytorch 2.5 -if False and is_quant_method_supported("aqlm"): # noqa: SIM223 +if is_quant_method_supported("aqlm"): TEST_MODELS.append(("ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf", { "quantization": "aqlm" })) -# TODO: enable in pytorch 2.5 +# TODO: figure out why this fails. if False and is_quant_method_supported("gguf"): # noqa: SIM223 TEST_MODELS.append(("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", { "quantization": "gguf" @@ -71,13 +68,13 @@ def check_full_graph_support(model, os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(optimization_level) os.environ["VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "1" - # Inductor doesn't support fp8 and the base meta llama uses too - # much memory. - quantization = model_kwargs.get("quantization") - if ((quantization == "fp8" or model == "meta-llama/Meta-Llama-3-8B") + # The base meta llama uses too much memory. 
+ if (model == "meta-llama/Meta-Llama-3-8B" and optimization_level >= CompilationLevel.PIECEWISE): return + print(f"MODEL={model}") + prompts = [ "Hello, my name is", "The president of the United States is", From bd46357ad90fdb4263a3155c358d37d32dab127c Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 9 Nov 2024 00:04:50 -0800 Subject: [PATCH 066/183] [bugfix] fix broken tests of mlp speculator (#10177) Signed-off-by: youkaichao --- vllm/model_executor/models/mlp_speculator.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index fde44265414c5..6aa43f22f4c93 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -4,13 +4,13 @@ import torch import torch.nn as nn +from vllm.config import VllmConfig from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.transformers_utils.configs import MLPSpeculatorConfig SQRT2 = 2**0.5 @@ -65,8 +65,9 @@ class MLPSpeculator(nn.Module): https://huggingface.co/ibm-fms and https://huggingface.co/ibm-granite """ - def __init__(self, config: MLPSpeculatorConfig, **kwargs) -> None: + def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() + config = vllm_config.model_config.hf_config self.n_predict = config.n_predict self.vocab_size = config.vocab_size self.emb_dim = config.emb_dim From 8a4358ecb5ba457fad2be0ed930132489eddddf5 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 9 Nov 2024 01:02:54 -0800 Subject: [PATCH 067/183] [doc] explaining the integration with huggingface (#10173) Signed-off-by: youkaichao --- .../source/design/huggingface_integration.rst | 40 +++++++++++++++++++ docs/source/index.rst | 1 + 2 files changed, 41 insertions(+) create mode 100644 docs/source/design/huggingface_integration.rst diff --git a/docs/source/design/huggingface_integration.rst b/docs/source/design/huggingface_integration.rst new file mode 100644 index 0000000000000..20394bd311ea9 --- /dev/null +++ b/docs/source/design/huggingface_integration.rst @@ -0,0 +1,40 @@ +Integration with HuggingFace +=================================== + +This document describes how vLLM integrates with HuggingFace libraries. We will explain step by step what happens under the hood when we run ``vllm serve``. + +Let's say we want to serve the popular QWen model by running ``vllm serve Qwen/Qwen2-7B``. + +1. The ``model`` argument is ``Qwen/Qwen2-7B``. vLLM will first try to locate the config file ``config.json`` using this argument. See the `code snippet `__ for the implementation. + + - If the ``model`` argument is a local path, vLLM will directly read the config file from the path. + + - Otherwise, vLLM will try to read the config from the HuggingFace cache. See `their website `__ for more information on how the HuggingFace cache works. Here, we can also use the argument ``--revision`` to specify the revision of the model in the cache. 
+ + - If neither of the above works, vLLM will download the config file from the HuggingFace model hub, using the ``model`` argument as the model name, the ``--revision`` argument as the revision, and the environment variable ``HF_TOKEN`` as the token to access the model hub. In our case, vLLM will download the `config.json `__ file. + +2. After obtaining the config file, vLLM will load the config into a dictionary. It first `inspects `__ the ``model_type`` field in the config to determine the model type and config class to use. There are some ``model_type`` values that vLLM directly supports; see `here `__ for the list. If the ``model_type`` is not in the list, vLLM will use `AutoConfig.from_pretrained `__ to load the config class, with ``model``, ``--revision``, and ``--trust_remote_code`` as the arguments. + + - HuggingFace also has its own logic to determine the config class to use. It will again use the ``model_type`` field to search for the class name in the transformers library; see `here `__ for the list of supported models. If the ``model_type`` is not found, HuggingFace will use the ``auto_map`` field from the config JSON file to determine the class name. Specifically, it is the ``AutoConfig`` field under ``auto_map``. See `DeepSeek `__ for an example. + + - The ``AutoConfig`` field under ``auto_map`` points to a module path in the model's repository. To create the config class, HuggingFace will import the module and use the ``from_pretrained`` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when ``--trust_remote_code`` is enabled. + +3. After obtaining the config object, vLLM applies some historical patches to the config object. These are mostly related to RoPE configuration; see `here `__ for the implementation. + +4. The config object is `attached `__ as the ``hf_config`` field to vLLM's ``model_config`` object. + +5. After vLLM obtains the config object, it will use the ``architectures`` field to determine the model class to initialize. For ``Qwen/Qwen2-7B``, the ``architectures`` field is ``["Qwen2ForCausalLM"]``. vLLM maintains the mapping from architecture name to model class in `its registry `__. If the architecture name is not found in the registry, it means this model architecture is not supported by vLLM. + +6. Finally, we reach the model class we want to initialize, i.e., the ``Qwen2ForCausalLM`` class in `vLLM's code `__. This class will initialize itself depending on various configs. + +Beyond that, there are two more things vLLM depends on HuggingFace for. + +1. **Tokenizer**: vLLM uses the tokenizer from HuggingFace to tokenize the input text. The tokenizer is loaded using `AutoTokenizer.from_pretrained `__ with the ``model`` argument as the model name and the ``--revision`` argument as the revision. It is also possible to use a tokenizer from another model by specifying the ``--tokenizer`` argument in the ``vllm serve`` command. Other relevant arguments are ``--tokenizer-revision`` and ``--tokenizer-mode``. Please check HuggingFace's documentation for the meaning of these arguments. This part of the logic can be found in the `get_tokenizer `__ function. After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in `get_cached_tokenizer `__. + +2. **Model weight**: vLLM downloads the model weight from the HuggingFace model hub using the ``model`` argument as the model name and the ``--revision`` argument as the revision. 
vLLM provides the argument ``--load-format`` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass ``--load-format dummy`` to skip downloading the weights. + + - It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the `documentation `__ for more information on the safetensors format. + +This completes the integration between vLLM and HuggingFace. + +In summary, vLLM reads the config file ``config.json``, tokenizer, and model weight from the HuggingFace model hub or a local directory. It uses the config class from either vLLM, HuggingFace transformers, or loads the config class from the model's repository. diff --git a/docs/source/index.rst b/docs/source/index.rst index b12e695de37b6..8457d4476a1c4 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -159,6 +159,7 @@ Documentation design/input_processing/model_inputs_index design/kernel/paged_attention design/multimodal/multimodal_index + design/huggingface_integration .. Contributing: contributing to the vLLM project From 9e372664208b4905f7343f1fc76aca758fbf6f8f Mon Sep 17 00:00:00 2001 From: Zhao Yingzhuo <38399296+caijizhuo@users.noreply.github.com> Date: Sat, 9 Nov 2024 18:09:48 +0800 Subject: [PATCH 068/183] bugfix: fix the bug that stream generate not work (#2756) --- vllm/entrypoints/api_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py index f3e80cab62a34..ea3c93f733038 100644 --- a/vllm/entrypoints/api_server.py +++ b/vllm/entrypoints/api_server.py @@ -66,7 +66,7 @@ async def stream_results() -> AsyncGenerator[bytes, None]: prompt + output.text for output in request_output.outputs ] ret = {"text": text_outputs} - yield (json.dumps(ret) + "\0").encode("utf-8") + yield (json.dumps(ret) + "\n").encode("utf-8") if stream: return StreamingResponse(stream_results()) From d88bff1b96c6f4c8abbd3d5ab4758bdc040f7b62 Mon Sep 17 00:00:00 2001 From: cjackal <44624812+cjackal@users.noreply.github.com> Date: Sat, 9 Nov 2024 19:18:29 +0900 Subject: [PATCH 069/183] [Frontend] add `add_request_id` middleware (#9594) Signed-off-by: cjackal <44624812+cjackal@users.noreply.github.com> --- .../serving/openai_compatible_server.md | 26 +++++++++++++++++++ vllm/entrypoints/openai/api_server.py | 8 ++++++ 2 files changed, 34 insertions(+) diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index a196f8b1e574e..9b29ca66022cb 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -62,6 +62,32 @@ completion = client.chat.completions.create( ) ``` +### Extra HTTP Headers + +Only `X-Request-Id` HTTP request header is supported for now. 
+ +```python +completion = client.chat.completions.create( + model="NousResearch/Meta-Llama-3-8B-Instruct", + messages=[ + {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} + ], + extra_headers={ + "x-request-id": "sentiment-classification-00001", + } +) +print(completion._request_id) + +completion = client.completions.create( + model="NousResearch/Meta-Llama-3-8B-Instruct", + prompt="A robot may not injure a human being", + extra_headers={ + "x-request-id": "completion-test", + } +) +print(completion._request_id) +``` + ### Extra Parameters for Completions API The following [sampling parameters (click through to see documentation)](../dev/sampling_params.rst) are supported. diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 917b347ff1161..b8b7912742d45 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -7,6 +7,7 @@ import signal import socket import tempfile +import uuid from argparse import Namespace from contextlib import asynccontextmanager from functools import partial @@ -475,6 +476,13 @@ async def authentication(request: Request, call_next): status_code=401) return await call_next(request) + @app.middleware("http") + async def add_request_id(request: Request, call_next): + request_id = request.headers.get("X-Request-Id") or uuid.uuid4().hex + response = await call_next(request) + response.headers["X-Request-Id"] = request_id + return response + for middleware in args.middleware: module_path, object_name = middleware.rsplit(".", 1) imported = getattr(importlib.import_module(module_path), object_name) From b09895a61843e654088773851a2b1acae4cdf184 Mon Sep 17 00:00:00 2001 From: Krishna Mandal <43015249+KrishnaM251@users.noreply.github.com> Date: Sat, 9 Nov 2024 08:19:27 -0800 Subject: [PATCH 070/183] [Frontend][Core] Override HF `config.json` via CLI (#5836) Signed-off-by: DarkLight1337 Co-authored-by: DarkLight1337 --- tests/test_config.py | 10 ++++-- vllm/config.py | 30 ++++++++++++----- vllm/engine/arg_utils.py | 14 ++++++-- vllm/engine/llm_engine.py | 5 +-- vllm/entrypoints/llm.py | 7 +++- vllm/transformers_utils/config.py | 55 ++++++++++++++----------------- vllm/v1/engine/llm_engine.py | 5 +-- 7 files changed, 73 insertions(+), 53 deletions(-) diff --git a/tests/test_config.py b/tests/test_config.py index 66bdb883657c5..36c426d6c51f6 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -200,8 +200,10 @@ def test_rope_customization(): trust_remote_code=False, dtype="float16", seed=0, - rope_scaling=TEST_ROPE_SCALING, - rope_theta=TEST_ROPE_THETA, + hf_overrides={ + "rope_scaling": TEST_ROPE_SCALING, + "rope_theta": TEST_ROPE_THETA, + }, ) assert getattr(llama_model_config.hf_config, "rope_scaling", None) == TEST_ROPE_SCALING @@ -232,7 +234,9 @@ def test_rope_customization(): trust_remote_code=False, dtype="float16", seed=0, - rope_scaling=TEST_ROPE_SCALING, + hf_overrides={ + "rope_scaling": TEST_ROPE_SCALING, + }, ) assert getattr(longchat_model_config.hf_config, "rope_scaling", None) == TEST_ROPE_SCALING diff --git a/vllm/config.py b/vllm/config.py index bed58fcecb5cb..b902499bf5bdc 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1,5 +1,6 @@ import enum import json +import warnings from dataclasses import dataclass, field from typing import (TYPE_CHECKING, Any, ClassVar, Dict, Final, List, Literal, Mapping, Optional, Set, Tuple, Type, Union) @@ -74,9 +75,6 @@ class ModelConfig: code_revision: The specific revision to use for the model code on 
Hugging Face Hub. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. - rope_scaling: Dictionary containing the scaling configuration for the - RoPE embeddings. When using this flag, don't update - `max_position_embeddings` to the expected new maximum. tokenizer_revision: The specific tokenizer version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. @@ -116,6 +114,7 @@ class ModelConfig: can not be gathered from the vllm arguments. config_format: The config format which shall be loaded. Defaults to 'auto' which defaults to 'hf'. + hf_overrides: Arguments to be forwarded to the HuggingFace config. mm_processor_kwargs: Arguments to be forwarded to the model's processor for multi-modal data, e.g., image processor. pooling_type: Used to configure the pooling method in the embedding @@ -146,7 +145,7 @@ def __init__( allowed_local_media_path: str = "", revision: Optional[str] = None, code_revision: Optional[str] = None, - rope_scaling: Optional[dict] = None, + rope_scaling: Optional[Dict[str, Any]] = None, rope_theta: Optional[float] = None, tokenizer_revision: Optional[str] = None, max_model_len: Optional[int] = None, @@ -164,6 +163,7 @@ def __init__( override_neuron_config: Optional[Dict[str, Any]] = None, config_format: ConfigFormat = ConfigFormat.AUTO, chat_template_text_format: str = "string", + hf_overrides: Optional[Dict[str, Any]] = None, mm_processor_kwargs: Optional[Dict[str, Any]] = None, pooling_type: Optional[str] = None, pooling_norm: Optional[bool] = None, @@ -178,8 +178,22 @@ def __init__( self.seed = seed self.revision = revision self.code_revision = code_revision - self.rope_scaling = rope_scaling - self.rope_theta = rope_theta + + if hf_overrides is None: + hf_overrides = {} + if rope_scaling is not None: + hf_override: Dict[str, Any] = {"rope_scaling": rope_scaling} + hf_overrides.update(hf_override) + msg = ("`--rope-scaling` will be removed in a future release. " + f"'Please instead use `--hf-overrides '{hf_override!r}'`") + warnings.warn(DeprecationWarning(msg), stacklevel=2) + if rope_theta is not None: + hf_override = {"rope_theta": rope_theta} + hf_overrides.update(hf_override) + msg = ("`--rope-theta` will be removed in a future release. " + f"'Please instead use `--hf-overrides '{hf_override!r}'`") + warnings.warn(DeprecationWarning(msg), stacklevel=2) + # The tokenizer version is consistent with the model version by default. 
if tokenizer_revision is None: self.tokenizer_revision = revision @@ -193,8 +207,8 @@ def __init__( self.disable_sliding_window = disable_sliding_window self.skip_tokenizer_init = skip_tokenizer_init self.hf_config = get_config(self.model, trust_remote_code, revision, - code_revision, rope_scaling, rope_theta, - config_format) + code_revision, config_format, + **hf_overrides) self.hf_text_config = get_hf_text_config(self.hf_config) self.encoder_config = self._get_encoder_config() self.hf_image_processor_config = get_hf_image_processor_config( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 8c5b442e9f624..95d55e86e08e8 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -128,8 +128,9 @@ class EngineArgs: disable_log_stats: bool = False revision: Optional[str] = None code_revision: Optional[str] = None - rope_scaling: Optional[dict] = None + rope_scaling: Optional[Dict[str, Any]] = None rope_theta: Optional[float] = None + hf_overrides: Optional[Dict[str, Any]] = None tokenizer_revision: Optional[str] = None quantization: Optional[str] = None enforce_eager: Optional[bool] = None @@ -140,8 +141,9 @@ class EngineArgs: # is intended for expert use only. The API may change without # notice. tokenizer_pool_type: Union[str, Type["BaseTokenizerGroup"]] = "ray" - tokenizer_pool_extra_config: Optional[dict] = None + tokenizer_pool_extra_config: Optional[Dict[str, Any]] = None limit_mm_per_prompt: Optional[Mapping[str, int]] = None + mm_processor_kwargs: Optional[Dict[str, Any]] = None enable_lora: bool = False max_loras: int = 1 max_lora_rank: int = 16 @@ -187,7 +189,6 @@ class EngineArgs: collect_detailed_traces: Optional[str] = None disable_async_output_proc: bool = False override_neuron_config: Optional[Dict[str, Any]] = None - mm_processor_kwargs: Optional[Dict[str, Any]] = None scheduling_policy: Literal["fcfs", "priority"] = "fcfs" # Pooling configuration. @@ -512,6 +513,12 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: help='RoPE theta. Use with `rope_scaling`. In ' 'some cases, changing the RoPE theta improves the ' 'performance of the scaled model.') + parser.add_argument('--hf-overrides', + type=json.loads, + default=EngineArgs.hf_overrides, + help='Extra arguments for the HuggingFace config.' + 'This should be a JSON string that will be ' + 'parsed into a dictionary.') parser.add_argument('--enforce-eager', action='store_true', help='Always use eager-mode PyTorch. 
If False, ' @@ -940,6 +947,7 @@ def create_model_config(self) -> ModelConfig: code_revision=self.code_revision, rope_scaling=self.rope_scaling, rope_theta=self.rope_theta, + hf_overrides=self.hf_overrides, tokenizer_revision=self.tokenizer_revision, max_model_len=self.max_model_len, quantization=self.quantization, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 5d321fc98aeb6..d550b1d244af8 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -248,8 +248,7 @@ def __init__( "Initializing an LLM engine (v%s) with config: " "model=%r, speculative_config=%r, tokenizer=%r, " "skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, " - "override_neuron_config=%s, " - "rope_scaling=%r, rope_theta=%r, tokenizer_revision=%s, " + "override_neuron_config=%s, tokenizer_revision=%s, " "trust_remote_code=%s, dtype=%s, max_seq_len=%d, " "download_dir=%r, load_format=%s, tensor_parallel_size=%d, " "pipeline_parallel_size=%d, " @@ -271,8 +270,6 @@ def __init__( model_config.tokenizer_mode, model_config.revision, model_config.override_neuron_config, - model_config.rope_scaling, - model_config.rope_theta, model_config.tokenizer_revision, model_config.trust_remote_code, model_config.dtype, diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index d8b60a5e01471..f830839776364 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -98,7 +98,10 @@ class LLM: to eager mode. Additionally for encoder-decoder models, if the sequence length of the encoder input is larger than this, we fall back to the eager mode. - disable_custom_all_reduce: See ParallelConfig + disable_custom_all_reduce: See :class:`~vllm.config.ParallelConfig` + disable_async_output_proc: Disable async output processing. + This may result in lower performance. + hf_overrides: Arguments to be forwarded to the HuggingFace config. **kwargs: Arguments for :class:`~vllm.EngineArgs`. 
(See :ref:`engine_args`) @@ -153,6 +156,7 @@ def __init__( max_seq_len_to_capture: int = 8192, disable_custom_all_reduce: bool = False, disable_async_output_proc: bool = False, + hf_overrides: Optional[dict] = None, mm_processor_kwargs: Optional[Dict[str, Any]] = None, # After positional args are removed, move this right below `model` task: TaskOption = "auto", @@ -194,6 +198,7 @@ def __init__( max_seq_len_to_capture=max_seq_len_to_capture, disable_custom_all_reduce=disable_custom_all_reduce, disable_async_output_proc=disable_async_output_proc, + hf_overrides=hf_overrides, mm_processor_kwargs=mm_processor_kwargs, pooling_type=pooling_type, pooling_norm=pooling_norm, diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 6b38ee31c2657..14d9518364d26 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -146,9 +146,8 @@ def get_config( trust_remote_code: bool, revision: Optional[str] = None, code_revision: Optional[str] = None, - rope_scaling: Optional[dict] = None, - rope_theta: Optional[float] = None, config_format: ConfigFormat = ConfigFormat.AUTO, + token: Optional[str] = None, **kwargs, ) -> PretrainedConfig: # Separate model folder from file path for GGUF models @@ -159,39 +158,43 @@ def get_config( model = Path(model).parent if config_format == ConfigFormat.AUTO: - if is_gguf or file_or_path_exists(model, - HF_CONFIG_NAME, - revision=revision, - token=kwargs.get("token")): + if is_gguf or file_or_path_exists( + model, HF_CONFIG_NAME, revision=revision, token=token): config_format = ConfigFormat.HF elif file_or_path_exists(model, MISTRAL_CONFIG_NAME, revision=revision, - token=kwargs.get("token")): + token=token): config_format = ConfigFormat.MISTRAL else: # If we're in offline mode and found no valid config format, then # raise an offline mode error to indicate to the user that they # don't have files cached and may need to go online. # This is conveniently triggered by calling file_exists(). 
- file_exists(model, - HF_CONFIG_NAME, - revision=revision, - token=kwargs.get("token")) + file_exists(model, HF_CONFIG_NAME, revision=revision, token=token) raise ValueError(f"No supported config format found in {model}") if config_format == ConfigFormat.HF: config_dict, _ = PretrainedConfig.get_config_dict( - model, revision=revision, code_revision=code_revision, **kwargs) + model, + revision=revision, + code_revision=code_revision, + token=token, + **kwargs, + ) # Use custom model class if it's in our registry model_type = config_dict.get("model_type") if model_type in _CONFIG_REGISTRY: config_class = _CONFIG_REGISTRY[model_type] - config = config_class.from_pretrained(model, - revision=revision, - code_revision=code_revision) + config = config_class.from_pretrained( + model, + revision=revision, + code_revision=code_revision, + token=token, + **kwargs, + ) else: try: config = AutoConfig.from_pretrained( @@ -199,6 +202,7 @@ def get_config( trust_remote_code=trust_remote_code, revision=revision, code_revision=code_revision, + token=token, **kwargs, ) except ValueError as e: @@ -216,7 +220,7 @@ def get_config( raise e elif config_format == ConfigFormat.MISTRAL: - config = load_params_config(model, revision, token=kwargs.get("token")) + config = load_params_config(model, revision, token=token, **kwargs) else: raise ValueError(f"Unsupported config format: {config_format}") @@ -228,19 +232,6 @@ def get_config( model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[config.model_type] config.update({"architectures": [model_type]}) - for key, value in [ - ("rope_scaling", rope_scaling), - ("rope_theta", rope_theta), - ]: - if value is not None: - logger.info( - "Updating %s from %r to %r", - key, - getattr(config, key, None), - value, - ) - config.update({key: value}) - patch_rope_scaling(config) return config @@ -462,13 +453,15 @@ def _reduce_modelconfig(mc: ModelConfig): def load_params_config(model: Union[str, Path], revision: Optional[str], - token: Optional[str] = None) -> PretrainedConfig: + token: Optional[str] = None, + **kwargs) -> PretrainedConfig: # This function loads a params.json config which # should be used when loading models in mistral format config_file_name = "params.json" config_dict = get_hf_file_to_dict(config_file_name, model, revision, token) + assert isinstance(config_dict, dict) config_mapping = { "dim": "hidden_size", @@ -512,6 +505,8 @@ def recurse_elems(elem: Any): config_dict["architectures"] = ["PixtralForConditionalGeneration"] config_dict["model_type"] = "pixtral" + config_dict.update(kwargs) + config = recurse_elems(config_dict) return config diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 81dc01ae2d8e7..f805c5e69bc1c 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -74,8 +74,7 @@ def __init__( "Initializing an LLM engine (v%s) with config: " "model=%r, speculative_config=%r, tokenizer=%r, " "skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, " - "override_neuron_config=%s, " - "rope_scaling=%r, rope_theta=%r, tokenizer_revision=%s, " + "override_neuron_config=%s, tokenizer_revision=%s, " "trust_remote_code=%s, dtype=%s, max_seq_len=%d, " "download_dir=%r, load_format=%s, tensor_parallel_size=%d, " "pipeline_parallel_size=%d, " @@ -94,8 +93,6 @@ def __init__( model_config.tokenizer_mode, model_config.revision, model_config.override_neuron_config, - model_config.rope_scaling, - model_config.rope_theta, model_config.tokenizer_revision, model_config.trust_remote_code, model_config.dtype, From 
51c2e1fcef59ca42b378c433997c77affd114d30 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 10 Nov 2024 03:39:14 +0800 Subject: [PATCH 071/183] [CI/Build] Split up models tests (#10069) Signed-off-by: DarkLight1337 --- .buildkite/test-pipeline.yaml | 24 ++++++---- pyproject.toml | 1 + .../models/decoder_only/language/test_aqlm.py | 1 + .../models/decoder_only/language/test_fp8.py | 1 + .../models/decoder_only/language/test_gguf.py | 35 +++++++------- .../decoder_only/language/test_gptq_marlin.py | 1 + .../language/test_gptq_marlin_24.py | 1 + .../decoder_only/language/test_granite.py | 3 +- .../decoder_only/language/test_granitemoe.py | 39 ---------------- .../decoder_only/language/test_modelopt.py | 1 + .../decoder_only/language/test_models.py | 4 +- .../mm_processor_kwargs/test_llava_next.py | 4 +- .../mm_processor_kwargs/test_phi3v.py | 3 +- .../mm_processor_kwargs/test_qwen2_vl.py | 15 ++++-- .../{test_internvl.py => test_awq.py} | 19 ++++---- .../vision_language/test_intern_vit.py | 19 ++++---- .../vision_language/test_models.py | 46 +++++++++---------- vllm/config.py | 9 +++- vllm/model_executor/models/fuyu.py | 6 +-- vllm/model_executor/models/internlm2_ve.py | 4 +- vllm/model_executor/models/utils.py | 8 ++-- 21 files changed, 115 insertions(+), 129 deletions(-) delete mode 100644 tests/models/decoder_only/language/test_granitemoe.py rename tests/models/decoder_only/vision_language/{test_internvl.py => test_awq.py} (90%) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 2c5d74e7abcbf..e8456357e6db1 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -305,7 +305,7 @@ steps: ##### models test ##### -- label: Basic Models Test # 3min +- label: Basic Models Test # 10min source_file_dependencies: - vllm/ - tests/models @@ -314,23 +314,24 @@ steps: - pytest -v -s models/test_oot_registration.py # it needs a clean process - pytest -v -s models/*.py --ignore=models/test_oot_registration.py -- label: Decoder-only Language Models Test (Standard) # 35min +- label: Decoder-only Language Models Test (Standard) # 18min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ - tests/models/decoder_only/language commands: - - pytest -v -s models/decoder_only/language/test_models.py + - pytest -v -s models/decoder_only/language -m core_model + - pytest -v -s models/decoder_only/language -m quant_model -- label: Decoder-only Language Models Test (Extended) # 1h20min +- label: Decoder-only Language Models Test (Extended) # 46min nightly: true source_file_dependencies: - vllm/ - tests/models/decoder_only/language commands: - - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py + - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model' -- label: Decoder-only Multi-Modal Models Test (Standard) # 26min +- label: Decoder-only Multi-Modal Models Test (Standard) # 22min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ @@ -339,21 +340,24 @@ steps: commands: - pytest -v -s models/decoder_only/audio_language -m core_model - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m core_model + # No tests under this group for now + # - pytest -v -s models/decoder_only/audio_language -m quant_model + - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m quant_model -- label: Decoder-only Multi-Modal Models Test (Extended) +- label: Decoder-only Multi-Modal 
Models Test (Extended) # 1h10m nightly: true source_file_dependencies: - vllm/ - tests/models/decoder_only/audio_language - tests/models/decoder_only/vision_language commands: - - pytest -v -s models/decoder_only/audio_language -m 'not core_model' + - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model' # HACK - run phi3v tests separately to sidestep this transformers bug # https://github.com/huggingface/transformers/issues/34307 - pytest -v -s models/decoder_only/vision_language/test_phi3v.py - - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model' + - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model' -- label: Other Models Test # 6min +- label: Other Models Test # 20min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ diff --git a/pyproject.toml b/pyproject.toml index 797e7a88ab31b..3c8c46cc8621e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -95,6 +95,7 @@ markers = [ "skip_global_cleanup", "core_model: enable this model test in each PR instead of only nightly", "cpu_model: enable this model test in CPU tests", + "quant_model: run this model test under Quantized category", "distributed_2_gpus: run this test only in distributed tests for 2 GPUs", "skip_v1: do not run this test with v1", ] diff --git a/tests/models/decoder_only/language/test_aqlm.py b/tests/models/decoder_only/language/test_aqlm.py index de46032113086..a8cb5bbf9349e 100644 --- a/tests/models/decoder_only/language/test_aqlm.py +++ b/tests/models/decoder_only/language/test_aqlm.py @@ -38,6 +38,7 @@ ] +@pytest.mark.quant_model @pytest.mark.skipif(not is_quant_method_supported("aqlm"), reason="AQLM is not supported on this GPU type.") @pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"]) diff --git a/tests/models/decoder_only/language/test_fp8.py b/tests/models/decoder_only/language/test_fp8.py index f874bf6c73142..53f23e24511b3 100644 --- a/tests/models/decoder_only/language/test_fp8.py +++ b/tests/models/decoder_only/language/test_fp8.py @@ -15,6 +15,7 @@ os.environ["TOKENIZERS_PARALLELISM"] = "true" +@pytest.mark.quant_model @pytest.mark.skipif(not is_quant_method_supported("fp8"), reason="fp8 is not supported on this GPU type.") @pytest.mark.parametrize( diff --git a/tests/models/decoder_only/language/test_gguf.py b/tests/models/decoder_only/language/test_gguf.py index 5dc83942632fd..2b8f5e2faa45e 100644 --- a/tests/models/decoder_only/language/test_gguf.py +++ b/tests/models/decoder_only/language/test_gguf.py @@ -17,26 +17,21 @@ MAX_MODEL_LEN = 1024 -# FIXME: Move this to confest -MODELS = [ - ("meta-llama/Llama-3.2-1B-Instruct", - hf_hub_download("bartowski/Llama-3.2-1B-Instruct-GGUF", - filename="Llama-3.2-1B-Instruct-Q4_K_M.gguf")), - ("meta-llama/Llama-3.2-1B-Instruct", - hf_hub_download("bartowski/Llama-3.2-1B-Instruct-GGUF", - filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf")), - ("Qwen/Qwen2-1.5B-Instruct", - hf_hub_download("Qwen/Qwen2-1.5B-Instruct-GGUF", - filename="qwen2-1_5b-instruct-q4_k_m.gguf")), - ("Qwen/Qwen2-1.5B-Instruct", - hf_hub_download("legraphista/Qwen2-1.5B-Instruct-IMat-GGUF", - filename="Qwen2-1.5B-Instruct.IQ4_XS.gguf")), -] - @pytest.mark.skipif(not is_quant_method_supported("gguf"), reason="gguf is not supported on this GPU type.") -@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize(("original_model", "gguf_id", "gguf_path"), [ + 
("meta-llama/Llama-3.2-1B-Instruct", + "bartowski/Llama-3.2-1B-Instruct-GGUF", + "Llama-3.2-1B-Instruct-Q4_K_M.gguf"), + ("meta-llama/Llama-3.2-1B-Instruct", + "bartowski/Llama-3.2-1B-Instruct-GGUF", + "Llama-3.2-1B-Instruct-IQ4_XS.gguf"), + ("Qwen/Qwen2-1.5B-Instruct", "Qwen/Qwen2-1.5B-Instruct-GGUF", + "qwen2-1_5b-instruct-q4_k_m.gguf"), + ("Qwen/Qwen2-1.5B-Instruct", "legraphista/Qwen2-1.5B-Instruct-IMat-GGUF", + "Qwen2-1.5B-Instruct.IQ4_XS.gguf"), +]) @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [5]) @@ -45,7 +40,9 @@ def test_models( num_gpus_available, vllm_runner, example_prompts, - model, + original_model, + gguf_id, + gguf_path, dtype: str, max_tokens: int, num_logprobs: int, @@ -54,7 +51,7 @@ def test_models( if num_gpus_available < tp_size: pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}") - original_model, gguf_model = model + gguf_model = hf_hub_download(gguf_id, filename=gguf_path) tokenizer = AutoTokenizer.from_pretrained(original_model) messages = [[{ diff --git a/tests/models/decoder_only/language/test_gptq_marlin.py b/tests/models/decoder_only/language/test_gptq_marlin.py index a896f145c11f1..037411a18c19f 100644 --- a/tests/models/decoder_only/language/test_gptq_marlin.py +++ b/tests/models/decoder_only/language/test_gptq_marlin.py @@ -33,6 +33,7 @@ ] +@pytest.mark.quant_model @pytest.mark.flaky(reruns=3) @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"), reason="gptq_marlin is not supported on this GPU type.") diff --git a/tests/models/decoder_only/language/test_gptq_marlin_24.py b/tests/models/decoder_only/language/test_gptq_marlin_24.py index aa63f9f36a3a8..26cb3ec310701 100644 --- a/tests/models/decoder_only/language/test_gptq_marlin_24.py +++ b/tests/models/decoder_only/language/test_gptq_marlin_24.py @@ -38,6 +38,7 @@ class ModelPair: ] +@pytest.mark.quant_model @pytest.mark.flaky(reruns=2) @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin_24"), reason="Marlin24 is not supported on this GPU type.") diff --git a/tests/models/decoder_only/language/test_granite.py b/tests/models/decoder_only/language/test_granite.py index 0b71f0d49c70a..5e93842f46164 100644 --- a/tests/models/decoder_only/language/test_granite.py +++ b/tests/models/decoder_only/language/test_granite.py @@ -7,7 +7,9 @@ from ...utils import check_logprobs_close MODELS = [ + # TODO(sang): Sliding window should be tested separately. "ibm/PowerLM-3b", + "ibm/PowerMoE-3b", ] @@ -24,7 +26,6 @@ def test_models( max_tokens: int, num_logprobs: int, ) -> None: - # TODO(sang): Sliding window should be tested separately. with hf_runner(model, dtype=dtype) as hf_model: hf_outputs = hf_model.generate_greedy_logprobs_limit( example_prompts, max_tokens, num_logprobs) diff --git a/tests/models/decoder_only/language/test_granitemoe.py b/tests/models/decoder_only/language/test_granitemoe.py deleted file mode 100644 index ba73375229eb3..0000000000000 --- a/tests/models/decoder_only/language/test_granitemoe.py +++ /dev/null @@ -1,39 +0,0 @@ -"""Compare the outputs of HF and vLLM for Granite models using greedy sampling. - -Run `pytest tests/models/test_granite.py`. 
-""" -import pytest - -from ...utils import check_logprobs_close - -MODELS = [ - "ibm/PowerMoE-3b", -] - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["bfloat16"]) -@pytest.mark.parametrize("max_tokens", [64]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - max_tokens: int, - num_logprobs: int, -) -> None: - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy_logprobs_limit( - example_prompts, max_tokens, num_logprobs) - - with vllm_runner(model, dtype=dtype) as vllm_model: - vllm_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) diff --git a/tests/models/decoder_only/language/test_modelopt.py b/tests/models/decoder_only/language/test_modelopt.py index e643b115d0ea8..077e50e3a4dfd 100644 --- a/tests/models/decoder_only/language/test_modelopt.py +++ b/tests/models/decoder_only/language/test_modelopt.py @@ -39,6 +39,7 @@ @pytest.mark.skip( reason= "Prevent unstable test based on golden strings from breaking the build.") +@pytest.mark.quant_model @pytest.mark.skipif(not is_quant_method_supported("fp8"), reason="fp8 is not supported on this GPU type.") @pytest.mark.parametrize("model_name", MODELS) diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index d705909c24bf8..beb1ffb18436e 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -1,8 +1,5 @@ """Compare the outputs of HF and vLLM when using greedy sampling. -This test only tests small models. Big models such as 7B should be tested from -test_big_models.py because it could use a larger instance to run tests. - Run `pytest tests/models/test_models.py`. 
""" import pytest @@ -35,6 +32,7 @@ target_dtype = "half" +@pytest.mark.core_model @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", [target_dtype]) @pytest.mark.parametrize("max_tokens", [32]) diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py index c2d3fda6994f6..51c0085101dd0 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_llava_next.py @@ -56,11 +56,13 @@ def test_dummy_data_for_llava_next_feature_size(dummy_data_for_llava_next, ctx.model_config.hf_config.image_grid_pinpoints = gridpoints seq_len = 5000 # bigger than the max feature size for any image - seq_data, mm_data = dummy_data_for_llava_next( + dummy_data = dummy_data_for_llava_next( ctx, seq_len=seq_len, mm_counts={"image": 1}, ) + seq_data = dummy_data.seq_data + mm_data = dummy_data.multi_modal_data # The dummy data dims should match the gridpoint with the biggest feat size assert mm_data["image"].height == expected_size[0] diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py index d6a7b34fdde9f..60a8f63eb5faa 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_phi3v.py @@ -131,12 +131,13 @@ def test_dummy_data_override(dummy_data_for_phi3v, model: str, num_crops: int, mm_processor_kwargs=None, ) - sequence_data, _, = dummy_data_for_phi3v( + dummy_data = dummy_data_for_phi3v( ctx=ctx, seq_len=8192, # Should be bigger than num_imgs * toks_per_img mm_counts={"image": num_imgs}, num_crops=num_crops, ) + sequence_data = dummy_data.seq_data # Ensure we have the right number of placeholders per num_crops size img_tok_count = sequence_data.get_token_ids().count(_IMAGE_TOKEN_ID) assert img_tok_count == toks_per_img * num_imgs diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py index c23fbedf0c6ae..7e2bea130583e 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen2_vl.py @@ -86,10 +86,17 @@ def test_qwen2_vl_dummy_data(dummy_data_for_qwen2_vl, # NOTE: video value is required, but isn't actually used # when making the dummy data except for error handling currently - seq_data, mm_data = dummy_data_for_qwen2_vl(qwen2_vl_context, seq_len, { - "image": 1, - "video": 0 - }, **mm_processor_kwargs) + dummy_data = dummy_data_for_qwen2_vl( + ctx=qwen2_vl_context, + seq_len=seq_len, + mm_counts={ + "image": 1, + "video": 0 + }, + **mm_processor_kwargs, + ) + seq_data = dummy_data.seq_data + mm_data = dummy_data.multi_modal_data # Ensure we have the right number of placeholders for min/max pixel values assert seq_data.get_token_ids().count(image_token_id) == token_count diff --git a/tests/models/decoder_only/vision_language/test_internvl.py b/tests/models/decoder_only/vision_language/test_awq.py similarity index 90% rename from tests/models/decoder_only/vision_language/test_internvl.py rename to tests/models/decoder_only/vision_language/test_awq.py index 2fd1ac4bb08f7..6e6e5b40d6a35 100644 --- 
a/tests/models/decoder_only/vision_language/test_internvl.py +++ b/tests/models/decoder_only/vision_language/test_awq.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Tuple, Type +from typing import List, Optional, Type import pytest import torch @@ -19,7 +19,8 @@ def run_awq_test( vllm_runner: Type[VllmRunner], image_assets: _ImageAssets, - models: Tuple[str, str], + source_model: str, + quant_model: str, *, size_factors: List[float], dtype: str, @@ -28,8 +29,6 @@ def run_awq_test( tensor_parallel_size: int, distributed_executor_backend: Optional[str] = None, ): - source_model, quant_model = models - images = [asset.pil_image for asset in image_assets] inputs_per_image = [( @@ -84,8 +83,11 @@ def run_awq_test( ) +@pytest.mark.quant_model @pytest.mark.parametrize( - "models", [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")]) + ("source_model", "quant_model"), + [("OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B-AWQ")], +) @pytest.mark.parametrize( "size_factors", [ @@ -103,12 +105,13 @@ def run_awq_test( @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) @torch.inference_mode() -def test_awq_models(vllm_runner, image_assets, models, size_factors, - dtype: str, max_tokens: int, num_logprobs: int) -> None: +def test_awq_models(vllm_runner, image_assets, source_model, quant_model, + size_factors, dtype, max_tokens, num_logprobs) -> None: run_awq_test( vllm_runner, image_assets, - models, + source_model, + quant_model, size_factors=size_factors, dtype=dtype, max_tokens=max_tokens, diff --git a/tests/models/decoder_only/vision_language/test_intern_vit.py b/tests/models/decoder_only/vision_language/test_intern_vit.py index 98f313eb9b9af..32fcb0bbc42f1 100644 --- a/tests/models/decoder_only/vision_language/test_intern_vit.py +++ b/tests/models/decoder_only/vision_language/test_intern_vit.py @@ -11,21 +11,17 @@ # we use snapshot_download to prevent conflicts between # dynamic_module and trust_remote_code for hf_runner DOWNLOAD_PATTERN = ["*.json", "*.py", "*.safetensors", "*.txt", "*.model"] -models = [ - snapshot_download("OpenGVLab/InternViT-300M-448px", - allow_patterns=DOWNLOAD_PATTERN), - snapshot_download("OpenGVLab/InternViT-6B-448px-V1-5", - allow_patterns=DOWNLOAD_PATTERN), -] def run_intern_vit_test( image_assets: _ImageAssets, - model: str, + model_id: str, *, dtype: str, distributed_executor_backend: Optional[str] = None, ): + model = snapshot_download(model_id, allow_patterns=DOWNLOAD_PATTERN) + img_processor = CLIPImageProcessor.from_pretrained(model) images = [asset.pil_image for asset in image_assets] pixel_values = [ @@ -67,12 +63,15 @@ def run_intern_vit_test( assert cos_similar(vllm_output, hf_output).mean() > 0.99 -@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("model_id", [ + "OpenGVLab/InternViT-300M-448px", + "OpenGVLab/InternViT-6B-448px-V1-5", +]) @pytest.mark.parametrize("dtype", [torch.half]) @torch.inference_mode() -def test_models(dist_init, image_assets, model, dtype: str) -> None: +def test_models(dist_init, image_assets, model_id, dtype: str) -> None: run_intern_vit_test( image_assets, - model, + model_id, dtype=dtype, ) diff --git a/tests/models/decoder_only/vision_language/test_models.py b/tests/models/decoder_only/vision_language/test_models.py index 1ab42f8c126f8..3f6d8ef42cd5f 100644 --- a/tests/models/decoder_only/vision_language/test_models.py +++ b/tests/models/decoder_only/vision_language/test_models.py @@ -130,8 +130,8 @@ max_num_seqs=2, auto_cls=AutoModelForVision2Seq, 
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, - marks=[pytest.mark.core_model, pytest.mark.cpu_model], image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], + marks=[pytest.mark.core_model, pytest.mark.cpu_model], ), #### Extended model tests "blip2": VLMTestInfo( @@ -159,9 +159,9 @@ dtype="bfloat16", marks=[ pytest.mark.skipif( - transformers.__version__.startswith("4.46"), + transformers.__version__ < "4.46.2", reason="Model broken in HF, see huggingface/transformers#34379" - ) + ), ] ), "fuyu": VLMTestInfo( @@ -185,8 +185,8 @@ max_num_seqs=2, dtype="bfloat16", get_stop_token_ids=lambda tok: [151329, 151336, 151338], - marks=[large_gpu_mark(min_gb=48)], patch_hf_runner=model_utils.glm_patch_hf_runner, + marks=[large_gpu_mark(min_gb=48)], ), "h2ovl": VLMTestInfo( models = [ @@ -205,6 +205,22 @@ use_tokenizer_eos=True, patch_hf_runner=model_utils.h2ovl_patch_hf_runner, ), + "idefics3": VLMTestInfo( + models=["HuggingFaceM4/Idefics3-8B-Llama3"], + test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), + prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}\nAssistant:", # noqa: E501 + img_idx_to_prompt=lambda idx: "", + max_model_len=8192, + max_num_seqs=2, + auto_cls=AutoModelForVision2Seq, + marks=[ + pytest.mark.skipif( + transformers.__version__ < "4.46.0", + reason="Model introduced in HF >= 4.46.0" + ), + large_gpu_mark(min_gb=48), + ], + ), "intern_vl": VLMTestInfo( models=[ "OpenGVLab/InternVL2-1B", @@ -263,7 +279,6 @@ runner_mm_key="videos", )], ), - # FIXME "llava_next_video": VLMTestInfo( models=["llava-hf/LLaVA-NeXT-Video-7B-hf"], test_type=VLMTestType.VIDEO, @@ -275,7 +290,7 @@ image_sizes=[((1669, 2560), (2560, 1669), (183, 488), (488, 183))], marks=[ pytest.mark.skipif( - transformers.__version__.startswith("4.46"), + transformers.__version__ < "4.46.2", reason="Model broken with changes in transformers 4.46" ) ], @@ -316,6 +331,7 @@ max_model_len=8192, max_num_seqs=2, auto_cls=AutoModelForVision2Seq, + marks=[large_gpu_mark(min_gb=48)], ), "qwen": VLMTestInfo( models=["Qwen/Qwen-VL"], @@ -327,22 +343,6 @@ vllm_output_post_proc=model_utils.qwen_vllm_to_hf_output, prompt_path_encoder=model_utils.qwen_prompt_path_encoder, ), - "idefics3": VLMTestInfo( - models=["HuggingFaceM4/Idefics3-8B-Llama3"], - test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), - prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}\nAssistant:", # noqa: E501 - img_idx_to_prompt=lambda idx: "", - max_model_len=8192, - max_num_seqs=2, - auto_cls=AutoModelForVision2Seq, - marks=[ - pytest.mark.skipif( - transformers.__version__ < "4.46.0", - reason="Model introduced in HF >= 4.46.0" - ), - large_gpu_mark(min_gb=48), - ], - ), ### Tensor parallel / multi-gpu broadcast tests "broadcast-chameleon": VLMTestInfo( models=["facebook/chameleon-7b"], @@ -362,7 +362,7 @@ reason="Need at least 2 GPUs to run the test.", ), pytest.mark.skipif( - transformers.__version__.startswith("4.46"), + transformers.__version__ < "4.46.2", reason="Model broken in HF, see huggingface/transformers#34379" ) ], diff --git a/vllm/config.py b/vllm/config.py index b902499bf5bdc..f9b230e1bc688 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1,7 +1,8 @@ +import copy import enum import json import warnings -from dataclasses import dataclass, field +from dataclasses import dataclass, field, replace from typing import (TYPE_CHECKING, Any, ClassVar, Dict, Final, List, Literal, Mapping, Optional, Set, Tuple, Type, Union) @@ -2078,6 +2079,12 @@ def 
_get_quantization_config( return quant_config return None + def with_hf_config(self, hf_config: PretrainedConfig) -> "VllmConfig": + model_config = copy.deepcopy(self.model_config) + model_config.hf_config = hf_config + + return replace(self, model_config=model_config) + def __post_init__(self): """Verify configs are valid & consistent with each other. """ diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index cac10f505df67..37f38d4d76671 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -229,7 +229,6 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config self.config = config @@ -246,9 +245,8 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: quant_config=quant_config, gather_output=True, ) - self.language_model = PersimmonForCausalLM(config.text_config, - cache_config=cache_config, - quant_config=quant_config) + self.language_model = PersimmonForCausalLM( + vllm_config.with_hf_config(config.text_config)) self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py index f7bc823574034..51e2c64d5552d 100644 --- a/vllm/model_executor/models/internlm2_ve.py +++ b/vllm/model_executor/models/internlm2_ve.py @@ -164,10 +164,12 @@ def __init__( vllm_config: VllmConfig, prefix: str = "", ) -> None: + super().__init__(vllm_config, prefix=prefix) + config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config - super().__init__(config, cache_config, quant_config) + self.model = InternLM2VEModel(config, cache_config, quant_config, diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 60eeceb18bcf0..ca4fc8ec952bf 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -241,11 +241,11 @@ def init_vllm_registered_model( based on the arguments passed to the outer vLLM model. 
""" model_class, _ = ModelRegistry.resolve_model_cls(hf_config.architectures) - import copy - copied_config = copy.deepcopy(vllm_config) - copied_config.model_config.hf_config = hf_config - return model_class(vllm_config=copied_config, prefix=prefix) + return model_class( + vllm_config=vllm_config.with_hf_config(hf_config), + prefix=prefix, + ) @overload From 9fa4bdde9d091af250d90a233bb54420610037cb Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 9 Nov 2024 16:27:26 -0800 Subject: [PATCH 072/183] [ci][build] limit cmake version (#10188) Signed-off-by: youkaichao --- Dockerfile.neuron | 2 +- Dockerfile.ppc64le | 2 +- docs/source/getting_started/cpu-installation.rst | 2 +- pyproject.toml | 2 +- requirements-build.txt | 2 +- requirements-tpu.txt | 2 +- requirements-xpu.txt | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Dockerfile.neuron b/Dockerfile.neuron index 2143315d2a078..47e40e015239a 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -31,7 +31,7 @@ RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi RUN python3 -m pip install -U \ - 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ + 'cmake>=3.26,<=3.30' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ -r requirements-neuron.txt ENV VLLM_TARGET_DEVICE neuron diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le index b19c6ddec7948..c2a40000aab4b 100644 --- a/Dockerfile.ppc64le +++ b/Dockerfile.ppc64le @@ -21,7 +21,7 @@ RUN --mount=type=bind,source=.git,target=.git \ # These packages will be in rocketce eventually RUN --mount=type=cache,target=/root/.cache/pip \ pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \ - 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ + 'cmake>=3.26,<=3.30' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ torch==2.3.1 \ -r requirements-cpu.txt \ xformers uvloop==0.20.0 diff --git a/docs/source/getting_started/cpu-installation.rst b/docs/source/getting_started/cpu-installation.rst index 69530fd778c55..6bf170b164fb8 100644 --- a/docs/source/getting_started/cpu-installation.rst +++ b/docs/source/getting_started/cpu-installation.rst @@ -62,7 +62,7 @@ Build from source .. 
code-block:: console $ pip install --upgrade pip - $ pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy + $ pip install cmake>=3.26,<=3.30 wheel packaging ninja "setuptools-scm>=8" numpy $ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu - Finally, build and install vLLM CPU backend: diff --git a/pyproject.toml b/pyproject.toml index 3c8c46cc8621e..3be401daa44c7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [build-system] # Should be mirrored in requirements-build.txt requires = [ - "cmake>=3.26", + "cmake>=3.26,<=3.30", "ninja", "packaging", "setuptools>=61", diff --git a/requirements-build.txt b/requirements-build.txt index fec01caaf25ef..64b92861df25d 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -1,5 +1,5 @@ # Should be mirrored in pyproject.toml -cmake>=3.26 +cmake>=3.26,<=3.30 ninja packaging setuptools>=61 diff --git a/requirements-tpu.txt b/requirements-tpu.txt index f9a0770804e55..94a3225dcf479 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -2,7 +2,7 @@ -r requirements-common.txt # Dependencies for TPU -cmake>=3.26 +cmake>=3.26,<=3.30 ninja packaging setuptools-scm>=8 diff --git a/requirements-xpu.txt b/requirements-xpu.txt index e41295792283f..479cb4bb18484 100644 --- a/requirements-xpu.txt +++ b/requirements-xpu.txt @@ -2,7 +2,7 @@ -r requirements-common.txt ray >= 2.9 -cmake>=3.26 +cmake>=3.26,<=3.30 ninja packaging setuptools-scm>=8 From 19682023b62c7ed00cee52a805dfa279dfc9c7a2 Mon Sep 17 00:00:00 2001 From: FuryMartin Date: Sun, 10 Nov 2024 15:47:24 +0800 Subject: [PATCH 073/183] [Doc] Fix typo error in CONTRIBUTING.md (#10190) Signed-off-by: FuryMartin --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8beae68289997..6d46a6dca371d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,3 +1,3 @@ # Contributing to vLLM -You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview/). +You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html). From bfb7d61a7c16e642ff3b84a62d6a308da6548a29 Mon Sep 17 00:00:00 2001 From: Yongzao <532741407@qq.com> Date: Mon, 11 Nov 2024 02:22:04 +0800 Subject: [PATCH 074/183] [doc] Polish the integration with huggingface doc (#10195) Signed-off-by: youkaichao Co-authored-by: youkaichao --- .../source/design/huggingface_integration.rst | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/docs/source/design/huggingface_integration.rst b/docs/source/design/huggingface_integration.rst index 20394bd311ea9..716273afd695c 100644 --- a/docs/source/design/huggingface_integration.rst +++ b/docs/source/design/huggingface_integration.rst @@ -5,27 +5,25 @@ This document describes how vLLM integrates with HuggingFace libraries. We will Let's say we want to serve the popular QWen model by running ``vllm serve Qwen/Qwen2-7B``. -1. The ``model`` argument is ``Qwen/Qwen2-7B``. vLLM will first try to locate the config file ``config.json`` using this argument. See the `code snippet `__ for the implementation. +1. The ``model`` argument is ``Qwen/Qwen2-7B``. vLLM determines whether this model exists by checking for the corresponding config file ``config.json``. See this `code snippet `__ for the implementation. 
Within this process: - - If the ``model`` argument is a local path, vLLM will directly read the config file from the path. + - If the ``model`` argument corresponds to an existing local path, vLLM will load the config file directly from this path. + + - If the ``model`` argument is a HuggingFace model ID consisting of a username and model name, vLLM will first try to use the config file from the HuggingFace local cache, using the ``model`` argument as the model name and the ``--revision`` argument as the revision. See `their website `__ for more information on how the HuggingFace cache works. - - Otherwise, vLLM will try to read the config from the HuggingFace cache. See `their website `__ for more information on how the HuggingFace cache works. Here, we can also use the argument ``--revision`` to specify the revision of the model in the cache. + - If the ``model`` argument is a HuggingFace model ID but it is not found in the cache, vLLM will download the config file from the HuggingFace model hub. Refer to `this function `__ for the implementation. The input arguments include the ``model`` argument as the model name, the ``--revision`` argument as the revision, and the environment variable ``HF_TOKEN`` as the token to access the model hub. In our case, vLLM will download the `config.json `__ file. - - If neither of the above works, vLLM will download the config file from the HuggingFace model hub, using the ``model`` argument as the model name, the ``--revision`` argument as the revision, and the environment variable ``HF_TOKEN`` as the token to access the model hub. In our case, vLLM will download the `config.json `__ file. +2. After confirming the existence of the model, vLLM loads its config file and converts it into a dictionary. See this `code snippet `__ for the implementation. -2. After obtaining the config file, vLLM will load the config into a dictionary. It first `inspects `__ the ``model_type`` field in the config to determine the model type and config class to use. There are some ``model_type`` values that vLLM directly supports; see `here `__ for the list. If the ``model_type`` is not in the list, vLLM will use `AutoConfig.from_pretrained `__ to load the config class, with ``model``, ``--revision``, and ``--trust_remote_code`` as the arguments. +3. Next, vLLM `inspects `__ the ``model_type`` field in the config dictionary to `generate `__ the config object to use. There are some ``model_type`` values that vLLM directly supports; see `here `__ for the list. If the ``model_type`` is not in the list, vLLM will use `AutoConfig.from_pretrained `__ to load the config class, with ``model``, ``--revision``, and ``--trust_remote_code`` as the arguments. Please note that: - HuggingFace also has its own logic to determine the config class to use. It will again use the ``model_type`` field to search for the class name in the transformers library; see `here `__ for the list of supported models. If the ``model_type`` is not found, HuggingFace will use the ``auto_map`` field from the config JSON file to determine the class name. Specifically, it is the ``AutoConfig`` field under ``auto_map``. See `DeepSeek `__ for an example. - The ``AutoConfig`` field under ``auto_map`` points to a module path in the model's repository. To create the config class, HuggingFace will import the module and use the ``from_pretrained`` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when ``--trust_remote_code`` is enabled. -3. 
After obtaining the config object, vLLM applies some historical patches to the config object. These are mostly related to RoPE configuration; see `here `__ for the implementation. +4. Subsequently, vLLM applies some historical patches to the config object. These are mostly related to RoPE configuration; see `here `__ for the implementation. -4. The config object is `attached `__ as the ``hf_config`` field to vLLM's ``model_config`` object. - -5. After vLLM obtains the config object, it will use the ``architectures`` field to determine the model class to initialize. For ``Qwen/Qwen2-7B``, the ``architectures`` field is ``["Qwen2ForCausalLM"]``. vLLM maintains the mapping from architecture name to model class in `its registry `__. If the architecture name is not found in the registry, it means this model architecture is not supported by vLLM. - -6. Finally, we reach the model class we want to initialize, i.e., the ``Qwen2ForCausalLM`` class in `vLLM's code `__. This class will initialize itself depending on various configs. +5. Finally, vLLM can reach the model class we want to initialize. vLLM uses the ``architectures`` field in the config object to determine the model class to initialize, as it maintains the mapping from architecture name to model class in `its registry `__. If the architecture name is not found in the registry, it means this model architecture is not supported by vLLM. For ``Qwen/Qwen2-7B``, the ``architectures`` field is ``["Qwen2ForCausalLM"]``, which corresponds to the ``Qwen2ForCausalLM`` class in `vLLM's code `__. This class will initialize itself depending on various configs. Beyond that, there are two more things vLLM depends on HuggingFace for. @@ -33,7 +31,7 @@ Beyond that, there are two more things vLLM depends on HuggingFace for. 2. **Model weight**: vLLM downloads the model weight from the HuggingFace model hub using the ``model`` argument as the model name and the ``--revision`` argument as the revision. vLLM provides the argument ``--load-format`` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass ``--load-format dummy`` to skip downloading the weights. - - It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the `documentation `__ for more information on the safetensors format. + - It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the `documentation `__ for more information on the safetensors format. This part of the logic can be found `here `__. Please note that: This completes the integration between vLLM and HuggingFace. 
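As a rough illustration of the config-resolution flow described in the documentation patch above, the following is a minimal sketch that uses only the public ``transformers`` API. The helper name ``resolve_hf_config`` and its exact argument handling are illustrative assumptions, not vLLM's actual implementation (which lives in ``vllm/transformers_utils/config.py``); it only mirrors the described behavior of loading ``config.json`` via ``AutoConfig.from_pretrained`` with the model name, revision, and ``trust_remote_code`` flag.

    import os
    from typing import Optional

    from transformers import AutoConfig, PretrainedConfig


    def resolve_hf_config(model: str,
                          revision: Optional[str] = None,
                          trust_remote_code: bool = False) -> PretrainedConfig:
        # Local paths are read directly; HuggingFace model IDs go through the
        # local HF cache first and fall back to downloading config.json from
        # the hub. HF_TOKEN is passed explicitly here for illustration; the
        # hub client also reads it from the environment on its own.
        return AutoConfig.from_pretrained(
            model,
            revision=revision,
            trust_remote_code=trust_remote_code,
            token=os.environ.get("HF_TOKEN"),
        )


    # Example: the ``architectures`` field is what maps a checkpoint to a
    # model class in vLLM's registry, e.g. ["Qwen2ForCausalLM"] for Qwen2-7B.
    config = resolve_hf_config("Qwen/Qwen2-7B")
    print(config.architectures)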
From 20cf2f553c223792ad3f65236b267586fa9bed6c Mon Sep 17 00:00:00 2001 From: Shawn Du Date: Mon, 11 Nov 2024 07:21:06 +0800 Subject: [PATCH 075/183] [Misc] small fixes to function tracing file path (#9543) Signed-off-by: Shawn Du Signed-off-by: youkaichao Co-authored-by: youkaichao --- docs/source/index.rst | 4 ++-- vllm/logger.py | 5 +++-- vllm/utils.py | 3 +++ 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 8457d4476a1c4..00d455ed9ad44 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -161,11 +161,11 @@ Documentation design/multimodal/multimodal_index design/huggingface_integration -.. Contributing: contributing to the vLLM project +.. For Developers: contributing to the vLLM project .. toctree:: :maxdepth: 2 - :caption: Contributing + :caption: For Developers contributing/overview contributing/profiling/profiling_index diff --git a/vllm/logger.py b/vllm/logger.py index 80b9fcc59272d..9e16e591315ba 100644 --- a/vllm/logger.py +++ b/vllm/logger.py @@ -117,13 +117,14 @@ def _trace_calls(log_path, root_dir, frame, event, arg=None): last_lineno = 0 last_func_name = "" with open(log_path, 'a') as f: + ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f") if event == 'call': - f.write(f"{datetime.datetime.now()} Call to" + f.write(f"{ts} Call to" f" {func_name} in {filename}:{lineno}" f" from {last_func_name} in {last_filename}:" f"{last_lineno}\n") else: - f.write(f"{datetime.datetime.now()} Return from" + f.write(f"{ts} Return from" f" {func_name} in {filename}:{lineno}" f" to {last_func_name} in {last_filename}:" f"{last_lineno}\n") diff --git a/vllm/utils.py b/vllm/utils.py index 13d7f6d475346..1b02cbff79f78 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -4,6 +4,7 @@ import datetime import enum import gc +import getpass import inspect import ipaddress import os @@ -967,6 +968,8 @@ def enable_trace_function_call_for_thread() -> None: if envs.VLLM_TRACE_FUNCTION: tmp_dir = tempfile.gettempdir() + # add username to tmp_dir to avoid permission issues + tmp_dir = os.path.join(tmp_dir, getpass.getuser()) filename = (f"VLLM_TRACE_FUNCTION_for_process_{os.getpid()}" f"_thread_{threading.get_ident()}_" f"at_{datetime.datetime.now()}.log").replace(" ", "_") From 73b9083e99c02c6ba91f6be9479b88e7e9a94cdf Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sun, 10 Nov 2024 16:10:53 -0800 Subject: [PATCH 076/183] [misc] improve cloudpickle registration and tests (#10202) Signed-off-by: youkaichao --- tests/distributed/test_pipeline_parallel.py | 26 ++++++++--- vllm/engine/arg_utils.py | 4 -- vllm/transformers_utils/config.py | 51 ++++++++++++--------- 3 files changed, 50 insertions(+), 31 deletions(-) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 1489a60891761..5d566f8308b70 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -32,6 +32,8 @@ class PPTestOptions(NamedTuple): multi_node_only: bool trust_remote_code: bool tokenizer_mode: Optional[str] + load_format: Optional[str] = None + hf_overrides: Optional[str] = None @dataclass @@ -50,6 +52,8 @@ def detailed( task: TaskOption = "auto", trust_remote_code: bool = False, tokenizer_mode: Optional[str] = None, + load_format: Optional[str] = None, + hf_overrides: Optional[str] = None, ): return PPTestSettings( parallel_setups=[ @@ -78,7 +82,9 @@ def detailed( task=task, test_options=PPTestOptions(multi_node_only=multi_node_only, 
trust_remote_code=trust_remote_code, - tokenizer_mode=tokenizer_mode), + tokenizer_mode=tokenizer_mode, + load_format=load_format, + hf_overrides=hf_overrides), ) @staticmethod @@ -90,6 +96,8 @@ def fast( multi_node_only: bool = False, trust_remote_code: bool = False, tokenizer_mode: Optional[str] = None, + load_format: Optional[str] = None, + hf_overrides: Optional[str] = None, ): return PPTestSettings( parallel_setups=[ @@ -102,7 +110,9 @@ def fast( task=task, test_options=PPTestOptions(multi_node_only=multi_node_only, trust_remote_code=trust_remote_code, - tokenizer_mode=tokenizer_mode), + tokenizer_mode=tokenizer_mode, + load_format=load_format, + hf_overrides=hf_overrides), ) def iter_params(self, model_name: str): @@ -161,9 +171,8 @@ def iter_params(self, model_name: str): "facebook/opt-iml-max-1.3b": PPTestSettings.fast(), "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True), "microsoft/phi-2": PPTestSettings.fast(), - "microsoft/Phi-3-mini-4k-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True), # noqa: E501 + "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True, load_format="dummy", hf_overrides='{"num_hidden_layers": 4, "hidden_size": 512, "intermediate_size": 800, "num_attention_heads": 4, "num_key_value_heads": 1}'), # noqa: E501 "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 - "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 "adept/persimmon-8b-chat": PPTestSettings.fast(), "Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True), "Qwen/Qwen2-7B-Instruct": PPTestSettings.fast(), @@ -214,9 +223,9 @@ def iter_params(self, model_name: str): # NOTE: You can update this on your local machine to run specific tests TEST_MODELS = [ # [LANGUAGE GENERATION] + "microsoft/Phi-3.5-MoE-instruct", "meta-llama/Meta-Llama-3-8B", "ibm/PowerLM-3b", - "microsoft/Phi-3-mini-4k-instruct", # [LANGUAGE EMBEDDING] "intfloat/e5-mistral-7b-instruct", "BAAI/bge-multilingual-gemma2", @@ -238,7 +247,8 @@ def _compare_tp( method: Literal["generate", "encode"], ): tp_size, pp_size, eager_mode, chunked_prefill = parallel_setup - multi_node_only, trust_remote_code, tokenizer_mode = test_options + multi_node_only, trust_remote_code, tokenizer_mode, \ + load_format, hf_overrides = test_options if num_gpus_available < tp_size * pp_size: pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs") @@ -267,6 +277,10 @@ def _compare_tp( common_args.append("--trust-remote-code") if tokenizer_mode: common_args.extend(["--tokenizer-mode", tokenizer_mode]) + if load_format: + common_args.extend(["--load-format", load_format]) + if hf_overrides: + common_args.extend(["--hf-overrides", hf_overrides]) if (distributed_backend == "ray" and tp_size == 2 and pp_size == 2 and chunked_prefill): diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 95d55e86e08e8..02e67f89e5a8d 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -19,8 +19,6 @@ from vllm.model_executor.layers.pooler import PoolingType from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.platforms import current_platform -from vllm.transformers_utils.config import ( - maybe_register_config_serialize_by_value) from vllm.transformers_utils.utils import check_gguf_file from vllm.utils import FlexibleArgumentParser, StoreBoolean @@ -1013,8 +1011,6 @@ def create_engine_config(self) -> VllmConfig: "supported for 
multimodal models and has been disabled.") self.enable_prefix_caching = False - maybe_register_config_serialize_by_value(self.trust_remote_code) - cache_config = CacheConfig( # neuron needs block_size = max_model_len block_size=self.block_size if self.device != "neuron" else diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 14d9518364d26..054845584c2ef 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -234,6 +234,9 @@ def get_config( patch_rope_scaling(config) + if trust_remote_code: + maybe_register_config_serialize_by_value() + return config @@ -389,33 +392,39 @@ def get_sentence_transformer_tokenizer_config(model: str, return None -def maybe_register_config_serialize_by_value(trust_remote_code: bool) -> None: +def maybe_register_config_serialize_by_value() -> None: """Try to register HF model configuration class to serialize by value - With trust_remote_code, the config class is typically an instance of a - custom class imported from the HF modules cache. The class will not be - importable in spawned workers by default (and won't exist at all on - other nodes), which breaks serialization of the config. + If trust_remote_code is set, and the model's config file specifies an + `AutoConfig` class, then the config class is typically an instance of + a custom class imported from the HF modules cache. + + Examples: + + >>> from transformers import AutoConfig + >>> klass = AutoConfig.from_pretrained('meta-llama/Meta-Llama-3-8B', trust_remote_code=True) + >>> klass.__class__ # transformers.models.llama.configuration_llama.LlamaConfig + >>> import transformers_modules # error, not initialized + >>> klass = AutoConfig.from_pretrained('deepseek-ai/DeepSeek-V2.5', trust_remote_code=True) + >>> import transformers_modules # success, initialized + >>> klass.__class__ # transformers_modules.deepseek-ai.DeepSeek-V2.5.98b11844770b2c3ffc18b175c758a803640f4e77.configuration_deepseek.DeepseekV2Config + + In the DeepSeek example, the config class is an instance of a custom + class that is not serializable by default. This class will not be + importable in spawned workers, and won't exist at all on + other nodes, which breaks serialization of the config. In this function we tell the cloudpickle serialization library to pass instances of these generated classes by value instead of by reference, i.e. the class definition is serialized along with its data so that the - class module does not need to be importable on the receiving end. This - registration only works if the modules cache has already been - initialized. - + class module does not need to be importable on the receiving end. See: https://github.com/cloudpipe/cloudpickle?tab=readme-ov-file#overriding-pickles-serialization-mechanism-for-importable-constructs - """ - if not trust_remote_code: - return - + """ # noqa try: import transformers_modules except ImportError: - logger.debug("Could not import transformers_modules used for remote" - " code. If remote code is not needed remove" - " `--trust-remote-code`.") + # the config does not need trust_remote_code return try: @@ -428,19 +437,19 @@ class module does not need to be importable on the receiving end. 
This ray.cloudpickle.register_pickle_by_value(transformers_modules) # multiprocessing uses pickle to serialize arguments when using spawn - # Here we get pickle to use cloudpickle to serialize ModelConfig objects + # Here we get pickle to use cloudpickle to serialize config objects # that contain instances of the custom config class to avoid # serialization problems if the generated module (and model) has a `.` # in its name import multiprocessing import pickle - from vllm.config import ModelConfig + from vllm.config import VllmConfig - def _reduce_modelconfig(mc: ModelConfig): - return (pickle.loads, (cloudpickle.dumps(mc), )) + def _reduce_config(config: VllmConfig): + return (pickle.loads, (cloudpickle.dumps(config), )) - multiprocessing.reducer.register(ModelConfig, _reduce_modelconfig) + multiprocessing.reducer.register(VllmConfig, _reduce_config) except Exception as e: logger.warning( From ad9a78bf640cca930de76a066a2f34139b9acb65 Mon Sep 17 00:00:00 2001 From: yansh97 Date: Mon, 11 Nov 2024 08:14:22 +0800 Subject: [PATCH 077/183] [Doc] Fix typo error in vllm/entrypoints/openai/cli_args.py (#10196) --- vllm/entrypoints/openai/cli_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index a089985ac9758..74ea41344bece 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -190,7 +190,7 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: default=False, help= "Enable auto tool choice for supported models. Use --tool-call-parser" - "to specify which parser to use") + " to specify which parser to use") valid_tool_parsers = ToolParserManager.tool_parsers.keys() parser.add_argument( From f0f2e5638ef4858b00b137bea1c3f8312e48efa6 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sun, 10 Nov 2024 17:49:40 -0800 Subject: [PATCH 078/183] [doc] improve debugging code (#10206) Signed-off-by: youkaichao --- docs/source/getting_started/debugging.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index 91978065faf42..d40222bfd4da8 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -75,6 +75,9 @@ If GPU/CPU communication cannot be established, you can use the following Python print("PyTorch GLOO is successful!") + if world_size <= 1: + exit() + # Test vLLM NCCL, with cuda graph from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator From f89d18ff74e48f97c76afbab31956218d2486e36 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sun, 10 Nov 2024 22:41:46 -0800 Subject: [PATCH 079/183] [6/N] pass whole config to inner model (#10205) Signed-off-by: youkaichao --- vllm/model_executor/models/arctic.py | 23 ++++---- vllm/model_executor/models/baichuan.py | 48 ++++++++-------- vllm/model_executor/models/bart.py | 23 ++++---- vllm/model_executor/models/bert.py | 25 ++++----- vllm/model_executor/models/blip2.py | 10 +--- vllm/model_executor/models/bloom.py | 21 +++---- vllm/model_executor/models/chameleon.py | 27 ++++----- vllm/model_executor/models/chatglm.py | 19 ++++--- vllm/model_executor/models/commandr.py | 33 +++++------ vllm/model_executor/models/dbrx.py | 21 +++---- vllm/model_executor/models/decilm.py | 6 +- vllm/model_executor/models/deepseek.py | 26 ++++----- vllm/model_executor/models/deepseek_v2.py | 29 ++++------ vllm/model_executor/models/eagle.py | 5 +- 
vllm/model_executor/models/exaone.py | 34 +++++------ vllm/model_executor/models/falcon.py | 21 +++---- vllm/model_executor/models/florence2.py | 38 ++++++------- vllm/model_executor/models/gemma.py | 24 +++----- vllm/model_executor/models/gemma2.py | 28 +++------- vllm/model_executor/models/gpt2.py | 24 ++++---- vllm/model_executor/models/gpt_bigcode.py | 22 ++++---- vllm/model_executor/models/gpt_j.py | 21 +++---- vllm/model_executor/models/gpt_neox.py | 20 +++---- vllm/model_executor/models/granite.py | 34 +++++------ vllm/model_executor/models/granitemoe.py | 33 ++++------- vllm/model_executor/models/idefics3.py | 29 ++++------ vllm/model_executor/models/internlm2.py | 24 +++----- vllm/model_executor/models/internlm2_ve.py | 23 +++----- vllm/model_executor/models/internvl.py | 6 +- vllm/model_executor/models/jais.py | 21 +++---- vllm/model_executor/models/jamba.py | 30 ++++------ vllm/model_executor/models/llama.py | 44 ++++----------- vllm/model_executor/models/llava.py | 6 +- vllm/model_executor/models/llava_next.py | 6 +- .../model_executor/models/llava_next_video.py | 6 +- vllm/model_executor/models/llava_onevision.py | 6 +- vllm/model_executor/models/mamba.py | 31 +++++----- vllm/model_executor/models/minicpm.py | 38 ++++++------- vllm/model_executor/models/minicpm3.py | 12 ++-- vllm/model_executor/models/minicpmv.py | 56 +++++++------------ vllm/model_executor/models/mixtral.py | 34 +++++------ vllm/model_executor/models/mixtral_quant.py | 26 ++++----- vllm/model_executor/models/mllama.py | 49 +++++++--------- vllm/model_executor/models/molmo.py | 26 ++++----- vllm/model_executor/models/mpt.py | 26 ++++----- vllm/model_executor/models/nemotron.py | 34 +++++------ vllm/model_executor/models/olmo.py | 24 ++++---- vllm/model_executor/models/olmoe.py | 26 ++++----- vllm/model_executor/models/opt.py | 24 +++----- vllm/model_executor/models/orion.py | 26 ++++----- vllm/model_executor/models/paligemma.py | 13 ++--- vllm/model_executor/models/persimmon.py | 27 ++++----- vllm/model_executor/models/phi.py | 24 ++++---- vllm/model_executor/models/phi3_small.py | 26 ++++----- vllm/model_executor/models/phi3v.py | 14 ++--- vllm/model_executor/models/phimoe.py | 34 +++++------ vllm/model_executor/models/pixtral.py | 10 +--- vllm/model_executor/models/qwen.py | 27 ++++----- vllm/model_executor/models/qwen2.py | 23 +++----- vllm/model_executor/models/qwen2_audio.py | 12 ++-- vllm/model_executor/models/qwen2_cls.py | 11 ++-- vllm/model_executor/models/qwen2_moe.py | 26 ++++----- vllm/model_executor/models/qwen2_rm.py | 11 ++-- vllm/model_executor/models/qwen2_vl.py | 19 +++---- vllm/model_executor/models/solar.py | 34 +++++------ vllm/model_executor/models/stablelm.py | 24 ++++---- vllm/model_executor/models/starcoder2.py | 26 ++++----- vllm/model_executor/models/ultravox.py | 16 +++--- vllm/model_executor/models/xverse.py | 19 ++----- 69 files changed, 681 insertions(+), 963 deletions(-) diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index 997554f7dcccd..7d4b9654b54ab 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -34,7 +34,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) logger = init_logger(__name__) @@ -364,14 +365,13 @@ def forward( @support_torch_compile class ArcticModel(nn.Module): - def __init__( - self, - config: ArcticConfig, - 
cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size self.embed_tokens = VocabParallelEmbedding( @@ -418,13 +418,10 @@ class ArcticForCausalLM(nn.Module, SupportsPP): def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config - self.model = ArcticModel(config, - cache_config, - quant_config, - prefix=prefix) + self.model = ArcticModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.vocab_size = config.vocab_size self.lm_head = ParallelLMHead( self.vocab_size, diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 8e1dab71b1f39..aabbd31192a40 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -253,13 +253,18 @@ def forward( @support_torch_compile class BaiChuanModel(nn.Module): - def __init__(self, - config: PretrainedConfig, - position_embedding: str, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + def __init__( + self, + vllm_config: VllmConfig, + prefix: str = "", + position_embedding: str = "ROPE", + ) -> None: super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -332,21 +337,22 @@ class BaiChuanBaseForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def __init__( self, + *, vllm_config: VllmConfig, prefix: str = "", position_embedding: str = "ROPE", ): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config self.quant_config = quant_config - self.model = BaiChuanModel(config, position_embedding, cache_config, - quant_config) + self.model = BaiChuanModel(vllm_config=vllm_config, + prefix=prefix, + position_embedding=position_embedding) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) @@ -438,16 +444,16 @@ class BaichuanForCausalLM(BaiChuanBaseForCausalLM): NOTE: the class name has a lower case 'c'. """ - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config if config.hidden_size == 4096: # baichuan2 7b - super().__init__(vllm_config, prefix, "ROPE") + super().__init__(vllm_config=vllm_config, + prefix=prefix, + position_embedding="ROPE") else: # baichuan 13b, baichuan2 13b - super().__init__(vllm_config, prefix, "ALIBI") + super().__init__(vllm_config=vllm_config, + prefix=prefix, + position_embedding="ALIBI") class BaiChuanForCausalLM(BaiChuanBaseForCausalLM): @@ -455,9 +461,7 @@ class BaiChuanForCausalLM(BaiChuanBaseForCausalLM): NOTE: the class name has an upper case 'C'. 
""" - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ): - super().__init__(vllm_config, prefix, "ROPE") + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, + prefix=prefix, + position_embedding="ROPE") diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py index c6da6a590cf5a..a50a5a5b018e1 100644 --- a/vllm/model_executor/models/bart.py +++ b/vllm/model_executor/models/bart.py @@ -41,6 +41,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors +from .utils import maybe_prefix + logger = logging.get_logger(__name__) @@ -739,13 +741,14 @@ class BartModel(nn.Module): "encoder.embed_tokens.weight", "decoder.embed_tokens.weight" ] - def __init__(self, - config: BartConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.config = config self.padding_idx = config.pad_token_id @@ -810,20 +813,16 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, class BartForConditionalGeneration(nn.Module): base_model_prefix = "model" - def __init__(self, vllm_config: VllmConfig, prefix: str = ""): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config # currently all existing BART models have `tie_word_embeddings` enabled assert config.tie_word_embeddings self.config = config - self.model = BartModel(config, - cache_config, - quant_config, - lora_config=lora_config) + self.model = BartModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.unpadded_vocab_size = config.vocab_size if lora_config: diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 2b0f45c5603f5..614d2db8ccff6 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -21,6 +21,8 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors, PoolerOutput +from .utils import maybe_prefix + class BertEmbedding(nn.Module): @@ -309,12 +311,13 @@ def forward(self, hidden_states: torch.Tensor, class BertModel(nn.Module): - def __init__(self, - config: BertConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.embeddings = BertEmbedding(config) self.encoder = BertEncoder(config, cache_config, @@ -382,17 +385,11 @@ class BertEmbeddingModel(nn.Module): _pooler: An instance of Pooler used for pooling operations. 
""" - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config pooler_config = vllm_config.model_config.pooler_config - self.model = BertModel(config, cache_config, quant_config) + self.model = BertModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self._pooler = Pooler.from_config_with_defaults( pooler_config, pooling_type=PoolingType.CLS, diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index cdc30eda2ab3c..03dc1d15ab697 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -23,7 +23,7 @@ get_max_blip_image_tokens) from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, init_vllm_registered_model, - merge_multimodal_embeddings) + maybe_prefix, merge_multimodal_embeddings) # We use this internally as placeholders since there is no image token # defined on the HuggingFace repo @@ -483,11 +483,7 @@ def input_processor_for_blip2(ctx: InputContext, inputs: DecoderOnlyInputs): @INPUT_REGISTRY.register_input_processor(input_processor_for_blip2) class Blip2ForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config @@ -517,7 +513,7 @@ def __init__( self.language_model = init_vllm_registered_model( config.text_config, vllm_config=vllm_config, - prefix="language_model") + prefix=maybe_prefix(prefix, "language_model")) self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 7540bc23efd88..2c14519fb9e0e 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -42,7 +42,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor: @@ -221,14 +222,13 @@ def forward( @support_torch_compile class BloomModel(nn.Module): - def __init__( - self, - config: BloomConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.embed_dim = config.hidden_size # Embedding + LN Embedding @@ -288,11 +288,12 @@ def __init__( ): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.transformer = BloomModel(config, cache_config, quant_config) + self.transformer = BloomModel(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "transformer")) if self.config.tie_word_embeddings: self.lm_head = self.transformer.word_embeddings else: diff --git a/vllm/model_executor/models/chameleon.py 
b/vllm/model_executor/models/chameleon.py index f79bad6190708..7b59c818e0b60 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -37,7 +37,8 @@ from .interfaces import SupportsMultiModal, SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) # These configs are not part of the model config but the preprocessor # and processor files, so we hardcode them in the model file for now. @@ -831,14 +832,13 @@ def convert_img2bpe(self, img_batch: torch.Tensor) -> torch.Tensor: class ChameleonModel(nn.Module): - def __init__( - self, - config: ChameleonConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -924,19 +924,14 @@ def forward( class ChameleonForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config - self.model = ChameleonModel(config, cache_config, quant_config) + self.model = ChameleonModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.unpadded_vocab_size = config.vocab_size self.lm_head = ParallelLMHead( self.unpadded_vocab_size, diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index c14f2fcb15063..08ed84aa9c71a 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -39,7 +39,8 @@ from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) logger = init_logger(__name__) @@ -481,14 +482,13 @@ def forward( class ChatGLMModel(nn.Module): - def __init__( - self, - config: ChatGLMConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.embedding = VocabParallelEmbedding(config.padded_vocab_size, @@ -600,7 +600,6 @@ def __init__( ): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config multimodal_config = vllm_config.model_config.multimodal_config @@ -611,7 +610,9 @@ def __init__( self.quant_config = quant_config self.max_position_embeddings = getattr(config, "max_sequence_length", 8192) - self.transformer = ChatGLMModel(config, cache_config, quant_config) + self.transformer = 
ChatGLMModel(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "transformer")) if self.config.tie_word_embeddings: self.transformer.output_layer.weight = ( self.transformer.embedding.weight) diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index e921fa50b099e..cd5c1d6844716 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -28,7 +28,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig, VllmConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, @@ -49,7 +49,8 @@ from .interfaces import SupportsLoRA, SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) @torch.compile @@ -253,15 +254,14 @@ def forward( @support_torch_compile class CohereModel(nn.Module): - def __init__( - self, - config: CohereConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.config = config lora_vocab = (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) if lora_config else 0 @@ -332,14 +332,9 @@ class CohereForCausalLM(nn.Module, SupportsLoRA, SupportsPP): embedding_modules = {"embed_tokens": "input_embeddings"} embedding_padding_modules = [] - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config self.config = config @@ -353,10 +348,8 @@ def __init__( self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, config.vocab_size, scale=config.logit_scale) - self.model = CohereModel(config, - cache_config, - quant_config, - lora_config=lora_config) + self.model = CohereModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.sampler = get_sampler() self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index e3b3164cacde3..d5f9b903183d4 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -25,7 +25,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class DbrxRouter(nn.Module): @@ -294,14 +295,13 @@ def forward( class DbrxModel(nn.Module): - def __init__( - self, - config: DbrxConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, 
prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.wte = VocabParallelEmbedding( config.vocab_size, config.d_model, @@ -357,7 +357,6 @@ def __init__( ): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config if config.tie_word_embeddings: @@ -365,7 +364,9 @@ def __init__( "tie_word_embeddings is not supported for Dbrx models.") self.quant_config = quant_config self.unpadded_vocab_size = config.vocab_size - self.transformer = DbrxModel(config, cache_config, quant_config) + self.transformer = DbrxModel(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "transformer")) self.lm_head = ParallelLMHead( config.vocab_size, config.d_model, diff --git a/vllm/model_executor/models/decilm.py b/vllm/model_executor/models/decilm.py index 3e7005efb39ca..b38fd9fa49c21 100644 --- a/vllm/model_executor/models/decilm.py +++ b/vllm/model_executor/models/decilm.py @@ -51,11 +51,7 @@ class DeciLMForCausalLM(LlamaForCausalLM): instead. """ - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config config.num_key_value_heads = max(config.num_key_value_heads_per_layer) delattr(config, "num_key_value_heads_per_layer") diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index c90d3d250e4c5..a9bf1440c4d60 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -50,7 +50,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class DeepseekMLP(nn.Module): @@ -326,14 +327,13 @@ class DeepseekModel(nn.Module): fall_back_to_pt_during_load = False - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -383,18 +383,14 @@ def forward( class DeepseekForCausalLM(nn.Module, SupportsPP): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.model = DeepseekModel(config, cache_config, quant_config) + self.model = DeepseekModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 0f391d8329a8e..4fb1eed15a2e7 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -51,7 +51,8 @@ from .interfaces import SupportsPP from .utils import 
(PPMissingLayer, is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class DeepseekV2MLP(nn.Module): @@ -408,14 +409,13 @@ class DeepseekV2Model(nn.Module): fall_back_to_pt_during_load = False - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -479,21 +479,14 @@ def forward( class DeepseekV2ForCausalLM(nn.Module, SupportsPP): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.model = DeepseekV2Model(config, - cache_config, - quant_config, - prefix="model") + self.model = DeepseekV2Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py index 6bd73d20d340d..c902829994c7c 100644 --- a/vllm/model_executor/models/eagle.py +++ b/vllm/model_executor/models/eagle.py @@ -14,6 +14,8 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors +from .utils import maybe_prefix + class EAGLE(nn.Module): """This class implements the EAGLE draft model from the paper: https://arxiv.org/pdf/2401.15077 @@ -42,7 +44,8 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: architectures = getattr(self.config.model, "architectures", []) model_cls, _ = ModelRegistry.resolve_model_cls(architectures) - self.model = model_cls(vllm_config, prefix) + self.model = model_cls(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.fc = nn.Linear(config.model.hidden_size * 2, config.model.hidden_size, bias=getattr(self.config, "eagle_fc_bias", False)) diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index fa6dbfe35b3ad..cd3e7da657e0e 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -29,7 +29,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig, VllmConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.activation import SiluAndMul @@ -54,7 +54,8 @@ from .interfaces import SupportsLoRA, SupportsPP from .utils import (PPMissingLayer, is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class ExaoneGatedMLP(nn.Module): @@ -314,15 +315,14 @@ def forward( @support_torch_compile class ExaoneModel(nn.Module): - def __init__( - self, - config: ExaoneConfig, - cache_config: 
Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.config = config self.padding_idx = config.pad_token_id lora_vocab = ((lora_config.lora_extra_vocab_size * @@ -438,14 +438,9 @@ class ExaoneForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "c_fc_1": ("gate_up_proj", 1), } - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config @@ -453,11 +448,8 @@ def __init__( self.lora_config = lora_config self.transformer = ExaoneModel( - config, - cache_config, - quant_config, - lora_config=lora_config, - prefix="model", + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model"), ) if get_pp_group().is_last_rank: self.unpadded_vocab_size = config.vocab_size diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 96ae119042277..562ee5517e7f1 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -48,7 +48,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) FalconConfig = Union[HF_FalconConfig, RWConfig] @@ -332,14 +333,13 @@ def forward( @support_torch_compile class FalconModel(nn.Module): - def __init__( - self, - config: FalconConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.embed_dim = config.hidden_size self.num_heads = config.num_attention_heads @@ -408,11 +408,12 @@ def __init__( ): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.transformer = FalconModel(config, cache_config, quant_config) + self.transformer = FalconModel(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "transformer")) # only Falcon-11B doesn't share lm_head weight with word embeddings # and previous Falcon model doesn't have tie_word_embeddings config # so we set tie_word_embeddings to True by default diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py index b0d970d9fb572..971a71180164b 100644 --- a/vllm/model_executor/models/florence2.py +++ b/vllm/model_executor/models/florence2.py @@ -3,13 +3,10 @@ import torch import torch.nn as nn -from transformers import PretrainedConfig from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, VllmConfig +from vllm.config import VllmConfig from vllm.model_executor.layers.logits_processor import LogitsProcessor -from 
vllm.model_executor.layers.quantization.base_config import ( - QuantizationConfig) from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.bart import (BartDecoder, BartEncoder, @@ -23,11 +20,13 @@ class Florence2LanguageModel(nn.Module): - def __init__(self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.padding_idx = config.pad_token_id @@ -93,15 +92,14 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, class Florence2LanguageForConditionalGeneration(nn.Module): - def __init__(self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + self.config = config - self.model = Florence2LanguageModel(config, - cache_config=cache_config, - quant_config=quant_config) + self.model = Florence2LanguageModel(vllm_config=vllm_config, + prefix=prefix) embed_scale = math.sqrt( config.d_model) if config.scale_embedding else 1.0 @@ -189,17 +187,15 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): class Florence2ForConditionalGeneration(nn.Module): - def __init__(self, vllm_config: VllmConfig, prefix: str = ""): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config # TODO(Isotr0py): Add vision backbone self.language_model = Florence2LanguageForConditionalGeneration( - config=config.text_config, - cache_config=cache_config, - quant_config=quant_config) + vllm_config=vllm_config.with_hf_config(config.text_config), + prefix=prefix, + ) @property def sampler(self): diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 4e0cbfb9cbf58..55baba809e58f 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -258,14 +258,13 @@ def forward( @support_torch_compile class GemmaModel(nn.Module): - def __init__( - self, - config: GemmaConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.embed_tokens = VocabParallelEmbedding( @@ -372,14 +371,9 @@ class GemmaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): embedding_modules = {} embedding_padding_modules = [] - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config @@ -389,9 +383,7 @@ def __init__( self.lora_config = lora_config 
self.quant_config = quant_config - self.model = GemmaModel(config, - cache_config, - quant_config, + self.model = GemmaModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = get_sampler() diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 773d3b72ec418..eeb3fd98a7eac 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -43,7 +43,8 @@ from .interfaces import SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) logger = init_logger(__name__) @@ -243,11 +244,7 @@ def forward( @support_torch_compile class Gemma2Model(nn.Module): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config @@ -399,13 +396,8 @@ class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "up_proj": ("gate_up_proj", 1), } - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config del lora_config # Unused. @@ -414,7 +406,8 @@ def __init__( # currently all existing Gemma models have `tie_word_embeddings` enabled assert config.tie_word_embeddings self.quant_config = quant_config - self.model = Gemma2Model(config, cache_config, quant_config) + self.model = Gemma2Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.logits_processor = LogitsProcessor( config.vocab_size, soft_cap=config.final_logit_softcapping) self.sampler = get_sampler() @@ -471,14 +464,11 @@ class Gemma2EmbeddingModel(nn.Module, SupportsPP): _pooler: An instance of Pooler used for pooling operations. 
""" - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - self.model = Gemma2Model(vllm_config, prefix) + self.model = Gemma2Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self._pooler = Pooler.from_config_with_defaults( vllm_config.model_config.pooler_config, pooling_type=PoolingType.LAST, diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index c3fc47db79986..fcff7ec2e01eb 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -42,7 +42,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class GPT2Attention(nn.Module): @@ -184,14 +185,13 @@ def forward( @support_torch_compile class GPT2Model(nn.Module): - def __init__( - self, - config: GPT2Config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config assert not config.add_cross_attention assert not config.scale_attn_by_inverse_layer_idx @@ -247,14 +247,12 @@ def __init__( ): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.transformer = GPT2Model(config, - cache_config, - quant_config, - prefix="transformer") + self.transformer = GPT2Model(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "transformer")) if self.config.tie_word_embeddings: self.lm_head = self.transformer.wte else: diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index ea1614d966365..ae1495ebd7914 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -25,7 +25,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig, VllmConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -189,15 +189,14 @@ def forward( @support_torch_compile class GPTBigCodeModel(nn.Module): - def __init__( - self, - config: GPTBigCodeConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.config = config assert not config.add_cross_attention @@ -265,7 +264,6 @@ def __init__( ): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config @@ 
-273,8 +271,8 @@ def __init__( self.lora_config = lora_config self.quant_config = quant_config - self.transformer = GPTBigCodeModel(config, cache_config, quant_config, - lora_config) + self.transformer = GPTBigCodeModel(vllm_config=vllm_config, + prefix=prefix) if self.config.tie_word_embeddings: self.lm_head = self.transformer.wte else: diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 58cff67c69051..610795b084b44 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -42,7 +42,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class GPTJAttention(nn.Module): @@ -177,14 +178,13 @@ def forward( @support_torch_compile class GPTJModel(nn.Module): - def __init__( - self, - config: GPTJConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.embed_dim = config.n_embd self.wte = VocabParallelEmbedding( @@ -236,12 +236,13 @@ def __init__( ): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config assert not config.tie_word_embeddings - self.transformer = GPTJModel(config, cache_config, quant_config) + self.transformer = GPTJModel(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "transformer")) self.lm_head = ParallelLMHead( config.vocab_size, config.n_embd, diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 27b2577a8cdca..f5603772e9862 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -41,7 +41,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class GPTNeoXAttention(nn.Module): @@ -189,14 +190,13 @@ def forward( @support_torch_compile class GPTNeoXModel(nn.Module): - def __init__( - self, - config: GPTNeoXConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.embed_in = VocabParallelEmbedding( @@ -249,11 +249,11 @@ def __init__( ): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.gpt_neox = GPTNeoXModel(config, cache_config, quant_config) + self.gpt_neox = GPTNeoXModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "gpt_neox")) self.embed_out = ParallelLMHead( config.vocab_size, config.hidden_size, diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index c3e23b7138e7f..d1e6e31f2b8d1 100644 --- 
a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -28,7 +28,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig, VllmConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.activation import SiluAndMul @@ -52,7 +52,8 @@ from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP -from .utils import PPMissingLayer, is_pp_missing_parameter, make_layers +from .utils import (PPMissingLayer, is_pp_missing_parameter, make_layers, + maybe_prefix) class GraniteMLP(nn.Module): @@ -257,15 +258,14 @@ def forward( @support_torch_compile class GraniteModel(nn.Module): - def __init__( - self, - config: GraniteConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.config = config self.padding_idx = config.pad_token_id lora_vocab = (lora_config.lora_extra_vocab_size * @@ -370,25 +370,17 @@ class GraniteForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "up_proj": ("gate_up_proj", 1), } - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config - self.model = GraniteModel(config, - cache_config, - quant_config, - lora_config=lora_config, - prefix="model") + self.model = GraniteModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) if get_pp_group().is_last_rank: self.unpadded_vocab_size = config.vocab_size if lora_config: diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 73f7c106e3d39..2ed115c56af45 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -28,7 +28,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig, VllmConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm @@ -47,7 +47,7 @@ from . 
import mixtral from .interfaces import SupportsLoRA, SupportsPP -from .utils import make_layers +from .utils import make_layers, maybe_prefix class GraniteMoeMoE(nn.Module): @@ -247,15 +247,14 @@ def forward( @support_torch_compile class GraniteMoeModel(nn.Module): - def __init__( - self, - config: GraniteMoeConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.padding_idx = config.pad_token_id lora_vocab = (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) if lora_config else 0 @@ -333,25 +332,17 @@ class GraniteMoeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): } embedding_padding_modules = ["lm_head"] - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config - self.model = GraniteMoeModel(config, - cache_config, - quant_config, - lora_config=lora_config, - prefix="model") + self.model = GraniteMoeModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.unpadded_vocab_size = config.vocab_size if lora_config: self.unpadded_vocab_size += lora_config.lora_extra_vocab_size diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index b676171b556a7..b234b602e6fbf 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -22,17 +22,15 @@ from PIL import Image from torch import nn # Temporary solution for transformers below 4.46.0. 
-from transformers import PretrainedConfig as Idefics3Config from transformers import ProcessorMixin as Idefics3ImageProcessor from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, VllmConfig +from vllm.config import VllmConfig from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) from vllm.logger import init_logger from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -48,7 +46,8 @@ # yapf: enable from .interfaces import SupportsMultiModal from .llama import LlamaModel -from .utils import AutoWeightsLoader, flatten_bn, merge_multimodal_embeddings +from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, + merge_multimodal_embeddings) logger = init_logger(__name__) @@ -417,13 +416,13 @@ def forward(self, image_hidden_states: torch.Tensor) -> torch.Tensor: class Idefics3Model(nn.Module): - def __init__( - self, - config: Idefics3Config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.padding_idx = self.config.text_config.pad_token_id self.vocab_size = self.config.text_config.vocab_size @@ -613,22 +612,18 @@ def forward( @INPUT_REGISTRY.register_input_processor(input_processor_for_idefics3) class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config - self.model = Idefics3Model(config, cache_config, quant_config) + self.model = Idefics3Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.image_token_id = self.config.image_token_id self.lm_head = ParallelLMHead( diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index cbedd0c8a0130..21fa6983063b8 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -250,14 +250,13 @@ def forward( @support_torch_compile class InternLM2Model(nn.Module): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -317,20 +316,13 @@ def forward( class InternLM2ForCausalLM(nn.Module, SupportsPP): - def __init__( - self, - vllm_config: VllmConfig, 
- prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.model = InternLM2Model(config, - cache_config, - quant_config, + self.model = InternLM2Model(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) self.output = ParallelLMHead(config.vocab_size, config.hidden_size, diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py index 51e2c64d5552d..34889d691a934 100644 --- a/vllm/model_executor/models/internlm2_ve.py +++ b/vllm/model_executor/models/internlm2_ve.py @@ -104,14 +104,13 @@ def forward( class InternLM2VEModel(InternLM2Model): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: - super().__init__(config, cache_config, quant_config) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: InternLM2VEDecoderLayer( @@ -159,12 +158,8 @@ def forward( class InternLM2VEForCausalLM(InternLM2ForCausalLM): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: - super().__init__(vllm_config, prefix=prefix) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 42bccf71273b3..77efc9a26ef7a 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -35,7 +35,7 @@ get_clip_num_patches) from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, - merge_multimodal_embeddings) + maybe_prefix, merge_multimodal_embeddings) IMG_START = '' IMG_END = '' @@ -435,13 +435,13 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: config, quant_config=quant_config, is_mono=self.is_mono, - prefix="vision_model", + prefix=maybe_prefix(prefix, "vision_model"), ) self.language_model = init_vllm_registered_model( config.text_config, vllm_config=vllm_config, - prefix="language_model") + prefix=maybe_prefix(prefix, "language_model")) self.mlp1 = self._init_mlp1(config) diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index ae3f5b01d5cce..4dc9271703a8d 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -44,7 +44,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class SwiGLUActivation(nn.Module): @@ -215,14 +216,13 @@ def forward( @support_torch_compile class JAISModel(nn.Module): - def __init__( - self, - config: JAISConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = 
"", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config assert not config.add_cross_attention assert not config.scale_attn_by_inverse_layer_idx @@ -293,11 +293,12 @@ def __init__( ): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.transformer = JAISModel(config, cache_config, quant_config) + self.transformer = JAISModel(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "transformer")) if self.config.tie_word_embeddings: self.lm_head = self.transformer.wte else: diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 72eb1017c2868..88fb8d5cf555a 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -7,7 +7,7 @@ from vllm.attention.backends.abstract import AttentionMetadata from vllm.attention.layer import Attention -from vllm.config import CacheConfig, LoRAConfig, VllmConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm @@ -29,6 +29,7 @@ _get_graph_batch_size) from .interfaces import HasInnerState, SupportsLoRA +from .utils import maybe_prefix KVCache = Tuple[torch.Tensor, torch.Tensor] @@ -258,14 +259,14 @@ def forward( class JambaModel(nn.Module): - def __init__( - self, - config: JambaConfig, - quant_config: Optional[QuantizationConfig] = None, - cache_config: Optional[CacheConfig] = None, - lora_config: Optional[LoRAConfig] = None, - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.config = config self.padding_idx = config.pad_token_id lora_vocab = ((lora_config.lora_extra_vocab_size * @@ -348,14 +349,9 @@ class JambaForCausalLM(nn.Module, HasInnerState, SupportsLoRA): } embedding_padding_modules = ["lm_head"] - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config scheduler_config = vllm_config.scheduler_config assert not cache_config.enable_prefix_caching, \ @@ -364,10 +360,8 @@ def __init__( super().__init__() self.config = config self.scheduler_config = scheduler_config - self.model = JambaModel(config, - cache_config=cache_config, - quant_config=quant_config, - lora_config=lora_config) + self.model = JambaModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.unpadded_vocab_size = config.vocab_size if lora_config: self.unpadded_vocab_size += lora_config.lora_extra_vocab_size diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index b765912387e2e..2472128976d88 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -28,7 +28,7 @@ from vllm.attention import Attention, AttentionMetadata from 
vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig, VllmConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.activation import SiluAndMul @@ -271,15 +271,14 @@ def forward( @support_torch_compile class LlamaModel(nn.Module): - def __init__( - self, - config: LlamaConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.config = config self.padding_idx = config.pad_token_id lora_vocab = (lora_config.lora_extra_vocab_size * @@ -492,24 +491,16 @@ class LlamaForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "norm": "model.norm" } - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config pooler_config = vllm_config.model_config.pooler_config self.config = config self.lora_config = lora_config - self.model = LlamaModel(config, - cache_config, - quant_config, - lora_config=lora_config, + self.model = LlamaModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) if get_pp_group().is_last_rank: self.unpadded_vocab_size = config.vocab_size @@ -652,23 +643,12 @@ class LlamaEmbeddingModel(nn.Module, SupportsLoRA, SupportsPP): } embedding_padding_modules = [] - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - lora_config = vllm_config.lora_config pooler_config = vllm_config.model_config.pooler_config - self.model = LlamaModel(config, - cache_config, - quant_config, - lora_config, + self.model = LlamaModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) self._pooler = Pooler.from_config_with_defaults( pooler_config, diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index c98462537728a..ca963fa1c52ea 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -32,7 +32,7 @@ dummy_seq_data_for_siglip, get_max_siglip_image_tokens, input_processor_for_siglip) from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, - merge_multimodal_embeddings) + maybe_prefix, merge_multimodal_embeddings) class LlavaImagePixelInputs(TypedDict): @@ -282,7 +282,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: config, quant_config, require_post_norm=False, - prefix="vision_tower") + prefix=maybe_prefix(prefix, "vision_tower")) self.multi_modal_projector = LlavaMultiModalProjector( vision_hidden_size=config.vision_config.hidden_size, text_hidden_size=config.text_config.hidden_size, @@ -291,7 +291,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: 
self.language_model = init_vllm_registered_model( config.text_config, vllm_config=vllm_config, - prefix="language_model") + prefix=maybe_prefix(prefix, "language_model")) self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index f187f8105b96a..0b621a23ec980 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -31,7 +31,7 @@ dummy_seq_data_for_siglip, get_siglip_image_feature_size, get_siglip_patch_grid_length, input_processor_for_siglip) from .utils import (AutoWeightsLoader, embed_multimodal, flatten_bn, - init_vllm_registered_model) + init_vllm_registered_model, maybe_prefix) class LlavaNextImagePixelInputs(TypedDict): @@ -296,7 +296,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: config, quant_config, require_post_norm=False, - prefix="vision_tower") + prefix=maybe_prefix(prefix, "vision_tower")) self.image_newline = nn.Parameter( torch.empty(config.text_config.hidden_size)) self.multi_modal_projector = LlavaMultiModalProjector( @@ -307,7 +307,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: self.language_model = init_vllm_registered_model( config.text_config, vllm_config=vllm_config, - prefix="language_model") + prefix=maybe_prefix(prefix, "language_model")) # The same model class supports both language generation and embedding # because the architecture name is the same diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index eceb0c0ab52df..b030c2f5fdc47 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -29,7 +29,7 @@ from .siglip import (SiglipVisionModel, dummy_image_for_siglip, dummy_seq_data_for_siglip) from .utils import (AutoWeightsLoader, init_vllm_registered_model, - merge_multimodal_embeddings) + maybe_prefix, merge_multimodal_embeddings) # For profile run _MAX_FRAMES_PER_VIDEO = 32 @@ -267,7 +267,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: config, quant_config, require_post_norm=False, - prefix="vision_tower") + prefix=maybe_prefix(prefix, "vision_tower")) self.vision_resampler = LlavaNextVideoPooler(config) self.multi_modal_projector = LlavaNextMultiModalProjector( vision_hidden_size=config.vision_config.hidden_size, @@ -276,7 +276,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: self.language_model = init_vllm_registered_model( config.text_config, vllm_config=vllm_config, - prefix="language_model") + prefix=maybe_prefix(prefix, "language_model")) self.make_empty_intermediate_tensors = ( self.language_model.model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index 64d373ce91509..c129f140d8d12 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -35,7 +35,7 @@ dummy_video_for_siglip, get_siglip_image_feature_size, get_siglip_patch_grid_length, input_processor_for_siglip) from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model, - merge_multimodal_embeddings) + maybe_prefix, merge_multimodal_embeddings) # Result in the max possible feature size (2x2 grid of 336x336px tiles) MAX_IMAGE_FEATURE_SIZE_HEIGHT = MAX_IMAGE_FEATURE_SIZE_WIDTH = 448 @@ -418,12 +418,12 @@ def __init__(self, 
vllm_config: VllmConfig, prefix: str = "") -> None: config, quant_config, require_post_norm=False, - prefix="vision_tower") + prefix=maybe_prefix(prefix, "vision_tower")) self.multi_modal_projector = LlavaOnevisionMultiModalProjector(config) self.language_model = init_vllm_registered_model( config.text_config, vllm_config=vllm_config, - prefix="language_model") + prefix=maybe_prefix(prefix, "language_model")) self.image_newline = nn.Parameter( torch.empty(config.text_config.hidden_size)) diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index 49e43f8cc683c..55c575e22a0f6 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -6,7 +6,7 @@ from transformers import MambaConfig from vllm.attention.backends.abstract import AttentionMetadata -from vllm.config import CacheConfig, LoRAConfig, VllmConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -26,6 +26,8 @@ from vllm.worker.model_runner import (_BATCH_SIZES_TO_CAPTURE, _get_graph_batch_size) +from .utils import maybe_prefix + KVCache = Tuple[torch.Tensor, torch.Tensor] @@ -73,14 +75,14 @@ def forward( class MambaModel(nn.Module): - def __init__( - self, - config: MambaConfig, - quant_config: Optional[QuantizationConfig] = None, - cache_config: Optional[CacheConfig] = None, - lora_config: Optional[LoRAConfig] = None, - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.config = config self.padding_idx = config.pad_token_id lora_vocab = ((lora_config.lora_extra_vocab_size * @@ -130,14 +132,9 @@ def forward( class MambaForCausalLM(nn.Module, HasInnerState, IsAttentionFree): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config scheduler_config = vllm_config.scheduler_config assert not cache_config.enable_prefix_caching, \ @@ -146,10 +143,8 @@ def __init__( super().__init__() self.config = config self.scheduler_config = scheduler_config - self.backbone = MambaModel(config, - cache_config=cache_config, - quant_config=quant_config, - lora_config=lora_config) + self.backbone = MambaModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "backbone")) self.unpadded_vocab_size = config.vocab_size if lora_config: self.unpadded_vocab_size += lora_config.lora_extra_vocab_size diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 559d9c4dd35bf..2db953329fd91 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -29,7 +29,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig, VllmConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, tensor_model_parallel_all_reduce) @@ -53,7 +53,8 @@ 
from .interfaces import SupportsLoRA, SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class MiniCPMMoE(nn.Module): @@ -351,15 +352,14 @@ def forward( @support_torch_compile class MiniCPMModel(nn.Module): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.config = config self.cache_config = cache_config self.quant_config = quant_config @@ -461,24 +461,22 @@ class MiniCPMForCausalLM(nn.Module, SupportsLoRA, SupportsPP): } embedding_padding_modules = ["lm_head"] - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config + self.prefix = prefix + self.vllm_config = vllm_config self.config = config self.lora_config = lora_config self.cache_config = cache_config self.quant_config = quant_config self.num_experts = getattr(self.config, "num_experts", 0) - self._init_model() + self._init_model(vllm_config=vllm_config, prefix=prefix) unpadded_vocab_size = config.vocab_size if lora_config: unpadded_vocab_size += lora_config.lora_extra_vocab_size @@ -502,11 +500,9 @@ def __init__( self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) - def _init_model(self): - self.model = MiniCPMModel(config=self.config, - cache_config=self.cache_config, - quant_config=self.quant_config, - lora_config=self.lora_config) + def _init_model(self, *, vllm_config: VllmConfig, prefix: str = ""): + self.model = MiniCPMModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) def forward( self, diff --git a/vllm/model_executor/models/minicpm3.py b/vllm/model_executor/models/minicpm3.py index eeedf55cf3e57..278c4bbe6e563 100644 --- a/vllm/model_executor/models/minicpm3.py +++ b/vllm/model_executor/models/minicpm3.py @@ -28,7 +28,7 @@ from transformers import PretrainedConfig from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -40,7 +40,7 @@ MiniCPMForCausalLM, MiniCPMModel) -from .utils import make_layers +from .utils import make_layers, maybe_prefix class MiniCPM3Attention(nn.Module): @@ -238,8 +238,6 @@ class MiniCPM3ForCausalLM(MiniCPMForCausalLM): # `embedding_modules` and `embedding_padding_modules` # are inherited from MiniCPMForCausalLM - def _init_model(self): - self.model = MiniCPM3Model(config=self.config, - cache_config=self.cache_config, - quant_config=self.quant_config, - lora_config=self.lora_config) + def _init_model(self, *, vllm_config: VllmConfig, prefix: str = ""): + self.model = MiniCPM3Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, 
"model")) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 9458204c5a038..aae534c0b5949 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -34,7 +34,7 @@ from typing_extensions import NotRequired from vllm.attention import AttentionMetadata -from vllm.config import CacheConfig, VllmConfig +from vllm.config import VllmConfig from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData, InputContext, token_inputs) from vllm.model_executor.layers.logits_processor import LogitsProcessor @@ -59,7 +59,7 @@ from .idefics2_vision_model import Idefics2VisionTransformer from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP -from .utils import is_pp_missing_parameter +from .utils import is_pp_missing_parameter, maybe_prefix _KEYS_TO_MODIFY_MAPPING = { "llm.lm_head": "lm_head", @@ -390,7 +390,6 @@ def __init__( ): config = vllm_config.model_config.hf_config multimodal_config = vllm_config.model_config.multimodal_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config super().__init__() # All MiniCPM-V models disable `tie_word_embeddings` but @@ -401,11 +400,11 @@ def __init__( self.multimodal_config = multimodal_config self.version = get_version_by_config(self.config) - self.llm = self.init_llm(config, - cache_config, - quant_config, - prefix="llm") - self.vpm = self.init_vision_module(config, quant_config, prefix="vpm") + self.llm = self.init_llm(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "llm")) + self.vpm = self.init_vision_module(config, + quant_config, + prefix=maybe_prefix(prefix, "vpm")) param_dtype = torch.get_default_dtype() self.vpm.to(dtype=param_dtype) self.vision_dim = (self.vpm.embed_dim if self.version == (2, 0) else @@ -414,13 +413,15 @@ def __init__( self.resampler = self.init_resampler(self.embed_dim, self.vision_dim, quant_config=quant_config, - prefix="resampler") + prefix=maybe_prefix( + prefix, "resampler")) self.resampler.to(device="cuda", dtype=param_dtype) # TODO: why is there _KEYS_TO_MODIFY_MAPPING? 
lm_head should be in llm self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config, - prefix="llm.lm_head") + prefix=maybe_prefix( + prefix, "llm.lm_head")) self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = get_sampler() @@ -661,9 +662,7 @@ def get_mm_mapping(self) -> MultiModelKeys: def init_llm( self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, prefix: str = "", ) -> nn.Module: raise NotImplementedError @@ -711,16 +710,10 @@ def __init__( def init_llm( self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, prefix: str = "", ) -> nn.Module: - - return LLMWrapper(MiniCPMModel(config, - cache_config=cache_config, - quant_config=quant_config, - prefix=prefix), + return LLMWrapper(MiniCPMModel(vllm_config=vllm_config, prefix=prefix), name="model") def init_vision_module( @@ -875,15 +868,10 @@ def __init__( def init_llm( self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, prefix: str = "", ) -> nn.Module: - return LLMWrapper(LlamaModel(config, - cache_config=cache_config, - quant_config=quant_config, - prefix=prefix), + return LLMWrapper(LlamaModel(vllm_config=vllm_config, prefix=prefix), name="model") def init_vision_module( @@ -1022,16 +1010,10 @@ def __init__( def init_llm( self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, + vllm_config: VllmConfig, prefix: str = "", ) -> nn.Module: - - return LLMWrapper(Qwen2Model(config, - cache_config=cache_config, - quant_config=quant_config, - prefix=prefix), + return LLMWrapper(Qwen2Model(vllm_config=vllm_config, prefix=prefix), name="model") def init_vision_module( @@ -1151,4 +1133,4 @@ def __new__(cls, vllm_config: VllmConfig, prefix: str = ""): if instance_class is None: raise ValueError( "Currently, MiniCPMV only supports versions 2.0, 2.5, and 2.6") - return instance_class(vllm_config, prefix=prefix) + return instance_class(vllm_config=vllm_config, prefix=prefix) diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 91ec3228c0d48..3eb2f60fd4fc7 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -28,7 +28,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig, VllmConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.layernorm import RMSNorm @@ -48,7 +48,8 @@ from .interfaces import SupportsLoRA, SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class MixtralMoE(nn.Module): @@ -248,15 +249,14 @@ def forward( @support_torch_compile class MixtralModel(nn.Module): - def __init__( - self, - config: MixtralConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: + 
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.padding_idx = config.pad_token_id lora_vocab = (lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) if lora_config else 0 @@ -332,24 +332,16 @@ class MixtralForCausalLM(nn.Module, SupportsLoRA, SupportsPP): } embedding_padding_modules = ["lm_head"] - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config - self.model = MixtralModel(config, - cache_config, - quant_config, - lora_config=lora_config, - prefix="model") + self.model = MixtralModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.unpadded_vocab_size = config.vocab_size if lora_config: self.unpadded_vocab_size += lora_config.lora_extra_vocab_size diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index aeac326776392..95cfb6f54dc10 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -49,7 +49,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class MixtralMLP(nn.Module): @@ -293,14 +294,13 @@ def forward( class MixtralModel(nn.Module): - def __init__( - self, - config: MixtralConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -350,18 +350,14 @@ def forward( class MixtralForCausalLM(nn.Module, SupportsPP): fall_back_to_pt_during_load = False - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.model = MixtralModel(config, cache_config, quant_config) + self.model = MixtralModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index 14aa515570f38..e5c1d28e6e7ea 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -33,7 +33,7 @@ import vllm.distributed.parallel_state as ps from vllm.attention import Attention, AttentionMetadata, AttentionType from vllm.attention.ops.paged_attn import PagedAttention -from vllm.config import CacheConfig, VllmConfig +from vllm.config import VllmConfig from vllm.distributed import 
get_tensor_model_parallel_world_size from vllm.inputs import (INPUT_REGISTRY, DummyData, EncoderDecoderInputs, InputContext, TokenInputs, token_inputs) @@ -56,6 +56,7 @@ from .clip import CLIPMLP from .interfaces import SupportsMultiModal from .llama import LlamaDecoderLayer, LlamaMLP +from .utils import maybe_prefix logger = init_logger(__name__) MLLAMA_IMAGE_TOKEN_ID = 128256 @@ -939,15 +940,13 @@ class MllamaTextModel(nn.Module): config_class = config_mllama.MllamaTextConfig base_model_prefix = "model" - def __init__( - self, - config: config_mllama.MllamaTextConfig, - cache_config: Optional[CacheConfig], - quant_config: Optional[QuantizationConfig], - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + config = vllm_config.model_config.hf_config.text_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size self.embed_tokens = VocabParallelEmbedding(config.vocab_size + 8, @@ -1029,18 +1028,14 @@ class MllamaForCausalLM(nn.Module): "MllamaCrossAttentionDecoderLayer", "MllamaSelfAttentionDecoderLayer" ] - def __init__( - self, - config: config_mllama.MllamaTextConfig, - cache_config: Optional[CacheConfig], - quant_config: Optional[QuantizationConfig], - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config.text_config + quant_config = vllm_config.quant_config + self.vocab_size = config.vocab_size - self.model = MllamaTextModel(config, - cache_config, - quant_config, + self.model = MllamaTextModel(vllm_config=vllm_config, prefix=f"{prefix}.model") self.lm_head = ParallelLMHead( config.vocab_size, @@ -1108,14 +1103,9 @@ class MllamaForConditionalGeneration(nn.Module, SupportsMultiModal): "up_proj": ("gate_up_proj", 1), } - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.vocab_size = config.text_config.vocab_size self.hidden_size = config.text_config.hidden_size @@ -1127,12 +1117,11 @@ def __init__( self.vision_model = MllamaVisionModel(config.vision_config, quant_config, - prefix="vision_model") + prefix=maybe_prefix( + prefix, "vision_model")) self.language_model = MllamaForCausalLM( - config.text_config, - cache_config=cache_config, - quant_config=quant_config, - prefix="language_model", + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "language_model"), ) self.multi_modal_projector = ColumnParallelLinear( config.vision_config.vision_output_dim, @@ -1140,7 +1129,7 @@ def __init__( bias=True, quant_config=quant_config, gather_output=True, - prefix="multi_modal_projector", + prefix=maybe_prefix(prefix, "multi_modal_projector"), ) self.logits_processor = LogitsProcessor(config.output_hidden_states, config.text_config.vocab_size) diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index cd462c4d0495e..035a1e2ab7b02 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -44,7 +44,8 @@ from .interfaces import SupportsMultiModal, SupportsPP from .utils import (get_vit_attn_backend, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, 
make_layers, + maybe_prefix) # TODO: hard-coded for now. Consider making it configurable. VIT_LAYERS = [-2, -9] @@ -716,14 +717,13 @@ def forward( @support_torch_compile class MolmoModel(nn.Module): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.embedding_size = config.embedding_size or config.vocab_size @@ -1024,14 +1024,9 @@ def input_processor_for_molmo(ctx: InputContext, inputs: DecoderOnlyInputs): @INPUT_REGISTRY.register_input_processor(input_processor_for_molmo) class MolmoForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config self.config = config @@ -1040,7 +1035,8 @@ def __init__( vision_config = VisionBackboneConfig() self.vision_backbone = MolmoVisionBackbone(config, vision_config, quant_config) - self.model = MolmoModel(config, cache_config, quant_config) + self.model = MolmoModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) if self.config.weight_tying: self.lm_head = self.model.transformer.wte diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 672c8e9c22260..e15c0fe8db060 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -26,7 +26,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) def _get_alibi_slopes( @@ -207,14 +208,13 @@ def forward( @support_torch_compile class MPTModel(nn.Module): - def __init__( - self, - config: MPTConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + assert config.embedding_fraction == 1.0 assert config.norm_type == "low_precision_layernorm" @@ -267,20 +267,16 @@ def forward( class MPTForCausalLM(nn.Module, SupportsPP): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config assert config.tie_word_embeddings self.quant_config = quant_config - self.transformer = MPTModel(config, cache_config, quant_config) + self.transformer = MPTModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "transformer")) self.lm_head = self.transformer.wte self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = get_sampler() diff --git a/vllm/model_executor/models/nemotron.py 
b/vllm/model_executor/models/nemotron.py index 5991cce642981..e09d7088a69ce 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -27,7 +27,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig, VllmConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -47,7 +47,8 @@ from .interfaces import SupportsLoRA, SupportsPP from .utils import (PPMissingLayer, is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) # The architecture is pretty similar to Llama, with these changes: # - There is no gate_proj, just up_proj @@ -293,15 +294,14 @@ def forward( @support_torch_compile class NemotronModel(nn.Module): - def __init__( - self, - config: NemotronConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.config = config self.padding_idx = config.pad_token_id lora_vocab = (lora_config.lora_extra_vocab_size * @@ -401,14 +401,9 @@ class NemotronForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "v_proj": ("qkv_proj", 2), } - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config assert isinstance(config, NemotronConfig) @@ -416,11 +411,8 @@ def __init__( self.config = config self.lora_config = lora_config - self.model = NemotronModel(config, - cache_config, - quant_config, - lora_config=lora_config, - prefix="model") + self.model = NemotronModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) if get_pp_group().is_last_rank: self.unpadded_vocab_size = config.vocab_size if lora_config: diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 6905f8521a8c3..3467ae5896494 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -46,7 +46,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class OlmoAttention(nn.Module): @@ -224,12 +225,13 @@ def forward( @support_torch_compile class OlmoModel(nn.Module): - def __init__(self, - config: OlmoConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.embed_tokens = 
VocabParallelEmbedding(config.vocab_size, @@ -291,17 +293,13 @@ class OlmoForCausalLM(nn.Module, SupportsPP): Extremely barebones HF model wrapper. """ - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config - self.model = OlmoModel(config, cache_config, quant_config) + self.model = OlmoModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) if config.tie_word_embeddings: self.lm_head = self.model.embed_tokens else: diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 8fa90d17003af..3d31919edd862 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -38,7 +38,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class OlmoeMoE(nn.Module): @@ -243,14 +244,13 @@ def forward( @support_torch_compile class OlmoeModel(nn.Module): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -309,18 +309,14 @@ class OlmoeForCausalLM(nn.Module, SupportsPP): fall_back_to_pt_during_load = False - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.model = OlmoeModel(config, cache_config, quant_config) + self.model = OlmoeModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index d378956b68cfc..58b6107eba347 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -293,14 +293,13 @@ def forward( @support_torch_compile class OPTModel(nn.Module): - def __init__( - self, - config: OPTConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.decoder = OPTDecoder(config, cache_config, quant_config, @@ -342,21 +341,14 @@ class OPTForCausalLM(nn.Module, SupportsPP): ".q_proj.", ".k_proj.", ".v_proj.", ".out_proj.", ".fc1.", ".fc2." 
] - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config super().__init__() self.config = config self.quant_config = quant_config - self.model = OPTModel(config, - cache_config, - quant_config, + self.model = OPTModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) if self.config.tie_word_embeddings: self.lm_head = self.model.decoder.embed_tokens diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index b400d4e3f5228..38821c8288347 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -29,7 +29,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class OrionMLP(nn.Module): @@ -208,14 +209,13 @@ def forward( @support_torch_compile class OrionModel(nn.Module): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -268,18 +268,14 @@ def forward( class OrionForCausalLM(nn.Module, SupportsPP): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.model = OrionModel(config, cache_config, quant_config) + self.model = OrionModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 69b7fe9d56847..eea229359255e 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -20,7 +20,7 @@ from .siglip import (SiglipVisionModel, dummy_image_for_siglip, dummy_seq_data_for_siglip, get_max_siglip_image_tokens) from .utils import (AutoWeightsLoader, init_vllm_registered_model, - merge_multimodal_embeddings) + maybe_prefix, merge_multimodal_embeddings) logger = init_logger(__name__) @@ -131,11 +131,7 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor: class PaliGemmaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config @@ -145,7 +141,8 @@ def __init__( self.vision_tower = SiglipVisionModel(config.vision_config, quant_config, - prefix="vision_tower") + prefix=maybe_prefix( + prefix, "vision_tower")) self.multi_modal_projector = PaliGemmaMultiModalProjector( 
vision_hidden_size=config.vision_config.hidden_size, projection_dim=config.vision_config.projection_dim) @@ -155,7 +152,7 @@ def __init__( self.language_model = init_vllm_registered_model( config.text_config, vllm_config=vllm_config, - prefix="language_model") + prefix=maybe_prefix(prefix, "language_model")) logit_scale = getattr(config, "logit_scale", 1.0) self.language_model.logits_processor.scale *= logit_scale diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index a86e2c1b4e4a1..2e34a7cc30873 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -45,7 +45,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class PersimmonMLP(nn.Module): @@ -212,12 +213,13 @@ def forward( @support_torch_compile class PersimmonModel(nn.Module): - def __init__(self, - config: PersimmonConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.vocab_size = config.vocab_size self.embed_tokens = VocabParallelEmbedding(config.vocab_size, @@ -265,20 +267,13 @@ def forward( class PersimmonForCausalLM(nn.Module, SupportsPP): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config self.config = config self.vocab_size = config.vocab_size - self.model = PersimmonModel(config, - cache_config=cache_config, - quant_config=quant_config) + self.model = PersimmonModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, bias=False) diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index fef921528b042..262f6996fc374 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -60,7 +60,8 @@ from .interfaces import SupportsLoRA, SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class PhiAttention(nn.Module): @@ -196,12 +197,13 @@ def forward( @support_torch_compile class PhiModel(nn.Module): - def __init__(self, - config: PhiConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.quant_config = quant_config self.embed_tokens = VocabParallelEmbedding(config.vocab_size, @@ -277,14 +279,9 @@ class PhiForCausalLM(nn.Module, SupportsLoRA, SupportsPP): embedding_modules = {} embedding_padding_modules = [] - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): 
super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config self.config = config @@ -294,7 +291,8 @@ def __init__( self.quant_config = quant_config - self.model = PhiModel(config, cache_config, quant_config) + self.model = PhiModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py index de1b09eba6c6d..8a5fb6d303e60 100644 --- a/vllm/model_executor/models/phi3_small.py +++ b/vllm/model_executor/models/phi3_small.py @@ -24,7 +24,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) def load_column_parallel_weight(param: torch.nn.Parameter, @@ -299,14 +300,13 @@ def forward( class Phi3SmallModel(nn.Module): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.embed_tokens = VocabParallelEmbedding(config.vocab_size, config.hidden_size) @@ -363,18 +363,14 @@ def forward( class Phi3SmallForCausalLM(nn.Module, SupportsPP): _tied_weights_keys = ["lm_head.weight"] - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.model = Phi3SmallModel(config, cache_config, quant_config) + self.model = Phi3SmallModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.vocab_size = config.vocab_size self.mup_width_multiplier = config.mup_width_multiplier self.lm_head = ParallelLMHead( diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 65131d61673a3..4b5dc944bce4b 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -45,7 +45,7 @@ from .clip import dummy_image_for_clip, dummy_seq_data_for_clip from .interfaces import SupportsMultiModal, SupportsPP -from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, +from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, maybe_prefix, merge_multimodal_embeddings) logger = init_logger(__name__) @@ -525,11 +525,7 @@ def input_processor_for_phi3v(ctx: InputContext, @INPUT_REGISTRY.register_input_processor(input_processor_for_phi3v) class Phi3VForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config @@ -544,12 +540,14 @@ def __init__( config.hidden_size, org_num_embeddings=config.vocab_size, quant_config=quant_config, - prefix="model.embed_tokens", + prefix=maybe_prefix(prefix, 
"model.embed_tokens"), ) # TODO: Optionally initializes this for supporting input embeddings. self.vision_embed_tokens = Phi3HDImageEmbedding( - config, quant_config, prefix="model.vision_embed_tokens") + config, + quant_config, + prefix=maybe_prefix(prefix, "model.vision_embed_tokens")) # The prefix is empty intentionally because default prefix of # LlamaForCausalLM is "model" diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 17d00c0ede2b2..6d71a8949111b 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -28,7 +28,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig, VllmConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size from vllm.model_executor.layers.fused_moe import FusedMoE from vllm.model_executor.layers.linear import (QKVParallelLinear, @@ -48,7 +48,8 @@ from .interfaces import SupportsLoRA, SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class PhiMoEConfig(PretrainedConfig): @@ -432,15 +433,14 @@ def forward( @support_torch_compile class PhiMoEModel(nn.Module): - def __init__( - self, - config: PhiMoEConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.padding_idx = config.pad_token_id lora_vocab = ((lora_config.lora_extra_vocab_size * (lora_config.max_loras or 1)) if lora_config else 0) @@ -529,23 +529,15 @@ class PhiMoEForCausalLM(nn.Module, SupportsLoRA, SupportsPP): } embedding_padding_modules = ["lm_head"] - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config - self.model = PhiMoEModel(config, - cache_config, - quant_config, - lora_config=lora_config) + self.model = PhiMoEModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.unpadded_vocab_size = config.vocab_size if lora_config: self.unpadded_vocab_size += lora_config.lora_extra_vocab_size diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 93919c9c051c0..6bd5e119dd2dd 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -38,7 +38,7 @@ from vllm.utils import is_list_of from .interfaces import SupportsMultiModal, SupportsPP -from .utils import init_vllm_registered_model +from .utils import init_vllm_registered_model, maybe_prefix try: from xformers import ops as xops @@ -152,11 +152,7 @@ def input_processor_for_pixtral(ctx: InputContext, inputs: DecoderOnlyInputs): class PixtralForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - def __init__( - self, - 
vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config multimodal_config = vllm_config.model_config.multimodal_config @@ -176,7 +172,7 @@ def __init__( self.language_model = init_vllm_registered_model( config.text_config, vllm_config=vllm_config, - prefix="language_model") + prefix=maybe_prefix(prefix, "language_model")) self.vision_encoder = VisionTransformer(self.vision_args) self.vision_language_adapter = VisionLanguageAdapter( diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index d3f10ee7c85ca..cc70099361dd2 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -50,7 +50,8 @@ from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import (flatten_bn, is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) logger = init_logger(__name__) @@ -552,14 +553,13 @@ def forward( @support_torch_compile class QWenModel(nn.Module): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.vocab_size = config.vocab_size @@ -865,20 +865,17 @@ def dummy_data_for_qwen( class QWenBaseModel(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config self.quant_config = quant_config - self.transformer = QWenModel(config, cache_config, quant_config) + self.transformer = QWenModel(vllm_config=vllm_config, + prefix=maybe_prefix( + prefix, "transformer")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index b0156a25ca5cf..2195ce49aa9a7 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -240,14 +240,13 @@ def forward( @support_torch_compile class Qwen2Model(nn.Module): - def __init__( - self, - config: Qwen2Config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -403,11 +402,7 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "up_proj": ("gate_up_proj", 1), } - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() 
config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config @@ -429,9 +424,7 @@ def __init__( self.lora_config = lora_config self.quant_config = quant_config - self.model = Qwen2Model(config, - cache_config, - quant_config, + self.model = Qwen2Model(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) if config.tie_word_embeddings: diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index 1057720e8c308..d30950361ad89 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -264,14 +264,9 @@ def input_mapper_for_qwen2_audio( class Qwen2AudioForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config multimodal_config = vllm_config.model_config.multimodal_config self.config = config @@ -283,8 +278,9 @@ def __init__( self.quant_config = quant_config - self.language_model = Qwen2Model(config.text_config, cache_config, - quant_config) + self.language_model = Qwen2Model( + vllm_config=vllm_config.with_hf_config(config.text_config), + prefix=prefix) self.unpadded_vocab_size = config.text_config.vocab_size if config.text_config.tie_word_embeddings: self.lm_head = self.language_model.embed_tokens diff --git a/vllm/model_executor/models/qwen2_cls.py b/vllm/model_executor/models/qwen2_cls.py index 25ecf76e35f22..020af88aadd98 100644 --- a/vllm/model_executor/models/qwen2_cls.py +++ b/vllm/model_executor/models/qwen2_cls.py @@ -17,7 +17,7 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors, PoolerOutput -from .utils import AutoWeightsLoader +from .utils import AutoWeightsLoader, maybe_prefix class Qwen2ForSequenceClassification(nn.Module): @@ -43,11 +43,7 @@ class Qwen2ForSequenceClassification(nn.Module): embedding_modules = {} embedding_padding_modules = [] - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config @@ -70,7 +66,8 @@ def __init__( self.lora_config = lora_config self.quant_config = quant_config - self.model = Qwen2Model(config, cache_config, quant_config) + self.model = Qwen2Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.score = RowParallelLinear(config.hidden_size, config.num_labels, diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index b1177f9c59063..51c0cd5664fd2 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -54,7 +54,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class Qwen2MoeMLP(nn.Module): @@ -315,14 +316,13 @@ def forward( @support_torch_compile class Qwen2MoeModel(nn.Module): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: 
VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -377,18 +377,14 @@ class Qwen2MoeForCausalLM(nn.Module, SupportsPP): fall_back_to_pt_during_load = False - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.model = Qwen2MoeModel(config, cache_config, quant_config) + self.model = Qwen2MoeModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 1f9411241bdd6..89768ec9dff37 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -18,7 +18,7 @@ from .interfaces import SupportsPP from .qwen2 import Qwen2Model -from .utils import AutoWeightsLoader +from .utils import AutoWeightsLoader, maybe_prefix class ReLU(nn.Module): @@ -55,11 +55,7 @@ class Qwen2ForRewardModel(nn.Module, SupportsPP): embedding_modules = {} embedding_padding_modules = [] - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config @@ -82,7 +78,8 @@ def __init__( self.lora_config = lora_config self.quant_config = quant_config - self.model = Qwen2Model(config, cache_config, quant_config) + self.model = Qwen2Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.score = nn.Sequential( ColumnParallelLinear(config.hidden_size, diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index ab80c1494d067..13109758767df 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -70,7 +70,7 @@ from .interfaces import SupportsLoRA, SupportsMultiModal, SupportsPP from .utils import (PPMissingLayer, get_vit_attn_backend, is_pp_missing_parameter, - make_empty_intermediate_tensors_factory) + make_empty_intermediate_tensors_factory, maybe_prefix) logger = init_logger(__name__) @@ -966,11 +966,7 @@ class Qwen2VLForConditionalGeneration(nn.Module, SupportsMultiModal, embedding_modules = {} embedding_padding_modules = [] - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config @@ -986,13 +982,11 @@ def __init__( config.vision_config, norm_eps=getattr(config, "rms_norm_eps", 1e-6), quant_config=self._maybe_ignore_quant_config(quant_config), - prefix="visual", + prefix=maybe_prefix(prefix, "visual"), ) - self.model = Qwen2Model(config, - cache_config, - quant_config, - prefix="model") + self.model = Qwen2Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) if get_pp_group().is_last_rank: if config.tie_word_embeddings: @@ -1001,7 +995,8 @@ def __init__( self.lm_head = 
ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config, - prefix="lm_head") + prefix=maybe_prefix( + prefix, "lm_head")) else: self.lm_head = PPMissingLayer() diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index ffabac8292dbd..4f03ca501fb68 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -29,7 +29,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.compilation.decorators import support_torch_compile -from vllm.config import CacheConfig, LoRAConfig, VllmConfig +from vllm.config import CacheConfig, VllmConfig from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) from vllm.model_executor.layers.activation import SiluAndMul @@ -53,7 +53,8 @@ from .interfaces import SupportsLoRA, SupportsPP from .utils import (PPMissingLayer, is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class SolarMLP(nn.Module): @@ -266,15 +267,14 @@ def forward( @support_torch_compile class SolarModel(nn.Module): - def __init__( - self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - lora_config: Optional[LoRAConfig] = None, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + self.config = config self.padding_idx = config.pad_token_id lora_vocab = ((lora_config.lora_extra_vocab_size * @@ -409,25 +409,17 @@ class SolarForCausalLM(nn.Module, SupportsLoRA, SupportsPP): "up_proj": ("gate_up_proj", 1), } - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config self.config = config self.lora_config = lora_config self.model = SolarModel( - config, - cache_config, - quant_config, - lora_config=lora_config, - prefix="model", + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model"), ) if get_pp_group().is_last_rank: self.unpadded_vocab_size = config.vocab_size diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 975d316977c37..1125f9e9f9617 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -43,7 +43,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class StablelmMLP(nn.Module): @@ -193,12 +194,13 @@ def forward( class StableLMEpochModel(nn.Module): - def __init__(self, - config: PretrainedConfig, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = '') -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.embed_tokens = VocabParallelEmbedding( 
config.vocab_size, config.hidden_size, @@ -245,18 +247,14 @@ def forward( class StablelmForCausalLM(nn.Module, SupportsPP): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config self.quant_config = quant_config - self.model = StableLMEpochModel(config, cache_config, quant_config) + self.model = StableLMEpochModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index ae61aa4e248a5..ce7a7957f52c4 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -43,7 +43,8 @@ from .interfaces import SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class Starcoder2Attention(nn.Module): @@ -195,12 +196,13 @@ def forward( @support_torch_compile class Starcoder2Model(nn.Module): - def __init__(self, - config: Starcoder2Config, - cache_config: Optional[CacheConfig] = None, - quant_config: Optional[QuantizationConfig] = None, - prefix: str = ""): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() + + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + self.config = config self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -245,19 +247,13 @@ def forward( class Starcoder2ForCausalLM(nn.Module, SupportsPP): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config - self.model = Starcoder2Model(config, - cache_config, - quant_config=quant_config) + self.model = Starcoder2Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.vocab_size = config.vocab_size self.unpadded_vocab_size = config.vocab_size if config.tie_word_embeddings: diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index d47f0091e0f9f..9fde22c016de0 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -34,7 +34,7 @@ from .interfaces import SupportsMultiModal, SupportsPP from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, - init_vllm_registered_model, + init_vllm_registered_model, maybe_prefix, merge_multimodal_embeddings_from_map) _AUDIO_PLACEHOLDER_TOKEN = 128002 @@ -339,11 +339,7 @@ def forward( @INPUT_REGISTRY.register_input_processor(input_processor_for_ultravox) class UltravoxModel(nn.Module, SupportsMultiModal, SupportsPP): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config multimodal_config = vllm_config.model_config.multimodal_config @@ -354,6 +350,8 @@ def __init__( self.secondary_weights = [] self.audio_tower = 
ModifiedWhisperEncoder(config.audio_config) if config.audio_model_id is not None: + # this prefix is not for initialization, but for loading weights + # note the trailing dot self.secondary_weights.append( DefaultModelLoader.Source( model_or_path=config.audio_model_id, @@ -362,8 +360,12 @@ def __init__( )) self.multi_modal_projector = UltravoxProjector(config) self.language_model = init_vllm_registered_model( - config.text_config, vllm_config, prefix="language_model") + config.text_config, + vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "language_model")) if config.text_model_id is not None: + # this prefix is not for initialization, but for loading weights + # note the trailing dot self.secondary_weights.append( DefaultModelLoader.Source(model_or_path=config.text_model_id, revision=None, diff --git a/vllm/model_executor/models/xverse.py b/vllm/model_executor/models/xverse.py index 7afb99176077b..153527da20d75 100644 --- a/vllm/model_executor/models/xverse.py +++ b/vllm/model_executor/models/xverse.py @@ -46,7 +46,8 @@ from .interfaces import SupportsLoRA, SupportsPP from .utils import (is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, make_layers) + make_empty_intermediate_tensors_factory, make_layers, + maybe_prefix) class XverseMLP(nn.Module): @@ -223,11 +224,7 @@ def forward( @support_torch_compile class XverseModel(nn.Module): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config @@ -315,15 +312,10 @@ class XverseForCausalLM(nn.Module, SupportsLoRA, SupportsPP): } embedding_padding_modules = ["lm_head"] - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ) -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config @@ -331,7 +323,8 @@ def __init__( self.lora_config = lora_config self.quant_config = quant_config - self.model = XverseModel(config, cache_config, quant_config) + self.model = XverseModel(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) From 9804ac7c7ce34a62f648cce579d89e355fb0bfc0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 11 Nov 2024 07:22:40 +0000 Subject: [PATCH 080/183] Bump the patch-update group with 5 updates (#10210) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements-test.txt | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/requirements-test.txt b/requirements-test.txt index fb322fcc72dc2..65695111e4dc5 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -14,7 +14,6 @@ aiohappyeyeballs==2.4.3 # via aiohttp aiohttp==3.10.10 # via - # -r requirements-test.in # datasets # fsspec # lm-eval @@ -40,15 +39,15 @@ attrs==24.2.0 # referencing audioread==3.0.1 # via librosa -awscli==1.35.19 +awscli==1.35.23 # via -r requirements-test.in bitsandbytes==0.44.1 # via -r requirements-test.in black==24.10.0 # via datamodel-code-generator -boto3==1.35.53 +boto3==1.35.57 # via tensorizer -botocore==1.35.53 +botocore==1.35.57 # via # awscli 
# boto3 @@ -82,7 +81,7 @@ cupy-cuda12x==13.3.0 # via ray cycler==0.12.1 # via matplotlib -datamodel-code-generator==0.26.2 +datamodel-code-generator==0.26.3 # via -r requirements-test.in dataproperty==1.0.1 # via @@ -263,7 +262,6 @@ numpy==1.26.4 # mistral-common # numba # numexpr - # opencv-python # opencv-python-headless # pandas # peft @@ -307,8 +305,6 @@ nvidia-nvjitlink-cu12==12.4.127 # torch nvidia-nvtx-cu12==12.4.127 # via torch -opencv-python==4.10.0.84 - # via -r requirements-test.in opencv-python-headless==4.10.0.84 # via mistral-common packaging==24.1 @@ -440,7 +436,6 @@ regex==2024.9.11 # transformers requests==2.32.3 # via - # -r requirements-test.in # buildkite-test-collector # datasets # evaluate @@ -521,7 +516,7 @@ tiktoken==0.7.0 # mistral-common timm==1.0.11 # via -r requirements-test.in -tokenizers==0.20.1 +tokenizers==0.20.3 # via transformers toml==0.10.2 # via datamodel-code-generator From 58170d65034f7a89edc56c716f1fcf05ff336aa5 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Mon, 11 Nov 2024 16:54:28 +0800 Subject: [PATCH 081/183] [Hardware][CPU] Add embedding models support for CPU backend (#10193) Signed-off-by: Isotr0py <2037008807@qq.com> --- .buildkite/run-cpu-test-ppc64le.sh | 3 +- .buildkite/run-cpu-test.sh | 3 +- .../embedding/language/test_embedding.py | 7 +- vllm/attention/backends/torch_sdpa.py | 14 +- vllm/model_executor/models/bert.py | 6 - vllm/worker/cpu_embedding_model_runner.py | 122 ++++++++++++++++++ vllm/worker/cpu_enc_dec_model_runner.py | 11 +- vllm/worker/cpu_model_runner.py | 57 ++++---- vllm/worker/cpu_worker.py | 14 +- 9 files changed, 185 insertions(+), 52 deletions(-) create mode 100644 vllm/worker/cpu_embedding_model_runner.py diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh index cd2bfd8bb5bf4..b17540633225f 100755 --- a/.buildkite/run-cpu-test-ppc64le.sh +++ b/.buildkite/run-cpu-test-ppc64le.sh @@ -25,8 +25,7 @@ function cpu_tests() { decord einops librosa peft Pillow sentence-transformers soundfile \ transformers_stream_generator matplotlib datamodel_code_generator pip install torchvision --index-url https://download.pytorch.org/whl/cpu - # Embedding models are not supported for CPU yet - # pytest -v -s tests/models/embedding/language + pytest -v -s tests/models/embedding/language pytest -v -s tests/models/encoder_decoder/language pytest -v -s tests/models/decoder_only/language/test_models.py pytest -v -s tests/models/decoder_only/audio_language -m cpu_model diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 8d4f4d1a681f2..7a0c9dc902bae 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -32,8 +32,7 @@ function cpu_tests() { decord einops librosa peft Pillow sentence-transformers soundfile \ transformers_stream_generator matplotlib datamodel_code_generator pip install torchvision --index-url https://download.pytorch.org/whl/cpu - # Embedding models are not supported for CPU yet - # pytest -v -s tests/models/embedding/language + pytest -v -s tests/models/embedding/language pytest -v -s tests/models/encoder_decoder/language pytest -v -s tests/models/decoder_only/language/test_models.py pytest -v -s tests/models/decoder_only/audio_language -m cpu_model diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index 39b6bbaf43180..cd920aec6502e 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -4,6 +4,8 @@ """ 
import pytest +from vllm.utils import current_platform + from ..utils import check_embeddings_close # Model, Guard @@ -21,15 +23,14 @@ @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_models( - monkeypatch, hf_runner, vllm_runner, example_prompts, model, dtype: str, ) -> None: - if model in ENCODER_ONLY: - monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS") + if model not in ENCODER_ONLY and current_platform.is_cpu(): + pytest.skip("Skip large embedding models test on CPU.") # The example_prompts has ending "\n", for example: # "Write a short story about a robot that dreams for the first time.\n" diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py index f985f70728a60..563178d3ab60d 100644 --- a/vllm/attention/backends/torch_sdpa.py +++ b/vllm/attention/backends/torch_sdpa.py @@ -158,7 +158,8 @@ def get_seq_lens( * Appropriate sequence lengths tensor for key & value ''' - if attn_type == AttentionType.DECODER: + if (attn_type == AttentionType.DECODER + or attn_type == AttentionType.ENCODER_ONLY): seq_lens_q = self.seq_lens seq_lens_kv = self.seq_lens elif attn_type == AttentionType.ENCODER: @@ -189,7 +190,8 @@ def get_attn_bias( * Appropriate attention bias value given the attention type ''' - if attn_type == AttentionType.DECODER: + if (attn_type == AttentionType.DECODER + or attn_type == AttentionType.ENCODER_ONLY): return self.attn_bias elif attn_type == AttentionType.ENCODER: return self.encoder_attn_bias @@ -215,7 +217,8 @@ def set_attn_bias( encoder/decoder cross-attention ''' - if attn_type == AttentionType.DECODER: + if (attn_type == AttentionType.DECODER + or attn_type == AttentionType.ENCODER_ONLY): self.attn_bias = attn_bias elif attn_type == AttentionType.ENCODER: self.encoder_attn_bias = attn_bias @@ -252,7 +255,8 @@ def get_seq_len_block_table_args( * Appropriate block tables (or None) ''' - if attn_type == AttentionType.DECODER: + if (attn_type == AttentionType.DECODER + or attn_type == AttentionType.ENCODER_ONLY): # Decoder self-attention # Choose max_seq_len based on whether we are in prompt_run return (self.seq_lens_tensor, self.max_decode_seq_len, @@ -420,6 +424,8 @@ def forward( "Torch SDPA backend doesn't support prefix decoding.") if decode_meta := attn_metadata.decode_metadata: + assert attn_type != AttentionType.ENCODER_ONLY, ( + "Encoder-only models should not have decode metadata.") # Decoding run. ( seq_lens_arg, diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 614d2db8ccff6..7dbc7fa0aaba4 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -5,7 +5,6 @@ from transformers import BertConfig from vllm.attention import Attention, AttentionMetadata, AttentionType -from vllm.attention.backends.xformers import XFormersImpl from vllm.config import CacheConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn @@ -218,11 +217,6 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.attn") - if not isinstance(self.attn.impl, XFormersImpl): - raise ValueError( - "Encoder-only models currently require XFORMERS attention " - "backend. 
Set VLLM_ATTENTION_BACKEND=XFORMERS to use BERT.") - def forward( self, hidden_states: torch.Tensor, diff --git a/vllm/worker/cpu_embedding_model_runner.py b/vllm/worker/cpu_embedding_model_runner.py new file mode 100644 index 0000000000000..86918fee65c5e --- /dev/null +++ b/vllm/worker/cpu_embedding_model_runner.py @@ -0,0 +1,122 @@ +import dataclasses +from typing import Any, Dict, List, Optional, Tuple, Type, Union + +import torch + +from vllm.model_executor.pooling_metadata import PoolingMetadata +from vllm.multimodal import MultiModalKwargs +from vllm.pooling_params import PoolingParams +from vllm.sequence import (IntermediateTensors, PoolerOutput, SequenceData, + SequenceGroupMetadata) +from vllm.worker.cpu_model_runner import (CPUModelRunnerBase, ModelInputForCPU, + ModelInputForCPUBuilder) + + +@dataclasses.dataclass(frozen=True) +class ModelInputForCPUWithPoolingMetadata(ModelInputForCPU): + """ + Used by the CPUEmbeddingModelRunner. + """ + pooling_metadata: Optional["PoolingMetadata"] = None + + +class CPUEmbeddingModelRunner( + CPUModelRunnerBase[ModelInputForCPUWithPoolingMetadata]): + _model_input_cls: Type[ModelInputForCPUWithPoolingMetadata] = ( + ModelInputForCPUWithPoolingMetadata) + _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder + + @torch.inference_mode() + def execute_model( + self, + model_input: ModelInputForCPUWithPoolingMetadata, + kv_caches: List[torch.Tensor], + intermediate_tensors: Optional[IntermediateTensors] = None, + num_steps: int = 1, + ) -> Optional[Union[List[PoolerOutput], IntermediateTensors]]: + if num_steps > 1: + raise ValueError( + "CPU worker does not support multi-step execution.") + + num_layers = self.model_config.get_num_layers(self.parallel_config) + # use an empty tensor instead of `None`` to force Dynamo to pass + # it by reference, rather by specializing on the value ``None``. + # the `dtype` argument does not matter, and we use `float32` as + # a placeholder (it has wide hardware support). + kv_caches = [ + torch.tensor([], dtype=torch.float32, device=self.device) + for _ in range(num_layers) + ] + + model_executable = self.model + execute_model_kwargs = { + "input_ids": + model_input.input_tokens, + "positions": + model_input.input_positions, + "kv_caches": + kv_caches, + "attn_metadata": + model_input.attn_metadata, + **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {}, + device=self.device), + "intermediate_tensors": + intermediate_tensors, + } + + hidden_states = model_executable(**execute_model_kwargs) + + return [ + self.model.pooler(hidden_states=hidden_states, + pooling_metadata=model_input.pooling_metadata) + ] + + def make_model_input_from_broadcasted_tensor_dict( + self, + tensor_dict: Dict[str, + Any]) -> ModelInputForCPUWithPoolingMetadata: + return ModelInputForCPUWithPoolingMetadata.from_broadcasted_tensor_dict( + tensor_dict, + attn_backend=self.attn_backend, + ) + + def prepare_model_input( + self, + seq_group_metadata_list: Optional[List[SequenceGroupMetadata]], + virtual_engine: int = 0, + finished_requests_ids: Optional[List[str]] = None + ) -> ModelInputForCPUWithPoolingMetadata: + assert seq_group_metadata_list is not None + model_input = self._prepare_model_input_tensors( + seq_group_metadata_list, finished_requests_ids) + # Prepare PoolingMetadata. 
+ assert model_input.seq_lens is not None + pooling_metadata = self._prepare_pooling(seq_group_metadata_list, + model_input.seq_lens) + + return dataclasses.replace(model_input, + pooling_metadata=pooling_metadata) + + def _prepare_pooling( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + prompt_lens: List[int], + ) -> PoolingMetadata: + """Prepare PoolingMetadata for the sequence group metadata list.""" + seq_groups: List[Tuple[List[int], PoolingParams]] = [] + for i, seq_group_metadata in enumerate(seq_group_metadata_list): + seq_ids = list(seq_group_metadata.seq_data.keys()) + pooling_params = seq_group_metadata.pooling_params + seq_groups.append((seq_ids, pooling_params)) + + seq_data: Dict[int, SequenceData] = {} + for seq_group_metadata in seq_group_metadata_list: + seq_data.update(seq_group_metadata.seq_data) + + pooling_metadata = PoolingMetadata( + seq_groups=seq_groups, + seq_data=seq_data, + prompt_lens=prompt_lens, + ) + + return pooling_metadata diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py index 994af7c5a455f..896e948948c7a 100644 --- a/vllm/worker/cpu_enc_dec_model_runner.py +++ b/vllm/worker/cpu_enc_dec_model_runner.py @@ -8,7 +8,7 @@ from vllm.multimodal import MultiModalKwargs from vllm.sequence import IntermediateTensors, SequenceGroupMetadata from vllm.utils import make_tensor_with_pad -from vllm.worker.cpu_model_runner import (CPUModelRunner, +from vllm.worker.cpu_model_runner import (CPUModelRunnerBase, ModelInputForCPUBuilder, ModelInputForCPUWithSamplingMetadata) from vllm.worker.model_runner_base import ( @@ -50,7 +50,8 @@ def from_broadcasted_tensor_dict( super().from_broadcasted_tensor_dict(tensor_dict, attn_backend)) -class CPUEncoderDecoderModelRunner(CPUModelRunner): +class CPUEncoderDecoderModelRunner( + CPUModelRunnerBase[EncoderDecoderModelInputForCPU]): _model_input_cls: Type[EncoderDecoderModelInputForCPU] = ( EncoderDecoderModelInputForCPU) _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder @@ -87,10 +88,8 @@ def prepare_model_input( virtual_engine: int = 0, finished_requests_ids: Optional[List[str]] = None ) -> EncoderDecoderModelInputForCPU: - model_input = super().prepare_model_input(seq_group_metadata_list, - virtual_engine, - finished_requests_ids) - model_input = cast(EncoderDecoderModelInputForCPU, model_input) + model_input = self._prepare_model_input_tensors( + seq_group_metadata_list, finished_requests_ids) ( attn_metadata, encoder_input_tokens_tensor, diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 1590184d6f831..09c62fbb9875f 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -2,7 +2,8 @@ import weakref from collections import defaultdict from dataclasses import dataclass -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union +from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, + TypeVar, Union) import torch from torch import nn @@ -31,6 +32,7 @@ logger = init_logger(__name__) +TModelInputForCPU = TypeVar('TModelInputForCPU', bound="ModelInputForCPU") _PAD_SLOT_ID = -1 @@ -60,10 +62,10 @@ def as_broadcastable_tensor_dict( @classmethod def from_broadcasted_tensor_dict( - cls: Type["ModelInputForCPU"], + cls: Type[TModelInputForCPU], tensor_dict: Dict[str, Any], attn_backend: Optional["AttentionBackend"] = None - ) -> "ModelInputForCPU": + ) -> TModelInputForCPU: if attn_backend is not None: tensor_dict = _init_attn_metadata_from_tensor_dict( 
attn_backend, tensor_dict) @@ -255,11 +257,14 @@ def _prepare_prompt( slot_mapping.append(_PAD_SLOT_ID) continue - block_number = block_table[i // - self.block_size] # type: ignore - block_offset = i % self.block_size # type: ignore - slot = block_number * self.block_size + block_offset - slot_mapping.append(slot) + # For encoder-only models, the block_table is None, + # and there is no need to initialize the slot_mapping. + if block_table is not None: + block_number = block_table[i // + self.block_size] # type: ignore + block_offset = i % self.block_size # type: ignore + slot = block_number * self.block_size + block_offset + slot_mapping.append(slot) if any(input_mrope_positions): input_positions = None # type: ignore @@ -402,10 +407,12 @@ def _prepare_decode( ) -class CPUModelRunner(ModelRunnerBase[ModelInputForCPU]): - _model_input_cls: Type[ModelInputForCPUWithSamplingMetadata] = ( - ModelInputForCPUWithSamplingMetadata) - _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder +class CPUModelRunnerBase(ModelRunnerBase[TModelInputForCPU]): + """ + Helper class for shared methods between CPU model runners. + """ + _model_input_cls: Type[TModelInputForCPU] + _builder_cls: Type[ModelInputForCPUBuilder] def __init__( self, @@ -448,20 +455,11 @@ def __init__( def load_model(self) -> None: self.model = get_model(vllm_config=self.vllm_config) - def make_model_input_from_broadcasted_tensor_dict( - self, - tensor_dict: Dict[str, Any], - ) -> ModelInputForCPUWithSamplingMetadata: - return ModelInputForCPUWithSamplingMetadata.from_broadcasted_tensor_dict( # noqa: E501 - tensor_dict, - attn_backend=self.attn_backend, - ) - def _prepare_model_input_tensors( self, seq_group_metadata_list: List[SequenceGroupMetadata], finished_requests_ids: Optional[List[str]] = None - ) -> ModelInputForCPUWithSamplingMetadata: + ) -> TModelInputForCPU: """Helper method to prepare the model input based on a given sequence group. Prepares metadata needed for the base model forward pass but not metadata for possible additional steps, e.g., sampling. 
@@ -473,6 +471,21 @@ def _prepare_model_input_tensors( return builder.build() # type: ignore + +class CPUModelRunner(CPUModelRunnerBase[ModelInputForCPUWithSamplingMetadata]): + _model_input_cls: Type[ModelInputForCPUWithSamplingMetadata] = ( + ModelInputForCPUWithSamplingMetadata) + _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder + + def make_model_input_from_broadcasted_tensor_dict( + self, + tensor_dict: Dict[str, Any], + ) -> ModelInputForCPUWithSamplingMetadata: + return ModelInputForCPUWithSamplingMetadata.from_broadcasted_tensor_dict( # noqa: E501 + tensor_dict, + attn_backend=self.attn_backend, + ) + def prepare_model_input( self, seq_group_metadata_list: List[SequenceGroupMetadata], diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 162e1e4be873b..bc9164bd9d5df 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -14,8 +14,9 @@ from vllm.model_executor import set_random_seed from vllm.sequence import ExecuteModelRequest from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE +from vllm.worker.cpu_embedding_model_runner import CPUEmbeddingModelRunner from vllm.worker.cpu_enc_dec_model_runner import CPUEncoderDecoderModelRunner -from vllm.worker.cpu_model_runner import CPUModelRunner +from vllm.worker.cpu_model_runner import CPUModelRunner, CPUModelRunnerBase from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, LoraNotSupportedWorkerBase, WorkerBase, WorkerInput) @@ -150,21 +151,20 @@ def __init__( else: self.local_omp_cpuid = omp_cpuids.split("|")[rank] - ModelRunnerClass: Type[CPUModelRunner] = CPUModelRunner + ModelRunnerClass: Type[CPUModelRunnerBase] = CPUModelRunner if self.model_config.task == "embedding": - raise NotImplementedError( - "Embedding models are not supported for CPU backend") - # ModelRunnerClass = CPUEmbeddingModelRunner + ModelRunnerClass = CPUEmbeddingModelRunner elif self.model_config.is_encoder_decoder: ModelRunnerClass = CPUEncoderDecoderModelRunner - self.model_runner: CPUModelRunner = ModelRunnerClass( + self.model_runner: CPUModelRunnerBase = ModelRunnerClass( vllm_config=vllm_config, kv_cache_dtype=kv_cache_dtype, is_driver_worker=is_driver_worker) # Uninitialized cache engine. Will be initialized by # initialize_cache. self.cache_engine: List[CPUCacheEngine] - self.cpu_cache: List[List[torch.Tensor]] + # Initialize cpu_cache as embedding models don't initialize kv_caches + self.cpu_cache: Optional[List[List[torch.Tensor]]] = None # Torch profiler. 
Enabled and configured through env vars: # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace From 36e4acd02a955f71ebb7b220cbfae4a4379bc57b Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Mon, 11 Nov 2024 17:43:23 +0800 Subject: [PATCH 082/183] [LoRA][Kernel] Remove the unused libentry module (#10214) Signed-off-by: Jee Jee Li --- tests/lora/test_punica_sizes.py | 73 ++++-------- tests/lora/test_punica_variation.py | 73 ++++-------- vllm/lora/ops/sgmv_expand.py | 3 - vllm/lora/ops/sgmv_expand_slice.py | 3 - vllm/lora/ops/sgmv_shrink.py | 3 - vllm/triton_utils/__init__.py | 3 +- vllm/triton_utils/libentry.py | 167 ---------------------------- 7 files changed, 49 insertions(+), 276 deletions(-) delete mode 100644 vllm/triton_utils/libentry.py diff --git a/tests/lora/test_punica_sizes.py b/tests/lora/test_punica_sizes.py index e756544d96e98..66b5f82bbb97d 100644 --- a/tests/lora/test_punica_sizes.py +++ b/tests/lora/test_punica_sizes.py @@ -4,8 +4,6 @@ whether the corresponding Triton kernel can run normally when tensor parallelism is set to [1, 2, 4, 8, 16, 32, 64]. """ -from unittest.mock import patch - import pytest import torch @@ -16,7 +14,6 @@ from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice from vllm.lora.ops.sgmv_shrink import sgmv_shrink from vllm.platforms import current_platform -from vllm.triton_utils.libentry import LibEntry from .utils import (generate_data, generate_data_for_expand_nslices, ref_torch_groupgemm) @@ -235,9 +232,6 @@ def test_punica_bgmv( seed: int, device: str, ): - from vllm.lora.ops.bgmv_expand import _bgmv_expand_kernel - from vllm.lora.ops.bgmv_shrink import _bgmv_shrink_kernel - torch.set_default_device(device) current_platform.seed_everything(seed) @@ -262,33 +256,21 @@ def test_punica_bgmv( device, ) if op_type == "shrink": - # The current _bgmv_shrink_kernel does not require the libentry - # decoration. The purpose of adding this patch is to test the - # correctness of libentry. - with patch( - "vllm.lora.ops.bgmv_shrink._bgmv_shrink_kernel", - LibEntry(_bgmv_shrink_kernel), - ): - bgmv_shrink( - inputs_tensor, - lora_weights, - our_out_tensor, - indices, - scaling, - ) + bgmv_shrink( + inputs_tensor, + lora_weights, + our_out_tensor, + indices, + scaling, + ) else: - # ditto - with patch( - "vllm.lora.ops.bgmv_expand._bgmv_expand_kernel", - LibEntry(_bgmv_expand_kernel), - ): - bgmv_expand( - inputs_tensor, - lora_weights, - our_out_tensor, - indices, - add_inputs=True, - ) + bgmv_expand( + inputs_tensor, + lora_weights, + our_out_tensor, + indices, + add_inputs=True, + ) ref_torch_groupgemm( ref_out_tensor, inputs_tensor, @@ -324,7 +306,6 @@ def test_punica_expand_nslices( seed: int, device: str, ): - from vllm.lora.ops.bgmv_expand_slice import _bgmv_expand_slice_kernel torch.set_default_device(device) current_platform.seed_everything(seed) @@ -374,22 +355,16 @@ def test_punica_expand_nslices( add_inputs=True, ) else: - # The current _bgmv_expand_slice_kernel does not require the - # libentry decoration. The purpose of adding this patch is to test - # the correctness of libentry. 
- with patch( - "vllm.lora.ops.bgmv_expand_slice._bgmv_expand_slice_kernel", - LibEntry(_bgmv_expand_slice_kernel), - ): - bgmv_expand_slice( - inputs_tensor, - lora_weights, - our_outputs, - indices, - slice_offset, - slice_size=hidden_size, - add_inputs=True, - ) + + bgmv_expand_slice( + inputs_tensor, + lora_weights, + our_outputs, + indices, + slice_offset, + slice_size=hidden_size, + add_inputs=True, + ) ref_torch_groupgemm( ref_outputs[:, slice_offset:slice_offset + hidden_size], inputs_tensor, diff --git a/tests/lora/test_punica_variation.py b/tests/lora/test_punica_variation.py index dc0edeb10ef46..52b82f25d23e1 100644 --- a/tests/lora/test_punica_variation.py +++ b/tests/lora/test_punica_variation.py @@ -3,8 +3,6 @@ under different conditions, including various batches, numbers of LoRA , and maximum ranks. """ -from unittest.mock import patch - import pytest import torch @@ -15,7 +13,6 @@ from vllm.lora.ops.sgmv_expand_slice import sgmv_expand_slice from vllm.lora.ops.sgmv_shrink import sgmv_shrink from vllm.platforms import current_platform -from vllm.triton_utils.libentry import LibEntry from .utils import (generate_data, generate_data_for_expand_nslices, ref_torch_groupgemm) @@ -150,8 +147,6 @@ def test_punica_bgmv( seed: int, device: str, ): - from vllm.lora.ops.bgmv_expand import _bgmv_expand_kernel - from vllm.lora.ops.bgmv_shrink import _bgmv_shrink_kernel torch.set_default_device(device) current_platform.seed_everything(seed) @@ -177,33 +172,22 @@ def test_punica_bgmv( device, ) if op_type == "shrink": - # The current _bgmv_shrink_kernel does not require the libentry - # decoration. The purpose of adding this patch is to test the - # correctness of libentry. - with patch( - "vllm.lora.ops.bgmv_shrink._bgmv_shrink_kernel", - LibEntry(_bgmv_shrink_kernel), - ): - bgmv_shrink( - inputs_tensor, - lora_weights, - our_out_tensor, - indices, - scaling, - ) + bgmv_shrink( + inputs_tensor, + lora_weights, + our_out_tensor, + indices, + scaling, + ) else: - # ditto - with patch( - "vllm.lora.ops.bgmv_expand._bgmv_expand_kernel", - LibEntry(_bgmv_expand_kernel), - ): - bgmv_expand( - inputs_tensor, - lora_weights, - our_out_tensor, - indices, - add_inputs=True, - ) + + bgmv_expand( + inputs_tensor, + lora_weights, + our_out_tensor, + indices, + add_inputs=True, + ) ref_torch_groupgemm( ref_out_tensor, inputs_tensor, @@ -239,8 +223,6 @@ def test_punica_expand_nslices( seed: int, device: str, ): - from vllm.lora.ops.bgmv_expand_slice import _bgmv_expand_slice_kernel - torch.set_default_device(device) current_platform.seed_everything(seed) @@ -289,22 +271,15 @@ def test_punica_expand_nslices( add_inputs=True, ) else: - # The current _bgmv_expand_slice_kernel does not require the - # libentry decoration. The purpose of adding this patch is to test - # the correctness of libentry. 
- with patch( - "vllm.lora.ops.bgmv_expand_slice._bgmv_expand_slice_kernel", - LibEntry(_bgmv_expand_slice_kernel), - ): - bgmv_expand_slice( - inputs_tensor, - lora_weights, - our_outputs, - indices, - slice_offset, - slice_size=hidden_size, - add_inputs=True, - ) + bgmv_expand_slice( + inputs_tensor, + lora_weights, + our_outputs, + indices, + slice_offset, + slice_size=hidden_size, + add_inputs=True, + ) ref_torch_groupgemm( ref_outputs[:, slice_offset:slice_offset + hidden_size], inputs_tensor, diff --git a/vllm/lora/ops/sgmv_expand.py b/vllm/lora/ops/sgmv_expand.py index adb3ab5b46b87..4910cb4061298 100644 --- a/vllm/lora/ops/sgmv_expand.py +++ b/vllm/lora/ops/sgmv_expand.py @@ -9,10 +9,7 @@ import triton import triton.language as tl -from vllm.triton_utils import libentry - -@libentry() @triton.jit def _sgmv_expand_kernel( input_ptr, diff --git a/vllm/lora/ops/sgmv_expand_slice.py b/vllm/lora/ops/sgmv_expand_slice.py index efa234520ab87..844f5cec39e93 100644 --- a/vllm/lora/ops/sgmv_expand_slice.py +++ b/vllm/lora/ops/sgmv_expand_slice.py @@ -9,10 +9,7 @@ import triton import triton.language as tl -from vllm.triton_utils import libentry - -@libentry() @triton.jit def _sgmv_expand_slice_kernel( input_ptr, diff --git a/vllm/lora/ops/sgmv_shrink.py b/vllm/lora/ops/sgmv_shrink.py index c003f3dc0ce9e..b4d893047b06b 100644 --- a/vllm/lora/ops/sgmv_shrink.py +++ b/vllm/lora/ops/sgmv_shrink.py @@ -9,10 +9,7 @@ import triton import triton.language as tl -from vllm.triton_utils import libentry - -@libentry() @triton.jit def _sgmv_shrink_kernel( input_ptr, diff --git a/vllm/triton_utils/__init__.py b/vllm/triton_utils/__init__.py index 3f57c22e1f2e4..568185383aa5c 100644 --- a/vllm/triton_utils/__init__.py +++ b/vllm/triton_utils/__init__.py @@ -6,6 +6,5 @@ from vllm.triton_utils.custom_cache_manager import ( maybe_set_triton_cache_manager) - from vllm.triton_utils.libentry import libentry - __all__ += ["maybe_set_triton_cache_manager", "libentry"] + __all__ += ["maybe_set_triton_cache_manager"] diff --git a/vllm/triton_utils/libentry.py b/vllm/triton_utils/libentry.py deleted file mode 100644 index 4335c7adfc13b..0000000000000 --- a/vllm/triton_utils/libentry.py +++ /dev/null @@ -1,167 +0,0 @@ -# Copied From https://github.com/FlagOpen/FlagGems - -import inspect - -import triton - - -class LibEntry(triton.KernelInterface): - - def __init__( - self, - fn, - ): - self.fn = fn - self.arg_names = fn.arg_names - self.divisibility = 16 - self.kernel_cache = dict() - fn = self.fn - while not isinstance(fn, triton.runtime.JITFunction): - fn = fn.fn - self.jit_function: triton.runtime.JITFunction = fn - self.specialize_indices = [ - p.num for p in self.jit_function.params - if not p.is_constexpr and not p.do_not_specialize - ] - self.do_not_specialize_indices = [ - p.num for p in self.jit_function.params - if not p.is_constexpr and p.do_not_specialize - ] - - def key(self, spec_args, dns_args, const_args): - spec_key = [(arg.dtype, arg.data_ptr() % - self.divisibility == 0) if hasattr(arg, "data_ptr") else - (type(arg), arg) for arg in spec_args] - dns_key = [ - arg.dtype if hasattr( - arg, "data_ptr") else type(arg) if not isinstance(arg, int) - else "i32" if arg >= -(2**31) and arg <= 2**31 - - 1 else "u64" if arg >= 2**63 and arg <= 2**64 - 1 else "i64" - for arg in dns_args - ] - # const args passed by position - return tuple(spec_key + dns_key + const_args) - - def run(self, *args, **kwargs): - grid = kwargs["grid"] - # collect all the arguments - spec_args = [] # specialize arguments - dns_args = 
[] # do not specialize arguments - const_args = [] # constexpr arguments - k_args = [] # kernel arguments - for i, arg in enumerate(args): - if i in self.specialize_indices: - k_args.append(arg) - spec_args.append(arg) - elif i in self.do_not_specialize_indices: - k_args.append(arg) - dns_args.append(arg) - else: - const_args.append(arg) - for p in self.jit_function.params[len(args):]: - if p.name in kwargs: - val = kwargs[p.name] - elif p.default is inspect._empty: - continue - else: - val = p.default - - if p.is_constexpr: - const_args.append(val) - elif p.do_not_specialize: - dns_args.append(val) - k_args.append(val) - else: - spec_args.append(val) - k_args.append(val) - - entry_key = self.key(spec_args, dns_args, const_args) - - if entry_key not in self.kernel_cache: - # compile the kernel also completes the related computations - kernel = self.fn.run(*args, **kwargs) - fn = self.fn - # collect constexpr arguments for grid computation - constexprs = {} - while not isinstance(fn, triton.runtime.JITFunction): - if isinstance(fn, triton.runtime.Autotuner): - config = fn.best_config - constexprs["num_warps"] = config.num_warps - constexprs["num_stages"] = config.num_stages - constexprs["num_ctas"] = config.num_ctas - constexprs = {**constexprs, **config.kwargs} - elif isinstance(fn, triton.runtime.Heuristics): - for v, heur in fn.values.items(): - constexprs[v] = heur({ - **dict(zip(fn.arg_names, args)), - **kwargs, - **constexprs, - }) - else: - raise RuntimeError("Invalid Runtime Function") - fn = fn.fn - # In vLLM, certain kernels like fused_moe_kernel get the - # best_config(as kwargs) from a configuration json file, rather - # than using Autotuner & Heuristics. Therefore, all their constexprs - # (tl.constexpr) are assigned values through the following loop. - for p in self.jit_function.params: - if p.is_constexpr and p.name not in constexprs: - constexprs[p.name] = p.default #default=inspect._empty - self.kernel_cache[entry_key] = (kernel, constexprs) - else: - # load kernel from cache directly - kernel, constexprs = self.kernel_cache[entry_key] - - if callable(grid): - # collect all arguments to the grid fn,ie: - # 1. args, - # 2. kwargs, - # 3. all all other captured arguments in CompiledKernel from - # Autotunner & Heuristics when kwargs & captured args conflict, - # captured args have higher priority - # 4. We must filter out captured args with default value firstly - constexprs = { - k: v - for k, v in constexprs.items() if v is not inspect._empty - } - meta = { - **dict(zip(self.arg_names, args)), - **kwargs, - **constexprs, - } - grid = grid(meta) - if isinstance(grid, tuple): - grid = grid + (1, 1) - elif isinstance(grid, list): - grid = grid + [1, 1] - kernel[grid[0:3]](*k_args) - # maintaining the same return type as the JITFunction.run - return kernel - - -def libentry(): - """ - Decorator for triton library entries. - Motivation: - The runtime overhead of Triton kernels is the reason for the lower - performance of small kernels, particularly evident with smaller models. - Using this decorator can reduce Triton runtime overhead. - How: - The `run` function of JITFunction needs to accomplish: - - Parameter binding using inspect - - KernelArg type wrapping - - Cache key calculation - When dealing with small size, these steps can become bottlenecks in - Triton runtime. Libentry simplifies these steps to reduce runtime - overhead, thereby improving the runtime expenses of small kernels. 
- NOTE: - When Triton is upgraded to version 3.0.0, libentry can be removed, - see: https://github.com/vllm-project/vllm/pull/5036#issuecomment-2243396245 - - - """ - - def decorator(fn): - return LibEntry(fn) - - return decorator From 5fb1f935b04c29c5c379952681a8a49ad533355d Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Mon, 11 Nov 2024 02:01:18 -0800 Subject: [PATCH 083/183] [V1] Allow `tokenizer_mode` and `trust_remote_code` for Detokenizer (#10211) Signed-off-by: Roger Wang --- vllm/v1/engine/llm_engine.py | 5 ++++- vllm/v1/tokenizer/detokenizer.py | 19 +++++++++++++++---- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index f805c5e69bc1c..38d95ab44bb90 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -125,7 +125,10 @@ def __init__( # Ping the tokenizer to ensure liveness if it runs in a # different process. self.tokenizer.ping() - self.detokenizer = Detokenizer(self.model_config.tokenizer) + self.detokenizer = Detokenizer( + tokenizer_name=self.model_config.tokenizer, + tokenizer_mode=self.model_config.tokenizer_mode, + trust_remote_code=self.model_config.trust_remote_code) self.generation_config_fields = _load_generation_config_dict( model_config) diff --git a/vllm/v1/tokenizer/detokenizer.py b/vllm/v1/tokenizer/detokenizer.py index e485fcc3522d9..bf1be5d54140a 100644 --- a/vllm/v1/tokenizer/detokenizer.py +++ b/vllm/v1/tokenizer/detokenizer.py @@ -42,13 +42,17 @@ class DetokenizerOutputs(msgspec.Struct): class Detokenizer: - def __init__(self, tokenizer_name: str): + def __init__(self, tokenizer_name: str, tokenizer_mode: str, + trust_remote_code: bool): # FIXME(woosuk): Currently, the detokenizer is just a hacky prototype. # For example, it does not terminate properly. We need to improve this. self.push_port = get_open_port() self.pull_port = get_open_port() - self.detokenizer = DetokenizerProc(tokenizer_name, self.push_port, - self.pull_port) + self.detokenizer = DetokenizerProc(tokenizer_name=tokenizer_name, + tokenizer_mode=tokenizer_mode, + trust_remote_code=trust_remote_code, + push_port=self.push_port, + pull_port=self.pull_port) self.detokenizer.start() self.zmq_context = zmq.Context() @@ -82,11 +86,15 @@ class DetokenizerProc(multiprocessing.Process): def __init__( self, tokenizer_name: str, + tokenizer_mode: str, + trust_remote_code: bool, pull_port: int, push_port: int, ): super().__init__() self.tokenizer_name = tokenizer_name + self.tokenizer_mode = tokenizer_mode + self.trust_remote_code = trust_remote_code # NOTE: The pull_port of the detokenizer should be the same as the # push_port of the engine. Vice versa. self.pull_port = pull_port @@ -97,7 +105,10 @@ def run(self): # not picklable. 
self.msgpack_encoder = msgpack.Encoder() self.msgpack_decoder = msgpack.Decoder(DetokenizerInputs) - self.tokenizer = get_tokenizer(self.tokenizer_name) + self.tokenizer = get_tokenizer( + tokenizer_name=self.tokenizer_name, + tokenizer_mode=self.tokenizer_mode, + trust_remote_code=self.trust_remote_code) # req_id -> RequestState self.request_states: Dict[str, RequestState] = {} From 2cebda42bb9f52a99e566b9b439fdcca2e9f950e Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Mon, 11 Nov 2024 20:37:58 +0800 Subject: [PATCH 084/183] [Bugfix][Hardware][CPU] Fix broken encoder-decoder CPU runner (#10218) Signed-off-by: Isotr0py <2037008807@qq.com> --- .buildkite/run-cpu-test-ppc64le.sh | 2 ++ .buildkite/run-cpu-test.sh | 2 ++ vllm/worker/cpu_embedding_model_runner.py | 1 + vllm/worker/cpu_enc_dec_model_runner.py | 11 +++++++++++ 4 files changed, 16 insertions(+) diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh index b17540633225f..79526adef2a79 100755 --- a/.buildkite/run-cpu-test-ppc64le.sh +++ b/.buildkite/run-cpu-test-ppc64le.sh @@ -18,6 +18,8 @@ source /etc/environment docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN="$HF_TOKEN" --name cpu-test cpu-test function cpu_tests() { + set -e + # Run basic model test docker exec cpu-test bash -c " set -e diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 7a0c9dc902bae..26a202b09b8a2 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -20,6 +20,8 @@ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/hugg --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2 function cpu_tests() { + set -e + # offline inference docker exec cpu-test-avx2 bash -c " set -e diff --git a/vllm/worker/cpu_embedding_model_runner.py b/vllm/worker/cpu_embedding_model_runner.py index 86918fee65c5e..7053075bf4d8f 100644 --- a/vllm/worker/cpu_embedding_model_runner.py +++ b/vllm/worker/cpu_embedding_model_runner.py @@ -95,6 +95,7 @@ def prepare_model_input( model_input.seq_lens) return dataclasses.replace(model_input, + virtual_engine=virtual_engine, pooling_metadata=pooling_metadata) def _prepare_pooling( diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py index 896e948948c7a..d040831870bd8 100644 --- a/vllm/worker/cpu_enc_dec_model_runner.py +++ b/vllm/worker/cpu_enc_dec_model_runner.py @@ -4,6 +4,7 @@ import torch from vllm.attention import AttentionMetadata +from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.sampler import SamplerOutput from vllm.multimodal import MultiModalKwargs from vllm.sequence import IntermediateTensors, SequenceGroupMetadata @@ -96,11 +97,21 @@ def prepare_model_input( encoder_input_positions_tensor, ) = self._prepare_encoder_model_input_tensors(seq_group_metadata_list, model_input) + # Sampling metadata is only required for the final pp group + generators = self.get_generators(finished_requests_ids) + sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, + model_input.seq_lens, + model_input.query_lens, + self.device, + pin_memory=False, + generators=generators) return dataclasses.replace( model_input, + sampling_metadata=sampling_metadata, attn_metadata=attn_metadata, encoder_input_tokens=encoder_input_tokens_tensor, encoder_input_positions=encoder_input_positions_tensor, + 
virtual_engine=virtual_engine, ) def _prepare_encoder_model_input_tensors( From 874f551b3626321f6bf9a902b8fd9fc1fa7c7f2e Mon Sep 17 00:00:00 2001 From: harrywu <63134210+HarryWu99@users.noreply.github.com> Date: Tue, 12 Nov 2024 00:17:38 +0800 Subject: [PATCH 085/183] [Metrics] add more metrics (#4464) Signed-off-by: DarkLight1337 Co-authored-by: Robert Shaw Co-authored-by: DarkLight1337 --- examples/production_monitoring/grafana.json | 384 +++++++++++++++++--- vllm/engine/llm_engine.py | 31 +- vllm/engine/metrics.py | 66 +++- vllm/engine/metrics_types.py | 6 + 4 files changed, 437 insertions(+), 50 deletions(-) diff --git a/examples/production_monitoring/grafana.json b/examples/production_monitoring/grafana.json index d1389f5392c8c..f76a61bb5eec3 100644 --- a/examples/production_monitoring/grafana.json +++ b/examples/production_monitoring/grafana.json @@ -1,33 +1,4 @@ { - "__inputs": [ - ], - "__elements": {}, - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "10.4.2" - }, - { - "type": "panel", - "id": "heatmap", - "name": "Heatmap", - "version": "" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "timeseries", - "name": "Time series", - "version": "" - } - ], "annotations": { "list": [ { @@ -54,7 +25,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": null, + "id": 1, "links": [], "liveNow": false, "panels": [ @@ -76,6 +47,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -241,6 +213,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -358,6 +331,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -523,6 +497,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -658,6 +633,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -823,6 +799,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -984,7 +961,7 @@ "unit": "none" } }, - "pluginVersion": "10.4.2", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -1076,7 +1053,7 @@ "unit": "none" } }, - "pluginVersion": "10.4.2", + "pluginVersion": "11.2.0", "targets": [ { "datasource": { @@ -1117,6 +1094,7 @@ "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, + "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", @@ -1147,8 +1125,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1199,6 +1176,319 @@ ], "title": "Finish Reason", "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "seconds", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + 
"fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "id": 14, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "edx8memhpd9tsa" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(vllm:request_queue_time_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Queue Time", + "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 40 + }, + "id": 15, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "edx8memhpd9tsa" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(vllm:request_prefill_time_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Prefill", + "range": true, + "refId": "A", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "rate(vllm:request_decode_time_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])", + "hide": false, + "instant": false, + "legendFormat": "Decode", + "range": true, + "refId": "B" + } + ], + "title": "Requests Prefill and Decode Time", + "type": "timeseries" + }, + { + "datasource": { + "default": false, + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": 
false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 40 + }, + "id": 16, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "edx8memhpd9tsa" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "rate(vllm:request_max_num_generation_tokens_sum{model_name=\"$model_name\"}[$__rate_interval])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Tokens", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Max Generation Token in Sequence Group", + "type": "timeseries" } ], "refresh": "", @@ -1207,21 +1497,34 @@ "templating": { "list": [ { - "type": "datasource", - "name": "DS_PROMETHEUS", - "label": "datasource", - "current": {}, + "current": { + "selected": false, + "text": "prometheus", + "value": "edx8memhpd9tsa" + }, "hide": 0, "includeAll": false, + "label": "datasource", "multi": false, + "name": "DS_PROMETHEUS", "options": [], "query": "prometheus", "queryValue": "", "refresh": 1, "regex": "", - "skipUrlSync": false + "skipUrlSync": false, + "type": "datasource" }, { + "current": { + "selected": false, + "text": "/share/datasets/public_models/Meta-Llama-3-8B-Instruct", + "value": "/share/datasets/public_models/Meta-Llama-3-8B-Instruct" + }, + "datasource": { + "type": "prometheus", + "uid": "edx8memhpd9tsa" + }, "definition": "label_values(model_name)", "hide": 0, "includeAll": false, @@ -1249,7 +1552,6 @@ "timezone": "", "title": "vLLM", "uid": "b281712d-8bff-41ef-9f3f-71ad43c05e9b", - "version": 1, + "version": 8, "weekStart": "" } - diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index d550b1d244af8..69ed6e6bd59d2 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -1672,6 +1672,7 @@ def _get_stats(self, # Iteration stats num_prompt_tokens_iter = 0 num_generation_tokens_iter = 0 + num_tokens_iter = 0 time_to_first_tokens_iter: List[float] = [] time_per_output_tokens_iter: List[float] = [] num_preemption_iter = (0 if scheduler_outputs is None else @@ -1680,6 +1681,10 @@ def _get_stats(self, # Request stats # Latency time_e2e_requests: List[float] = [] + time_queue_requests: List[float] = [] + time_inference_requests: List[float] = [] + time_prefill_requests: List[float] = [] + time_decode_requests: List[float] = [] time_in_queue_requests: List[float] = [] model_forward_time_requests: List[float] = [] model_execute_time_requests: List[float] = [] @@ -1687,6 +1692,7 @@ def _get_stats(self, num_prompt_tokens_requests: List[int] = [] num_generation_tokens_requests: List[int] = [] n_requests: List[int] = [] + 
max_num_generation_tokens_requests: List[int] = [] max_tokens_requests: List[int] = [] finished_reason_requests: List[str] = [] @@ -1777,6 +1783,18 @@ def _get_stats(self, # Latency timings time_e2e_requests.append(now - seq_group.metrics.arrival_time) + if (seq_group.metrics.first_scheduled_time is not None and + seq_group.metrics.first_token_time is not None): + time_queue_requests.append( + seq_group.metrics.first_scheduled_time - + seq_group.metrics.arrival_time) + time_prefill_requests.append( + seq_group.metrics.first_token_time - + seq_group.metrics.first_scheduled_time) + time_decode_requests.append( + now - seq_group.metrics.first_token_time) + time_inference_requests.append( + now - seq_group.metrics.first_scheduled_time) if seq_group.metrics.time_in_queue is not None: time_in_queue_requests.append( seq_group.metrics.time_in_queue) @@ -1793,6 +1811,9 @@ def _get_stats(self, seq.get_output_len() for seq in seq_group.get_finished_seqs() ]) + max_num_generation_tokens_requests.append( + max(seq.get_output_len() + for seq in seq_group.get_seqs())) if seq_group.sampling_params is not None: n_requests.append(seq_group.sampling_params.n) max_tokens_requests.append( @@ -1811,7 +1832,8 @@ def _get_stats(self, num_generation_tokens_iter = ( actual_num_batched_tokens - num_prompt_tokens_iter + num_generation_tokens_from_prefill_groups) - + num_tokens_iter = (num_generation_tokens_iter + + num_prompt_tokens_iter) # Spec decode, if enabled, emits specialized metrics from the worker in # sampler output. if model_output and (model_output[0].spec_decode_worker_metrics @@ -1837,6 +1859,7 @@ def _get_stats(self, # Iteration stats num_prompt_tokens_iter=num_prompt_tokens_iter, num_generation_tokens_iter=num_generation_tokens_iter, + num_tokens_iter=num_tokens_iter, time_to_first_tokens_iter=time_to_first_tokens_iter, time_per_output_tokens_iter=time_per_output_tokens_iter, spec_decode_metrics=spec_decode_metrics, @@ -1845,12 +1868,18 @@ def _get_stats(self, # Request stats # Latency time_e2e_requests=time_e2e_requests, + time_queue_requests=time_queue_requests, + time_inference_requests=time_inference_requests, + time_prefill_requests=time_prefill_requests, + time_decode_requests=time_decode_requests, time_in_queue_requests=time_in_queue_requests, model_forward_time_requests=model_forward_time_requests, model_execute_time_requests=model_execute_time_requests, # Metadata num_prompt_tokens_requests=num_prompt_tokens_requests, num_generation_tokens_requests=num_generation_tokens_requests, + max_num_generation_tokens_requests= + max_num_generation_tokens_requests, n_requests=n_requests, max_tokens_requests=max_tokens_requests, finished_reason_requests=finished_reason_requests, diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py index 3e3357ed74633..e896bcdded2d1 100644 --- a/vllm/engine/metrics.py +++ b/vllm/engine/metrics.py @@ -111,6 +111,15 @@ def __init__(self, labelnames: List[str], max_model_len: int): name="vllm:generation_tokens_total", documentation="Number of generation tokens processed.", labelnames=labelnames) + self.counter_tokens = self._counter_cls( + name="vllm:tokens_total", + documentation="Number of prefill plus generation tokens processed.", + labelnames=labelnames) + self.histogram_iteration_tokens = self._histogram_cls( + name="vllm:iteration_tokens_total", + documentation="Histogram of number of tokens per engine_step.", + labelnames=labelnames, + buckets=[1, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8096]) self.histogram_time_to_first_token = self._histogram_cls( 
name="vllm:time_to_first_token_seconds", documentation="Histogram of time to first token in seconds.", @@ -130,23 +139,45 @@ def __init__(self, labelnames: List[str], max_model_len: int): # Request stats # Latency + request_latency_buckets = [ + 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, + 40.0, 50.0, 60.0 + ] self.histogram_e2e_time_request = self._histogram_cls( name="vllm:e2e_request_latency_seconds", documentation="Histogram of end to end request latency in seconds.", labelnames=labelnames, - buckets=[ - 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, - 40.0, 50.0, 60.0 - ]) + buckets=request_latency_buckets) + self.histogram_queue_time_request = self._histogram_cls( + name="vllm:request_queue_time_seconds", + documentation= + "Histogram of time spent in WAITING phase for request.", + labelnames=labelnames, + buckets=request_latency_buckets) + self.histogram_inference_time_request = self._histogram_cls( + name="vllm:request_inference_time_seconds", + documentation= + "Histogram of time spent in RUNNING phase for request.", + labelnames=labelnames, + buckets=request_latency_buckets) + self.histogram_prefill_time_request = self._histogram_cls( + name="vllm:request_prefill_time_seconds", + documentation= + "Histogram of time spent in PREFILL phase for request.", + labelnames=labelnames, + buckets=request_latency_buckets) + self.histogram_decode_time_request = self._histogram_cls( + name="vllm:request_decode_time_seconds", + documentation= + "Histogram of time spent in DECODE phase for request.", + labelnames=labelnames, + buckets=request_latency_buckets) self.histogram_time_in_queue_request = self._histogram_cls( name="vllm:time_in_queue_requests", documentation= "Histogram of time the request spent in the queue in seconds.", labelnames=labelnames, - buckets=[ - 0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0, - 40.0, 50.0, 60.0 - ]) + buckets=request_latency_buckets) self.histogram_model_forward_time_request = self._histogram_cls( name="vllm:model_forward_time_milliseconds", documentation= @@ -173,6 +204,12 @@ def __init__(self, labelnames: List[str], max_model_len: int): labelnames=labelnames, buckets=build_1_2_5_buckets(max_model_len), ) + self.histogram_max_num_generation_tokens_request = self._histogram_cls( + name="vllm:request_max_num_generation_tokens", + documentation= + "Histogram of maximum number of requested generation tokens.", + labelnames=labelnames, + buckets=build_1_2_5_buckets(max_model_len)) self.histogram_n_request = self._histogram_cls( name="vllm:request_params_n", documentation="Histogram of the n request parameter.", @@ -526,6 +563,8 @@ def _log_prometheus(self, stats: Stats) -> None: stats.num_prompt_tokens_iter) self._log_counter(self.metrics.counter_generation_tokens, stats.num_generation_tokens_iter) + self._log_histogram(self.metrics.histogram_iteration_tokens, + [stats.num_tokens_iter]) self._log_histogram(self.metrics.histogram_time_to_first_token, stats.time_to_first_tokens_iter) self._log_histogram(self.metrics.histogram_time_per_output_token, @@ -535,6 +574,14 @@ def _log_prometheus(self, stats: Stats) -> None: # Latency self._log_histogram(self.metrics.histogram_e2e_time_request, stats.time_e2e_requests) + self._log_histogram(self.metrics.histogram_queue_time_request, + stats.time_queue_requests) + self._log_histogram(self.metrics.histogram_inference_time_request, + stats.time_inference_requests) + self._log_histogram(self.metrics.histogram_decode_time_request, + stats.time_prefill_requests) + 
self._log_histogram(self.metrics.histogram_prefill_time_request, + stats.time_decode_requests) self._log_histogram(self.metrics.histogram_time_in_queue_request, stats.time_in_queue_requests) self._log_histogram(self.metrics.histogram_model_forward_time_request, @@ -553,6 +600,9 @@ def _log_prometheus(self, stats: Stats) -> None: self.metrics.histogram_num_generation_tokens_request, stats.num_generation_tokens_requests) self._log_histogram(self.metrics.histogram_n_request, stats.n_requests) + self._log_histogram( + self.metrics.histogram_max_num_generation_tokens_request, + stats.max_num_generation_tokens_requests) self._log_histogram(self.metrics.histogram_max_tokens_request, stats.max_tokens_requests) diff --git a/vllm/engine/metrics_types.py b/vllm/engine/metrics_types.py index 19dcbfe57d112..5f7ec3bbcb269 100644 --- a/vllm/engine/metrics_types.py +++ b/vllm/engine/metrics_types.py @@ -39,6 +39,7 @@ class Stats: # Iteration stats (should have _iter suffix) num_prompt_tokens_iter: int num_generation_tokens_iter: int + num_tokens_iter: int time_to_first_tokens_iter: List[float] time_per_output_tokens_iter: List[float] num_preemption_iter: int @@ -46,6 +47,10 @@ class Stats: # Request stats (should have _requests suffix) # Latency time_e2e_requests: List[float] + time_queue_requests: List[float] + time_inference_requests: List[float] + time_prefill_requests: List[float] + time_decode_requests: List[float] time_in_queue_requests: List[float] model_forward_time_requests: List[float] model_execute_time_requests: List[float] @@ -53,6 +58,7 @@ class Stats: num_prompt_tokens_requests: List[int] num_generation_tokens_requests: List[int] n_requests: List[int] + max_num_generation_tokens_requests: List[int] max_tokens_requests: List[int] finished_reason_requests: List[str] waiting_lora_adapters: List[str] From 36fc439de00a11d82d75d1e571cc4360fab11cdb Mon Sep 17 00:00:00 2001 From: Yangcheng Li Date: Tue, 12 Nov 2024 00:53:07 +0800 Subject: [PATCH 086/183] [Doc] fix doc string typo in block_manager `swap_out` function (#10212) --- vllm/core/block_manager.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py index 61ed7afba12ed..21f4c63b6572d 100644 --- a/vllm/core/block_manager.py +++ b/vllm/core/block_manager.py @@ -393,7 +393,7 @@ def can_swap_out(self, seq_group: SequenceGroup) -> bool: with num_lookahead_slots. Args: - seq_group (SequenceGroup): The sequence group to swap in. + seq_group (SequenceGroup): The sequence group to swap out. num_lookahead_slots (int): Number of lookahead slots used in speculative decoding, default to 0. @@ -409,7 +409,7 @@ def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]: swapping out the given sequence_group with num_lookahead_slots. Args: - sequence_group (SequenceGroup): The sequence group to swap in. + sequence_group (SequenceGroup): The sequence group to swap out. Returns: List[Tuple[int, int]]: The mapping of swapping block from @@ -459,7 +459,7 @@ def _can_swap(self, on to the 'device'. Args: - sequence_group (SequenceGroup): The sequence group to swap in. + sequence_group (SequenceGroup): The sequence group to swap in/out. device (Device): device to swap the 'seq_group' on. status (SequenceStatus): The status of sequence which is needed for action. 
RUNNING for swap out and SWAPPED for swap in From e6de9784d26fb3b0c9a55be4ab4ea3127f1900a0 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 11 Nov 2024 09:02:14 -0800 Subject: [PATCH 087/183] [core][distributed] add stateless process group (#10216) Signed-off-by: youkaichao --- tests/distributed/test_utils.py | 79 ++++--- .../device_communicators/pynccl.py | 38 ++-- vllm/distributed/utils.py | 212 ++++++++++++------ 3 files changed, 217 insertions(+), 112 deletions(-) diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py index 3c7facc12c59a..d40b09a8b868f 100644 --- a/tests/distributed/test_utils.py +++ b/tests/distributed/test_utils.py @@ -1,10 +1,10 @@ import pytest import ray import torch -import torch.distributed as dist import vllm.envs as envs -from vllm.distributed.utils import stateless_init_process_group +from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator +from vllm.distributed.utils import StatelessProcessGroup from vllm.utils import (cuda_device_count_stateless, update_environment_variables) @@ -41,42 +41,45 @@ def test_cuda_device_count_stateless(): def cpu_worker(rank, WORLD_SIZE): - pg1 = stateless_init_process_group(init_method="tcp://127.0.0.1:29500", + pg1 = StatelessProcessGroup.create(init_method="tcp://127.0.0.1:29500", rank=rank, - world_size=WORLD_SIZE, - backend="gloo") + world_size=WORLD_SIZE) if rank <= 2: - pg2 = stateless_init_process_group(init_method="tcp://127.0.0.1:29501", + pg2 = StatelessProcessGroup.create(init_method="tcp://127.0.0.1:29501", rank=rank, - world_size=3, - backend="gloo") + world_size=3) data = torch.tensor([rank]) - dist.all_reduce(data, op=dist.ReduceOp.SUM, group=pg1) + data = pg1.broadcast_obj(data, src=2) + assert data.item() == 2 if rank <= 2: - dist.all_reduce(data, op=dist.ReduceOp.SUM, group=pg2) - item = data[0].item() - print(f"rank: {rank}, item: {item}") - if rank == 3: - assert item == 6 - else: - assert item == 18 + data = torch.tensor([rank + 1]) + data = pg2.broadcast_obj(data, src=2) + assert data.item() == 3 + pg2.barrier() + pg1.barrier() def gpu_worker(rank, WORLD_SIZE): - pg1 = stateless_init_process_group(init_method="tcp://127.0.0.1:29502", + torch.cuda.set_device(rank) + pg1 = StatelessProcessGroup.create(init_method="tcp://127.0.0.1:29502", rank=rank, - world_size=WORLD_SIZE, - backend="nccl") + world_size=WORLD_SIZE) + pynccl1 = PyNcclCommunicator(pg1, device=rank) + pynccl1.disabled = False if rank <= 2: - pg2 = stateless_init_process_group(init_method="tcp://127.0.0.1:29503", + pg2 = StatelessProcessGroup.create(init_method="tcp://127.0.0.1:29503", rank=rank, - world_size=3, - backend="nccl") - torch.cuda.set_device(rank) + world_size=3) + pynccl2 = PyNcclCommunicator(pg2, device=rank) + pynccl2.disabled = False data = torch.tensor([rank]).cuda() - dist.all_reduce(data, op=dist.ReduceOp.SUM, group=pg1) + pynccl1.all_reduce(data) + pg1.barrier() + torch.cuda.synchronize() if rank <= 2: - dist.all_reduce(data, op=dist.ReduceOp.SUM, group=pg2) + pynccl2.all_reduce(data) + pg2.barrier() + torch.cuda.synchronize() item = data[0].item() print(f"rank: {rank}, item: {item}") if rank == 3: @@ -85,9 +88,31 @@ def gpu_worker(rank, WORLD_SIZE): assert item == 18 +def broadcast_worker(rank, WORLD_SIZE): + pg1 = StatelessProcessGroup.create(init_method="tcp://127.0.0.1:29504", + rank=rank, + world_size=WORLD_SIZE) + if rank == 2: + pg1.broadcast_obj("secret", src=2) + else: + obj = pg1.broadcast_obj(None, src=2) + assert obj == "secret" + pg1.barrier() + + +def 
allgather_worker(rank, WORLD_SIZE): + pg1 = StatelessProcessGroup.create(init_method="tcp://127.0.0.1:29505", + rank=rank, + world_size=WORLD_SIZE) + data = pg1.all_gather_obj(rank) + assert data == list(range(WORLD_SIZE)) + pg1.barrier() + + @multi_gpu_test(num_gpus=4) -@pytest.mark.parametrize("worker", [cpu_worker, gpu_worker]) -def test_stateless_init_process_group(worker): +@pytest.mark.parametrize( + "worker", [cpu_worker, gpu_worker, broadcast_worker, allgather_worker]) +def test_stateless_process_group(worker): WORLD_SIZE = 4 from multiprocessing import get_context ctx = get_context("fork") diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py index 7319566545678..7c6f48e88637b 100644 --- a/vllm/distributed/device_communicators/pynccl.py +++ b/vllm/distributed/device_communicators/pynccl.py @@ -9,6 +9,7 @@ from vllm.distributed.device_communicators.pynccl_wrapper import ( NCCLLibrary, buffer_type, cudaStream_t, ncclComm_t, ncclDataTypeEnum, ncclRedOpTypeEnum, ncclUniqueId) +from vllm.distributed.utils import StatelessProcessGroup from vllm.logger import init_logger logger = init_logger(__name__) @@ -18,7 +19,7 @@ class PyNcclCommunicator: def __init__( self, - group: ProcessGroup, + group: Union[ProcessGroup, StatelessProcessGroup], device: Union[int, str, torch.device], library_path: Optional[str] = None, ): @@ -33,13 +34,18 @@ def __init__( It is the caller's responsibility to make sure each communicator is bind to a unique device. """ - assert dist.is_initialized() - assert dist.get_backend(group) != dist.Backend.NCCL, ( - "PyNcclCommunicator should be attached to a non-NCCL group.") + if not isinstance(group, StatelessProcessGroup): + assert dist.is_initialized() + assert dist.get_backend(group) != dist.Backend.NCCL, ( + "PyNcclCommunicator should be attached to a non-NCCL group.") + # note: this rank is the rank in the group + self.rank = dist.get_rank(group) + self.world_size = dist.get_world_size(group) + else: + self.rank = group.rank + self.world_size = group.world_size + self.group = group - # note: this rank is the rank in the group - self.rank = dist.get_rank(group) - self.world_size = dist.get_world_size(group) # if world_size == 1, no need to create communicator if self.world_size == 1: @@ -68,13 +74,17 @@ def __init__( else: # construct an empty unique id self.unique_id = ncclUniqueId() - tensor = torch.ByteTensor(list(self.unique_id.internal)) - ranks = dist.get_process_group_ranks(group) - # arg `src` in `broadcast` is the global rank - dist.broadcast(tensor, src=ranks[0], group=group) - byte_list = tensor.tolist() - for i, byte in enumerate(byte_list): - self.unique_id.internal[i] = byte + + if not isinstance(group, StatelessProcessGroup): + tensor = torch.ByteTensor(list(self.unique_id.internal)) + ranks = dist.get_process_group_ranks(group) + # arg `src` in `broadcast` is the global rank + dist.broadcast(tensor, src=ranks[0], group=group) + byte_list = tensor.tolist() + for i, byte in enumerate(byte_list): + self.unique_id.internal[i] = byte + else: + self.unique_id = group.broadcast_obj(self.unique_id, src=0) if isinstance(device, int): device = torch.device(f"cuda:{device}") elif isinstance(device, str): diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index d24ce898707fc..a77b41322f376 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -2,13 +2,13 @@ # Adapted from # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py # 
Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -from typing import Sequence, Tuple +import dataclasses +import pickle +import time +from collections import deque +from typing import Any, Deque, Dict, Optional, Sequence, Tuple import torch -from torch.distributed import ProcessGroup -from torch.distributed.distributed_c10d import (Backend, PrefixStore, - _get_default_timeout, - is_nccl_available) from torch.distributed.rendezvous import rendezvous import vllm.envs as envs @@ -91,69 +91,139 @@ def get_pp_indices(num_hidden_layers: int, pp_rank: int, return (start_layer, end_layer) -def stateless_init_process_group(init_method: str, rank: int, world_size: int, - backend: str) -> ProcessGroup: - """A replacement for `torch.distributed.init_process_group` that does not - pollute the global state. - - If we have process A and process B called `torch.distributed.init_process_group` - to form a group, and then we want to form another group with process A, B, C, - D, it is not possible in PyTorch, because process A and process B have already - formed a group, and process C and process D cannot join that group. This - function is a workaround for this issue. - - `torch.distributed.init_process_group` is a global call, while this function - is a stateless call. It will return a `ProcessGroup` object that can be used - for collective communication. With this function, process A and process B - can call `stateless_init_process_group` to form a group, and then process A, B, - C, and D can call `stateless_init_process_group` to form another group. - """ # noqa - - backend = Backend(backend) # it is basically string - timeout = _get_default_timeout(backend) - - store, rank, world_size = next( - rendezvous(init_method, rank, world_size, timeout=timeout)) - store.set_timeout(timeout) - - group_rank = rank - group_size = world_size - - # Use a PrefixStore to avoid accidental overrides of keys used by - # different systems (e.g. RPC) in case the store is multi-tenant. - prefix_store = PrefixStore(init_method, store) - - pg_options = ProcessGroup.Options(backend=backend, timeout=timeout) - - pg: ProcessGroup = ProcessGroup( - prefix_store, - group_rank, - group_size, - pg_options, - ) - - if backend == "gloo": - from torch.distributed.distributed_c10d import ProcessGroupGloo - backend_class = ProcessGroupGloo(prefix_store, - group_rank, - group_size, - timeout=timeout) - backend_type = ProcessGroup.BackendType.GLOO - device = torch.device("cpu") - elif backend == "nccl": - assert is_nccl_available() - from torch.distributed.distributed_c10d import ProcessGroupNCCL - - backend_options = ProcessGroupNCCL.Options() - backend_options._timeout = timeout - - backend_class = ProcessGroupNCCL(prefix_store, group_rank, group_size, - backend_options) - backend_type = ProcessGroup.BackendType.NCCL - device = torch.device("cuda") - - backend_class._set_sequence_number_for_group() - - pg._register_backend(device, backend_type, backend_class) - - return pg +@dataclasses.dataclass +class StatelessProcessGroup: + """A dataclass to hold a metadata store, and the rank, world_size of the + group. Only use it to communicate metadata between processes. + For data-plane communication, create NCCL-related objects. 
+ """ + prefix: str + rank: int + world_size: int + store: torch._C._distributed_c10d.Store + data_expiration_seconds: int = 3600 # 1 hour + + # dst rank -> counter + send_dst_counter: Dict[int, int] = dataclasses.field(default_factory=dict) + # src rank -> counter + recv_src_counter: Dict[int, int] = dataclasses.field(default_factory=dict) + broadcast_send_counter: int = 0 + broadcast_recv_src_counter: Dict[int, int] = dataclasses.field( + default_factory=dict) + + # A deque to store the data entries, with key and timestamp. + entries: Deque[Tuple[str, + float]] = dataclasses.field(default_factory=deque) + + def __post_init__(self): + assert self.rank < self.world_size + self.send_dst_counter = {i: 0 for i in range(self.world_size)} + self.recv_src_counter = {i: 0 for i in range(self.world_size)} + self.broadcast_recv_src_counter = { + i: 0 + for i in range(self.world_size) + } + + def send_obj(self, obj: Any, dst: int): + """Send an object to a destination rank.""" + self.expire_data() + key = f"{self.prefix}/send_to/{dst}/{self.send_dst_counter[dst]}" + self.store.set(key, pickle.dumps(obj)) + self.send_dst_counter[dst] += 1 + self.entries.append((key, time.time())) + + def expire_data(self): + """Expire data that is older than `data_expiration_seconds` seconds.""" + while self.entries: + # check the oldest entry + key, timestamp = self.entries[0] + if time.time() - timestamp > self.data_expiration_seconds: + self.store.delete_key(key) + self.entries.popleft() + else: + break + + def recv_obj(self, src: int) -> Any: + """Receive an object from a source rank.""" + obj = pickle.loads( + self.store.get( + f"{self.prefix}/send_to/{self.rank}/{self.recv_src_counter[src]}" + )) + self.recv_src_counter[src] += 1 + return obj + + def broadcast_obj(self, obj: Optional[Any], src: int) -> Any: + """Broadcast an object from a source rank to all other ranks. + It does not clean up after all ranks have received the object. + Use it for limited times, e.g., for initialization. + """ + if self.rank == src: + self.expire_data() + key = (f"{self.prefix}/broadcast_from/{src}/" + f"{self.broadcast_send_counter}") + self.store.set(key, pickle.dumps(obj)) + self.broadcast_send_counter += 1 + self.entries.append((key, time.time())) + return obj + else: + key = (f"{self.prefix}/broadcast_from/{src}/" + f"{self.broadcast_recv_src_counter[src]}") + recv_obj = pickle.loads(self.store.get(key)) + self.broadcast_recv_src_counter[src] += 1 + return recv_obj + + def all_gather_obj(self, obj: Any) -> list[Any]: + """All gather an object from all ranks.""" + gathered_objs = [] + for i in range(self.world_size): + if i == self.rank: + gathered_objs.append(obj) + self.broadcast_obj(obj, src=self.rank) + else: + recv_obj = self.broadcast_obj(None, src=i) + gathered_objs.append(recv_obj) + return gathered_objs + + def barrier(self): + """A barrier to synchronize all ranks.""" + for i in range(self.world_size): + if i == self.rank: + self.broadcast_obj(None, src=self.rank) + else: + self.broadcast_obj(None, src=i) + + @staticmethod + def create( + init_method: str, + rank: int, + world_size: int, + data_expiration_seconds: int = 3600, + ) -> "StatelessProcessGroup": + """A replacement for `torch.distributed.init_process_group` that does not + pollute the global state. 
+ + If we have process A and process B called `torch.distributed.init_process_group` + to form a group, and then we want to form another group with process A, B, C, + D, it is not possible in PyTorch, because process A and process B have already + formed a group, and process C and process D cannot join that group. This + function is a workaround for this issue. + + `torch.distributed.init_process_group` is a global call, while this function + is a stateless call. It will return a `StatelessProcessGroup` object that can be + used for exchanging metadata. With this function, process A and process B + can call `StatelessProcessGroup.create` to form a group, and then process A, B, + C, and D can call `StatelessProcessGroup.create` to form another group. + """ # noqa + from torch._C._distributed_c10d import _DEFAULT_PG_TIMEOUT + timeout = _DEFAULT_PG_TIMEOUT + + store, rank, world_size = next( + rendezvous(init_method, rank, world_size, timeout=timeout)) + store.set_timeout(timeout) + + return StatelessProcessGroup( + prefix=init_method, + rank=rank, + world_size=world_size, + store=store, + data_expiration_seconds=data_expiration_seconds) From 25144ceed0cfb5883b594137c83c3ec70c9d1c2f Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 11 Nov 2024 17:24:10 +0000 Subject: [PATCH 088/183] Bump actions/setup-python from 5.2.0 to 5.3.0 (#10209) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/cleanup_pr_body.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cleanup_pr_body.yml b/.github/workflows/cleanup_pr_body.yml index 37d93a1277974..0085a1cc22373 100644 --- a/.github/workflows/cleanup_pr_body.yml +++ b/.github/workflows/cleanup_pr_body.yml @@ -16,7 +16,7 @@ jobs: uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: Set up Python - uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 with: python-version: '3.12' From f9dadfbee331aeff9cb45c94e635ab8e16335a10 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 11 Nov 2024 10:42:07 -0800 Subject: [PATCH 089/183] [V1] Fix detokenizer ports (#10224) Signed-off-by: Woosuk Kwon --- vllm/v1/tokenizer/detokenizer.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm/v1/tokenizer/detokenizer.py b/vllm/v1/tokenizer/detokenizer.py index bf1be5d54140a..8d80ebbc5cc45 100644 --- a/vllm/v1/tokenizer/detokenizer.py +++ b/vllm/v1/tokenizer/detokenizer.py @@ -48,11 +48,13 @@ def __init__(self, tokenizer_name: str, tokenizer_mode: str, # For example, it does not terminate properly. We need to improve this. self.push_port = get_open_port() self.pull_port = get_open_port() + # NOTE: The push port of the engine process should be the same as the + # pull port of the detokenizer process. Vice versa. 
self.detokenizer = DetokenizerProc(tokenizer_name=tokenizer_name, tokenizer_mode=tokenizer_mode, trust_remote_code=trust_remote_code, - push_port=self.push_port, - pull_port=self.pull_port) + push_port=self.pull_port, + pull_port=self.push_port) self.detokenizer.start() self.zmq_context = zmq.Context() @@ -95,8 +97,8 @@ def __init__( self.tokenizer_name = tokenizer_name self.tokenizer_mode = tokenizer_mode self.trust_remote_code = trust_remote_code - # NOTE: The pull_port of the detokenizer should be the same as the - # push_port of the engine. Vice versa. + # NOTE: The pull_port of the detokenizer process should be the same as + # the push_port of the engine process. Vice versa. self.pull_port = pull_port self.push_port = push_port From d7a4f2207bd0ff31cacf311a05266557d66e474e Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 11 Nov 2024 11:05:57 -0800 Subject: [PATCH 090/183] [V1] Do not use inductor for piecewise CUDA graphs (#10225) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu_model_runner.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 2469048536e49..1e20920d14432 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -404,15 +404,14 @@ def execute_model( def load_model(self) -> None: if self.use_cuda_graph: - # FIXME(woosuk): Currently, the custom ops are not supported - # in the piecewise compilation mode. We rely on TorchInductor - # to optimize the model. + # FIXME(woosuk): Currently, we do not use inductor to reduce the + # compilation time and any potential issues with the inductor. os.environ["VLLM_CUSTOM_OPS"] = "none" set_compilation_config( CompilationConfig( use_cudagraph=True, non_cudagraph_ops=["vllm.unified_v1_flash_attention"], - use_inductor=True, + use_inductor=False, )) logger.info("Starting to load model %s...", self.model_config.model) From 330e82d34a36ccee3f2f80fded3e7cc0d67718d6 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 11 Nov 2024 11:10:27 -0800 Subject: [PATCH 091/183] [v1][torch.compile] support managing cudagraph buffer (#10203) Signed-off-by: youkaichao Co-authored-by: Woosuk Kwon --- .../piecewise_compilation_config.json | 3 +- tests/compile/piecewise/test_simple.py | 12 ++--- vllm/compilation/backends.py | 46 ++++++++++++++++++- vllm/compilation/config.py | 6 +++ 4 files changed, 59 insertions(+), 8 deletions(-) diff --git a/tests/compile/piecewise/piecewise_compilation_config.json b/tests/compile/piecewise/piecewise_compilation_config.json index 03d077b76f627..798a34e8dd92d 100644 --- a/tests/compile/piecewise/piecewise_compilation_config.json +++ b/tests/compile/piecewise/piecewise_compilation_config.json @@ -1,4 +1,5 @@ { "use_cudagraph": true, - "non_cudagraph_ops": ["silly.attention"] + "non_cudagraph_ops": ["silly.attention"], + "cudagraph_copy_inputs": true } \ No newline at end of file diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index d151d62516b07..fcfe80d8e4041 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -80,7 +80,7 @@ def test_simple_piecewise_compile(): config = os.path.join(directory, "piecewise_compilation_config.json") os.environ["VLLM_TORCH_COMPILE_CONFIG"] = config - input_buffer = torch.randn(100).cuda() + inputs = torch.randn(100).cuda() with compilation_counter.expect( num_graphs_seen=1, # one graph for the model @@ -92,15 +92,15 @@ def test_simple_piecewise_compile(): ): 
with set_compile_context([1, 2]): - model(input_buffer) + model(inputs) - model(input_buffer[:2]) - model(input_buffer[:1]) + model(torch.randn(2).cuda()) + model(torch.randn(1).cuda()) - input_buffer[:2].zero_() + input = torch.zeros(2).cuda() global global_counter global_counter = 0 - output = model(input_buffer[:2]) + output = model(input) assert global_counter == 2 assert torch.allclose(output.cpu(), torch.tensor([3., 1.])) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index c3c670422defa..5682faa158069 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -389,6 +389,8 @@ class VllmBackend: returned_callable: Callable # Inductor passes to run on the graph pre-defunctionalization post_grad_passes: Sequence[Callable] + sym_tensor_indices: List[int] + input_buffers: List[torch.Tensor] def __init__(self, post_grad_passes: Sequence[Callable] = ()): global global_graph_pool @@ -401,6 +403,9 @@ def __init__(self, post_grad_passes: Sequence[Callable] = ()): self.graph_pool = global_graph_pool self.post_grad_passes = post_grad_passes + self.sym_tensor_indices = [] + self.input_buffers = [] + # `torch.compile` is JIT compiled, so we don't need to # do anything here @@ -461,7 +466,46 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: self._called = True - return self.split_gm + if not self.compilation_configs.use_cudagraph or \ + not self.compilation_configs.cudagraph_copy_inputs: + return self.split_gm + + # if we need to copy input buffers for cudagraph + from torch._guards import detect_fake_mode + fake_mode = detect_fake_mode() + fake_args = [ + fake_mode.from_tensor(t) if isinstance(t, torch.Tensor) else t + for t in example_inputs + ] + + # index of tensors that have symbolic shapes (batch size) + self.sym_tensor_indices = [ + i for i, x in enumerate(fake_args) + if isinstance(x, torch._subclasses.fake_tensor.FakeTensor) + ] + + # compiler managed cudagraph input buffers + # we assume the first run with symbolic shapes + # has the maximum size among all the tensors + self.input_buffers = [ + example_inputs[x].clone() for x in self.sym_tensor_indices + ] + + def copy_and_call(*args): + list_args = list(args) + for i, index in enumerate(self.sym_tensor_indices): + runtime_tensor = list_args[index] + runtime_shape = runtime_tensor.shape[0] + static_tensor = self.input_buffers[i][:runtime_shape] + + # copy the tensor to the static buffer + static_tensor.copy_(runtime_tensor) + + # replace the tensor in the list_args to the static buffer + list_args[index] = static_tensor + return self.split_gm(*list_args) + + return copy_and_call @dataclasses.dataclass diff --git a/vllm/compilation/config.py b/vllm/compilation/config.py index 72377533140b5..3e663505c627d 100644 --- a/vllm/compilation/config.py +++ b/vllm/compilation/config.py @@ -32,6 +32,11 @@ class CompilationConfig(BaseModel): It means the first several runs will be treated as warmup runs. Only after that, the execution will be recorded, and the recorded cudagraph will be used for subsequent runs. + - cudagraph_copy_inputs: whether to copy input tensors for + cudagraph. If the caller can guarantee that the same input buffers + are always used, it can set this to False. Otherwise, it should + set this to True, and the compiler will copy the input to an + internally managed buffer. Default is False. - Inductor compilation: - use_inductor: whether to use inductor compilation. - False: inductor compilation is not used. graph runs in eager. 
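# An illustrative, self-contained sketch of the pattern `cudagraph_copy_inputs`
# enables, independent of the diff above: a captured CUDA graph always replays
# against the same memory addresses, so when the caller cannot guarantee stable
# input buffers, fresh inputs are first copied into a compiler-managed static
# buffer (as `copy_and_call` does). Tensor names and sizes here are made up.
import torch

static_input = torch.zeros(8, device="cuda")   # address captured by the graph
static_output = torch.empty(8, device="cuda")

g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    static_output.copy_(static_input * 2)

def run(new_input: torch.Tensor) -> torch.Tensor:
    n = new_input.shape[0]
    # Copy the caller's tensor into (a prefix slice of) the captured buffer,
    # then replay the recorded kernels against that fixed address.
    static_input[:n].copy_(new_input)
    g.replay()
    return static_output[:n].clone()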
@@ -78,6 +83,7 @@ class CompilationConfig(BaseModel): non_cudagraph_ops: List[str] = Field(default_factory=list) cudagraph_num_of_warmups: int = 0 cudagraph_capture_sizes: Optional[List[int]] = None + cudagraph_copy_inputs: bool = False dump_graph_stages: List[str] = Field(default_factory=list) dump_graph_dir: Path = Field(default=Path(".")) From fe15729a2b77d760fcf99da76f15806c5eab33df Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 11 Nov 2024 11:26:48 -0800 Subject: [PATCH 092/183] [V1] Use custom ops for piecewise CUDA graphs (#10227) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu_model_runner.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 1e20920d14432..74a7b4caa6b16 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1,4 +1,3 @@ -import os import time from dataclasses import dataclass from typing import TYPE_CHECKING, Dict, List, Optional, Set @@ -406,7 +405,6 @@ def load_model(self) -> None: if self.use_cuda_graph: # FIXME(woosuk): Currently, we do not use inductor to reduce the # compilation time and any potential issues with the inductor. - os.environ["VLLM_CUSTOM_OPS"] = "none" set_compilation_config( CompilationConfig( use_cudagraph=True, From 4800339c6287465a128288231ac9dcd94ddf27ba Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Mon, 11 Nov 2024 14:28:55 -0500 Subject: [PATCH 093/183] Add docs on serving with Llama Stack (#10183) Signed-off-by: Yuan Tang Co-authored-by: Russell Bryant --- docs/source/serving/integrations.rst | 1 + .../serving/serving_with_llamastack.rst | 42 +++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 docs/source/serving/serving_with_llamastack.rst diff --git a/docs/source/serving/integrations.rst b/docs/source/serving/integrations.rst index 7882e14f3b849..f39997e0e44d9 100644 --- a/docs/source/serving/integrations.rst +++ b/docs/source/serving/integrations.rst @@ -13,3 +13,4 @@ Integrations deploying_with_dstack serving_with_langchain serving_with_llamaindex + serving_with_llamastack diff --git a/docs/source/serving/serving_with_llamastack.rst b/docs/source/serving/serving_with_llamastack.rst new file mode 100644 index 0000000000000..8ef96c4e54369 --- /dev/null +++ b/docs/source/serving/serving_with_llamastack.rst @@ -0,0 +1,42 @@ +.. _run_on_llamastack: + +Serving with Llama Stack +============================ + +vLLM is also available via `Llama Stack `_ . + +To install Llama Stack, run + +.. code-block:: console + + $ pip install llama-stack -q + +Inference using OpenAI Compatible API +------------------------------------- + +Then start Llama Stack server pointing to your vLLM server with the following configuration: + +.. code-block:: yaml + + inference: + - provider_id: vllm0 + provider_type: remote::vllm + config: + url: http://127.0.0.1:8000 + +Please refer to `this guide `_ for more details on this remote vLLM provider. + +Inference via Embedded vLLM +--------------------------- + +An `inline vLLM provider +`_ +is also available. This is a sample of configuration using that method: + +.. 
code-block:: yaml + + inference + - provider_type: vllm + config: + model: Llama3.1-8B-Instruct + tensor_parallel_size: 4 From 8a7fe47d322920bdff1b1c3472fe7f423a73a23b Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 11 Nov 2024 11:54:59 -0800 Subject: [PATCH 094/183] [misc][distributed] auto port selection and disable tests (#10226) Signed-off-by: youkaichao --- tests/distributed/test_utils.py | 39 +++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py index d40b09a8b868f..5d77d8abb4718 100644 --- a/tests/distributed/test_utils.py +++ b/tests/distributed/test_utils.py @@ -1,3 +1,5 @@ +import socket + import pytest import ray import torch @@ -5,7 +7,7 @@ import vllm.envs as envs from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator from vllm.distributed.utils import StatelessProcessGroup -from vllm.utils import (cuda_device_count_stateless, +from vllm.utils import (cuda_device_count_stateless, get_open_port, update_environment_variables) from ..utils import multi_gpu_test @@ -40,14 +42,13 @@ def test_cuda_device_count_stateless(): assert ray.get(actor.get_count.remote()) == 0 -def cpu_worker(rank, WORLD_SIZE): - pg1 = StatelessProcessGroup.create(init_method="tcp://127.0.0.1:29500", +def cpu_worker(rank, WORLD_SIZE, port1, port2): + pg1 = StatelessProcessGroup.create(init_method=f"tcp://127.0.0.1:{port1}", rank=rank, world_size=WORLD_SIZE) if rank <= 2: - pg2 = StatelessProcessGroup.create(init_method="tcp://127.0.0.1:29501", - rank=rank, - world_size=3) + pg2 = StatelessProcessGroup.create( + init_method=f"tcp://127.0.0.1:{port2}", rank=rank, world_size=3) data = torch.tensor([rank]) data = pg1.broadcast_obj(data, src=2) assert data.item() == 2 @@ -59,17 +60,16 @@ def cpu_worker(rank, WORLD_SIZE): pg1.barrier() -def gpu_worker(rank, WORLD_SIZE): +def gpu_worker(rank, WORLD_SIZE, port1, port2): torch.cuda.set_device(rank) - pg1 = StatelessProcessGroup.create(init_method="tcp://127.0.0.1:29502", + pg1 = StatelessProcessGroup.create(init_method=f"tcp://127.0.0.1:{port1}", rank=rank, world_size=WORLD_SIZE) pynccl1 = PyNcclCommunicator(pg1, device=rank) pynccl1.disabled = False if rank <= 2: - pg2 = StatelessProcessGroup.create(init_method="tcp://127.0.0.1:29503", - rank=rank, - world_size=3) + pg2 = StatelessProcessGroup.create( + init_method=f"tcp://127.0.0.1:{port2}", rank=rank, world_size=3) pynccl2 = PyNcclCommunicator(pg2, device=rank) pynccl2.disabled = False data = torch.tensor([rank]).cuda() @@ -88,8 +88,8 @@ def gpu_worker(rank, WORLD_SIZE): assert item == 18 -def broadcast_worker(rank, WORLD_SIZE): - pg1 = StatelessProcessGroup.create(init_method="tcp://127.0.0.1:29504", +def broadcast_worker(rank, WORLD_SIZE, port1, port2): + pg1 = StatelessProcessGroup.create(init_method=f"tcp://127.0.0.1:{port1}", rank=rank, world_size=WORLD_SIZE) if rank == 2: @@ -100,8 +100,8 @@ def broadcast_worker(rank, WORLD_SIZE): pg1.barrier() -def allgather_worker(rank, WORLD_SIZE): - pg1 = StatelessProcessGroup.create(init_method="tcp://127.0.0.1:29505", +def allgather_worker(rank, WORLD_SIZE, port1, port2): + pg1 = StatelessProcessGroup.create(init_method=f"tcp://127.0.0.1:{port1}", rank=rank, world_size=WORLD_SIZE) data = pg1.all_gather_obj(rank) @@ -109,17 +109,24 @@ def allgather_worker(rank, WORLD_SIZE): pg1.barrier() +# TODO: investigate why this test is flaky. It hangs during initialization. 
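# An illustrative, standalone sketch of the pattern these workers now follow:
# pick a free port up front with get_open_port() and pass it to every worker,
# so no process hard-codes a port that may already be in use. The worker and
# variable names below are made up; the APIs are the ones exercised above.
from multiprocessing import get_context

from vllm.distributed.utils import StatelessProcessGroup
from vllm.utils import get_open_port

def _worker(rank: int, world_size: int, port: int):
    pg = StatelessProcessGroup.create(init_method=f"tcp://127.0.0.1:{port}",
                                      rank=rank,
                                      world_size=world_size)
    assert pg.all_gather_obj(rank) == list(range(world_size))
    pg.barrier()

if __name__ == "__main__":
    world_size, port = 2, get_open_port()
    ctx = get_context("fork")
    procs = [
        ctx.Process(target=_worker, args=(rank, world_size, port))
        for rank in range(world_size)
    ]
    for p in procs:
        p.start()
    for p in procs:
        p.join()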
+@pytest.mark.skip("Skip the test because it is flaky.") @multi_gpu_test(num_gpus=4) @pytest.mark.parametrize( "worker", [cpu_worker, gpu_worker, broadcast_worker, allgather_worker]) def test_stateless_process_group(worker): + port1 = get_open_port() + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("", port1)) + port2 = get_open_port() WORLD_SIZE = 4 from multiprocessing import get_context ctx = get_context("fork") processes = [] for i in range(WORLD_SIZE): rank = i - processes.append(ctx.Process(target=worker, args=(rank, WORLD_SIZE))) + processes.append( + ctx.Process(target=worker, args=(rank, WORLD_SIZE, port1, port2))) for p in processes: p.start() for p in processes: From 9d5b4e4deaa3318df49419d325490730391efd75 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 11 Nov 2024 11:58:07 -0800 Subject: [PATCH 095/183] [V1] Enable custom ops with piecewise CUDA graphs (#10228) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu_model_runner.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 74a7b4caa6b16..2c40853742ac9 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1,3 +1,4 @@ +import os import time from dataclasses import dataclass from typing import TYPE_CHECKING, Dict, List, Optional, Set @@ -405,6 +406,7 @@ def load_model(self) -> None: if self.use_cuda_graph: # FIXME(woosuk): Currently, we do not use inductor to reduce the # compilation time and any potential issues with the inductor. + os.environ["VLLM_CUSTOM_OPS"] = "all" set_compilation_config( CompilationConfig( use_cudagraph=True, From 08f93e743972abd3060723f63352ef42cdf161a8 Mon Sep 17 00:00:00 2001 From: Nikolai Shcheglov Date: Mon, 11 Nov 2024 16:29:19 -0600 Subject: [PATCH 096/183] Make shutil rename in python_only_dev (#10233) Signed-off-by: shcheglovnd --- python_only_dev.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python_only_dev.py b/python_only_dev.py index 4ab203bb6f9d6..1ca0f5c30b741 100644 --- a/python_only_dev.py +++ b/python_only_dev.py @@ -69,7 +69,8 @@ current_vllm_path = os.path.join(cwd, "vllm") print(f"Renaming {pre_built_vllm_path} to {tmp_path} for backup") - os.rename(pre_built_vllm_path, tmp_path) + shutil.copytree(pre_built_vllm_path, tmp_path) + shutil.rmtree(pre_built_vllm_path) print(f"Linking {current_vllm_path} to {pre_built_vllm_path}") os.symlink(current_vllm_path, pre_built_vllm_path) From 6ace6fba2ca42b79a948a9b47af00487b5f73868 Mon Sep 17 00:00:00 2001 From: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com> Date: Mon, 11 Nov 2024 18:05:38 -0500 Subject: [PATCH 097/183] [V1] `AsyncLLM` Implementation (#9826) Signed-off-by: Nick Hill Signed-off-by: rshaw@neuralmagic.com Signed-off-by: Nick Hill Co-authored-by: Nick Hill Co-authored-by: Varun Sundar Rabindranath Co-authored-by: Nick Hill Co-authored-by: Tyler Michael Smith --- .buildkite/test-pipeline.yaml | 8 + tests/entrypoints/llm/test_accuracy.py | 56 ++ tests/entrypoints/openai/test_accuracy.py | 25 +- {vllm/v1/tokenizer => tests/v1}/__init__.py | 0 tests/v1/engine/__init__.py | 0 tests/v1/engine/test_async_llm.py | 66 +++ tests/v1/engine/test_detokenizer.py | 205 +++++++ tests/v1/engine/test_engine_core.py | 137 +++++ tests/v1/engine/test_engine_core_client.py | 202 +++++++ vllm/config.py | 41 ++ vllm/engine/multiprocessing/engine.py | 13 +- vllm/engine/output_processor/stop_checker.py | 43 +- vllm/entrypoints/llm.py | 3 + 
vllm/entrypoints/openai/api_server.py | 11 +- vllm/envs.py | 5 + vllm/outputs.py | 30 + vllm/v1/__init__.py | 0 vllm/v1/core/kv_cache_manager.py | 13 +- vllm/v1/core/scheduler.py | 26 +- vllm/v1/engine/__init__.py | 72 +++ vllm/v1/engine/async_llm.py | 368 +++++++++++++ vllm/v1/engine/async_stream.py | 55 ++ vllm/v1/engine/core.py | 352 ++++++++++++ vllm/v1/engine/core_client.py | 218 ++++++++ vllm/v1/engine/detokenizer.py | 265 +++++++++ vllm/v1/engine/llm_engine.py | 552 ++++--------------- vllm/v1/engine/processor.py | 128 +++++ vllm/v1/request.py | 17 +- vllm/v1/tokenizer/detokenizer.py | 228 -------- 29 files changed, 2412 insertions(+), 727 deletions(-) create mode 100644 tests/entrypoints/llm/test_accuracy.py rename {vllm/v1/tokenizer => tests/v1}/__init__.py (100%) create mode 100644 tests/v1/engine/__init__.py create mode 100644 tests/v1/engine/test_async_llm.py create mode 100644 tests/v1/engine/test_detokenizer.py create mode 100644 tests/v1/engine/test_engine_core.py create mode 100644 tests/v1/engine/test_engine_core_client.py create mode 100644 vllm/v1/__init__.py create mode 100644 vllm/v1/engine/async_llm.py create mode 100644 vllm/v1/engine/async_stream.py create mode 100644 vllm/v1/engine/core.py create mode 100644 vllm/v1/engine/core_client.py create mode 100644 vllm/v1/engine/detokenizer.py create mode 100644 vllm/v1/engine/processor.py delete mode 100644 vllm/v1/tokenizer/detokenizer.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index e8456357e6db1..fbaa427bb7270 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -165,6 +165,14 @@ steps: # OOM in the CI unless we run this separately - pytest -v -s tokenization +- label: V1 Test + #mirror_hardwares: [amd] + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + - pytest -v -s v1 + - label: Examples Test # 15min working_dir: "/vllm-workspace/examples" #mirror_hardwares: [amd] diff --git a/tests/entrypoints/llm/test_accuracy.py b/tests/entrypoints/llm/test_accuracy.py new file mode 100644 index 0000000000000..6bf7190a656b8 --- /dev/null +++ b/tests/entrypoints/llm/test_accuracy.py @@ -0,0 +1,56 @@ +""" +This file test accuracy of the vLLM server via LMEval. +It uses local-completions, which interacts with vLLM +through the OAI API with N concurrent connections. +This simulates real work usage of the API and makes +sure that the zmq frontend mp RPC message passing and +AsyncLLMEngine are working correctly. 
+""" + +import lm_eval +import pytest + +from vllm.platforms import current_platform + +MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct" +NUM_CONCURRENT = 500 +TASK = "gsm8k" +FILTER = "exact_match,strict-match" +RTOL = 0.03 +EXPECTED_VALUE = 0.58 + + +def run_test(): + """Run the end to end accuracy test.""" + + model_args = f"pretrained={MODEL_NAME},max_model_len=2048" + + results = lm_eval.simple_evaluate( + model="vllm", + model_args=model_args, + tasks="gsm8k", + batch_size="auto", + ) + + measured_value = results["results"][TASK][FILTER] + assert (measured_value - RTOL < EXPECTED_VALUE + and measured_value + RTOL > EXPECTED_VALUE + ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}" + + +@pytest.mark.skipif(not current_platform.is_cuda(), + reason="V1 is currently only supported on CUDA.") +def test_lm_eval_accuracy_v1_engine(monkeypatch): + """Run with the V1 Engine.""" + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + run_test() + + +def test_lm_eval_accuracy_v0_engine(monkeypatch): + """Run with the V0 Engine.""" + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "0") + run_test() diff --git a/tests/entrypoints/openai/test_accuracy.py b/tests/entrypoints/openai/test_accuracy.py index a16e95f94171e..b1d4461d164aa 100644 --- a/tests/entrypoints/openai/test_accuracy.py +++ b/tests/entrypoints/openai/test_accuracy.py @@ -37,11 +37,11 @@ MAX_WAIT_SECONDS = 600 -@pytest.mark.parametrize("more_args", MORE_ARGS_LIST) -def test_lm_eval_accuracy(more_args): +def run_test(more_args): + """Run the end to end accuracy test.""" + args = list(DEFAULT_ARGS) args.extend(more_args) - print(f"Running with: {args}") with RemoteOpenAIServer( @@ -64,3 +64,22 @@ def test_lm_eval_accuracy(more_args): assert (measured_value - RTOL < EXPECTED_VALUE and measured_value + RTOL > EXPECTED_VALUE ), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}" + + +@pytest.mark.skipif(not current_platform.is_cuda(), + reason="V1 currently only supported on CUDA") +def test_lm_eval_accuracy_v1_engine(monkeypatch): + """Run with the V1 Engine.""" + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + run_test([]) + + +@pytest.mark.parametrize("more_args", MORE_ARGS_LIST) +def test_lm_eval_accuracy_v0_engine(monkeypatch, more_args): + """Run with the V0 Engine.""" + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "0") + run_test(more_args) diff --git a/vllm/v1/tokenizer/__init__.py b/tests/v1/__init__.py similarity index 100% rename from vllm/v1/tokenizer/__init__.py rename to tests/v1/__init__.py diff --git a/tests/v1/engine/__init__.py b/tests/v1/engine/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/v1/engine/test_async_llm.py b/tests/v1/engine/test_async_llm.py new file mode 100644 index 0000000000000..1f26fe0fc892f --- /dev/null +++ b/tests/v1/engine/test_async_llm.py @@ -0,0 +1,66 @@ +import asyncio +from typing import Tuple + +import pytest + +from vllm import SamplingParams +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.platforms import current_platform +from vllm.v1.engine.async_llm import AsyncLLM + +if not current_platform.is_cuda(): + pytest.skip(reason="V1 currently only supported on CUDA.", + allow_module_level=True) + +ENGINE_ARGS = AsyncEngineArgs(model="meta-llama/Llama-3.2-1B", + disable_log_requests=True) + + +async def generate(engine: AsyncLLM, request_id: str, + max_tokens: int) -> Tuple[int, str]: + count = 0 + async for _ in engine.generate(request_id=request_id, + prompt="Hello my 
name is Robert and", + sampling_params=SamplingParams( + max_tokens=max_tokens, temperature=0)): + + count += 1 + await asyncio.sleep(0.) + + return count, request_id + + +@pytest.mark.asyncio +async def test_load(monkeypatch): + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + engine = AsyncLLM.from_engine_args(ENGINE_ARGS) + + NUM_REQUESTS = 10000 + NUM_EXPECTED_TOKENS = 10 + + request_ids = [f"request-{i}" for i in range(NUM_REQUESTS)] + + # Create concurrent requests. + tasks = [] + for request_id in request_ids: + tasks.append( + asyncio.create_task( + generate(engine, request_id, NUM_EXPECTED_TOKENS))) + + # Confirm that we got all the EXPECTED tokens from the requests. + failed_request_id = None + tokens = None + for task in tasks: + num_generated_tokens, request_id = await task + if (num_generated_tokens != NUM_EXPECTED_TOKENS + and failed_request_id is None): + failed_request_id = request_id + tokens = num_generated_tokens + + assert failed_request_id is None, ( + f"{failed_request_id} generated {tokens} but " + f"expected {NUM_EXPECTED_TOKENS}") + + engine.shutdown() diff --git a/tests/v1/engine/test_detokenizer.py b/tests/v1/engine/test_detokenizer.py new file mode 100644 index 0000000000000..07f343666cb5e --- /dev/null +++ b/tests/v1/engine/test_detokenizer.py @@ -0,0 +1,205 @@ +from typing import List + +import pytest +from transformers import AutoTokenizer + +from vllm.sampling_params import RequestOutputKind +from vllm.v1.engine import EngineCoreOutput +from vllm.v1.engine.detokenizer import Detokenizer, DetokenizerRequest + +TOKENIZER_NAME = "mistralai/Mistral-7B-Instruct-v0.3" +tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME) + +FULL_STRINGS = [ + "My name is Robert from Neural Magic and I love working on vLLM so much!", + "Red Hat is the best open source company by far across Linux, K8s, and AI.", + "Nick is the name of my brother in addition to my colleague from Red Hat.", +] + +STOP_STRINGS = ["I love working on", "company by far", "brother in"] + +FULL_TOKENS = [tokenizer(text).input_ids for text in FULL_STRINGS] +PROMPT_LEN = 5 +PROMPT_TOKENS = [ + tokenizer(text).input_ids[:PROMPT_LEN] for text in FULL_STRINGS +] +GENERATION_TOKENS = [ + tokenizer(text).input_ids[PROMPT_LEN:] for text in FULL_STRINGS +] +PROMPT_STRINGS = [ + tokenizer.decode(prompt_tokens, skip_special_tokens=True) + for prompt_tokens in PROMPT_TOKENS +] +PROMPT_STRINGS_LEN = [len(prompt_string) for prompt_string in PROMPT_STRINGS] +GENERATION_STRINGS = [ + text[prompt_len:] + for text, prompt_len in zip(FULL_STRINGS, PROMPT_STRINGS_LEN) +] + + +class MockEngineCore: + """Mock outputs form premade tokens lists.""" + + def __init__(self, tokens_list: List[List[int]]): + self.tokens_list = tokens_list + self.current_idx = 0 + + def get_outputs(self) -> List[EngineCoreOutput]: + token_idx = self.current_idx + self.current_idx += 1 + + outputs = [] + for req_idx, token_ids in enumerate(self.tokens_list): + if len(token_ids) > token_idx: + output = EngineCoreOutput(request_id=f"request-{req_idx}", + new_token_ids=[token_ids[token_idx]], + finished=False) + if token_idx == len(token_ids) - 1: + output.finished = True + output.finish_reason = "stopped" + outputs.append(output) + + return outputs + + +@pytest.mark.parametrize( + "request_output_kind", + [RequestOutputKind.DELTA, RequestOutputKind.FINAL_ONLY]) +def test_incremental_detokenization(request_output_kind: RequestOutputKind): + detokenizer = Detokenizer(TOKENIZER_NAME) + engine_core = MockEngineCore(GENERATION_TOKENS) + 
+ # Make N requests. + requests = [ + DetokenizerRequest( + request_id=f"request-{idx}", + prompt=prompt, + prompt_token_ids=prompt_tokens, + skip_special_tokens=False, + spaces_between_special_tokens=False, + output_kind=request_output_kind, + stop=[], + include_stop_str_in_output=False, + ) for idx, ( + prompt, + prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + ] + + # Add requests to the detokenizer. + for request in requests: + detokenizer.add_request(request) + + gen_strings = {} + gen_tokens = {} + while True: + # Mock output from the EngineCore. + outputs = engine_core.get_outputs() + if len(outputs) == 0: + break + + # Step the Detokenizer. + request_outputs, requests_to_abort = detokenizer.step(outputs) + assert len(requests_to_abort) == 0 + + # Update tracking. + for request_output in request_outputs: + request_id = request_output.request_id + new_text = request_output.outputs[0].text + new_tokens = request_output.outputs[0].token_ids + if request_id not in gen_strings: + gen_strings[request_id] = new_text + gen_tokens[request_id] = new_tokens + else: + gen_strings[request_id] += new_text + gen_tokens[request_id].extend(new_tokens) + + # Confirmed tracked values matches what we expected. + for idx, (ref_gen_str, ref_gen_toks) in enumerate( + zip(GENERATION_STRINGS, GENERATION_TOKENS)): + gen_str = gen_strings[f"request-{idx}"] + gen_toks = gen_tokens[f"request-{idx}"] + + assert gen_str == ref_gen_str, f"{gen_str=}, {ref_gen_str=}" + assert gen_toks == ref_gen_toks, f"{gen_toks=}, {ref_gen_toks=}" + + assert detokenizer.get_num_unfinished_requests() == 0 + assert not detokenizer.has_unfinished_requests() + + +@pytest.mark.parametrize("include_stop_str_in_output", [True, False]) +def test_stop_string(include_stop_str_in_output: bool): + detokenizer = Detokenizer(TOKENIZER_NAME) + engine_core = MockEngineCore(GENERATION_TOKENS) + + # Make N requests. + requests = [ + DetokenizerRequest( + request_id=f"request-{idx}", + prompt=prompt, + prompt_token_ids=prompt_tokens, + skip_special_tokens=False, + spaces_between_special_tokens=False, + output_kind=RequestOutputKind.DELTA, + stop=STOP_STRINGS, + include_stop_str_in_output=include_stop_str_in_output, + ) for idx, ( + prompt, + prompt_tokens) in enumerate(zip(PROMPT_STRINGS, PROMPT_TOKENS)) + ] + + # Add requests to the detokenizer. + for request in requests: + detokenizer.add_request(request) + + gen_strings = {} + aborted = [] + while True: + # Mock output from the EngineCore. + outputs = engine_core.get_outputs() + if len(outputs) == 0: + break + + # Step the Detokenizer. + request_outputs, requests_to_abort = detokenizer.step(outputs) + for request_output in request_outputs: + # If aborted, we should not get a request output. + assert request_output.request_id not in aborted + aborted.extend(requests_to_abort) + + # Update tracking. + for request_output in request_outputs: + if request_output.finished: + assert request_output.outputs[0].finish_reason == "stop" + + request_id = request_output.request_id + new_text = request_output.outputs[0].text + if request_id not in gen_strings: + gen_strings[request_id] = new_text + else: + gen_strings[request_id] += new_text + + # Confirmed tracked values matches what we expected. + for idx, (ref_gen_str, + stop_str) in enumerate(zip(GENERATION_STRINGS, STOP_STRINGS)): + + # Request should be aborted. + request_id = f"request-{idx}" + assert request_id in aborted + + # Collected values that were generated. + gen_str = gen_strings[request_id] + + # Construct reference strings. 
+ stop_str_idx = ref_gen_str.find(stop_str) + ref_str_exc_stop = ref_gen_str[:stop_str_idx] + ref_str_inc_stop = ref_gen_str[:stop_str_idx] + stop_str + + if include_stop_str_in_output: + assert gen_str == ref_str_inc_stop, ( + f"{gen_str=}, {ref_str_inc_stop=}") + else: + assert gen_str == ref_str_exc_stop, ( + f"{gen_str=}, {ref_str_exc_stop=}") + + assert detokenizer.get_num_unfinished_requests() == 0 + assert not detokenizer.has_unfinished_requests() diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py new file mode 100644 index 0000000000000..8451aac33acc4 --- /dev/null +++ b/tests/v1/engine/test_engine_core.py @@ -0,0 +1,137 @@ +import time +import uuid + +import pytest +from transformers import AutoTokenizer + +from vllm import SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.platforms import current_platform +from vllm.usage.usage_lib import UsageContext +from vllm.v1.engine import EngineCoreRequest +from vllm.v1.engine.async_llm import AsyncLLM +from vllm.v1.engine.core import EngineCore + +if not current_platform.is_cuda(): + pytest.skip(reason="V1 currently only supported on CUDA.", + allow_module_level=True) + +MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" +TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME) +PROMPT = "Hello my name is Robert and I love quantization kernels" +PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids + + +def make_request() -> EngineCoreRequest: + return EngineCoreRequest( + request_id=uuid.uuid4(), + prompt=PROMPT, + prompt_token_ids=PROMPT_TOKENS, + sampling_params=SamplingParams(), + eos_token_id=None, + arrival_time=time.time(), + lora_request=None, + ) + + +def test_engine_core(monkeypatch): + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + """Setup the EngineCore.""" + engine_args = EngineArgs(model=MODEL_NAME) + vllm_config = engine_args.create_engine_config() + executor_class = AsyncLLM._get_executor_cls(vllm_config) + + engine_core = EngineCore(vllm_config=vllm_config, + executor_class=executor_class, + usage_context=UsageContext.UNKNOWN_CONTEXT) + """Test basic request lifecycle.""" + + # First request. + engine_core.add_request(make_request()) + assert len(engine_core.scheduler.waiting) == 1 + assert len(engine_core.scheduler.running) == 0 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 1 + + # Second request. + engine_core.add_request(make_request()) + assert len(engine_core.scheduler.waiting) == 1 + assert len(engine_core.scheduler.running) == 1 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 2 + + # Add two requests in a row. + engine_core.add_request(make_request()) + engine_core.add_request(make_request()) + assert len(engine_core.scheduler.waiting) == 2 + assert len(engine_core.scheduler.running) == 2 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 4 + + # Loop through until they are all done. + while len(engine_core.step()) > 0: + pass + + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 0 + """Test abort cycle.""" + + # Basic abort. 
+ req = make_request() + request_id = req.request_id + + engine_core.add_request(req) + assert len(engine_core.scheduler.waiting) == 1 + assert len(engine_core.scheduler.running) == 0 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 1 + + engine_core.abort_requests([request_id]) + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 0 + + # Add, step, abort 1 of the 3. + req0 = make_request() + req1 = make_request() + req2 = make_request() + + engine_core.add_request(req0) + engine_core.add_request(req1) + assert len(engine_core.scheduler.waiting) == 2 + assert len(engine_core.scheduler.running) == 0 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 2 + + engine_core.add_request(req2) + assert len(engine_core.scheduler.waiting) == 1 + assert len(engine_core.scheduler.running) == 2 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 3 + + # Abort just one. + engine_core.abort_requests([req1.request_id]) + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 2 + + _ = engine_core.step() + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 2 + + # Abort the other requests at the same time. + engine_core.abort_requests([req2.request_id, req0.request_id]) + assert len(engine_core.scheduler.waiting) == 0 + assert len(engine_core.scheduler.running) == 0 diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py new file mode 100644 index 0000000000000..d582101a1164f --- /dev/null +++ b/tests/v1/engine/test_engine_core_client.py @@ -0,0 +1,202 @@ +import asyncio +import time +import uuid +from typing import Dict, List + +import pytest +from transformers import AutoTokenizer + +from vllm import SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.platforms import current_platform +from vllm.usage.usage_lib import UsageContext +from vllm.v1.engine import EngineCoreRequest +from vllm.v1.engine.async_llm import AsyncLLM +from vllm.v1.engine.core_client import EngineCoreClient + +if not current_platform.is_cuda(): + pytest.skip(reason="V1 currently only supported on CUDA.", + allow_module_level=True) + +MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" +TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME) +PROMPT = "Hello my name is Robert and I love quantization kernels" +PROMPT_TOKENS = TOKENIZER(PROMPT).input_ids + + +def make_request(params: SamplingParams) -> EngineCoreRequest: + return EngineCoreRequest( + request_id=str(uuid.uuid4()), + prompt=PROMPT, + prompt_token_ids=PROMPT_TOKENS, + sampling_params=params, + eos_token_id=None, + arrival_time=time.time(), + lora_request=None, + ) + + +def loop_until_done(client: EngineCoreClient, outputs: Dict): + + while True: + engine_core_outputs = client.get_output() + + if len(engine_core_outputs) == 0: + break + + all_finished = True + for out in engine_core_outputs: + outputs[out.request_id].append(out) + if not out.finished: + all_finished = False + + if all_finished: + break + + +async def loop_until_done_async(client: EngineCoreClient, outputs: Dict): + + while True: + engine_core_outputs = await client.get_output_async() + + if len(engine_core_outputs) == 0: + break + + all_finished = True + for out in engine_core_outputs: + 
outputs[out.request_id].append(out) + if not out.finished: + all_finished = False + + if all_finished: + break + + +@pytest.mark.parametrize("multiprocessing_mode", [True, False]) +def test_engine_core_client(monkeypatch, multiprocessing_mode: bool): + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + engine_args = EngineArgs(model=MODEL_NAME) + vllm_config = engine_args.create_engine_config() + executor_class = AsyncLLM._get_executor_cls(vllm_config) + client = EngineCoreClient.make_client( + vllm_config, + executor_class, + UsageContext.UNKNOWN_CONTEXT, + multiprocess_mode=multiprocessing_mode, + asyncio_mode=False, + ) + + MAX_TOKENS = 20 + params = SamplingParams(max_tokens=MAX_TOKENS) + """Normal Request Cycle.""" + requests = [make_request(params) for _ in range(10)] + request_ids = [req.request_id for req in requests] + + # Add requests to the engine. + for request in requests: + client.add_request(request) + time.sleep(0.01) + + outputs: Dict[str, List] = {req_id: [] for req_id in request_ids} + loop_until_done(client, outputs) + + for req_id in request_ids: + assert len(outputs[req_id]) == MAX_TOKENS, ( + f"{outputs[req_id]=}, {MAX_TOKENS=}") + """Abort Request Cycle.""" + + # Note: this code pathway will only work for multiprocessing + # since we have to call get_output() explicitly + + # Add requests to the engine. + for idx, request in enumerate(requests): + client.add_request(request) + time.sleep(0.01) + if idx % 2 == 0: + client.abort_requests([request.request_id]) + + outputs = {req_id: [] for req_id in request_ids} + loop_until_done(client, outputs) + + for idx, req_id in enumerate(request_ids): + if idx % 2 == 0: + assert len(outputs[req_id]) < MAX_TOKENS, ( + f"{len(outputs[req_id])=}, {MAX_TOKENS=}") + else: + assert len(outputs[req_id]) == MAX_TOKENS, ( + f"{len(outputs[req_id])=}, {MAX_TOKENS=}") + """Abort after request is finished.""" + + # Note: this code pathway will only work for multiprocessing + # since we have to call get_output() explicitly + + request = requests[0] + client.add_request(request) + time.sleep(10.) + + client.abort_requests([request.request_id]) + + # Shutdown the client. + client.shutdown() + + +@pytest.mark.asyncio +async def test_engine_core_client_asyncio(monkeypatch): + + with monkeypatch.context() as m: + m.setenv("VLLM_USE_V1", "1") + + engine_args = EngineArgs(model=MODEL_NAME) + vllm_config = engine_args.create_engine_config() + executor_class = AsyncLLM._get_executor_cls(vllm_config) + client = EngineCoreClient.make_client( + vllm_config, + executor_class, + UsageContext.UNKNOWN_CONTEXT, + multiprocess_mode=True, + asyncio_mode=True, + ) + + MAX_TOKENS = 20 + params = SamplingParams(max_tokens=MAX_TOKENS) + """Normal Request Cycle.""" + + requests = [make_request(params) for _ in range(10)] + request_ids = [req.request_id for req in requests] + + # Add requests to the engine. + for request in requests: + await client.add_request_async(request) + await asyncio.sleep(0.01) + + outputs: Dict[str, List] = {req_id: [] for req_id in request_ids} + await loop_until_done_async(client, outputs) + + for req_id in request_ids: + assert len(outputs[req_id]) == MAX_TOKENS, ( + f"{outputs[req_id]=}, {MAX_TOKENS=}") + """Abort Request Cycle.""" + + # Add requests to the engine. 
+ for idx, request in enumerate(requests): + await client.add_request_async(request) + await asyncio.sleep(0.01) + if idx % 2 == 0: + await client.abort_requests_async([request.request_id]) + + outputs = {req_id: [] for req_id in request_ids} + await loop_until_done_async(client, outputs) + + for idx, req_id in enumerate(request_ids): + if idx % 2 == 0: + assert len(outputs[req_id]) < MAX_TOKENS, ( + f"{len(outputs[req_id])=}, {MAX_TOKENS=}") + else: + assert len(outputs[req_id]) == MAX_TOKENS, ( + f"{len(outputs[req_id])=}, {MAX_TOKENS=}") + + # Shutdown the client. + client.shutdown() diff --git a/vllm/config.py b/vllm/config.py index f9b230e1bc688..dc9c06d7fb16e 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2106,3 +2106,44 @@ def __post_init__(self): self.model_config is not None and self.load_config is not None: self.quant_config = VllmConfig._get_quantization_config( self.model_config, self.load_config) + + def __str__(self): + return ("model=%r, speculative_config=%r, tokenizer=%r, " + "skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, " + "override_neuron_config=%s, tokenizer_revision=%s, " + "trust_remote_code=%s, dtype=%s, max_seq_len=%d, " + "download_dir=%r, load_format=%s, tensor_parallel_size=%d, " + "pipeline_parallel_size=%d, " + "disable_custom_all_reduce=%s, quantization=%s, " + "enforce_eager=%s, kv_cache_dtype=%s, " + "quantization_param_path=%s, device_config=%s, " + "decoding_config=%r, observability_config=%r, " + "seed=%d, served_model_name=%s, " + "num_scheduler_steps=%d, enable_prefix_caching=%s, " + "use_async_output_proc=%s, mm_processor_kwargs=%s") % \ + (self.model_config.model, self.speculative_config, + self.model_config.tokenizer, + self.model_config.skip_tokenizer_init, + self.model_config.tokenizer_mode, + self.model_config.revision, + self.model_config.override_neuron_config, + self.model_config.tokenizer_revision, + self.model_config.trust_remote_code, + self.model_config.dtype, + self.model_config.max_model_len, + self.load_config.download_dir, + self.load_config.load_format, + self.parallel_config.tensor_parallel_size, + self.parallel_config.pipeline_parallel_size, + self.parallel_config.disable_custom_all_reduce, + self.model_config.quantization, + self.model_config.enforce_eager, + self.cache_config.cache_dtype, + self.model_config.quantization_param_path, + self.device_config.device, self.decoding_config, + self.observability_config, self.model_config.seed, + self.model_config.served_model_name, + self.scheduler_config.num_scheduler_steps, + self.cache_config.enable_prefix_caching, + self.model_config.use_async_output_proc, + self.model_config.mm_processor_kwargs) \ No newline at end of file diff --git a/vllm/engine/multiprocessing/engine.py b/vllm/engine/multiprocessing/engine.py index 889845ee67312..7de23643a2e1c 100644 --- a/vllm/engine/multiprocessing/engine.py +++ b/vllm/engine/multiprocessing/engine.py @@ -6,7 +6,6 @@ import cloudpickle import zmq -import vllm.envs from vllm import AsyncEngineArgs, SamplingParams from vllm.engine.llm_engine import LLMEngine # yapf conflicts with isort for this block @@ -113,17 +112,9 @@ def from_engine_args(cls, engine_args: AsyncEngineArgs, load_general_plugins() engine_config = engine_args.create_engine_config() - if vllm.envs.VLLM_USE_V1: - # Lazy import: the v1 package isn't distributed - from vllm.v1.engine.llm_engine import LLMEngine as V1LLMEngine - engine_class = V1LLMEngine - else: - engine_class = LLMEngine - - executor_class = engine_class._get_executor_cls(engine_config) + 
executor_class = LLMEngine._get_executor_cls(engine_config) - use_async_sockets = (engine_config.model_config.use_async_output_proc - and not vllm.envs.VLLM_USE_V1) + use_async_sockets = engine_config.model_config.use_async_output_proc return cls(ipc_path=ipc_path, use_async_sockets=use_async_sockets, diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py index a71ad493d9920..4b701f81504bb 100644 --- a/vllm/engine/output_processor/stop_checker.py +++ b/vllm/engine/output_processor/stop_checker.py @@ -1,4 +1,4 @@ -from typing import Callable, Optional +from typing import Callable, List, Optional, Tuple from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingParams @@ -67,9 +67,13 @@ def maybe_stop_sequence( return # Check if any stop strings are matched. - stop_str = self._check_stop_strings(seq, new_char_count, - sampling_params) - if stop_str is not None: + stop = self.check_stop_strings( + seq.output_text, new_char_count, sampling_params.stop, + sampling_params.include_stop_str_in_output) + if stop is not None: + stop_str, truncate_to = stop + if truncate_to != -1: + seq.output_text = seq.output_text[:truncate_to] seq.status = SequenceStatus.FINISHED_STOPPED seq.stop_reason = stop_str return @@ -85,33 +89,40 @@ def maybe_stop_sequence( return @staticmethod - def _check_stop_strings(seq: Sequence, new_char_count: int, - sampling_params: SamplingParams) -> Optional[str]: + def check_stop_strings( + output_text: str, + new_char_count: int, + stop: List[str], + include_in_output: bool, + ) -> Optional[Tuple[str, int]]: """Check if any stop strings are matched and truncate sequence output text accordingly. - Returns the stop string if matched or else None. + Returns tuple (stop_string, offset) if matched or else None. + + Where stop_string is the matched stop string and offset is the + length to which output_text should be truncated, or -1 for no + truncation. """ - if not new_char_count or not sampling_params.stop: + if not new_char_count or not stop: return None - for stop_str in sampling_params.stop: + for stop_str in stop: stop_string_len = len(stop_str) # Avoid searching already-searched text. - stop_index = seq.output_text.find( - stop_str, -new_char_count - stop_string_len) + stop_index = output_text.find(stop_str, + -new_char_count - stop_string_len) if stop_index == -1: continue - if sampling_params.include_stop_str_in_output: + if include_in_output: # Truncate to end of stop string. stop_index += stop_string_len - if stop_index >= len(seq.output_text): + if stop_index >= len(output_text): # No truncation required. - return stop_str + return stop_str, -1 # Truncate the output text to either the beginning # or end of the stop string. 
- seq.output_text = seq.output_text[:stop_index] - return stop_str + return stop_str, stop_index return None diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index f830839776364..a15dbd1c45119 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -210,8 +210,11 @@ def __init__( # Logic to switch between engines is done at runtime instead of import # to avoid import order issues self.engine_class = self.get_engine_class() + + # TODO(rob): enable mp by default (issue with fork vs spawn) self.llm_engine = self.engine_class.from_engine_args( engine_args, usage_context=UsageContext.LLM_CLASS) + self.request_counter = Counter() @staticmethod diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index b8b7912742d45..3e4070a25cf90 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -26,7 +26,6 @@ import vllm.envs as envs from vllm.config import ModelConfig from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.multiprocessing.client import MQLLMEngineClient from vllm.engine.multiprocessing.engine import run_mp_engine from vllm.engine.protocol import EngineClient @@ -61,6 +60,11 @@ from vllm.utils import FlexibleArgumentParser, get_open_zmq_ipc_path from vllm.version import __version__ as VLLM_VERSION +if envs.VLLM_USE_V1: + from vllm.v1.engine.async_llm import AsyncLLMEngine # type: ignore +else: + from vllm.engine.async_llm_engine import AsyncLLMEngine # type: ignore + TIMEOUT_KEEP_ALIVE = 5 # seconds prometheus_multiproc_dir: tempfile.TemporaryDirectory @@ -126,7 +130,8 @@ async def build_async_engine_client_from_engine_args( # Fall back # TODO: fill out feature matrix. if (MQLLMEngineClient.is_unsupported_config(engine_args) - or disable_frontend_multiprocessing): + or envs.VLLM_USE_V1 or disable_frontend_multiprocessing): + engine_config = engine_args.create_engine_config() uses_ray = getattr(AsyncLLMEngine._get_executor_cls(engine_config), "uses_ray", False) @@ -143,6 +148,8 @@ async def build_async_engine_client_from_engine_args( None, build_engine) yield engine_client + if hasattr(engine_client, "shutdown"): + engine_client.shutdown() return # Otherwise, use the multiprocessing AsyncLLMEngine. diff --git a/vllm/envs.py b/vllm/envs.py index 154246c69f165..f320e35971f94 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -72,6 +72,7 @@ VLLM_CUSTOM_OPS: List[str] = [] VLLM_DISABLED_KERNELS: List[str] = [] VLLM_USE_V1: bool = False + VLLM_ENABLE_V1_MULTIPROCESSING: bool = False def get_default_cache_root(): @@ -473,6 +474,10 @@ def get_default_config_root(): # If set, use the V1 code path. "VLLM_USE_V1": lambda: bool(int(os.getenv("VLLM_USE_V1", "0"))), + + # If set, enable multiprocessing in LLM for the V1 code path. + "VLLM_ENABLE_V1_MULTIPROCESSING": + lambda: bool(int(os.getenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0"))), } # end-env-vars-definition diff --git a/vllm/outputs.py b/vllm/outputs.py index 951976310e7ae..abfdb7d328126 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -113,6 +113,36 @@ def __init__( self.encoder_prompt = encoder_prompt self.encoder_prompt_token_ids = encoder_prompt_token_ids + @classmethod + def new( + cls, + request_id: str, + prompt: Optional[str], + prompt_token_ids: Optional[List[int]], + text: str, + token_ids: List[int], + finished: bool = False, + ) -> "RequestOutput": + """Initialize a new RequestOutput object.""" + + # TODO: Support `n` > 1. 
+ completion_output = CompletionOutput( + index=0, + text=text, + token_ids=token_ids, + cumulative_logprob=None, + logprobs=None, # TODO + ) + + return RequestOutput( + request_id=request_id, + prompt=prompt, + prompt_token_ids=prompt_token_ids, + prompt_logprobs=None, # TODO + outputs=[completion_output], + finished=finished, + ) + @classmethod def from_seq_group( cls, seq_group: SequenceGroup, use_cache: bool, diff --git a/vllm/v1/__init__.py b/vllm/v1/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 82094fb65dd1a..38f1c03a4d3ac 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -70,7 +70,7 @@ def get_computed_blocks(self, request: Request) -> List[KVCacheBlock]: Args: request: The request to get the computed blocks. - + Returns: A list of blocks that are computed for the request. """ @@ -105,7 +105,7 @@ def append_slots( Args: request: The request to append slots. num_tokens: The number of tokens to append. - + Returns: A list of new blocks if new blocks are allocated, or None if new blocks are required but cannot be allocated. @@ -176,7 +176,7 @@ def allocate_slots( num_tokens: The number of tokens to allocate. Note that this does not include the tokens that have already been computed. computed_blocks: The blocks that have already been computed. - + Returns: A list of new allocated blocks. """ @@ -240,7 +240,8 @@ def free(self, request: Request) -> None: Args: request: The request to free the blocks. """ - blocks = self.req_to_blocks.pop(request.request_id) + # Default to [] in case a request is freed (aborted) before alloc. + blocks = self.req_to_blocks.pop(request.request_id, []) if self.enable_caching: # Free blocks in reverse order so that the tail blocks are # freed first. @@ -259,13 +260,13 @@ def _get_new_blocks( """Get new blocks from the free block pool, and add token IDs to allocated blocks if caching is enabled. Note that we do not check block cache in this function. - + Args: num_blocks: The number of blocks to allocate. token_ids: The token IDs in the blocks. None if caching is disabled. parent_block: The parent block. Used to include block chain in the block hash. - + Returns: A list of new block. """ diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index a60f8b8138ecf..ee860e792281d 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -1,12 +1,13 @@ from collections import deque from dataclasses import dataclass -from typing import Deque, Dict, Iterable, List, Optional, Set, Tuple, Union +from typing import Deque, Dict, Iterable, List, Optional, Set, Union from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig from vllm.logger import init_logger from vllm.multimodal import MultiModalDataDict from vllm.sampling_params import SamplingParams from vllm.v1.core.kv_cache_manager import KVCacheManager +from vllm.v1.engine import EngineCoreOutput from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus @@ -237,13 +238,12 @@ def update_from_output( self, scheduler_output: "SchedulerOutput", model_runner_output: "ModelRunnerOutput", - ) -> List[Tuple[Request, int]]: + ) -> List[EngineCoreOutput]: # NOTE(woosuk): This method doesn't consider speculative decoding. 
sampled_token_ids = model_runner_output.sampled_token_ids_cpu.tolist() num_scheduled_tokens = scheduler_output.num_scheduled_tokens new_running: List[Request] = [] - # (request, num_sampled_tokens) - sampled: List[Tuple[Request, int]] = [] + engine_core_outputs: List[EngineCoreOutput] = [] for request in self.running: req_id = request.request_id request.num_computed_tokens += num_scheduled_tokens[req_id] @@ -257,17 +257,29 @@ def update_from_output( # generates at most one token at each step. token_id = sampled_token_ids[req_index] request.append_output_token_ids(token_id) - sampled.append((request, 1)) + num_new_tokens = 1 # TODO: Update the KV cache manager for prefix caching. - # Check if the request is finished. + # Check for stop and update request state. + # This must be called before me make the EngineCoreOutput. stopped = self._check_stop(request) + + # Add EngineCoreOutput for this Request. + output = EngineCoreOutput( + request_id=req_id, + new_token_ids=request.output_token_ids[-num_new_tokens:], + finished=request.is_finished(), + finish_reason=request.get_finished_reason(), + stop_reason=request.stop_reason) + engine_core_outputs.append(output) + + # Breakout of the loop. if stopped: continue new_running.append(request) self.running = new_running - return sampled + return engine_core_outputs def _check_stop(self, request: Request) -> bool: if (request.num_tokens >= self.max_model_len diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index e69de29bb2d1d..8bc16651faf97 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -0,0 +1,72 @@ +import enum +from dataclasses import dataclass +from typing import List, Optional, Union + +import msgspec + +from vllm.lora.request import LoRARequest +from vllm.sampling_params import RequestOutputKind, SamplingParams + + +@dataclass +class DetokenizerRequest: + + request_id: str + prompt: Optional[str] + prompt_token_ids: List[int] + skip_special_tokens: bool + spaces_between_special_tokens: bool + output_kind: RequestOutputKind + + stop: List[str] + include_stop_str_in_output: bool + + +class EngineCoreRequest(msgspec.Struct, omit_defaults=True): + + # NOTE: prompt and prompt_token_ids should be DecoderOnlyInput, + # but this object is currently not playing well with msgspec + # due to circular imports and typing we have in data.py + + request_id: str + #NOTE(Nick): I don't think we need to pass prompt here since it should + # always be tokenized? + prompt: Optional[str] + prompt_token_ids: List[int] + sampling_params: SamplingParams + eos_token_id: Optional[int] + arrival_time: float + lora_request: Optional[LoRARequest] + + +class EngineCoreOutput(msgspec.Struct, + array_like=True, + omit_defaults=True, + gc=False): + + request_id: str + new_token_ids: List[int] + finished: bool + finish_reason: Optional[str] = None + stop_reason: Union[int, str, None] = None + + +class EngineCoreOutputs(msgspec.Struct, + array_like=True, + omit_defaults=True, + gc=False): + + #NOTE(Nick): We could consider ways to make this more compact, + # e.g. columnwise layout and using an int enum for finish/stop reason + + # [num_reqs] + outputs: List[EngineCoreOutput] + + +class EngineCoreRequestType(enum.Enum): + """ + Request types defined as hex byte strings, so it can be sent over sockets + without separate encoding step. 
+ """ + ADD = b'\x00' + ABORT = b'\x01' diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py new file mode 100644 index 0000000000000..2d7c58cfea13b --- /dev/null +++ b/vllm/v1/engine/async_llm.py @@ -0,0 +1,368 @@ +import asyncio +from typing import AsyncGenerator, Dict, List, Mapping, Optional, Type, Union + +from vllm.config import ModelConfig, VllmConfig +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.engine.metrics_types import StatLoggerBase +from vllm.engine.protocol import EngineClient +from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.outputs import EmbeddingRequestOutput, RequestOutput +from vllm.pooling_params import PoolingParams +from vllm.prompt_adapter.request import PromptAdapterRequest +from vllm.sampling_params import SamplingParams +from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs +from vllm.usage.usage_lib import UsageContext +from vllm.v1.engine.async_stream import AsyncStream +from vllm.v1.engine.core_client import EngineCoreClient +from vllm.v1.engine.detokenizer import Detokenizer +from vllm.v1.engine.processor import Processor +from vllm.v1.executor.gpu_executor import GPUExecutor + +logger = init_logger(__name__) + + +class AsyncLLM(EngineClient): + + def __init__( + self, + vllm_config: VllmConfig, + executor_class: Type[GPUExecutor], + log_stats: bool, + usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, + stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, + input_registry: InputRegistry = INPUT_REGISTRY, + use_cached_outputs: bool = False, + log_requests: bool = True, + start_engine_loop: bool = True, + ) -> None: + assert start_engine_loop + + self.log_requests = log_requests + self.log_stats = log_stats + self.stat_loggers = stat_loggers + self.model_config = vllm_config.model_config + + # Tokenizer (+ ensure liveness if running in another process). + self.tokenizer = init_tokenizer_from_configs( + model_config=vllm_config.model_config, + scheduler_config=vllm_config.scheduler_config, + parallel_config=vllm_config.parallel_config, + enable_lora=bool(vllm_config.lora_config)) + self.tokenizer.ping() + + # Request streams (map of request_id -> AsyncStream). + self.request_streams: Dict[str, AsyncStream] = {} + # List of cancelled request ids to be aborted. + self.client_aborted_requests: List[str] = [] + + # Processor (converts Inputs --> EngineCoreRequests). + self.processor = Processor(vllm_config.model_config, + vllm_config.lora_config, self.tokenizer, + input_registry) + + # Detokenizer (converts EngineCoreOutputs --> RequestOutput). + self.detokenizer = Detokenizer(vllm_config.model_config.tokenizer) + + # EngineCore (starts the engine in background process). + self.engine_core = EngineCoreClient.make_client( + vllm_config=vllm_config, + executor_class=executor_class, + usage_context=usage_context, + multiprocess_mode=True, + asyncio_mode=True, + ) + + self.output_handler = None + + def __del__(self): + self.shutdown() + + @classmethod + def from_engine_args( + cls, + engine_args: AsyncEngineArgs, + engine_config: Optional[VllmConfig] = None, + start_engine_loop: bool = True, + usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, + stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, + ) -> "AsyncLLMEngine": + """Create an AsyncLLM from the EngineArgs.""" + + # Create the engine configs. 
+ if engine_config is None: + vllm_config = engine_args.create_engine_config() + else: + vllm_config = engine_config + + executor_class = cls._get_executor_cls(vllm_config) + + # Create the AsyncLLM. + return cls( + vllm_config=vllm_config, + executor_class=executor_class, + log_requests=not engine_args.disable_log_requests, + log_stats=not engine_args.disable_log_stats, + start_engine_loop=start_engine_loop, + usage_context=usage_context, + stat_loggers=stat_loggers, + ) + + def shutdown(self): + """Shutdown, cleaning up the background proc and IPC.""" + + self.engine_core.shutdown() + + if handler := getattr(self, "output_handler", None): + handler.cancel() + + @classmethod + def _get_executor_cls(cls, vllm_config: VllmConfig): + return GPUExecutor + + async def add_request( + self, + request_id: str, + prompt: PromptType, + params: Union[SamplingParams, PoolingParams], + arrival_time: Optional[float] = None, + lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Mapping[str, str]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + priority: int = 0, + ) -> AsyncGenerator[Union[RequestOutput, EmbeddingRequestOutput], None]: + """Add new request to the AsyncLLM.""" + + if self.detokenizer.is_request_active(request_id): + raise KeyError(f"Request {request_id} already exists.") + + # 1) Create a new AsyncStream for the request. + stream = self._add_request_to_streams(request_id) + + # 2) Convert input --> DetokenizerRequest / EngineCoreRequest. + detokenizer_req, engine_core_req = self.processor.process_inputs( + request_id, prompt, params, arrival_time, lora_request, + trace_headers, prompt_adapter_request, priority) + + # 3) Add the request to Detokenizer (this process). + self.detokenizer.add_request(detokenizer_req) + + # 4) Add the EngineCoreRequest to EngineCore (separate process). + await self.engine_core.add_request_async(engine_core_req) + + # 5) Return the generator. + return stream.generator() + + # TODO: we should support multiple prompts in one call, as you + # can do with LLM.generate. So that for multi-prompt completion + # requests we don't need to send multiple messages to core proc, + # and so we don't need multiple streams which then get + # re-multiplexed in the API server anyhow. + async def generate( + self, + prompt: PromptType, + sampling_params: SamplingParams, + request_id: str, + lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Mapping[str, str]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + priority: int = 0, + ) -> AsyncGenerator[RequestOutput, None]: + """ + Main function called by the API server to kick off a request + * 1) Making an AsyncStream corresponding to the Request. + # 2) Processing the Input. + * 3) Adding the Request to the Detokenizer. + * 4) Adding the Request to the EngineCore (separate process). + + A separate output_handler loop runs in a background AsyncIO task, + pulling outputs from EngineCore and putting them into the + per-request AsyncStream. + + The caller of generate() iterates the returned AsyncGenerator, + returning the RequestOutput back to the caller. + """ + + # We start the output_handler on the first call to generate() so that + # we can call __init__ before the event loop starts, which enables us + # to handle startup failure gracefully in the OpenAI server. 
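From the caller's side all of this machinery is hidden behind an async generator: construct the engine, call generate(), and iterate until the final RequestOutput arrives with finished=True. A minimal standalone sketch of that usage (the model name below is only an example, not something this patch prescribes):

import asyncio

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.sampling_params import SamplingParams
from vllm.v1.engine.async_llm import AsyncLLM


async def main() -> None:
    # Any small model available locally will do; this name is illustrative.
    engine = AsyncLLM.from_engine_args(
        AsyncEngineArgs(model="facebook/opt-125m"))
    params = SamplingParams(max_tokens=32)

    # generate() returns an async generator; the loop ends once the
    # final RequestOutput (finished=True) has been yielded.
    async for output in engine.generate("Hello, my name is",
                                        params,
                                        request_id="req-0"):
        print(output.outputs[0].text)

    engine.shutdown()


if __name__ == "__main__":
    # EngineCore runs in a spawned background process, so this script
    # needs to be import-safe.
    asyncio.run(main())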
+ if self.output_handler is None: + self.output_handler = asyncio.create_task( + self._run_output_handler()) + + async for output in await self.add_request( + request_id, + prompt, + sampling_params, + lora_request=lora_request, + trace_headers=trace_headers, + prompt_adapter_request=prompt_adapter_request, + priority=priority, + ): + yield output + + def _finish_stream(self, request_id: str): + stream = self.request_streams.pop(request_id, None) + if stream is not None: + stream.finish() + + def _add_request_to_streams( + self, + request_id: str, + ) -> AsyncStream: + + if request_id in self.request_streams: + raise ValueError(f"Request id {request_id} already running.") + + # Avoid streams having circular ref to parent AsyncLLM object. + aborted_reqs = self.client_aborted_requests + stream = AsyncStream(request_id, aborted_reqs.append) + self.request_streams[request_id] = stream + + if self.log_requests: + logger.info("Added request %s.", request_id) + + return stream + + async def _process_cancellations(self) -> None: + """ + Process requests cancelled from user disconnecting. + + When a client disconnects, AsyncStream._cancel() is called. + We passed a callback to AsyncStream(), which appends to + self.client_aborted_requests. + + As a result, if any requests are canceled from the user side + the request_id will show up in self.client_aborted_requests. + """ + + # Avoid streams having circular ref to parent AsyncLLM object. + if not self.client_aborted_requests: + return + reqs_to_abort = self.client_aborted_requests.copy() + self.client_aborted_requests.clear() + + # Remove from Detokenizer. + self.detokenizer.abort_requests(reqs_to_abort) + + # Remove from RequestStreams. + for request_id in reqs_to_abort: + if self.log_requests: + logger.info("User-cancelled request %s.", request_id) + self._finish_stream(request_id) + + # Remove from EngineCore. + await self.engine_core.abort_requests_async(reqs_to_abort) + + def _process_request_outputs(self, request_outputs: List[RequestOutput]): + """Process outputs by putting them into per-request AsyncStreams.""" + + for request_output in request_outputs: + request_id = request_output.request_id + assert request_id in self.request_streams + + # Each request in the API server pulls from the per-request stream. + stream = self.request_streams.get(request_id) + if stream is not None: + stream.put(request_output) + + # If finished, remove from the tracker. + if request_output.finished: + if self.log_requests: + logger.info("Finished request %s.", request_id) + self._finish_stream(request_id) + + async def _run_output_handler(self): + """Background loop: pulls from EngineCore and pushes to AsyncStreams.""" + + try: + while True: + # 1) Pull EngineCoreOutput from the EngineCore. + outputs = await self.engine_core.get_output_async() + + # 2) Detokenize based on the output. + request_outputs, reqs_to_abort = self.detokenizer.step(outputs) + + # 3) Put the RequestOutputs into the per-request AsyncStreams. + self._process_request_outputs(request_outputs) + + # 4) Abort any requests that finished due to stop strings. + await self.engine_core.abort_requests_async(reqs_to_abort) + + # 5) Abort any requests due to client cancellations. + await self._process_cancellations() + + except BaseException as e: + logger.error(e) + raise e + + # TODO: can we eliminate these? + + async def abort(self, request_id: str) -> None: + # Note: Who Calls this? I dont think this is actually used. 
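The disconnect path described in _process_cancellations above hinges on the callback threaded into each AsyncStream: if the consumer abandons the generator early, the stream reports its own request_id back to the parent, which can then abort it in the Detokenizer and EngineCore. A standalone toy of that callback pattern (ToyStream is illustrative, not the vLLM class):

import asyncio


class ToyStream:
    """Stand-in for the per-request stream."""

    def __init__(self, request_id: str, cancel) -> None:
        self.request_id = request_id
        self._cancel = cancel
        self._queue: asyncio.Queue = asyncio.Queue()

    def put(self, item) -> None:
        self._queue.put_nowait(item)

    async def generator(self):
        finished = False
        try:
            while True:
                item = await self._queue.get()
                if item is None:  # sentinel: normal completion
                    finished = True
                    return
                yield item
        finally:
            # The consumer walked away early (client disconnect):
            # report the request_id so the parent can abort it.
            if not finished:
                self._cancel(self.request_id)


async def main() -> None:
    aborted = []
    stream = ToyStream("req-0", aborted.append)
    stream.put("token-1")

    gen = stream.generator()
    print(await gen.__anext__())  # consume one output ...
    await gen.aclose()            # ... then disconnect early
    print(aborted)                # -> ['req-0']


asyncio.run(main())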
+ raise ValueError("Not Supported on V1 yet.") + + def encode( + self, + prompt: PromptType, + pooling_params: PoolingParams, + request_id: str, + lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Mapping[str, str]] = None, + priority: int = 0, + ): + raise ValueError("Not Supported on V1 yet.") + + async def get_model_config(self) -> ModelConfig: + return self.model_config + + async def get_decoding_config(self): + raise ValueError("Not Supported on V1 yet.") + + async def get_tokenizer( + self, + lora_request: Optional[LoRARequest] = None, + ) -> AnyTokenizer: + assert lora_request is None + return self.detokenizer.tokenizer + + async def is_tracing_enabled(self) -> bool: + return False + + async def do_log_stats( + self, + scheduler_outputs=None, + model_output=None, + ) -> None: + logger.debug("Called do_log_stats.") + + async def check_health(self) -> None: + logger.debug("Called check_health.") + + async def start_profile(self) -> None: + raise ValueError("Not supported on V1 yet.") + + async def stop_profile(self) -> None: + raise ValueError("Not supported on V1 yet.") + + @property + def is_running(self) -> bool: + return True + + @property + def is_stopped(self) -> bool: + return False + + @property + def errored(self) -> bool: + return False + + @property + def dead_error(self) -> BaseException: + return Exception + + +# Retain V0 name for backwards compatibility. +AsyncLLMEngine = AsyncLLM diff --git a/vllm/v1/engine/async_stream.py b/vllm/v1/engine/async_stream.py new file mode 100644 index 0000000000000..3e6c759ad5ebd --- /dev/null +++ b/vllm/v1/engine/async_stream.py @@ -0,0 +1,55 @@ +import asyncio +from typing import Any, AsyncGenerator, Callable, Optional, Type, Union + +from vllm.outputs import EmbeddingRequestOutput, RequestOutput + + +class AsyncStream: + """A stream of RequestOutputs or EmbeddingRequestOutputs for a request + that can be iterated over asynchronously via an async generator.""" + + STOP_ITERATION = Exception() # Sentinel + + def __init__(self, request_id: str, cancel: Callable[[str], None]) -> None: + self.request_id = request_id + self._cancel = cancel + self._queue: asyncio.Queue = asyncio.Queue() + self._finished = False + + def put(self, item: Union[RequestOutput, EmbeddingRequestOutput, + Exception]) -> None: + if not self._finished: + self._queue.put_nowait(item) + + def finish( + self, + exception: Optional[Union[BaseException, Type[BaseException]]] = None, + ) -> None: + if not self._finished: + self._finished = True + self._queue.put_nowait(exception if self._is_raisable(exception) + else AsyncStream.STOP_ITERATION) + + async def generator( + self + ) -> AsyncGenerator[Union[RequestOutput, EmbeddingRequestOutput], None]: + finished = False + try: + while True: + result = await self._queue.get() + if self._is_raisable(result): + finished = True + if result == AsyncStream.STOP_ITERATION: + return + raise result + yield result + finally: + self._finished = True + if not finished: + self._cancel(self.request_id) + + @staticmethod + def _is_raisable(value: Any): + return isinstance(value, BaseException) or \ + (isinstance(value, type) and \ + issubclass(value, BaseException)) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py new file mode 100644 index 0000000000000..f9d3473d0131c --- /dev/null +++ b/vllm/v1/engine/core.py @@ -0,0 +1,352 @@ +import multiprocessing +import queue +import threading +import time +from contextlib import contextmanager +from multiprocessing.process import BaseProcess +from 
multiprocessing.sharedctypes import Synchronized +from typing import Any, Iterator, List, Tuple, Type, Union + +import zmq +import zmq.asyncio +from msgspec import msgpack + +from vllm.config import CacheConfig, VllmConfig +from vllm.logger import init_logger +from vllm.usage.usage_lib import UsageContext +from vllm.v1.core.scheduler import Scheduler +from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, + EngineCoreRequest, EngineCoreRequestType) +from vllm.v1.executor.gpu_executor import GPUExecutor +from vllm.v1.request import Request, RequestStatus +from vllm.version import __version__ as VLLM_VERSION + +logger = init_logger(__name__) + +POLLING_TIMEOUT_MS = 5000 +POLLING_TIMEOUT_S = POLLING_TIMEOUT_MS // 1000 +LOGGING_TIME_S = 5000 + + +class EngineCore: + """Inner loop of vLLM's Engine.""" + + def __init__( + self, + vllm_config: VllmConfig, + executor_class: Type[GPUExecutor], + usage_context: UsageContext, + ): + # Override the configs for V1. + # FIXME + if usage_context == UsageContext.LLM_CLASS: + vllm_config.scheduler_config.max_num_seqs = 1024 + vllm_config.scheduler_config.max_num_batched_tokens = 8192 + elif usage_context == UsageContext.OPENAI_API_SERVER: + vllm_config.scheduler_config.max_num_seqs = 1024 + vllm_config.scheduler_config.max_num_batched_tokens = 2048 + + # TODO (ywang96): Enable APC by default when VLM supports it. + if not vllm_config.model_config.is_multimodal_model: + vllm_config.cache_config.enable_prefix_caching = True + + assert vllm_config.model_config.task != "embedding" + + logger.info("Initializing an LLM engine (v%s) with config: %s", + VLLM_VERSION, vllm_config) + + # Setup Model. + self.model_executor = executor_class(vllm_config) + + # Setup KV Caches and update CacheConfig after profiling. + num_gpu_blocks, num_cpu_blocks = self._initialize_kv_caches( + vllm_config.cache_config) + vllm_config.cache_config.num_gpu_blocks = num_gpu_blocks + vllm_config.cache_config.num_cpu_blocks = num_cpu_blocks + + # Setup scheduler. + self.scheduler = Scheduler(vllm_config.scheduler_config, + vllm_config.cache_config, + vllm_config.lora_config) + + self._last_logging_time = time.time() + + def _initialize_kv_caches(self, + cache_config: CacheConfig) -> Tuple[int, int]: + num_gpu_blocks, _ = self.model_executor.determine_num_available_blocks( + ) + + if cache_config.num_gpu_blocks_override is not None: + num_gpu_blocks_override = cache_config.num_gpu_blocks_override + logger.info( + "Overriding num_gpu_blocks=%d with " + "num_gpu_blocks_override=%d", num_gpu_blocks, + num_gpu_blocks_override) + num_gpu_blocks = num_gpu_blocks_override + + num_cpu_blocks = 0 + self.model_executor.initialize_cache(num_gpu_blocks) + return num_gpu_blocks, num_cpu_blocks + + def add_request(self, request: EngineCoreRequest): + """Add request to the scheduler.""" + + req = Request.from_engine_core_request(request) + self.scheduler.add_request(req) + + def abort_requests(self, request_ids: List[str]): + """Abort requests from the scheduler.""" + + # TODO: The scheduler doesn't really need to know the + # specific finish reason, TBD whether we propagate that + # (i.e. client-aborted vs stop criteria met). 
+ self.scheduler.finish_requests(request_ids, + RequestStatus.FINISHED_ABORTED) + + def step(self) -> List[EngineCoreOutput]: + """Schedule, execute, and make output.""" + + if not self.scheduler.has_unfinished_requests(): + return [] + + scheduler_output = self.scheduler.schedule() + output = self.model_executor.execute_model(scheduler_output) + engine_core_outputs = self.scheduler.update_from_output( + scheduler_output, output) + return engine_core_outputs + + +class EngineCoreProc(EngineCore): + """ZMQ-wrapper for running EngineCore in background process.""" + + READY_STR = "READY" + + def __init__( + self, + vllm_config: VllmConfig, + executor_class: Type[GPUExecutor], + usage_context: UsageContext, + input_path: str, + output_path: str, + ready_path: str, + should_shutdown: Synchronized, + ): + super().__init__(vllm_config, executor_class, usage_context) + + # Signal from main process to shutdown (multiprocessing.Value). + self.should_shutdown = should_shutdown + + # Background Threads and Queues for IO. These enable us to + # overlap ZMQ socket IO with GPU since they release the GIL, + # and to overlap some serialization/deserialization with the + # model forward pass. + # Threads handle Socket <-> Queues and core_busy_loop uses Queue. + self.input_queue = queue.Queue() + self.output_queue = queue.Queue() + threading.Thread(target=self.process_input_socket, + args=(input_path, ), + daemon=True).start() + threading.Thread(target=self.process_output_socket, + args=(output_path, ), + daemon=True).start() + + # Send Readiness signal to EngineClient. + with self.make_socket(ready_path, zmq.constants.PUSH) as ready_socket: + ready_socket.send_string(EngineCoreProc.READY_STR) + + @contextmanager + def make_socket(self, path: str, type: Any) -> Iterator[zmq.Socket]: + """Context manager for use """ + + ctx = zmq.Context() + try: + socket = ctx.socket(type) + + if type == zmq.constants.PULL: + socket.connect(path) + elif type == zmq.constants.PUSH: + socket.bind(path) + else: + raise ValueError(f"Unknown Socket Type: {type}") + + yield socket + + except KeyboardInterrupt: + logger.debug("EngineCore had Keyboard Interrupt.") + + finally: + ctx.destroy(linger=0) + + @staticmethod + def wait_for_startup( + proc: BaseProcess, + ready_path: str, + ) -> None: + """Wait until the EngineCore is ready.""" + + try: + sync_ctx = zmq.Context() # type: ignore[attr-defined] + socket = sync_ctx.socket(zmq.constants.PULL) + socket.connect(ready_path) + + # Wait for EngineCore to send EngineCoreProc.READY_STR. + while socket.poll(timeout=POLLING_TIMEOUT_MS) == 0: + logger.debug("Waiting for EngineCoreProc to startup.") + + if not proc.is_alive(): + raise RuntimeError("EngineCoreProc failed to start.") + + message = socket.recv_string() + assert message == EngineCoreProc.READY_STR + + except BaseException as e: + logger.exception(e) + raise e + + finally: + sync_ctx.destroy(linger=0) + + @staticmethod + def make_engine_core_process( + vllm_config: VllmConfig, + executor_class: Type[GPUExecutor], + usage_context: UsageContext, + input_path: str, + output_path: str, + ready_path: str, + should_shutdown: Synchronized, + ) -> BaseProcess: + # The current process might have CUDA context, + # so we need to spawn a new process. + # NOTE(rob): this is a problem for using EngineCoreProc w/ + # LLM, since we need a if __name__ == "__main__" guard. 
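Concretely, the "spawn" start method re-imports the launching module inside the child, so any script that builds the engine core process must be import-safe, and the parent blocks until the child reports readiness, as wait_for_startup() does above. A self-contained toy of both ideas (a Queue stands in for the ZMQ READY handshake):

import multiprocessing


def run_worker(ready_queue) -> None:
    # ... expensive initialization would happen here ...
    ready_queue.put("READY")  # analogous to sending READY_STR


if __name__ == "__main__":
    # Without this guard the spawned child re-imports this module and
    # would try to launch another child at import time.
    ctx = multiprocessing.get_context("spawn")
    ready_queue = ctx.Queue()
    proc = ctx.Process(target=run_worker, args=(ready_queue, ), daemon=True)
    proc.start()

    # Block until the child signals readiness, like wait_for_startup().
    assert ready_queue.get(timeout=30) == "READY"
    proc.join()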
+ context = multiprocessing.get_context("spawn") + + process_kwargs = { + "input_path": input_path, + "output_path": output_path, + "ready_path": ready_path, + "vllm_config": vllm_config, + "executor_class": executor_class, + "usage_context": usage_context, + "should_shutdown": should_shutdown + } + # Run EngineCore busy loop in background process. + proc = context.Process(target=EngineCoreProc.run_engine_core, + kwargs=process_kwargs) + proc.start() + + # Wait for startup + EngineCoreProc.wait_for_startup(proc, ready_path) + return proc + + @staticmethod + def run_engine_core(*args, **kwargs): + """Launch EngineCore busy loop in background process.""" + + try: + engine_core = EngineCoreProc(*args, **kwargs) + engine_core.run_busy_loop() + + except KeyboardInterrupt: + logger.debug("EngineCore interrupted.") + + except BaseException as e: + logger.exception(e) + raise e + + def run_busy_loop(self): + """Core busy loop of the EngineCore.""" + + # Loop until we get a shutdown signal. + while not self.should_shutdown: + # 1) Poll the input queue until there is work to do. + if not self.scheduler.has_unfinished_requests(): + while True: + try: + req = self.input_queue.get(timeout=POLLING_TIMEOUT_S) + self._handle_client_request(req) + break + except queue.Empty: + self._log_stats() + logger.debug("EngineCore busy loop waiting.") + if self.should_shutdown: + return + + # 2) Handle any new client requests (Abort or Add). + while not self.input_queue.empty(): + req = self.input_queue.get_nowait() + self._handle_client_request(req) + + # 3) Step the engine core. + outputs = self.step() + + # 4) Put EngineCoreOutputs into the output queue. + self.output_queue.put_nowait(outputs) + + self._log_stats() + + def _log_stats(self): + """Log basic stats every LOGGING_TIME_S""" + + now = time.time() + + if now - self._last_logging_time > LOGGING_TIME_S: + logger.info( + "RUNNING: %s | WAITING: %s", + len(self.scheduler.running), + len(self.scheduler.waiting), + ) + + self._last_logging_time = now + + def _handle_client_request( + self, request: Union[EngineCoreRequest, List[str]]) -> None: + """Handle EngineCoreRequest or EngineCoreABORT from Client.""" + + if isinstance(request, EngineCoreRequest): + self.add_request(request) + else: + # TODO: make an EngineCoreAbort wrapper + assert isinstance(request, list) + self.abort_requests(request) + + def process_input_socket(self, input_path: str): + """Input socket IO thread.""" + + # Msgpack serialization decoding. + decoder_add_req = msgpack.Decoder(EngineCoreRequest) + decoder_abort_req = msgpack.Decoder(list[str]) + + with self.make_socket(input_path, zmq.constants.PULL) as socket: + while True: + # (RequestType, RequestData) + type_frame, data_frame = socket.recv_multipart(copy=False) + request_type = type_frame.buffer + request_data = data_frame.buffer + + # Deserialize the request data. + if request_type == EngineCoreRequestType.ADD.value: + request = decoder_add_req.decode(request_data) + elif request_type == EngineCoreRequestType.ABORT.value: + request = decoder_abort_req.decode(request_data) + else: + raise ValueError(f"Unknown RequestType: {request_type}") + + # Push to input queue for core busy loop. + self.input_queue.put_nowait(request) + + def process_output_socket(self, output_path: str): + """Output socket IO thread.""" + + # Msgpack serialization encoding. + encoder = msgpack.Encoder() + # Reuse send buffer. 
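Two serialization details carry the busy-loop IO: inbound requests arrive as (type byte, payload) frames and are dispatched on the one-byte EngineCoreRequestType, while outbound EngineCoreOutputs are written with Encoder.encode_into so a single bytearray is reused across messages instead of allocating per step. A standalone sketch of both, with a toy struct in place of the engine types:

from typing import List

import msgspec


class ToyOutput(msgspec.Struct, array_like=True, omit_defaults=True):
    request_id: str
    new_token_ids: List[int]


ADD, ABORT = b"\x00", b"\x01"  # mirrors EngineCoreRequestType

encoder = msgspec.msgpack.Encoder()
decoder = msgspec.msgpack.Decoder(ToyOutput)

# Output path: encode into a reused buffer ("Reuse send buffer" above).
buffer = bytearray()
for step in range(3):
    encoder.encode_into(ToyOutput("req-0", [step]), buffer)
    # `buffer` now holds the serialized message and could be sent as a
    # ZMQ frame; decode it here just to show the round trip.
    print(decoder.decode(bytes(buffer)))

# Input path: a (type, payload) pair dispatched on the leading byte.
frames = (ADD, encoder.encode(ToyOutput("req-1", [])))
if frames[0] == ADD:
    print("ADD:", decoder.decode(frames[1]))
elif frames[0] == ABORT:
    print("ABORT")

Declaring the structs array_like and with gc=False, as the patch does, keeps each message a compact positional array and skips garbage-collector tracking, which keeps the per-step messages cheap to encode and decode.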
+ buffer = bytearray() + + with self.make_socket(output_path, zmq.constants.PUSH) as socket: + while True: + engine_core_outputs = self.output_queue.get() + outputs = EngineCoreOutputs(outputs=engine_core_outputs) + encoder.encode_into(outputs, buffer) + socket.send_multipart((buffer, ), copy=False) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py new file mode 100644 index 0000000000000..f9e4677fb8c59 --- /dev/null +++ b/vllm/v1/engine/core_client.py @@ -0,0 +1,218 @@ +import multiprocessing +import time +from typing import List, Union + +import msgspec +import zmq +import zmq.asyncio + +from vllm.logger import init_logger +from vllm.utils import get_open_zmq_ipc_path +from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, + EngineCoreRequest, EngineCoreRequestType) +from vllm.v1.engine.core import EngineCore, EngineCoreProc + +logger = init_logger(__name__) + + +class EngineCoreClient: + """ + EngineCoreClient: subclasses handle different methods for pushing + and pulling from the EngineCore for asyncio / multiprocessing. + + Subclasses: + * InprocClient: In process EngineCore (for V0-style LLMEngine use) + * SyncMPClient: ZMQ + background proc EngineCore (for LLM) + * AsyncMPClient: ZMQ + background proc EngineCore w/ asyncio (for AsyncLLM) + """ + + @staticmethod + def make_client( + *args, + multiprocess_mode: bool, + asyncio_mode: bool, + **kwargs, + ) -> "EngineCoreClient": + + # TODO: support this for debugging purposes. + if asyncio_mode and not multiprocess_mode: + raise NotImplementedError( + "Running EngineCore in asyncio without multiprocessing " + "is not currently supported.") + + if multiprocess_mode and asyncio_mode: + return AsyncMPClient(*args, **kwargs) + + if multiprocess_mode and not asyncio_mode: + return SyncMPClient(*args, **kwargs) + + return InprocClient(*args, **kwargs) + + def shutdown(self): + pass + + def get_output(self) -> List[EngineCoreOutput]: + raise NotImplementedError + + def add_request(self, request: EngineCoreRequest) -> None: + raise NotImplementedError + + def abort_requests(self, request_ids: List[str]) -> None: + raise NotImplementedError + + async def get_output_async(self) -> List[EngineCoreOutput]: + raise NotImplementedError + + async def add_request_async(self, request: EngineCoreRequest) -> None: + raise NotImplementedError + + async def abort_requests_async(self, request_ids: List[str]) -> None: + raise NotImplementedError + + +class InprocClient(EngineCoreClient): + """ + InprocClient: client for in-process EngineCore. Intended + for use in LLMEngine for V0-style add_request() and step() + EngineCore setup in this process (no busy loop). + + * pushes EngineCoreRequest directly into the EngineCore + * pulls EngineCoreOutputs by stepping the EngineCore + + TODO: support asyncio-mode for debugging. + """ + + def __init__(self, *args, **kwargs): + self.engine_core = EngineCore(*args, **kwargs) + + def get_output(self) -> List[EngineCoreOutput]: + return self.engine_core.step() + + def add_request(self, request: EngineCoreRequest) -> None: + self.engine_core.add_request(request) + + def abort_requests(self, request_ids: List[str]) -> None: + self.engine_core.abort_requests(request_ids) + + +class MPClient(EngineCoreClient): + """ + MPClient: base client for multi-proc EngineCore. 
+ EngineCore runs in a background process busy loop, getting + new EngineCoreRequests and returning EngineCoreOutputs + + * pushes EngineCoreRequests via input_socket + * pulls EngineCoreOutputs via output_socket + + * AsyncMPClient subclass for AsyncLLM usage + * SyncMPClient subclass for LLM usage + """ + + def __init__( + self, + *args, + asyncio_mode: bool, + **kwargs, + ): + # Serialization setup. + self.encoder = msgspec.msgpack.Encoder() + self.decoder = msgspec.msgpack.Decoder(EngineCoreOutputs) + + # ZMQ setup. + self.ctx = (zmq.asyncio.Context() if asyncio_mode else zmq.Context()) + + # Path for IPC. + ready_path = get_open_zmq_ipc_path() + output_path = get_open_zmq_ipc_path() + input_path = get_open_zmq_ipc_path() + + # Get output (EngineCoreOutput) from EngineCore. + self.output_socket = self.ctx.socket(zmq.constants.PULL) + self.output_socket.connect(output_path) + + # Send input (EngineCoreRequest) to EngineCore. + self.input_socket = self.ctx.socket(zmq.constants.PUSH) + self.input_socket.bind(input_path) + + # Start EngineCore in background process. + self.should_shutdown = multiprocessing.Value('b', False, lock=False) + self.proc = EngineCoreProc.make_engine_core_process( + *args, + input_path=input_path, + output_path=output_path, + ready_path=ready_path, + should_shutdown=self.should_shutdown, + **kwargs, + ) + + def shutdown(self): + # Send shutdown signal to background process. + self.should_shutdown = True + + # Shut down the zmq context. + self.ctx.destroy(linger=0) + + # Shutdown the process if needed. + if hasattr(self, "proc") and self.proc.is_alive(): + self.proc.terminate() + + time.sleep(5) + if self.proc.is_alive(): + self.proc.kill() + + def __del__(self): + self.shutdown() + + +class SyncMPClient(MPClient): + """Synchronous client for multi-proc EngineCore.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, asyncio_mode=False, **kwargs) + + def get_output(self) -> List[EngineCoreOutput]: + + (frame, ) = self.output_socket.recv_multipart(copy=False) + engine_core_outputs = self.decoder.decode(frame.buffer).outputs + return engine_core_outputs + + def _send_input(self, request_type: EngineCoreRequestType, + request: Union[EngineCoreRequest, List[str]]) -> None: + + # (RequestType, SerializedRequest) + msg = (request_type.value, self.encoder.encode(request)) + self.input_socket.send_multipart(msg, copy=False) + + def add_request(self, request: EngineCoreRequest) -> None: + self._send_input(EngineCoreRequestType.ADD, request) + + def abort_requests(self, request_ids: List[str]) -> None: + self._send_input(EngineCoreRequestType.ABORT, request_ids) + + +class AsyncMPClient(MPClient): + """Asyncio-compatible client for multi-proc EngineCore.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, asyncio_mode=True, **kwargs) + + async def get_output_async(self) -> List[EngineCoreOutput]: + + frames = await self.output_socket.recv_multipart(copy=False) + engine_core_outputs = self.decoder.decode(frames[0].buffer).outputs + + return engine_core_outputs + + async def _send_input( + self, request_type: EngineCoreRequestType, + request: Union[EngineCoreRequest, List[str]]) -> None: + + msg = (request_type.value, self.encoder.encode(request)) + await self.input_socket.send_multipart(msg, copy=False) + + async def add_request_async(self, request: EngineCoreRequest) -> None: + await self._send_input(EngineCoreRequestType.ADD, request) + + async def abort_requests_async(self, request_ids: List[str]) -> None: + if len(request_ids) > 0: + 
await self._send_input(EngineCoreRequestType.ABORT, request_ids) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py new file mode 100644 index 0000000000000..1dbf8e75ec478 --- /dev/null +++ b/vllm/v1/engine/detokenizer.py @@ -0,0 +1,265 @@ +from dataclasses import dataclass +from typing import Dict, Iterable, List, Optional, Tuple + +from vllm.engine.output_processor.stop_checker import StopChecker +from vllm.logger import init_logger +from vllm.outputs import RequestOutput +from vllm.sampling_params import RequestOutputKind +from vllm.transformers_utils.detokenizer_utils import ( + AnyTokenizer, convert_prompt_ids_to_tokens, detokenize_incrementally) +from vllm.transformers_utils.tokenizer import get_tokenizer +from vllm.v1.engine import DetokenizerRequest, EngineCoreOutput + +logger = init_logger(__name__) + + +@dataclass +class IncrementalDetokenizer: + + # Generation data + output_text: str + tokens: List[str] + token_ids: List[int] + + # Stop strings + stop: List[str] + include_stop_str_in_output: bool + + # Metadata for incremental detokenization + prefix_offset: int + read_offset: int + + # Parameters for detokenization + skip_special_tokens: bool + spaces_between_special_tokens: bool + output_kind: RequestOutputKind + + # TODO: Probably decouple these + request_id: str + prompt: Optional[str] + prompt_token_ids: List[int] + + # Tokenizer for this request + tokenizer: AnyTokenizer + + # Accounting for stop string buffering + stop_buffer_length: int + _last_output_text_offset: int = 0 + + @property + def output_token_ids(self) -> List[int]: + assert len(self.token_ids) >= len(self.prompt_token_ids) + return self.token_ids[len(self.prompt_token_ids):] + + @classmethod + def from_new_request( + cls, + tokenizer: AnyTokenizer, + request: DetokenizerRequest, + ) -> "IncrementalDetokenizer": + + tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens( + tokenizer=tokenizer, + prompt_ids=request.prompt_token_ids, + skip_special_tokens=request.skip_special_tokens, + ) + + stops = request.stop + # Number of chars to hold back when stop strings are to be excluded + # from streamed output. + if stops and not request.include_stop_str_in_output: + stop_buffer_length = max(len(s) for s in stops) - 1 + else: + stop_buffer_length = 0 + + return cls( + output_text="", + tokens=tokens, + # Detokenizer mutates this list, so need a unique copy. + # NOTE(Nick): could we take ownership of it though? + token_ids=request.prompt_token_ids.copy(), + stop=stops, + include_stop_str_in_output=request.include_stop_str_in_output, + prefix_offset=prefix_offset, + read_offset=read_offset, + skip_special_tokens=request.skip_special_tokens, + spaces_between_special_tokens=request. + spaces_between_special_tokens, + output_kind=request.output_kind, + request_id=request.request_id, + prompt=request.prompt, + prompt_token_ids=request.prompt_token_ids, + tokenizer=tokenizer, + stop_buffer_length=stop_buffer_length, + ) + + def add_tokens( + self, + new_token_ids: List[int], + finish_reason: Optional[str], + stop_reason: Optional[str], + ) -> Optional[RequestOutput]: + """ + Update RequestState for the request_id by: + 1) Detokenize the new token ids incrementally. + 2) Update the RequestOutput with the new text. + """ + + # 1) Detokenize the new token ids incrementally. + # TODO(woosuk): This method becomes very inefficient when the number of + # new_token_ids is more than 1. We need to optimize this. 
+ decoded_text = "" + for new_token_id in new_token_ids: + self.token_ids.append(new_token_id) + (new_tokens, new_decoded_token_text, prefix_offset, + read_offset) = detokenize_incrementally( + tokenizer=self.tokenizer, + all_input_ids=self.token_ids, + prev_tokens=self.tokens, + prefix_offset=self.prefix_offset, + read_offset=self.read_offset, + skip_special_tokens=self.skip_special_tokens, + spaces_between_special_tokens=self. + spaces_between_special_tokens, + ) + + self.tokens.extend(new_tokens) + self.prefix_offset = prefix_offset + self.read_offset = read_offset + self.output_text += new_decoded_token_text + + decoded_text += new_decoded_token_text + + # 2) Evaluate stop criteria. + if self.stop: + stop = StopChecker.check_stop_strings( + output_text=self.output_text, + new_char_count=len(decoded_text), + stop=self.stop, + include_in_output=self.include_stop_str_in_output, + ) + if stop is not None: + stop_str, truncate_to = stop + if truncate_to != -1: + self.output_text = self.output_text[:truncate_to] + finish_reason = "stop" # TODO: use constant + stop_reason = stop_str + + # TODO: handle stop_token_ids here too? + + # 3) Update the RequestOutput object with the new text. + finished = bool(finish_reason) + if self.output_kind == RequestOutputKind.FINAL_ONLY \ + and not finished: + return None + + delta = self.output_kind == RequestOutputKind.DELTA + output_text = self._get_next_output_text(finished, delta) + token_ids = new_token_ids if delta else self.output_token_ids + + request_output = RequestOutput.new( + self.request_id, + self.prompt, + self.prompt_token_ids, + output_text, + token_ids, + finished, + ) + + if finished: + completion_output = request_output.outputs[0] + completion_output.finish_reason = finish_reason + completion_output.stop_reason = stop_reason + + return request_output + + def _get_next_output_text(self, finished: bool, delta: bool) -> str: + """If delta is True, only new text since the last call to + this method is returned""" + + # We return the full output text if the sequence is finished. + buffer_length = 0 if finished else self.stop_buffer_length + if not delta: + return self.output_text[:-buffer_length] if buffer_length else ( + self.output_text) + length = len(self.output_text) - buffer_length + last_offset = self._last_output_text_offset + if last_offset < length: + self._last_output_text_offset = length + return self.output_text[last_offset:length] + return "" + + +class Detokenizer: + + def __init__(self, tokenizer_name: str): + # TODO: once we support LoRA, we should should pass the tokenizer + # here. We currently have two copies (this + in the LLMEngine). 
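The stop_buffer_length bookkeeping above (applied in _get_next_output_text) exists because a stop string can arrive split across decode steps: streamed text is held back by len(stop) - 1 characters so that a partial match is never shown and then retracted. The same holdback logic on plain strings, as a standalone toy:

def stream_with_holdback(chunks, stop: str):
    """Yield streamed text while never exposing a partial stop string."""
    holdback = len(stop) - 1  # mirrors stop_buffer_length
    text = ""
    emitted = 0
    for chunk in chunks:
        text += chunk
        idx = text.find(stop)
        if idx != -1:
            # Stop string found: flush up to it and stop streaming.
            yield text[emitted:idx]
            return
        # Only emit characters that can no longer be part of a match.
        safe = max(len(text) - holdback, emitted)
        if safe > emitted:
            yield text[emitted:safe]
            emitted = safe


# "STOP" arrives split across chunks; the partial "ST" is never leaked.
print(list(stream_with_holdback(["Hello ", "worldST", "OP tail"], "STOP")))
# -> ['Hel', 'lo worl', 'd'], i.e. "Hello world" and nothing past the stop.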
+ self.tokenizer = get_tokenizer(tokenizer_name) + + # Request id -> IncrementalDetokenizer + self.request_states: Dict[str, IncrementalDetokenizer] = {} + + def is_request_active(self, request_id: str): + return request_id in self.request_states + + def get_num_unfinished_requests(self): + return len(self.request_states) + + def has_unfinished_requests(self) -> bool: + return len(self.request_states) > 0 + + def abort_requests( + self, + request_ids: Iterable[str], + ) -> None: + """Remove the request_ids from the Detokenizer.""" + + for request_id in request_ids: + self.request_states.pop(request_id, None) + + def add_request( + self, + request: DetokenizerRequest, + ): + """Add new request to the Detokenizer.""" + + assert (request.request_id not in self.request_states) + + request_state = IncrementalDetokenizer.from_new_request( + self.tokenizer, request) + self.request_states[request.request_id] = request_state + + def step( + self, encore_core_outputs: List[EngineCoreOutput] + ) -> Tuple[List[RequestOutput], List[str]]: + """Update state and request the RequestOutputs to the LLMEngine.""" + + request_outputs: List[RequestOutput] = [] + requests_to_abort: List[str] = [] + for engine_core_output in encore_core_outputs: + request_id = engine_core_output.request_id + detokenizer = self.request_states.get(request_id) + if detokenizer is None: + # Ignore output for already-aborted request. + continue + + # Detokenize and update state. + request_output = detokenizer.add_tokens( + new_token_ids=engine_core_output.new_token_ids, + finish_reason=engine_core_output.finish_reason, + stop_reason=engine_core_output.stop_reason, + ) + + if request_output is not None: + # Add to RequestOutputs list. + request_outputs.append(request_output) + + # Free completed requests. + if request_output.finished: + self.request_states.pop(request_id) + if not engine_core_output.finished: + requests_to_abort.append(request_id) + + # Return to EngineClient. 
+ return request_outputs, requests_to_abort diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 38d95ab44bb90..f37db92e8ea6b 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -1,35 +1,28 @@ -import time -from typing import (Any, Dict, Iterable, List, Mapping, Optional, Tuple, Type, - Union) +from typing import Dict, List, Mapping, Optional, Type, Union -from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, - ObservabilityConfig, ParallelConfig, SchedulerConfig, - VllmConfig) +from vllm.config import VllmConfig from vllm.engine.arg_utils import EngineArgs from vllm.engine.metrics_types import StatLoggerBase -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, - EncoderDecoderLLMInputs, InputRegistry, PromptType) -from vllm.inputs.preprocess import InputPreprocessor +from vllm.envs import VLLM_ENABLE_V1_MULTIPROCESSING +from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.outputs import CompletionOutput, RequestOutput +from vllm.outputs import RequestOutput from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.sampling_params import RequestOutputKind, SamplingParams -from vllm.transformers_utils.config import try_get_generation_config -from vllm.transformers_utils.tokenizer_group import ( - BaseTokenizerGroup, init_tokenizer_from_configs) +from vllm.sampling_params import SamplingParams +from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.usage.usage_lib import UsageContext -from vllm.v1.core.scheduler import Scheduler +from vllm.v1.engine.core_client import EngineCoreClient +from vllm.v1.engine.detokenizer import Detokenizer +from vllm.v1.engine.processor import Processor from vllm.v1.executor.gpu_executor import GPUExecutor -from vllm.v1.request import Request, RequestStatus -from vllm.v1.tokenizer.detokenizer import Detokenizer, DetokenizerInputs -from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) class LLMEngine: + """Legacy LLMEngine for backwards compatibility.""" def __init__( self, @@ -40,146 +33,36 @@ def __init__( stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, input_registry: InputRegistry = INPUT_REGISTRY, use_cached_outputs: bool = False, + multiprocess_mode: bool = False, ) -> None: - # TODO: remove the local variables and use self.* throughout the class. - model_config = self.model_config = vllm_config.model_config - cache_config = self.cache_config = vllm_config.cache_config - lora_config = self.lora_config = vllm_config.lora_config - parallel_config = self.parallel_config = vllm_config.parallel_config - scheduler_config = self.scheduler_config = vllm_config.scheduler_config - device_config = self.device_config = vllm_config.device_config - speculative_config = self.speculative_config = vllm_config.speculative_config # noqa - load_config = self.load_config = vllm_config.load_config - decoding_config = self.decoding_config = vllm_config.decoding_config or DecodingConfig( # noqa + # TODO: Can we avoid this? + self.model_config = vllm_config.model_config + + # Tokenizer (+ ensure liveness if running in another process). 
+ self.tokenizer = init_tokenizer_from_configs( + model_config=vllm_config.model_config, + scheduler_config=vllm_config.scheduler_config, + parallel_config=vllm_config.parallel_config, + enable_lora=bool(vllm_config.lora_config)) + self.tokenizer.ping() + + # Processor (convert Inputs --> EngineCoreRequests) + self.processor = Processor(vllm_config.model_config, + vllm_config.lora_config, self.tokenizer, + input_registry) + + # Detokenizer (converts EngineCoreOutputs --> RequestOutput) + self.detokenizer = Detokenizer(vllm_config.model_config.tokenizer) + + # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs) + self.engine_core = EngineCoreClient.make_client( + vllm_config, + executor_class, + usage_context, + multiprocess_mode=multiprocess_mode, + asyncio_mode=False, ) - prompt_adapter_config = self.prompt_adapter_config = vllm_config.prompt_adapter_config # noqa - observability_config = self.observability_config = vllm_config.observability_config or ObservabilityConfig( # noqa - ) - - # Override the configs for V1. - # FIXME - if usage_context == UsageContext.LLM_CLASS: - scheduler_config.max_num_seqs = 1024 - scheduler_config.max_num_batched_tokens = 8192 - elif usage_context == UsageContext.OPENAI_API_SERVER: - scheduler_config.max_num_seqs = 1024 - scheduler_config.max_num_batched_tokens = 2048 - - # TODO (ywang96): Enable APC by default when VLM supports it. - if not model_config.is_multimodal_model: - cache_config.enable_prefix_caching = True - - logger.info( - "Initializing an LLM engine (v%s) with config: " - "model=%r, speculative_config=%r, tokenizer=%r, " - "skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, " - "override_neuron_config=%s, tokenizer_revision=%s, " - "trust_remote_code=%s, dtype=%s, max_seq_len=%d, " - "download_dir=%r, load_format=%s, tensor_parallel_size=%d, " - "pipeline_parallel_size=%d, " - "disable_custom_all_reduce=%s, quantization=%s, " - "enforce_eager=%s, kv_cache_dtype=%s, " - "quantization_param_path=%s, device_config=%s, " - "decoding_config=%r, observability_config=%r, " - "seed=%d, served_model_name=%s, " - "num_scheduler_steps=%d, enable_prefix_caching=%s, " - "use_async_output_proc=%s, mm_processor_kwargs=%s)", - VLLM_VERSION, - model_config.model, - speculative_config, - model_config.tokenizer, - model_config.skip_tokenizer_init, - model_config.tokenizer_mode, - model_config.revision, - model_config.override_neuron_config, - model_config.tokenizer_revision, - model_config.trust_remote_code, - model_config.dtype, - model_config.max_model_len, - load_config.download_dir, - load_config.load_format, - parallel_config.tensor_parallel_size, - parallel_config.pipeline_parallel_size, - parallel_config.disable_custom_all_reduce, - model_config.quantization, - model_config.enforce_eager, - cache_config.cache_dtype, - model_config.quantization_param_path, - device_config.device, - decoding_config, - observability_config, - model_config.seed, - model_config.served_model_name, - scheduler_config.num_scheduler_steps, - cache_config.enable_prefix_caching, - model_config.use_async_output_proc, - model_config.mm_processor_kwargs, - ) - - self.log_stats = log_stats - - assert not self.model_config.skip_tokenizer_init - self.tokenizer = self._init_tokenizer() - if self.tokenizer: - # Ping the tokenizer to ensure liveness if it runs in a - # different process. 
- self.tokenizer.ping() - self.detokenizer = Detokenizer( - tokenizer_name=self.model_config.tokenizer, - tokenizer_mode=self.model_config.tokenizer_mode, - trust_remote_code=self.model_config.trust_remote_code) - - self.generation_config_fields = _load_generation_config_dict( - model_config) - self.input_preprocessor = InputPreprocessor(model_config, - self.tokenizer) - self.input_registry = input_registry - self.input_processor = input_registry.create_input_processor( - model_config) - - # Request id -> Request - self.requests: Dict[str, Request] = {} - # NOTE(woosuk): Now that the detokenizer works asynchronously, we need - # to keep track of how many steps each request has been lagged behind - # in terms of detokenization. - # Request id -> how many detokenizer steps the request should wait for. - self.num_lagged_steps: Dict[str, int] = {} - # OPTIMIZATION: Cache the request output and update it incrementally. - # This is used to avoid creating a new RequestOutput object every step. - # Request id -> RequestOutput - self.request_outputs: Dict[str, RequestOutput] = {} - - self.model_executor = executor_class(vllm_config=vllm_config) - assert self.model_config.task != "embedding" - self._initialize_kv_caches() - - # Create the scheduler. - # NOTE: the cache_config here have been updated with the numbers of - # GPU and CPU blocks, which are profiled in the distributed executor. - self.scheduler = Scheduler(scheduler_config, cache_config, lora_config) - - def __del__(self): - # Small hack- implicit clean up of resources on garbage collect - # TODO: this should probably be explicitly invoked when we're done with - # the engine - self.terminate_detokenizer() - - def _initialize_kv_caches(self) -> None: - num_gpu_blocks, _ = self.model_executor.determine_num_available_blocks( - ) - - if self.cache_config.num_gpu_blocks_override is not None: - num_gpu_blocks_override = self.cache_config.num_gpu_blocks_override - logger.info( - "Overriding num_gpu_blocks=%d with " - "num_gpu_blocks_override=%d", num_gpu_blocks, - num_gpu_blocks_override) - num_gpu_blocks = num_gpu_blocks_override - - self.cache_config.num_gpu_blocks = num_gpu_blocks - self.cache_config.num_cpu_blocks = 0 - self.model_executor.initialize_cache(num_gpu_blocks) @classmethod def from_engine_args( @@ -187,71 +70,49 @@ def from_engine_args( engine_args: EngineArgs, usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, + enable_multiprocessing: bool = False, ) -> "LLMEngine": """Creates an LLM engine from the engine arguments.""" + # Create the engine configs. - engine_config = engine_args.create_engine_config() - executor_class = cls._get_executor_cls(engine_config) - # Create the LLM engine. 
- engine = cls( - vllm_config=engine_config, - executor_class=executor_class, - log_stats=not engine_args.disable_log_stats, - usage_context=usage_context, - stat_loggers=stat_loggers, - ) - return engine - - def _init_tokenizer(self) -> BaseTokenizerGroup: - return init_tokenizer_from_configs( - model_config=self.model_config, - scheduler_config=self.scheduler_config, - parallel_config=self.parallel_config, - enable_lora=bool(self.lora_config)) - - def _verify_args(self) -> None: - self.model_config.verify_with_parallel_config(self.parallel_config) - self.cache_config.verify_with_parallel_config(self.parallel_config) - if self.lora_config: - self.lora_config.verify_with_model_config(self.model_config) - self.lora_config.verify_with_scheduler_config( - self.scheduler_config) - if self.prompt_adapter_config: - self.prompt_adapter_config.verify_with_model_config( - self.model_config) - - def _add_processed_request( - self, - request_id: str, - processed_inputs: Union[DecoderOnlyInputs, EncoderDecoderLLMInputs], - params: Union[SamplingParams, PoolingParams], - arrival_time: float, - lora_request: Optional[LoRARequest], - prompt_adapter_request: Optional[PromptAdapterRequest], - trace_headers: Optional[Mapping[str, str]] = None, - ) -> None: - assert prompt_adapter_request is None - assert trace_headers is None - self._validate_model_inputs(processed_inputs) - eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request) - - # TODO(woosuk): Support embedding mode. - assert isinstance(params, SamplingParams) - sampling_params = params.clone() - sampling_params.update_from_generation_config( - self.generation_config_fields, eos_token_id) - - # TODO(woosuk): Check max_logprobs - # TODO(woosuk): Support encoder-decoder models. - req = Request(request_id, processed_inputs, params, eos_token_id, - arrival_time) - self.requests[request_id] = req - self.num_lagged_steps[request_id] = 0 - self.scheduler.add_request(req) + vllm_config = engine_args.create_engine_config() + executor_class = cls._get_executor_cls(vllm_config) + + if VLLM_ENABLE_V1_MULTIPROCESSING: + logger.debug("Enabling multiprocessing for LLMEngine.") + enable_multiprocessing = True + + # Create the LLMEngine. 
+ return cls(vllm_config=vllm_config, + executor_class=executor_class, + log_stats=not engine_args.disable_log_stats, + usage_context=usage_context, + stat_loggers=stat_loggers, + multiprocess_mode=enable_multiprocessing) + + @classmethod + def _get_executor_cls(cls, vllm_config: VllmConfig): + return GPUExecutor def stop_remote_worker_execution_loop(self) -> None: raise NotImplementedError("TP not implemented yet.") + def get_num_unfinished_requests(self) -> int: + return self.detokenizer.get_num_unfinished_requests() + + def has_unfinished_requests(self) -> bool: + return self.detokenizer.has_unfinished_requests() + + @classmethod + def validate_outputs(cls, outputs, output_type): + return outputs + + def abort_request(self, request_ids: List[str]) -> None: + """Remove request_ids from EngineCore and Detokenizer.""" + + self.engine_core.abort_requests(request_ids) + self.detokenizer.abort_requests(request_ids) + def add_request( self, request_id: str, @@ -263,261 +124,46 @@ def add_request( prompt_adapter_request: Optional[PromptAdapterRequest] = None, priority: int = 0, ) -> None: - if lora_request is not None and not self.lora_config: - raise ValueError(f"Got lora_request {lora_request} but LoRA is " - "not enabled!") - if arrival_time is None: - arrival_time = time.time() - assert priority == 0, "vLLM V1 does not support priority at the moment." - - preprocessed_inputs = self.input_preprocessor.preprocess( - prompt, - request_id=request_id, - lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request, - ) - processed_inputs = self.input_processor(preprocessed_inputs) - - self._add_processed_request( - request_id=request_id, - processed_inputs=processed_inputs, - params=params, - arrival_time=arrival_time, - lora_request=lora_request, - prompt_adapter_request=prompt_adapter_request, - trace_headers=trace_headers, - ) - def abort_request(self, request_id: Union[str, Iterable[str]]) -> None: - self.scheduler.finish_requests(request_id, - RequestStatus.FINISHED_ABORTED) - self._free_request(request_id) + # 1) Process raw inputs into the request. + detokenizer_req, engine_core_req = self.processor.process_inputs( + request_id, prompt, params, arrival_time, lora_request, + trace_headers, prompt_adapter_request, priority) - def get_num_unfinished_requests(self) -> int: - """Gets the number of unfinished requests.""" - return len(self.requests) + # 2) Add the request to Detokenizer. + self.detokenizer.add_request(detokenizer_req) - def has_unfinished_requests(self) -> bool: - """Returns True if there are unfinished requests.""" - return len(self.requests) > 0 + # 3) Add the request to EngineCore. + self.engine_core.add_request(engine_core_req) def step(self) -> List[RequestOutput]: - # NOTE(woosuk): This method may return an empty list when the - # detokenizer is still processing the outputs. This should not be - # considered as the end of the generation process. - # FIXME(woosuk): Currently, the step method is inefficient because it - # creates RequestOutput objects for all running requests, while they - # may not be needed unless the output is streamed to the client. 
- if self.scheduler.has_unfinished_requests(): - scheduler_output = self.scheduler.schedule() - output = self.model_executor.execute_model(scheduler_output) - sampled = self.scheduler.update_from_output( - scheduler_output, output) - self.send_to_detokenizer(sampled) - req_outputs = self.recv_from_detokenizer() - return req_outputs - - def send_to_detokenizer(self, sampled: List[Tuple[Request, int]]) -> None: - inputs = DetokenizerInputs( - req_ids=[], - prompt_token_ids=[], - new_token_ids=[], - skip_special_tokens=[], - spaces_between_special_tokens=[], - free_req_ids=[], # TODO(woosuk): Implement freeing. - ) - for req, num_tokens in sampled: - inputs.req_ids.append(req.request_id) - if req.num_output_tokens == num_tokens: - # The request is first detokenized. - inputs.prompt_token_ids.append(req.prompt_token_ids) - else: - # The prompt token ids are already cached in the detokenizer. - inputs.prompt_token_ids.append([]) - inputs.new_token_ids.append(req.output_token_ids[-num_tokens:]) - inputs.skip_special_tokens.append( - req.sampling_params.skip_special_tokens) - inputs.spaces_between_special_tokens.append( - req.sampling_params.spaces_between_special_tokens) - - # Update the number of lagged steps. - self.num_lagged_steps[req.request_id] += 1 - self.detokenizer.send(inputs) - - def recv_from_detokenizer(self) -> List[RequestOutput]: - detokenizer_output = self.detokenizer.recv() - if detokenizer_output is None: - return [] - - req_outputs: List[RequestOutput] = [] - num_reqs = len(detokenizer_output.req_ids) - for i in range(num_reqs): - req_id = detokenizer_output.req_ids[i] - if req_id not in self.requests: - # The request has been aborted while the detokenizer was - # processing the outputs. - continue - - req = self.requests[req_id] - req.output_text += detokenizer_output.detokenized_texts[i] - - self.num_lagged_steps[req_id] -= 1 - finished = (self.num_lagged_steps[req_id] == 0 - and req.is_finished()) - req_output = self._make_request_output( - req, detokenizer_output.num_output_token_ids[i], - detokenizer_output.detokenized_texts[i], finished) - req_outputs.append(req_output) - - if finished: - self._free_request(req_id) - return req_outputs - - def terminate_detokenizer(self) -> None: - self.detokenizer.terminate() - - def _make_request_output( - self, - request: Request, - num_output_tokens: int, - new_output_text: str, - finished: bool, - ) -> RequestOutput: - req_output = self.request_outputs.get(request.request_id) - if req_output is None: - # TODO: Support `n` > 1. 
- completion_output = CompletionOutput( - index=0, - text="", - token_ids=[], - cumulative_logprob=None, - logprobs=None, # TODO - finish_reason=None, - stop_reason=None, - lora_request=None, - ) - req_output = RequestOutput( - request_id=request.request_id, - prompt=request.prompt, - prompt_token_ids=request.prompt_token_ids, - prompt_logprobs=None, # TODO - outputs=[completion_output], - finished=False, - metrics=None, - lora_request=None, - encoder_prompt=None, - encoder_prompt_token_ids=None, - ) - self.request_outputs[request.request_id] = req_output - - completion_output = req_output.outputs[0] - if request.sampling_params.output_kind == RequestOutputKind.CUMULATIVE: - completion_output.text += new_output_text - completion_output.token_ids = ( - request.output_token_ids[:num_output_tokens]) - elif request.sampling_params.output_kind == RequestOutputKind.DELTA: - completion_output.text = new_output_text - num_prev_tokens = len(completion_output.token_ids) - completion_output.token_ids = request.output_token_ids[ - num_prev_tokens:num_output_tokens] - elif (request.sampling_params.output_kind == - RequestOutputKind.FINAL_ONLY): - if finished: - completion_output.text = request.output_text - completion_output.token_ids = request.output_token_ids - else: - completion_output.text = "" - completion_output.token_ids = [] - - if finished: - completion_output.finish_reason = request.get_finished_reason() - completion_output.stop_reason = request.stop_reason - req_output.finished = finished - return req_output - - def _free_request(self, request_id: str) -> None: - self.requests.pop(request_id, None) - self.num_lagged_steps.pop(request_id, None) - self.request_outputs.pop(request_id, None) - - def check_health(self) -> None: - if self.tokenizer: - self.tokenizer.check_health() - self.model_executor.check_health() - - def _validate_model_inputs(self, inputs: Union[DecoderOnlyInputs, - EncoderDecoderLLMInputs]): - prompt_ids = inputs.get("prompt_token_ids") - if prompt_ids is None or len(prompt_ids) == 0: - raise ValueError("Prompt cannot be empty") - - if self.model_config.is_multimodal_model: - max_prompt_len = self.model_config.max_model_len - - if len(prompt_ids) > max_prompt_len: - raise ValueError( - f"The prompt (total length {len(prompt_ids)}) is too long " - f"to fit into the model (context length {max_prompt_len}). " - "Make sure that `max_model_len` is no smaller than the " - "number of text tokens plus multimodal tokens. For image " - "inputs, the number of image tokens depends on the number " - "of images, and possibly their aspect ratios as well.") - - @classmethod - def validate_outputs(cls, outputs, output_type): - return outputs - - def get_model_config(self) -> ModelConfig: - """Gets the model configuration.""" - return self.model_config - def get_parallel_config(self) -> ParallelConfig: - """Gets the parallel configuration.""" - return self.parallel_config + # 1) Get EngineCoreOutput from the EngineCore. + engine_core_outputs = self.engine_core.get_output() - def get_decoding_config(self) -> DecodingConfig: - """Gets the decoding configuration.""" - return self.decoding_config + # 2) Detokenizer the EngineCoreOutput. + request_outputs, requests_to_abort = self.detokenizer.step( + engine_core_outputs) - def get_scheduler_config(self) -> SchedulerConfig: - """Gets the scheduler configuration.""" - return self.scheduler_config + # 3) Abort requests that finished due to stopping criteria. 
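Step 3 below matters because stop strings are only detected during detokenization, so the EngineCore has to be told separately to drop those requests. Seen from the outside, though, the rewritten class keeps the familiar V0 driving loop; a minimal sketch of that usage (the model name is only an example):

from vllm.engine.arg_utils import EngineArgs
from vllm.sampling_params import SamplingParams
from vllm.v1.engine.llm_engine import LLMEngine


def main() -> None:
    engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))

    engine.add_request(request_id="req-0",
                       prompt="The capital of France is",
                       params=SamplingParams(max_tokens=16, stop=["\n"]))

    # Classic V0-style loop: step until nothing is left to do.
    while engine.has_unfinished_requests():
        for request_output in engine.step():
            if request_output.finished:
                print(request_output.outputs[0].text)


if __name__ == "__main__":
    # Import-safe entry point, needed when VLLM_ENABLE_V1_MULTIPROCESSING
    # spawns EngineCore in a background process.
    main()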
+ if requests_to_abort: + self.abort_request(requests_to_abort) - def get_lora_config(self) -> LoRAConfig: - """Gets the LoRA configuration.""" - return self.lora_config - - @classmethod - def _get_executor_cls(cls, engine_config: VllmConfig): - return GPUExecutor + return request_outputs - def is_tracing_enabled(self) -> bool: - return False + # TODO(rob): Can we get rid of these? - def do_log_stats(self, *args, **kwargs) -> None: + def get_model_config(self): pass - def is_encoder_decoder_model(self) -> bool: - return False - - def start_profile(self) -> None: + def is_encoder_decoder_model(self): pass - def stop_profile(self) -> None: + def start_profile(self): pass - def get_tokenizer_group(self, *args, **kwargs): - return self.tokenizer - - -def _load_generation_config_dict(model_config: ModelConfig) -> Dict[str, Any]: - config = try_get_generation_config( - model_config.model, - trust_remote_code=model_config.trust_remote_code, - revision=model_config.revision, - ) - - if config is None: - return {} + def stop_profile(self): + pass - return config.to_diff_dict() + def get_tokenizer_group(self, group_type): + pass diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py new file mode 100644 index 0000000000000..d92e622810389 --- /dev/null +++ b/vllm/v1/engine/processor.py @@ -0,0 +1,128 @@ +import time +from typing import Any, Dict, Mapping, Optional, Tuple, Union + +from vllm.config import LoRAConfig, ModelConfig +from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, + EncoderDecoderLLMInputs, InputRegistry, PromptType) +from vllm.inputs.preprocess import InputPreprocessor +from vllm.lora.request import LoRARequest +from vllm.pooling_params import PoolingParams +from vllm.prompt_adapter.request import PromptAdapterRequest +from vllm.sampling_params import SamplingParams +from vllm.transformers_utils.config import try_get_generation_config +from vllm.transformers_utils.tokenizer_group import AnyTokenizer +from vllm.v1.engine import DetokenizerRequest, EngineCoreRequest + + +class Processor: + + def __init__( + self, + model_config: ModelConfig, + lora_config: Optional[LoRAConfig], + tokenizer: AnyTokenizer, + input_registry: InputRegistry = INPUT_REGISTRY, + ): + + self.model_config = model_config + self.lora_config = lora_config + self.tokenizer = tokenizer + + self.generation_config_fields = _load_generation_config_dict( + model_config) + self.input_preprocessor = InputPreprocessor(model_config, + self.tokenizer) + self.input_processor = input_registry.create_input_processor( + model_config) + + # TODO: run in an ThreadpoolExecutor or BackgroundProcess. + # This ideally should releases the GIL, so we should not block the + # asyncio loop while this is running. + def process_inputs( + self, + request_id: str, + prompt: PromptType, + params: Union[SamplingParams, PoolingParams], + arrival_time: float, + lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Mapping[str, str]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + priority: int = 0, + ) -> Tuple[DetokenizerRequest, EngineCoreRequest]: + + # TODO(woosuk): Support embedding mode. + # TODO(woosuk): Check max_logprobs + # TODO(woosuk): Support encoder-decoder models. + + if lora_request is not None and not self.lora_config: + raise ValueError(f"Got lora_request {lora_request} but LoRA is " + "not enabled!") + if arrival_time is None: + arrival_time = time.time() + assert priority == 0, "vLLM V1 does not support priority at the moment." 
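For context, the reworked ``step()`` earlier in this patch drains the EngineCore, runs the Detokenizer, and aborts any requests that hit a stop condition. A driver loop over the long-standing ``LLMEngine`` entry points, which this V1 engine mirrors, looks roughly like the sketch below (illustrative only; the model name and request id are placeholders)::

    from vllm import EngineArgs, LLMEngine, SamplingParams

    # Build an engine and enqueue a single request.
    engine = LLMEngine.from_engine_args(EngineArgs(model="facebook/opt-125m"))
    engine.add_request("request-0", "Hello, my name is",
                       SamplingParams(max_tokens=16))

    # Each step() call returns the RequestOutputs that became available
    # since the previous call; poll until nothing is left in flight.
    while engine.has_unfinished_requests():
        for request_output in engine.step():
            if request_output.finished:
                print(request_output.outputs[0].text)
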
+ assert trace_headers is None, "vLLM V1 does not support tracing yet." + + # Process inputs. + preprocessed_inputs = self.input_preprocessor.preprocess( + prompt, + request_id=request_id, + lora_request=lora_request, + prompt_adapter_request=prompt_adapter_request, + ) + processed_inputs = self.input_processor(preprocessed_inputs) + self._validate_model_inputs(processed_inputs) + eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request) + + assert isinstance(params, SamplingParams) + # TODO: can we avoid cloning here in multiproc case + sampling_params = params.clone() + sampling_params.update_from_generation_config( + self.generation_config_fields, eos_token_id) + + # Make Request for Detokenizer. + detokenizer_request = DetokenizerRequest( + request_id, processed_inputs.get("prompt"), + processed_inputs.get("prompt_token_ids"), + sampling_params.skip_special_tokens, + sampling_params.spaces_between_special_tokens, + sampling_params.output_kind, sampling_params.stop, + sampling_params.include_stop_str_in_output) + + # Make Request for EngineCore. + engine_core_request = EngineCoreRequest( + request_id, processed_inputs.get("prompt"), + processed_inputs.get("prompt_token_ids"), sampling_params, + eos_token_id, arrival_time, lora_request) + + return detokenizer_request, engine_core_request + + def _validate_model_inputs(self, inputs: Union[DecoderOnlyInputs, + EncoderDecoderLLMInputs]): + prompt_ids = inputs.get("prompt_token_ids") + if prompt_ids is None or len(prompt_ids) == 0: + raise ValueError("Prompt cannot be empty") + + if self.model_config.is_multimodal_model: + max_prompt_len = self.model_config.max_model_len + + if len(prompt_ids) > max_prompt_len: + raise ValueError( + f"The prompt (total length {len(prompt_ids)}) is too long " + f"to fit into the model (context length {max_prompt_len}). " + "Make sure that `max_model_len` is no smaller than the " + "number of text tokens plus multimodal tokens. 
For image " + "inputs, the number of image tokens depends on the number " + "of images, and possibly their aspect ratios as well.") + + +def _load_generation_config_dict(model_config: ModelConfig) -> Dict[str, Any]: + config = try_get_generation_config( + model_config.model, + trust_remote_code=model_config.trust_remote_code, + revision=model_config.revision, + ) + + if config is None: + return {} + + return config.to_diff_dict() diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 087067cdac56f..00e5aea92a8df 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -1,9 +1,11 @@ import enum from typing import TYPE_CHECKING, List, Optional, Union +from vllm.inputs.data import DecoderOnlyInputs from vllm.lora.request import LoRARequest from vllm.sampling_params import SamplingParams from vllm.sequence import RequestMetrics +from vllm.v1.engine import EngineCoreRequest from vllm.v1.utils import ConstantList if TYPE_CHECKING: @@ -43,9 +45,22 @@ def __init__( self.num_prompt_tokens = len(self.prompt_token_ids) self._output_token_ids: List[int] = [] self._all_token_ids: List[int] = self.prompt_token_ids.copy() - self.output_text = "" self.num_computed_tokens = 0 + @classmethod + def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": + + return cls( + request_id=request.request_id, + inputs=DecoderOnlyInputs(type="token", + prompt_token_ids=request.prompt_token_ids, + prompt=request.prompt), + sampling_params=request.sampling_params, + eos_token_id=request.eos_token_id, + arrival_time=request.arrival_time, + lora_request=request.lora_request, + ) + @property def output_token_ids(self) -> ConstantList[int]: # Prevent directly appending to the output_token_ids since diff --git a/vllm/v1/tokenizer/detokenizer.py b/vllm/v1/tokenizer/detokenizer.py deleted file mode 100644 index 8d80ebbc5cc45..0000000000000 --- a/vllm/v1/tokenizer/detokenizer.py +++ /dev/null @@ -1,228 +0,0 @@ -import multiprocessing -from dataclasses import dataclass -from typing import Dict, List, Optional - -import msgspec -import zmq -from msgspec import msgpack - -from vllm.transformers_utils.detokenizer_utils import ( - convert_prompt_ids_to_tokens, detokenize_incrementally) -from vllm.transformers_utils.tokenizer import get_tokenizer -from vllm.utils import get_open_port - - -class DetokenizerInputs(msgspec.Struct): - - # [num_reqs] - req_ids: List[str] - # A request's prompt token ids is sent to the detokenizer only when - # the request is first detokenized. Otherwise, an empty list is sent. - prompt_token_ids: List[List[int]] - new_token_ids: List[List[int]] - skip_special_tokens: List[bool] - spaces_between_special_tokens: List[bool] - - # [num_free_reqs] - free_req_ids: List[str] - - -class DetokenizerOutputs(msgspec.Struct): - - # [num_reqs] - req_ids: List[str] - detokenized_texts: List[str] - # NOTE(woosuk): The number of the output token ids of each request - # at the time of detokenization. The detokenizer returns this to the engine - # because the request state (including the output token ids) is - # asynchronously updated in the engine, while RequestOutput requires the - # output token ids to be consistent with the detokenized text. - num_output_token_ids: List[int] - - -class Detokenizer: - - def __init__(self, tokenizer_name: str, tokenizer_mode: str, - trust_remote_code: bool): - # FIXME(woosuk): Currently, the detokenizer is just a hacky prototype. - # For example, it does not terminate properly. We need to improve this. 
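The ``Detokenizer`` being removed here, together with its ``DetokenizerProc`` worker below, exchanges ``msgspec``-encoded structs over a ZeroMQ PUSH/PULL socket pair. A minimal, self-contained sketch of that transport pattern (toy names and a fixed port for brevity; vLLM itself picks free ports via ``get_open_port()``)::

    import multiprocessing
    from typing import List

    import msgspec
    import zmq
    from msgspec import msgpack


    class Job(msgspec.Struct):
        req_id: str
        token_ids: List[int]


    def worker(port: int) -> None:
        # The consumer binds a PULL socket and decodes msgpack-encoded structs.
        sock = zmq.Context().socket(zmq.PULL)
        sock.bind(f"tcp://*:{port}")
        job = msgpack.Decoder(Job).decode(sock.recv())
        print(job.req_id, len(job.token_ids))


    if __name__ == "__main__":
        port = 5555  # placeholder port
        proc = multiprocessing.Process(target=worker, args=(port, ))
        proc.start()
        # The producer connects a PUSH socket; a small message is queued
        # until the consumer has bound, so the send below is safe.
        sock = zmq.Context().socket(zmq.PUSH)
        sock.connect(f"tcp://localhost:{port}")
        sock.send(msgpack.Encoder().encode(Job(req_id="r0", token_ids=[1, 2, 3])))
        proc.join()
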
- self.push_port = get_open_port() - self.pull_port = get_open_port() - # NOTE: The push port of the engine process should be the same as the - # pull port of the detokenizer process. Vice versa. - self.detokenizer = DetokenizerProc(tokenizer_name=tokenizer_name, - tokenizer_mode=tokenizer_mode, - trust_remote_code=trust_remote_code, - push_port=self.pull_port, - pull_port=self.push_port) - self.detokenizer.start() - - self.zmq_context = zmq.Context() - self.push_socket = self.zmq_context.socket(zmq.PUSH) - self.push_socket.connect(f"tcp://localhost:{self.push_port}") - self.pull_socket = self.zmq_context.socket(zmq.PULL) - self.pull_socket.connect(f"tcp://localhost:{self.pull_port}") - self.poller = zmq.Poller() - self.poller.register(self.pull_socket, zmq.POLLIN) - self.msgpack_encoder = msgpack.Encoder() - self.msgpack_decoder = msgpack.Decoder(DetokenizerOutputs) - - def send(self, inputs: DetokenizerInputs) -> None: - self.push_socket.send(self.msgpack_encoder.encode(inputs), - flags=zmq.NOBLOCK) - - def recv(self) -> Optional[DetokenizerOutputs]: - socks = dict(self.poller.poll(timeout=0)) - if self.pull_socket in socks and socks[self.pull_socket] == zmq.POLLIN: - msg = self.pull_socket.recv() - return self.msgpack_decoder.decode(msg) - return None - - def terminate(self) -> None: - self.detokenizer.kill() - self.detokenizer.join() - - -class DetokenizerProc(multiprocessing.Process): - - def __init__( - self, - tokenizer_name: str, - tokenizer_mode: str, - trust_remote_code: bool, - pull_port: int, - push_port: int, - ): - super().__init__() - self.tokenizer_name = tokenizer_name - self.tokenizer_mode = tokenizer_mode - self.trust_remote_code = trust_remote_code - # NOTE: The pull_port of the detokenizer process should be the same as - # the push_port of the engine process. Vice versa. - self.pull_port = pull_port - self.push_port = push_port - - def run(self): - # Initialize these objects after the process is forked since they are - # not picklable. - self.msgpack_encoder = msgpack.Encoder() - self.msgpack_decoder = msgpack.Decoder(DetokenizerInputs) - self.tokenizer = get_tokenizer( - tokenizer_name=self.tokenizer_name, - tokenizer_mode=self.tokenizer_mode, - trust_remote_code=self.trust_remote_code) - # req_id -> RequestState - self.request_states: Dict[str, RequestState] = {} - - self.zmq_context = zmq.Context() - self.pull_socket = self.zmq_context.socket(zmq.PULL) - self.pull_socket.bind(f"tcp://*:{self.pull_port}") - self.push_socket = self.zmq_context.socket(zmq.PUSH) - self.push_socket.bind(f"tcp://*:{self.push_port}") - - while True: - if self.pull_socket.poll(timeout=1000) == 0: - # Nothing to read - continue - message = self.pull_socket.recv() - inputs = self.msgpack_decoder.decode(message) - - for req_id in inputs.free_req_ids: - self.free(req_id) - - detokenized_texts: List[str] = [] - num_output_token_ids: List[int] = [] - num_reqs = len(inputs.req_ids) - for i in range(num_reqs): - req_id = inputs.req_ids[i] - if req_id not in self.request_states: - self.add_request( - request_id=req_id, - prompt_token_ids=inputs.prompt_token_ids[i], - skip_special_tokens=inputs.skip_special_tokens[i], - spaces_between_special_tokens=inputs. 
- spaces_between_special_tokens[i], - ) - new_str = self.detokenize(req_id, inputs.new_token_ids[i]) - detokenized_texts.append(new_str) - req_state = self.request_states[req_id] - num_output_token_ids.append( - len(req_state.token_ids) - req_state.num_prompt_tokens) - - detokenized = DetokenizerOutputs( - req_ids=inputs.req_ids, - detokenized_texts=detokenized_texts, - num_output_token_ids=num_output_token_ids, - ) - self.push_socket.send(self.msgpack_encoder.encode(detokenized), - flags=zmq.NOBLOCK) - - def add_request( - self, - request_id: str, - prompt_token_ids: List[int], - skip_special_tokens: bool, - spaces_between_special_tokens: bool, - ) -> None: - tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens( - tokenizer=self.tokenizer, - prompt_ids=prompt_token_ids, - skip_special_tokens=skip_special_tokens, - ) - self.request_states[request_id] = RequestState( - req_id=request_id, - token_ids=prompt_token_ids, - tokens=tokens, - num_prompt_tokens=len(prompt_token_ids), - prefix_offset=prefix_offset, - read_offset=read_offset, - skip_special_tokens=skip_special_tokens, - spaces_between_special_tokens=spaces_between_special_tokens, - ) - - def free(self, request_id: str) -> None: - del self.request_states[request_id] - - def detokenize(self, request_id: str, new_token_ids: List[int]) -> str: - # TODO(woosuk): This method becomes very inefficient when the number of - # new_token_ids is more than 1. We need to optimize this. - req_state = self.request_states[request_id] - decoded_text = "" - for new_token_id in new_token_ids: - req_state.token_ids.append(new_token_id) - (new_tokens, new_decoded_token_text, prefix_offset, - read_offset) = detokenize_incrementally( - tokenizer=self.tokenizer, - all_input_ids=req_state.token_ids, - prev_tokens=req_state.tokens, - prefix_offset=req_state.prefix_offset, - read_offset=req_state.read_offset, - skip_special_tokens=req_state.skip_special_tokens, - spaces_between_special_tokens=req_state. - spaces_between_special_tokens, - ) - - req_state.tokens.extend(new_tokens) - req_state.prefix_offset = prefix_offset - req_state.read_offset = read_offset - req_state.output_text += new_decoded_token_text - decoded_text += new_decoded_token_text - return decoded_text - - -@dataclass -class RequestState: - - req_id: str - - token_ids: List[int] - tokens: List[str] - num_prompt_tokens: int - - prefix_offset: int - read_offset: int - - skip_special_tokens: bool - spaces_between_special_tokens: bool - - output_text: str = "" From d1c6799b8870e513bf4f2305cbf6cda9fc3d773b Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 11 Nov 2024 15:21:12 -0800 Subject: [PATCH 098/183] [doc] update debugging guide (#10236) Signed-off-by: youkaichao --- docs/source/getting_started/debugging.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index d40222bfd4da8..060599680be25 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -122,6 +122,8 @@ If you are testing with multi-nodes, adjust ``--nproc-per-node`` and ``--nnodes` If the script runs successfully, you should see the message ``sanity check is successful!``. +If the test script hangs or crashes, usually it means the hardware/drivers are broken in some sense. You should try to contact your system administrator or hardware vendor for further assistance. 
As a common workaround, you can try to tune some NCCL environment variables, such as ``export NCCL_P2P_DISABLE=1`` to see if it helps. Please check `their documentation `__ for more information. Please only use these environment variables as a temporary workaround, as they might affect the performance of the system. The best solution is still to fix the hardware/drivers so that the test script can run successfully. + .. note:: A multi-node environment is more complicated than a single-node one. If you see errors such as ``torch.distributed.DistNetworkError``, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments: From 9cdba9669cb32191aa0ae6782c0648be3e0e44ed Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Mon, 11 Nov 2024 20:55:09 -0500 Subject: [PATCH 099/183] [Doc] Update help text for `--distributed-executor-backend` (#10231) Signed-off-by: Russell Bryant --- vllm/config.py | 9 ++++++--- vllm/engine/arg_utils.py | 11 ++++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/vllm/config.py b/vllm/config.py index dc9c06d7fb16e..bb9fee30c8445 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -951,9 +951,12 @@ class ParallelConfig: https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler. placement_group: ray distributed model workers placement group. distributed_executor_backend: Backend to use for distributed model - workers, either "ray" or "mp" (multiprocessing). If either - pipeline_parallel_size or tensor_parallel_size is greater than 1, - will default to "ray" if Ray is installed or "mp" otherwise. + workers, either "ray" or "mp" (multiprocessing). If the product + of pipeline_parallel_size and tensor_parallel_size is less than + or equal to the number of GPUs available, "mp" will be used to + keep processing on a single host. Otherwise, this will default + to "ray" if Ray is installed and fail otherwise. Note that tpu + and hpu only support Ray for distributed inference. """ def __init__( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 02e67f89e5a8d..1591059a89f92 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -369,9 +369,14 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: '--distributed-executor-backend', choices=['ray', 'mp'], default=EngineArgs.distributed_executor_backend, - help='Backend to use for distributed serving. When more than 1 GPU ' - 'is used, will be automatically set to "ray" if installed ' - 'or "mp" (multiprocessing) otherwise.') + help='Backend to use for distributed model ' + 'workers, either "ray" or "mp" (multiprocessing). If the product ' + 'of pipeline_parallel_size and tensor_parallel_size is less than ' + 'or equal to the number of GPUs available, "mp" will be used to ' + 'keep processing on a single host. Otherwise, this will default ' + 'to "ray" if Ray is installed and fail otherwise. 
Note that tpu ' + 'and hpu only support Ray for distributed inference.') + parser.add_argument( '--worker-use-ray', action='store_true', From eea55cca5b0896eab7fa213291090f70c858a3bc Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 11 Nov 2024 18:01:06 -0800 Subject: [PATCH 100/183] [1/N] torch.compile user interface design (#10237) Signed-off-by: youkaichao --- tests/compile/piecewise/test_simple.py | 14 +++++++---- tests/compile/piecewise/test_toy_llama.py | 21 ++++++++++------ vllm/compilation/decorators.py | 27 ++++++++++---------- vllm/config.py | 30 ++++++++++++++--------- 4 files changed, 55 insertions(+), 37 deletions(-) diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index fcfe80d8e4041..c631850ecdedb 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -12,10 +12,9 @@ from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile from vllm.compilation.levels import CompilationLevel +from vllm.config import VllmConfig from vllm.utils import direct_register_custom_op -os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.PIECEWISE) - global_counter = 0 # create a library to hold the custom op @@ -48,7 +47,11 @@ def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, @support_torch_compile class SillyModel(nn.Module): - def __init__(self) -> None: + def __init__(self, + *, + vllm_config: VllmConfig, + prefix: str = '', + **kwargs) -> None: super().__init__() def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -74,11 +77,12 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: def test_simple_piecewise_compile(): - model = SillyModel() - directory = os.path.dirname(__file__) config = os.path.join(directory, "piecewise_compilation_config.json") os.environ["VLLM_TORCH_COMPILE_CONFIG"] = config + os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.PIECEWISE) + + model = SillyModel(vllm_config=VllmConfig(), prefix='') inputs = torch.randn(100).cuda() diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index 73fa9e9906936..c363a587a818e 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -19,6 +19,7 @@ from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile from vllm.compilation.levels import CompilationLevel +from vllm.config import VllmConfig from vllm.plugins import set_compilation_config from vllm.utils import direct_register_custom_op @@ -195,9 +196,15 @@ def forward( return hidden_states, residual +@support_torch_compile class LlamaModel(nn.Module): - def __init__(self, config: LlamaConfig) -> None: + def __init__(self, + *, + vllm_config: VllmConfig, + config: LlamaConfig, + prefix: str = '', + **kwargs) -> None: super().__init__() self.embedding_tokens = nn.Embedding( num_embeddings=config.vocab_size, @@ -265,10 +272,9 @@ def run_model(llama_config, CompilationLevel.NO_COMPILATION) set_compilation_config(None) - cls = LlamaModel - if use_compile: - cls = support_torch_compile(LlamaModel) - model = cls(llama_config).eval().cuda() + model = LlamaModel(config=llama_config, + vllm_config=VllmConfig(), + prefix="").eval().cuda() B = 16 # max batch size input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda() @@ -357,7 +363,6 @@ def test_toy_llama(): def benchmark(): os.environ["VLLM_TORCH_COMPILE_LEVEL"] = 
str(CompilationLevel.PIECEWISE) from triton.testing import do_bench - cls = support_torch_compile(LlamaModel) # similar to llama 3.1-8B llama_config = LlamaConfig(hidden_size=4096, @@ -390,7 +395,9 @@ def benchmark(): else: set_compilation_config(None) - model = cls(llama_config).eval().cuda().to(torch.bfloat16) + model = LlamaModel(config=llama_config, + vllm_config=VllmConfig(), + prefix="").eval().cuda().to(torch.bfloat16) B = 256 # max batch size input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda() diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 3053e57e0b63b..ca1e96a33c014 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -6,6 +6,7 @@ import vllm.envs as envs from vllm.compilation.levels import CompilationLevel from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher +from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.sequence import IntermediateTensors from vllm.utils import supports_dynamo @@ -110,26 +111,26 @@ def _support_torch_compile(cls: type, """ A decorator to add support for compiling the forward method of a class. """ - - # for CompilationLevel.DYNAMO_AS_IS , the upper level model runner - # will handle the compilation, so we don't need to do anything here. - if envs.VLLM_TORCH_COMPILE_LEVEL in [ - CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS - ] or not supports_dynamo(): + if TorchCompileWrapperWithCustomDispatcher in cls.__bases__: + # support decorating multiple times return cls # take care of method resolution order # make sure super().__init__ is called on the base class # other than TorchCompileWrapperWithCustomDispatcher - if TorchCompileWrapperWithCustomDispatcher not in cls.__bases__: - # support decorating multiple times - cls.__bases__ = cls.__bases__ + ( - TorchCompileWrapperWithCustomDispatcher, ) + cls.__bases__ = cls.__bases__ + (TorchCompileWrapperWithCustomDispatcher, ) old_init = cls.__init__ # type: ignore - def __init__(self, *args, **kwargs): - old_init(self, *args, **kwargs) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = '', **kwargs): + old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs) + # for CompilationLevel.DYNAMO_AS_IS , the upper level model runner + # will handle the compilation, so we don't need to do anything here. + self.do_not_compile = envs.VLLM_TORCH_COMPILE_LEVEL in [ + CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS + ] or not supports_dynamo() + if self.do_not_compile: + return TorchCompileWrapperWithCustomDispatcher.__init__(self) cls.__init__ = __init__ # type: ignore @@ -138,7 +139,7 @@ def __call__(self, *args, **kwargs): # torch.compiler.is_compiling() means we are inside the compilation # e.g. TPU has the compilation logic in model runner, so we don't # need to compile the model inside. - if torch.compiler.is_compiling(): + if self.do_not_compile or torch.compiler.is_compiling(): return self.forward(*args, **kwargs) # the first compilation needs to have dynamic shapes marked diff --git a/vllm/config.py b/vllm/config.py index bb9fee30c8445..b354fb61d7b7e 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -2041,12 +2041,15 @@ class VllmConfig: simplifies passing around the distinct configurations in the codebase. 
""" - model_config: ModelConfig - cache_config: CacheConfig - parallel_config: ParallelConfig - scheduler_config: SchedulerConfig - device_config: DeviceConfig - load_config: LoadConfig + model_config: ModelConfig = field(default=None, init=True) # type: ignore + cache_config: CacheConfig = field(default=None, init=True) # type: ignore + parallel_config: ParallelConfig = field(default=None, + init=True) # type: ignore + scheduler_config: SchedulerConfig = field(default=None, + init=True) # type: ignore + device_config: DeviceConfig = field(default=None, + init=True) # type: ignore + load_config: LoadConfig = field(default=None, init=True) # type: ignore lora_config: Optional[LoRAConfig] = None speculative_config: Optional[SpeculativeConfig] = None decoding_config: Optional[DecodingConfig] = None @@ -2091,11 +2094,14 @@ def with_hf_config(self, hf_config: PretrainedConfig) -> "VllmConfig": def __post_init__(self): """Verify configs are valid & consistent with each other. """ - self.model_config.verify_async_output_proc(self.parallel_config, - self.speculative_config, - self.device_config) - self.model_config.verify_with_parallel_config(self.parallel_config) - self.cache_config.verify_with_parallel_config(self.parallel_config) + if self.model_config is not None: + self.model_config.verify_async_output_proc(self.parallel_config, + self.speculative_config, + self.device_config) + self.model_config.verify_with_parallel_config(self.parallel_config) + + if self.cache_config is not None: + self.cache_config.verify_with_parallel_config(self.parallel_config) if self.lora_config: self.lora_config.verify_with_model_config(self.model_config) @@ -2149,4 +2155,4 @@ def __str__(self): self.scheduler_config.num_scheduler_steps, self.cache_config.enable_prefix_caching, self.model_config.use_async_output_proc, - self.model_config.mm_processor_kwargs) \ No newline at end of file + self.model_config.mm_processor_kwargs) From 7f5edb5900c4010c1daa5bfeb3829974d3f6dff1 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Tue, 12 Nov 2024 11:10:15 +0800 Subject: [PATCH 101/183] [Misc][LoRA] Replace hardcoded cuda device with configurable argument (#10223) Signed-off-by: Jee Jee Li --- tests/lora/test_layers.py | 56 +++++++----- tests/lora/test_lora_manager.py | 153 +++++++++++++++++++++++--------- tests/lora/utils.py | 9 +- vllm/lora/models.py | 19 ++-- vllm/lora/punica.py | 15 ++-- vllm/lora/worker_manager.py | 2 + 6 files changed, 174 insertions(+), 80 deletions(-) diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index eb882faf3974a..15e576cb065c7 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -51,6 +51,7 @@ CUDA_DEVICES = [ f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) ] + # We will launch different triton kernels between the prefill and decode # stages, so we need to verify this. 
prefill stage(True) or decode stage(False) STAGES = [True, False] @@ -120,11 +121,12 @@ def populate_loras( subloras: List[LoRALayerWeights] = [] sublora_len = layer_weights.shape[0] // repeats for i in range(repeats): - sublora = DummyLoRAManager().init_random_lora( - module_name=f"fake_{i}", - weight=layer_weights, - generate_embeddings_tensor=generate_embeddings_tensor, - ) + sublora = DummyLoRAManager( + layer_weights.device).init_random_lora( + module_name=f"fake_{i}", + weight=layer_weights, + generate_embeddings_tensor=generate_embeddings_tensor, + ) sublora.lora_b = sublora.lora_b[:, (sublora_len * i):(sublora_len * (i + 1))] sublora.optimize() @@ -152,6 +154,7 @@ def create_random_inputs( input_size: Tuple[int, ...], input_range: Tuple[float, float], input_type: torch.dtype = torch.int, + device: torch.device = "cuda" ) -> Tuple[List[torch.Tensor], List[int], List[int]]: """Creates random inputs. @@ -173,10 +176,14 @@ def create_random_inputs( for _ in range(num_inputs): if input_type == torch.int: inputs.append( - torch.randint(low=int(low), high=int(high), size=input_size)) + torch.randint(low=int(low), + high=int(high), + size=input_size, + device=device)) else: inputs.append( - torch.rand(size=input_size, dtype=input_type) * high + low) + torch.rand(size=input_size, dtype=input_type, device=device) * + high + low) lora_id = random.choice(active_lora_ids) index_mapping += [lora_id] * input_size[0] @@ -191,6 +198,10 @@ def create_random_inputs( @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000]) @pytest.mark.parametrize("stage", STAGES) def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: + # For multi-GPU testing of Triton kernel, we must explicitly set the CUDA + # device, see: https://github.com/triton-lang/triton/issues/2925 + # Same below. 
+ torch.cuda.set_device(device) torch.set_default_device(device) max_loras = 8 @@ -225,7 +236,7 @@ def create_random_embedding_layer(): num_inputs=num_loras * 3, input_size=(200, ), input_range=(1, vocab_size), - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -263,7 +274,7 @@ def create_random_embedding_layer(): num_inputs=num_loras * 3, input_size=(200, ), input_range=(1, vocab_size), - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -291,6 +302,7 @@ def create_random_embedding_layer(): def test_embeddings_with_new_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: + torch.cuda.set_device(device) torch.set_default_device(device) max_loras = 8 punica_wrapper = PunicaWrapper(8192, 256, device) @@ -345,7 +357,7 @@ def create_random_embedding_layer(): num_inputs=num_loras * 3, input_size=(200, ), input_range=(1, vocab_size), - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -400,7 +412,7 @@ def create_random_embedding_layer(): num_inputs=num_loras * 3, input_size=(200, ), input_range=(1, vocab_size), - ) + device=device) original_inputs = deepcopy(inputs) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, @@ -426,6 +438,7 @@ def create_random_embedding_layer(): def test_lm_head_logits_processor(dist_init, num_loras, device, vocab_size, stage) -> None: + torch.cuda.set_device(device) torch.set_default_device(device) max_loras = 8 punica_wrapper = PunicaWrapper(8192, 256, device) @@ -471,7 +484,7 @@ def _pretest(): input_size=(1, 1024), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -520,7 +533,7 @@ def _pretest(): input_size=(1, 1024), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -554,6 +567,7 @@ def _pretest(): @pytest.mark.parametrize("stage", STAGES) def test_linear_replicated(dist_init, num_loras, device, stage) -> None: + torch.cuda.set_device(device) torch.set_default_device(device) punica_wrapper = PunicaWrapper(8192, 256, device) max_loras = 8 @@ -592,7 +606,7 @@ def create_random_linear_replicated_layer(): input_size=(1, 4096), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -631,7 +645,7 @@ def create_random_linear_replicated_layer(): input_size=(1, 4096), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -658,6 +672,7 @@ def create_random_linear_replicated_layer(): def test_linear_parallel(dist_init, num_loras, orientation, fully_shard, device, stage) -> None: + torch.cuda.set_device(device) torch.set_default_device(device) punica_wrapper = PunicaWrapper(8192, 256, device) max_loras = 8 @@ -706,7 +721,7 @@ def create_random_linear_parallel_layer(): input_size=(1, 4096), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -745,7 +760,7 @@ def create_random_linear_parallel_layer(): input_size=(1, 4096), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -772,6 +787,7 @@ def create_random_linear_parallel_layer(): def 
test_column_parallel_packed(dist_init, num_loras, repeats, fully_shard, device, stage) -> None: + torch.cuda.set_device(device) torch.set_default_device(device) punica_wrapper = PunicaWrapper(8192, 256, device) max_loras = 8 @@ -842,7 +858,7 @@ class FakeConfig: input_size=(1, 4096), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -883,7 +899,7 @@ class FakeConfig: input_size=(1, 4096), input_range=(0, 1), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping, is_prefill=stage) @@ -962,7 +978,7 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device, input_size=(1, max_position), input_range=(0, lora_config.lora_extra_vocab_size), input_type=torch.float16, - ) + device=device) lora_mapping = LoRAMapping(index_mapping, prompt_mapping) long_lora_context = LongContextLoRAContext(list(scaling_factors), diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 67cf298b4df2b..8d109b2c81503 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -25,8 +25,13 @@ EMBEDDING_PADDING_MODULES = ["lm_head"] +CUDA_DEVICES = [ + f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2) +] -def test_from_lora_tensors(sql_lora_files): + +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_from_lora_tensors(sql_lora_files, device): tensors = load_file( os.path.join(sql_lora_files, "adapter_model.safetensors")) new_embeddings = load_file( @@ -36,7 +41,7 @@ def test_from_lora_tensors(sql_lora_files): 8, 16, tensors, - "cuda", + device, embeddings=new_embeddings, embedding_modules=EMBEDDING_MODULES, embedding_padding_modules=EMBEDDING_PADDING_MODULES) @@ -46,6 +51,8 @@ def test_from_lora_tensors(sql_lora_files): assert lora.lora_alpha == 16 assert lora.lora_a is not None assert lora.lora_b is not None + assert lora.lora_a.device == torch.device(device) + assert lora.lora_b.device == torch.device(device) assert (lora.lora_a.shape[1] == lora.lora_b.shape[0] ), f"{lora.lora_a.shape=}, {lora.lora_b.shape=}" assert lora.lora_a.shape[1] == 8 @@ -60,8 +67,8 @@ def test_from_lora_tensors(sql_lora_files): assert lora.embeddings_tensor is None -def create_lora(lora_id: int, model: nn.Module, - sub_modules: List[str]) -> LoRAModel: +def create_lora(lora_id: int, model: nn.Module, sub_modules: List[str], + device: torch.device) -> LoRAModel: loras: Dict[str, LoRALayerWeights] = {} for name in sub_modules: w = model.get_submodule(name).weight @@ -69,8 +76,8 @@ def create_lora(lora_id: int, model: nn.Module, name, 8, 16, - torch.rand([w.shape[1], 8], device="cuda"), - torch.rand([8, w.shape[0]], device="cuda"), + torch.rand([w.shape[1], 8], device=device), + torch.rand([8, w.shape[0]], device=device), ) return LoRAModel(lora_id, 8, loras) @@ -80,6 +87,7 @@ def create_packed_lora( model: nn.Module, module_name, replaced_module_names, + device: torch.device, empty_replaced_module_name=None, ) -> LoRAModel: w = model.get_submodule(module_name).weight @@ -91,9 +99,9 @@ def create_packed_lora( replaced_module_name, 8, 16, - torch.rand([w.shape[1], 8], device="cuda"), + torch.rand([w.shape[1], 8], device=device), torch.rand([8, w.shape[0] // len(replaced_module_names)], - device="cuda"), + device=device), ) return LoRAModel(lora_id, 8, loras) @@ -104,7 +112,8 @@ def test_replace_submodules(dist_init, dummy_model): model.packed_modules_mapping = {} manager = LoRAModelManager( model, 1, 1, 1, - 
LoRAConfig(max_lora_rank=8, max_cpu_loras=8, max_loras=8)) + LoRAConfig(max_lora_rank=8, max_cpu_loras=8, max_loras=8), + torch.device("cuda")) model = manager.model assert isinstance(model.get_submodule("dense1"), @@ -116,16 +125,28 @@ def test_replace_submodules(dist_init, dummy_model): RowParallelLinearWithLoRA) -def test_lora_model_manager(dist_init, dummy_model): +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_lora_model_manager(dist_init, dummy_model, device): model = dummy_model model.supported_lora_modules = ["dense1", "dense2", "lm_head"] model.packed_modules_mapping = {} - model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) - model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) - model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) - manager = LoRAModelManager( - model, 2, 2, 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2)) + model_lora1 = create_lora(1, + model, ["layer1.dense1", "dense2", "lm_head"], + device=device) + model_lora2 = create_lora(2, + model, ["dense1", "dense2", "lm_head"], + device=device) + model_lora3 = create_lora(3, + model, ["dense1", "dense2", "lm_head"], + device=device) + manager = LoRAModelManager(model, + 2, + 2, + 2, + LoRAConfig(max_lora_rank=8, + max_cpu_loras=3, + max_loras=2), + device=device) assert all(x is None for x in manager.lora_index_to_id) assert manager.add_adapter(model_lora1) assert manager.activate_adapter(1) @@ -161,17 +182,32 @@ def test_lora_model_manager(dist_init, dummy_model): assert manager.lora_index_to_id[0] == 3 assert manager.lora_index_to_id[1] == 2 + assert manager.device == device + assert manager.punica_wrapper.device == device -def test_lora_lru_cache_model_manager(dist_init, dummy_model): + +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_lora_lru_cache_model_manager(dist_init, dummy_model, device): model = dummy_model model.supported_lora_modules = ["dense1", "dense2", "lm_head"] model.packed_modules_mapping = {} - model_lora1 = create_lora(1, model, ["layer1.dense1", "dense2", "lm_head"]) - model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) - model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) - manager = LRUCacheLoRAModelManager( - model, 2, 2, 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=3, max_loras=2)) + model_lora1 = create_lora(1, + model, ["layer1.dense1", "dense2", "lm_head"], + device=device) + model_lora2 = create_lora(2, + model, ["dense1", "dense2", "lm_head"], + device=device) + model_lora3 = create_lora(3, + model, ["dense1", "dense2", "lm_head"], + device=device) + manager = LRUCacheLoRAModelManager(model, + 2, + 2, + 2, + LoRAConfig(max_lora_rank=8, + max_cpu_loras=3, + max_loras=2), + device=device) assert all(x is None for x in manager.lora_index_to_id) assert manager.add_adapter(model_lora1) assert manager.activate_adapter(1) @@ -238,20 +274,37 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model): with pytest.raises(ValueError): assert manager.pin_adapter(3) + assert manager.punica_wrapper.device == device + assert manager.device == device + -def test_lru_lora_model_manager(dist_init, dummy_model): +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_lru_lora_model_manager(dist_init, dummy_model, device): # This tests just the LRU cache functionality, everything else is # tested in test_lora_model_manager model = dummy_model model.supported_lora_modules = ["dense1", "dense2", "lm_head"] model.packed_modules_mapping = {} - model_lora1 = create_lora(1, 
model, ["layer1.dense1", "dense2", "lm_head"]) - model_lora2 = create_lora(2, model, ["dense1", "dense2", "lm_head"]) - model_lora3 = create_lora(3, model, ["dense1", "dense2", "lm_head"]) - model_lora4 = create_lora(4, model, ["dense1", "dense2", "lm_head"]) - manager = LRUCacheLoRAModelManager( - model, 2, 2, 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2)) + model_lora1 = create_lora(1, + model, ["layer1.dense1", "dense2", "lm_head"], + device=device) + model_lora2 = create_lora(2, + model, ["dense1", "dense2", "lm_head"], + device=device) + model_lora3 = create_lora(3, + model, ["dense1", "dense2", "lm_head"], + device=device) + model_lora4 = create_lora(4, + model, ["dense1", "dense2", "lm_head"], + device=device) + manager = LRUCacheLoRAModelManager(model, + 2, + 2, + 2, + LoRAConfig(max_lora_rank=8, + max_cpu_loras=2, + max_loras=2), + device=device) assert all(x is None for x in manager.lora_index_to_id) @@ -351,14 +404,17 @@ def test_lru_lora_model_manager(dist_init, dummy_model): assert manager.remove_oldest_adapter() assert set(manager.list_adapters()) == {1} + assert manager.punica_wrapper.device == device + assert manager.device == device +@pytest.mark.parametrize("device", CUDA_DEVICES) def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, - sql_lora_files): + sql_lora_files, device): lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) worker_adapter_manager = LRUCacheWorkerLoRAManager( 4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size - - lora_config.lora_extra_vocab_size, lora_config, torch.device("cuda"), + lora_config.lora_extra_vocab_size, lora_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES) worker_adapter_manager.create_lora_manager( llama_2_7b_model_extra_embeddings) @@ -426,14 +482,19 @@ def test_lru_cache_worker_adapter_manager(llama_2_7b_model_extra_embeddings, LoRARequest("14", 14, sql_lora_files) ], mapping) + assert worker_adapter_manager.device == device + assert (worker_adapter_manager._adapter_manager.punica_wrapper.device == + device) + +@pytest.mark.parametrize("device", CUDA_DEVICES) def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings, - sql_lora_files): + sql_lora_files, device): # Should remove every LoRA not specified in the request. 
lora_config = LoRAConfig(max_lora_rank=8, max_cpu_loras=4, max_loras=4) worker_adapter_manager = WorkerLoRAManager( 4, 2, llama_2_7b_model_extra_embeddings.unpadded_vocab_size - - lora_config.lora_extra_vocab_size, lora_config, torch.device("cuda"), + lora_config.lora_extra_vocab_size, lora_config, device, EMBEDDING_MODULES, EMBEDDING_PADDING_MODULES) worker_adapter_manager.create_lora_manager( llama_2_7b_model_extra_embeddings) @@ -497,8 +558,13 @@ def test_worker_adapter_manager(llama_2_7b_model_extra_embeddings, LoRARequest("14", 14, sql_lora_files) ], mapping) + assert worker_adapter_manager.device == device + assert (worker_adapter_manager._adapter_manager.punica_wrapper.device == + device) + -def test_packed_loras(dist_init, dummy_model_gate_up): +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_packed_loras(dist_init, dummy_model_gate_up, device): model = dummy_model_gate_up model.supported_lora_modules = ["gate_up_proj"] model.packed_modules_mapping = { @@ -511,18 +577,25 @@ def test_packed_loras(dist_init, dummy_model_gate_up): 1, model, module_name="gate_up_proj", - replaced_module_names=["gate_proj", "up_proj"]) + replaced_module_names=["gate_proj", "up_proj"], + device=device) model_lora1 = create_packed_lora( 2, model, module_name="gate_up_proj", replaced_module_names=["gate_proj", "up_proj"], + device=device, empty_replaced_module_name="gate_proj", ) - manager = LoRAModelManager( - model, 2, 2, 2, - LoRAConfig(max_lora_rank=8, max_cpu_loras=2, max_loras=2)) + manager = LoRAModelManager(model, + 2, + 2, + 2, + LoRAConfig(max_lora_rank=8, + max_cpu_loras=2, + max_loras=2), + device=device) model = manager.model assert isinstance(model.get_submodule("gate_up_proj"), diff --git a/tests/lora/utils.py b/tests/lora/utils.py index 00f8e26d1041f..e394c33b3f9ea 100644 --- a/tests/lora/utils.py +++ b/tests/lora/utils.py @@ -7,9 +7,10 @@ class DummyLoRAManager: - def __init__(self): + def __init__(self, device: torch.device = "cuda:0"): super().__init__() self._loras: Dict[str, LoRALayerWeights] = {} + self._device = device def set_module_lora(self, module_name: str, lora: LoRALayerWeights): self._loras[module_name] = lora @@ -28,16 +29,16 @@ def init_random_lora(self, lora_alpha=1, lora_a=torch.rand([weight.shape[1], rank], dtype=weight.dtype, - device="cuda"), + device=self._device), lora_b=torch.rand([rank, weight.shape[0]], dtype=weight.dtype, - device="cuda"), + device=self._device), ) if generate_embeddings_tensor: lora.embeddings_tensor = torch.rand(5, generate_embeddings_tensor, dtype=weight.dtype, - device="cuda") + device=self._device) self.set_module_lora(module_name, lora) return lora diff --git a/vllm/lora/models.py b/vllm/lora/models.py index 81e274612b73b..eafc3a43a2846 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -301,6 +301,7 @@ def __init__( max_num_batched_tokens: int, vocab_size: int, lora_config: LoRAConfig, + device: torch.device, ): """Create a LoRAModelManager and adapter for a given model. @@ -314,6 +315,7 @@ def __init__( lora_config: the LoRA configuration. """ self.lora_config = lora_config + self.device = device self.max_num_seqs = max_num_seqs assert self.capacity >= self.lora_slots self.max_num_batched_tokens = math.ceil(max_num_batched_tokens / 8) * 8 @@ -322,7 +324,7 @@ def __init__( self.long_lora_context: Optional[LongContextLoRAContext] = None self.punica_wrapper = PunicaWrapper(max_num_batched_tokens, max_batches=self.max_num_seqs, - device="cuda") + device=self.device) # Scaling factor -> offset to the sin_cos_cache to it. 
# Used for long context lora. self.scaling_factor_to_offset: Dict[float, int] = {} @@ -653,16 +655,11 @@ def __init__(self, capacity: int, deactivate_lora_fn: Callable[[int], class LRUCacheLoRAModelManager(LoRAModelManager): """A model manager that manages multiple LoRAs with LRU cache.""" - def __init__( - self, - model: nn.Module, - max_num_seqs: int, - max_num_batched_tokens: int, - vocab_size: int, - lora_config: LoRAConfig, - ): + def __init__(self, model: nn.Module, max_num_seqs: int, + max_num_batched_tokens: int, vocab_size: int, + lora_config: LoRAConfig, device: torch.device): super().__init__(model, max_num_seqs, max_num_batched_tokens, - vocab_size, lora_config) + vocab_size, lora_config, device) self._registered_adapters: LoRALRUCache = LoRALRUCache( self.capacity, self.deactivate_adapter) self._active_adapters: LoRALRUCache = LoRALRUCache( @@ -732,6 +729,7 @@ def create_lora_manager( max_num_batched_tokens: int, vocab_size: int, lora_config: LoRAConfig, + device: torch.device, lora_manager_cls: Type[LoRAModelManager] = LoRAModelManager, **kwargs) -> LoRAModelManager: """Create a LoRA adapter for a given model.""" @@ -743,5 +741,6 @@ def create_lora_manager( max_num_batched_tokens=max_num_batched_tokens, vocab_size=vocab_size, lora_config=lora_config, + device=device, **kwargs) return lora_manager diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py index 5033ce4126929..082041f390750 100644 --- a/vllm/lora/punica.py +++ b/vllm/lora/punica.py @@ -62,6 +62,7 @@ def convert_mapping( max_loras: int, vocab_size: int, extra_vocab_size: int, + device: torch.device, long_lora_context: Optional["LongContextLoRAContext"] = None, ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor], List[int]]: @@ -104,7 +105,7 @@ def convert_mapping( long_lora_offsets: Optional[torch.Tensor] = None if long_lora_context: long_lora_offsets = torch.zeros(len(index_mapping_indices), - device="cuda", + device=device, dtype=torch.long) prompt_mapping: List[int] = [ lora_index_to_id.index(x) if x > 0 else -1 @@ -131,10 +132,10 @@ def convert_mapping( if long_lora_context: assert long_lora_offsets is not None indices_list.append(long_lora_offsets) - indices = torch.tensor(indices_list, dtype=torch.long, device="cuda") + indices = torch.tensor(indices_list, dtype=torch.long, device=device) prompt_mapping_tensor = torch.tensor(prompt_mapping, - device="cuda", - dtype=torch.long) + dtype=torch.long, + device=device) embeddings_indices = torch.stack([ indices[2] * extra_vocab_size, indices[2] * (vocab_size + extra_vocab_size), @@ -145,7 +146,7 @@ def convert_mapping( sampler_indices_padded = sampler_indices.clone() sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1 sampler_indices_padded = torch.arange( - 0, len(sampler_indices_padded), device="cuda", dtype=torch.long) + ( + 0, len(sampler_indices_padded), device=device, dtype=torch.long) + ( sampler_indices_padded * len(sampler_indices_padded)) long_lora_indices = None long_lora_indices_len: Optional[int] = None @@ -183,7 +184,7 @@ class PunicaWrapper: """ def __init__(self, max_num_batched_tokens: int, max_batches: int, - device: str): + device: Union[torch.device, str]): self._token_lora_indices = torch.empty(max_num_batched_tokens, dtype=torch.long, device=device) @@ -215,6 +216,7 @@ def __init__(self, max_num_batched_tokens: int, max_batches: int, self._lora_indices_per_batch = torch.empty(max_batches, dtype=torch.long, device=device) + self.device: torch.device = device self.max_length: int = 0 
self.token_nums: int = 0 self.batch_size: int = -1 @@ -263,6 +265,7 @@ def _update_base_metadata( max_loras, vocab_size, extra_vocab_size, + self.device, long_lora_context, ) self._token_lora_indices[:base_indices.shape[0]].copy_(base_indices) diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py index 724c308a07a27..93a5e27621912 100644 --- a/vllm/lora/worker_manager.py +++ b/vllm/lora/worker_manager.py @@ -73,6 +73,7 @@ def create_lora_manager( max_num_batched_tokens=self.max_num_batched_tokens, vocab_size=self.vocab_size, lora_config=self.lora_config, + device=self.device, lora_manager_cls=self._manager_cls, ) self._adapter_manager = lora_manager @@ -176,6 +177,7 @@ def create_lora_manager( max_num_seqs=self.max_num_seqs, vocab_size=self.vocab_size, lora_config=self.lora_config, + device=self.device, max_num_batched_tokens=self.max_num_batched_tokens, ) self._adapter_manager = lora_manager From 812c981fa00a8b2b95865c6e76b6c3735a56d7d9 Mon Sep 17 00:00:00 2001 From: Aleksandr Malyshev <164964928+maleksan85@users.noreply.github.com> Date: Mon, 11 Nov 2024 22:55:07 -0800 Subject: [PATCH 102/183] Splitting attention kernel file (#10091) Signed-off-by: maleksan85 Co-authored-by: Aleksandr Malyshev --- CMakeLists.txt | 5 +- ...ntion_kernels.cu => attention_kernels.cuh} | 326 ------------------ csrc/attention/paged_attention_v1.cu | 193 +++++++++++ csrc/attention/paged_attention_v2.cu | 203 +++++++++++ 4 files changed, 399 insertions(+), 328 deletions(-) rename csrc/attention/{attention_kernels.cu => attention_kernels.cuh} (64%) create mode 100644 csrc/attention/paged_attention_v1.cu create mode 100644 csrc/attention/paged_attention_v2.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index 376565583d928..5acbd762ee957 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,7 +37,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12") set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0") # Supported AMD GPU architectures. -set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100") +set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101") # # Supported/expected torch versions for CUDA/ROCm. 
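The HIP architecture list above now also covers ``gfx1101``. When it is unclear which architecture string a local AMD GPU reports, a quick check such as the following can help (a sketch that assumes a ROCm build of PyTorch, where the device properties expose ``gcnArchName``; the attribute is absent on CUDA builds)::

    import torch

    # Architectures listed in CMakeLists.txt above.
    SUPPORTED = {"gfx906", "gfx908", "gfx90a", "gfx940", "gfx941", "gfx942",
                 "gfx1030", "gfx1100", "gfx1101"}

    if torch.cuda.is_available():
        props = torch.cuda.get_device_properties(0)
        # ROCm reports e.g. "gfx90a:sramecc+:xnack-"; keep only the base name.
        arch = getattr(props, "gcnArchName", "").split(":")[0]
        print(arch, "supported" if arch in SUPPORTED else "not in the list above")
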
@@ -187,7 +187,8 @@ message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}") set(VLLM_EXT_SRC "csrc/cache_kernels.cu" - "csrc/attention/attention_kernels.cu" + "csrc/attention/paged_attention_v1.cu" + "csrc/attention/paged_attention_v2.cu" "csrc/pos_encoding_kernels.cu" "csrc/activation_kernels.cu" "csrc/layernorm_kernels.cu" diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cuh similarity index 64% rename from csrc/attention/attention_kernels.cu rename to csrc/attention/attention_kernels.cuh index bcd170411e7cb..563e1438f0b01 100644 --- a/csrc/attention/attention_kernels.cu +++ b/csrc/attention/attention_kernels.cuh @@ -670,332 +670,6 @@ __global__ void paged_attention_v2_reduce_kernel( } // namespace vllm -#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \ - VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \ - ((void*)vllm::paged_attention_v1_kernel), \ - shared_mem_size); \ - vllm::paged_attention_v1_kernel \ - <<>>( \ - out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, \ - scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \ - alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \ - k_scale, v_scale, tp_rank, blocksparse_local_blocks, \ - blocksparse_vert_stride, blocksparse_block_size, \ - blocksparse_head_sliding_step); - -// TODO(woosuk): Tune NUM_THREADS. -template -void paged_attention_v1_launcher( - torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, int num_kv_heads, float scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, - const c10::optional& alibi_slopes, float k_scale, - float v_scale, const int tp_rank, const int blocksparse_local_blocks, - const int blocksparse_vert_stride, const int blocksparse_block_size, - const int blocksparse_head_sliding_step) { - int num_seqs = query.size(0); - int num_heads = query.size(1); - int head_size = query.size(2); - int max_num_blocks_per_seq = block_tables.size(1); - int q_stride = query.stride(0); - int kv_block_stride = key_cache.stride(0); - int kv_head_stride = key_cache.stride(1); - - [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1); - assert(head_size % thread_group_size == 0); - - // NOTE: alibi_slopes is optional. - const float* alibi_slopes_ptr = - alibi_slopes - ? reinterpret_cast(alibi_slopes.value().data_ptr()) - : nullptr; - - T* out_ptr = reinterpret_cast(out.data_ptr()); - T* query_ptr = reinterpret_cast(query.data_ptr()); - CACHE_T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); - CACHE_T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); - int* block_tables_ptr = block_tables.data_ptr(); - int* seq_lens_ptr = seq_lens.data_ptr(); - - constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; - int padded_max_seq_len = - DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE; - int logits_size = padded_max_seq_len * sizeof(float); - int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float); - // Python-side check in vllm.worker.worker._check_if_can_support_max_seq_len - // Keep that in sync with the logic here! - int shared_mem_size = std::max(logits_size, outputs_size); - - dim3 grid(num_heads, num_seqs, 1); - dim3 block(NUM_THREADS); - const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - switch (head_size) { - // NOTE(woosuk): To reduce the compilation time, we only compile for the - // head sizes that we use in the model. 
However, we can easily extend this - // to support any head size which is a multiple of 16. - case 64: - LAUNCH_PAGED_ATTENTION_V1(64); - break; - case 80: - LAUNCH_PAGED_ATTENTION_V1(80); - break; - case 96: - LAUNCH_PAGED_ATTENTION_V1(96); - break; - case 112: - LAUNCH_PAGED_ATTENTION_V1(112); - break; - case 120: - LAUNCH_PAGED_ATTENTION_V1(120); - break; - case 128: - LAUNCH_PAGED_ATTENTION_V1(128); - break; - case 192: - LAUNCH_PAGED_ATTENTION_V1(192); - break; - case 256: - LAUNCH_PAGED_ATTENTION_V1(256); - break; - default: - TORCH_CHECK(false, "Unsupported head size: ", head_size); - break; - } -} - -#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \ - paged_attention_v1_launcher( \ - out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \ - seq_lens, max_seq_len, alibi_slopes, k_scale, v_scale, tp_rank, \ - blocksparse_local_blocks, blocksparse_vert_stride, \ - blocksparse_block_size, blocksparse_head_sliding_step); - -#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ - switch (is_block_sparse) { \ - case true: \ - CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \ - break; \ - case false: \ - CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \ - break; \ - } - -// NOTE(woosuk): To reduce the compilation time, we omitted block sizes -// 1, 2, 4, 64, 128, 256. -#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \ - switch (block_size) { \ - case 8: \ - CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \ - break; \ - case 16: \ - CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \ - break; \ - case 32: \ - CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \ - break; \ - default: \ - TORCH_CHECK(false, "Unsupported block size: ", block_size); \ - break; \ - } - -void paged_attention_v1( - torch::Tensor& out, // [num_seqs, num_heads, head_size] - torch::Tensor& query, // [num_seqs, num_heads, head_size] - torch::Tensor& - key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] - torch::Tensor& - value_cache, // [num_blocks, num_heads, head_size, block_size] - int64_t num_kv_heads, // [num_heads] - double scale, - torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] - torch::Tensor& seq_lens, // [num_seqs] - int64_t block_size, int64_t max_seq_len, - const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, - const int64_t tp_rank, const int64_t blocksparse_local_blocks, - const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, - const int64_t blocksparse_head_sliding_step) { - const bool is_block_sparse = (blocksparse_vert_stride > 1); - - DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, - CALL_V1_LAUNCHER_BLOCK_SIZE) -} - -#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \ - vllm::paged_attention_v2_kernel \ - <<>>( \ - exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \ - value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \ - seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \ - kv_block_stride, kv_head_stride, k_scale, v_scale, tp_rank, \ - blocksparse_local_blocks, blocksparse_vert_stride, \ - blocksparse_block_size, blocksparse_head_sliding_step); \ - vllm::paged_attention_v2_reduce_kernel \ - <<>>( \ - out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, seq_lens_ptr, \ - max_num_partitions); - -template -void paged_attention_v2_launcher( - torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, - torch::Tensor& 
tmp_out, torch::Tensor& query, torch::Tensor& key_cache, - torch::Tensor& value_cache, int num_kv_heads, float scale, - torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, - const c10::optional& alibi_slopes, float k_scale, - float v_scale, const int tp_rank, const int blocksparse_local_blocks, - const int blocksparse_vert_stride, const int blocksparse_block_size, - const int blocksparse_head_sliding_step) { - int num_seqs = query.size(0); - int num_heads = query.size(1); - int head_size = query.size(2); - int max_num_blocks_per_seq = block_tables.size(1); - int q_stride = query.stride(0); - int kv_block_stride = key_cache.stride(0); - int kv_head_stride = key_cache.stride(1); - - [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1); - assert(head_size % thread_group_size == 0); - - // NOTE: alibi_slopes is optional. - const float* alibi_slopes_ptr = - alibi_slopes - ? reinterpret_cast(alibi_slopes.value().data_ptr()) - : nullptr; - - T* out_ptr = reinterpret_cast(out.data_ptr()); - float* exp_sums_ptr = reinterpret_cast(exp_sums.data_ptr()); - float* max_logits_ptr = reinterpret_cast(max_logits.data_ptr()); - T* tmp_out_ptr = reinterpret_cast(tmp_out.data_ptr()); - T* query_ptr = reinterpret_cast(query.data_ptr()); - CACHE_T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); - CACHE_T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); - int* block_tables_ptr = block_tables.data_ptr(); - int* seq_lens_ptr = seq_lens.data_ptr(); - - constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; - int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE); - int logits_size = PARTITION_SIZE * sizeof(float); - int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float); - - // For paged attention v2 kernel. - dim3 grid(num_heads, num_seqs, max_num_partitions); - int shared_mem_size = std::max(logits_size, outputs_size); - // For paged attention v2 reduce kernel. - dim3 reduce_grid(num_heads, num_seqs); - int reduce_shared_mem_size = 2 * max_num_partitions * sizeof(float); - - dim3 block(NUM_THREADS); - const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); - const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - switch (head_size) { - // NOTE(woosuk): To reduce the compilation time, we only compile for the - // head sizes that we use in the model. However, we can easily extend this - // to support any head size which is a multiple of 16. 
- case 64: - LAUNCH_PAGED_ATTENTION_V2(64); - break; - case 80: - LAUNCH_PAGED_ATTENTION_V2(80); - break; - case 96: - LAUNCH_PAGED_ATTENTION_V2(96); - break; - case 112: - LAUNCH_PAGED_ATTENTION_V2(112); - break; - case 120: - LAUNCH_PAGED_ATTENTION_V2(120); - break; - case 128: - LAUNCH_PAGED_ATTENTION_V2(128); - break; - case 192: - LAUNCH_PAGED_ATTENTION_V2(192); - break; - case 256: - LAUNCH_PAGED_ATTENTION_V2(256); - break; - default: - TORCH_CHECK(false, "Unsupported head size: ", head_size); - break; - } -} - -#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \ - paged_attention_v2_launcher( \ - out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ - num_kv_heads, scale, block_tables, seq_lens, max_seq_len, alibi_slopes, \ - k_scale, v_scale, tp_rank, blocksparse_local_blocks, \ - blocksparse_vert_stride, blocksparse_block_size, \ - blocksparse_head_sliding_step); - -#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ - switch (is_block_sparse) { \ - case true: \ - CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \ - break; \ - case false: \ - CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \ - break; \ - } - -// NOTE(woosuk): To reduce the compilation time, we omitted block sizes -// 1, 2, 4, 64, 128, 256. -#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \ - switch (block_size) { \ - case 8: \ - CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \ - break; \ - case 16: \ - CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \ - break; \ - case 32: \ - CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \ - break; \ - default: \ - TORCH_CHECK(false, "Unsupported block size: ", block_size); \ - break; \ - } - -void paged_attention_v2( - torch::Tensor& out, // [num_seqs, num_heads, head_size] - torch::Tensor& exp_sums, // [num_seqs, num_heads, max_num_partitions] - torch::Tensor& max_logits, // [num_seqs, num_heads, max_num_partitions] - torch::Tensor& - tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size] - torch::Tensor& query, // [num_seqs, num_heads, head_size] - torch::Tensor& - key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] - torch::Tensor& - value_cache, // [num_blocks, num_heads, head_size, block_size] - int64_t num_kv_heads, // [num_heads] - double scale, - torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] - torch::Tensor& seq_lens, // [num_seqs] - int64_t block_size, int64_t max_seq_len, - const c10::optional& alibi_slopes, - const std::string& kv_cache_dtype, double k_scale, double v_scale, - const int64_t tp_rank, const int64_t blocksparse_local_blocks, - const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, - const int64_t blocksparse_head_sliding_step) { - const bool is_block_sparse = (blocksparse_vert_stride > 1); - DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, - CALL_V2_LAUNCHER_BLOCK_SIZE) -} - #undef WARP_SIZE #undef MAX #undef MIN diff --git a/csrc/attention/paged_attention_v1.cu b/csrc/attention/paged_attention_v1.cu new file mode 100644 index 0000000000000..8b99f0843aaf6 --- /dev/null +++ b/csrc/attention/paged_attention_v1.cu @@ -0,0 +1,193 @@ +/* + * Adapted from + * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp + * Copyright (c) 2023, The vLLM team. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "attention_kernels.cuh"
+
+#ifndef USE_ROCM
+  #define WARP_SIZE 32
+#else
+  #define WARP_SIZE warpSize
+#endif
+
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b))
+
+#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE)                                  \
+  VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(                       \
+      ((void*)vllm::paged_attention_v1_kernel<T, CACHE_T, HEAD_SIZE,          \
+                                              BLOCK_SIZE, NUM_THREADS,        \
+                                              KV_DTYPE, IS_BLOCK_SPARSE>),    \
+      shared_mem_size);                                                       \
+  vllm::paged_attention_v1_kernel<T, CACHE_T, HEAD_SIZE, BLOCK_SIZE,          \
+                                  NUM_THREADS, KV_DTYPE, IS_BLOCK_SPARSE>     \
+      <<<grid, block, shared_mem_size, stream>>>(                             \
+          out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads,   \
+          scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq,      \
+          alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride,        \
+          k_scale, v_scale, tp_rank, blocksparse_local_blocks,                \
+          blocksparse_vert_stride, blocksparse_block_size,                    \
+          blocksparse_head_sliding_step);
+
+// TODO(woosuk): Tune NUM_THREADS.
+template <typename T, typename CACHE_T, int BLOCK_SIZE,
+          vllm::Fp8KVCacheDataType KV_DTYPE, bool IS_BLOCK_SPARSE,
+          int NUM_THREADS = 128>
+void paged_attention_v1_launcher(
+    torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
+    torch::Tensor& value_cache, int num_kv_heads, float scale,
+    torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
+    const c10::optional<torch::Tensor>& alibi_slopes, float k_scale,
+    float v_scale, const int tp_rank, const int blocksparse_local_blocks,
+    const int blocksparse_vert_stride, const int blocksparse_block_size,
+    const int blocksparse_head_sliding_step) {
+  int num_seqs = query.size(0);
+  int num_heads = query.size(1);
+  int head_size = query.size(2);
+  int max_num_blocks_per_seq = block_tables.size(1);
+  int q_stride = query.stride(0);
+  int kv_block_stride = key_cache.stride(0);
+  int kv_head_stride = key_cache.stride(1);
+
+  [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
+  assert(head_size % thread_group_size == 0);
+
+  // NOTE: alibi_slopes is optional.
+  const float* alibi_slopes_ptr =
+      alibi_slopes
+          ? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
+          : nullptr;
+
+  T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
+  T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
+  CACHE_T* key_cache_ptr = reinterpret_cast<CACHE_T*>(key_cache.data_ptr());
+  CACHE_T* value_cache_ptr =
+      reinterpret_cast<CACHE_T*>(value_cache.data_ptr());
+  int* block_tables_ptr = block_tables.data_ptr<int>();
+  int* seq_lens_ptr = seq_lens.data_ptr<int>();
+
+  constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
+  int padded_max_seq_len =
+      DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE;
+  int logits_size = padded_max_seq_len * sizeof(float);
+  int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
+  // Python-side check in vllm.worker.worker._check_if_can_support_max_seq_len
+  // Keep that in sync with the logic here!
+  int shared_mem_size = std::max(logits_size, outputs_size);
+
+  dim3 grid(num_heads, num_seqs, 1);
+  dim3 block(NUM_THREADS);
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  switch (head_size) {
+    // NOTE(woosuk): To reduce the compilation time, we only compile for the
+    // head sizes that we use in the model. However, we can easily extend this
+    // to support any head size which is a multiple of 16.
+    case 64:
+      LAUNCH_PAGED_ATTENTION_V1(64);
+      break;
+    case 80:
+      LAUNCH_PAGED_ATTENTION_V1(80);
+      break;
+    case 96:
+      LAUNCH_PAGED_ATTENTION_V1(96);
+      break;
+    case 112:
+      LAUNCH_PAGED_ATTENTION_V1(112);
+      break;
+    case 120:
+      LAUNCH_PAGED_ATTENTION_V1(120);
+      break;
+    case 128:
+      LAUNCH_PAGED_ATTENTION_V1(128);
+      break;
+    case 192:
+      LAUNCH_PAGED_ATTENTION_V1(192);
+      break;
+    case 256:
+      LAUNCH_PAGED_ATTENTION_V1(256);
+      break;
+    default:
+      TORCH_CHECK(false, "Unsupported head size: ", head_size);
+      break;
+  }
+}
+
+#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE)   \
+  paged_attention_v1_launcher<T, CACHE_T, BLOCK_SIZE, KV_DTYPE,               \
+                              IS_BLOCK_SPARSE>(                               \
+      out, query, key_cache, value_cache, num_kv_heads, scale, block_tables,  \
+      seq_lens, max_seq_len, alibi_slopes, k_scale, v_scale, tp_rank,         \
+      blocksparse_local_blocks, blocksparse_vert_stride,                      \
+      blocksparse_block_size, blocksparse_head_sliding_step);
+
+#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE)    \
+  switch (is_block_sparse) {                                                  \
+    case true:                                                                \
+      CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true);        \
+      break;                                                                  \
+    case false:                                                               \
+      CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false);       \
+      break;                                                                  \
+  }
+
+// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
+// 1, 2, 4, 64, 128, 256.
+#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \ + switch (block_size) { \ + case 8: \ + CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \ + break; \ + case 16: \ + CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \ + break; \ + case 32: \ + CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \ + break; \ + default: \ + TORCH_CHECK(false, "Unsupported block size: ", block_size); \ + break; \ + } + +void paged_attention_v1( + torch::Tensor& out, // [num_seqs, num_heads, head_size] + torch::Tensor& query, // [num_seqs, num_heads, head_size] + torch::Tensor& + key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] + torch::Tensor& + value_cache, // [num_blocks, num_heads, head_size, block_size] + int64_t num_kv_heads, // [num_heads] + double scale, + torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] + torch::Tensor& seq_lens, // [num_seqs] + int64_t block_size, int64_t max_seq_len, + const c10::optional& alibi_slopes, + const std::string& kv_cache_dtype, double k_scale, double v_scale, + const int64_t tp_rank, const int64_t blocksparse_local_blocks, + const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, + const int64_t blocksparse_head_sliding_step) { + const bool is_block_sparse = (blocksparse_vert_stride > 1); + + DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, + CALL_V1_LAUNCHER_BLOCK_SIZE) +} + +#undef WARP_SIZE +#undef MAX +#undef MIN +#undef DIVIDE_ROUND_UP \ No newline at end of file diff --git a/csrc/attention/paged_attention_v2.cu b/csrc/attention/paged_attention_v2.cu new file mode 100644 index 0000000000000..3a7a9dee916aa --- /dev/null +++ b/csrc/attention/paged_attention_v2.cu @@ -0,0 +1,203 @@ +/* + * Adapted from + * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp + * Copyright (c) 2023, The vLLM team. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "attention_kernels.cuh" + +#ifndef USE_ROCM + #define WARP_SIZE 32 +#else + #define WARP_SIZE warpSize +#endif + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) +#define DIVIDE_ROUND_UP(a, b) (((a) + (b) - 1) / (b)) + +#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \ + vllm::paged_attention_v2_kernel \ + <<>>( \ + exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \ + value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \ + seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \ + kv_block_stride, kv_head_stride, k_scale, v_scale, tp_rank, \ + blocksparse_local_blocks, blocksparse_vert_stride, \ + blocksparse_block_size, blocksparse_head_sliding_step); \ + vllm::paged_attention_v2_reduce_kernel \ + <<>>( \ + out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, seq_lens_ptr, \ + max_num_partitions); + +template +void paged_attention_v2_launcher( + torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, + torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, + torch::Tensor& value_cache, int num_kv_heads, float scale, + torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, + const c10::optional& alibi_slopes, float k_scale, + float v_scale, const int tp_rank, const int blocksparse_local_blocks, + const int blocksparse_vert_stride, const int blocksparse_block_size, + const int blocksparse_head_sliding_step) { + int num_seqs = query.size(0); + int num_heads = query.size(1); + int head_size = query.size(2); + int max_num_blocks_per_seq = block_tables.size(1); + int q_stride = query.stride(0); + int kv_block_stride = key_cache.stride(0); + int kv_head_stride = key_cache.stride(1); + + [[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1); + assert(head_size % thread_group_size == 0); + + // NOTE: alibi_slopes is optional. + const float* alibi_slopes_ptr = + alibi_slopes + ? reinterpret_cast(alibi_slopes.value().data_ptr()) + : nullptr; + + T* out_ptr = reinterpret_cast(out.data_ptr()); + float* exp_sums_ptr = reinterpret_cast(exp_sums.data_ptr()); + float* max_logits_ptr = reinterpret_cast(max_logits.data_ptr()); + T* tmp_out_ptr = reinterpret_cast(tmp_out.data_ptr()); + T* query_ptr = reinterpret_cast(query.data_ptr()); + CACHE_T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr()); + CACHE_T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr()); + int* block_tables_ptr = block_tables.data_ptr(); + int* seq_lens_ptr = seq_lens.data_ptr(); + + constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE; + int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE); + int logits_size = PARTITION_SIZE * sizeof(float); + int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float); + + // For paged attention v2 kernel. + dim3 grid(num_heads, num_seqs, max_num_partitions); + int shared_mem_size = std::max(logits_size, outputs_size); + // For paged attention v2 reduce kernel. + dim3 reduce_grid(num_heads, num_seqs); + int reduce_shared_mem_size = 2 * max_num_partitions * sizeof(float); + + dim3 block(NUM_THREADS); + const at::cuda::OptionalCUDAGuard device_guard(device_of(query)); + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + switch (head_size) { + // NOTE(woosuk): To reduce the compilation time, we only compile for the + // head sizes that we use in the model. However, we can easily extend this + // to support any head size which is a multiple of 16. 
+ case 64: + LAUNCH_PAGED_ATTENTION_V2(64); + break; + case 80: + LAUNCH_PAGED_ATTENTION_V2(80); + break; + case 96: + LAUNCH_PAGED_ATTENTION_V2(96); + break; + case 112: + LAUNCH_PAGED_ATTENTION_V2(112); + break; + case 120: + LAUNCH_PAGED_ATTENTION_V2(120); + break; + case 128: + LAUNCH_PAGED_ATTENTION_V2(128); + break; + case 192: + LAUNCH_PAGED_ATTENTION_V2(192); + break; + case 256: + LAUNCH_PAGED_ATTENTION_V2(256); + break; + default: + TORCH_CHECK(false, "Unsupported head size: ", head_size); + break; + } +} + +#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \ + paged_attention_v2_launcher( \ + out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \ + num_kv_heads, scale, block_tables, seq_lens, max_seq_len, alibi_slopes, \ + k_scale, v_scale, tp_rank, blocksparse_local_blocks, \ + blocksparse_vert_stride, blocksparse_block_size, \ + blocksparse_head_sliding_step); + +#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \ + switch (is_block_sparse) { \ + case true: \ + CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \ + break; \ + case false: \ + CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \ + break; \ + } + +// NOTE(woosuk): To reduce the compilation time, we omitted block sizes +// 1, 2, 4, 64, 128, 256. +#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \ + switch (block_size) { \ + case 8: \ + CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \ + break; \ + case 16: \ + CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \ + break; \ + case 32: \ + CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \ + break; \ + default: \ + TORCH_CHECK(false, "Unsupported block size: ", block_size); \ + break; \ + } + +void paged_attention_v2( + torch::Tensor& out, // [num_seqs, num_heads, head_size] + torch::Tensor& exp_sums, // [num_seqs, num_heads, max_num_partitions] + torch::Tensor& max_logits, // [num_seqs, num_heads, max_num_partitions] + torch::Tensor& + tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size] + torch::Tensor& query, // [num_seqs, num_heads, head_size] + torch::Tensor& + key_cache, // [num_blocks, num_heads, head_size/x, block_size, x] + torch::Tensor& + value_cache, // [num_blocks, num_heads, head_size, block_size] + int64_t num_kv_heads, // [num_heads] + double scale, + torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] + torch::Tensor& seq_lens, // [num_seqs] + int64_t block_size, int64_t max_seq_len, + const c10::optional& alibi_slopes, + const std::string& kv_cache_dtype, double k_scale, double v_scale, + const int64_t tp_rank, const int64_t blocksparse_local_blocks, + const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, + const int64_t blocksparse_head_sliding_step) { + const bool is_block_sparse = (blocksparse_vert_stride > 1); + DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype, + CALL_V2_LAUNCHER_BLOCK_SIZE) +} + +#undef WARP_SIZE +#undef MAX +#undef MIN +#undef DIVIDE_ROUND_UP \ No newline at end of file From 3a28f18b0bb954cc9886e5ee63cd616997165024 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Mon, 11 Nov 2024 22:56:44 -0800 Subject: [PATCH 103/183] [doc] explain the class hierarchy in vLLM (#10240) Signed-off-by: youkaichao --- docs/source/assets/design/hierarchy.png | Bin 0 -> 174150 bytes docs/source/design/class_hierarchy.rst | 33 ++++++++++++++++++ .../source/design/huggingface_integration.rst | 2 ++ docs/source/index.rst | 3 +- 4 files changed, 37 insertions(+), 1 deletion(-) create 
mode 100644 docs/source/assets/design/hierarchy.png
 create mode 100644 docs/source/design/class_hierarchy.rst
diff --git a/docs/source/assets/design/hierarchy.png b/docs/source/assets/design/hierarchy.png
new file mode 100644
index 0000000000000000000000000000000000000000..6a1b4ba9590ba94d041ac0a8b7bf883ecbc6ea73
GIT binary patch
literal 174150
zJh%#Ja%+*QfdcmgUg0~FbK+0;38icn++efJD$eD`Dmm6w^MF8Il;gtK{@v8c(tz!2 z4S`Ej_uuM@@*cwU=)R*^!7Y(TGHZvmH%#NTahJeZU`;YGvryyZq|Ae#8&(;^9?zg#GfePV!WN_=JhU zj*xjDn^y*b(F6K!F5syMr0V-DfJOZ~myqvley}jJ7Xi#W^D_0|&ESS7UIwN6CQVp5 zcZVGENtK`JF80H+2CONWl>}t<_W__A{BKnK{u-R4#Ubi%5KyR<`dJU@T#~7*Sx1}| zaQnC{*}^T)o8Ljp=?Qh;!b27R-Q#_#r@me*sC&7FCsz6Cbcm4#zL!^ZrK-tF zracOl85YMBYwNB9rBPtNWD?Eh_-su@Sw_HjzO(8fu_VugPP0BBWSGu_>Xd4*T~eil zM5wUbati>MTJ}%4rJ3i#<-Pl^(@5tN2Pg#pdtyU;i0k{c9l$JZE{N&#Yf3?|cAQ}` zZu)CzBbK??J<=6IEu(%{;pBHS?)4+mfpapd!vn#}-#=K?QfMq=!j10Ig{zqw=^ z5Y#Telph}83Vg;5*>B*fXgYnKpT7QR1##i<--M1GN@u|`yq2W1;viAo5*h; zRsYrmJX%rszowSNCE!qF$NHrz8r$=0lwZNZ3EAUESzlF-Bjmkjh{1XhTuhrc*8d$S zf{S86*-)h&l6;wSlzW10@uQA4FxpyhjnUCkoRp?_3$HipOA3~8Z`=zU|DItG z9dEE4s4U+9BCtC+%hpy^ZN`(e;Bv=8msevr$8&qd4rEt!Q$LuT=gCB*?WTXlZ+{gZ z{#Etq*)eDUz%;5)Mq2{gV&5jV#fH6gotJV^j1>7&{q~TLw*KA36WI9!gz%px07ND_ zxeppw6VQ%BJSAJxyZXKuJO5iPP-!eixx&O1Ef~e=|g`D?fZ=?{-X;g#_*n& z-67R*P^b|SncunfWc-P|xAv1sPf=Dc(tL_A-hbWlD@k%0daIo}1u3|XRn^dN#v8_~ zHX-;>(DvwJ1UM(zUz8|<%*2mH0;A5W>wKtcPqpvRJ1Fr$fBr&tukw#j&PkJ8# z{!+nPyM?{j3S9Am{CDnXZ`s~~qBz`_v#79;KM+4n{-w`T?pCC7>BSXf?p_WLlqd-9 zrkmf>_62$nXv?0B+}gKxqJyW7{D@I$)BVB-)_3N!Oi4!N>}$94orr+?EY3JrQd7*R7lI$dqRih<9{fi)at)*YQ?sYEU_%0X3 z`tR>LL$(|lD2fB-vXPN2$#-gU*2MDThlR=pe@#tvnI#E&4e^Fe0hicBZOaP-Y z0S5K0%+HqD2K%)IR-$fQq_g$+Mp<)Ia9}gF6_D$s7tJsHz?Fs|ze+eId1&7W3Ch6@ zI=^j2ToZ)S#GGZ}75I?GDWPi{xJyO7@c#mtfK4$ud?S7%?n|n)EH)kTc)0~2ofxR& zdqaBMVCL`qj@}m|aTh5ofJmv%RrqKhq?Z4sH(Ybpw&oung z-#PxwHD6+S#gp~TC8W|5)TLQ+a@$15XMXiKSR%SHHaL3VYf0&d{^Hm~e~h%?=0y7Z zD?1gu@QPOb$|pmN>-8spJ<8FgWRn@m`h& zo1cItAYmM9Hvq1w(>61vhfw3wex@~55a8-$d}~e}KsqD|#-<)xE97Vze|E?6i@i`* zVh|+S zL{J&~o&T`-*aP-7|wzpSrU-?vW<z@ALC11*<<7J+@f1X2bPC7Yr)N`DFK_LwrW`3b zPzM#*wTNv+eGm@kmbr+X(|6ZqJLC(J+BZPDqxrPAmRnA3gC7#nZl+aTq=+M&oEc*c zwhOfXtNr-&cDKs@Qroc4rz-Xsv>BnDi}nDF7p8(%4@o@DE% z2}nQ=e2Jr?mE!PIzRVSNEBG@M>vSi@8aRgVx<@Z*{-b*R-OLVP92HzHT=jce{F@tEQvwVZ`3r?HLg(yCJC2-=lE7a_&hIBfXam>4 zjM17GYQ8)B*dc#n0LawJlKT3u$IToUa{ClcX^=yD009mixkpY)^+A~OAV97k`^@Xj z7TAei^t$1B8D{O7w3JuTb#}|MWxUqkc(k(zo%AY zhXa`4(x&C(p+YhptlDc){YspSFW2-<(fxoA%Jmnh+>#xiDJ?pzER_o`#ysQ2oGo5= ziM~#_!5Vv+GkQOWJ$woPHwAl|&)$#S936n4kpNY9fm%v*vR7zuRfW7z+1EZScb_ET zBh9FE^Rf8T+h*?JZu8qC_CX>!llFT_z&a>1T-Ewj{qc|U6JQEw(0}-9fI zS?I@0d&fzgiJ|M2Jg)&-6_{L-l(jvX-L6P=FhiMg8og|)6V>mg&r&NY8< z>wp`KQG@qdj1og1Q&B%Ao73!h^?*>#S53)F*aOwhL!$Pk&OL3vJic&&7VdM3L;bP& zgO~~p3#U)MM8YC}9o1(yXE^zhW7nguYc{-GlE0?Hj^uJov_VH?DoBioGZZZHwUoH@+C1qKxfCkiv|&)*=j-qjrTz zs8bcHzLj9Ei9>0mIAva=w|CCB?~1O(zkJ6tv9FXoHH41S{RG)}d;c&5VlkR7=G zV)mdWM#>@6n8f7w&pFLJ$KMcTP<4M%a>_@#guR>NnZtn6Va%yh?86<}*R~H(3jsv( zjj@(}RaF8%JXJ;v;ZIW|kdAt4^lY4TJ=~1o-Nw!kjPP^U^Prn^b2s+h3gQG@K`7>s z1kw%qgWCFSIj12o(6RHznVsTl1NX8sTr1tDY%P|KPXBGaAS=m9A zwz-u(k6DE#H@#Fv3jXno5BO8mT#)e`V~VeUh-F)YF`Q$5j+1X{{u^Ee3B&6nyx{ue z4q#pd^%3$OG5Pn+&I?hmJnnW;;JvkrJ6FcQxzEU`umjw9#5PxnSoem5!7D_CMhqrx z%Vf7)Ay!=Gv|TW^l7+?)4A@yJ-5U3Y^?oSr*^i9>X7c@-@Mp))u(ALthcaf!Yz(H2 z@}FP}xcExVpDxt9`)TRDgSy>cPfe}~knU}0ha5b$@|uei*@krLwH~|_37PDP^~|N& z+2W#WPH8_i-TP+sbA<&NfG3O`FH0QvWxBVGJ0a+5MMgE+nSi~w|NWx$05a7W^FFgz zQV%@+(ypbILDKmV#ZlJI5AGc5%alU3_#=U7YQHI9f1cjzlP~3}yAqeu&l(WC04r<_ zl#K&wXAKbZ`;epSNPuN2k8|~&9huOQjn|-aPF!~N;vgl@i@Y<2$;q&<@u8vo)~R{x zU(EL4pwPd(MLS3dH`NOWH^_U?_$GFe*i1~ZXuYZ#@|Lu+1JFp=j-=kP(_8sv4 zDRer3i$6XY6!+>NVbmL%b>evTwva=kz0SAOk8lAlgRh2Nw+rP25g{&?`X z95+pNHWo8;kdk%6b;srNCJbmkm$U`ra{d;Jcicy?A(z#!YUOEUe|4LQ-K^fK!NNma z;e-F#wN<0YE~6heI51G+y_J7c>F1dVU_LYDYUa_F3>*sya!B&k$= zb=C(7^p73=`67XlETHd7jiGbkk97hs81&##N}Q}t-FIpO7$MIK<=ejVaQo+mL+k4q z5R6alP0n4GVXm-J*W+yd3#BP z8})~7VTtNE=N>9-B~O$&LQA;WO4ngjw!)_u4@>Y`D^b{fxN{k 
zu&0}f{?XOTeMz)c^bY?C;GLUjPIwikq{(7rcKO@qgL8|McAd_xIUf?N(wKV_eF${vqG` z_g(n$o&UET|J#lq#_-=>{O@%9&sqJ?S^e*H{6uYgA^ZP7u>;(xeO+v;^4#X;2AfGl*-91ye{HzvCWGK@pB_r`V!VKHR z)Gii{Rchn_TA7~M>#RB_eS}mxYsNJgXow7m^MrZ9Et-Y~V936_H)=r?xvk=2UeDrU z5CWqyT1z}b2aR*H5W4^AuUOTUB^TrlBiu)lzG2g+-E$fuAC~jI;`ghOsPmg-qzuc? zSCKoLx0q`OvAFuP+E=CW&{#?C*)7Qey4r*e6R z7n*cbm>3b3VcLEuW)%ST6uU;4$@^A9f<50C1AMK7nEa3w#67xESQI*aJoPiU0)-Ck zh4%0VgXqGqqEzhMH84JB$`h`rorizB@OZ?jkIs1O(-}WC&+${WhbWl^W1>>{06u=a zmoUJvsj$I{dyl2VSMuU<8DhA#F8^TI?nEuhXtQ+bx^UHFP>&FyL11O&TYaBjnki=I!>b@wYIUwRL_ek%G=jvSg9=YDl#m26q&KFE z8bKV4N%cm{i8yw1l%|=7fC+{f78vfiGS_{X<)v3*D*gU8k6 zp{?TMFFAl3YrcVjnq+%1m%16LDjyvwmJP(oI1E zlpS(7ilZ%1PQ0qp)Hc)vW*g7g(NjLfXXl13mfq-35}OEs#CaQS)WnE8 zHobXaqv!39K5plFeRIcFjYD>4pxwEhf2%7VpU>II@y}>r(m@{3My$-H)kpdQxb&c8|Ioww?ZZKd=cKjF#IM2m&F^TcBabF}3_#R5qB;zLXpZdUpn6 zqavCGX9_$g&W`nSseM<%{m4U11VL6!^*l}DNH-^V8X3-HkW&}SanxJo($!g0cndJ! zGI2Sq^W4?tgCsP$%6n%~JLXB}EqVSa{X#?M0rOf;8O*Dt*|pr=kyz)|R?#!a?N&SP z==)mLx0x8rbWIiq3*_7SLp27qZAzG@W$_cT^k8sL2KRYl30Q>O7Kz)4$|D0h3;NU; zhuij(A&T^X*>%YQ_4V-(Mb^2jRU12xAyp@EUuH#_GQ}0B+z9B}D-B0^0Y_@Y?6vkq zGL)_Dh+Awy<>S}e$IkHcI;2#WIL=VdD1G@SU1)OpFav^-T=dJufSW`vVx`dX3jD;y zj@g{3a#!&4{KbSEvZK{LMr1Xr?4eI*?D(Tbi|RZaR~H~8ctF*0kPi4-SKMR z=ly_#Zmms~S^qc=V%6Ek@btxh9;?HERe+G#zPS{{O*~ff&ZP__HCiTubyh2;fqhg% z`dRnMoKJo*LOZ1uZQL-&@cA`!7QIL&X|Y?n=X>NJNJp zU&jQwlqXpvm{80ZxlM%<=@qNqi*LcUc^x5oj8J*vq?Z$yBgj)hEPC?f$>}~rmyzsP z&Egdwir4lnLiMM$4eaGHz>_0aW(~5$NH06~%Ks)WG!Y~orl6g!TK3WZN2$rU$nXG1e(x4!!NgI48A4~pvCV=Yh1H<1LKxvo?4RPOUJ zY;V!+<~T61p%&Z=WE|C}ZHm8BF%_oI?XT-o{6?Y9_GIyTufBgD62D#xde?aLUn7PL zRyqTGLT?Bkrk#Bu*LGT`peIU1Ec}dZ*kXb&HMe^hQ)5+it_^VEwP(k+>c(@M`On%n z^MxA*f7knc4?!lbM}cX@ZOW%B#6QJEeQ{AUJOTg43xZ;0nc3d4Qb+-$K#1Fzf>%03 z?xPiUC?2-}twngRr;h^#8{6v|SzMPa9RZ6V=(#VHjp3HYK~OTbG%|h_ZtU&WdLCT( zmli4Two3h|6m~XAzHNLR{F!##RWK)^f_qM%kw<7)$Ui5*(DrKrODf}@btNeP z8`QlKqV!a5f;^z7+s>*y#hd4%V%XEeuS1-WQ7Hvp5F z8WeA(1cj~mf<3>&Dd`2g=9H{i_=Ca48@sqMf|otuyR_qgi}QqHKxn>^V3Pcr3Q#gs zS#m(vjsYR9+!duirstL@i7w9r5k*ElQs75F@OSr>oeq$fvt|c|%<32e;^QJmIM|Rf zePBmRm85O53?vrx%v73AOSx6RdP~E5nsv#V3#nR~b=$FJnR*3j4Uw*Cg$B1>fo!Ij9ib#nY@k}#($`%A%g)t01daFd1E+uC-6rr0KunQw*?V__&83LX z7viWp0|k+0BqcS@eO|TxitssTW3!`{4G?MaZ4B8V(-xZ6JPjxBzTS2RzGAoie5yrT z-yMlbWgT&Hd;a^eJ6XJ;;`c_!cBe&oplP@NqI!fwA@Qi&!7U%Gtfv=2NxgOpGqGB4 zIGDCHBZr>g7fBiaO2wZQl_qJtlbI<=h-@{FhhhAUPZRKH!YHoH0SHW;qX8Vs6r2B$ z8{j{~e=8KOkDyAqCE@Mf7ld9K~PH;)*83SFy@8L4}lynR3Y zCUb1N&7~BY+{!*+Ou+WZ^WTbZ+uj+j1cA1n^Mtz4ip{C9K+oeFqZ`1HmWH_!zEaBSOzv#7ijEDJ`7O;5 z9dQmQ?~iLD+%+vLZ~T)6tjYpGVnEmK@(y-oK@;cIdR6N!eVz5)Z;|>vC=Mk?#CAm} zCJI_<;oO&JaW?(dOxl>*gb1kz9TS2vB0A~tyCVzCwdz{8ncySrugA2~kJ1xO$50Ar z`Q#}`)}YOY)+-AIKpKFfrZ30Nr~6a=kv8ZMg@@a2E20Oj5qtp73!ET2iS{xt$XB<0ukTLZBh9(S2f4k00Bbn*lz3C6M@JG*>D@iN8!~Hz6Dd( zre6}}Yk{jYa`St#J-S)h$D*#QJNq$!N@5o{30U07Gb!8>Om=?#~7#S z^7KJGoSDxc66-BWIIHCIl?T6OB!u$yfZki5s&Tea@!u< zT}zc6iQVW2CKNlm^MdMXl(dKQ=A-+J%i0CWcko72o%4(h_pCj>9W(C7d1pVF>y$y; zWV$Qp8Bwfz&nX#m%WRgkJGq?i>B4&&R!oZ^xIoWyKMSvRyO5^2b04rXk6V@0E9hJ8 z8=p_9ohB^P5Y#8fD)iuMx-w&DJ1;_6TK`yfFQhW6`08^k?#>zkha_wU%t*>KvQKrX z$$HhlPldIq&WTpcsi?`0VO|ge&lUAufYP`su|0?-oqS{2)LU(0F|cD`G#7%8pyn{@ zHbZ9zpq4*Jkl6U#I5_^!A=lC>iq2Hb%8y)&s_62NZLey5be-a)Hu-15c09q&Mee>X zI}DAL`c417rd{Q#@&#wEX(r-@bYPO!dJDZ`1Y!wV+vtmQjxQ$4{r&Sw<-DMt_iK+q z>`4P4#%MxB{(ku-<0FM3?1B~-uG~Evt-oW|(aau+f5f=lqb;Y>TLdOMF3Eky zcN<#kPR5AOh9W%8VQC(v3+Ca5TIG`$1?H$b8XUV%U5=oHejNvmSi{n%?!`&U#)vpH z>mK(gjCsZljlqteLF(KiKWZ^h9#lQ(P?zjZLJs*V3^mFqNX8&>LrAA0QK(}Yr+Rj3 z-$WxoSQ^Po*NhbnFPB>DxSpOHuiBoFm&VtAhsU*W zzL=#8@`ot2foI4eC6AaB@>=U8uMZbdF*NrX;VHM1*l-W;@Tdz4u)~5YHwq9=l-Oys 
z{lJDV+irpfeNH*edO2d`Nta4(JZ>PK4gtfVczgq!+8}qkRyM^l9kpty3|j4?d&@=~ zn)ycHQ3Bo=|FZC>e8;|Kre=M^;zznRJdc|%>r!NI&|QwGpQI`!rmHT3V?@jmXJsYh z4YCR@VX?>c6}^YVC#HkUU3SF@S2tZy+bj!c`)%xYf4z(rv&*o3?G8A9r6lC5%-a{A zQ!%5r&vx|fn7{_ZtKTs8>lT;`j%Sf6X!ZOqihNDdDq{pO*ugq47&FuzA(D9py zU}SZiF6ygTavC{7D7i5wSHv0bnY;CwQ|1mBpav;U+m{b=~}^id!n; z_@J2?gAR?YO8h17xy!QFV&v1UJeu`KJJDkw(@H5MrDa{Gl^^ZZwZgNP+P7EMw|a{z zoj998^Vf$4yOj;j*99}>10O>X&7Ofly?mr&4$eh{RTy5ab@N$emMnq(!DFqq$B|0u zePzctH_Wc=H%~H~P)B!6f_UsrI3vp|U{i(S^)Xyyx-B zv8SGkRG*8}Mn%iN9Ty=4P3`Q+`Rb>)as4J{1^Xsp1R37Jx*NroNjP_Cv}Z|Ob;!6h zbicgLwKc1pwK+({NFznq)w0eCTL@kbHMpwSu$7QGYXXua>!^gcI=L0Oi=ktyh!bV` z6qhup(Dy!^t`{VN`rvfje1=G8l!YT(lI719Ou??=H|z+79!Kio6tZi! z&Fg)}V-?-@6hYm2Q@I6!qm}q|gbU@->AO*yLq;Bpni)DdiDK*B8tHydQo*RIw$~BW z_W}o33haAzp>#-4mUJm;QgEGlp1a5`8LxA3v#ve-Lhw*%En}lEbw) z19v?(th$GhtZyt5YF)589c{<&AU@b|ma6$W%d+n6Fcb)NWPKbAS3 ztpc3_>M&8?M*I34QA|Q$Z~Ele&WdZjwGjkI%OU|$n>G>9cdwC3wKONH(lD^E`>HV5 zA4Pt+0m7M5oqdQW}^>Fq+C8)9^I z8b;+1*@4O`%No6o)yzsyPv7bSAm5usLraV=FU}n*C;>o$KNY|AWZ0$*6xJlyk+%4h zlzKKUlrQ|RjDjg%Yh7&$=PT~bRYk-gRCfAa*JX5!MXA+JJg{ndMxBB9MI^3>prtE? zly?OcZmsFQriP0i>4oKLI}3Sgnj6`-)qL?}tlAx>D2`nT9sF4U*IJ4EGD=Fld#T^Dy|`F|@%Y*ZfD@GSQIqX%A

8|18rb2g7afI?8u1Id8C z)6XzbhHmmURt;owMb*oiWMN zW5T*|ANSS7mA%|c3XrioPMVd#&foocC2xJgQdbDf3q!k^bkem3ihockjg)_2*?Gd!s zb`g8{-QtNL$*PCs-LD4UUCEIFeP7S695i{Rd z7|?vN^e(PB3s2l*m)m1P_raAvBTCuR0k(_}yFHpO9tMxqeunYArgh0TVoRh+fPk?Tj`9f{8>{2%>RR-)O=ZKV?iWI`TvVosk+3eL5Wt(lt}?G-|1f^E5b2 z8w$ib-60@a(=^vkftu)1s7;;hGuKyCR185B=pnCH1UFB8N++>(a2s&v?6oR}>n!^J$#*QG$sMdys1TTk&x0>IB&}MVLhVbZt zOJP17p*JY7(hNZq=?#IYn1;K-1dMDclufy8RG-3X##$T}pI!)+M(HN&a;)Qa04!fL z^4Y?ej}In=lGig zVheug*p+72)|-o{Axzx{bit&40=w!6Y-eS-&f0GYuQ(xsbGt~Lk%5VVm#jOz1HSsN zv{g#Sa*Ow%;t+(1n(uoF{0M!2lFKt8T6dfG1?KK;xelQ3cq;fRNtp_}uMxd~k%1fD zAPNF~l{N?f+9sW5^U73!3z0`kqvmrM=+%Q{P2G(nVjZPm(22tb9F6glb~$Z5lNX<3 zI`QNJuGfg=n}yfd7PeIvA>aEBmx?M2rR*p~y7g3rNU^rX-=0Tl5)*3Sn-A(q9aFN1 zH3WJ7>#q+zkhCr7o+DB-%5hR|^^;5S>GLp<=jcpQwvOua$4A0_fB=r&Tv?bi>x43<8qfn5jw< zG|EODk|LGnhe20eKK6#iK7_4n!!$vjw<~Y1Zb2apwD}aFy?L>K1g_W9hc5M0eYGE4c}D8AsvYg_?zg*6g!$W+xUoUQ;bw&ePP$aw#&dkgOy4@ zdanwBo-}q+xEh$5*?E4OkD!;w8qRtlM9!CdMY?E)pp6-Iw}L`vuoUrsI{lr3#ef3?LE7 zT937;2WOOJHHV07?-JpO*3?YK?Seu*UyQiqU4Bjd_OSKtqo-SBFEF0gRZB356F{DO z%{I8|YtXatHh%ZL$H&IBUH2|<8F&xN-Q+95P5LhBj}$Gl;5mNidlX{=A8n-N}=Rfk`X{#imR@#aYLDZOqAJ@fIh~JSb=z z(n#qwqu_{x;kV@}E*_r_>%t?KoDk`ZyRVh*O$Yek?9)X2ha|!BMnSZ~4DH&w!<4kO z$t^7&Yw_bGI&2a~85yD=DWNvvtHW8g1r{vLa*6u3CCOPD-KA^=K7yz;Pf@xila?6G z>^QzLlvM?rIm_NSZLjQJ^m@=t6XRjW-ikE7v3dn;*fmd0eF~cy;ZoelO0?ScCP%n% zyRIpOw%AH{RS%t1gTYAxK!&)L$>_}H`W22}p-x{Z2>ZKztVqD~+p*fUxvrg&Y2bn@ zzN5X(<}eqh z=42V`p0|gRy=;g%*`0OC&`l1g=-F5L{CvalVRnV+SYICBvNwkUXsw%obWOD^Rrw-4k0t-h-BG_G(=J{#iJ6pJqhVD?CH@Q zn_pW}#iek!Z+5&@lVJqY>)`n*&%}j2K1Ffzp=ob!HA|u;rXa57X+=kKeSxD@HXsxP;)aF8Y6@$ zZ5BX!_S@I9C|P1?K6-7Ahzc1?i8(Ly(?aJMi_KdfGC)uHH=}!VYwb*vTd{>L!p#r( zF?vedLLK}z^tnZ&?mQwLDS87hm>q8FSZ5ZbnOk4R_3GPYGF@f}zZ3+KHRr=b-FpSf z9grUBwnA+?K^21a=P8E?Uf)~#y^r4A+HS6A!o*#%yjv3C*T>M3+a`g!$9#5PwKu?3 z+sMs)t6ml#qWc6C+SSCF z92!44ZvVVWp)gcSkIV7MsFV!yg#%5kE?rj`Z<}MbiGh)rjlWVe8%TI88%0=xm7eDd zcZgf}s#p7nIE5(x43c|7kSba-KMA9^_3c>GO%Y*db?eO{Kza0n;CR_A zzf$U|Gl!XV=vU|)m`d1`f$onQkQYXT8FXRttBSOZy$VuRDJ`r%?>swW#GqqfDcdH+ zNNO6DQ)~COj(zyOH4cyyR6-`%QqeP2sM>^{;uh;pdQ!s)Lc5f0dS;=XVY6xd)w+un z;h=K=>q$D@@1qRktH?&RciB)P)Dq$PG{i2opVL*N$c=rzgZyoBMBm;X#xO z&vskEt9(^w6G*|J_gk4NF*#T9OJ8OSXW~4bXN)N_aS=$_w&b|juYP2~xSaW|(waa= zBF}*NNsX`M&{EVL_=sqE?b9cYTe45YvMt~M1_TA z_nZ1BU%W_hGu2|L>aToR*726Cc)(5S8BJu&KsjZQX6*CoMHXaj$JYllB?K1nulky? 
zu?cZl&v}h_t2xL5P$NAyby{Km9NEuK`43M74?1~y+*M^%2E{W*8yLXyuKA{iO}&mv zc_Ai>=yyh*wTj!8r-%nxCcpRUwavJs!u#$L^)0nlRUxCdi*LiM$}g$ZVQlGuc0N-& zzQ(z}%0FLsW1QEPsN@>fGKAsQKUOPBF6L{z{DLJd%CIb%SZfqO?yd}Lya|)$=gb0* z<{VFdY7AmGYMpXc9p(n=)!v4-B~wjylob|+F!gB{n$IiaCmS<|0SywWyBnr&8yRWD zCuI3N!)oRVhxBN=UFEigp}fd^EIn^AI0Su~!B&xGw6=o|)%IMI4BgTBhOXb9-Fl%^ zKYsG)NMSPHeHNDI!u}RoZf->?JET|m*@lDoP}>6^+tJR&lT%EVn`#*_2>S2h0LS#g ze3P&FGv4X*e;fRx@|I9W<|nr2K$uu0KTKuWtm~*`fHDVIv~CB{LRq!mEk8`uCYEC{ zZ^Od74L{mLfoz^w^?1xvx^r_-`m|=08yh2HZfQpu#RIwmwA}kqCUAf%g*~xd@v52k z34c^(TqJac{W;>f+FB!l%6O~ey=z9CC6%1dut4Wa*;VLnK^I{D0Jm(dB{vGL;f>WA z`D|{c0YOG*cl>!wv$=C+4RD&0Qb6Q&y;7C%wf)?g&XiMU*H!hcuJScJ_|WRNiLxzQ z40CtyT<`%jAQh12=!}co&!I(nbvT4tuaVqgw;VvZN^!AEvD$5f)z7%(9bYjp>J<&k z^I1UII6|74To?0aicPBFcQ^iNR~AQ4+m~Moj$ao6lyZPa;+LE#9m)xjbML*{M7w2# zjUe~LXO7EP>9iiZe&1157T_S9VEM(4OF%Se^93-A3k&hFN`Y%I?skFMZ;a|=L?*~4|QpzrxT91)P*oDrFLPkzQAo3XarKw;3U+pm%+*w8?&%Mw;) zeJyrdqBNI56Oxsc?#aV9;Q2Gq0=u%SN&<`6oZWna%eqn7%XYN;alKF^3+@amuIEKy zA)n3FJ}?yas$Tx;KbGSVNhFEpvgL6Tp9V zs$@l^kyy5?nTcr5J#f`#Uu5-bY-O_ZH#f&I2h9aOsvnQU2PYHHh}5 z(qT5B&3fe&2?7e&&1I08DJut4ZjQkSE#^|2fE9VX(#ZMZJ);{d5L$vd$?myfP(b=P zg(#g*yARZYDk^HX8H7x4xB{`yn>U}71S(GFT97MCaeXu4<+SndYe&&T>32KBD^F(u zb(Xr_A4V~AT#M2C(T{R(>40OUU-xFC`OkEIG0&~#&xt5QYVn-chbTn{A0}m2Q=I*a zNIS{hzD&MamoVf&YqAESGN%oZ{rWRWNrf{MrISO}85ggs(I%?FzKF#z#f`PTvsKZx z|2Uf;kHlmC!m=HwfSB?8$2oi|r&s@m)3S!zAt|Wy(9%FZKBV~&5C!wOL_6L`6C+H-DO(n2h`l$#}bR25%A+< zQ#`ovh$jQA>?=z*sfepyUU$_WK?XPDL9x}5l!P*D6R7&0*yZDhN_8O_B+y%u|Q%V~XH98?xW-z3=|G836I!v&(8 z6_4o3`Udtrsh3U&T|ARbWxvyEm~#!D@wB{bsg!3JB!jQC`@wXsPbZalLn3+!CI!ha zIaQ*%Wj$(lVqHc{Op#%+8EoD_cV);Yf3KCOTX{d-Sn550%jWjWJnm5dQop@_tJ+jP z4w7LORq?5JEmEYXfGVpN{$hHme#*f=Kbbu6L95un%^g`#On;UfrpKN05-3vAx>N0S z;m!PIyh80j|5m-c-42r>dN3S$C;pGXL-#pAOChMy1Y+sY8YwLP2y5i$)E1Ehzn?$*FrwWzXdDsQo#2BD&Zx0fA zj`uabL79bvR`X1Ipz2$Eqnfv{h&11QA`(zKD%60^EE>J7g1C5& z;^c$Y=f$13d3x?D53PQdZLtH2ZQAf8X~0(8R=Nq*Jl{sK)ohFx=24Q@>t%~4%CRCK zP*r)n6x#DEDcj8qd^~lKvEnwAZ-QF( z7uI8n+}3-vX;2w?s)m_5087*m^_x-cueu!i%Dg~D`=d4Eu){VHUbv3!jB1Z&y5ayz zJ<%CZP`0U4uvzdI_kx$G4pNimq(-nmh) zFoZ)f#ll}n0n9hPDk5!n1c-esOhYGdp#?Xb#&3zRN_~Aw`YHQu-^AJnFYPKB^DU|DdSD}NFvXKz~~yUWPX{? 
ziE`Bhp_Bn}tCGOn`r>I$R2#E(d;Q2NLXQd2sA2cf`VQF}4spLuiN(tqa=k@8Z+_#y;Q!+KprOx~SAtng@bUHGkkGm1<79fm1>{dgtMppe!ngjM;p@dl6$ zkNE~oUa!(onF%bInP_uEg#F0y$c_~%4LU_}X9M5TZ7^!;&qq%bJaxy&S_D86Ym?Qj zN5&msx^QSu)2lxxjOr=a!6eup?W4ATy;rGGsb z+I}VsBvtPxJ9EGom}fxa%~8JAsr1pU>4>fub2w}iNO3-aOU^ahjQXspt_%VWQYr6T zU1dbLzE<{EzM|pI+_dB7&)Y!H>wEL{JCB9PE7y_*lCSde*RaMr>MAvG#xfJe>e-hV zx|!mua{_>h>>QUhmz=oJ-)Iw28Dtl)#Ja0z8`K6fGmC~1b~1Q4y8 z=@tW(R1uIy5Tp?)X^>R98w8|F8p&@qp3rlS=kZ<7TJKulyPp5>_Kx4K*)!M7To=is zd!Lkz+LjBy0EG?>x$@1Yn-2}`XvhQ3r*4w zuSQ*b0C|@z1LYImeQ;`{J2DZep8Md2O}u)VORtoDA&^(Q0p4*Y%?yz_eJNvi=C~3r zER;4rOXX9=nx$qKM)@TRoW}>@Tk9incDosX21yWVOADEB_6kEuSG02U0IfEM?M)4n z*)4GbeaX%N+Hj7DI5O3|idHB;2=egebc|*VRI@mG+By{D$fH|>EeLFBjq_>#%EE)s zrZ|r93sXVdC(w3!H*JN8?6XZ;MF)R-=Q^FqO5WLQ&RYy!n`BO3hXN!xuV`4c$Zza_ zT@Zd+M|^BE8;a_#&2!No-JK4wTcH%oiOmqM^G0-D_x^%dVDPcsxikjT3Odc6=z4>j zb13Dj;+u%spw5}I{DKm&ef1W_XpErZ#wS0f)(*OvUySrl%;o5@_H>2E^d5gyHB$a5 zUPbR%Vg}pL+dTzK3k&yA50SPj4msruE}Jdb2o8_>>OHE`(M@dO6VSJ?wB(PqXA=rw z^GTVOq3prmcXxid*}YJh>rj)44^uefqX4zP0;hbN}eady;%(7^&IzWc1 zZ(lF!M-|LOBF7%t`C^^A5eDE0AL``MPOIU;hty=-xsd)hd1XhUNuKUY5OM?U+bo!56|tz6-U($L*BY(gSu3iqUb9>%Qm-`e~-qPI!?=xn6<<^C1z z3Nl4CVqM@#h;=ewDzZacI|W-^}#mpM2*8{)Xx5yJ3=U919iJ zxPfY>Un>806b#g$D8OksqC1oxim5GD=~B2MPHqdkHj@Vo&(%hCaW)2zD-8*PO>+#C z^g^xTbtEKHbX2ARq2V)zDT&i*4tle>I`I$=e}>^wmuW8RWM-Esas&3FON{vu>+ZK@ zff)lV?vMzNRk=vu8Ed6m9{->wih$$RZ5T+PV&QB#Rmon)kSBqROFaMp1_UxPln))d zp$sJ#8cH@jRuX6FIZkozb}KcjH_4sK?1y6Ui?$FZkW}S>?H4b;^!Tv}-%3ZFGtc4) zj`}!xRkZ^GCUJK++dN~;$;GH#U*a0S{lM_p)p4WBeWxp`0A@Y7mFnZfG6`AxqqRp+ zQd_{2k|Q$R(Jn*h#$clP<=_#UXX4J@CIw%I`$waEl2!UCu_*(@HK7gY{Cc@#o)09D zr2Uo#>z>)V9LG7Jz-o(o@OqJBr~hJ}v|xq#-+X?0tPCJz(#b2=b6V`#TLZAEBDV&K zIEhDd+pdfe9*t|cej(?<5OH$x>g7YH$gmLp3~o9i97%ZK*LT_#=wta<(yC7f4k3g; zaIEEk2(($zD&pT#x05U0B&Ect+|0o6VX7c z(MR;&xN9?oE?dKd_O!d`*|R@CJ6o|vODSY;S;JtyyMixQi{Q{>*Ff{f4YrB?QwW+r zt|~ECiplFQdC2I$0m=0kOZMXoJhrFUcG3jgPtFXKe-rd%!4M1GrOdWjDg`fS0!|bH zT=pLDKhlBfa8y)pud;6U1)8^5xWh+mPDrvHC|FD~^TXOQj5Br|t|)4-r7hbr`)Vl8 zh*>Yz4gu6e0c}#9=(Y*!dTLp4<1bbPIAw52O}@<|&TsPwRhtF5($i8fe@zBL&k9#5 zQ$A>h$C(R*As_of;4=S{iipUQCUzVWTYMZmDM7oyC3V^9zM=@LI=U#df`1LG6TG;5OL9 zAV2Bgta$6QAtT$n2F7L_`qtj>6P$eVf~WajYOP*~`Nc;P1UyFN?Htl}*WeW;D zPO1@|U5}oJy!7Sa+f6{rv3Jj5OPOt7tdrH8e+{C0S8>d)1%(1xfOSjB8 zJ|!q~%9row-w&`RkzSwY&e0a0x*5ZBp`kQ0;c$EAjX!J6$Q-Do_aPVwv5-pYF#1R_ zGn4*ld4<}fHDB|Z&dtB6b}f)Fb=rE>t*%;`5}E~bE0D0mk+hX0%nD&SjJiqg2{JOu z1B&kJEnA}oS*&WS<+ycnRb%3T{T{BW>%&(wh79(E`DW5?6L&yLsP?dz#;MZ~T^hZE!n7b^)kc}|rg)11A)?+tx7k%j`H4&^5VgRqF6Blp4N9E)$-}K&s^c2Fi|py_6xFJgl*5xam#ol)W)=v`x&0fw!OdQb7t<+ zBTiD*jAq%(9lH;!g58x!!`2poqdg|AKZk1iZT~*eQ>Tiqq?1EwDkEd12=<; zsL(Yg)yUc7@YpBK9q+&wu!sgwa3jCG2r|Nvk!~NrX3- zzc!sKQQSF^Qs^u>a$F}U01^?2{^`*@s+aAIgcQsIm0bfC{pF1gKYV5etuO=-+DN~H zJ5u6oE9?}*E|k_F3Q*04^3?NRm+!5daT<#bI3TSK9ik*YjE6(GjTHC*7%YMCE$}(@ zz?BM)PhEfW2VN**iPEG5281t`Z5l6*=o$>YK3tbm5yTH+2DoJ5gWLwoQk}D#%W^7V z>4O_}CQEDSMu$@3)N_I>3?pS+VC4z;gyh6Yaur=VewFZ>dPpbr~&3ud~V-*gTdP-`m$7 zKoPj`^DW81)5lATa#paBP77Bj^%m= zFAqY*{_PO#;Gys+juk`DS=TOoM0|AjV!57P$Yr+8LesM0uRVAzOMd*Ie)0vp^Z@70 z^6=u>k+bU^xqvV}pakko-C)1zm#*CQ;PzhIRV(IAeGp-YuCI?xS9F}NJ2PKhnr{0A zAWA}LR^ydzI!>gYRfKBSzCab%bGEK!7oCZMgAt7Bx%-lTH>lUzZ5zYICJq+WQLsrdJDrlUL2c#O0%y2vIEa z;fQz7jo0;zQZNJz@p}G6ORdQT31!h7X4ymjvTr0B1N*i%m(LSi@3>V$==eDLTL4EZ@KTN3Za zojLG*$N4tmCQ0!{S+nW##EE)aNp-c;B%@4lt`D~y&QA244N|iX7PpDO7{%0$6}iQ4 zljE;TXsvwd?}rwS%R`SilgpyBXu;| zm&~PZrkIoW&({j3q!XBL4fg&Mr&0Y^m`Q;`pXMh!T(3hH5%Wb6L(zGl?B%JGMM6ZC zUPU}z#w4gysd=?LZq26C;PoxIW}=6mZjWXR%RGC6&%>xQ6C$hV<9^y!HhL{bb?R=FtKM6Z5#|D3`z-XE_O(6GG&1A&|^43=*h( 
zr7nMM#oe~lbF7ed?AvJj*a=vDbn;t4bttxaz5HZ#2&o;1IY{>a=wE*aBP}hxG6P*| zvY2+1j#ZLfr~izcMTFNP+5+_4j9);V-3^6bhnwDTx2dJw7>}eYBYz=aWys8D4>1b6 z&>HFzc>saIyX1?75G3|P1Lb~HH#TmEbiS^un3=A- z4+*W`+da|a^Fu2;+`~b8TDUYg_t&exrH4*x%L}}*7hG{z&Z~W@Z9z?J#x3134oMC-R3Q+8JUh4|PvD660sI@7wq6!f z)}e&*dvD*vIIG~06)opo@i2-=n|=G zI#}32h_G(#72dH^m!Qn*$bmgz5$c_=IBpF=G=RU|0?esENczYn0<|okjrfE)xQ2~O zD*6dZsR*DYR@y`g6c`}Lz68iG5@cp!wFr4`12#b$@_^cqm%p4S<|th8&->N=B>eh2 zrvx47>}G&Jc^I>Cx#+-iAQl<{TC@lZmU&D(XDuJSacUX4_QGgGKa|F?XRz(YMGz?H zkO<%yCdJ&`%ks98P8%BEt3EciroSUi^ui?Ipd$K=g#%F+z3>oxILwWb;<81kiSe** zU6}gEH6Y4h3W3rF6p{yyi2oDsR|F+8Jx<{I{9|bWnPZ!(8RA3dUuW8$FYw zKAD5D^DKqdHV2ipVI)>z7dO)h_Gwhcbtphn7~RXLQd|20nZjtjN<2Ai;ULC0P71#NCU2#(?e^~Rw>|ZvQ3q>a3h()`gstT0SLR~&3pgKjR;k4M&SO~H1 z!?C8<4_uhmer|QP1e|TK= zEm$01*mv(N|L0x*+sBvS;|7Jl6&pw8YpMy-z&;I+z z|H2eQFiW0ykNbD6!~gs-Tyo3(zxwq%T<=2Q$6~w3h~^LL{-6E;QX-mPe0o?G93U}L z`m(PTh-rTu*-w}3h*G`Ke;(!UBg8|>v!4zp+WqR+V?e~q;Xkd>pN~qa2JS<#pwj%G zFa7@0KY0bXd6$1WxW5gI=^a?DyEboz|A`6dZ|~lo_u{+L|7nf>T%Z3PfPbFE-?!uc z4#0odOaD6n|9;;8FWdmbb%uSAav&QJ`HN2!EBXmi)3L{c&wOY7nva5au#ZLg`y!|M zA*g8P4lKTlIvWC-41|BW=vuu%J`ni z=vsueugytoiN3Wp!Q^KrE^J$B0Zhb<-thf>v2G{+T#U*j_=#@oP{*o_R$dE-+B^%fB!#xgA8~chdVjqw|^v*xt=%SQ@66+TFm6sRJBSx z*-uv>;vI6lyRuZaovI{Mys4rY{rx=t>+R)ghiI9@Y+UjFoT|S+-4Cz0wnF;LI4Py^ z`z`bHk-=w+&cLt*DBvO(y#o`2>@9XZnb~Y8SVHsGHM|q zq5Pp`jenV-PI@?YLbb`;LNH|{;PvDQxBR9x^KV1?%~g;&3VzJpl6^nj7JvN?CM3c< zuoy@F%Z`I99_&amYU}Muz+_^i`xI(9Kv$x7&LQPrCUgq0i6$C7MKmF<1K{toXoMRQ z|MGLj62VjyYyX4H{ZE{v|8iGE#ABrS#Q^3p`&44NcA59fpW0tj{gplVZHex}3S*N#{PvF^v|dsZ4qd9w zGry~${=+$o5CZGbIXHO&jZ@|fta#bPHP*l7hWB#(Z7F*~2J=GK7 zhd;Hm?CACfo-oumW%)JPuVwx}Wj2neMr37Wy$*?b{rdmCeb~e&b|`!8|}>0$tv zJA|lj8$dh;xQnz7$MgTnDcOFHRs@(Tl+yXn`}Ny*Vv3Q)O+3&2>-Pcy(&3dU{pPw> z5{-^^ef*H`r26kxehE@120m#C=(#Yn=r>cOGk)D@%Yl_o{PgDMl0858QDCXu8DZ*d zo#I7*dFmpd$yW>K;o)zGixo`_Ihg&+9{ItC!a+XNn|&?R zzig*Y$pDa#AqJHG7q=mFbOU->Xr^Isp~c)W$@77(QT zrzj;DqLj?5xkPA@|4Ya1mxlF={-=c$FU9X%0N-u$AAHeQd(oau2R+pPG|2yjV&Bc- zcf0%ldMGCE8g|<|EG+Cl1Y#y65Qy;@QfzBX(V;+0*31weZPD)@{+~|Gw=H-H0=1G|K@q7w#-*A@L$KDfQ_S9y+pwXr? zc&0$QN>ki>XxYykAeI{`w!DQDKYu(|VOQv&$6qbKy2PE*X+ieg`&-}Up8~p>3BwOe z`+qtk5y9Ay{&bpMpRaSdBv@{Q3e`2Oe3LS_9V2L`@jp79(l3NQZyAc}H)yBvDUI!j zH2Y|?UQ50>)jyTIt5xf#1HzEGqQa?{1uSifA6jogua?_hvhDTY2ejnH{60A+^jUqi zh?y=!MqUkZ`@K%ZkJ$5|qL?CPxLz~6Xd@+|BsW2xRqi?E2+E8Z} zo(Z7m$4KC)Y?x#ehzJ+DKq+>ez5zYy_ZZ%fvr?Igp-vg2x|aP?UUKy08rL9qQlmT@ zde3K3@JBT4#f8v(M?_;Kt!ApM`*(tdDVw3e5ZTYalF1Dtp-aJ~am_RM{K@SY%^ZRm zFRByU03Y6w`u80;H7Atrteq$M*@ks(#Rn{L+GP!E&D6Nq55m12@=wwBej);2^>u=i z8*z?Y|Pr-=JfFHlXV#M;$<_{4gC1?0Ekpg>k4O=US3RK(Du7r^H8y)!~S>? 
zG+r*rPd#P{J@x5W=fALKFkC<3{xgfsRDi+X9^iV{Mf~mAxd>0ieeT0HP^V6ROV(Fq zHBT8Q_*fhzTAsi9hTC(JfW4}_i%X+PUMM*gy{DcHzGLc{^|t?4`2a&wm^7?vm_2Il z_fJXrZ4gL6QVR+U_Aj-J#V|Bi*-DZSk!|ZjmJx%)6jIQ90R6cBGYm;V($YFAyu0ih zANA8Gx`S!kTOGum(9nSc-Ge>qQQLBiGC-K7#eH;a`w^|Myqr}{Ko8(`FO*hr=IoQY z_j4ys^1wT&P@^3^8UGZmcl)aXnQhHFQZe{O^Htex6~BnbSm!q~Rr(U|*Dd1U2g$#A z?DUf9r*kHV10oz_g3ANba!rQY?|K3l=c%5H+bE)CYLQG~QY!_^%0_`j-`I5l$-mLx z{;?EXqDQ>j7QFJi)y%ZZji457+mFjZTRv+c-Fo{R^6PKZ)z?%hW4~DpqF9Jel*Y8bC2Fh5DBowbNO328QWG}&=dFO&l5l&!Zx-w*QXOXk z;?BcLZ?~Vp2CVyJ@@A_F8f~m*M2uNNnOR&-n9y3|s)x<)AJ6iiPX$(h%#}mFQQ||n zk>S;6+p$#+s(<2LF38-5$>m)I3eX0V=hpfDV?S>BW_ExwojLWkr_B{X5|>YSZ^MkP z&R9-;=c+=~->`2er{OK_n*SI5%$wjFO751YMo(*riZhZYt-ZDnuswFU53NWl`&N>{ z$ijxXp+Yeib;ZAIDWr3VeL)mG52g(c@8qiLj_pTorbEKG9GAZiB;Iv05VjV`x%GrC?Y+9WyH13cFT#Hyu@{o+I-eD&XP!Hx|}EZ`BAXS3Zp< z-lruddtV<+x%@i<_(O1diTDM#QS0Sh&%*4Waiiu0-;T0v)B63x`~l}i$YX)*G!18j zF))8)Ccj-pT<~xPr@gm9e^(Gm;Kc21+v+lOnvr%Iefdy#U!^W0r$>kL&nNry8%1H9 zD%c51+vCJ@4SS_9jp*kIsd$VNUU5}ye+m&WlFcq4@Hrq|zTb~Z0FBD^_z!qy zj@l8RYO6n8s%{#+T_E%ii}%MW{_#h|Efl1-MIZV=Kc43IKV+Bz=x8kY^|BGB5U>F6t>Sk7dcHy&PJtiI(m(I=-yh_=ImGlMfG1gk z`~Lvofp9)?;mxmC@8BVT=azc>+5a#Q zYA-bT>r?w4g%4^0hQA&S{BXVfquu$-ijYddxchVD?*01up-V!1zr(Lj?f;I(UpB(e zrtW`7;}<7thwJ~2#$OKdPY?V5ZjJAKj-MSp^q&9S8sGU?|29#$q*7X&?=voae`)B) z6+y2hMXm7Kx^EGkFjY=`$1JJVL z^8krLYC~^0KI+J&D@KxT?wI1uz=4?)6l%B5KC(0Z%OmapY)h0fWj0WEi)JxwhpQB^ z{^yS7%Iwfa=fxvW6kRU@y=lwmj5fij-n%%0QMApG53tPb&iqkrhQS4LX6g69Uqy|m z1{8{B}pqy{aNMXg5vl5{_%vAWIlynWni{^fAk@>Y_G}K1R-xX z5%ewh9n%;D{VT$40+^tC*oTNKbG9Z}ZD0=&LilzYHCmoqFC@_fFMVC&Q*!P!9_mNn zo3G|9IbFqa4<$GyiC^$r=<-dzDG{&2Zj=3lcqb4?=dJRCdiRmIn1%G?vu+TTh2@4X z3bxQUa*k0gtur5n)E9Q&-E2FXF`aY z%_eix)i7M1DuMa-jFBp7?2=0a@$o@!uTlOi=m<1K9f6pLG@Kv~#K@4g#y&Erc;XNK>{W({}Dbf!D7PzzLL!gnr@PVQ%muoF4B2SVH zDo&tgqBJ_VOJQKkLt#s8>GE*E4mw}T(**4zh*RsbF#6Y(0&UT z6Rcra!bF8-(7(Lq7nVC@zX?32Y&C2vD}z1(`|W-xTXPM8HT!T`EKXMp3L?iUO|QoB z?4?XFAW5$x8olnS>(vHJIuIOtS(1XK{aB?^^{dOi=ctv8Tdbjr>=oO+?s%gU0T@A? 
zfh&eUZ&}qH*wSfd(nfbG1)D)Cgwh%Y`vr-h$&d}OdjiW5jb%5okj?bBK=!G{*U7XO zjFA9p^RZa2GLr%hr8m9y6Mi-$1^cLY64O0r;3ckQrPqaHQ&X(0b&ja9FWzw7-`QAV z{Ym$a(e78`QwqM}1+q=GNc}h4DM~j9UTHd{7=7S~40C<;?4cXnewj<-ByW24oHXPu zDz=`$q^Tm}xt6@en@9}gJ#WFRm%Y@}01O+>0E=M-t&5{A@TQj>)C+l9#S_crs)B=f zml_Xro2#s)v$8$3ga3Vqgb)mrsR53|#20=3;9CN_DKtoUYkmG9@Hx4`!4g6(%)r;C zJ6(AwFQ>5=vwq^26r`i*er3wjI~fqbNE+Lq)~k78tzzy#Etw$AT+`c3xI}@t90n5fD>Sf#%qwYD}>gPbbqhcZ(NyYizdKK>+Da z$sA!j=$knOkkU(+2(9WY*xqibH!9o^&m5#yUxli~4gw*7>6pzjqUDR5a<3w*IP#Hi zKJv%%*}0=yKrFWL%Kt0id%ix-vQ?{alQxeW=iQ0rg*ptEI2C-SnkMBq@p)7idgc(9 z#sk*O_Q?K@h=HQdSH%vK4aoKJV1=F_nps4$sqzWDsQVfS%y;j5g<_4|M z^p*Uamow@D7XJZT(7$^R?gYue*2DCvzRjsVaS4r}tEVI@fy~5^>=7qUj0p!5KQI>K z+cYhqX1V79RF~Hp>wvkk1QbDOfdSRO8ZMo+NAgEOK%2dHH z#+ZZdA7MpMs=Ot`vqg^AP+?#3-I{0AGPRs_XGjJbi{U)2-+8`jjU*KBLlb?g&9iRo zPL?;oI`t5?e|mC4`F?`|t@X2e4wDW%BQj83SC9@-w*b0Ksj2}!*KS-`ye(XFh7>68 zrq8G~Hy8i{Y<>bN-MK!wV{kH>S)ODxe+6nuMGO9(C+4Va8Q6Pt9u-EF4p}|RghS=$ z&r*Q6On}?T|9Kh5pZ*byZuAIhgl;|<-$)KaFwwSt8iEhD(QZ$?uxkhy$GD_HShnG2 zeYU5#UU<48E`T<=tt8!lX$o+vI3G}gKeW%8$FP!Bk9459ByWG{f1?LF8T;(h#|pETnO!BKD)lWzIe z<^0>+yVfE$6g#KLi=jw(cz*SQmkJ|Hyj7++YEtAx4%&H+TmnvTF}R)|3U~oRmxTv` zIbD-5E`;x7KO{+nUs+HIjX^^%E*Pzu^V9^;{--CAl2Wj0_^9ua3V0Ro-A^Vf*?U#B zu>v)XmqzmVSK#LAD*~i>*t_ZNxI6#(9VrzE{JCH7EKR)hHY5VSaVecx)`U>`Y z3r;ijOaQYpGiwL+Od#|^7Fg0IrIsJ+=M4XP1yO^ZOukkn#n=+#glT)U=|3>g`Q?GTALrMx!YqP<;2G^`A!mmLq_)1&w3g=gF zCAS%E<7VKVFZ8SS-`JY$+567w+6Ld3MY{Pbfd_Q_k1bH1&I_cw@RZ>4{Yw{ieR%S` zU+>Dj)5p#SzW%86G>$ZyYn0;ICEdrH3*zfD%PT!?PIWyF;RfLrOW7l7VIXl7k`FBg zQ*|`8!VXS#Dk%=?D!^^k1Q$^gxNohpEkJZu(0OZow&V`7{>-&)E7a~5ONh2#1^O^m zPngl^c=xLeS_ry@`qWLXtOH@mL!1WQ$`6|N{gGGL2PF<4dB!M9Gj{Xo+4l2N zs*+|Qg31zPoraN{`FQ~!a{WJt3T_mXTfcR=(P5g|a^dTSX-b0viNi}l14ko!n^w;B z=yD<*qf)?OskrY`*Dfv8PR`&#Q;@`y;7|z0e7lSR<$oZH$+4q|h-jRa0?)sn~RZmh%dhuOF+j z;GW31;n3~Lz))Hs2s9eyL!LK8Y6IX9ICtkY?5$}XfvZabk>9#&SJ2m|Gu40umjQS= zQVqM*PgY#@8<$OY+ThEYGp4IthEA1ml^(c`o0Hi&F6iXdiWBou@049+o&$;7-bLTn z1wI@0ADLgjTJgJZRRtE&;$9_iz{q4lO;X&h}$t^c_ZpZ6*Fy7*AZ>fwXeXsk%+@N z~mMWP?yF6(5M&E@ybE^|6BG!$~ZNw1;U^}FpG`Fq~`6<@s z@p^pPLL^7R4%cP|R~|bQysLmP^CB}A338wpAWq-|B{{R8pbhmIdqvhuR^#`#98wYa z>D{@B*LwLrpZa3Oun*|r$W)w0l<0;#A1J#HDeDCN=$>wnP`=6xk{KW2^v=tdD_aV3 zJ|kT_AIQ{HrJrK3#IJnv+dj@#4mTARGrrm9V{x)hRJA7EK|;0WBu%Gbj1b#+-(qeO z!+2B5_DI>g46m;_v}iPWWd_B%$afm0x0LpdN}VP!w^4h|y!hn&zRL~QuB9>G6-T^g z+`;+J9g)FP0n=^dwFu1oR@YqbrVgx#_QDk3eO5%CdSx-qCj&ZoqKj5rnq6T6o>n6@%3QjP<3lXHLRB` zxqv}n?fyJE^j4hNdt>x?LX#x~4b51@bT62_pI#j6&70qvyKthaNB5d`=mbo?S_xwT$+ZEAOW1`Kl5^lf`E{HhZO1Ef&IexIYuE)9BPt5+ z7RT&~JvNB)!??u}7k4LYuSU30+80KYqE06Wh}8Dq@}?o9*XMRN=`>#LnQ>OcZ}ik; ztH^#rw$O7$Sw1)5L%c;+O4{r$58Qp!YXR~;4F(Ne!|i!DG&LcJj_OcZ6|vu!`BsKn zWcHpbP!b(I3{+^6ICvMoN((K@x$h~9T2}B%c1>z!#bTg;A;K=25;(Q{;;^5Azz;Bz zrL-1Nr{VR9geA`?_bG09$FS~v`mE*S`(+(!E*K{~7V4cEd1NOhPga+dTDOqiqMvY3 zNF=@GW_X#re!}A*`6-PUjRcCecIT`%&wT4vGna<76Pa!L3Cx{0rs_yo+6#KPBnk!( z&%7AkXy@f1?L?eEslIrcVrHVV_(^%B`O(k)25$P~h^DZJy0X+HvSW=zD#qa2j~^42 zKHb2nMbJlGOf&UrwBw}xY}Gh%a_HRkz|qig5S7(Rq6n0_Te=DAL9amySU4XP|B|?G zXQ_ri&Qx1?rr4aIl1r;EZ{=?qG=ZGos5YQV^^^=0AxwBA!_)@ ztCh(|y}TmxfpyJp56z-$&Cg6jm)sx=I}i<~gKf_wJtmyfyCO^4w*GEYx|Gzl)|{5O zr0k()E;G%fsFuQl>A@05-r#s!_G_BU&YF4?6W5}b8$J>3j~L`kja{yb6sX`JT2?Th zOrH-_kz0a@^*K=1q;<2HJDKqoac9YJevcqvB1~M*nj}0XOdjC$hGmf3u)E#0$Ls+) zb<9#wyinGJo>RQDsX>C>5Z%k9i>i{n6{3Y*hLIZbc>xC}TI!4@+wJ8Rt^B&+v_E%g zhV0gd1*C0fUqaRUL{()0dkMm|6E>Tpo}>Lk03ytdcA0NQ5p&spIY}=Bx!{bWTXcM> ziKG;W7Uo^+Ni9Lbf(9zBK*Ib62}x{Q%OE#D}YL9CO5`5j$?Uc_ zS-L@0U8C*=fM@!wMe)vodCMHtx@z?EvI~ywrtW)OL{=MoPE&*fTZU1Iqi$_td+9rc z()tJvQqx@K+A8pZIfYh7qcZ3=M_e`;oq*+&byEIqLrx^)d#QV4R7p{^^Mem}6E9Go 
z=M3QcLE%V*UQzxS=%?#QR_vb<>?w_V17iNJo=wRAbw<@>M-Ka4C z5eK6RCOFUMu5(t)g?!j2oUIArZSARxvEvhG^})BeHr+t4^yXL{uBRYy<=mF=(D1&l z-+$5dEb+=bu&mtl=LCkpwpNZqc~1i4K0G$(@YE(Y1Dt^&K%Z500q-5Sci?_3P*Vtz z2!W6SD{I?Sbpwgqf-MPQz0Pag+Q!DSAS{8dG zVD_NKMfHzN^wUnjDV{*8dE}#1KW&MS2oT(6iz?1)FeJR)$g9|4yQL!R+gS^gNGr4~hGv|vk;@#KOk}XZ0Hs6IaWY?=@;f zCB<|{>ABtZpz3XmyejAx=XtR%FKjaMbqJ`K=eG)8^=}4#7DfDp${>oSeZa3ps_s&O zQ`khhlc*t0J&LsCeR6uExB>WL(%|d6*h8S*2oaJdm3P`zX)_>y-`|L6kgWThmMKFwWQf(mCEp%WQS4oHr%S&NUMqP{`Y04Q@#1_>P3@B6>RouH`1VzLqaj^GP0ypXZ^o`+m;U6SZlh}NoPQ?-4ygkHQw>_gZLPLF z38W%#AHRH-CQN}q@@D%dUPhij7L06=QKn}G)yH>odh4&nU3ZXd31?hhu4K2FD(T(c zC_Tx)ZIqBcv{62*kFE76kSxY`=3fbuZ(T6eyph(boMCXU8SRq1D6uzMm;HfDT`lR= zJEC$KLuJCpPmc~)e9>P#A1DOQ=omO!AChVS%hxbEUA+pLrCc+Jw3v%PLk+}Ijm}ynB)3jF+#Kt~&DqK_vivLR?e8OAwM5oxFa4J|r66wk#tztw8 z`X%RZR}P?WF(@IF(YvQ4Nx001e1Ok;o{^mu7+eF>z~=fYH+Vko$*V1Qy3oz;CCLj> zMSIquM%xH&4Ux#*F$H(+_f(+{=G7Nwo|z|kMs%-|E|-{3tpY%zyhe~1uDP^EIY2+{ zL9cb{q^QgU;*3jFj`^X`1hXG!cU%&KQvAxc8+LE~O!mWvF3bsshL?g&g|m$GMiwH4 z?>WNVZ7>o$-gmmwFy}_!OlDAfJQyIZsGcdK#HAqJrE?Q@=`HD|{#m6fl`(lCF%)>7 z&O1W^{b?i?qkbB^3|?^!@X>4R+J2N%m$8>c?_7E%br&ZJzx@#bn>gaTWqkp{rz_`l zBeYnkb{-&hG>1S^XJ0PAiZga2*Z3@p{Ym5I1dR*0nm`}P%r0}PmBQ;I(kI9n4l9P{yo6&Y&)Yw}J7Z))asu zp(yXwE$L;N!D#3{KkD}M)5}Cmar^M7_Nluhbd-@yejJJBDdVSOM-&#kt%2>f1*>>q zSP-km8|dX#?zAJaITaa+4dIUy#H@2BkX>5r2zD+6FyPg@-8#{uCj`dZ#NB(D=Nlt- z7hd?Xq$jf0VF_6ID@YmH2Ei}2t|YDjlF}nLlNH6&kY++)L0ux|LrCiRwSmM8TcP|- znX;>7l+FcEV$&Md4~yRd_OB76oxY-ro`Y+M;}+4_8->>?+YRPWXwnU?k_(l$yauW{ zj`<2g@Uh}j9D3JGHT4D^z4kcX*otJl1OBhJPmSP`RaWiBI|}a|E0LrXC4lLNN&)d* z_q;)7hi<|~{Xk5x>G-I z&?ERUS8?uC7ASONt0y=lQ7j&8wvbSjOc1-~GH;YH7Ulx3-}Nfk-A$gh?8&K}MnN4U zQ$r03O<}4VvUhPNL^~Ztcr1>K+rT0T@U@zPs|_ z<#qi}p!85R^mxxHFSprB^>=if$?pc--_tmj2xIL?vnNUHyWhUYoJH)(P<#LIplcnX zfUKrDod5wqp}O*ngI#Eiii>AjKt zdQl@WC@XwJv8{Omx0@NSRX^s*DEBFLNYgtTfC5JiIE6ya{KQ?)D3lKFJl0UseIy1~ z&f_IR@WwU^EaZh)Rtm zs7h)pe0I?l32W#@M>@DW_f*|%v6xKP(A29)&NWOaTKb2O zdiw*Rpq|k?*%%{MZ>!+`vT|$TwTMp5=^qn2_Dn=khrJ}_A(|6Vm^~;))@9@#rfHaP z&>_8WPQ{_d);E$F1qxNG1fAK@Nin0Bl84WjaQF1EZ5WB+%H9SHACD-P(bb6e5W@vnf`1v4;y82?>0ckr9R(0eZK);2*-FjF zo>;C`%Knq<9>eok!>m;2r-!+#M_4%poIV^6xB|MFl_&sYhumznHU<5Q7^aBhZYzm1 zNk(e@{9v#d-NK9+MHMIP&Kft|=`o$*Zn)P7UfUDZXezTj4jT&{2M%L6orQ(zM#e0m-XXy=j7^yGl^FJB`PDx{f@CQ0qAz(<7U3l9reqvnv zo$q{xpMWL3nc588jlPD~>|#QS6Bsqcb5 zsNrWjg=@l{>i7G}bO`JIO5AO?x6??4GqKYmSEEI*z7_(FPQ!ufR)zWT#iT!@jOQEi zif!$a8Hz^Z5m^ASjIGEg1h|WeL8ucn6^?mM|>$hLFkQV3J zohP?B;MlyqS{abjc%|d79MhsMU$upl*(|bg^_3y-cX}j4AT?Gj50$+LL3sK=wER|? 
z-OkYl3vI@>ocCllcEjK;bT=M4H05Jm`!w?O;MwYk7zB%FjYHm>x*ROH{tx)UyD=r>PM|Y=DsL#UeZH}7wjFu}U zSI2`PCNC|QgZ)8#J0&^QYy9eiulIV9XFngM?^f#}a}M{W(@w9jScE90_q$OI?+zOp z6Ro??5u&1C5Aj_Z18)d892=~>(p-tdI7f%r zN(tv$+yKt~-tLH;xWGesN@n?sjc@?`HbDxEr;>kaGJIoFuStq>ej3=Sr4hXRf;p+M z8ow4bP^hvSW3dtEW0gH=7cJ(wm)AjD{pQPi{H%<>gB6jzzb;e# z(FGD9rMwTfSm_Dj!Akpbz8t~T77b4QI$yDThQ_VO-P_?=eqvojsA{L-?wR<=B%uFH zgwvI31+E#h?932%W-N_-U_eVbhf3Bqw8EK_)w#v4^u{UIz`q)Q)BVv`RLT>2`y+GH zXR`%{y9D$~^e1(kI%+;DiX@HR-AJ~{PY`xcorz*B%F`-NT%>UNiiDC>ttfNrJvnP% z^v-(QyaLf%!^zKXDYOuHy+4Dn;r?}Ny^A|OKs3_5MC3h{oF!gWb}IrgZMz3oJ`SQ| zit8bDB?YzCDgU+!WCj;Av+|SFJF8!|MZLk@%?>ezJ@(>ZsNxG0Ete^^gKDH^BaQtw z#KzZ1C;hYEx?eHoC9;9-R!2y#22{8zy*Z%P%V`ss=it_jbApS0rm6(Vp&|PFGV+H@xZ+^6{ot{cN00-Z*S#-xMs{nwd`ZPV-{u z^X>k*KRl5X+|NQbak8qC&FSuSeDUquOTpM_$Jg-fNCkX5QZ=8{)ze!nZNhbg!o9hd z;*%I!<;xrbV-+>Vhq!rC_Q?l@*${{x-V>b%mks2~4xi4murgr4{I2{4t( z+^&g1O(kVA8Fq2{>ULG73z7KNxwP$z6RJRkJi#dG6rQme$sVrL5(i61P#%O=U+U4G zg|vu*D)~9mCLx2F!REnuJK|Gmw?&Ov%aKr3b0LqTon4YZ;eKT}vZ2c$kl9VOj*5-X z?OefBX4h#&oAJ2Ux>7!m8=|+~MCbMQmmGETHsvPsYco`q{}{eJbkuGCwCKm207`W; z!SLm%D({WH@%-v`oqRU(N+&GF8+ zn(}~#<;KM5?)`lD{L`%J!Pwv_xK!RQ8P7KX|G8O(g5(Z)t=RqSGyXi%%!G!p&;OCC-cZ`1ugrHmXG>pk2sT|dI<`SpXb$mqpnWuPy+Fb7lid;8McX910sQQbi+S zLyNG-*l&fa&L5xFw10}?5Ra)arU|CivfEdm=q7Y&>MAQw)aH8Q13B)G&QS|-thlP= zqu;GHOpKNHLak9`evCy)e88zoBmS8|@wAfTZTnr3h2}vwIPKbXnKpi6l*qEQUO4FX zjBjV&xP1k=r(Tty2z zJ^D{iTw@<^uFcE`^SLNL)v*?+8x*HEal%LxiTEY$sfnDsK01u*8+ogc5{D(^jG`eb z$QIqVwkLev* z@?bA$CyB^gtoS%zo2_uNbv*w{!q9$;O^wOIFRYypbyh|ehgYHY^0nuH-;W1Soiz5G zs-&?7djUO#lSr-o#SRaFjMa7B7h#M6R_#V9%o4Y+2XI~)QRuZ|oDgUq(`Zv@ii;Qu zCaYYlo|cDm%bBu|gMB8?!!GHS!i;2^fa@LrQ*ljdw;koSUCg;Ltts)bWVn~FlGU=Y!&^9DMlGRA#@b$2zAemU} zAlxZ#ySCZ93FnOEOEX4{N^c$xb5KwBKco#?f(yvpC=>uyl)M)*wGSk5FW=tHHJxH`){}`|2UDoVqxl zqZFaP1%q%hfb=*UTg2mRbEk==T~G5SYg3nYC2k4pj)h*XTc3#gY1 z8)O;>$xms=VwZMATjQgD?T4&fToq|X(Wx3AU{i_ zP3KCBzCsRyNTFWs`E`9)B%OOw!u{bZqy{s5hGefN-e>S9sph~Fw&qf8+tllk@JvpG z@kLE67p7u*XGtV9tUGkP@b&2`pw|0Imxm3B2e04pl(zBg;h==LA)Oz~UpDa<(qNJb ze!HQsK-R-a;oHQfEZ^Q-bsK{ml&=f*3(j{~aJM+LD3IJnxx#Lt&b6SC!{@L1!h}L( z*0)nFiU33KU4b%7WtYk;1&*-rE%M!NiO=xaxA|P}tAt*PuZP$L7 zks)uV{!ti8*y4JQ?uClQzD#)vd>wp(rfhC_QI8IZWQ;+fN^MBnhf*-k%>N&I?;Vfz z{{N3h6ln^HXvoe=h^$T_d#}qTdu3-9m9kP~6WRN+&zw+F1fl>G_^PG60yFHi?Fa9_o2SB7p$|! zI`y|ST3q-kv|Cb+iv@4;3liEbeJ#?N;SzCZgQ%OaBWs1ik}GB|$|{T<_G=CK%T>sg)i+XCUqZt^86GR2)|!);PvU15aT+N`V=xz|i)e%7iH9G}B0;^; zN*LtfKJ5A;1{KK-k68syTed!%i6G+QwSjtoOuKa0cB==R<-P1q1Z@+D-k2hRLA3D& z&sckZB0Cb{M@b$=wZ$kNcJ{qWrNpW62fH=E&R0-M(NSF6nV`^e;VQX<(nyJx6MaD@Qg0y>9VZ?maRu7K|(uv0dp*iEn~+W@6Z4wewTxz zT+RlRL%)TJ)c|t;zhjf(6K@^XyJB|CU;{z8r~u^ot@Q3!W1%QYx5#4MR=EMBuVO&I zqc|V zAOkJ$_HxOhpV3Y04x2599%hGDKK%iJN=HPEvRFGy&hr0^E4W{!<_U3Cs=jLzyMQdU ztJ~6o0}tX$E!iF`D$a&cL)pQ}UoE`{xc~`cAd{%%idN2W{5>2V7SGWg_ujAk76CA5 z3_`Dn5Ti?y@0aHq;wP>kl67RP3RkX-0k$=^)MAO1yg#nE=i4TdGH{1t#Cw+Fh|?Sw zjkSGW*+SP$ojusqF+;{^YMc{@&VtMMwZU$ik9uExnfN$e1h0Wk^da+8)w>?>d&WjZ zAx>(ek)(+k=tRn{j}oxjfWm)pFH!I7M!JHF?3cz>T2Fn1PMBoce3i>{AVryV8$H@@ z5Ga7suEnip{`TgW2fNcSO_h+~Vl3!{_3^874lQ2s_;?&A%SMRUa0^27mioxOB9Z(_ zPW+*rG=_Hqqli3U{H5WscgNVFzXwgSdaqwb1Zxd+30zjdU!9(q z6-Ca!$4paK*R@NCBir?qsN|qn90aQ8k;v#x_k(Tp=sZ4_JF$jm|=v`21r+EOzTZN7o&#%eBue{Dgjgq*^0?BoUf~vkPx2)zx}j^ z#q9LND7#CS3*e4z7q4*AOcerpVazDtekELtDR#}g^gOcL{>WWexbHOqG4j|tL|*_R`3u1<@?q zbNPla-#{oxmSquuHW~P%#_odL81zJ4d1HsKYpdM3Y+jLL{)mcH&-NQAIP~RYzW)Zz zPMuF<9bJxFnc*8;eitY*RB?rw1t2<)R`b;x!hgonRkQq06(w~1!!|!<{a*64rM!C! 
z^LP~Q!}cwX^W(aE?nMDCJExj!Ze<}=2V&z#&+vHRH`tsBR{)XW7l^+56{L*WwT@KC zdw};v@vx*b-0lGl2qqFsAu))cQX!&b-JaIjr3f>&e+yDr5fxCncv{i4B?QWFiEyMF zA{IsI2jEZ_YOggzyHel2sEL4yxCciikQ`$>R6nj^OnP^X!}YwE3eoN?j{v%Mzed(= z=w#usx9_7!ubg(Cg^&h8+$Dk(6DLwHX~(M)VShIcki0Twsfo$fO@piBN%Sx8UZ8?> zw{w1-bl?C5;@%-ZsGEVRCvH*T5U5uZiLs9Z>{zo`cIM~@0SxQzBEI)%pRc*tz@4lU zc$A(^{Sqg>_E?y8a9ZS#zUSe`1;m5{WLvRFf7B7jWaulZsbsnv9MbgS-+0RGn2!NXO3M% zlrN3o2+gGJTc&|cxaSVcK&nz!Y}Yhyx)XfrS0n8Odn_b1e+uW`e1;zZu|EmUJ`o%*1EPT1&uT^TzON1@Wc0=B87I(J znvgvGVvYqA@)FEMZWoOX-kF+bxkz)O(LS{Vhf_^Dkjpc}dD2>~CGV>@hd@G5A9eia zDn9;3lmw0&v&bQ?-(; zCAYOh)7p)}_d?z2LIivqP}_dwJ$$&wj|y#lL!FbvYZkp@x@1DS$>C*~G)r{2p=9~x zYFSE`IV#lTOs~y=6OB6n>r<(Q16;;50c>wRew}A3uF;1j2mCWE?mBWq_Z ztekMLL~tsQa~7Mi;}9&)_bQqu^3xv9Q-Gvw-jpe-s0u;yPtO!jtohK=5U_iT2gZ#d znqOpb6hepYLgcd9#=OV_?U!GVP()B*U4Kf=;t>=u{pmDjQ41N4vE?J{65*O$YIvjp zDSedIJ#X8jNACAe3vlr)SqB~-h&-6Pw|aFEx^HaH9L#*NKV0#bkRXtXC!pg61;wa=@yAq-YAo*yTiH8WPa4YKF*7ohx}>a2 z-kkM;=0->h^vguxbe;Ho2HSYLsY(bxRruJfqNiESpo5mAy9r~fqbrNYW zNysexoK5h}jP_lL4K!1Xvns$H&{rSHA@dm;`)&ae!EOT)1Gm!Ne)zhwp6E3`W3ZU}wALS*2D?olZvV}+a>blzB`O%SB6 zLlsul>~kZO6?FWJtkW(yL?mqXA#MkP({`|{ccm=9Pr96~IuMzz+J->BhW4j2h#U)R zjj`ZlbjkDArf#^^I*mFU$mk%3%-sZzIn?PJ@%cfl%x>I)FQHYg=0sl}NRv8YC>oCC z3fs$4S7YkXvJEnbmpXIsNUNl^JOa9r*J_GVIp=)~!Qv?#mQ*zAfxZy)ay4)!UpPQ0 zJVI6i=9lbfLriJh>7O!i!k>5X(!YaZtH1G=4AS3>KwQR+a-JxiUG`|my+sO9kDTH6 zWGd)9~ zU5~EJrz!7v3!#N=JoM!FdqBs$xHfngw~3%;Tn*1Hq*HPf1mnW?+!X<9(W-k1O0d^N z9k_RuJ+FE+gV)satP3q;x!VbNzo-fTRTt|BHwW4H*Bv{Rr=Tg;z_*iM&+~H?)B8r; z@3H0Kmf%hIny|e+Ht3qazI?12IlJq_*PGv*M1uS$kg+uSwsL#dj34qJJ!bU+f_Oxs z)4b1;5F|%oTS+`gokR+{IBB_6>o{a~A{Xrlqf3UTcxqvgD#lKmIugH0=RAMQRwq z$L1w{;W)RqgvoOFQv+5(69ib&=4Fzi(&(q@>EmYgz2C%Op!Oc9s895Ki$r=n$INJ6 zC$-=2Y4nmlBjLIBt`(p*@};6|phRm)31BOxNpQ%}%Ti}nakCQB`x^=rR`psaW4q@c z+$W%LgLtlwtPYAjF98`DjcR#2Um^r)sz%$PQ#D!1(6uWh64>oLiQt`4x}Nw8Mp3`8 z2cn2m0gaYkgq0c@|G+}UuJvBi2sxvyU*lL$@jgdTi&}zrNd}#ejf(dhY}Abkg9E9! 
z%7|N|xZ?>m;fw8v(lTvG$LsL;eHh2-6W1MDX+9&#OOo=uMzApf%$I`<0OF9nd;vk$ zal1p*Ez_EH_cSAg3r#p{{Wj#%2b6|wwx(z)EWlsUNltjm$HbNQgpEJIo2RiS6lr~( z1O+>pdu!Ea&2X%mRy4_bNy^<~5Kv)QfX*0K)WVAEz>-k|3yluVv{6^?2fC<4@LfyU zksddNtCb#O)UIEjBW`W-G5C-p`vE>v4|)Hz(0hR9ibg5XSDYw&7Q)BKnNb$rmD-4dk!;49cO0``1jT}F<0XSwYYY(knd5ZL>n6w}4RLLN z;tTcw!}Ft@{USk zabYc5z7?uY3V|L%!>NN&%9iM~p2d;);a39%QFGULS45|k9g!^|->tf%y39Ld!jj== z70{L8zY7%s(#32Y=k|O_Mq{%9zQWEcbG>kprkV=D1GoirYA;UN^hqFZR_(8Maj%NdDS=pQ^)|qwVBsd&dh2-J@{7~5;m=DcqR$~ZUr2t7- z6NCnCKA8|_6=W;jD@p{+M5h`Gf^nV%vzO8DSG#TEcC<#|H#e)gKxG7h(8HlrxsAvX z)^DD#gLc^{Rsck7h}aH_1x6rE-m_Z>W_1#f;`Xy!dM|C-#Oafue9Z4Yj_oz;Kp#oI zQPf`aBTZ1_U=P#N(s;T$716_vSe&AVZuu-zFuoc13|%dLt0yKXs*UObk0s9Iis%T} zjdTA=>BEyB2a?v8lfSd=t>D2R#*G-^p}>FJi)d9kQ<`)-RN7;*HIua}o%XI;-3fdn z0A4JOvfbmwo_EpmdzAwaKB2KOjt5A6-WQ)0uI;|VNXc=wESJ**tWGlID0q3~*jd=F zw>3cHbmPNcst>d8!-Yb#E`1uL3HP&nG(`^4?A*?2Oywc---(n&@sWmH5}sR-G3%a4 zG~E$G_8ElM_xPaWxBcj~uk20y)^tU}ORT5s#0KO#%Lg{VU|o2ZlSS-lMYIKzB?Dz+ zHx%H{dbK-&;-n37CSOQ)!SjOzQUOE1jSRp|%pM#~Pelw>L%A^7PZk?G?R1H|mWRgLdt$*F{hM!GwMt!Vs*{gC~N zV44@X4S5+EB=HY;-v^f$!>`WcA&^dcXzf!nst(n5nXK;+5uwYjg6mq7$p59D_h~b1 zIieAYU^Q4Qzd^@!gy7D6U}F^0dGAcY;JGf^WH0um6|C zdHc`BWf|>rRESMa%}SR2rqb&KML+~CGjm>-d}zQ0kr;;^a|_zW?J!WoYe&Gxd50d zF@;Ki6f{v-&W5t)RN>!O~u6!?yCRVESYJ=K|`1?}VW*2^t}6p9CFeX@sZ&sBkiDvFrJ<+nHpyINPN- zN53DK2Mpgj6lIf;2a8M>g3gvn%E2|!83f@ZiC9MzaG03gAZdXY6z)U%<@u2o(MeF> zP1QP^!gy}D36h$cL~YIC%9KS{aZmcSJr-PgMe?Q>3F76s7yoj*u*3NG7gZ~T?eokg zSSl?y0;yPW4QW+JORHC2_;#j^-?w@{=@Xr-7%1sYLfDeJtfpL9v=-z7)lXVaujR{d ze%Mdo$pGSg38H~fD~ZEwZtqA{N1CmTYQdT+P>x+?n*;aZC1MZ|1gCKqZFb8%{ZA0; z-yuEh9QXyDQjObY89?Bx9XE*hl97X|hNC2x)It+&_Z~u+61P>c6C#1IE@*BpfJiS1 z63`eG2|R+AfXFa~i^)7e$?tG{6`%)09l<-SR}%1k5JF(D_!-gRHJUfwUdGBI#4FVf zWiyu68Tb}+5ZX*a^}?EE7hHlDGJ@+P1(5dU*3>@m!?yo0RR=V{Lg~&`UN%Q(B}+*g z9`O|>-Akyhm_Slb!`l2i@ni%p^dpM6faQCMIOg`yub#4pCS2}RZD}+gsv&uDUPYFF zz6J_uMZUt`RfP1>&ybePB!oZ1z{)rnDH+;-ync{N!2X*krR59J>3pzZUD|cLjDIQU(sWQUOP#^$weSoWu)-cL2D;}hlgS0V?ZR?Pyu&(DSwEdSv) z{$D?M3L^B$C(Anj#9sSX>+@sAJhhQ`{QDh0FTn3N{hziA@|%C>>fe3!<1YARbNsP^ z{yl^L+N*!COa8UH|L@;XjdBGDYa`FQ*q<9t@gh&WhF~q~1Kpn`Sy%`lMSvrz@@L2L zaEY+IhO}PJ=MYtG-VuB5UwI>cTtpvrAXeXAF8))_0gph6#M7qwxNPK=$y%R0o1(n{ z{13adL4OC(27@0f3pF@@Sf@8+AGJ9@ZOYF4*>#e`_uMFov=JaeVxz~FfbdpM&EVVf zHazPI%b-(3BT#9oX;(GZPn5c6K)FOB{X2hl>isGtcYgyv{neo#oBn``uxnG*EFUQX z`|zZO=bpsLQ${|ns*30~!0;*7vhr~DULW@u7dS!sAJJsE-pjNddEgKQu$|BG)Y${+ ze>}*4{BQ%@E2D{HIQx)~_?TSvTJK5kr8@l|1Ne`3{PHJZ5rik+zAlyQkBRxeeqfLU zQg8L3`hTuZNeTkSYOPcWs9(PPhx7I8cg9@;Qa(d*ra#f?|N4?2KK9xSX6kjEc1XcRlBS^f2{^(&qofZGZo~KOW}a1^IVDe(Z+7 z-!cE5kbh0qKiDV#nyi2CkRPto-)BJnz0}i1{=sqjK7s#U>i>FK|CeJ8H}}78@Aq=W z-#alsPTaq4@4s&EkKOS1JLX@v_g}a7AMBI=pWI%fRFiTgzrEeMbW<17H(Cz2Me?td z<^j*}dLgjHQ8ZQby@|vvWqZ{fJ(BRthV{8LI)`GX-xTRMK%Kk4 z>%p=b{$CClxe0aS@LYhdppDLc`4-x%lAPg8$oEmxg1FwL6N>UkvV{FEE&>!WpeYnA z>E%Mmnxketb9x=*f9fdt>0*tOz%4gJpYApSTFIXIslDz>0Z>x@WM|8~CqkJ{xbzuo z=0cj;Bcpou&-uhZ=X&B4aQgu+n4z2GSX__%K{ystKP!X-W_w5(KHPcp!sl#H1}9Vk zmY6=VsE___LiVIcTrbR?j0)3|f)2@pa_e!*AiT0lJ@q2IoPGgI*MT z8orXq-+!8HoW=|neAeAMUlwG=JXc^`McH^0lcgL$^D={%^$`>C%OZ zd(1`4mYA-}AEW5q^JH0|oN3`BX|y-2_qad=&s#JyfWORi_4zUK-k*C${*=mk!72w} zJ8gW)L6JjyGmJxa{LB9z}(exx-8=rnh z&)~aGvN)v9q<_~*u0Ag}gzkjFo_Xw_^=CPYpX|N_83VM!p}lD5$loDJJ;<06keyl6 zGTD!Oah#NAo(D3L%d5V5e>)OAV)zWB8Y#3_wf*N$dkJR~22gK2|NQXYjmYYzDKsaZL4*OKrV|Az|z*-3eLJ==wm+V~AU`~N;u>#Y^nzY=+Zr~J^- z%{K;{$O@lRggh#ppU>Wxo=Z}>mqk{oPzt@lRtEozLZ;kP;cGKoK-am$RoxK^lPAi@Y$W!U+6A+L^lW(@F>>d9wyvV)DDJ|GfBKe<- z_5C-8%Z2mdtjoEOp4mvi-&;`AJb<_RvWN|#WB8Z{++9a4mhU{5#3)sOA%6B{VIA~lhK3g^CDnGPh 
za!~DWRnFL#aGhZ~fB_k%WB=ZCT%m!Zbeh)r`*J@K-bQxtM9yFB|L;RtQaQJGq|5sc zcI{c92h8Wx?%Y|8iA0`WI1vA`N7k6CnZUq7mA~C=hl>Fc@?~JRXm99zC&OP9u0h-5 zHD_QQii+lH0R#<uFK>JE=~llGXvWz+?tZ%`fW6Q2f+uj=uja(B{~6Nu zQ4N}`3}|?fhk?^tfIU18>>J2ueT)=V(P(uM=hDd_uo(RERt&*wtt z&YZ>_^EOR!`P1lyIdOZ;BY#)o-kT|8?tJm4dd+H&+~5z=>yO86{9DTo(kt2Uyno<6 zk?jlbuV!Gs(YlAxHw8>2>wjPg!HCzxW3Mc=0iG zJ?41|P=G_&8XJs5BY&pld&c-|b(E7c|GQ!MzMNNRa{%W)(f5jxG2&0(aZdPiVz5lT z2szJT8!MS#I&t?l&hJn0^Av)O@C5zztItZMlaMp}$L{^#eh^-SqYzabgy6P*{kXsW z&EZ5)nDy+lWZF~x{zvbLL_ChNkm!H(Vq_w8&w)>Hbi_*0^dAliHdz@AYbtsb}0crx}L z?m5Xhb@rgrfDqCDV$qHZS5?9S>|~Lj{I6!j^VXqfm&iZ#*Gka*v-SFYoj&W_zkPjI z#oy<@ni1iPu-g73Rt8D`-RAnnCY%>{kGb~de>EfLf?%~pZHsf;ejoHde8RO;=hQz| zk{Vq2hp+#>)Bing|7kD&J#N1o#($67f7*-xH;!AC?Q>1@AJ@${Mt0W~3p;2Qh=RVH z!7X40O?F-!GlEPT7k!L5jop& z;d6{t>%j$|N&n`1?hAjx<+o<-d&+p?^VSz~0u|>#AN22)p4A9nWYTIG?R6B1%v4{b(+-vl|6)V-dCO;U zAYiQM`6cqE?OIgWEnzJsg}od5EzKS1CW=Hz{6X6ROCgomY^B9Z+a{bQD3&~xv~hkKp@U3qZe z$YuA<6|NXQ2Z^J-Cm?Fha=3eZ|5#I$u?+wG&3xTAxY%xlbNnI^ zre1DROt^CA^+=Fp-2Zu7i?m|EU# zpY|Z#3+i&>jYlZ1u=|m*NKmoNEF+9ND=&EnYOdd3H15{kf-jJ2c`4A;Yh7u%L=y>Q zLS34`MVhS@TBlc;f8>{Yz8km3JWo2+Cv4w*b>w#7OtVojE`Zl z6(a6Rw_Uh|HhXL+l5Q%1NEheZPqqbFEqs3SAj`*#R*Ri8u)GQ;^lQOD$SsnN1Iz@2 zXr0QSv)J4^!~G|Ot4<&!^^K|W35VLQPNgQ8C1)sSgiiLA*fqHpp91V@u?Cj|!A%w9 z39{SSRyLLx`7m|v*aukMcb8|(3o zd?QP``N(H;7AkCaV0kVw&POJSglj*)q~=*Z58AJk_{@^Zpy6yXbD%`2;dVvm`ESsL z7iraxiGZm-Xa2yUn)2!$T@?XRm@454TpCgT_fEvdTc9M+X_5DV&a_<{8m*0hNk=-P zJ*ubB{F+v<WcuubZsh0s^4Mo)9zn-^RBAT>65`|Ip7awdd5y zbJ~jToBXUAQe||ku0b=x=gM%e3z~PS5Tgm_DvX-EPJP}|H_2c@8k?!kX?%wE z3bS}B%!|yaMA%H_ru`U)t~I4|#|>dx4PAjx3V#eVJql+fjtBuENc35xOHQzBp8OY| zP>u@rSh|5wohh>qw#L*tkA>i&;<*XqrXGnQt}lJqOyzI5(L&;iLtNs1Ipr7BbAEFj zfeV8N_`RQ==CRU@GiP^OyOZE|4Bzt{{%qy0haL1wMpf?a^z!vWQ(v74KQVsR&BNeY zoyFxeYRq}6Tde&TSdVJDC{Fe-gj9*xDM(N6z`cRqN>u%SJ-`4fyNSQiMjH-HDXO{>pFV6!(=_M6XPjZ59nY;(CJ(o@mhsWj?n2)pwmcTx{r1nMN8)%rE zcA(O=R zsplKvw}cU%NId|1{A+n%j@}JB;7a1vI%<4OO9ipuDGmDDNxuZ{N@%b|Q~j%YgSJCl zdV@DI&*ILX{H$)bwZf>OB#6^|*2#Q`YnC-6omM?ZVS%xs9B5DiJWeQ?tOKcW6G%?U zO^irAzMoMAj0Lxinj%N}q(b%|JbLAZ_hF7gbGy^I7dZYWs$*i6OOc`4yWD_`P91Bw z^0nLTWziJ-|GeOP9wU_+XNapaADvK;RAU}rP^_l_$9rD;EM5%OScJpv5eKZc5l2TcFtb1H_uSA zJzj(;p>j&l7Q8#QcT9;*$OhmB%Kyk#^uCa<$@k!RmZ}4@RAVirwFRo!YG%uFpv<|) zKFK4YViNcccvGTEfrb1{>biQNZu@H+gM_ZUj-@GJ82h@FVeb?PW5@&FNF(zia*aZd zrd?(=m+ov|N!J$;k@8JbMZa`-J&0qr{4tAi4jMZLGNCis!&tFxeyIURL$P`a<6JVD z$5xG@bA!t27;vF-&QB77TX z{R%jayr=RGjHthImW>tcpWtuLO}N(+yF86`n)L*Jvg~f)V`>5ibjoF>UEq41u8}GD zh7%fAg;bK5V$)4dzib~fLTB)Qn{5&;u{Jabh*4Zm*|g7HedC6ej7_KwJUgdpx9lvs zN~V>7bc?sADbgDc8d^zaugYWQTrDLl##`b={GwZUv==RU3ewWxa>g=r>0wfOJVH`* zk|txb5S1CxSQ{WProzhC22dJ<3eUa#cA0A>M^lJ@j^NJifW7zXthr!U zeW&i?eB|d#l3Xcfex2`IS1v`}mEI9Mof~Pvsn6Al88@DLvlQtDXXhzsd_KV@&h<)^ zH8d-v)u3{uI31>x>~^2+M5@NlN44f3`f;I~MJv1+wVJ#b67{UFK; z^PyYptlPKi&FX>V9B(s0De7rEa1%~mWE_cR_b^T%LXIt5a)kJ!p*BuL~eD0ZZ0V=l|GAj}g}y#-hag#kpGd5YNj zMb0Pi-)edbDrb5M?aJBgr@NZ^;LMv?6#3v&?y|h+K!Z{BXqrH% zMS>Giiu1c{MSZrC^x~ZPf>LvqbwNRcXJJhum6dIapCaAipHf9`qvLBP5&{K#x??Xh zC$4F+^)D`|3-@79PD1E5$kl>x$)DyV3)U8xkqS_i(1q2P<<|J4>jNoLCIWuO^ z$YVf~N>x0SPbb8FZpC1KfGco4TqS&D^&vZLap;#=62he?SdwDvYVwR$vXI3RX&bK( zX+GGCEM9j-%IFB*wKXQ747l0~1R7Y4%aS!)A`8oto?wq>)&XT68l6)B#SbPR|rt7Y=r8C}zhea?)C3NLhyABydZd zV=0gdXMe*bFSn>DX>1n3^i_n#CY?b~iiP(m6<o%=6M^ijki1?jur__k|SCsk7~t5HYHl5h+^JF~tG*Rjq`3Q{Ea` zZ>KhfyMDW!?d&~+iY%$%^X@Pl2ddyu-uKJfZyymgWfc#Yy}Ua_IKEK~glVNu!%%7F z4^ayX8XZH)J0+xrJ4TXZ_LMpfK6W3u`&7`wV&fS>uI0q8oGH8D)A|R!<%$Vn3L%+w zpt@wgeK}0SXEkj)d)ikJJ#k9b!Rc3sG1Nbi5sts=R(Kv8uyS zOo;k!(XFhDLp$4>rALyF{%qRfB=DYKM=BGLU8JHkR3g3OGOKtTZR5l!VmVZD9-_jD 
zo^kQaTB~5b#E$t$2_f^YBD&@`9W%|#k!sScuk2OzxdyZ!jhQ@L-Xe6(6m@-YH(-@R zed<-AWcj|aSk`aArVrvaqcK4bN%)sFI#A&n8D>6{A$Q4dw;oZQPU+Gw3;M8iNs&;> z?6frD$#*1$BZ;9nosaSbt4=9&pq`=lm4?KQi@f;?Bqx2Q5u*L!WGEy7!RJE^{5rAn zUgEsID2h^GbI9^TcvAd#))~#Nx2f1?GQUaxweSd zn{PTCI&zb5XG=94q`q6VG|^g(WusLXPr*v)wsWKmvt^5%U9qPs1$LLDI^b_W-#I!r}|tN&FpfhB(MXwM|LLCS;0{xHJ{ znL3&CwP}lo7N^%2$0@Y}9b}iH7{W0En4@X5y$Dr{vH8Qa_I2m1>}Mgmk<75Y5z#vV zL9m0Ax*gX4@fB}{+zkjR1a)b033C>*6AWe$-K zotW)U-(tag! zj2ej_upcO|n0&UMpX6h|9U?)l;PiOvK=H@;MU$%gI9xtg%~<9@f}_weWzv|pJw=v{ z?>V-`mWksjaZCtaij~L%CbJ}^%I0su?BuoPrs)r;h11?CYaMg2AVqa-jK*Dm;7`mn ztR%RzW{^TT0^qpw&zcm7LL?q;jzwAHM(Bha#b_jr^RqRP8O7-D*ZOxg^tUGMWi~Q`g>KhoejfYHKe}~PQR@ngr%U`KgBwr!u?5i zKfQA=cVCt^p=VPhmmTGZ8<~RnCz4;n>AMiqh|dn{gB#)BY0?nCgLobbYnSSnhqy-N zWLO`lNw>XS@x2=$T}%FB0kxgGSrwAh}4)b^rEi^NayLw1qqHPB)$gC zoO-?@kHYxu4O@JfkcYVt7h>11SbX}ZE_yy|?RAsfSg~; zq)%iR<^Yb6Hui%AUv^$&XU~fuAum>mr;4kQNit>foFzr^v(L)CcC6YF&Tp$unSMwT z+(zQ5={sl~gSw}MN4o?~`d;Tc+zmhQURX=!l*ZEZt-?DgcvoqO4#>YCKhwc=@yRlw zJ~d{CX;NM1f){Jg!70b~jF3!rZAnh57vHYh4AnGFc@0c&RX~c3D3aU%fs3c@v&{q6Z!Qz-c>KlX8K29I^kl4XB<$iG%>9s-y2cV_7mi3^dX&btJIW|6; zF+(U)p$>wx0jxXMNodbA+VQETAwfT+Jeq(b%^YG=u9SuX_^ot|z2j6mOu5i4k7yExo#+V21hWVvGwlTs zZ% z&%jtpHC2;TvQRS2uG#x(RMJAvI8bNMVl?Z#xo3dtmK|Uqu@1}*TZm@1sOVIM=HfDw zt%|}J#6!bes6=ms)JU|>IXiwD3n^(!kR_!`xn}NuzMVIcnOEIMeo93V%UMiZWS-W` z*E6+r-V%7mY<4=)I)f7~7lSnk8!=%qActAhEt;94?opYHS(2nn_LV= z{Dm04JD-JHLen}G93LO0idK2iADU#eP##p)DN!hRk4$07fZU;U)={c#d!<)~=mF-e z;`gxikMh_Mqi|WRd*wy|_8}Lnzwd?}B}CA5MoDHqSUhIg%;c*rW`W?IG0v`u$3&_% z1Z3(^HenH2jTeT%_Zzv`RcJjZXxUQ}1&Od;2EA`Utywhtpysv5 z&J@46RcMWyqa#x5H!@h)Hdg1XxqBeK(0iw1 zLTMq=-_w<)f#v>$Lf2fO3OqDYN=jQ{R+&MlSt&wjQ}mG1fh{AOAOXeQcjv}M~}pM+Fza3^jiF-K=mROTsGO_Nc1Y)*fv6Ay64RXRlfc3>hh z;~^*QA~T`Nha5hvrPlGsUn(H2@?8A_S+H8QfTIoU{KbqbOOTe$)SYjK z4K(o4@@S1&pVC%m$Yn^B88_9KZBa3o)Lu{tW>vCyDyq~R%U)+zAOOs?2uh zcMo86wu-Y8rXVit2)amXI$*b zHbhovyF)zj+tge6>n;xB#>_%HYYDB7&!f9CUpVk1sZ*;-Thgpxl(SNXXo)YNse99O zS{Ng$LLIV+qHz1cr_;?AAs2|2oAxc*PCmW#wYxoEqtbDvCqkVyD+Q@+ab6e6+Sd~l zN0O;Zr2dG6>tL39=-qEYyn!I<@P?Q5^0eig;Wmf@l^j*M+yaJc0uoq{dPq34n$J>U zwM8oXK62f_YHBDXV|0{)GKfXDNea`QL3yw=}lE~{Q@qmzedw zL39fnxJ|N3hTF>1u^2X%%B=gel_g$OYAc(mqlH>& zOumed&HA*C@OY`9amJSGT;KBGoIr3i&(>qe4gZmebS6x!HSY+L8S1{L&6&3F+z0#m z?7nxkdtPj)_e_cOD-OaTt^uJU){c<70h`NlnuV~jleYOAvTv3ple9$A%9)hG4ZJxY zoVsjx##B?Ei!gVo^7@ELVl#)uxU7Ev+cO!hNG|(p+UFgn-Sf}uUYyQIqw3QS4?gSJ zaeKd(e`kGk7Q-v+p&B2q2c63~Eb?+RjNJ^&dY&x28Ajf`+6=nWAq=Veroh!5j}#u~ zHaAqx;L1nJpq4p`t5B)E+ON?giqNtPoW$=$JwJL^$j1xi-8BhS1}=;z8R`W?@YOe$ z$6sBfoDqXRMXLBb+E9K-Y;PfQKth)wZ{oEQvxJpMmH^qwC#tFq5;b`tjStm&@20C> z&V<4EptOCwv=_6Mav%rCJC0T*4hT|PGOUtq@WC8qIDxS0t@afFr+;UvtC&XXgAhoS zHF00Pyo?kjjpO6CLZU-&$<6n$&m5#`!K|c34c(M~+luIxz*BwED+{IT>RnOwnURob zfnQ8^zKzN%6gr|B@9@Ge`%BvDY@=oCI=9|iJRM}@{a2^&DylMcDl8?LjJ$u&dlG;2 zm7REHG(yCZ#lLYmp^MLtMZjf=9bLBI+YOpadBpf!bIHm zi%bqa;)Sc3UR!a);G6^DMWST07hfEV{O+o1IfMjX)zv5nA|y#vT8-bhZ^nGccB=X} zGx=$ecQ0&)Yt`vhB@L!fR|m=JD0r;CN{x5UWMXa0F(;IHG9c%jo1JfYKM1FcIN73| zcB&lZ0`<7KZs3DGnXs{u(bZQ#Rg@?}dne8H(>cG2rHpr1isQRNeOa`#o-#h40LH=P zLIO5(G@hgmC;HwW9$$4t|%O!a}@1)<$VAsIDDr25mzN6M6kAumBfjV3h66n%Vq&&5vbd-v$BwCRjb=g6F7 z8J($UCY`DLduDyEl1`@Q3C~@#yw1`zf%coG5zb&DJjx#WmKi9`Be@og@ z+kLrHW}49_yG$VkYp(S=yK-kEvuo*vl8T9ZaT!}hDs@`FAMWd-8uC@fphCC_Uh&SA zEe4UEMpJvN=|Qc;%O7R1g#Up#?JEfC+FRldzN62%SaV%4J#Z0VsHmrJP~sJfFB2M2`VZ=*p;^7;=#7Q~(<|zyg`RpCUYIfEZ z%O_Z++4$eelI>S&%<_f@JPLfUmlQ9k43C@S2Hrq&OuUJGyc7|j<+7PmYgVvbV_xq- zvK3{lQX!Iik?^+1x>5|rWl4gY9E!jtko~=qZqEDV;YrO>A$yI`>BE{#o#nAvNhMG% z>(AFv59>BmUY_mm?OEuCtM^M}UvLkt-PqyIBU zI$d94_I*r;@`Y3l`Qp3RLlKORgCpj=71YSHs~57C%udXC9;9X`I>ovaorI#yqph0q 
zR`D+!_mbq#(JFOxW;u`D`H1#kV9{Re*dNMpik76?msq9aH(ib-!-mk8THn0qQIY1Z zM7-WF_^qb%uGW9>n$rHH(s6Me1cgi~zh-E!W~b8eZFW~db@sHo5kmnYlhwtBZ)UR& z+~7b4iRV4Tkq4LN%Aso=r%4AbIx*K$-**ZQg3^JaQsx8Iyg zia7Ptu{zF*Q%rtnK61%@glz9Yh@4bRB=Buo8JtY4OZS zbzthU9SAbB7tMMivZ`gp8kG#W0Npy+%g>}f=g1w3UU4sKz_gK2vQ*VV7N&;csBsc86ok`tbo?W*~(y%R$H-WnqWS%ZG$-D z!&BFBSixWtTUV0M}$KMSEj5mc;rS9bCYp~Ng`ZY*9z zD(;0_<(~M;m6QR6Gd*lr!KO* z2FK5tO~}p*?uAH1hYa!V>(AVFM^q`*ADdP+MsNx|KE)?TsA<-k^Kn*@GiSLbw8ThP&SG%&H zh}C6kfxR@1cb-yr6SuX%R52ft@L_J2{n>^-k5r~mk!lsaI<)cNSX~_ty`(m=^_D$m zW~~*&h{N|J4}>;(vFyEEtWOM;^r_7pI8}=$J-RYnS7$~0+c2c)7JRE=`l%uic&TY9 z+pQg~h+XwQ$nQyQg;da%Cfn;)qnFiXvrNO%GvVT>orw;#eQTGN8P4|e89Rm3n=d=W zHR*bF@pQ`Xi&P7Xbfb)JdyN@;ZHOx=#L;y&nU**sxAY`hyyw%?vjNvL=_W^3eANP? zB)&LYA2%VXpnD^A8d)Ytz_M@hU^UIgDLlWtIrOQ*&5ZHE!DHN0^{v+O_m!LCdVH~Y z7VQ_OI4iekr)mZJYWw_|sVeTPcT4E2DfNfOcBrd@TlEfG1~IAcY#Ukg=eHFOJI}R-dNTeA6+h~ za`3vx>`+0{&Sk4PyU87E+cgJb6rx(!))}QEuW-zp0@L}EVb}? zTr|%)J#0JJ(w8k!E(9vGSvngESn*hAN%xnZ1Q$P1e@w|fi~a>{GDI(2Mpq)s8Jp`O z5S%MeC*IsNnDN`#$vxA*MMdLJ%Hr8LeM^7lf!ovQ@RVL1eJ(4e)-)Ow#kGn<3up5C zqsj$Oed>yaG-mSUWj)`{TL4~he!4#hzhU^RE8|pX8*>B`M}l6px4LXl&DD2XS&*A_ zSkTAt$i$>!(JLK>mPU+qe50k7lA)JAXg1}QqgBz`x(#`j#l0R3j<2SydQ}i+`^;JI zaug|oC$h|sr^XSha_mSYlGb&xRd-W6X-6em^_f7qXGhrX%Ne8b#`hTDa6~0g+Eyo@*nU0R zd$mc*Ot4*~Vk^Emnv=Zw+K-NbXxwA_9TAH;n~~vP4p2$HZ4Ivhj>nODeca;x?4{2o zMGK(TGmAM4^0!`^XcGOLWK{e^qSHd0QrvhxR`=~9KZfvFR^>COJyw@W(sjg+*_{S{n5tbD{Xgqz&RR zj6KJdZ>_gl-~~#_0sN4Wjx$Tejm83lxR6v_-TH{#Wkbq1mt1$aEDLa4|5+-KU5LK zehh#}uSPc(fmd^JRdak9@A?m|&GGZ#(ep;|{mewp8<;)mY=5HE{=|H%yD}$PMET*R zY0j)Ss`qGpDHlt7x$q)B-Q0MQPmMD6k7W7t?IVj%sYAA3JSzQgR9d&Fv?u3eIMdm} zgS6Xp&?!12?e7T`1jw%k{pn+a-5zhCK=;`x^v-xW7-n1y4a+J#S&%Isk|kd(JogP) z2ZG>-B~C#@+lR$VllVEelCcw&=#qzQz~Y|Tf;!-w%|FWB>A0)YasQmHHG{`lvslxb z=ObOk1;g#xDIaKAY>Sl&OsX(tYc^b}o1VS|gd0=NGE6#rEw{U%pKi1ve+qEH_Kgc9 zqq|eUB=|febDHKkSqoU24=AOzOA;)I3Ou=&(E{!?10yW%4N&V22t3&0GXW}Nv#RC| z@f+BVzr^xJZAyi+&VS_WTR>wI#pZQfESW8oquD;I13Xy*3h`4wzctEl*li0+ z&zi+&ZB>2G1e7FZK~0TUB0r7EZ5s?s-tHWC8V;T41~F4gSZIhORhZnL5v3wu!Zs4~ zD2}CoBH0CUntB~iL{9oI>}ivrwmoQm-ba{~JgLzaHB32sXiZ5a&z$~k5(Y4162|PKLI*dP7Ub-LhcUL%|%B%4^x!$ z0+iU)Fn0jP;~`I@vZZhZ=&vVZ;!Lu|v6Ci#usqW~4;*jk9<_*UF9WXWvsb9pt0)AC-0nv5s0^jyUWWd{Z?e`yFmuujad|# zVHC0LP{TM<)(gCuI&1X8ehFfS;Ai_&TAt`CskXn)H0uWja8`w{Nk*PMqarL;A zEjU>Arib;>wmpgOWbYYBpqA*Cy3jJC=mZM6lp1mkSM6wD?zaNG(O8TiR_(Z5HAkB) z!a1oNxgu~uh=Gp*p|mOZuba6K!~$L5p#Heq6gP+fUD#we*@#>~0AT}^*8w14P5M(zbY+;)}%fxeOf;I+sLC>%*yAOU^1j03 zha3!ZWkxe;*4Tk{BFE)5j4-i>q0vnA30>BEb-v~Lr-}x=7y4to4_ulbq17b31Hk%^J87b zU!tQTAGA;~M!7)n@!&k1OwZH2eHVNHhblQBfo=!312;dTdh(gzqlYgcvbqSEn*fOic zE3Y>z&p$1U;gfcV)zg%HBdfLSf~UsQWM>hW?BMXn+~?Sf8GlrMEwe=LD_B58y2=u2 zo%j7v3LZn+juQa040#-L-eEK=7c?U)a<6CBI0*2J+i`#V7Nq?Zv;SJu4|*pMbwsu! z%ZDHXc(@nm)sMD@T-0z|3cje}=ht7`T8d|Mh~2cBMwXGBuYx4!k+S+ zr1>s_7!>32`6euO)N~?1y%D_Mu7wRyNd`S8Cb>N&)gnpsFh7aQVn}3m-|Y#^!)5p! zO>xpyvVMr*GH>xr_fY$e!#1~NVGjEPlUxu(n1)gxvFm|^-@AVOA3LoEnw~y&r_7`T zg6=Z`gp?I>q((=aQIx1#;do4^3&^uI#Uk?iTOs}db|P`9i`^15Yw2J*%2!Ej@?wrG zG`~*dF52>r{lv^dGZE^+*VFFLJCC;PS!<(U4#<+X-2w>jnA;5~kJGn0a!hZzwJtmU z(6>87q_gd|sBl(qpl~$CddmSAENT;OKa6lT(cmVzc5^U(M@?YS?aUiDh$fW!q2?3~ zuyYWZ(X;2x*JondMfB3`Lgyj>Kfb;lhQN03gik*J)o&gDL7i-<>=HoWd1|tCmQ3AQ zPeXMf#x4DqF#AmCzJ| zA5-1+cI!Xfr)my97m^o!JRHa(nI)0@!i!i;#2X4(h-T#I{}i9e)6Iywi%&TB8Ad6B zEekOX!BH}2`We!w{{6F=X`%(B3ao@DC~k42cgXC!6vpiOWXa+#ss?K><*0j?r|b52 zC_i9T-qQKWc0yPYLXZHUljXpIxV3JfEhdGLfj8-3q?IlKu`*Dv)s-)*G69g%weXDq zaH&lLcH=-iD|@f=h8@#p@HjUK^bM*e{3LS! 
zi)6-~WX4x5o+5l_KMaVDEb3_(;QQrGbufOawc7Zjup4s`A%9}DW8|X=5M{`W z=ib@FzP@In?w$Zz4BC4=ky_yWoF&V?_vBSxDCidsR+KoZ%OQL&3FOw_rB4(cB8qUXSpJlCa$x8T%EnQKZTN zN~=koij!W*rNuPOY~s@*V%%7R3otKJ^ivBU&A6LsaMbrJKTtQ*u5e@>gAv=w)3+G0 zClcdN4&@VrN1N!1W3Ui@s&ybB{}}NkvVw&+P+JoYzVl2yPseyAIKwU$erheZIB|sk#|S< z4dx#0w%GX;GexmTEUJq4U|K|WN+1=*-grs=Y|$j5~_si z=d(OQhccIJh1xx%kkcl9-5|<*PsK*IokMwBzC-TQpJ=w;l_ELR`r3*VTm|xd)yLv5 z&XqvgL_P3CxJY-2a=A0V$WUtht16FJeHX6z1^2WWrRazzd7RIjNdS2;9wfS}8S& zcQ+kmm1$41llVrJkTNoF8DA>Om%8_LNSL&F z&dVZ+gvld*Le3$%sI! zje11fT+w|0ClG?91L`%r;l58BIIEs-tMhS8nQg{);)R=+AdJoPr=?cJqcA+}%f}G4 zR#`WQF<%ZCe|wPZ*DZBz$z9_IXk8Lu;x$b!-XunU?=zFMMuVAkZs230gr&ROrpbK; zX*RL9kaa>;A5q|nc63oX7j@Twi2nYJuV;RyVth&~AcEx|RZ=x$x<{+THbpxYzvcl2 zbdWqmpY-pxxskH`Z^*d$oyp)x=HVbuetww73l41&ecR(rP($%!%U`4`7+ti!qnNL) zf&wCdEDN~;3-jAd3>B%qYLXN0juJwVSbM2~~+dfIl*z)9{Wv)a4ns!zKXqw z7KMj7>fuRB6&Kn$C<-+FfQjSXgG?UFh@U|-rnRQywWY}85eQi#P8;Rsz>hX4OEfv| z14)B?l?Y9d!aEO$6Ti?|9@A@}(Md7%rcvD6Rbla_P3ebchR?U)0*GcllW$;ta6ulc zCcbTw)O5Nb;g}Xg`f2d^?C=enH1&|WF;+BacD~IqENP6J#FDheGy{FD{>pqOm669C zzgLs%0Bb3%+jry1+GML(8&n{(J950c1?7ZMuJ$=?Er_1WNP#Eb!A;H{uh3=NW1JO9 zJ)C$QasNW%IG#sHwRG5~I}`F4w9{2!p)MNYiEp_>jQ25kX-sVDH=Rja1VjSM#XmJl zZNwO(nY6laqK|btO`UY6nu8BMae|Y}Fgkh`5tli3x3|tm>S>A?x5b0UI*)F%ElUb~ zLL`>dBXbOkcbb55L%UFi4V|wx(z$k|tdlDNmZEV`ioQv5gP}Rr z;JMHla+ATLbp@n7=INSb*ger7C~jVtr@D~LJs!N5gtg}Hbqss9Zu%ho7NJ%;<*Tk)KJvAw8D^pt{pEiUEp5B|l3z26{VW}4)`xsZDPYnLjUzpgAuPHRIZliULvrH~D^Lkwotj5d zx(!(<@@V^U=%|K|Gc2#4Hg#U=GJGoA4T2Yh-uQu(@{njdu7V9No|&)(#A$3!I28W}zMvs>)lWh0&V!=P8j+3LE zguSa7PTA$$CyG|OJoN6_08ks!pmVvR+&1e04nb0c4Cu`$r*txUwQ6oyAofO!TXjdA zq4(%vsnVJ72)mC^`zF@uV{*|hX7gCz$IUUTK59Z?QL0_@dY4~cS$%=33)m`pDc90iccyM@n(`KkM>Bbp=x2(9}n~n-uM2O^kD{W{p96uFfg}z+3-=ep>Al??G z%Zw>Gu%#(}Wv7z$J(`nz3I}n71Dp)_HRfpWr7}3#NZ8O%W5P)1; zYPFbOfFjELj_1Qj_`vDl@}G?jVC9HS!%N#CBsBCi{6Nc!H55;YsfsHG_%nXtB7qzq zT- zunVH+qOt2+;{#dXgq;R(_W1mnE_N)j%{L3w)SB}}C%$Ffa^GU_wE}zfZL~T`$^{)I zWrUR11eIH=_l#JmX`)`d>Fz?gL+h(R`g#P9o1fAN334B7L;)3D`hkJ`R%=-=#3icm zd@;%Saa6E(+3MYeGRr;(W%N~w_z93R(&e7IH_8q`JCOo5M}LL&*z8qEOXc?c@Rk5+ zT;;OLYPy~HuVBuFZ5Kk@hJrMp@KF`0$1vvnjI@w&y;AB&A}ygJ@qR1|2R?X#O>%O; z;D}+023{Z0!p#S4i}Eg;H>`9O2CjHq4<#2dv{_u*Fd|AKYOz1U?$!J8ozeb^yi6?} zVO*vjy&hsC3=?1*JdBv;cBuaZ@cp&co2z-xj`YjrQ!hY)|A3y=QVWD=U6I2Jo^)C z-o-F7W>RjL8x;FFFDcPxTGvND{0gSJyDN#_sa&ClkAZ@d{|Ix%|nRbyDu9)HC| zMr607uwP(TSk9t5NLk{kZ;HJw>A0$oDL)f%zEmGl2CEsLpAE z+H3*fk9ma{1qe`)FiE+|+h@g>^S<*?%u&-4Y*L=zMIKy|>olCrkIfQN0p4IXx@yxzhrP!|inFZuPA1=X|p z!SW3Y;H7TLJ^+u+mhRiVx-!b%x-md|b4#W{-6yaYYn}n4nSqJ1?`EV2K|R4Zw;mj_ zXDO=$^n8n^=%On3ja!gjX?q2eDpL@g z5SmKtc7PKJfo3$5XT-=QJs+>e$!HL>!S2}Q0I&E^)2Sr_76Wt`dU2NSC_q||kaq|I zhDrVc5;td3VGncDFm#wnWbz2h>~Ay+4n$D)MIqDC4j-ZW(~~dr*P=Qc+hLu#eT;Ah zsBWtm2hbw^P5V0R{-gwF5w^Zroc>Ha*V2d0U? 
zQQ=sIXA zgi8la?g~J2YR`XG{W2?hey7{*W_E#*mW*B$|KGhWZCHAnX=7IQ9EwQ{4A#Lkrw_qx z`d`*rJ${VDh0k3|IaW93SuuANUob`%9&!0}+#NhDvcw5HQes(>=;=G>K-TejpTD9R zL|YXA|JQ=H z*r$zUP>nBzAXX1Zb-!2RNbZAGoxfRE!myr#dn{~moQi_N0a*|^<-x?>fv-PE0%q%V zv7-vlmhm#{298ZQf0J%L8Ru982d#e@u;FLB1VRcI<9N~Fu7jRJ9%2p5QRl2cbbb`@ zb&;A9mbeURAcO&Q0HkWAsUg}0ul=a(t$$#!j)cjmq!&4mydmQPUjp`*oJy~*=e&3S zC3uwjmkYnbTyJTlHAZd+hE$Z!2afTNus4!(_y`Aohjx+J_C*v-Gf`=x&}K!y--SF9 z%BDt8J`r$3!(jJ)=MpVgXs4dlp5(6ywBP9H2K|Dpp8C$YnULfVGP)&0uc1BLRs-xh zJz1BWZ)vT^lD(k0!_v-#W{!J7FA-)Ju{TcxEyPrI8dhH>-^m+U+R(N2`Eh=0H3~6BgwSD z<>xj9=m-U8%X8x_tJBR1FAj`ASNAg?z%x-YNCCsq4kUK^nn0dN!!xT_r3?+mVTHJ4|@CG?iIt{Qp%g zxR*hUkQF#g!^cd!!b#%*JLhMXZFzHr5~pypY?=((6NlzxrF3Xe&{z%~a!!;>Vn73- z^V(`>?d{Fn^#Bj#$_RBc))M%%k!SmFcl6ia-Ppu|H@*vE)Lubr4u(yI&?s}L_RPQ9 z;C-> zDIy(DQ<=#B=`#Ow!N2}=FAbW4W`80&Z~YG+^lvVRw2qD7DLM1!!SvTpTO&YR`=wws z^gnxhwIE05e}AZdbFEECQ&4r23FiOuGJoC^JV$6;TGv&py!O;tLoKZN@)6Qq_0lV` zUy{uK;*kP1on!USOZ(-${`1m)-y8qzw6$CNpPlx5K>H^=t_j8d|D0*RpF(h^i4Rf? zz#h&;jy_iG_ z^={v~=-&_Key-{VG&_p;4Md&OjyED-C9>D8x$k?=UII`iFLF2G&4FM3x3&zV_x33c zr++zr0CEvW2%GIS>1`__ with ``vllm serve`` for launching an OpenAI-API compatible server, and `library-style usage `__ with the ``vllm.LLM`` class for running inference in a Python script. These are user-facing entrypoints that end-users interact with. Under the hood, both create an engine object to handle model inference. + +2. **Engine**: Each vLLM instance contains one engine object, orchestrating and serving as the control plane for model inference. Depending on the configuration, the engine can create multiple workers to handle the inference workload. + +3. **Worker**: A worker is a process that runs the model inference. vLLM follows the common practice of using one process to control one accelerator device, such as GPUs. For example, if we use tensor parallelism of size 2 and pipeline parallelism of size 2, we will have 4 workers in total. Workers are identified by their ``rank`` and ``local_rank``. ``rank`` is used for global orchestration, while ``local_rank`` is mainly used for assigning the accelerator device and accessing local resources such as the file system and shared memory. + +4. **Model Runner**: Every worker has one model runner object, responsible for loading and running the model. Much of the model execution logic resides here, such as preparing input tensors and capturing cudagraphs. + +5. **Model**: Every model runner object has one model object, which is the actual ``torch.nn.Module`` instance. See :ref:`huggingface_integration` for how various configurations affect the class we ultimately get. + +The following figure shows the class hierarchy of vLLM: + + .. figure:: ../assets/design/hierarchy.png + :alt: query + :width: 100% + :align: center + +There are several important design choices behind this class hierarchy: + +1. **Extensibility**: All classes in the hierarchy accept a configuration object containing all the necessary information. The `VllmConfig `__ class is the main configuration object that is passed around. The class hierarchy is quite deep, and every class needs to read the configuration it is interested in. By encapsulating all configurations in one object, we can easily pass the configuration object around and access the configuration we need. Suppose we want to add a new feature (this is often the case given how fast the field of LLM inference is evolving) that only touches the model runner. We will have to add a new configuration option in the `VllmConfig` class. 
Since we pass the whole config object around, we only need to add the configuration option to the `VllmConfig` class, and the model runner can access it directly. We don't need to change the constructor of the engine, worker, or model class to pass the new configuration option.

2. **Uniformity**: The model runner needs a unified interface to create and initialize the model. vLLM supports more than 50 types of popular open-source models. Each model has its own initialization logic. If the constructor signature varies with models, the model runner does not know how to call the constructor accordingly, without complicated and error-prone inspection logic. By making the constructor of the model class uniform, the model runner can easily create and initialize the model without knowing the specific model type. This is also useful for composing models. Vision-language models often consist of a vision model and a language model. By making the constructor uniform, we can easily create a vision model and a language model and compose them into a vision-language model.

3. **Sharding and Quantization at Initialization**: Certain features require changing the model weights. For example, tensor parallelism needs to shard the model weights, and quantization needs to quantize the model weights. There are two possible ways to implement this feature. One way is to change the model weights after the model is initialized. The other way is to change the model weights during the model initialization. vLLM chooses the latter. The first approach is not scalable to large models. Suppose we want to run a 405B model (with roughly 810GB weights) with 16 H100 80GB GPUs. Ideally, every GPU should only load 50GB weights. If we change the model weights after the model is initialized, we need to load the full 810GB weights to every GPU and then shard the weights, leading to a huge memory overhead. Instead, if we shard the weights during the model initialization, every layer will only create a shard of the weights it needs, leading to a much smaller memory overhead. The same idea applies to quantization. Note that we also add an additional argument ``prefix`` to the model's constructor so that the model can initialize itself differently based on the prefix. This is useful for non-uniform quantization, where different parts of the model are quantized differently. The ``prefix`` is usually an empty string for the top-level model and a string like ``"vision"`` or ``"language"`` for the sub-models. In general, it matches the name of the module's state dict in the checkpoint file.

One disadvantage of this design is that it is hard to write unit tests for individual components in vLLM because every component needs to be initialized by a complete config object. We solve this problem by providing a default initialization function that creates a default config object with all fields set to ``None``. If the component we want to test only cares about a few fields in the config object, we can create a default config object and set the fields we care about. This way, we can test the component in isolation. Note that many tests in vLLM are end-to-end tests that test the whole system, so this is not a big problem.

In summary, the complete config object ``VllmConfig`` can be treated as an engine-level global state that is shared among all vLLM classes.
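
To make the uniform-constructor point above concrete, here is a minimal sketch of what such a model class looks like. ``MyModel`` and its attribute names are hypothetical, and the ``vocab_size``/``hidden_size`` fields are assumed to exist on the HF config of a typical text model; the ``(vllm_config, prefix)`` signature and the ``vllm_config.model_config.hf_config`` / ``vllm_config.quant_config`` accesses mirror the pattern used by the model classes elsewhere in this patch series.

.. code-block:: python

    import torch.nn as nn

    from vllm.config import VllmConfig


    class MyModel(nn.Module):
        """Hypothetical model following the uniform constructor convention."""

        def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
            super().__init__()
            # Each class reads only the configuration it is interested in.
            hf_config = vllm_config.model_config.hf_config
            quant_config = vllm_config.quant_config

            # ``prefix`` records where this module sits in the checkpoint's
            # state dict ("" for the top-level model, "language_model", ...),
            # which is what allows per-sub-module (non-uniform) quantization.
            self.prefix = prefix
            self.quant_config = quant_config
            self.embed_tokens = nn.Embedding(hf_config.vocab_size,
                                             hf_config.hidden_size)

Because every model exposes the same signature, the model runner can instantiate any architecture the same way, and a composite (e.g. vision-language) model can simply pass its own ``vllm_config`` down to its sub-models with an extended ``prefix``.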
diff --git a/docs/source/design/huggingface_integration.rst b/docs/source/design/huggingface_integration.rst index 716273afd695c..e6c1cea6001ea 100644 --- a/docs/source/design/huggingface_integration.rst +++ b/docs/source/design/huggingface_integration.rst @@ -1,3 +1,5 @@ +.. _huggingface_integration: + Integration with HuggingFace =================================== diff --git a/docs/source/index.rst b/docs/source/index.rst index 00d455ed9ad44..a2abd2995b1cc 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -156,10 +156,11 @@ Documentation :maxdepth: 2 :caption: Design + design/class_hierarchy + design/huggingface_integration design/input_processing/model_inputs_index design/kernel/paged_attention design/multimodal/multimodal_index - design/huggingface_integration .. For Developers: contributing to the vLLM project From d201d419730dec120b0ecb60ae212f08c0b68be0 Mon Sep 17 00:00:00 2001 From: Yuan Date: Tue, 12 Nov 2024 18:07:32 +0800 Subject: [PATCH 104/183] [CI][CPU]refactor CPU tests to allow to bind with different cores (#10222) Signed-off-by: Yuan Zhou --- .buildkite/run-cpu-test.sh | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 26a202b09b8a2..b3771bb268e22 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -4,9 +4,13 @@ # It serves a sanity check for compilation and basic model usage. set -ex +# allow to bind to different cores +CORE_RANGE=${CORE_RANGE:-48-95} +NUMA_NODE=${NUMA_NODE:-1} + # Try building the docker image -numactl -C 48-95 -N 1 docker build -t cpu-test -f Dockerfile.cpu . -numactl -C 48-95 -N 1 docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu . +numactl -C $CORE_RANGE -N $NUMA_NODE docker build -t cpu-test -f Dockerfile.cpu . +numactl -C $CORE_RANGE -N $NUMA_NODE docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu . # Setup cleanup remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; } @@ -14,10 +18,10 @@ trap remove_docker_container EXIT remove_docker_container # Run the image, setting --shm-size=4g for tensor parallel. 
-docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \ - --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test -docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \ - --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2 +docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=$CORE_RANGE \ + --cpuset-mems=$NUMA_NODE --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test +docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=$CORE_RANGE \ + --cpuset-mems=$NUMA_NODE --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2 function cpu_tests() { set -e @@ -57,7 +61,7 @@ function cpu_tests() { docker exec cpu-test bash -c " set -e export VLLM_CPU_KVCACHE_SPACE=10 - export VLLM_CPU_OMP_THREADS_BIND=48-92 + export VLLM_CPU_OMP_THREADS_BIND=$CORE_RANGE python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 python3 benchmarks/benchmark_serving.py \ From 36c513a0762b104c9076ab6a3449ea3efff6db4d Mon Sep 17 00:00:00 2001 From: Guillaume Calmettes Date: Tue, 12 Nov 2024 12:13:46 +0100 Subject: [PATCH 105/183] [BugFix] Do not raise a `ValueError` when `tool_choice` is set to the supported `none` option and `tools` are not defined. (#10000) Signed-off-by: Guillaume Calmettes --- docs/source/serving/openai_compatible_server.md | 4 ++-- vllm/entrypoints/openai/protocol.py | 10 ++++++++-- vllm/entrypoints/openai/serving_engine.py | 11 +++++++++-- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 9b29ca66022cb..200663dac4209 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -215,10 +215,10 @@ The order of priorities is `command line > config file values > defaults`. --- ## Tool calling in the chat completion API - -vLLM supports named function calling and `auto` tool choice in the chat completion API. The `tool_choice` options `required` is **not yet supported** but on the roadmap. +vLLM currently supports named function calling, as well as the `auto` and `none` options for the `tool_choice` field in the chat completion API. The `tool_choice` option `required` is **not yet supported** but on the roadmap. It is the callers responsibility to prompt the model with the tool information, vLLM will not automatically manipulate the prompt. +Please see below for recommended configuration and chat templates to use when function calling is to be used with the different models. 
### Named Function Calling diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 1335e51bd152c..0e0bb66c057df 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -454,6 +454,12 @@ def check_tool_usage(cls, data): if "tool_choice" not in data and data.get("tools"): data["tool_choice"] = "auto" + # if "tool_choice" is "none" -- ignore tools if present + if "tool_choice" in data and data["tool_choice"] == "none": + # ensure that no tools are present + data.pop("tools", None) + return data + # if "tool_choice" is specified -- validation if "tool_choice" in data: @@ -467,8 +473,8 @@ def check_tool_usage(cls, data): if data["tool_choice"] != "auto" and not isinstance( data["tool_choice"], dict): raise ValueError( - "`tool_choice` must either be a named tool or \"auto\". " - "`tool_choice=\"none\" is not supported.") + "`tool_choice` must either be a named tool, \"auto\", " + "or \"none\".") # ensure that if "tool_choice" is specified as an object, # it matches a valid tool diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index e31dc2ced61fb..fa315fa516632 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -469,12 +469,19 @@ async def _preprocess_chat( mm_data = await mm_data_future - if tool_parser is not None: + # tool parsing is done only if a tool_parser has been set and if + # tool_choice is not "none" (if tool_choice is "none" but a tool_parser + # is set, we want to prevent parsing a tool_call hallucinated by the LLM + should_parse_tools = tool_parser is not None and (hasattr( + request, "tool_choice") and request.tool_choice != "none") + + if should_parse_tools: if not isinstance(request, ChatCompletionRequest): msg = "Tool usage is only supported for Chat Completions API" raise NotImplementedError(msg) - request = tool_parser(tokenizer).adjust_request(request=request) + request = tool_parser(tokenizer).adjust_request( # type: ignore + request=request) if isinstance(request_prompt, str): prompt_inputs = self._tokenize_prompt_input( From a838ba7254c98a7adc60a0976bdf277fb20b4221 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Tue, 12 Nov 2024 21:07:11 +0800 Subject: [PATCH 106/183] [Misc]Fix Idefics3Model argument (#10255) Signed-off-by: Jee Jee Li --- vllm/model_executor/models/idefics3.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index b234b602e6fbf..8845b2f58af07 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -420,18 +420,18 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config self.config = config self.padding_idx = self.config.text_config.pad_token_id self.vocab_size = self.config.text_config.vocab_size - self.vision_model = Idefics3VisionTransformer(config.vision_config, quant_config) self.connector = Idefics3Connector(config) - self.text_model = LlamaModel(config.text_config, cache_config, - quant_config) + self.text_model = LlamaModel( + vllm_config=vllm_config.with_hf_config(config.text_config), + prefix=maybe_prefix(prefix, "text_model"), + ) self.image_seq_len = int( ((config.vision_config.image_size // From 176fcb1c71655d825d2363e5f1468fa248fe783b Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Jie=20Fu=20=28=E5=82=85=E6=9D=B0=29?= Date: Wed, 13 Nov 2024 00:36:51 +0800 Subject: [PATCH 107/183] [Bugfix] Fix QwenModel argument (#10262) Signed-off-by: Jie Fu --- vllm/model_executor/models/qwen.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index cc70099361dd2..5acd87146c54e 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -1068,7 +1068,7 @@ def __new__( config = vllm_config.model_config.hf_config # Initialize VL if hasattr(config, "visual"): - return QWenVL(vllm_config) + return QWenVL(vllm_config=vllm_config) # Initialize LLM else: - return QWenLLM(vllm_config) + return QWenLLM(vllm_config=vllm_config) From 47db6ec8310129699a62567b61d8ed380636b053 Mon Sep 17 00:00:00 2001 From: zifeitong Date: Tue, 12 Nov 2024 08:42:28 -0800 Subject: [PATCH 108/183] [Frontend] Add per-request number of cached token stats (#10174) --- tests/prefix_caching/test_prefix_caching.py | 24 ++++++++++++-- vllm/entrypoints/openai/api_server.py | 1 + vllm/entrypoints/openai/cli_args.py | 5 +++ vllm/entrypoints/openai/protocol.py | 5 +++ vllm/entrypoints/openai/run_batch.py | 6 ++++ vllm/entrypoints/openai/serving_chat.py | 35 +++++++++++++-------- vllm/outputs.py | 19 +++++++---- vllm/sequence.py | 14 +++++++-- vllm/worker/model_runner.py | 3 ++ 9 files changed, 89 insertions(+), 23 deletions(-) diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index fd6564bbfe630..50723dbb610ac 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -27,6 +27,7 @@ @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.parametrize("cached_position", [0, 1]) +@pytest.mark.parametrize("block_size", [16]) def test_mixed_requests( hf_runner, vllm_runner, @@ -36,11 +37,12 @@ def test_mixed_requests( dtype: str, max_tokens: int, cached_position: int, + block_size: int, monkeypatch, ) -> None: """ Test the case when some sequences have the prefix cache hit - and the others don't. The cached position determines where + and the others don't. The cached position determines where the sequence is at among the batch of prefills. 
""" override_backend_env_variable(monkeypatch, backend) @@ -53,12 +55,30 @@ def test_mixed_requests( model, dtype=dtype, enable_prefix_caching=True, + block_size=block_size, ) as vllm_model: # Run the first prompt so the cache is populated vllm_outputs = vllm_model.generate_greedy([cached_prompt], max_tokens) # Run all the promopts - vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) + req_outputs = vllm_model.model.generate(example_prompts, greedy_params) + + # Verify number of cached tokens + for i in range(len(req_outputs)): + if i == cached_position: + expected_num_cached_tokens = ( + len(req_outputs[i].prompt_token_ids) // + block_size) * block_size + else: + expected_num_cached_tokens = 0 + assert req_outputs[ + i].num_cached_tokens == expected_num_cached_tokens + + vllm_outputs = [ + (output.prompt_token_ids + list(output.outputs[0].token_ids), + output.prompt + output.outputs[0].text) for output in req_outputs + ] check_outputs_equal( outputs_0_lst=hf_outputs, diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 3e4070a25cf90..6a24cdbc6a18f 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -540,6 +540,7 @@ def init_app_state( return_tokens_as_token_ids=args.return_tokens_as_token_ids, enable_auto_tools=args.enable_auto_tool_choice, tool_parser=args.tool_call_parser, + enable_prompt_tokens_details=args.enable_prompt_tokens_details, ) if model_config.task == "generate" else None state.openai_serving_completion = OpenAIServingCompletion( engine_client, diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 74ea41344bece..eb08a89293370 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -228,6 +228,11 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: default=False, help="Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint" ) + parser.add_argument( + "--enable-prompt-tokens-details", + action='store_true', + default=False, + help="If set to True, enable prompt_tokens_details in usage.") return parser diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 0e0bb66c057df..820aefd8800d9 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -99,10 +99,15 @@ class ModelList(OpenAIBaseModel): data: List[ModelCard] = Field(default_factory=list) +class PromptTokenUsageInfo(OpenAIBaseModel): + cached_tokens: Optional[int] = None + + class UsageInfo(OpenAIBaseModel): prompt_tokens: int = 0 total_tokens: int = 0 completion_tokens: Optional[int] = 0 + prompt_tokens_details: Optional[PromptTokenUsageInfo] = None class RequestResponseMetadata(BaseModel): diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 0d016d949d22b..1b422a93263b2 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -78,6 +78,11 @@ def parse_args(): help="Port number for the Prometheus metrics server " "(only needed if enable-metrics is set).", ) + parser.add_argument( + "--enable-prompt-tokens-details", + action='store_true', + default=False, + help="If set to True, enable prompt_tokens_details in usage.") return parser.parse_args() @@ -217,6 +222,7 @@ async def main(args): prompt_adapters=None, request_logger=request_logger, chat_template=None, + 
enable_prompt_tokens_details=args.enable_prompt_tokens_details, ) if model_config.task == "generate" else None openai_serving_embedding = OpenAIServingEmbedding( engine, diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 9551b4f2091dd..74867d8de8843 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -18,8 +18,8 @@ ChatCompletionRequest, ChatCompletionResponse, ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse, ChatMessage, DeltaFunctionCall, DeltaMessage, - DeltaToolCall, ErrorResponse, FunctionCall, RequestResponseMetadata, - ToolCall, UsageInfo) + DeltaToolCall, ErrorResponse, FunctionCall, PromptTokenUsageInfo, + RequestResponseMetadata, ToolCall, UsageInfo) from vllm.entrypoints.openai.serving_engine import (BaseModelPath, LoRAModulePath, OpenAIServing, @@ -49,7 +49,8 @@ def __init__(self, chat_template: Optional[str], return_tokens_as_token_ids: bool = False, enable_auto_tools: bool = False, - tool_parser: Optional[str] = None): + tool_parser: Optional[str] = None, + enable_prompt_tokens_details: bool = False): super().__init__(engine_client=engine_client, model_config=model_config, base_model_paths=base_model_paths, @@ -80,6 +81,8 @@ def __init__(self, f"tool_parser:'{tool_parser}' which has not " "been registered") from e + self.enable_prompt_tokens_details = enable_prompt_tokens_details + async def create_chat_completion( self, request: ChatCompletionRequest, @@ -252,6 +255,7 @@ async def chat_completion_stream_generator( previous_num_tokens = [0] * num_choices finish_reason_sent = [False] * num_choices num_prompt_tokens = 0 + num_cached_tokens = None if isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam): tool_choice_function_name = request.tool_choice.function.name @@ -305,6 +309,7 @@ async def chat_completion_stream_generator( # the result_generator, it needs to be sent as the FIRST # response (by the try...catch). 
if first_iteration: + num_cached_tokens = res.num_cached_tokens # Send first response for each request.n (index) with # the role role = self.get_chat_request_role(request) @@ -530,11 +535,13 @@ async def chat_completion_stream_generator( # is sent, send the usage if include_usage: completion_tokens = sum(previous_num_tokens) - final_usage = UsageInfo( - prompt_tokens=num_prompt_tokens, - completion_tokens=completion_tokens, - total_tokens=num_prompt_tokens + completion_tokens, - ) + final_usage = UsageInfo(prompt_tokens=num_prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=num_prompt_tokens + + completion_tokens) + if self.enable_prompt_tokens_details and num_cached_tokens: + final_usage.prompt_tokens_details = PromptTokenUsageInfo( + cached_tokens=num_cached_tokens) final_usage_chunk = ChatCompletionStreamResponse( id=request_id, @@ -702,11 +709,13 @@ async def chat_completion_full_generator( num_prompt_tokens += len(final_res.encoder_prompt_token_ids) num_generated_tokens = sum( len(output.token_ids) for output in final_res.outputs) - usage = UsageInfo( - prompt_tokens=num_prompt_tokens, - completion_tokens=num_generated_tokens, - total_tokens=num_prompt_tokens + num_generated_tokens, - ) + usage = UsageInfo(prompt_tokens=num_prompt_tokens, + completion_tokens=num_generated_tokens, + total_tokens=num_prompt_tokens + + num_generated_tokens) + if self.enable_prompt_tokens_details and final_res.num_cached_tokens: + usage.prompt_tokens_details = PromptTokenUsageInfo( + cached_tokens=final_res.num_cached_tokens) request_metadata.final_usage_info = usage diff --git a/vllm/outputs.py b/vllm/outputs.py index abfdb7d328126..badf50d0602d6 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -83,10 +83,11 @@ class RequestOutput: finished: Whether the whole request is finished. metrics: Metrics associated with the request. lora_request: The LoRA request that was used to generate the output. - encoder_prompt: The encoder prompt string of the request; - None if decoder-only - encoder_prompt_token_ids: The token IDs of the encoder prompt; - None if decoder-only + encoder_prompt: The encoder prompt string of the request. + None if decoder-only. + encoder_prompt_token_ids: The token IDs of the encoder prompt. + None if decoder-only. + num_cached_tokens: The number of tokens with prefix cache hit. 
""" def __init__( @@ -101,6 +102,7 @@ def __init__( lora_request: Optional[LoRARequest] = None, encoder_prompt: Optional[str] = None, encoder_prompt_token_ids: Optional[List[int]] = None, + num_cached_tokens: Optional[int] = None, ) -> None: self.request_id = request_id self.prompt = prompt @@ -112,6 +114,7 @@ def __init__( self.lora_request = lora_request self.encoder_prompt = encoder_prompt self.encoder_prompt_token_ids = encoder_prompt_token_ids + self.num_cached_tokens = num_cached_tokens @classmethod def new( @@ -192,6 +195,8 @@ def from_seq_group( outputs = [] include_prompt = True + # num_cached_tokens should be the same for all the sequences + num_cached_tokens = None for i, seq in enumerate(top_n_seqs): output_text = seq.get_output_text_to_return( text_buffer_length, delta) @@ -199,6 +204,7 @@ def from_seq_group( output_token_ids = seq.get_output_token_ids_to_return(delta) num_output_tokens = 1 if isinstance(output_token_ids, int) else len(output_token_ids) + num_cached_tokens = seq.data.get_num_cached_tokens() output_logprobs = seq.output_logprobs if include_logprobs else None @@ -272,7 +278,7 @@ def from_seq_group( init_args = (seq_group.request_id, prompt, prompt_token_ids, prompt_logprobs, outputs, finished, seq_group.metrics, seq_group.lora_request, encoder_prompt, - encoder_prompt_token_ids) + encoder_prompt_token_ids, num_cached_tokens) if use_cache: request_output = seq_group.cached_request_output @@ -293,7 +299,8 @@ def __repr__(self) -> str: f"outputs={self.outputs}, " f"finished={self.finished}, " f"metrics={self.metrics}, " - f"lora_request={self.lora_request})") + f"lora_request={self.lora_request}, " + f"num_cached_tokens={self.num_cached_tokens})") class EmbeddingRequestOutput: diff --git a/vllm/sequence.py b/vllm/sequence.py index 7d7ddc7ec4447..1370cb5c4f9d2 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -167,6 +167,8 @@ class SequenceData(msgspec.Struct, ...] = msgspec.field(default_factory=tuple) # The number of tokens that are computed (that run against the model). _num_computed_tokens: int = 0 + # The number of tokens with prefix cache hit. + _num_cached_tokens: int = 0 _stage: SequenceStage = SequenceStage.PREFILL _cached_all_token_ids: List[int] = msgspec.field(default_factory=list) @@ -323,6 +325,14 @@ def update_num_computed_tokens(self, num_new_computed_tokens: int): if self.get_num_uncomputed_tokens() == 0: self._stage = SequenceStage.DECODE + def get_num_cached_tokens(self) -> int: + """Return the number of tokens with prefix cache hit.""" + return self._num_cached_tokens + + def update_num_cached_tokens(self, num_cached_tokens: int): + """Update the number of tokens with prefix cache hit.""" + self._num_cached_tokens = num_cached_tokens + def reset_state_for_recompute(self) -> None: """Reset the number of computed tokens from this sequence. It is supposed to be called when a sequence needs to be started from @@ -379,7 +389,7 @@ def __repr__(self) -> str: class Sequence: """Stores the data, status, and block information of a sequence. - + The sequence is constructed from the :data:`DecoderOnlyInputs` (for decoder-only) or :data:`EncoderDecoderInputs` (for encoder-decoder) instance passed in through the :code:`inputs` constructor argument. @@ -906,7 +916,7 @@ class SequenceGroupMetadata( multi_modal_data: Multi modal data. mm_processor_kwargs: Multimodal input processor / mapper overrides. encoder_seq_data: Optional sequence data for encoder prompt - (SequenceGroup.encoder_seq). Should be None + (SequenceGroup.encoder_seq). 
Should be None unless you are working with an encoder/decoder model. cross_block_table: Optional cross-attention block table associated diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index e1446192ce3d6..2da02f21f8342 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -542,6 +542,9 @@ def _compute_for_prefix_cache_hit( # this may be larger than the sequence length if chunked # prefill is enabled. prefix_cache_len = len(computed_block_nums) * self.block_size + seq_group_metadata.seq_data[inter_data.seq_ids[ + seq_idx]].update_num_cached_tokens(prefix_cache_len) + # The number of so far computed prompt tokens in this sequence. context_len = inter_data.context_lens[seq_idx] # The total number of prompt tokens in this sequence. From 7c65527918cd16286961a2a779e15743ca41ab0e Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 12 Nov 2024 08:57:14 -0800 Subject: [PATCH 109/183] [V1] Use pickle for serializing EngineCoreRequest & Add multimodal inputs to EngineCoreRequest (#10245) Signed-off-by: Woosuk Kwon --- vllm/v1/engine/__init__.py | 9 +++++++-- vllm/v1/engine/core.py | 3 ++- vllm/v1/engine/core_client.py | 3 ++- vllm/v1/engine/processor.py | 5 ++++- vllm/v1/serial_utils.py | 10 ++++++++++ 5 files changed, 25 insertions(+), 5 deletions(-) create mode 100644 vllm/v1/serial_utils.py diff --git a/vllm/v1/engine/__init__.py b/vllm/v1/engine/__init__.py index 8bc16651faf97..edfb8bd7c2fc1 100644 --- a/vllm/v1/engine/__init__.py +++ b/vllm/v1/engine/__init__.py @@ -1,10 +1,11 @@ import enum from dataclasses import dataclass -from typing import List, Optional, Union +from typing import Any, Dict, List, Optional, Union import msgspec from vllm.lora.request import LoRARequest +from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict from vllm.sampling_params import RequestOutputKind, SamplingParams @@ -22,7 +23,8 @@ class DetokenizerRequest: include_stop_str_in_output: bool -class EngineCoreRequest(msgspec.Struct, omit_defaults=True): +@dataclass +class EngineCoreRequest: # NOTE: prompt and prompt_token_ids should be DecoderOnlyInput, # but this object is currently not playing well with msgspec @@ -33,6 +35,9 @@ class EngineCoreRequest(msgspec.Struct, omit_defaults=True): # always be tokenized? prompt: Optional[str] prompt_token_ids: List[int] + mm_data: Optional[MultiModalDataDict] + mm_placeholders: Optional[MultiModalPlaceholderDict] + mm_processor_kwargs: Optional[Dict[str, Any]] sampling_params: SamplingParams eos_token_id: Optional[int] arrival_time: float diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index f9d3473d0131c..808c3936b6c35 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -19,6 +19,7 @@ EngineCoreRequest, EngineCoreRequestType) from vllm.v1.executor.gpu_executor import GPUExecutor from vllm.v1.request import Request, RequestStatus +from vllm.v1.serial_utils import PickleEncoder from vllm.version import __version__ as VLLM_VERSION logger = init_logger(__name__) @@ -315,7 +316,7 @@ def process_input_socket(self, input_path: str): """Input socket IO thread.""" # Msgpack serialization decoding. 
- decoder_add_req = msgpack.Decoder(EngineCoreRequest) + decoder_add_req = PickleEncoder() decoder_abort_req = msgpack.Decoder(list[str]) with self.make_socket(input_path, zmq.constants.PULL) as socket: diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index f9e4677fb8c59..09801e20e16ca 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -11,6 +11,7 @@ from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineCoreRequest, EngineCoreRequestType) from vllm.v1.engine.core import EngineCore, EngineCoreProc +from vllm.v1.serial_utils import PickleEncoder logger = init_logger(__name__) @@ -115,7 +116,7 @@ def __init__( **kwargs, ): # Serialization setup. - self.encoder = msgspec.msgpack.Encoder() + self.encoder = PickleEncoder() self.decoder = msgspec.msgpack.Decoder(EngineCoreOutputs) # ZMQ setup. diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index d92e622810389..5f13cbf2e4036 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -91,7 +91,10 @@ def process_inputs( # Make Request for EngineCore. engine_core_request = EngineCoreRequest( request_id, processed_inputs.get("prompt"), - processed_inputs.get("prompt_token_ids"), sampling_params, + processed_inputs.get("prompt_token_ids"), + processed_inputs.get("multi_modal_data"), + processed_inputs.get("multi_modal_placeholders"), + processed_inputs.get("mm_processor_kwargs"), sampling_params, eos_token_id, arrival_time, lora_request) return detokenizer_request, engine_core_request diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py new file mode 100644 index 0000000000000..b1cd5c11834f8 --- /dev/null +++ b/vllm/v1/serial_utils.py @@ -0,0 +1,10 @@ +import pickle + + +class PickleEncoder: + + def encode(self, obj): + return pickle.dumps(obj) + + def decode(self, data): + return pickle.loads(data) From b41fb9d3b10dcf187ac0501ca80ede96d387617f Mon Sep 17 00:00:00 2001 From: sroy745 <142070531+sroy745@users.noreply.github.com> Date: Tue, 12 Nov 2024 10:53:57 -0800 Subject: [PATCH 110/183] [Encoder Decoder] Update Mllama to run with both FlashAttention and XFormers (#9982) Signed-off-by: Sourashis Roy --- tests/encoder_decoder/test_e2e_correctness.py | 9 +- .../vision_language/test_mllama.py | 100 +++++++++++------- tests/test_config.py | 2 + vllm/model_executor/models/mllama.py | 52 ++++++--- vllm/worker/enc_dec_model_runner.py | 34 ++---- 5 files changed, 117 insertions(+), 80 deletions(-) diff --git a/tests/encoder_decoder/test_e2e_correctness.py b/tests/encoder_decoder/test_e2e_correctness.py index f2d7e9fd78cf3..fa5d6a69a9bc8 100644 --- a/tests/encoder_decoder/test_e2e_correctness.py +++ b/tests/encoder_decoder/test_e2e_correctness.py @@ -7,7 +7,7 @@ import pytest from transformers import AutoModelForSeq2SeqLM -from vllm.attention.selector import (_Backend, +from vllm.attention.selector import (_Backend, _cached_get_attn_backend, global_force_attn_backend_context_manager) from vllm.platforms import current_platform from vllm.sequence import SampleLogprobs @@ -34,6 +34,13 @@ def vllm_to_hf_output( return output_ids, hf_output_str, out_logprobs +@pytest.fixture(autouse=True) +def clear_cache(): + """Fixture to clear backend cache before each test.""" + _cached_get_attn_backend.cache_clear() # Clear the cache + yield # This allows the test to run + + @pytest.mark.parametrize("model", ["facebook/bart-large-cnn"]) @pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("attn_backend", 
LIST_ENC_DEC_SUPPORTED_BACKENDS) diff --git a/tests/models/encoder_decoder/vision_language/test_mllama.py b/tests/models/encoder_decoder/vision_language/test_mllama.py index 7f82347841cdb..a3b1c0950d9a2 100644 --- a/tests/models/encoder_decoder/vision_language/test_mllama.py +++ b/tests/models/encoder_decoder/vision_language/test_mllama.py @@ -4,6 +4,8 @@ from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer, BatchEncoding) +from vllm.attention.selector import (_Backend, _cached_get_attn_backend, + global_force_attn_backend_context_manager) from vllm.multimodal.utils import rescale_image_size from vllm.sequence import SampleLogprobs @@ -14,6 +16,8 @@ _LIMIT_IMAGE_PER_PROMPT = 3 +LIST_ENC_DEC_SUPPORTED_BACKENDS = [_Backend.XFORMERS, _Backend.FLASH_ATTN] + HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ "stop_sign": "<|image|><|begin_of_text|>The meaning of the image is", @@ -221,6 +225,13 @@ def process(hf_inputs: BatchEncoding, **kwargs): ) +@pytest.fixture(autouse=True) +def clear_cache(): + """Fixture to clear backend cache before each test.""" + _cached_get_attn_backend.cache_clear() # Clear the cache + yield # This allows the test to run + + @large_gpu_test(min_gb=48) @pytest.mark.parametrize("model", models) @pytest.mark.parametrize( @@ -244,20 +255,26 @@ def process(hf_inputs: BatchEncoding, **kwargs): @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) def test_models_single_leading_image(hf_runner, vllm_runner, image_assets, model, sizes, dtype, max_tokens, - num_logprobs) -> None: - run_test( - hf_runner, - vllm_runner, - image_assets, - model, - sizes=sizes, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) + num_logprobs, + attn_backend: _Backend) -> None: + with global_force_attn_backend_context_manager(attn_backend): + if attn_backend == _Backend.FLASH_ATTN: + # Flash Attention works only with bfloat16 data-type + dtype = 'bfloat16' + run_test( + hf_runner, + vllm_runner, + image_assets, + model, + sizes=sizes, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) @large_gpu_test(min_gb=48) @@ -265,9 +282,10 @@ def test_models_single_leading_image(hf_runner, vllm_runner, image_assets, @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) def test_models_multi_leading_images(hf_runner, vllm_runner, image_assets, - model, dtype, max_tokens, - num_logprobs) -> None: + model, dtype, max_tokens, num_logprobs, + attn_backend: _Backend) -> None: stop_sign = image_assets[0].pil_image cherry_blossom = image_assets[1].pil_image @@ -291,17 +309,20 @@ def test_models_multi_leading_images(hf_runner, vllm_runner, image_assets, cherry_blossom.resize((512, 1024)), ], ])] - - _run_test( - hf_runner, - vllm_runner, - inputs, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) + with global_force_attn_backend_context_manager(attn_backend): + if attn_backend == _Backend.FLASH_ATTN: + # Flash Attention works only with bfloat16 data-type + dtype = 'bfloat16' + _run_test( + hf_runner, + vllm_runner, + inputs, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) 
@large_gpu_test(min_gb=48) @@ -309,8 +330,10 @@ def test_models_multi_leading_images(hf_runner, vllm_runner, image_assets, @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) +@pytest.mark.parametrize("attn_backend", LIST_ENC_DEC_SUPPORTED_BACKENDS) def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model, - dtype, max_tokens, num_logprobs) -> None: + dtype, max_tokens, num_logprobs, + attn_backend: _Backend) -> None: stop_sign = image_assets[0].pil_image cherry_blossom = image_assets[1].pil_image @@ -325,14 +348,17 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model, [stop_sign], [stop_sign, cherry_blossom], ])] - - _run_test( - hf_runner, - vllm_runner, - inputs, - model, - dtype=dtype, - max_tokens=max_tokens, - num_logprobs=num_logprobs, - tensor_parallel_size=1, - ) + with global_force_attn_backend_context_manager(attn_backend): + if attn_backend == _Backend.FLASH_ATTN: + # Flash Attention works only with bfloat16 data-type + dtype = 'bfloat16' + _run_test( + hf_runner, + vllm_runner, + inputs, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) diff --git a/tests/test_config.py b/tests/test_config.py index 36c426d6c51f6..df382d22d83ec 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -243,6 +243,8 @@ def test_rope_customization(): assert longchat_model_config.max_model_len == 4096 +@pytest.mark.skipif(current_platform.is_rocm(), + reason="Encoder Decoder models not supported on ROCm.") @pytest.mark.parametrize(("model_id", "is_encoder_decoder"), [ ("facebook/opt-125m", False), ("facebook/bart-base", True), diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index e5c1d28e6e7ea..db7ee7b2d8537 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -32,6 +32,8 @@ import vllm.distributed.parallel_state as ps from vllm.attention import Attention, AttentionMetadata, AttentionType +from vllm.attention.backends.flash_attn import FlashAttentionMetadata +from vllm.attention.backends.xformers import XFormersMetadata from vllm.attention.ops.paged_attn import PagedAttention from vllm.config import VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size @@ -799,12 +801,13 @@ def forward( q = self.q_norm(q) if attention_mask is not None: - output = self.attention_with_mask(q, k, v, kv_cache, - attention_mask, - kv_range_for_decode, - attn_metadata) + output = self._attention_with_mask(q, k, v, kv_cache, + attention_mask, + kv_range_for_decode, + attn_metadata) else: - output = self.attn(q, + output = self.attn(q.view(-1, + self.num_local_heads * self.head_dim), k, v, kv_cache, @@ -813,7 +816,7 @@ def forward( out, _ = self.o_proj(output) return out - def attention_with_mask( + def _attention_with_mask( self, q: torch.Tensor, k: torch.Tensor, @@ -824,14 +827,35 @@ def attention_with_mask( attn_metadata: AttentionMetadata, ) -> torch.Tensor: # Skip writing kv-cache for the initial profiling run. 
- if len(kv_cache.shape) == 3: - key_cache, value_cache = PagedAttention.split_kv_cache( - kv_cache, self.num_local_key_value_heads, self.head_dim) - cached_k = torch.cat([k[s:e] for s, e in kv_range_for_decode]) - cached_v = torch.cat([v[s:e] for s, e in kv_range_for_decode]) - PagedAttention.write_to_paged_cache( - cached_k, cached_v, key_cache, value_cache, - attn_metadata.cross_slot_mapping, "auto", 1.0, 1.0) + if len(kv_cache.shape) > 1: + if isinstance(attn_metadata, FlashAttentionMetadata): + cached_k = torch.cat([k[s:e] for s, e in kv_range_for_decode]) + cached_v = torch.cat([v[s:e] for s, e in kv_range_for_decode]) + torch.ops._C_cache_ops.reshape_and_cache_flash( + cached_k, + cached_v, + kv_cache[0], + kv_cache[1], + attn_metadata. + cross_slot_mapping, # type: ignore[union-attr] + "auto", + 1.0, + 1.0, + ) + elif isinstance(attn_metadata, XFormersMetadata): + key_cache, value_cache = PagedAttention.split_kv_cache( + kv_cache, self.num_local_key_value_heads, self.head_dim) + cached_k = torch.cat([k[s:e] for s, e in kv_range_for_decode]) + cached_v = torch.cat([v[s:e] for s, e in kv_range_for_decode]) + PagedAttention.write_to_paged_cache( + cached_k, cached_v, key_cache, value_cache, + attn_metadata.cross_slot_mapping, "auto", 1.0, 1.0) + else: + raise ValueError( + f"Unsupported AttentionMetadata {type(attn_metadata)} " + f"class found. Expected the AttentionMetadata to " + f"be either XFormersMetadata or FlashAttentionMetadata.") + # We have to call torch.sdpa for prefill when using a # custom cross-attention mask. Because the mask is not a # standard causal mask, neither a block diagonal mask which diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 008e0c9745994..82824faa6629a 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -9,15 +9,13 @@ AttentionMetadata) from vllm.attention.backends.utils import PAD_SLOT_ID from vllm.attention.selector import (_Backend, get_env_variable_attn_backend, - get_global_forced_attn_backend, - global_force_attn_backend) -from vllm.config import ModelConfig, VllmConfig + get_global_forced_attn_backend) +from vllm.config import VllmConfig from vllm.forward_context import set_forward_context from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.model_loader.utils import get_architecture_class_name from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalKwargs, MultiModalRegistry) from vllm.sampling_params import SamplingParams @@ -35,11 +33,6 @@ logger = init_logger(__name__) -# The Mllama model has PagedAttention specific logic because of which it -# can only be run with the XFORMERS backend -# TODO Make Mllama model work with Flash Attention backend. -_XFORMERS_ONLY_ENCODER_DECODER_ARCHS = ["MllamaForConditionalGeneration"] - @dataclasses.dataclass(frozen=True) class EncoderDecoderModelInput(ModelInputForGPUWithSamplingMetadata): @@ -97,7 +90,7 @@ def __init__( models) but these arguments are present here for compatibility with the base-class constructor. 
''' - self._maybe_force_supported_attention_backend(vllm_config.model_config) + self._maybe_force_supported_attention_backend() super().__init__( vllm_config=vllm_config, @@ -108,12 +101,7 @@ def __init__( # Crash for unsupported encoder/scenarios assert_enc_dec_mr_supported_scenario(self) - def _is_xformers_only_encoder_decoder_model(self, - model: ModelConfig) -> bool: - return get_architecture_class_name( - model) in _XFORMERS_ONLY_ENCODER_DECODER_ARCHS - - def _maybe_force_supported_attention_backend(self, model: ModelConfig): + def _maybe_force_supported_attention_backend(self): ''' Force vLLM to use the XFormers attention backend, which is currently the only supported option. @@ -128,23 +116,13 @@ def raise_backend_err(): maybe_global_forced_backend = get_global_forced_attn_backend() is_forced_by_global = maybe_global_forced_backend is not None is_forced_by_env_var = maybe_env_var_forced_backend is not None - - if not (is_forced_by_global or is_forced_by_env_var) \ - and self._is_xformers_only_encoder_decoder_model(model): - # The user has not already specified an attention backend - # override - logger.info( - "Encoder-Decoder Model Architecture %s requires XFormers " - "backend; overriding backend auto-selection and " - "forcing XFormers.", get_architecture_class_name(model)) - global_force_attn_backend(_Backend.XFORMERS) - elif is_forced_by_global: + if is_forced_by_global: # noqa: SIM102 # Backend override enforced by global variable takes # precedence over vLLM backend environment variable. if maybe_global_forced_backend not in\ [_Backend.XFORMERS, _Backend.FLASH_ATTN]: raise_backend_err() - elif is_forced_by_env_var: + elif is_forced_by_env_var: # noqa: SIM102 # Backend override enforced by vLLM backend # environment variable if maybe_env_var_forced_backend not in\ From 8a06428c70657b3310a317b3caf3c562b0e042ae Mon Sep 17 00:00:00 2001 From: Umesh Date: Tue, 12 Nov 2024 11:08:40 -0800 Subject: [PATCH 111/183] [LoRA] Adds support for bias in LoRA (#5733) Signed-off-by: Umesh Deshpande Co-authored-by: Umesh Deshpande --- tests/lora/conftest.py | 5 + tests/lora/test_lora_bias_e2e.py | 52 ++++++ tests/lora/test_utils.py | 14 +- vllm/config.py | 1 + vllm/engine/arg_utils.py | 5 + vllm/lora/fully_sharded_layers.py | 33 ++++ vllm/lora/layers.py | 296 +++++++++++++++++++++++++++++- vllm/lora/lora.py | 17 +- vllm/lora/models.py | 36 +++- vllm/lora/utils.py | 17 +- 10 files changed, 456 insertions(+), 20 deletions(-) create mode 100644 tests/lora/test_lora_bias_e2e.py diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py index 816d3986fe333..29ecf37808205 100644 --- a/tests/lora/conftest.py +++ b/tests/lora/conftest.py @@ -152,6 +152,11 @@ def sql_lora_files(sql_lora_huggingface_id): return snapshot_download(repo_id=sql_lora_huggingface_id) +@pytest.fixture(scope="session") +def lora_bias_files(): + return snapshot_download(repo_id="followumesh/granite-3b-lora8-bias") + + @pytest.fixture(scope="session") def mixtral_lora_files(): # Note: this module has incorrect adapter_config.json to test diff --git a/tests/lora/test_lora_bias_e2e.py b/tests/lora/test_lora_bias_e2e.py new file mode 100644 index 0000000000000..c2520c847d873 --- /dev/null +++ b/tests/lora/test_lora_bias_e2e.py @@ -0,0 +1,52 @@ +from typing import List + +import pytest + +import vllm +from vllm.lora.request import LoRARequest + +MODEL_PATH = "ibm-granite/granite-3b-code-base" + + +def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: + prompts = [ + "[user] Write a SQL query to answer the question 
based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]", # noqa: E501 + "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]" # noqa: E501 + ] + sampling_params = vllm.SamplingParams(temperature=0, + max_tokens=256, + stop=["[/assistant]"]) + outputs = llm.generate( + prompts, + sampling_params, + lora_request=LoRARequest(str(lora_id), lora_id, lora_path) + if lora_id else None) + generated_texts: List[str] = [] + for output in outputs: + generated_text = output.outputs[0].text + generated_texts.append(generated_text) + return generated_texts + + +@pytest.mark.parametrize("lora_bias", [True]) +@pytest.mark.parametrize("fully_sharded", [True, False]) +def test_lora_bias(lora_bias_files: str, lora_bias: bool, fully_sharded: bool): + llm = vllm.LLM(MODEL_PATH, + enable_lora=True, + max_num_seqs=16, + max_lora_rank=8, + max_loras=1, + enable_lora_bias=lora_bias, + tensor_parallel_size=1, + fully_sharded_loras=fully_sharded) + + print("lora adapter created") + output1 = do_sample(llm, lora_bias_files, lora_id=0) + + print("lora") + output2 = do_sample(llm, lora_bias_files, lora_id=1) + + if lora_bias: + assert output1 != output2 + else: + assert output1 == output2 diff --git a/tests/lora/test_utils.py b/tests/lora/test_utils.py index db02bacdb6439..85110b8fa8cd2 100644 --- a/tests/lora/test_utils.py +++ b/tests/lora/test_utils.py @@ -12,36 +12,40 @@ def test_parse_fine_tuned_lora_name_valid(): fixture = { - ("base_model.model.lm_head.lora_A.weight", "lm_head", True), - ("base_model.model.lm_head.lora_B.weight", "lm_head", False), + ("base_model.model.lm_head.lora_A.weight", "lm_head", True, False), + ("base_model.model.lm_head.lora_B.weight", "lm_head", False, False), ( "base_model.model.model.embed_tokens.lora_embedding_A", "model.embed_tokens", True, + False, ), ( "base_model.model.model.embed_tokens.lora_embedding_B", "model.embed_tokens", False, + False, ), ( "base_model.model.model.layers.9.mlp.down_proj.lora_A.weight", "model.layers.9.mlp.down_proj", True, + False, ), ( "base_model.model.model.layers.9.mlp.down_proj.lora_B.weight", "model.layers.9.mlp.down_proj", False, + False, ), } - for name, module_name, is_lora_a in fixture: - assert (module_name, is_lora_a) == parse_fine_tuned_lora_name(name) + for name, module_name, is_lora_a, is_bias in fixture: + assert (module_name, is_lora_a, + is_bias) == parse_fine_tuned_lora_name(name) def test_parse_fine_tuned_lora_name_invalid(): fixture = { - "weight", "base_model.weight", "base_model.model.weight", } diff --git a/vllm/config.py b/vllm/config.py index b354fb61d7b7e..5ba1c41fcaac1 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1687,6 +1687,7 @@ class LoRAConfig: # This is a constant. 
lora_vocab_padding_size: ClassVar[int] = 256 long_lora_scaling_factors: Optional[Tuple[float]] = None + bias_enabled: bool = False def __post_init__(self): # Setting the maximum rank to 256 should be able to satisfy the vast diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 1591059a89f92..27f62b0008578 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -143,6 +143,7 @@ class EngineArgs: limit_mm_per_prompt: Optional[Mapping[str, int]] = None mm_processor_kwargs: Optional[Dict[str, Any]] = None enable_lora: bool = False + enable_lora_bias: bool = False max_loras: int = 1 max_lora_rank: int = 16 enable_prompt_adapter: bool = False @@ -584,6 +585,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument('--enable-lora', action='store_true', help='If True, enable handling of LoRA adapters.') + parser.add_argument('--enable-lora-bias', + action='store_true', + help='If True, enable bias for LoRA adapters.') parser.add_argument('--max-loras', type=int, default=EngineArgs.max_loras, @@ -1148,6 +1152,7 @@ def create_engine_config(self) -> VllmConfig: and parallel_config.use_ray), policy=self.scheduling_policy) lora_config = LoRAConfig( + bias_enabled=self.enable_lora_bias, max_lora_rank=self.max_lora_rank, max_loras=self.max_loras, fully_sharded_loras=self.fully_sharded_loras, diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index a7887a048746a..04fc635828d4d 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -70,6 +70,14 @@ def apply(self, x: torch.Tensor, self.lora_b_stacked, add_input=True) # now have column partitioned output + + if self.bias_stacked is not None: + self.bias_stacked = self.bias_stacked.view( + -1, self.bias_stacked.shape[-1]) + self.bias_stacked = self.bias_stacked[ + self.punica_wrapper.token_lora_indices] + output += self.bias_stacked + output = output.view(*out_orig_shape) return output @@ -121,6 +129,15 @@ def _mcp_apply(x, bias, layer: QKVParallelLinearWithLora): left_offset = 0 for idx in range(n): shard_size = layer.lora_b_stacked[idx].shape[2] + + if layer.bias_stacked is not None: + bias = layer.bias_stacked[idx] + if bias is not None: + bias = bias.view(-1, bias.shape[-1]) + bias = bias[layer.punica_wrapper.token_lora_indices] + bias[layer.punica_wrapper.token_lora_indices == -1] = 0 + output[:, left_offset:left_offset + shard_size] += bias + layer.punica_wrapper.add_expand_slice( output, buffers[idx], @@ -295,6 +312,15 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: lora_b = lora_b[:, start_idx:end_idx] return lora_b + def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: + if bias is None: + return bias + shard_size = self.bias_stacked.shape[2] + start_idx = self.tp_rank * shard_size + end_idx = (self.tp_rank + 1) * shard_size + bias = bias[start_idx:end_idx] + return bias + def apply(self, x: torch.Tensor) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x) @@ -318,6 +344,13 @@ def apply(self, x: torch.Tensor) -> torch.Tensor: # reduced before being used shard_size = self.lora_b_stacked.shape[2] start_idx = self.tp_rank * shard_size + + if self.bias_stacked is not None: + bias = self.bias_stacked.view(-1, self.bias_stacked.shape[-1]) + bias = bias[self.punica_wrapper.token_lora_indices] + bias[self.punica_wrapper.token_lora_indices == -1] = 0 + output += bias + self.punica_wrapper.add_expand_slice(output, buffer, self.lora_b_stacked, start_idx, shard_size) 
diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 6254c67596e65..7429c60e0222d 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -67,6 +67,63 @@ def dec(*args, **kwargs): return dec +def apply_bias( + indices: torch.Tensor, + output: torch.Tensor, + bias_stacked: torch.Tensor, +): + """Applies bias to output + + Input shapes: + bias_stacked: (num_loras, output_dim) + indices: (batch_size) + output: (batch_size, output_dim) + """ + org_output = output + output = output.view(-1, output.shape[-1]) + indices = indices.view(-1) + + bias_stacked = bias_stacked.view(-1, bias_stacked.shape[-1]) + bias_stacked = bias_stacked[indices] + bias_stacked[indices == -1] = 0 + output += bias_stacked + + return output.view_as(org_output) + + +def apply_bias_packed_nslice( + indices: torch.Tensor, + output: torch.Tensor, + output_slices: Tuple[int, ...], + bias_stacked: Tuple[torch.Tensor, torch.Tensor, torch.Tensor], +): + """Applies bias to output + + Input shapes: + bias_stacked: 3 element tuple of (num_loras, output_dim) + indices: (batch_size) + output: (batch_size, q_slice_size + 2*kv_slice_size) + output_slices: n-1 element tuple of (slice_size...), + where n is number of slices + """ + org_output = output + output = output.view(-1, output.shape[-1]) + indices = indices.view(-1) + + offset_left = 0 + for slice_idx, slice in enumerate(output_slices): + bias = bias_stacked[slice_idx] + if bias is not None: + bias = bias.view(-1, bias.shape[-1]) + bias = bias[indices] + bias[indices == -1] = 0 + output[:, offset_left:offset_left + slice] += bias + + offset_left += slice + + return output.view_as(org_output) + + @dataclass class LoRAMapping(AdapterMapping): is_prefill: bool = False @@ -105,6 +162,7 @@ def set_lora( lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): """Overwrites lora tensors at index.""" ... 
@@ -203,6 +261,7 @@ def set_lora( lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) self.lora_a_stacked[index, :lora_a.shape[0], :lora_a.shape[1]].copy_( @@ -299,10 +358,22 @@ def create_lora_weights( dtype=lora_config.lora_dtype, device=self.device, ) + if lora_config.bias_enabled: + self.bias_stacked = torch.zeros( + max_loras, + 1, + self.output_size, + dtype=lora_config.lora_dtype, + device=self.device, + ) + else: + self.bias_stacked = None def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 + if self.lora_config.bias_enabled: + self.bias_stacked[index] = 0 def set_lora( self, @@ -310,6 +381,7 @@ def set_lora( lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) @@ -319,10 +391,21 @@ def set_lora( self.lora_b_stacked[index, 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( lora_b.T, non_blocking=True) + if bias is not None: + self.bias_stacked[index, + 0, :bias.shape[0]].copy_(bias.T, + non_blocking=True) def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) + if self.bias_stacked is not None: + self.indices = self.punica_wrapper.token_lora_indices + output = apply_bias( + self.indices, + output, + self.bias_stacked, + ) self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0) return output @@ -401,11 +484,25 @@ def create_lora_weights( dtype=lora_config.lora_dtype, device=self.device, ) + + if lora_config.bias_enabled: + self.bias_stacked = torch.zeros( + max_loras, + 1, + self.output_size, + dtype=lora_config.lora_dtype, + device=self.device, + ) + else: + self.bias_stacked = None + self.output_dim = self.lora_b_stacked.shape[2] def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 + if self.lora_config.bias_enabled: + self.bias_stacked[index] = 0 def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: return lora_a @@ -418,18 +515,30 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: lora_b = lora_b[:, start_idx:end_idx] return lora_b + def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: + if bias is None: + return bias + tensor_model_parallel_rank = get_tensor_model_parallel_rank() + shard_size = self.output_dim + start_idx = tensor_model_parallel_rank * shard_size + end_idx = (tensor_model_parallel_rank + 1) * shard_size + bias = bias[start_idx:end_idx] + return bias + def set_lora( self, index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) if self.tp_size > 1: lora_a = self.slice_lora_a(lora_a) lora_b = self.slice_lora_b(lora_b) + bias = self.slice_bias(bias) self.lora_a_stacked[index, 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( @@ -437,10 +546,21 @@ def set_lora( self.lora_b_stacked[index, 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( lora_b.T, non_blocking=True) + if bias is not None: + self.bias_stacked[index, + 0, :bias.shape[0]].copy_(bias.T, + non_blocking=True) def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) + if self.bias_stacked is not None: + self.indices = self.punica_wrapper.token_lora_indices + output = apply_bias( + 
self.indices, + output, + self.bias_stacked, + ) self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0) return output @@ -534,6 +654,17 @@ def create_lora_weights( dtype=lora_config.lora_dtype, device=self.device, ) for _ in range(n_slices)) + if lora_config.bias_enabled: + self.bias_stacked = tuple( + torch.zeros( + max_loras, + 1, + self.output_size // 2, + dtype=lora_config.lora_dtype, + device=self.device, + ) for _ in range(n_slices)) + else: + self.bias_stacked = None self.output_dim = self.lora_b_stacked[0].shape[2] @@ -542,6 +673,9 @@ def reset_lora(self, index: int): self.lora_a_stacked[1][index] = 0 self.lora_b_stacked[0][index] = 0 self.lora_b_stacked[1][index] = 0 + if self.lora_config.bias_enabled: + self.bias_stacked[0][index] = 0 + self.bias_stacked[1][index] = 0 def slice_lora_a( self, lora_a: List[Union[torch.Tensor, None]] @@ -562,18 +696,32 @@ def slice_lora_b( ] return lora_b + def slice_bias( + self, bias: List[Union[torch.Tensor, + None]]) -> List[Union[torch.Tensor, None]]: + if bias[0] is None or bias[1] is None: + return bias + shard_size = self.output_dim + start_idx = self.tp_rank * shard_size + end_idx = (self.tp_rank + 1) * shard_size + bias = [bias[0][start_idx:end_idx], bias[1][start_idx:end_idx]] + return bias + def set_lora( self, index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) if self.tp_size > 1: lora_a = self.slice_lora_a(lora_a) lora_b = self.slice_lora_b(lora_b) + if bias is not None: + bias = self.slice_bias(bias) if lora_a[0] is not None: self.lora_a_stacked[0][ @@ -582,6 +730,10 @@ def set_lora( self.lora_b_stacked[0][ index, 0, :lora_b[0].shape[1], :lora_b[0].shape[0]].copy_( lora_b[0].T, non_blocking=True) + if bias is not None and bias[0] is not None: + self.bias_stacked[0][index, + 0, :bias[0].shape[0]].copy_(bias[0].T, + non_blocking=True) if lora_a[1] is not None: self.lora_a_stacked[1][ index, 0, :lora_a[1].shape[1], :lora_a[1].shape[0]].copy_( @@ -589,10 +741,22 @@ def set_lora( self.lora_b_stacked[1][ index, 0, :lora_b[1].shape[1], :lora_b[1].shape[0]].copy_( lora_b[1].T, non_blocking=True) + if bias is not None and bias[1] is not None: + self.bias_stacked[1][index, + 0, :bias[1].shape[0]].copy_(bias[1].T, + non_blocking=True) def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) + if self.bias_stacked is not None: + self.indices = self.punica_wrapper.token_lora_indices + output = apply_bias_packed_nslice( + self.indices, + output, + (self.output_dim, self.output_dim), + self.bias_stacked, + ) self.punica_wrapper.add_lora_packed_nslice( output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0, (self.output_dim, self.output_dim)) @@ -654,17 +818,35 @@ def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: lora_b = torch.cat([lora_b_q, lora_b_k, lora_b_v], dim=1) return lora_b + def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: + bias_q = bias[self.q_proj_shard_size * + self.q_shard_id:self.q_proj_shard_size * + (self.q_shard_id + 1)] + k_offset = self.q_proj_total_size + bias_k = bias[k_offset + + self.kv_proj_shard_size * self.kv_shard_id:k_offset + + self.kv_proj_shard_size * (self.kv_shard_id + 1)] + v_offset = k_offset + self.kv_proj_total_size + bias_v = bias[v_offset + + self.kv_proj_shard_size * self.kv_shard_id:v_offset + + self.kv_proj_shard_size * (self.kv_shard_id + 1)] + bias 
= torch.cat([bias_q, bias_k, bias_v], dim=1) + return bias + def set_lora( self, index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) if self.tp_size > 1: lora_a = self.slice_lora_a(lora_a) lora_b = self.slice_lora_b(lora_b) + if bias is not None: + bias = self.slice_bias(bias) self.lora_a_stacked[index, 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( @@ -672,6 +854,10 @@ def set_lora( self.lora_b_stacked[index, 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( lora_b.T, non_blocking=True) + if bias is not None: + self.bias_stacked[index, + 0, :bias.shape[0]].copy_(bias.T, + non_blocking=True) @classmethod @_not_fully_sharded_can_replace @@ -768,6 +954,32 @@ def create_lora_weights( device=self.device, ), ) + if lora_config.bias_enabled: + self.bias_stacked = ( + torch.zeros( + max_loras, + 1, + self.q_proj_shard_size, + dtype=lora_config.lora_dtype, + device=self.device, + ), + torch.zeros( + max_loras, + 1, + self.kv_proj_shard_size, + dtype=lora_config.lora_dtype, + device=self.device, + ), + torch.zeros( + max_loras, + 1, + self.kv_proj_shard_size, + dtype=lora_config.lora_dtype, + device=self.device, + ), + ) + else: + self.bias_stacked = None self.output_slices = ( self.q_proj_shard_size, @@ -787,6 +999,10 @@ def reset_lora(self, index: int): self.lora_b_stacked[1][index] = 0 self.lora_a_stacked[2][index] = 0 self.lora_b_stacked[2][index] = 0 + if self.lora_config.bias_enabled: + self.bias_stacked[0][index] = 0 + self.bias_stacked[1][index] = 0 + self.bias_stacked[2][index] = 0 def slice_lora_a( self, lora_a: List[Union[torch.Tensor, None]] @@ -812,18 +1028,40 @@ def slice_lora_b( lora_b = [lora_b_q, lora_b_k, lora_b_v] return lora_b + def slice_bias( + self, bias: List[Union[torch.Tensor, + None]]) -> List[Union[torch.Tensor, None]]: + bias_q, bias_k, bias_v = bias + if bias_q is not None: + bias_q = bias_q[self.q_proj_shard_size * + self.q_shard_id:self.q_proj_shard_size * + (self.q_shard_id + 1)] + if bias_k is not None: + bias_k = bias_k[self.kv_proj_shard_size * + self.kv_shard_id:self.kv_proj_shard_size * + (self.kv_shard_id + 1)] + if bias_v is not None: + bias_v = bias_v[self.kv_proj_shard_size * + self.kv_shard_id:self.kv_proj_shard_size * + (self.kv_shard_id + 1)] + bias = [bias_q, bias_k, bias_v] + return bias + def set_lora( self, index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) if self.tp_size > 1: lora_a = self.slice_lora_a(lora_a) lora_b = self.slice_lora_b(lora_b) + if bias is not None: + bias = self.slice_bias(bias) if lora_b[0] is not None: lora_b_q = lora_b[0] @@ -854,9 +1092,28 @@ def set_lora( index, 0, :lora_a[2].shape[1], :lora_a[2].shape[0]].copy_( lora_a[2].T, non_blocking=True) + if bias is not None: + if bias[0] is not None: + self.bias_stacked[0][index, 0, :bias[0].shape[0]].copy_( + bias[0].T, non_blocking=True) + if bias[1] is not None: + self.bias_stacked[1][index, 0, :bias[1].shape[0]].copy_( + bias[1].T, non_blocking=True) + if bias[2] is not None: + self.bias_stacked[2][index, 0, :bias[2].shape[0]].copy_( + bias[2].T, non_blocking=True) + def apply(self, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x, bias) + if self.bias_stacked is not None: + self.indices = self.punica_wrapper.token_lora_indices + output = apply_bias_packed_nslice( + self.indices, + 
output, + self.output_slices, + self.bias_stacked, + ) self.punica_wrapper.add_lora_packed_nslice(output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0, @@ -919,9 +1176,27 @@ def create_lora_weights( device=self.device, ) + if lora_config.bias_enabled: + self.bias_stacked = torch.zeros( + ( + max_loras, + 1, + self.output_size, + ), + dtype=lora_config.lora_dtype, + device=self.device, + ) + else: + self.bias_stacked = None + # Lazily initialized + self.indices: torch.Tensor + self.indices_len: List[int] + def reset_lora(self, index: int): self.lora_a_stacked[index] = 0 self.lora_b_stacked[index] = 0 + if self.lora_config.bias_enabled: + self.bias_stacked[index] = 0 def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: tensor_model_parallel_rank = get_tensor_model_parallel_rank() @@ -934,18 +1209,24 @@ def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor: def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor: return lora_b + def slice_bias(self, bias: torch.Tensor) -> torch.Tensor: + return bias + def set_lora( self, index: int, lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) if self.base_layer.tp_size > 1: lora_a = self.slice_lora_a(lora_a) lora_b = self.slice_lora_b(lora_b) + if bias is not None: + bias = self.slice_bias(bias) self.lora_a_stacked[index, 0, :lora_a.shape[1], :lora_a.shape[0]].copy_( @@ -953,9 +1234,20 @@ def set_lora( self.lora_b_stacked[index, 0, :lora_b.shape[1], :lora_b.shape[0]].copy_( lora_b.T, non_blocking=True) + if bias is not None: + self.bias_stacked[index, + 0, :bias.shape[0]].copy_(bias.T, + non_blocking=True) def apply(self, x: torch.Tensor) -> torch.Tensor: output = self.base_layer.quant_method.apply(self.base_layer, x) + if self.bias_stacked is not None: + self.indices = self.punica_wrapper.token_lora_indices + output = apply_bias( + self.indices, + output, + self.bias_stacked, + ) self.punica_wrapper.add_lora(output, x, self.lora_a_stacked, self.lora_b_stacked, 1.0) return output @@ -1132,6 +1424,7 @@ def set_lora( lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): self.reset_lora(index) self.lora_a_stacked[index, @@ -1199,7 +1492,7 @@ def _get_logits( neginf=float("-inf"))) logits[:, self.base_layer.org_vocab_size:self.base_layer.org_vocab_size + - lora_logits.shape[1], ] = lora_logits + lora_logits.shape[1]] = lora_logits # LogitsProcessorWithLoRA always using bgmv self.punica_wrapper.add_lora_logits(logits, hidden_states, @@ -1276,6 +1569,7 @@ def set_lora( lora_a: torch.Tensor, lora_b: torch.Tensor, embeddings_tensor: Optional[torch.Tensor], + bias: Optional[torch.Tensor] = None, ): ... 
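The packed variant, apply_bias_packed_nslice, applies the same gather-and-mask step slice by slice: output_slices carries the column width of each fused sub-projection (for QKV this is (q_proj_shard_size, kv_proj_shard_size, kv_proj_shard_size)), and each slice receives the bias gathered from its own stacked table. Below is a simplified sketch with toy sizes, using 2-D bias tables instead of the (max_loras, 1, dim) buffers allocated in the patch; it is only an illustration of the slicing logic.

    import torch

    q_size, kv_size, batch = 8, 4, 2
    output_slices = (q_size, kv_size, kv_size)          # fused QKV widths
    indices = torch.tensor([1, -1])                     # per-token adapter ids
    bias_stacked = tuple(torch.randn(2, s) for s in output_slices)  # 2 dummy adapters
    output = torch.zeros(batch, sum(output_slices))

    offset = 0
    for bias_table, width in zip(bias_stacked, output_slices):
        bias = bias_table[indices]                      # gather per-token bias rows
        bias[indices == -1] = 0                         # no adapter -> no contribution
        output[:, offset:offset + width] += bias
        offset += width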
diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py index 14081b5ba441c..b648312ba76ec 100644 --- a/vllm/lora/lora.py +++ b/vllm/lora/lora.py @@ -17,6 +17,7 @@ def __init__( lora_alpha: int, lora_a: torch.Tensor, lora_b: torch.Tensor, + bias: Optional[torch.Tensor] = None, embeddings_tensor: Optional[torch.Tensor] = None, scaling: Optional[float] = None, ) -> None: @@ -25,6 +26,7 @@ def __init__( self.lora_alpha = lora_alpha self.lora_a = lora_a self.lora_b = lora_b + self.bias = bias self.embeddings_tensor = embeddings_tensor if scaling is None: @@ -66,7 +68,8 @@ def create_dummy_lora_weights( rank: int, dtype: torch.dtype, device: torch.types.Device, - embeddings_tensor_dim: Optional[int] = None) -> "LoRALayerWeights": + embeddings_tensor_dim: Optional[int] = None, + bias_enabled: Optional[bool] = False) -> "LoRALayerWeights": pin_memory = str(device) == "cpu" and is_pin_memory_available() lora_a = torch.zeros([input_dim, rank], dtype=dtype, @@ -76,6 +79,14 @@ def create_dummy_lora_weights( dtype=dtype, device=device, pin_memory=pin_memory) + if bias_enabled: + bias = torch.zeros([output_dim], + dtype=dtype, + device=device, + pin_memory=pin_memory) + else: + bias = None + embeddings_tensor = torch.rand( 10, embeddings_tensor_dim, @@ -88,6 +99,7 @@ def create_dummy_lora_weights( lora_alpha=1, lora_a=lora_a, lora_b=lora_b, + bias=bias, embeddings_tensor=embeddings_tensor, ) @@ -102,6 +114,7 @@ def __init__( lora_alphas: List[Optional[int]], lora_a: List[Optional[torch.Tensor]], lora_b: List[Optional[torch.Tensor]], + bias: Optional[List[Optional[torch.Tensor]]] = None, scaling: Optional[List[float]] = None, ) -> None: super().__init__( @@ -110,6 +123,7 @@ def __init__( lora_alpha=0, lora_a=lora_a, lora_b=lora_b, + bias=bias, scaling=scaling, # type: ignore embeddings_tensor=None, ) @@ -141,6 +155,7 @@ def pack( [lora.lora_alpha if lora is not None else None for lora in loras], [lora.lora_a if lora is not None else None for lora in loras], [lora.lora_b if lora is not None else None for lora in loras], + [lora.bias if lora is not None else None for lora in loras], scaling=[ 1 if lora is not None else None # type: ignore for lora in loras diff --git a/vllm/lora/models.py b/vllm/lora/models.py index eafc3a43a2846..2ffefe61427e3 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -4,7 +4,7 @@ import os import re from dataclasses import dataclass, field -from typing import Any, Callable, Dict, List, Optional, Type +from typing import Any, Callable, Dict, List, Optional, Sequence, Type import safetensors.torch import torch @@ -119,7 +119,8 @@ def from_lora_tensors( pin_memory = str(device) == "cpu" and is_pin_memory_available() loras: Dict[str, LoRALayerWeights] = {} for tensor_name, tensor in tensors.items(): - module_name, is_lora_a = parse_fine_tuned_lora_name(tensor_name) + module_name, is_lora_a, is_bias = parse_fine_tuned_lora_name( + tensor_name) if module_name not in loras: lora_embeddings_tensor = None if embeddings: @@ -136,8 +137,16 @@ def from_lora_tensors( lora_embeddings_tensor.pin_memory()) loras[module_name] = LoRALayerWeights(module_name, rank, lora_alpha, None, None, + None, lora_embeddings_tensor) - if is_lora_a: + if is_bias: + loras[module_name].bias = tensor.to(device=device, + dtype=dtype).t() + bias = tensor.to(device=device, dtype=dtype).t() + if pin_memory: + bias = bias.pin_memory() + loras[module_name].bias = bias + elif is_lora_a: loras[module_name].lora_a = tensor.to(device=device, dtype=dtype).t() if pin_memory: @@ -215,7 +224,7 @@ def 
from_local_checkpoint( with safetensors.safe_open(lora_tensor_path, framework="pt") as f: # type: ignore for lora_module in f.keys(): # noqa - module_name, _ = parse_fine_tuned_lora_name(lora_module) + module_name, _, _ = parse_fine_tuned_lora_name(lora_module) part_name = module_name.split(".")[-1] if part_name not in expected_lora_modules: unexpected_modules.append(module_name) @@ -386,8 +395,19 @@ def activate_adapter( module_lora = lora_model.get_lora(module_name) if module_lora: module_lora.optimize() + # Bias is not explicitly enabled with the flag enable_lora_bias. + bias = module_lora.bias + if ((torch.is_tensor(bias) or + (isinstance(bias, Sequence) and any(b is not None + for b in bias))) + and not self.lora_config.bias_enabled): + module_lora.bias = None + raise ValueError( + f"Adapter bias cannot be used for {module_name}" + " without --enable-lora-bias.") module.set_lora(index, module_lora.lora_a, module_lora.lora_b, - module_lora.embeddings_tensor) + module_lora.embeddings_tensor, + module_lora.bias) else: module.reset_lora(index) return True @@ -509,6 +529,7 @@ def create_dummy_lora( """Create zero-initialized LoRAModel for warmup.""" model = LoRAModel(lora_id, rank, {}, scaling_factor) for module_name, module in self.model.named_modules(): + bias_enabled = self.lora_config.bias_enabled if (not self._match_target_modules(module_name) or not isinstance(module, BaseLayerWithLoRA) or isinstance(module, LinearScalingRotaryEmbeddingWithLora) @@ -536,7 +557,8 @@ def create_dummy_lora( rank, module.lora_a_stacked.dtype, "cpu", - embeddings_tensor_dim=embeddings_tensor_dim) + embeddings_tensor_dim=embeddings_tensor_dim, + bias_enabled=bias_enabled) else: lora = LoRALayerWeights.create_dummy_lora_weights( module_name, @@ -545,6 +567,7 @@ def create_dummy_lora( rank, module.lora_a_stacked.dtype, "cpu", + bias_enabled=bias_enabled, ) lora.optimize() else: @@ -559,6 +582,7 @@ def create_dummy_lora( rank, module.lora_a_stacked[i].dtype, "cpu", + bias_enabled=bias_enabled, ) lora.optimize() subloras.append(lora) diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py index a780429f413d3..5876494ce2824 100644 --- a/vllm/lora/utils.py +++ b/vllm/lora/utils.py @@ -91,7 +91,7 @@ def replace_submodule(model: nn.Module, module_name: str, return new_module -def parse_fine_tuned_lora_name(name: str) -> Tuple[str, bool]: +def parse_fine_tuned_lora_name(name: str) -> Tuple[str, bool, bool]: """Parse the name of lora weights. args: @@ -101,15 +101,18 @@ def parse_fine_tuned_lora_name(name: str) -> Tuple[str, bool]: Tuple(module_name, is_lora_a): module_name: the name of the module, e.g. model.dense1, is_lora_a whether the tensor is lora_a or lora_b. + is_bias whether the tensor is lora bias. 
""" parts = name.split(".") + if parts[-1] == "weight" and (parts[-2] == "lora_A" + or parts[-2] == "lora_B"): + return ".".join(parts[2:-2]), parts[-2] == "lora_A", False - if len(parts) >= 2 and parts[0] == "base_model" and parts[1] == "model": - if parts[-1] == "weight": - if parts[-2] == "lora_A" or parts[-2] == "lora_B": - return ".".join(parts[2:-2]), parts[-2] == "lora_A" - elif parts[-1] == "lora_embedding_A" or parts[-1] == "lora_embedding_B": - return ".".join(parts[2:-1]), parts[-1] == "lora_embedding_A" + if parts[-1] == "lora_embedding_A" or parts[-1] == "lora_embedding_B": + return ".".join(parts[2:-1]), parts[-1] == "lora_embedding_A", False + + if parts[-1] == "bias": + return ".".join(parts[2:-2]), False, True raise ValueError(f"{name} is unsupported LoRA weight") From 1f55e0571350f3dd2c04638e13e52d8ed557d93e Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 12 Nov 2024 13:39:56 -0800 Subject: [PATCH 112/183] [V1] Enable Inductor when using piecewise CUDA graphs (#10268) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu_model_runner.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 2c40853742ac9..db676e2819bf4 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -404,14 +404,17 @@ def execute_model( def load_model(self) -> None: if self.use_cuda_graph: - # FIXME(woosuk): Currently, we do not use inductor to reduce the - # compilation time and any potential issues with the inductor. - os.environ["VLLM_CUSTOM_OPS"] = "all" + # NOTE(woosuk): Currently, we use inductor because the piecewise + # CUDA graphs do not work properly with the custom CUDA kernels. + # FIXME(woosuk): Disable inductor to reduce the compilation time + # and avoid any potential issues with the inductor. + os.environ["VLLM_CUSTOM_OPS"] = "none" set_compilation_config( CompilationConfig( use_cudagraph=True, non_cudagraph_ops=["vllm.unified_v1_flash_attention"], - use_inductor=False, + use_inductor=True, + enable_fusion=False, )) logger.info("Starting to load model %s...", self.model_config.model) From 96ae0eaeb270be8741abb30f2251670b4554e886 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 12 Nov 2024 14:34:39 -0800 Subject: [PATCH 113/183] [doc] fix location of runllm widget (#10266) Signed-off-by: youkaichao --- docs/source/_static/custom.js | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/source/_static/custom.js b/docs/source/_static/custom.js index dac40ca2cfe75..18b502c786e1d 100644 --- a/docs/source/_static/custom.js +++ b/docs/source/_static/custom.js @@ -8,7 +8,9 @@ document.addEventListener("DOMContentLoaded", function () { script.setAttribute("version", "stable"); script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); // cmd-j or ctrl-j to open the widget. 
script.setAttribute("runllm-name", "vLLM"); - script.setAttribute("runllm-position", "TOP_RIGHT"); + script.setAttribute("runllm-position", "BOTTOM_RIGHT"); + script.setAttribute("runllm-position-y", "20%"); + script.setAttribute("runllm-position-x", "3%"); script.setAttribute("runllm-assistant-id", "207"); script.async = true; From 18081451f9f5dd3ae476ff1e217d5573832b2604 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 12 Nov 2024 14:43:52 -0800 Subject: [PATCH 114/183] [doc] improve debugging doc (#10270) Signed-off-by: youkaichao --- docs/source/getting_started/debugging.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index 060599680be25..77bf550601346 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -20,6 +20,10 @@ Hangs loading a model from disk If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow. It'd be better to store the model in a local disk. Additionally, have a look at the CPU memory usage, when the model is too large it might take a lot of CPU memory, slowing down the operating system because it needs to frequently swap between disk and memory. +.. note:: + + To isolate the model downloading and loading issue, you can use the ``--load-format dummy`` argument to skip loading the model weights. This way, you can check if the model downloading and loading is the bottleneck. + Model is too large ---------------------------------------- If the model is too large to fit in a single GPU, you might want to `consider tensor parallelism `_ to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using `this example `_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. 
From 377b74fe877c7eb4632c2ca0778b9da9a5db8ae6 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 12 Nov 2024 15:06:48 -0800 Subject: [PATCH 115/183] Revert "[ci][build] limit cmake version" (#10271) --- Dockerfile.neuron | 2 +- Dockerfile.ppc64le | 2 +- docs/source/getting_started/cpu-installation.rst | 2 +- pyproject.toml | 2 +- requirements-build.txt | 2 +- requirements-tpu.txt | 2 +- requirements-xpu.txt | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Dockerfile.neuron b/Dockerfile.neuron index 47e40e015239a..2143315d2a078 100644 --- a/Dockerfile.neuron +++ b/Dockerfile.neuron @@ -31,7 +31,7 @@ RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi RUN python3 -m pip install -U \ - 'cmake>=3.26,<=3.30' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ + 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ -r requirements-neuron.txt ENV VLLM_TARGET_DEVICE neuron diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le index c2a40000aab4b..b19c6ddec7948 100644 --- a/Dockerfile.ppc64le +++ b/Dockerfile.ppc64le @@ -21,7 +21,7 @@ RUN --mount=type=bind,source=.git,target=.git \ # These packages will be in rocketce eventually RUN --mount=type=cache,target=/root/.cache/pip \ pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \ - 'cmake>=3.26,<=3.30' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ + 'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \ torch==2.3.1 \ -r requirements-cpu.txt \ xformers uvloop==0.20.0 diff --git a/docs/source/getting_started/cpu-installation.rst b/docs/source/getting_started/cpu-installation.rst index 6bf170b164fb8..69530fd778c55 100644 --- a/docs/source/getting_started/cpu-installation.rst +++ b/docs/source/getting_started/cpu-installation.rst @@ -62,7 +62,7 @@ Build from source .. 
code-block:: console $ pip install --upgrade pip - $ pip install cmake>=3.26,<=3.30 wheel packaging ninja "setuptools-scm>=8" numpy + $ pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy $ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu - Finally, build and install vLLM CPU backend: diff --git a/pyproject.toml b/pyproject.toml index 3be401daa44c7..3c8c46cc8621e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [build-system] # Should be mirrored in requirements-build.txt requires = [ - "cmake>=3.26,<=3.30", + "cmake>=3.26", "ninja", "packaging", "setuptools>=61", diff --git a/requirements-build.txt b/requirements-build.txt index 64b92861df25d..fec01caaf25ef 100644 --- a/requirements-build.txt +++ b/requirements-build.txt @@ -1,5 +1,5 @@ # Should be mirrored in pyproject.toml -cmake>=3.26,<=3.30 +cmake>=3.26 ninja packaging setuptools>=61 diff --git a/requirements-tpu.txt b/requirements-tpu.txt index 94a3225dcf479..f9a0770804e55 100644 --- a/requirements-tpu.txt +++ b/requirements-tpu.txt @@ -2,7 +2,7 @@ -r requirements-common.txt # Dependencies for TPU -cmake>=3.26,<=3.30 +cmake>=3.26 ninja packaging setuptools-scm>=8 diff --git a/requirements-xpu.txt b/requirements-xpu.txt index 479cb4bb18484..e41295792283f 100644 --- a/requirements-xpu.txt +++ b/requirements-xpu.txt @@ -2,7 +2,7 @@ -r requirements-common.txt ray >= 2.9 -cmake>=3.26,<=3.30 +cmake>=3.26 ninja packaging setuptools-scm>=8 From 112fa0bbe5e5354f592a42913a4e6d72e0407b93 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 12 Nov 2024 16:17:20 -0800 Subject: [PATCH 116/183] [V1] Fix CI tests on V1 engine (#10272) Signed-off-by: Woosuk Kwon --- tests/v1/engine/test_engine_core.py | 3 +++ tests/v1/engine/test_engine_core_client.py | 3 +++ vllm/v1/engine/core.py | 2 +- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 8451aac33acc4..b3692b594326a 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -27,6 +27,9 @@ def make_request() -> EngineCoreRequest: request_id=uuid.uuid4(), prompt=PROMPT, prompt_token_ids=PROMPT_TOKENS, + mm_data=None, + mm_placeholders=None, + mm_processor_kwargs=None, sampling_params=SamplingParams(), eos_token_id=None, arrival_time=time.time(), diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index d582101a1164f..7b241bf836a0e 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -29,6 +29,9 @@ def make_request(params: SamplingParams) -> EngineCoreRequest: request_id=str(uuid.uuid4()), prompt=PROMPT, prompt_token_ids=PROMPT_TOKENS, + mm_data=None, + mm_placeholders=None, + mm_processor_kwargs=None, sampling_params=params, eos_token_id=None, arrival_time=time.time(), diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 808c3936b6c35..428483bdb29cb 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -317,7 +317,7 @@ def process_input_socket(self, input_path: str): # Msgpack serialization decoding. 
decoder_add_req = PickleEncoder() - decoder_abort_req = msgpack.Decoder(list[str]) + decoder_abort_req = PickleEncoder() with self.make_socket(input_path, zmq.constants.PULL) as socket: while True: From 0d4ea3fb5c8c499b70cea8b1deee3e34a147cff1 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Tue, 12 Nov 2024 17:36:08 -0800 Subject: [PATCH 117/183] [core][distributed] use tcp store directly (#10275) Signed-off-by: youkaichao --- tests/distributed/test_utils.py | 26 ++++++++++++++++---------- vllm/distributed/utils.py | 28 +++++++++++++--------------- 2 files changed, 29 insertions(+), 25 deletions(-) diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py index 5d77d8abb4718..50444d3abfaf2 100644 --- a/tests/distributed/test_utils.py +++ b/tests/distributed/test_utils.py @@ -43,12 +43,15 @@ def test_cuda_device_count_stateless(): def cpu_worker(rank, WORLD_SIZE, port1, port2): - pg1 = StatelessProcessGroup.create(init_method=f"tcp://127.0.0.1:{port1}", + pg1 = StatelessProcessGroup.create(host="127.0.0.1", + port=port1, rank=rank, world_size=WORLD_SIZE) if rank <= 2: - pg2 = StatelessProcessGroup.create( - init_method=f"tcp://127.0.0.1:{port2}", rank=rank, world_size=3) + pg2 = StatelessProcessGroup.create(host="127.0.0.1", + port=port2, + rank=rank, + world_size=3) data = torch.tensor([rank]) data = pg1.broadcast_obj(data, src=2) assert data.item() == 2 @@ -62,14 +65,17 @@ def cpu_worker(rank, WORLD_SIZE, port1, port2): def gpu_worker(rank, WORLD_SIZE, port1, port2): torch.cuda.set_device(rank) - pg1 = StatelessProcessGroup.create(init_method=f"tcp://127.0.0.1:{port1}", + pg1 = StatelessProcessGroup.create(host="127.0.0.1", + port=port1, rank=rank, world_size=WORLD_SIZE) pynccl1 = PyNcclCommunicator(pg1, device=rank) pynccl1.disabled = False if rank <= 2: - pg2 = StatelessProcessGroup.create( - init_method=f"tcp://127.0.0.1:{port2}", rank=rank, world_size=3) + pg2 = StatelessProcessGroup.create(host="127.0.0.1", + port=port2, + rank=rank, + world_size=3) pynccl2 = PyNcclCommunicator(pg2, device=rank) pynccl2.disabled = False data = torch.tensor([rank]).cuda() @@ -89,7 +95,8 @@ def gpu_worker(rank, WORLD_SIZE, port1, port2): def broadcast_worker(rank, WORLD_SIZE, port1, port2): - pg1 = StatelessProcessGroup.create(init_method=f"tcp://127.0.0.1:{port1}", + pg1 = StatelessProcessGroup.create(host="127.0.0.1", + port=port1, rank=rank, world_size=WORLD_SIZE) if rank == 2: @@ -101,7 +108,8 @@ def broadcast_worker(rank, WORLD_SIZE, port1, port2): def allgather_worker(rank, WORLD_SIZE, port1, port2): - pg1 = StatelessProcessGroup.create(init_method=f"tcp://127.0.0.1:{port1}", + pg1 = StatelessProcessGroup.create(host="127.0.0.1", + port=port1, rank=rank, world_size=WORLD_SIZE) data = pg1.all_gather_obj(rank) @@ -109,8 +117,6 @@ def allgather_worker(rank, WORLD_SIZE, port1, port2): pg1.barrier() -# TODO: investigate why this test is flaky. It hangs during initialization. 
-@pytest.mark.skip("Skip the test because it is flaky.") @multi_gpu_test(num_gpus=4) @pytest.mark.parametrize( "worker", [cpu_worker, gpu_worker, broadcast_worker, allgather_worker]) diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py index a77b41322f376..dcfcb848cbe06 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -9,7 +9,7 @@ from typing import Any, Deque, Dict, Optional, Sequence, Tuple import torch -from torch.distributed.rendezvous import rendezvous +from torch.distributed import TCPStore import vllm.envs as envs from vllm.logger import init_logger @@ -97,7 +97,6 @@ class StatelessProcessGroup: group. Only use it to communicate metadata between processes. For data-plane communication, create NCCL-related objects. """ - prefix: str rank: int world_size: int store: torch._C._distributed_c10d.Store @@ -127,7 +126,7 @@ def __post_init__(self): def send_obj(self, obj: Any, dst: int): """Send an object to a destination rank.""" self.expire_data() - key = f"{self.prefix}/send_to/{dst}/{self.send_dst_counter[dst]}" + key = f"send_to/{dst}/{self.send_dst_counter[dst]}" self.store.set(key, pickle.dumps(obj)) self.send_dst_counter[dst] += 1 self.entries.append((key, time.time())) @@ -147,8 +146,7 @@ def recv_obj(self, src: int) -> Any: """Receive an object from a source rank.""" obj = pickle.loads( self.store.get( - f"{self.prefix}/send_to/{self.rank}/{self.recv_src_counter[src]}" - )) + f"send_to/{self.rank}/{self.recv_src_counter[src]}")) self.recv_src_counter[src] += 1 return obj @@ -159,14 +157,14 @@ def broadcast_obj(self, obj: Optional[Any], src: int) -> Any: """ if self.rank == src: self.expire_data() - key = (f"{self.prefix}/broadcast_from/{src}/" + key = (f"broadcast_from/{src}/" f"{self.broadcast_send_counter}") self.store.set(key, pickle.dumps(obj)) self.broadcast_send_counter += 1 self.entries.append((key, time.time())) return obj else: - key = (f"{self.prefix}/broadcast_from/{src}/" + key = (f"broadcast_from/{src}/" f"{self.broadcast_recv_src_counter[src]}") recv_obj = pickle.loads(self.store.get(key)) self.broadcast_recv_src_counter[src] += 1 @@ -194,7 +192,8 @@ def barrier(self): @staticmethod def create( - init_method: str, + host: str, + port: int, rank: int, world_size: int, data_expiration_seconds: int = 3600, @@ -214,15 +213,14 @@ def create( can call `StatelessProcessGroup.create` to form a group, and then process A, B, C, and D can call `StatelessProcessGroup.create` to form another group. 
""" # noqa - from torch._C._distributed_c10d import _DEFAULT_PG_TIMEOUT - timeout = _DEFAULT_PG_TIMEOUT - - store, rank, world_size = next( - rendezvous(init_method, rank, world_size, timeout=timeout)) - store.set_timeout(timeout) + store = TCPStore( + host_name=host, + port=port, + world_size=world_size, + is_master=(rank == 0), + ) return StatelessProcessGroup( - prefix=init_method, rank=rank, world_size=world_size, store=store, From bbd3e86926f15e59e4c62246b4b3185e71fe7ff2 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Tue, 12 Nov 2024 20:53:13 -0800 Subject: [PATCH 118/183] [V1] Support VLMs with fine-grained scheduling (#9871) Signed-off-by: Woosuk Kwon Co-authored-by: Roger Wang --- vllm/model_executor/models/gpt2.py | 11 +- vllm/model_executor/models/llama.py | 7 +- vllm/model_executor/models/llava.py | 46 +++--- vllm/model_executor/models/opt.py | 7 +- vllm/model_executor/models/phi3v.py | 63 +++++--- vllm/model_executor/models/qwen2.py | 7 +- vllm/v1/core/encoder_cache_manager.py | 48 ++++++ vllm/v1/core/scheduler.py | 205 +++++++++++++++++++++++--- vllm/v1/engine/core.py | 10 ++ vllm/v1/engine/mm_input_mapper.py | 39 +++++ vllm/v1/request.py | 41 +++++- vllm/v1/worker/gpu_model_runner.py | 154 ++++++++++++++++--- 12 files changed, 542 insertions(+), 96 deletions(-) create mode 100644 vllm/v1/core/encoder_cache_manager.py create mode 100644 vllm/v1/engine/mm_input_mapper.py diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index fcff7ec2e01eb..adf2a7a51f737 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -216,9 +216,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor], ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - inputs_embeds = self.wte(input_ids) + if inputs_embeds is None: + inputs_embeds = self.wte(input_ids) position_embeds = self.wpe(position_ids) hidden_states = inputs_embeds + position_embeds else: @@ -263,6 +265,9 @@ def __init__( self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.transformer.wte(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -270,9 +275,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.transformer(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 2472128976d88..8aed0fead18f9 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -538,6 +538,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): normalize=False, softmax=False) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -545,9 +548,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> 
Union[torch.Tensor, IntermediateTensors]: model_output = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return model_output def compute_logits( diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index ca963fa1c52ea..af712bf8f9506 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -17,6 +17,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.base import NestedTensors from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of @@ -448,6 +449,25 @@ def _process_image_input(self, image_features = self._process_image_pixels(image_input) return self.multi_modal_projector(image_features) + def process_mm_inputs(self, **kwargs): + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + vision_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + if vision_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, vision_embeddings, + self.config.image_token_index) + return inputs_embeds + def forward( self, input_ids: torch.Tensor, @@ -455,6 +475,7 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object, ) -> Union[torch.Tensor, IntermediateTensors]: """Run forward pass for LLaVA-1.5. 
@@ -494,24 +515,13 @@ def forward( """ if intermediate_tensors is not None: inputs_embeds = None - else: - image_input = self._parse_and_validate_image_input(**kwargs) - if image_input is not None: - vision_embeddings = self._process_image_input(image_input) - inputs_embeds = self.language_model.model.get_input_embeddings( - input_ids) - - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, - self.config.image_token_index) - else: - inputs_embeds = self.language_model.model.get_input_embeddings( - input_ids) - - # always pass the input via `inputs_embeds` - # to make sure the computation graph is consistent - # for `torch.compile` integration - input_ids = None + elif inputs_embeds is None: + vision_embeddings = self.process_mm_inputs(**kwargs) + # always pass the input via `inputs_embeds` + # to make sure the computation graph is consistent + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None hidden_states = self.language_model.model(input_ids, positions, diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 58b6107eba347..997fe642439e6 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -360,6 +360,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -367,9 +370,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 4b5dc944bce4b..de03d28638cda 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -39,6 +39,7 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.base import NestedTensors, PlaceholderRange from vllm.multimodal.utils import cached_get_tokenizer, repeat_and_pad_token from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.utils import is_list_of @@ -500,15 +501,20 @@ def input_processor_for_phi3v(ctx: InputContext, # TODO: Move this to utils or integrate with clip. 
new_token_ids: List[int] = [] + placeholder_ranges: List[PlaceholderRange] = [] placeholder_idx = 0 while merged_token_ids: token_id = merged_token_ids.pop(0) if token_id == _IMAGE_TOKEN_ID: - new_token_ids.extend( - repeat_and_pad_token( - _IMAGE_TOKEN_ID, - repeat_count=image_feature_size[placeholder_idx], - )) + replacement_ids = repeat_and_pad_token( + _IMAGE_TOKEN_ID, + repeat_count=image_feature_size[placeholder_idx], + ) + placeholder_ranges.append({ + "offset": len(new_token_ids), + "length": len(replacement_ids) + }) + new_token_ids.extend(replacement_ids) placeholder_idx += 1 else: new_token_ids.append(token_id) @@ -516,7 +522,8 @@ def input_processor_for_phi3v(ctx: InputContext, # NOTE: Create a defensive copy of the original inputs return token_inputs(prompt_token_ids=new_token_ids, prompt=new_prompt, - multi_modal_data=multi_modal_data) + multi_modal_data=multi_modal_data, + multi_modal_placeholders={"image": placeholder_ranges}) @MULTIMODAL_REGISTRY.register_image_input_mapper() @@ -669,32 +676,42 @@ def _process_image_input( return image_embeds + def process_mm_inputs(self, **kwargs): + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + vision_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.embed_tokens(input_ids) + if vision_embeddings is not None: + inputs_embeds = merge_multimodal_embeddings( + input_ids, inputs_embeds, vision_embeddings, + self.image_token_id) + return inputs_embeds + def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs: object): if intermediate_tensors is not None: inputs_embeds = None - else: - image_input = self._parse_and_validate_image_input(**kwargs) - - if image_input is not None: - vision_embeddings = self._process_image_input(image_input) - inputs_embeds = self.embed_tokens(input_ids) - inputs_embeds = merge_multimodal_embeddings( - input_ids, inputs_embeds, vision_embeddings, - self.image_token_id) - else: - inputs_embeds = self.language_model.model.embed_tokens( - input_ids) - - # always pass the input via `inputs_embeds` - # to make sure the computation graph is consistent - # for `torch.compile` integration - input_ids = None + elif inputs_embeds is None: + vision_embeddings = self.process_mm_inputs(**kwargs) + # always pass the input via `inputs_embeds` + # to make sure the computation graph is consistent + inputs_embeds = self.get_input_embeddings(input_ids, + vision_embeddings) + input_ids = None hidden_states = self.language_model.model(input_ids, positions, diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 2195ce49aa9a7..b623c576bb673 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -441,6 +441,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -448,9 +451,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, 
intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py new file mode 100644 index 0000000000000..845bd5ea05e3c --- /dev/null +++ b/vllm/v1/core/encoder_cache_manager.py @@ -0,0 +1,48 @@ +from typing import Dict, List, Set, Tuple + +from vllm.v1.request import Request + + +class EncoderCacheManager: + + def __init__(self, cache_size: int): + self.cache_size = cache_size + self.num_free_slots = cache_size + # req_id -> cached input ids + self.cached: Dict[str, Set[int]] = {} + # List of [req_id, input_id] + self.freed: List[Tuple[str, int]] = [] + + def has_cache(self, request: Request, input_id: int) -> bool: + req_id = request.request_id + return req_id in self.cached and input_id in self.cached[req_id] + + def can_allocate(self, request: Request, input_id: int) -> bool: + num_tokens = request.get_num_encoder_tokens(input_id) + return num_tokens <= self.num_free_slots + + def allocate(self, request: Request, input_id: int) -> None: + req_id = request.request_id + if req_id not in self.cached: + self.cached[req_id] = set() + self.cached[req_id].add(input_id) + self.num_free_slots -= request.get_num_encoder_tokens(input_id) + + def get_cached_input_ids(self, request: Request) -> Set[int]: + return self.cached.get(request.request_id, set()) + + def free(self, request: Request, input_id: int) -> None: + req_id = request.request_id + if req_id not in self.cached: + return + + self.cached[req_id].discard(input_id) + if len(self.cached[req_id]) == 0: + del self.cached[req_id] + self.num_free_slots += request.get_num_encoder_tokens(input_id) + self.freed.append((req_id, input_id)) + + def get_freed_ids(self) -> List[Tuple[str, int]]: + freed = self.freed + self.freed = [] + return freed diff --git a/vllm/v1/core/scheduler.py b/vllm/v1/core/scheduler.py index ee860e792281d..ba50a9786d805 100644 --- a/vllm/v1/core/scheduler.py +++ b/vllm/v1/core/scheduler.py @@ -1,16 +1,21 @@ from collections import deque from dataclasses import dataclass -from typing import Deque, Dict, Iterable, List, Optional, Set, Union +from typing import (TYPE_CHECKING, Deque, Dict, Iterable, List, Optional, Set, + Tuple, Union) from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig from vllm.logger import init_logger -from vllm.multimodal import MultiModalDataDict from vllm.sampling_params import SamplingParams +from vllm.v1.core.encoder_cache_manager import EncoderCacheManager from vllm.v1.core.kv_cache_manager import KVCacheManager from vllm.v1.engine import EngineCoreOutput from vllm.v1.outputs import ModelRunnerOutput from vllm.v1.request import Request, RequestStatus +if TYPE_CHECKING: + from vllm.multimodal import MultiModalKwargs + from vllm.multimodal.base import PlaceholderRange + logger = init_logger(__name__) @@ -61,12 +66,20 @@ def __init__( # Request id -> RunningRequestData self.running_reqs_data: Dict[str, RunningRequestData] = {} - def schedule(self) -> "SchedulerOutput": - scheduled_new_reqs: List[Request] = [] - scheduled_resumed_reqs: List[Request] = [] - scheduled_running_reqs: List[Request] = [] - preempted_reqs: List[Request] = [] + # Encoder-related. 
+ # NOTE(woosuk): Here, "encoder" includes the vision encoder (and + # projector if needed). Currently, we assume that the encoder also + # has the Transformer architecture (e.g., ViT). + # FIXME(woosuk): Below are placeholder values. We need to calculate the + # actual values from the configurations. + self.max_num_encoder_input_tokens = 2048 + # NOTE(woosuk): For the models without encoder (e.g., text-only models), + # the encoder cache will not be initialized and used, regardless of + # the cache size. This is because the memory space for the encoder cache + # is preallocated in the profiling run. + self.encoder_cache_manager = EncoderCacheManager(cache_size=2048) + def schedule(self) -> "SchedulerOutput": # NOTE(woosuk) on the scheduling algorithm: # There's no "decoding phase" nor "prefill phase" in the scheduler. # Each request just has the num_computed_tokens and num_tokens, @@ -74,23 +87,45 @@ def schedule(self) -> "SchedulerOutput": # At each step, the scheduler tries to assign tokens to the requests # so that each request's num_computed_tokens can catch up its # num_tokens. This is general enough to cover chunked prefills, - # prefix caching, and the "jump forward" optimization in the future. + # prefix caching, and the "jump decoding" optimization in the future. + + scheduled_new_reqs: List[Request] = [] + scheduled_resumed_reqs: List[Request] = [] + scheduled_running_reqs: List[Request] = [] + preempted_reqs: List[Request] = [] req_to_new_block_ids: Dict[str, List[int]] = {} num_scheduled_tokens: Dict[str, int] = {} token_budget = self.max_num_scheduled_tokens + # Encoder-related. + scheduled_encoder_inputs: Dict[str, List[int]] = {} + encoder_budget = self.max_num_encoder_input_tokens # First, schedule the RUNNING requests. + # NOTE(woosuk): At most 1 request in the RUNNING queue is allowed to be + # in the "partial" state, where the request has some tokens computed + # but not all. The constraint is due to the persistent batch in the + # V1 model runner. + # TODO(woosuk): Remove this constraint after refactoring model runner. + has_partial_request = False req_index = 0 while req_index < len(self.running): - if token_budget == 0: - break - + # Only the last request in the RUNNING queue can be "partial". + assert not has_partial_request + assert token_budget > 0 request = self.running[req_index] num_new_tokens = request.num_tokens - request.num_computed_tokens num_new_tokens = min(num_new_tokens, token_budget) assert num_new_tokens > 0 + # Schedule encoder inputs. + encoder_inputs_to_schedule, num_new_tokens, new_encoder_budget = ( + self._try_schedule_encoder_inputs(request, + request.num_computed_tokens, + num_new_tokens, + encoder_budget)) + assert num_new_tokens > 0 + while True: new_blocks = self.kv_cache_manager.append_slots( request, num_new_tokens) @@ -106,22 +141,40 @@ def schedule(self) -> "SchedulerOutput": preempted_reqs.append(preempted_req) if preempted_req == request: # No more request to preempt. + can_schedule = False break else: # The request can be scheduled. - scheduled_running_reqs.append(request) - - req_to_new_block_ids[request.request_id] = [ - b.block_id for b in new_blocks - ] - num_scheduled_tokens[request.request_id] = num_new_tokens - token_budget -= num_new_tokens - req_index += 1 + can_schedule = True break + if not can_schedule: + break + + # Schedule the request. 
+ scheduled_running_reqs.append(request) + req_to_new_block_ids[request.request_id] = [ + b.block_id for b in new_blocks + ] + num_scheduled_tokens[request.request_id] = num_new_tokens + token_budget -= num_new_tokens + req_index += 1 + has_partial_request = (request.num_computed_tokens + num_new_tokens + < request.num_tokens) + + # Encoder-related. + if encoder_inputs_to_schedule: + scheduled_encoder_inputs[request.request_id] = ( + encoder_inputs_to_schedule) + # Allocate the encoder cache. + for i in encoder_inputs_to_schedule: + self.encoder_cache_manager.allocate(request, i) + encoder_budget = new_encoder_budget # Next, schedule the WAITING requests. if not preempted_reqs: while self.waiting: + if has_partial_request: + break if len(self.running) == self.max_num_running_reqs: break if token_budget == 0: @@ -149,12 +202,21 @@ def schedule(self) -> "SchedulerOutput": computed_blocks.pop() num_new_tokens = min(num_new_tokens, token_budget) assert num_new_tokens > 0 + + # Schedule encoder inputs. + (encoder_inputs_to_schedule, num_new_tokens, + new_encoder_budget) = self._try_schedule_encoder_inputs( + request, num_computed_tokens, num_new_tokens, + encoder_budget) + if num_new_tokens == 0: + # The request cannot be scheduled. + break + new_blocks = self.kv_cache_manager.allocate_slots( request, num_new_tokens, computed_blocks) if new_blocks is None: # The request cannot be scheduled. break - request.num_computed_tokens = num_computed_tokens self.waiting.popleft() self.running.append(request) @@ -172,6 +234,18 @@ def schedule(self) -> "SchedulerOutput": num_scheduled_tokens[request.request_id] = num_new_tokens token_budget -= num_new_tokens request.status = RequestStatus.RUNNING + request.num_computed_tokens = num_computed_tokens + has_partial_request = (num_computed_tokens + num_new_tokens < + request.num_tokens) + + # Encoder-related. + if encoder_inputs_to_schedule: + scheduled_encoder_inputs[request.request_id] = ( + encoder_inputs_to_schedule) + # Allocate the encoder cache. + for i in encoder_inputs_to_schedule: + self.encoder_cache_manager.allocate(request, i) + encoder_budget = new_encoder_budget # Check if the scheduling constraints are satisfied. total_num_scheduled_tokens = sum(num_scheduled_tokens.values()) @@ -205,12 +279,14 @@ def schedule(self) -> "SchedulerOutput": scheduled_running_reqs=running_reqs_data, num_scheduled_tokens=num_scheduled_tokens, total_num_scheduled_tokens=total_num_scheduled_tokens, + scheduled_encoder_inputs=scheduled_encoder_inputs, preempted_req_ids=preempted_req_ids, # finished_req_ids is an existing state in the scheduler, # instead of being newly scheduled in this step. # It contains the request IDs that are finished in between # the previous and the current steps. finished_req_ids=self.finished_req_ids, + free_encoder_input_ids=self.encoder_cache_manager.get_freed_ids(), ) self.finished_req_ids = set() @@ -234,6 +310,72 @@ def _make_running_request_data( self.running_reqs_data[request.request_id] = req_data return req_data + def _try_schedule_encoder_inputs( + self, + request: Request, + num_computed_tokens: int, + num_new_tokens: int, + encoder_budget: int, + ) -> Tuple[List[int], int, int]: + """ + Determine which encoder inputs need to be scheduled in the current step, + and update `num_new_tokens` and encoder token budget accordingly. + + An encoder input will be scheduled if: + - Its output tokens overlap with the range of tokens being computed + in this step, i.e., + [num_computed_tokens, num_computed_tokens + num_new_tokens). 
+ - It is not already computed and stored in the encoder cache. + - There is sufficient encoder token budget to process it. + - The encoder cache has space to store it. + + If an encoder input cannot be scheduled due to cache or budget + limitations, the method adjusts `num_new_tokens` to schedule only the + decoder tokens up to just before the unschedulable encoder input. + """ + if not request.has_encoder_inputs(): + return [], num_new_tokens, encoder_budget + + encoder_inputs_to_schedule: List[int] = [] + mm_positions = request.mm_positions + assert mm_positions is not None + assert len(mm_positions) > 0 + for i, pos_info in enumerate(mm_positions): + start_pos = pos_info["offset"] + num_encoder_tokens = pos_info["length"] + + # The encoder output is needed if the two ranges overlap: + # [num_computed_tokens, num_computed_tokens + num_new_tokens) and + # [start_pos, start_pos + num_encoder_tokens) + if start_pos >= num_computed_tokens + num_new_tokens: + # The encoder input is not needed in this step. + break + if start_pos + num_encoder_tokens <= num_computed_tokens: + # The encoder input is already computed and stored + # in the decoder's KV cache. + continue + + if self.encoder_cache_manager.has_cache(request, i): + # The encoder input is already computed and cached. + continue + if not self.encoder_cache_manager.can_allocate(request, i): + # The encoder cache is full. We can only schedule the decoder + # tokens just before the encoder input. + num_new_tokens = start_pos - num_computed_tokens + break + if num_encoder_tokens > encoder_budget: + # The encoder budget is exhausted. We can only schedule the + # decoder tokens up until the encoder input. + # NOTE(woosuk): We assume that the encoder tokens should be + # processed altogether, as the encoder usually uses + # bidirectional attention. + num_new_tokens = start_pos - num_computed_tokens + break + + encoder_budget -= num_encoder_tokens + encoder_inputs_to_schedule.append(i) + return encoder_inputs_to_schedule, num_new_tokens, encoder_budget + def update_from_output( self, scheduler_output: "SchedulerOutput", @@ -251,6 +393,17 @@ def update_from_output( # the request generates output tokens. Otherwise, we ignore the # sampler output for the request. assert request.num_computed_tokens <= request.num_tokens + + cached_encoder_input_ids = ( + self.encoder_cache_manager.get_cached_input_ids(request)) + for input_id in list(cached_encoder_input_ids): + start_pos = request.mm_positions[input_id]["offset"] + num_tokens = request.mm_positions[input_id]["length"] + if start_pos + num_tokens <= request.num_computed_tokens: + # The encoder output is already processed and stored + # in the decoder's KV cache. 
+ self.encoder_cache_manager.free(request, input_id) + if request.num_computed_tokens == request.num_tokens: req_index = model_runner_output.req_id_to_index[req_id] # NOTE(woosuk): Currently, we assume that each request @@ -355,7 +508,8 @@ class NewRequestData: req_id: str prompt_token_ids: List[int] prompt: Optional[str] - multi_modal_data: Optional[MultiModalDataDict] + mm_inputs: List["MultiModalKwargs"] + mm_positions: List["PlaceholderRange"] sampling_params: SamplingParams block_ids: List[int] num_computed_tokens: int @@ -369,9 +523,10 @@ def from_request( ) -> "NewRequestData": return cls( req_id=request.request_id, - prompt_token_ids=request.inputs["prompt_token_ids"], - prompt=request.inputs.get("prompt"), - multi_modal_data=request.inputs.get("multi_modal_data"), + prompt_token_ids=request.prompt_token_ids, + prompt=request.prompt, + mm_inputs=request.mm_inputs, + mm_positions=request.mm_positions, sampling_params=request.sampling_params, block_ids=block_ids, num_computed_tokens=num_computed_tokens, @@ -429,6 +584,8 @@ class SchedulerOutput: num_scheduled_tokens: Dict[str, int] total_num_scheduled_tokens: int + scheduled_encoder_inputs: Dict[str, List[int]] preempted_req_ids: Set[str] finished_req_ids: Set[str] + free_encoder_input_ids: List[Tuple[str, int]] diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 428483bdb29cb..35ed131d50de9 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -17,6 +17,7 @@ from vllm.v1.core.scheduler import Scheduler from vllm.v1.engine import (EngineCoreOutput, EngineCoreOutputs, EngineCoreRequest, EngineCoreRequestType) +from vllm.v1.engine.mm_input_mapper import MMInputMapper from vllm.v1.executor.gpu_executor import GPUExecutor from vllm.v1.request import Request, RequestStatus from vllm.v1.serial_utils import PickleEncoder @@ -65,6 +66,9 @@ def __init__( vllm_config.cache_config.num_gpu_blocks = num_gpu_blocks vllm_config.cache_config.num_cpu_blocks = num_cpu_blocks + # Set up multimodal input mapper (e.g., convert PIL images to tensors). + self.mm_input_mapper = MMInputMapper(vllm_config.model_config) + # Setup scheduler. self.scheduler = Scheduler(vllm_config.scheduler_config, vllm_config.cache_config, @@ -93,6 +97,12 @@ def add_request(self, request: EngineCoreRequest): """Add request to the scheduler.""" req = Request.from_engine_core_request(request) + # FIXME(woosuk): The input mapping (e.g., PIL images to tensors) may + # take 10-50 ms, which can cause a spike in the latency. We should + # consider moving this to a separate thread. 
+ if req.mm_data: + req.mm_inputs = self.mm_input_mapper.process_inputs( + req.mm_data, req.mm_processor_kwargs) self.scheduler.add_request(req) def abort_requests(self, request_ids: List[str]): diff --git a/vllm/v1/engine/mm_input_mapper.py b/vllm/v1/engine/mm_input_mapper.py new file mode 100644 index 0000000000000..594c973678235 --- /dev/null +++ b/vllm/v1/engine/mm_input_mapper.py @@ -0,0 +1,39 @@ +from typing import Any, Dict, List, Optional + +from vllm.config import ModelConfig +from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict, + MultiModalKwargs, MultiModalRegistry) + + +class MMInputMapper: + + def __init__( + self, + model_config: ModelConfig, + mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, + ): + self.mm_registry = mm_registry + self.multi_modal_input_mapper = mm_registry.create_input_mapper( + model_config) + self.mm_registry.init_mm_limits_per_prompt(model_config) + + def process_inputs( + self, + mm_data: MultiModalDataDict, + mm_processor_kwargs: Optional[Dict[str, Any]], + ) -> List[MultiModalKwargs]: + image_inputs = mm_data["image"] + if not isinstance(image_inputs, list): + image_inputs = [image_inputs] + + # Process each image input separately so that later we can schedule + # them in a fine-grained manner. + mm_inputs: List[MultiModalKwargs] = [] + num_images = len(image_inputs) + for i in range(num_images): + mm_input = self.multi_modal_input_mapper( + {"image": [image_inputs[i]]}, + mm_processor_kwargs=mm_processor_kwargs, + ) + mm_inputs.append(mm_input) + return mm_inputs diff --git a/vllm/v1/request.py b/vllm/v1/request.py index 00e5aea92a8df..f35cf738c89bf 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -3,6 +3,7 @@ from vllm.inputs.data import DecoderOnlyInputs from vllm.lora.request import LoRARequest +from vllm.multimodal import MultiModalKwargs from vllm.sampling_params import SamplingParams from vllm.sequence import RequestMetrics from vllm.v1.engine import EngineCoreRequest @@ -47,14 +48,30 @@ def __init__( self._all_token_ids: List[int] = self.prompt_token_ids.copy() self.num_computed_tokens = 0 + # Raw multimodal data before the mm input mapper (e.g., PIL images). + self.mm_data = inputs.get("multi_modal_data") + self.mm_processor_kwargs = inputs.get("mm_processor_kwargs") + mm_positions = inputs.get("multi_modal_placeholders") + if mm_positions: + # FIXME(woosuk): Support other modalities. + self.mm_positions = mm_positions.get("image", []) + else: + self.mm_positions = [] + # Output of the mm input mapper (e.g., image tensors). 
+ self.mm_inputs: List[MultiModalKwargs] = [] + @classmethod def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": - return cls( request_id=request.request_id, - inputs=DecoderOnlyInputs(type="token", - prompt_token_ids=request.prompt_token_ids, - prompt=request.prompt), + inputs=DecoderOnlyInputs( + type="token", + prompt_token_ids=request.prompt_token_ids, + prompt=request.prompt, + multi_modal_data=request.mm_data, + multi_modal_placeholders=request.mm_placeholders, + mm_processor_kwargs=request.mm_processor_kwargs, + ), sampling_params=request.sampling_params, eos_token_id=request.eos_token_id, arrival_time=request.arrival_time, @@ -96,9 +113,21 @@ def is_finished(self) -> bool: def get_finished_reason(self) -> Union[str, None]: return RequestStatus.get_finished_reason(self.status) + def has_encoder_inputs(self) -> bool: + return self.mm_data is not None + + @property + def num_encoder_inputs(self) -> int: + return len(self.mm_positions) + + def get_num_encoder_tokens(self, input_id: int) -> int: + assert input_id < len(self.mm_positions) + num_tokens = self.mm_positions[input_id]["length"] + return num_tokens + class RequestStatus(enum.IntEnum): - """Status of a sequence.""" + """Status of a request.""" WAITING = 0 RUNNING = 1 PREEMPTED = 2 @@ -119,7 +148,7 @@ def get_finished_reason(status: "RequestStatus") -> Union[str, None]: # Mapping of finished statuses to their finish reasons. -# NOTE: The ignored sequences are the sequences whose prompt lengths +# NOTE: The ignored requests are the requests whose prompt lengths # are longer than the model's length cap. Therefore, the stop # reason should also be "length" as in OpenAI API. _FINISHED_REASON_MAP = { diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index db676e2819bf4..81480786a09e1 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1,7 +1,7 @@ import os import time from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, List, Optional, Set +from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple import numpy as np import torch @@ -14,9 +14,10 @@ from vllm.compilation.levels import CompilationLevel from vllm.config import VllmConfig from vllm.forward_context import set_forward_context +from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model -from vllm.multimodal import MultiModalDataDict +from vllm.multimodal import MultiModalKwargs from vllm.plugins import set_compilation_config from vllm.sampling_params import SamplingParams, SamplingType from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, DeviceMemoryProfiler, cdiv, @@ -27,6 +28,7 @@ from vllm.v1.sample.metadata import SamplingMetadata if TYPE_CHECKING: + from vllm.multimodal.base import PlaceholderRange from vllm.v1.core.scheduler import SchedulerOutput logger = init_logger(__name__) @@ -37,8 +39,8 @@ class GPUModelRunner: def __init__( self, vllm_config: VllmConfig, + input_registry: InputRegistry = INPUT_REGISTRY, ): - # TODO: use ModelRunnerBase.__init__(self, vllm_config=vllm_config) self.vllm_config = vllm_config self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config @@ -75,10 +77,16 @@ def __init__( parallel_config) self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) self.head_size = model_config.get_head_size() + self.hidden_size = model_config.get_hidden_size() + + # Multi-modal data support + 
self.input_registry = input_registry # Lazy initialization # self.model: nn.Module # Set after load_model self.kv_caches: List[torch.Tensor] = [] + # req_id -> (input_id -> encoder_output) + self.encoder_cache: Dict[str, Dict[int, torch.Tensor]] = {} # Request states. self.requests: Dict[str, CachedRequestState] = {} @@ -96,18 +104,28 @@ def __init__( and not self.model_config.enforce_eager) # TODO(woosuk): Provide an option to tune the max cudagraph batch size. self.cudagraph_batch_sizes = [1, 2, 4] + [i for i in range(8, 513, 8)] - self.input_ids = torch.zeros(self.max_num_tokens, - dtype=torch.int32, - device=self.device) self.positions = torch.zeros(self.max_num_tokens, dtype=torch.int64, device=self.device) + self.inputs_embeds = torch.zeros( + (self.max_num_tokens, self.hidden_size), + dtype=self.dtype, + device=self.device) def _update_states(self, scheduler_output: "SchedulerOutput") -> None: # Remove stopped requests from the cached states. # Keep the states of the pre-empted requests. for req_id in scheduler_output.finished_req_ids: self.requests.pop(req_id, None) + self.encoder_cache.pop(req_id, None) + + # Free the cached encoder outputs. + for req_id, input_id in scheduler_output.free_encoder_input_ids: + encoder_outputs = self.encoder_cache.get(req_id) + if encoder_outputs is not None: + encoder_outputs.pop(input_id, None) + if not encoder_outputs: + self.encoder_cache.pop(req_id, None) # Remove the requests from the persistent batch. stopped_req_ids = set().union( @@ -156,7 +174,8 @@ def _update_states(self, scheduler_output: "SchedulerOutput") -> None: req_id=req_id, prompt_token_ids=req_data.prompt_token_ids, prompt=req_data.prompt, - multi_modal_data=req_data.multi_modal_data, + mm_inputs=req_data.mm_inputs, + mm_positions=req_data.mm_positions, sampling_params=sampling_params, generator=generator, block_ids=req_data.block_ids, @@ -285,11 +304,9 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): seq_start_loc_np[0] = 0 np.cumsum(seq_lens, out=seq_start_loc_np[1:]) - self.input_ids[:total_num_scheduled_tokens].copy_(input_ids, - non_blocking=True) + input_ids = input_ids.to(self.device, non_blocking=True) self.positions[:total_num_scheduled_tokens].copy_(positions, non_blocking=True) - query_start_loc = query_start_loc.to(self.device, non_blocking=True) seq_start_loc = seq_start_loc.to(self.device, non_blocking=True) slot_mapping = slot_mapping.to(self.device, non_blocking=True).long() @@ -308,7 +325,7 @@ def _prepare_inputs(self, scheduler_output: "SchedulerOutput"): # token from the partial request. # TODO: Support prompt logprobs. logits_indices = query_start_loc[1:] - 1 - return attn_metadata, logits_indices + return input_ids, attn_metadata, logits_indices def _prepare_sampling( self, @@ -325,13 +342,91 @@ def _prepare_sampling( sampling_metadata = self.input_batch.make_sampling_metadata(skip_copy) return sampling_metadata + def _execute_encoder(self, scheduler_output: "SchedulerOutput"): + scheduled_encoder_inputs = scheduler_output.scheduled_encoder_inputs + if not scheduled_encoder_inputs: + return + + # Batch the multi-modal inputs. 
+ mm_inputs: List[MultiModalKwargs] = [] + req_input_ids: List[Tuple[int, int]] = [] + for req_id, encoder_input_ids in scheduled_encoder_inputs.items(): + req_state = self.requests[req_id] + for input_id in encoder_input_ids: + mm_inputs.append(req_state.mm_inputs[input_id]) + req_input_ids.append((req_id, input_id)) + batched_mm_inputs = MultiModalKwargs.batch(mm_inputs) + batched_mm_inputs = MultiModalKwargs.as_kwargs(batched_mm_inputs, + device=self.device) + + # Run the encoder. + # `encoder_outputs` is either of the following: + # 1. A tensor of shape [num_images, feature_size, hidden_size] + # in case when feature_size is fixed across all images. + # 2. A list (length: num_images) of tensors, each of shape + # [feature_size, hidden_size] in case when the feature size is + # dynamic depending on input images. + encoder_outputs = self.model.process_mm_inputs(**batched_mm_inputs) + + # Cache the encoder outputs. + for (req_id, input_id), output in zip(req_input_ids, encoder_outputs): + if req_id not in self.encoder_cache: + self.encoder_cache[req_id] = {} + self.encoder_cache[req_id][input_id] = output + + def _gather_encoder_outputs( + self, + scheduler_output: "SchedulerOutput", + ) -> List[torch.Tensor]: + encoder_outputs: List[torch.Tensor] = [] + num_reqs = self.input_batch.num_reqs + for req_id in self.input_batch.req_ids[:num_reqs]: + num_scheduled_tokens = scheduler_output.num_scheduled_tokens[ + req_id] + req_state = self.requests[req_id] + num_computed_tokens = req_state.num_computed_tokens + mm_positions = req_state.mm_positions + for i, pos_info in enumerate(mm_positions): + start_pos = pos_info["offset"] + num_encoder_tokens = pos_info["length"] + + # The encoder output is needed if the two ranges overlap: + # [num_computed_tokens, + # num_computed_tokens + num_scheduled_tokens) and + # [start_pos, start_pos + num_encoder_tokens) + if start_pos >= num_computed_tokens + num_scheduled_tokens: + # The encoder output is not needed in this step. + break + if start_pos + num_encoder_tokens <= num_computed_tokens: + # The encoder output is already processed and stored + # in the decoder's KV cache. + continue + + start_idx = max(num_computed_tokens - start_pos, 0) + end_idx = min( + num_computed_tokens - start_pos + num_scheduled_tokens, + num_encoder_tokens) + assert start_idx < end_idx + assert req_id in self.encoder_cache + assert i in self.encoder_cache[req_id] + encoder_output = self.encoder_cache[req_id][i] + encoder_outputs.append(encoder_output[start_idx:end_idx]) + return encoder_outputs + @torch.inference_mode() def execute_model( self, scheduler_output: "SchedulerOutput", ) -> ModelRunnerOutput: self._update_states(scheduler_output) - attn_metadata, logits_indices = self._prepare_inputs(scheduler_output) + + # Run the encoder. + self._execute_encoder(scheduler_output) + encoder_outputs = self._gather_encoder_outputs(scheduler_output) + + # Prepare the decoder inputs. + input_ids, attn_metadata, logits_indices = self._prepare_inputs( + scheduler_output) num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.use_cuda_graph and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): @@ -343,12 +438,26 @@ def execute_model( # Eager mode. num_input_tokens = num_scheduled_tokens + # Get the inputs embeds. 
+ if encoder_outputs: + inputs_embeds = self.model.get_input_embeddings( + input_ids, encoder_outputs) + else: + inputs_embeds = self.model.get_input_embeddings(input_ids) + # NOTE(woosuk): To unify token ids and soft tokens (vision embeddings), + # always use embeddings (rather than token ids) as input to the model. + # TODO(woosuk): Avoid the copy. Optimize. + self.inputs_embeds[:num_scheduled_tokens].copy_(inputs_embeds) + + # Run the decoder. + # Use persistent buffers for CUDA graphs. with set_forward_context(attn_metadata): hidden_states = self.model( - input_ids=self.input_ids[:num_input_tokens], + input_ids=None, positions=self.positions[:num_input_tokens], kv_caches=self.kv_caches, attn_metadata=None, + inputs_embeds=self.inputs_embeds[:num_input_tokens], ) hidden_states = hidden_states[:num_scheduled_tokens] hidden_states = hidden_states[logits_indices] @@ -440,13 +549,16 @@ def _dummy_run(self, model: nn.Module, num_tokens: int) -> None: with set_forward_context(None): # noqa: SIM117 with set_compile_context(self.cudagraph_batch_sizes): # Trigger compilation for general shape. - model(self.input_ids, - self.positions, - dummy_kv_caches, - attn_metadata=None) + model(input_ids=None, + positions=self.positions, + kv_caches=dummy_kv_caches, + attn_metadata=None, + inputs_embeds=self.inputs_embeds) @torch.inference_mode() def profile_run(self) -> None: + # TODO(woosuk): Profile the max memory usage of the encoder and + # the encoder cache. self._dummy_run(self.model, self.max_num_tokens) torch.cuda.synchronize() @@ -468,10 +580,11 @@ def capture_model(self) -> None: # can reuse the memory pool allocated for the large shapes. for num_tokens in reversed(self.cudagraph_batch_sizes): self.model( - self.input_ids[:num_tokens], - self.positions[:num_tokens], + input_ids=None, + positions=self.positions[:num_tokens], kv_caches=self.kv_caches, attn_metadata=None, + inputs_embeds=self.inputs_embeds[:num_tokens], ) end_time = time.perf_counter() @@ -506,7 +619,8 @@ class CachedRequestState: req_id: str prompt_token_ids: List[int] prompt: Optional[str] - multi_modal_data: Optional["MultiModalDataDict"] + mm_inputs: List[MultiModalKwargs] + mm_positions: List["PlaceholderRange"] sampling_params: SamplingParams generator: Optional[torch.Generator] From 56a955e7748e497d8c24c79a76c75f3f982fab4a Mon Sep 17 00:00:00 2001 From: Dipika Sikka Date: Wed, 13 Nov 2024 00:54:10 -0500 Subject: [PATCH 119/183] Bump to compressed-tensors v0.8.0 (#10279) Signed-off-by: Dipika --- requirements-common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-common.txt b/requirements-common.txt index ef5ed8b645158..acb766d25a2d9 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -31,4 +31,4 @@ pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. 
-compressed-tensors == 0.7.1 # required for compressed-tensors +compressed-tensors == 0.8.0 # required for compressed-tensors \ No newline at end of file From 032fcf16ae9d924cc98a083c3c8464173f87a49e Mon Sep 17 00:00:00 2001 From: Xin Yang <105740670+xyang16@users.noreply.github.com> Date: Tue, 12 Nov 2024 21:54:52 -0800 Subject: [PATCH 120/183] [Doc] Fix typo in arg_utils.py (#10264) Signed-off-by: Xin Yang --- vllm/engine/arg_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 27f62b0008578..31aa8c5908719 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -626,8 +626,8 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: type=int, default=EngineArgs.max_cpu_loras, help=('Maximum number of LoRAs to store in CPU memory. ' - 'Must be >= than max_num_seqs. ' - 'Defaults to max_num_seqs.')) + 'Must be >= than max_loras. ' + 'Defaults to max_loras.')) parser.add_argument( '--fully-sharded-loras', action='store_true', From 3945c82346dae3129213607663bfd17edd905fef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=B5=E8=84=91=E6=98=9F=E4=BA=BA?= Date: Wed, 13 Nov 2024 15:07:22 +0800 Subject: [PATCH 121/183] [Model] Add support for Qwen2-VL video embeddings input & multiple image embeddings input with varied resolutions (#10221) Signed-off-by: imkero --- docs/source/models/supported_models.rst | 2 +- .../vision_language/test_qwen2_vl.py | 428 ++++++++++++++++++ vllm/model_executor/models/qwen2_vl.py | 180 ++++++-- 3 files changed, 578 insertions(+), 32 deletions(-) create mode 100644 tests/models/decoder_only/vision_language/test_qwen2_vl.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 5a474043078db..ca894819f2c26 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -538,7 +538,7 @@ Text Generation - ✅︎ * - :code:`Qwen2VLForConditionalGeneration` - Qwen2-VL - - T + I\ :sup:`E+` + V\ :sup:`+` + - T + I\ :sup:`E+` + V\ :sup:`E+` - :code:`Qwen/Qwen2-VL-2B-Instruct`, :code:`Qwen/Qwen2-VL-7B-Instruct`, :code:`Qwen/Qwen2-VL-72B-Instruct`, etc. 
- ✅︎ - ✅︎ diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py new file mode 100644 index 0000000000000..718c675b86fb4 --- /dev/null +++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py @@ -0,0 +1,428 @@ +from typing import Any, List, Optional, Tuple, Type, TypedDict, Union + +import numpy.typing as npt +import pytest +import torch +from PIL import Image + +from vllm.entrypoints.llm import LLM +from vllm.multimodal.utils import (rescale_image_size, rescale_video_size, + sample_frames_from_video) + +from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput, + PromptVideoInput, VllmRunner) +from ...utils import check_logprobs_close + +models = ["Qwen/Qwen2-VL-2B-Instruct"] +target_dtype = "half" + +IMAGE_PLACEHOLDER = "<|vision_start|><|image_pad|><|vision_end|>" +VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>" + + +def qwen2_vl_chat_template(*query): + return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n" # noqa: E501 + + +IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + qwen2_vl_chat_template( + IMAGE_PLACEHOLDER, + "What is the biggest text's content in this image?", + ), + "cherry_blossom": + qwen2_vl_chat_template( + IMAGE_PLACEHOLDER, + "What is the season shown in this image? ", + "Reply with a short sentence (no more than 20 words)", + ), +}) + +VIDEO_PROMPTS = VIDEO_ASSETS.prompts({ + "sample_demo_1": + qwen2_vl_chat_template( + VIDEO_PLACEHOLDER, + "Describe this video with a short sentence ", + "(no more than 20 words)", + ), +}) + +MULTIIMAGE_PROMPT = qwen2_vl_chat_template( + IMAGE_PLACEHOLDER, + IMAGE_PLACEHOLDER, + "Describe these two images separately. ", + "For each image, reply with a short sentence ", + "(no more than 10 words).", +) + + +class Qwen2VLPromptImageEmbeddingInput(TypedDict): + image_embeds: torch.Tensor + image_grid_thw: torch.Tensor + + +class Qwen2VLPromptVideoEmbeddingInput(TypedDict): + video_embeds: torch.Tensor + video_grid_thw: torch.Tensor + + +def batch_make_image_embeddings( + image_batches: List[Union[Image.Image, List[Image.Image]]], processor, + llm: LLM) -> List[Qwen2VLPromptImageEmbeddingInput]: + """batched image embeddings for Qwen2-VL + + This will infer all images' embeddings in a single batch, + and split the result according to input batches. + + image_batches: + - Single-image batches: `List[Image.Image]` + - Multiple-image batches: `List[List[Image.Image]]]` + + returns: `List[Qwen2VLPromptImageEmbeddingInput]` + """ + + image_batches_: List[Any] = image_batches[:] + + # convert single-image batches to multiple-image batches + for idx in range(len(image_batches_)): + if not isinstance(image_batches_[idx], list): + image_batches_[idx] = [image_batches_[idx]] + + assert isinstance(image_batches_[idx], list) + + # append all images into a list (as a batch) + images: List[Image.Image] = [] + for image_batch in image_batches_: + images += image_batch + + # image to pixel values + image_processor = processor.image_processor + + preprocess_result = image_processor \ + .preprocess(images=images, return_tensors="pt") \ + .data + pixel_values = preprocess_result["pixel_values"] + image_grid_thw = preprocess_result["image_grid_thw"] + + # pixel values to embeddinds & grid_thws + with torch.no_grad(): + visual = llm.llm_engine.model_executor.driver_worker. 
\ + model_runner.model.visual + + pixel_values_on_device = pixel_values.to(visual.device, + dtype=visual.dtype) + image_grid_thw_on_device = image_grid_thw.to(visual.device, + dtype=torch.int64) + image_embeds = visual(pixel_values_on_device, + grid_thw=image_grid_thw_on_device) + + # split into original batches + result: List[Qwen2VLPromptImageEmbeddingInput] = [] + image_counter = 0 + embed_counter = 0 + for image_batch in image_batches_: + cur_batch_image_count = len(image_batch) + merge_size = image_processor.merge_size + cur_batch_embed_len = sum([ + grid_thw.prod() // merge_size // merge_size + for grid_thw in image_grid_thw[image_counter:image_counter + + cur_batch_image_count] + ]) + + result.append({ + "image_embeds": + image_embeds[embed_counter:embed_counter + cur_batch_embed_len], + "image_grid_thw": + image_grid_thw[image_counter:image_counter + + cur_batch_image_count], + }) + + embed_counter += cur_batch_embed_len + image_counter += cur_batch_image_count + + # ensure we don't lost any images or embeddings + assert embed_counter == image_embeds.size(0) + assert image_counter == image_grid_thw.size(0) + assert len(image_batches) == len(result) + + return result + + +def batch_make_video_embeddings( + video_batches: PromptVideoInput, processor, + llm: LLM) -> List[Qwen2VLPromptVideoEmbeddingInput]: + """batched video embeddings for Qwen2-VL + + A NDArray represents a single video's all frames. + + This will infer all videos' embeddings in a single batch, + and split the result according to input batches. + + video_batches: + - Single-video batches: `List[NDArray]` + - Multiple-video batches: `List[List[NDArray]]` + """ + + video_batches_: List[Any] = video_batches[:] + + for idx in range(len(video_batches_)): + if not isinstance(video_batches_[idx], list): + single_video_batch: List[npt.NDArray] = [video_batches_[idx]] + video_batches_[idx] = single_video_batch + + assert isinstance(video_batches_[idx], list) + + # append all videos into a list (as a batch) + videos: List[npt.NDArray] = [] + for video_batch in video_batches_: + videos += video_batch + + # video to pixel values + image_processor = processor.image_processor + + preprocess_result = image_processor \ + .preprocess(images=None, videos=videos, return_tensors="pt") \ + .data + pixel_values = preprocess_result["pixel_values_videos"] + video_grid_thw = preprocess_result["video_grid_thw"] + + # pixel values to embeddinds & grid_thws + with torch.no_grad(): + visual = llm.llm_engine.model_executor.driver_worker.\ + model_runner.model.visual + + pixel_values_on_device = pixel_values.to(visual.device, + dtype=visual.dtype) + video_grid_thw_on_device = video_grid_thw.to(visual.device, + dtype=torch.int64) + video_embeds = visual(pixel_values_on_device, + grid_thw=video_grid_thw_on_device) + + # split into original batches + result: List[Qwen2VLPromptVideoEmbeddingInput] = [] + video_counter = 0 + embed_counter = 0 + for video_batch in video_batches_: + cur_batch_video_count = len(video_batch) + merge_size = image_processor.merge_size + cur_batch_embed_len = sum([ + grid_thw.prod() // merge_size // merge_size + for grid_thw in video_grid_thw[video_counter:video_counter + + cur_batch_video_count] + ]) + + result.append({ + "video_embeds": + video_embeds[embed_counter:embed_counter + cur_batch_embed_len], + "video_grid_thw": + video_grid_thw[video_counter:video_counter + + cur_batch_video_count], + }) + + embed_counter += cur_batch_embed_len + video_counter += cur_batch_video_count + + # ensure we don't lost any videos or 
embeddings + assert embed_counter == video_embeds.size(0) + assert video_counter == video_grid_thw.size(0) + assert len(video_batches) == len(result) + + return result + + +def run_test( + vllm_runner: Type[VllmRunner], + inputs: List[Tuple[List[str], PromptImageInput, PromptVideoInput]], + model: str, + *, + dtype: str, + max_tokens: int, + num_logprobs: int, + mm_limit: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +): + """Inference result should be the same between + original image/video input and image/video embeddings input. + """ + from transformers import AutoProcessor # noqa: F401 + + processor = AutoProcessor.from_pretrained(model) + + # NOTE: + # max_model_len should be greater than image_feature_size + with vllm_runner(model, + task="generate", + max_model_len=4000, + max_num_seqs=3, + dtype=dtype, + limit_mm_per_prompt={ + "image": mm_limit, + "video": mm_limit + }, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend + ) as vllm_model: + + outputs_per_case_for_original_input = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images or None, + videos=videos or None) + for prompts, images, videos in inputs + ] + + outputs_per_case_for_embeddings_input = [ + vllm_model.generate_greedy_logprobs( + prompts, + max_tokens, + num_logprobs=num_logprobs, + images=batch_make_image_embeddings( + images, processor, vllm_model.model) if images else None, + videos=batch_make_video_embeddings( + videos, processor, vllm_model.model) if videos else None) + for prompts, images, videos in inputs + ] + + for outputs_for_original_input, \ + outputs_for_embeddings_input \ + in zip(outputs_per_case_for_original_input, + outputs_per_case_for_embeddings_input): + check_logprobs_close( + outputs_0_lst=outputs_for_original_input, + outputs_1_lst=outputs_for_embeddings_input, + name_0="original_input", + name_1="embeddings_input", + ) + + +@pytest.mark.core_model +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # Single-scale + [0.5], + # Single-scale, batched + [0.5, 0.5], + # Multi-scale + [0.25, 0.5, 0.5], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [10]) +def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model, + size_factors, dtype: str, + max_tokens: int, + num_logprobs: int) -> None: + images = [asset.pil_image for asset in image_assets] + + inputs_per_case: List[Tuple[ + List[str], PromptImageInput, PromptVideoInput]] = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + [], + ) for image, prompt in zip(images, IMAGE_PROMPTS)] + + run_test( + vllm_runner, + inputs_per_case, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + mm_limit=1, + tensor_parallel_size=1, + ) + + +@pytest.mark.core_model +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + [], + # Single-scale + [0.5], + # Single-scale, batched + [0.5, 0.5], + # Multi-scale + [0.25, 0.5, 0.5], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [10]) +def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets, + model, size_factors, + dtype: str, max_tokens: int, + num_logprobs: int) -> None: + images 
= [asset.pil_image for asset in image_assets] + + inputs_per_case: List[Tuple[List[str], PromptImageInput, + PromptVideoInput]] = [( + [MULTIIMAGE_PROMPT for _ in size_factors], + [[ + rescale_image_size(image, factor) + for image in images + ] for factor in size_factors], + [], + )] + + run_test( + vllm_runner, + inputs_per_case, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + mm_limit=2, + tensor_parallel_size=1, + ) + + +@pytest.mark.core_model +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # Single-scale + [0.5], + # Single-scale, batched + [0.5, 0.5], + # Multi-scale + [0.25, 0.25, 0.5], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [10]) +def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model, + size_factors, dtype: str, + max_tokens: int, + num_logprobs: int) -> None: + num_frames = 4 + sampled_vids = [ + sample_frames_from_video(asset.np_ndarrays, num_frames) + for asset in video_assets + ] + + inputs_per_case: List[Tuple[ + List[str], PromptImageInput, PromptVideoInput]] = [( + [prompt for _ in size_factors], + [], + [rescale_video_size(video, factor) for factor in size_factors], + ) for video, prompt in zip(sampled_vids, VIDEO_PROMPTS)] + + run_test( + vllm_runner, + inputs_per_case, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + mm_limit=1, + tensor_parallel_size=1, + ) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 13109758767df..1b162e7df8578 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -79,7 +79,7 @@ class Qwen2VLImagePixelInputs(TypedDict): type: Literal["pixel_values"] - data: torch.Tensor + pixel_values: torch.Tensor """Shape: `(num_patches, num_channels * patch_size * patch_size)` """ @@ -92,9 +92,22 @@ class Qwen2VLImagePixelInputs(TypedDict): class Qwen2VLImageEmbeddingInputs(TypedDict): type: Literal["image_embeds"] - data: torch.Tensor - """Shape: `(batch_size * num_images, image_feature_size, hidden_size)` - `hidden_size` must match the hidden size of language model backbone. + image_embeds: torch.Tensor + """Supported types: + - List[`torch.Tensor`]: A list of tensors holding all images' features. + Each tensor holds an image's features. + - `torch.Tensor`: A tensor holding all images' features + (concatenation of all images' feature tensors). + + Tensor shape: `(num_image_features, hidden_size)` + - `num_image_features` varies based on + the number and resolution of the images. + - `hidden_size` must match the hidden size of language model backbone. + """ + + image_grid_thw: torch.Tensor + """Shape: `(num_images, 3)` + This should be in `(grid_t, grid_h, grid_w)` format. """ @@ -102,7 +115,8 @@ class Qwen2VLImageEmbeddingInputs(TypedDict): Qwen2VLImageEmbeddingInputs] -class Qwen2VLVideoInputs(TypedDict): +class Qwen2VLVideoPixelInputs(TypedDict): + type: Literal["pixel_values_videos"] pixel_values_videos: torch.Tensor """Shape: `(num_patches, @@ -116,6 +130,30 @@ class Qwen2VLVideoInputs(TypedDict): """ +class Qwen2VLVideoEmbeddingInputs(TypedDict): + type: Literal["video_embeds"] + video_embeds: torch.Tensor + """Supported types: + - List[`torch.Tensor`]: A list of tensors holding all videos' features. + Each tensor holds an video's features. 
+ - `torch.Tensor`: A tensor holding all videos' features + (concatenation of all videos' feature tensors). + + Tensor shape: `(num_image_features, hidden_size)` + - `num_image_features` varies based on + the number and resolution of the videos. + - `hidden_size` must match the hidden size of language model backbone. + """ + + video_grid_thw: torch.Tensor + """Shape: `(num_videos, 3)` + This should be in `(grid_t, grid_h, grid_w)` format. + """ + + +Qwen2VLVideoInputs = Union[Qwen2VLVideoPixelInputs, + Qwen2VLVideoEmbeddingInputs] + # === Vision Encoder === # @@ -585,6 +623,12 @@ def mm_input_mapper_for_qwen2_vl( "image_embeds": data.get("image_embeds"), "image_grid_thw": data.get("image_grid_thw"), }) + if data_type_key == "video" and isinstance(data, dict): + return MultiModalKwargs({ + "video_embeds": data.get("video_embeds"), + "video_grid_thw": data.get("video_grid_thw"), + }) + model_config = ctx.model_config # Handle mm processor kwargs; we pass these at creation time # because preprocess() in transformers doesn't expose them @@ -890,16 +934,33 @@ def input_processor_for_qwen2_vl( idx for idx, token in enumerate(prompt_token_ids) if token == hf_config.image_token_id ] - image_cnt = len(image_indices) - embed_dim = image_inputs.get('image_embeds').size(0) - assert embed_dim % image_cnt == 0 - num_pad_tokens = embed_dim // image_cnt + + # ensure all image tokens have grid_thw + assert \ + len(image_indices) == image_inputs["image_grid_thw"].size(0), \ + "image token num does not match image_grid_thw.shape" + + image_counter = 0 + pad_token_counter = 0 for idx, token in enumerate(prompt_token_ids): if idx in image_indices: + grid_thw = image_inputs["image_grid_thw"][image_counter] + grid_t, grid_h, grid_w = grid_thw + num_pad_tokens = (grid_t * grid_h * grid_w // + image_processor.merge_size // + image_processor.merge_size) prompt_token_ids_with_image.extend([token] * num_pad_tokens) + image_counter += 1 + pad_token_counter += num_pad_tokens else: prompt_token_ids_with_image.append(token) + + # ensure all embeddings are used + assert \ + pad_token_counter == image_inputs["image_embeds"].size(0), \ + "image_embeds.shape does not match image_grid_thw" + prompt_token_ids = prompt_token_ids_with_image else: prompt_token_ids = _expand_pad_tokens(image_inputs, @@ -912,14 +973,49 @@ def input_processor_for_qwen2_vl( max_pixels=max_pixels) if video_inputs is not None: - prompt_token_ids = _expand_pad_tokens(video_inputs, - hf_config.video_token_id, - make_batched_videos, - "video", - image_processor, - prompt_token_ids, - min_pixels=min_pixels, - max_pixels=max_pixels) + if isinstance(video_inputs, dict): + prompt_token_ids_with_video = [] + video_indices = [ + idx for idx, token in enumerate(prompt_token_ids) + if token == hf_config.video_token_id + ] + + # ensure all video tokens have grid_thw + assert \ + len(video_indices) == video_inputs["video_grid_thw"].size(0), \ + "video token num does not match video_grid_thw.shape" + + video_counter = 0 + pad_token_counter = 0 + for idx, token in enumerate(prompt_token_ids): + if idx in video_indices: + grid_thw = video_inputs["video_grid_thw"][video_counter] + grid_t, grid_h, grid_w = grid_thw + num_pad_tokens = (grid_t * grid_h * grid_w // + image_processor.merge_size // + image_processor.merge_size) + prompt_token_ids_with_video.extend([token] * + num_pad_tokens) + video_counter += 1 + pad_token_counter += num_pad_tokens + else: + prompt_token_ids_with_video.append(token) + + # ensure all embeddings are used + assert \ + pad_token_counter == 
video_inputs["video_embeds"].size(0), \ + "video_embeds.shape does not match video_grid_thw" + + prompt_token_ids = prompt_token_ids_with_video + else: + prompt_token_ids = _expand_pad_tokens(video_inputs, + hf_config.video_token_id, + make_batched_videos, + "video", + image_processor, + prompt_token_ids, + min_pixels=min_pixels, + max_pixels=max_pixels) prompt = inputs.get("prompt") if prompt is None: @@ -1051,49 +1147,71 @@ def _parse_and_validate_image_input( f"Got type: {type(pixel_values)}") return Qwen2VLImagePixelInputs(type="pixel_values", - data=pixel_values, + pixel_values=pixel_values, image_grid_thw=image_grid_thw) if image_embeds is not None: image_embeds = self._validate_and_reshape_mm_tensor( image_embeds, "image embeds") + image_grid_thw = self._validate_and_reshape_mm_tensor( + image_grid_thw, "image grid_thw") if not isinstance(image_embeds, torch.Tensor): raise ValueError("Incorrect type of image embeddings. " f"Got type: {type(image_embeds)}") return Qwen2VLImageEmbeddingInputs(type="image_embeds", - data=image_embeds) + image_embeds=image_embeds, + image_grid_thw=image_grid_thw) def _parse_and_validate_video_input( self, **kwargs: object) -> Optional[Qwen2VLVideoInputs]: pixel_values_videos = kwargs.pop("pixel_values_videos", None) + video_embeds = kwargs.pop("video_embeds", None) video_grid_thw = kwargs.pop("video_grid_thw", None) - if pixel_values_videos is None: + if pixel_values_videos is None and video_embeds is None: return None - pixel_values_videos = self._validate_and_reshape_mm_tensor( - pixel_values_videos, "video pixel values") - video_grid_thw = self._validate_and_reshape_mm_tensor( - video_grid_thw, "video grid_thw") - - return Qwen2VLVideoInputs( - pixel_values_videos=pixel_values_videos, - video_grid_thw=video_grid_thw, - ) + if pixel_values_videos is not None: + pixel_values_videos = self._validate_and_reshape_mm_tensor( + pixel_values_videos, "video pixel values") + video_grid_thw = self._validate_and_reshape_mm_tensor( + video_grid_thw, "video grid_thw") + + return Qwen2VLVideoPixelInputs( + type="pixel_values_videos", + pixel_values_videos=pixel_values_videos, + video_grid_thw=video_grid_thw, + ) + + if video_embeds is not None: + video_embeds = self._validate_and_reshape_mm_tensor( + video_embeds, "video embeds") + video_grid_thw = self._validate_and_reshape_mm_tensor( + video_grid_thw, "video grid_thw") + + if not isinstance(video_embeds, torch.Tensor): + raise ValueError("Incorrect type of video embeddings. 
" + f"Got type: {type(video_embeds)}") + return Qwen2VLVideoEmbeddingInputs(type="video_embeds", + video_embeds=video_embeds, + video_grid_thw=video_grid_thw) def _process_image_input(self, image_input: Qwen2VLImageInputs) -> torch.Tensor: if image_input["type"] == "image_embeds": - return image_input["data"].type(self.visual.dtype) + return image_input["image_embeds"].type(self.visual.dtype) - pixel_values = image_input["data"].type(self.visual.dtype) + pixel_values = image_input["pixel_values"].type(self.visual.dtype) image_embeds = self.visual(pixel_values, grid_thw=image_input["image_grid_thw"]) return image_embeds def _process_video_input(self, video_input: Qwen2VLVideoInputs) -> torch.Tensor: + if video_input["type"] == "video_embeds": + return video_input["video_embeds"].type(self.visual.dtype) + pixel_values_videos = video_input["pixel_values_videos"].type( self.visual.dtype) video_embeds = self.visual(pixel_values_videos, From 1b886aa104248a95720fda7be9f979fc665b3d02 Mon Sep 17 00:00:00 2001 From: Austin Veselka <50646302+FurtherAI@users.noreply.github.com> Date: Wed, 13 Nov 2024 02:28:13 -0600 Subject: [PATCH 122/183] [Model] Adding Support for Qwen2VL as an Embedding Model. Using MrLight/dse-qwen2-2b-mrl-v1 (#9944) Signed-off-by: FurtherAI Co-authored-by: FurtherAI --- docs/source/models/supported_models.rst | 6 + docs/source/models/vlm.rst | 17 ++ ...ai_chat_embedding_client_for_multimodal.py | 123 +++++++++-- examples/template_dse_qwen2_vl.jinja | 7 + tests/conftest.py | 3 + .../vision_language/test_dse_qwen2_vl.py | 209 ++++++++++++++++++ vllm/model_executor/models/qwen2_vl.py | 17 +- vllm/model_executor/models/registry.py | 1 + 8 files changed, 364 insertions(+), 19 deletions(-) create mode 100644 examples/template_dse_qwen2_vl.jinja create mode 100644 tests/models/embedding/vision_language/test_dse_qwen2_vl.py diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index ca894819f2c26..58ec3acc6aea5 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -584,6 +584,12 @@ Multimodal Embedding - :code:`TIGER-Lab/VLM2Vec-Full` - 🚧 - ✅︎ + * - :code:`Qwen2VLForConditionalGeneration` + - Qwen2-VL-based + - T + I + - :code:`MrLight/dse-qwen2-2b-mrl-v1` + - + - ✅︎ .. important:: Some model architectures support both generation and embedding tasks. diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index 112e9db6a41de..bcbe50a25fa09 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -310,4 +310,21 @@ Since the request schema is not defined by OpenAI client, we post a request to t response_json = response.json() print("Embedding output:", response_json["data"][0]["embedding"]) +Here is an example for serving the ``MrLight/dse-qwen2-2b-mrl-v1`` model. + +.. code-block:: bash + + vllm serve MrLight/dse-qwen2-2b-mrl-v1 --task embedding \ + --trust-remote-code --max-model-len 8192 --chat-template examples/template_dse_qwen2_vl.jinja + +.. important:: + + Like with VLM2Vec, we have to explicitly pass ``--task embedding``. Additionally, ``MrLight/dse-qwen2-2b-mrl-v1`` requires an EOS token for embeddings, + which is handled by the jinja template. + +.. important:: + + Also important, ``MrLight/dse-qwen2-2b-mrl-v1`` requires a placeholder image of the minimum image size for text query embeddings. See the full code + example below for details. + A full code example can be found in `examples/openai_chat_embedding_client_for_multimodal.py `_. 
diff --git a/examples/openai_chat_embedding_client_for_multimodal.py b/examples/openai_chat_embedding_client_for_multimodal.py index effb588e1387f..fff82020d9a30 100644 --- a/examples/openai_chat_embedding_client_for_multimodal.py +++ b/examples/openai_chat_embedding_client_for_multimodal.py @@ -1,33 +1,120 @@ +import argparse +import base64 +import io + import requests +from PIL import Image image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" -response = requests.post( - "http://localhost:8000/v1/embeddings", - json={ - "model": - "TIGER-Lab/VLM2Vec-Full", - "messages": [{ + +def vlm2vec(): + response = requests.post( + "http://localhost:8000/v1/embeddings", + json={ + "model": + "TIGER-Lab/VLM2Vec-Full", + "messages": [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "text", + "text": "Represent the given image." + }, + ], + }], + "encoding_format": + "float", + }, + ) + response.raise_for_status() + response_json = response.json() + + print("Embedding output:", response_json["data"][0]["embedding"]) + + +def dse_qwen2_vl(inp: dict): + # Embedding an Image + if inp["dtype"] == "image": + messages = [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": inp["image_url"], + } + }, { + "type": "text", + "text": "What is shown in this image?" + }] + }] + # Embedding a Text Query + else: + # MrLight/dse-qwen2-2b-mrl-v1 requires a placeholder image + # of the minimum input size + buffer = io.BytesIO() + image_placeholder = Image.new("RGB", (56, 56)) + image_placeholder.save(buffer, "png") + buffer.seek(0) + image_placeholder = base64.b64encode(buffer.read()).decode('utf-8') + messages = [{ "role": "user", "content": [ { "type": "image_url", "image_url": { - "url": image_url + "url": f"data:image/jpeg;base64,{image_placeholder}", } }, { "type": "text", - "text": "Represent the given image." + "text": f"Query: {inp['content']}" }, - ], - }], - "encoding_format": - "float", - }, -) -response.raise_for_status() -response_json = response.json() - -print("Embedding output:", response_json["data"][0]["embedding"]) + ] + }] + + response = requests.post( + "http://localhost:8000/v1/embeddings", + json={ + "model": "MrLight/dse-qwen2-2b-mrl-v1", + "messages": messages, + "encoding_format": "float", + }, + ) + response.raise_for_status() + response_json = response.json() + + print("Embedding output:", response_json["data"][0]["embedding"]) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser( + "Script to call a specified VLM through the API. 
Make sure to serve " + "the model with --task embedding before running this.") + parser.add_argument("model", + type=str, + choices=["vlm2vec", "dse_qwen2_vl"], + required=True, + help="Which model to call.") + args = parser.parse_args() + + if args.model == "vlm2vec": + vlm2vec() + elif args.model == "dse_qwen2_vl": + dse_qwen2_vl({ + "dtye": "image", + "image_url": image_url, + }) + dse_qwen2_vl({ + "dtype": "text", + "content": "What is the weather like today?", + }) diff --git a/examples/template_dse_qwen2_vl.jinja b/examples/template_dse_qwen2_vl.jinja new file mode 100644 index 0000000000000..e7b93fae31770 --- /dev/null +++ b/examples/template_dse_qwen2_vl.jinja @@ -0,0 +1,7 @@ +{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}{% raw %}<|im_start|>system +You are a helpful assistant.<|im_end|> +{% endraw %}{% endif %}<|im_start|>{{ message['role'] }}{% raw %} +{% endraw %}{% if message['content'] is string %}{{ message['content'] }}<|im_end|>{% raw %} +{% endraw %}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>{% raw %} +{% endraw %}{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant{% raw %} +{% endraw %}{% endif %}<|endoftext|> \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 6cf791dc62ce5..0dc1cc6e83c18 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -243,6 +243,9 @@ def video_assets() -> _VideoAssets: class HfRunner: def wrap_device(self, x: _T, device: Optional[str] = None) -> _T: + if x is None or isinstance(x, (bool, )): + return x + if device is None: device = "cpu" if current_platform.is_cpu() else "cuda" diff --git a/tests/models/embedding/vision_language/test_dse_qwen2_vl.py b/tests/models/embedding/vision_language/test_dse_qwen2_vl.py new file mode 100644 index 0000000000000..3dd8cb729f8a6 --- /dev/null +++ b/tests/models/embedding/vision_language/test_dse_qwen2_vl.py @@ -0,0 +1,209 @@ +from functools import partial +from typing import Callable, Dict, List, Type + +import pytest +import torch +from PIL import Image +from transformers import BatchEncoding, Qwen2VLForConditionalGeneration + +from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner +from ....utils import large_gpu_test +from ..utils import check_embeddings_close + +HF_TEXT_PROMPTS = [ + # T -> X + ( + "Query: Find me an everyday image that matches the given caption: The label of the object is stop sign", # noqa: E501, + Image.new("RGB", (56, 56))), + # T -> X + ("Query: Retrieve an image of this caption: cherry blossom", + Image.new("RGB", (56, 56))), +] + +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + "What is shown in this image?", + "cherry_blossom": + "What is shown in this image?" 
+}) + +MODELS = ["MrLight/dse-qwen2-2b-mrl-v1"] + + +def get_messages(image: Image.Image, text: str, embed_text: bool): + # assert False, 'remember to use outer [] as required' + if embed_text: + messages = [{ + "role": + "user", + "content": [ + { + "type": "image", + "image": Image.new("RGB", (56, 56)), + "resized_height": 1, + "resized_width": 1 + }, # need a dummy image here for an easier process. + { + "type": "text", + "text": text + }, + ] + }] + else: + messages = [{ + "role": + "user", + "content": [{ + "type": "image", + "image": image + }, { + "type": "text", + "text": text + }] + }] + return messages + + +def apply_chat_template_and_add_eos( + messages: List[Dict], + apply_chat_template_fn: Callable, +): + prompt = apply_chat_template_fn( + messages, tokenize=False, add_generation_prompt=True) + "<|endoftext|>" + return prompt + + +def postprocess_inputs(hf_model: HfRunner, inputs: BatchEncoding, **kwargs): + return hf_model.model.prepare_inputs_for_generation(**inputs, **kwargs) + + +def _run_test( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + input_texts: List[str], + input_images: PromptImageInput, + embed_texts: List[bool], + model: str, + *, + dtype: str, +) -> None: + '''SET PYTHONPATH''' + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method (the default method). + with vllm_runner(model, + task="embedding", + dtype=dtype, + enforce_eager=True, + max_model_len=8192) as vllm_model: + tokenizer = vllm_model.model.get_tokenizer() + texts = [ + # this is necessary because vllm_model.encode will not apply any + # templating to the prompt, and therefore lacks an image_pad + # token unless one is inserted beforehand (the (28,28) image + # above is converted to an image pad token by the chat template). + apply_chat_template_and_add_eos( + get_messages(image, text, False), + apply_chat_template_fn=tokenizer.apply_chat_template, + ) for text, image in zip(input_texts, input_images) + # vllm will replace the pad token with the actual image, + # which may be a placeholder image, later. 
+ ] + vllm_outputs = vllm_model.encode(texts, images=input_images) + + hf_outputs = [] + with hf_runner(model, + dtype=dtype, + auto_cls=Qwen2VLForConditionalGeneration) as hf_model: + hf_model.postprocess_inputs = partial( + postprocess_inputs, + hf_model, + cache_position=torch.arange( + 0, + 1, # 1 for batch size + requires_grad=False), + use_cache=False) + for text, image, embed_text in zip(input_texts, input_images, + embed_texts): + # dse requires non-standard input processing + # because it needs an image_pad token + messages = get_messages(image, text, embed_text) + prompt = apply_chat_template_and_add_eos( + messages, hf_model.processor.apply_chat_template) + inputs = hf_model.get_inputs( + prompts=[[prompt]], + images=[[image]], + ) + with torch.no_grad(): + outputs = hf_model.model( + **hf_model.wrap_device(inputs[0], + device=hf_model.model.device.type), + return_dict=True, + output_hidden_states=True, + ) + pooled_output = torch.nn.functional.normalize( + outputs.hidden_states[-1][0, -1], p=2, dim=-1) + hf_outputs.append(pooled_output.tolist()) + + check_embeddings_close( + embeddings_0_lst=hf_outputs, + embeddings_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +def test_models_text( + hf_runner, + vllm_runner, + image_assets, + model: str, + dtype: str, +) -> None: + input_texts_images = [(text, image_placeholder) + for text, image_placeholder in HF_TEXT_PROMPTS] + input_texts = [text for text, _ in input_texts_images] + input_images = [image for _, image in input_texts_images] + embed_texts = [True] * len(input_texts) + + _run_test( + hf_runner, + vllm_runner, + input_texts, + input_images, # type: ignore + embed_texts, + model, + dtype=dtype, + ) + + +@large_gpu_test(min_gb=48) +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +def test_models_image( + hf_runner, + vllm_runner, + image_assets, + model: str, + dtype: str, +) -> None: + input_texts_images = [ + (text, asset.pil_image) + for text, asset in zip(HF_IMAGE_PROMPTS, image_assets) + ] + input_texts = [text for text, _ in input_texts_images] + input_images = [image for _, image in input_texts_images] + embed_texts = [False] * len(input_texts) + + _run_test( + hf_runner, + vllm_runner, + input_texts, + input_images, + embed_texts, + model, + dtype=dtype, + ) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 1b162e7df8578..9a19ccbca3f1e 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -51,6 +51,7 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import (GPTQConfig, GPTQMarlinConfig, QuantizationConfig) @@ -58,12 +59,13 @@ from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.qwen2 import Qwen2Model +from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalKwargs) from vllm.multimodal.base import MultiModalData from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.utils import cached_get_tokenizer -from 
vllm.sequence import IntermediateTensors, SequenceData +from vllm.sequence import IntermediateTensors, PoolerOutput, SequenceData from vllm.transformers_utils.config import uses_mrope from vllm.transformers_utils.processor import cached_get_processor @@ -1067,6 +1069,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + pooler_config = vllm_config.model_config.pooler_config multimodal_config = vllm_config.model_config.multimodal_config assert not cache_config.enable_prefix_caching, \ "Qwen2-VL currently does not support prefix caching" @@ -1098,6 +1101,11 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = get_sampler() + self._pooler = Pooler.from_config_with_defaults( + pooler_config, + pooling_type=PoolingType.LAST, + normalize=True, + softmax=False) self.make_empty_intermediate_tensors = ( make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) @@ -1318,6 +1326,13 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens + def pooler( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> Optional[PoolerOutput]: + return self._pooler(hidden_states, pooling_metadata) + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ # (param_name, shard_name, shard_id) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index 32750602b988c..f172c06c4a26a 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -109,6 +109,7 @@ # [Multimodal] "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), + "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration") # noqa: E501, } _MULTIMODAL_MODELS = { From b6dde330198848a4a9903c1f0f97c3235fba0ba0 Mon Sep 17 00:00:00 2001 From: Pavani Majety Date: Wed, 13 Nov 2024 00:29:32 -0800 Subject: [PATCH 123/183] [Core] Flashinfer - Remove advance step size restriction (#10282) --- csrc/prepare_inputs/advance_step.cu | 66 +++++++++++++++++------------ 1 file changed, 38 insertions(+), 28 deletions(-) diff --git a/csrc/prepare_inputs/advance_step.cu b/csrc/prepare_inputs/advance_step.cu index 46fef79f439fb..bd184ee22682e 100644 --- a/csrc/prepare_inputs/advance_step.cu +++ b/csrc/prepare_inputs/advance_step.cu @@ -88,6 +88,7 @@ inline void verify_tensor(std::string const& name, torch::Tensor const& t, } } +/// each thread processes a block per query __global__ void advance_step_flashinfer_kernel( int num_threads, int num_seqs, int num_queries, int block_size, long* input_tokens_ptr, long const* sampled_token_ids_ptr, @@ -134,8 +135,10 @@ __global__ void advance_step_flashinfer_indptr_kernel( int num_threads, int num_seqs, int num_queries, int* paged_kv_indptr_ptr, int* block_table_bound_ptr) { int idx = blockIdx.x * num_threads + threadIdx.x; - // Update paged_kv_indptr + if (idx == 0) { + paged_kv_indptr_ptr[idx] = 0; + } if (idx < num_queries) { int sum = 0; for (int i = 0; i <= idx; ++i) { @@ -146,20 +149,33 @@ __global__ void advance_step_flashinfer_indptr_kernel( } __global__ void advance_step_flashinfer_indices_kernel( - int num_threads, int num_seqs, int num_queries, int const* 
block_tables_ptr, - int64_t const block_tables_stride, int* paged_kv_indices_ptr, + int num_seqs, int num_queries, int const* block_tables_ptr, + int64_t const max_num_blocks_per_seq, int* paged_kv_indices_ptr, int* paged_kv_indptr_ptr, int* block_table_bound_ptr) { - int idx = blockIdx.x * num_threads + threadIdx.x; - int row = idx / block_tables_stride; - int col = idx % block_tables_stride; - - if (row < num_queries && col < block_table_bound_ptr[row]) { - paged_kv_indices_ptr[paged_kv_indptr_ptr[row] + col] = - block_tables_ptr[row * block_tables_stride + col]; + // note: max_num_blocks_per_seq = block_tables.stride(0) + int tid = blockIdx.x * blockDim.x + threadIdx.x; + + // when cuda graphs are enabled, paged_kv_indptr tensor + // has to be updated for the padded queries + // tid represents a query# for paged_kv_indptr tensor + if (num_queries < tid && tid <= num_seqs) { + paged_kv_indptr_ptr[tid] = paged_kv_indptr_ptr[num_queries]; } - // if cudagraph, fill padded seqs with the last valid seq's indptr - if (num_queries < row && row <= num_seqs) { - paged_kv_indptr_ptr[row] = paged_kv_indptr_ptr[num_queries]; + + // each thread processes a block_ptr in block_tables + // block_tables shape: [num_queries, max_num_blocks_per_seq] + // paged_kv_indices is flattened block_tables. + for (int idx = tid; idx < (num_seqs * max_num_blocks_per_seq); + idx += (gridDim.x * blockDim.x)) { + // block_tables-row = paged_kv_indptr[queryNum] + int queryNum = idx / max_num_blocks_per_seq; + int col = idx % max_num_blocks_per_seq; + if (queryNum < num_queries && col < block_table_bound_ptr[queryNum]) { + int indices_arr_idx = paged_kv_indptr_ptr[queryNum] + col; + int block_tables_idx = queryNum * max_num_blocks_per_seq + col; + paged_kv_indices_ptr[indices_arr_idx] = + block_tables_ptr[block_tables_idx]; + } } } @@ -247,22 +263,16 @@ void advance_step_flashinfer( int threads; cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev); cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev); - if (logging) { - printf("launching kernel with %d blocks\n", blocks); - } - // TODO(will): support arbitrary block_tables stride - if ((blocks * threads) / block_tables.stride(0) < num_queries) { - TORCH_CHECK(false, - "multi-step: not enough threads to map block_table to" - "FlashInfer's paged_kv_indices on GPU. 
Try reducing the number " - "of seqs,", - " increasing the block size or take smaller steps.", - " num_queries = ", num_queries, - " block_tables.stride(0) = ", block_tables.stride(0), - " blocks = ", blocks, " max_threads = ", threads); + int block_tables_stride = block_tables.stride(0); + TORCH_CHECK((blocks * threads > num_queries), + "multi-step: not enough threads to map to num_queries = ", + num_queries, " block_tables.stride(0) = ", block_tables.stride(0), + " blocks = ", blocks, " max_threads = ", threads); + if (logging) { + printf("launching kernels with %d blocks and %d threads\n", blocks, + threads); } - advance_step_flashinfer_kernel<<>>( threads, num_seqs, num_queries, block_size, reinterpret_cast(input_tokens.data_ptr()), @@ -281,7 +291,7 @@ void advance_step_flashinfer( reinterpret_cast(block_table_bound.data_ptr())); advance_step_flashinfer_indices_kernel<<>>( - threads, num_seqs, num_queries, + num_seqs, num_queries, reinterpret_cast(block_tables.data_ptr()), block_tables.stride(0), reinterpret_cast(paged_kv_indices.data_ptr()), From d909acf9fe17b7db42d7de61903c0058c8b9b344 Mon Sep 17 00:00:00 2001 From: B-201 Date: Wed, 13 Nov 2024 17:25:59 +0800 Subject: [PATCH 124/183] [Model][LoRA]LoRA support added for idefics3 (#10281) Signed-off-by: B-201 --- docs/source/models/supported_models.rst | 2 +- vllm/model_executor/models/idefics3.py | 55 +++++++++++++++++++++---- 2 files changed, 47 insertions(+), 10 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 58ec3acc6aea5..161733c049bbe 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -450,7 +450,7 @@ Text Generation - Idefics3 - T + I - :code:`HuggingFaceM4/Idefics3-8B-Llama3` etc. 
- - + - ✅︎ - * - :code:`InternVLChatModel` - InternVL2 diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 8845b2f58af07..85f23a1da533b 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -33,6 +33,7 @@ from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead +from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.image import cached_get_image_processor @@ -44,7 +45,7 @@ from .idefics2_vision_model import ( Idefics2VisionTransformer as Idefics3VisionTransformer) # yapf: enable -from .interfaces import SupportsMultiModal +from .interfaces import SupportsLoRA, SupportsMultiModal from .llama import LlamaModel from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, merge_multimodal_embeddings) @@ -58,8 +59,6 @@ class Idefics3ImagePixelInputs(TypedDict): """ Shape: `(batch_size * num_images, num_channels, height, width)` """ - rows: List[int] - cols: List[int] pixel_attention_mask: Optional[torch.BoolTensor] @@ -356,8 +355,15 @@ def dummy_data_for_idefics3( image_seq_len = processor.image_seq_len max_llm_image_tokens = max_num_image_patches * image_seq_len * num_images + if seq_len - max_llm_image_tokens < 0: + raise RuntimeError( + f"Idefics3 cannot process {num_images} images in a prompt, " + "please increase max_model_len or reduce image limit by " + "--limit-mm-per-prompt.") + seq_data = SequenceData.from_prompt_token_counts( - (hf_config.image_token_id, max_llm_image_tokens), (0, seq_len)) + (hf_config.image_token_id, max_llm_image_tokens), + (0, seq_len - max_llm_image_tokens)) width = height = hf_config.vision_config.image_size image = Image.new("RGB", (width, height), color=0) @@ -463,8 +469,6 @@ def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[ImageInputs]: pixel_values = kwargs.pop("pixel_values", None) image_embeds = kwargs.pop("image_embeds", None) - rows = kwargs.pop("rows", None) - cols = kwargs.pop("cols", None) pixel_attention_mask = kwargs.pop("pixel_attention_mask", None) if pixel_values is None and image_embeds is None: @@ -489,8 +493,6 @@ def _parse_and_validate_image_input( data=self._validate_pixel_values( flatten_bn(pixel_values, concat=True)), - rows=rows, - cols=cols, pixel_attention_mask=flatten_bn( pixel_attention_mask, concat=True)) @@ -610,7 +612,33 @@ def forward( @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_idefics3_image_tokens) @INPUT_REGISTRY.register_dummy_data(dummy_data_for_idefics3) @INPUT_REGISTRY.register_input_processor(input_processor_for_idefics3) -class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal): +class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, + SupportsLoRA): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + # LoRA specific attributes + supported_lora_modules = [ + # vision_model + "fc1", + "fc2", + "out_proj", + # text_model + "qkv_proj", # same name with vision encoder + "o_proj", + "gate_up_proj", + "down_proj", + ] + embedding_modules = {} + embedding_padding_modules = [] def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): 
super().__init__() @@ -672,3 +700,12 @@ def sample( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): loader = AutoWeightsLoader(self) loader.load_weights(weights) + + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="model.text_model", + connector="model.connector", + tower_model="model.vision_model") From bb7991aa291054a30f408e626273caa6769a07eb Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Wed, 13 Nov 2024 03:02:56 -0800 Subject: [PATCH 125/183] [V1] Add missing tokenizer options for `Detokenizer` (#10288) Signed-off-by: Roger Wang --- vllm/v1/engine/detokenizer.py | 11 +++++++++-- vllm/v1/engine/llm_engine.py | 7 ++++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/vllm/v1/engine/detokenizer.py b/vllm/v1/engine/detokenizer.py index 1dbf8e75ec478..6249d60199a62 100644 --- a/vllm/v1/engine/detokenizer.py +++ b/vllm/v1/engine/detokenizer.py @@ -192,10 +192,17 @@ def _get_next_output_text(self, finished: bool, delta: bool) -> str: class Detokenizer: - def __init__(self, tokenizer_name: str): + def __init__(self, + tokenizer_name: str, + tokenizer_mode: str = "auto", + trust_remote_code: bool = False, + revision: Optional[str] = None): # TODO: once we support LoRA, we should should pass the tokenizer # here. We currently have two copies (this + in the LLMEngine). - self.tokenizer = get_tokenizer(tokenizer_name) + self.tokenizer = get_tokenizer(tokenizer_name=tokenizer_name, + tokenizer_mode=tokenizer_mode, + trust_remote_code=trust_remote_code, + revision=revision) # Request id -> IncrementalDetokenizer self.request_states: Dict[str, IncrementalDetokenizer] = {} diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index f37db92e8ea6b..5b45615a1b85b 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -53,7 +53,12 @@ def __init__( input_registry) # Detokenizer (converts EngineCoreOutputs --> RequestOutput) - self.detokenizer = Detokenizer(vllm_config.model_config.tokenizer) + self.detokenizer = Detokenizer( + tokenizer_name=vllm_config.model_config.tokenizer, + tokenizer_mode=vllm_config.model_config.tokenizer_mode, + trust_remote_code=vllm_config.model_config.trust_remote_code, + revision=vllm_config.model_config.tokenizer_revision, + ) # EngineCore (gets EngineCoreRequests and gives EngineCoreOutputs) self.engine_core = EngineCoreClient.make_client( From 0b8bb86bf19d68950b4d92a99350e07a26ec0d2c Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Wed, 13 Nov 2024 20:39:03 +0800 Subject: [PATCH 126/183] [1/N] Initial prototype for multi-modal processor (#10044) Signed-off-by: DarkLight1337 --- .../models/enabling_multimodal_inputs.rst | 2 +- .../mm_processor_kwargs/test_qwen.py | 2 +- .../{test_base.py => test_inputs.py} | 2 +- tests/multimodal/test_processor_kwargs.py | 37 ++- tests/v1/core/test_prefix_caching.py | 4 +- vllm/config.py | 2 +- vllm/engine/async_llm_engine.py | 4 + vllm/engine/llm_engine.py | 16 +- vllm/engine/multiprocessing/client.py | 6 + vllm/engine/protocol.py | 16 +- vllm/entrypoints/openai/serving_chat.py | 1 - vllm/entrypoints/openai/serving_completion.py | 1 - vllm/inputs/__init__.py | 12 +- vllm/inputs/data.py | 99 ++++++- vllm/inputs/preprocess.py | 143 +++++++-- vllm/inputs/registry.py | 56 +++- vllm/model_executor/models/chatglm.py | 4 +- vllm/model_executor/models/fuyu.py | 3 +- vllm/model_executor/models/h2ovl.py | 3 +- 
vllm/model_executor/models/internvl.py | 3 +- vllm/model_executor/models/llava.py | 2 +- vllm/model_executor/models/minicpmv.py | 3 +- vllm/model_executor/models/phi3v.py | 2 +- vllm/model_executor/models/pixtral.py | 3 +- vllm/model_executor/models/qwen.py | 3 +- vllm/model_executor/models/qwen2_vl.py | 6 +- vllm/model_executor/models/utils.py | 2 +- vllm/multimodal/__init__.py | 10 +- vllm/multimodal/audio.py | 12 +- vllm/multimodal/base.py | 188 ++---------- vllm/multimodal/image.py | 10 +- vllm/multimodal/inputs.py | 225 +++++++++++++++ vllm/multimodal/processing.py | 273 ++++++++++++++++++ vllm/multimodal/registry.py | 84 +++++- vllm/multimodal/utils.py | 3 +- vllm/multimodal/video.py | 20 +- vllm/sequence.py | 68 ++--- vllm/v1/engine/async_llm.py | 4 + vllm/v1/engine/llm_engine.py | 4 +- vllm/v1/engine/processor.py | 73 +++-- vllm/v1/request.py | 26 +- vllm/v1/worker/gpu_model_runner.py | 2 +- vllm/worker/cpu_model_runner.py | 41 ++- vllm/worker/hpu_model_runner.py | 6 +- vllm/worker/model_runner.py | 25 +- vllm/worker/neuron_model_runner.py | 22 +- vllm/worker/openvino_model_runner.py | 21 +- vllm/worker/xpu_model_runner.py | 16 +- 48 files changed, 1133 insertions(+), 437 deletions(-) rename tests/multimodal/{test_base.py => test_inputs.py} (97%) create mode 100644 vllm/multimodal/inputs.py create mode 100644 vllm/multimodal/processing.py diff --git a/docs/source/models/enabling_multimodal_inputs.rst b/docs/source/models/enabling_multimodal_inputs.rst index 3d0d1aec69845..49b5285c45590 100644 --- a/docs/source/models/enabling_multimodal_inputs.rst +++ b/docs/source/models/enabling_multimodal_inputs.rst @@ -66,7 +66,7 @@ A default mapper is available for each modality in the core vLLM library. This i 3. Register maximum number of multi-modal tokens ------------------------------------------------ -For each modality type that the model accepts as input, calculate the maximum possible number of tokens per data instance +For each modality type that the model accepts as input, calculate the maximum possible number of tokens per data item and register it via :meth:`INPUT_REGISTRY.register_dummy_data `. .. 
code-block:: diff diff --git a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py index e6ed87fc8ea08..163220c91a27d 100644 --- a/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py +++ b/tests/models/decoder_only/vision_language/mm_processor_kwargs/test_qwen.py @@ -6,7 +6,7 @@ from PIL.Image import Image from vllm.inputs import InputContext, token_inputs -from vllm.multimodal.base import MultiModalKwargs +from vllm.multimodal import MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer from .....conftest import IMAGE_ASSETS diff --git a/tests/multimodal/test_base.py b/tests/multimodal/test_inputs.py similarity index 97% rename from tests/multimodal/test_base.py rename to tests/multimodal/test_inputs.py index bfaf2cdeaa8d4..678bbb52b8c2f 100644 --- a/tests/multimodal/test_base.py +++ b/tests/multimodal/test_inputs.py @@ -1,6 +1,6 @@ import torch -from vllm.multimodal.base import MultiModalKwargs, NestedTensors +from vllm.multimodal.inputs import MultiModalKwargs, NestedTensors def assert_nested_tensors_equal(expected: NestedTensors, diff --git a/tests/multimodal/test_processor_kwargs.py b/tests/multimodal/test_processor_kwargs.py index 4d3bbd805c152..e6c8793989e13 100644 --- a/tests/multimodal/test_processor_kwargs.py +++ b/tests/multimodal/test_processor_kwargs.py @@ -1,12 +1,12 @@ from array import array -from typing import Mapping +from typing import Callable, Dict, Mapping, Optional from unittest.mock import patch import pytest import torch from vllm.inputs import (DecoderOnlyInputs, DummyData, InputContext, - InputRegistry, token_inputs) + InputRegistry, ProcessorInputs, token_inputs) from vllm.multimodal import MultiModalRegistry from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData @@ -34,10 +34,9 @@ def custom_processor(ctx: InputContext, inputs: DecoderOnlyInputs, *, num_crops=DEFAULT_NUM_CROPS): - # For testing purposes, we don't worry about the llm inputs / return - # type validation, and just return the value of the kwarg that we - # clobber. 
- return num_crops + # For testing purposes, we don't worry about the prompt + return token_inputs(prompt_token_ids=[], + mm_processor_kwargs={"num_crops": num_crops}) with patch("vllm.inputs.registry.InputRegistry._get_model_input_processor", return_value=custom_processor): @@ -109,6 +108,21 @@ def _get_num_crops_info(init_num_crops: int, inference_num_crops: int): return init_kwargs, inference_kwargs, expected_seq_count +def _get_processed_num_crops( + processor: Callable[[ProcessorInputs], ProcessorInputs], + inference_kwargs: Optional[Dict[str, int]], +) -> int: + processed_inputs = processor( + token_inputs(prompt_token_ids=[], + prompt="", + mm_processor_kwargs=inference_kwargs)) + + assert "type" in processed_inputs + assert processed_inputs["type"] == "token" + assert "mm_processor_kwargs" in processed_inputs + return processed_inputs["mm_processor_kwargs"]["num_crops"] + + @pytest.mark.parametrize("init_num_crops,inference_num_crops", [ (None, None), (NUM_CROPS_OVERRIDE, None), @@ -124,10 +138,8 @@ def test_input_processor_kwargs(use_processor_mock, init_num_crops, ctx = build_model_context(DUMMY_MODEL_ID, mm_processor_kwargs=init_kwargs) processor = dummy_registry.create_input_processor(ctx.model_config) - num_crops_val = processor( - token_inputs(prompt_token_ids=[], - prompt="", - mm_processor_kwargs=inference_kwargs)) + num_crops_val = _get_processed_num_crops(processor, inference_kwargs) + assert num_crops_val == expected_seq_count @@ -153,10 +165,7 @@ def test_processor_with_sad_kwarg_overrides(use_processor_mock, processor = dummy_registry.create_input_processor(ctx.model_config) # Should filter out the inference time kwargs - num_crops_val = processor( - token_inputs(prompt_token_ids=[], - prompt="", - mm_processor_kwargs=mm_processor_kwargs)) + num_crops_val = _get_processed_num_crops(processor, mm_processor_kwargs) assert num_crops_val == DEFAULT_NUM_CROPS diff --git a/tests/v1/core/test_prefix_caching.py b/tests/v1/core/test_prefix_caching.py index e5a3b62258dd8..d614d3e67460f 100644 --- a/tests/v1/core/test_prefix_caching.py +++ b/tests/v1/core/test_prefix_caching.py @@ -1,5 +1,5 @@ """Compare the with and without prefix caching.""" -from vllm.inputs import DecoderOnlyInputs +from vllm.inputs import token_inputs from vllm.sampling_params import SamplingParams from vllm.v1.core.kv_cache_manager import KVCacheManager, Request from vllm.v1.core.kv_cache_utils import hash_block_tokens @@ -8,7 +8,7 @@ def make_request(request_id, prompt_token_ids): return Request( request_id=request_id, - inputs=DecoderOnlyInputs(prompt_token_ids=prompt_token_ids), + inputs=token_inputs(prompt_token_ids=prompt_token_ids), sampling_params=SamplingParams(max_tokens=17), eos_token_id=100, arrival_time=0, diff --git a/vllm/config.py b/vllm/config.py index 5ba1c41fcaac1..002adb4316969 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -107,7 +107,7 @@ class ModelConfig: matches the model name exposed via the APIs. If multiple model names provided, the first name will be used. If not specified, the model name will be the same as `model`. - limit_mm_per_prompt: Maximum number of data instances per modality + limit_mm_per_prompt: Maximum number of data items per modality per prompt. Only applicable for multimodal models. 
override_neuron_config: Initialize non default neuron config or override default neuron config that are specific to Neuron devices, diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py index 1a371b52bb64b..5a5388708b1c6 100644 --- a/vllm/engine/async_llm_engine.py +++ b/vllm/engine/async_llm_engine.py @@ -19,6 +19,7 @@ from vllm.executor.gpu_executor import GPUExecutorAsync from vllm.executor.ray_utils import initialize_ray_cluster from vllm.inputs import PromptType +from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor.guided_decoding import ( @@ -729,6 +730,9 @@ def _error_callback(self, exc: Exception) -> None: self.set_errored(exc) self._request_tracker.propagate_exception(exc) + async def get_input_preprocessor(self) -> InputPreprocessor: + return self.engine.input_preprocessor + async def get_tokenizer( self, lora_request: Optional[LoRARequest] = None, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 69ed6e6bd59d2..f5299746d845d 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -30,7 +30,7 @@ from vllm.executor.gpu_executor import GPUExecutor from vllm.executor.ray_utils import initialize_ray_cluster from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs, - PromptType) + PromptType, SingletonInputsAdapter) from vllm.inputs.parse import is_encoder_decoder_inputs, is_token_prompt from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger @@ -39,6 +39,7 @@ from vllm.model_executor.guided_decoding import ( get_local_guided_decoding_logits_processor) from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.outputs import (EmbeddingRequestOutput, RequestOutput, RequestOutputFactory) from vllm.pooling_params import PoolingParams @@ -226,6 +227,7 @@ def __init__( usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, input_registry: InputRegistry = INPUT_REGISTRY, + mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, use_cached_outputs: bool = False, ) -> None: @@ -335,7 +337,8 @@ def get_tokenizer_for_seq(sequence: Sequence) -> AnyTokenizer: model_config) self.input_preprocessor = InputPreprocessor(model_config, - self.tokenizer) + self.tokenizer, + mm_registry) self.input_registry = input_registry self.input_processor = input_registry.create_input_processor( @@ -851,13 +854,6 @@ def add_request( ) processed_inputs = self.input_processor(preprocessed_inputs) - # This is a bit of a hack - copy the mm_processor_kwargs that were - # used in the input processor to the processed output, since these - # kwargs are presumed to be immutable and the values should be aligned - # between the input processor (here) and the input mapper. 
- processed_inputs["mm_processor_kwargs"] = preprocessed_inputs.get( - "mm_processor_kwargs") - self._add_processed_request( request_id=request_id, processed_inputs=processed_inputs, @@ -2019,7 +2015,7 @@ def _validate_model_inputs(self, inputs: ProcessorInputs, else: prompt_inputs = inputs - prompt_ids = prompt_inputs.get("prompt_token_ids") + prompt_ids = SingletonInputsAdapter(prompt_inputs).prompt_token_ids if prompt_ids is None or len(prompt_ids) == 0: raise ValueError("Prompt cannot be empty") diff --git a/vllm/engine/multiprocessing/client.py b/vllm/engine/multiprocessing/client.py index 882742c2fc61b..fe21c58c775fe 100644 --- a/vllm/engine/multiprocessing/client.py +++ b/vllm/engine/multiprocessing/client.py @@ -31,6 +31,7 @@ # yapf: enable from vllm.envs import VLLM_RPC_TIMEOUT from vllm.inputs import PromptType +from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.model_executor.layers.sampler import SamplerOutput @@ -94,6 +95,8 @@ def __init__(self, ipc_path: str, engine_config: VllmConfig, parallel_config=engine_config.parallel_config, enable_lora=bool(engine_config.lora_config), ) + self.input_preprocessor = InputPreprocessor(self.model_config, + self.tokenizer) # Send RPCGenerateRequest to the MQLLMEngine. self.input_socket: Socket = self.context.socket(zmq.constants.PUSH) @@ -345,6 +348,9 @@ async def _check_success(error_message: str, socket: Socket): or response != VLLM_RPC_SUCCESS_STR): raise ValueError(error_message) + async def get_input_preprocessor(self) -> InputPreprocessor: + return self.input_preprocessor + async def get_tokenizer(self, lora_request: Optional[LoRARequest] = None): return await self.tokenizer.get_lora_tokenizer_async(lora_request) diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index e0b59d94cfdc3..e15395d75c91f 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -62,7 +62,6 @@ def generate( async def beam_search( self, prompt: PromptType, - model_config: ModelConfig, request_id: str, params: BeamSearchParams, ) -> AsyncGenerator[RequestOutput, None]: @@ -74,13 +73,14 @@ async def beam_search( length_penalty = params.length_penalty include_stop_str_in_output = params.include_stop_str_in_output - tokenizer = await self.get_tokenizer() - input_preprocessor = InputPreprocessor(model_config, tokenizer) + preprocessor = await self.get_input_preprocessor() + tokenizer_group = preprocessor.get_tokenizer_group() + tokenizer = await tokenizer_group.get_lora_tokenizer_async() if is_explicit_encoder_decoder_prompt(prompt): raise NotImplementedError else: - processed_inputs = input_preprocessor._prompt_to_llm_inputs( + processed_inputs = preprocessor._prompt_to_llm_inputs( prompt, request_id=request_id, ) @@ -220,6 +220,7 @@ async def abort(self, request_id: str) -> None: Args: request_id: The unique id of the request. """ + ... @abstractmethod async def get_model_config(self) -> ModelConfig: @@ -228,8 +229,13 @@ async def get_model_config(self) -> ModelConfig: @abstractmethod async def get_decoding_config(self) -> DecodingConfig: - ... """Get the decoding configuration of the vLLM engine.""" + ... + + @abstractmethod + async def get_input_preprocessor(self) -> InputPreprocessor: + """Get the input processor of the vLLM engine.""" + ... 
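The new `get_input_preprocessor` accessor above replaces the ad-hoc construction of `InputPreprocessor` inside `beam_search`. A minimal sketch of how a caller consumes it, based only on the calls visible in this patch; the `engine_client` object is assumed to be an already-constructed `EngineClient` implementation (e.g. `AsyncLLMEngine` or the multiprocessing client):

    async def get_lora_tokenizer(engine_client):
        # engine_client: an EngineClient implementation built elsewhere.
        preprocessor = await engine_client.get_input_preprocessor()
        tokenizer_group = preprocessor.get_tokenizer_group()
        # Same call chain that beam_search now uses to obtain the tokenizer.
        return await tokenizer_group.get_lora_tokenizer_async()
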
@abstractmethod async def get_tokenizer( diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 74867d8de8843..09edaf98f7d17 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -190,7 +190,6 @@ async def create_chat_completion( if isinstance(sampling_params, BeamSearchParams): generator = self.engine_client.beam_search( prompt=engine_prompt, - model_config=self.model_config, request_id=request_id, params=sampling_params, ) diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index db31b1153d97e..936aae8f1c267 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -140,7 +140,6 @@ async def create_completion( if isinstance(sampling_params, BeamSearchParams): generator = self.engine_client.beam_search( prompt=engine_prompt, - model_config=self.model_config, request_id=request_id, params=sampling_params, ) diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py index 68ac50a2c5a16..54fbd7a321a6f 100644 --- a/vllm/inputs/__init__.py +++ b/vllm/inputs/__init__.py @@ -1,9 +1,11 @@ from .data import (DecoderOnlyInputs, EncoderDecoderInputs, ExplicitEncoderDecoderPrompt, ProcessorInputs, PromptType, - SingletonInputs, SingletonPrompt, TextPrompt, TokenInputs, - TokensPrompt, build_explicit_enc_dec_prompt, - to_enc_dec_tuple_list, token_inputs, zip_enc_dec_prompts) -from .registry import DummyData, InputContext, InputRegistry + SingletonInputs, SingletonInputsAdapter, SingletonPrompt, + TextPrompt, TokenInputs, TokensPrompt, + build_explicit_enc_dec_prompt, to_enc_dec_tuple_list, + token_inputs, zip_enc_dec_prompts) +from .registry import (DummyData, InputContext, InputProcessingContext, + InputRegistry) INPUT_REGISTRY = InputRegistry() """ @@ -26,12 +28,14 @@ "EncoderDecoderInputs", "ProcessorInputs", "SingletonInputs", + "SingletonInputsAdapter", "build_explicit_enc_dec_prompt", "to_enc_dec_tuple_list", "zip_enc_dec_prompts", "INPUT_REGISTRY", "DummyData", "InputContext", + "InputProcessingContext", "InputRegistry", ] diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 46b41f431bec7..07ff9faa50f13 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -1,10 +1,14 @@ +from dataclasses import dataclass +from functools import cached_property from typing import (TYPE_CHECKING, Any, Dict, Generic, Iterable, List, Literal, Optional, Tuple, Union, cast) -from typing_extensions import NotRequired, TypedDict, TypeVar +import torch +from typing_extensions import NotRequired, TypedDict, TypeVar, assert_never if TYPE_CHECKING: from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict + from vllm.multimodal.inputs import MultiModalInputsV2 class TextPrompt(TypedDict): @@ -36,13 +40,13 @@ class TokensPrompt(TypedDict): multi_modal_data: NotRequired["MultiModalDataDict"] """ - Optional multi-modal data to pass to the model, + DEPRECATED: Optional multi-modal data to pass to the model, if the model supports it. """ mm_processor_kwargs: NotRequired[Dict[str, Any]] """ - Optional multi-modal processor kwargs to be forwarded to the + DEPRECATED: Optional multi-modal processor kwargs to be forwarded to the multimodal input mapper & processor. Note that if multiple modalities have registered mappers etc for the model being considered, we attempt to pass the mm_processor_kwargs to each of them. 
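For context on the `multi_modal_data` and `mm_processor_kwargs` fields documented above: they ride alongside the text prompt and are consumed either by the legacy input mapper or by the new multi-modal processor. The sketch below is an illustrative use of the public `LLM.generate` API, not code from this patch; the model name, prompt template and image path are placeholders.

    from PIL import Image

    from vllm import LLM

    # Placeholder model and image; any multimodal model supported by vLLM works.
    llm = LLM(model="llava-hf/llava-1.5-7b-hf")
    image = Image.open("example.jpg")

    outputs = llm.generate({
        "prompt": "USER: <image>\nWhat is shown in this image?\nASSISTANT:",
        "multi_modal_data": {"image": image},
    })
    print(outputs[0].outputs[0].text)
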
@@ -176,7 +180,7 @@ def token_inputs( return inputs -DecoderOnlyInputs = TokenInputs +DecoderOnlyInputs = Union[TokenInputs, "MultiModalInputsV2"] """ The inputs in :class:`~vllm.LLMEngine` before they are passed to the model executor. @@ -191,19 +195,91 @@ class EncoderDecoderInputs(TypedDict): This specifies the required data for encoder-decoder models. """ - encoder: TokenInputs + encoder: Union[TokenInputs, "MultiModalInputsV2"] """The inputs for the encoder portion.""" - decoder: TokenInputs + decoder: Union[TokenInputs, "MultiModalInputsV2"] """The inputs for the decoder portion.""" -SingletonInputs = TokenInputs +SingletonInputs = Union[TokenInputs, "MultiModalInputsV2"] """ A processed :class:`SingletonPrompt` which can be passed to :class:`vllm.sequence.Sequence`. """ + +@dataclass +class SingletonInputsAdapter: + """ + Unified interface to access the components of :class:`SingletonInputs`. + """ + inputs: SingletonInputs + + @cached_property + def prompt(self) -> Optional[str]: + inputs = self.inputs + + if inputs["type"] == "token" or inputs["type"] == "multimodal": + return inputs.get("prompt") + + assert_never(inputs) + + @cached_property + def prompt_token_ids(self) -> List[int]: + inputs = self.inputs + + if inputs["type"] == "token" or inputs["type"] == "multimodal": + return inputs.get("prompt_token_ids", []) + + assert_never(inputs) + + @cached_property + def prompt_embeds(self) -> Optional[torch.Tensor]: + inputs = self.inputs + + if inputs["type"] == "token" or inputs["type"] == "multimodal": + return None + + assert_never(inputs) + + @cached_property + def multi_modal_data(self) -> "MultiModalDataDict": + inputs = self.inputs + + if inputs["type"] == "token": + return inputs.get("multi_modal_data", {}) + + if inputs["type"] == "multimodal": + return inputs.get("mm_kwargs", {}) + + assert_never(inputs) + + @cached_property + def multi_modal_placeholders(self) -> "MultiModalPlaceholderDict": + inputs = self.inputs + + if inputs["type"] == "token": + return inputs.get("multi_modal_placeholders", {}) + + if inputs["type"] == "multimodal": + return inputs.get("mm_placeholders", {}) + + assert_never(inputs) + + @cached_property + def mm_processor_kwargs(self) -> Dict[str, Any]: + inputs = self.inputs + + if inputs["type"] == "token": + return inputs.get("mm_processor_kwargs", {}) + + if inputs["type"] == "multimodal": + return {} + + assert_never(inputs) + + ProcessorInputs = Union[DecoderOnlyInputs, EncoderDecoderInputs] """ The inputs to :data:`vllm.inputs.InputProcessor`. @@ -234,10 +310,11 @@ def zip_enc_dec_prompts( ) -> List[ExplicitEncoderDecoderPrompt[_T1, _T2]]: """ Zip encoder and decoder prompts together into a list of - :class:`ExplicitEncoderDecoderPrompt` instances. mm_processor_kwargs - may also be provided; if a dict is passed, the same dictionary will be - used for every encoder/decoder prompt. If an iterable is provided, it will - be zipped with the encoder/decoder prompts. + :class:`ExplicitEncoderDecoderPrompt` instances. + + ``mm_processor_kwargs`` may also be provided; if a dict is passed, the same + dictionary will be used for every encoder/decoder prompt. If an iterable is + provided, it will be zipped with the encoder/decoder prompts. 
""" if mm_processor_kwargs is None: mm_processor_kwargs = cast(Dict[str, Any], {}) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 509b0448b9e51..fdf28615fda10 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -1,11 +1,13 @@ import asyncio -from typing import List, Optional +from typing import List, Mapping, Optional, Union from typing_extensions import assert_never from vllm.config import ModelConfig from vllm.logger import init_logger from vllm.lora.request import LoRARequest +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry +from vllm.multimodal.processing import MultiModalDataDict, MultiModalInputsV2 from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup from vllm.utils import print_warning_once @@ -23,11 +25,13 @@ def __init__( self, model_config: ModelConfig, tokenizer: Optional[BaseTokenizerGroup], + mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ) -> None: super().__init__() self.model_config = model_config self.tokenizer = tokenizer + self.mm_registry = mm_registry def get_tokenizer_group(self) -> BaseTokenizerGroup: if self.tokenizer is None: @@ -198,14 +202,79 @@ async def _tokenize_prompt_async( prompt=prompt, lora_request=lora_request) + def _can_process_multimodal(self) -> bool: + model_config = self.model_config + + if not model_config.is_multimodal_model: + raise ValueError("Your model does not support multi-modal inputs") + + # Interim measure so we can handle models that have yet to be + # updated to use the new multi-modal processor + can_process_multimodal = self.mm_registry.has_processor(model_config) + if not can_process_multimodal: + logger.info( + "Your model uses the legacy input pipeline instead of the new " + "multi-modal processor. Please note that the legacy pipeline " + "will be removed in a future release. For more details, see: " + "https://github.com/vllm-project/vllm/issues/10114") + + return can_process_multimodal + + def _process_multimodal( + self, + prompt: Union[str, List[int]], + mm_data: MultiModalDataDict, + mm_processor_kwargs: Optional[Mapping[str, object]], + lora_request: Optional[LoRARequest], + ) -> MultiModalInputsV2: + """ + Apply the model's multi-modal processor to a multi-modal prompt, + returning the corresponding token IDs and metadata. 
+ """ + tokenizer_group = self.get_tokenizer_group() + tokenizer = tokenizer_group.get_lora_tokenizer(lora_request) + + mm_processor = self.mm_registry.create_processor( + self.model_config, tokenizer) + + if isinstance(prompt, list): + prompt = tokenizer.decode(prompt) + if mm_processor_kwargs is None: + mm_processor_kwargs = {} + + return mm_processor.apply(prompt, mm_data, mm_processor_kwargs) + + async def _process_multimodal_async( + self, + prompt: Union[str, List[int]], + mm_data: MultiModalDataDict, + mm_processor_kwargs: Optional[Mapping[str, object]], + lora_request: Optional[LoRARequest], + ) -> MultiModalInputsV2: + """Async version of :meth:`_process_multimodal`.""" + tokenizer_group = self.get_tokenizer_group() + tokenizer = await tokenizer_group.get_lora_tokenizer_async(lora_request + ) + + mm_processor = self.mm_registry.create_processor( + self.model_config, tokenizer) + if isinstance(prompt, list): + logger.warning("Passing `multi_modal_data` in TokensPrompt is" + "deprecated and will be removed in a future update") + prompt = tokenizer.decode(prompt) + if mm_processor_kwargs is None: + mm_processor_kwargs = {} + + return mm_processor.apply(prompt, mm_data, mm_processor_kwargs) + def _prompt_to_llm_inputs( self, prompt: SingletonPrompt, request_id: str, lora_request: Optional[LoRARequest] = None, ) -> SingletonInputs: - ''' - Extract the components of any single encoder or decoder input prompt. + """ + Extract the singleton inputs from a prompt. Arguments: @@ -215,12 +284,8 @@ def _prompt_to_llm_inputs( Returns: - * prompt - * prompt_token_ids - * multi_modal_data - * mm_processor_kwargs (request-level input processor/mapper overrides) - ''' - + * :class:`SingletonInputs` instance + """ parsed = parse_singleton_prompt(prompt) if parsed["type"] == "str": @@ -243,6 +308,14 @@ def _prompt_to_llm_inputs( multi_modal_data = tokens_content.get("multi_modal_data") mm_processor_kwargs = tokens_content.get("mm_processor_kwargs") + if multi_modal_data is not None and self._can_process_multimodal(): + return self._process_multimodal( + prompt_token_ids, + multi_modal_data, + mm_processor_kwargs, + lora_request=lora_request, + ) + return token_inputs( prompt_token_ids=prompt_token_ids, multi_modal_data=multi_modal_data, @@ -253,13 +326,22 @@ def _prompt_to_llm_inputs( text_content = parsed["content"] prompt_text = text_content["prompt"] + multi_modal_data = text_content.get("multi_modal_data") + mm_processor_kwargs = text_content.get("mm_processor_kwargs") + + if multi_modal_data is not None and self._can_process_multimodal(): + return self._process_multimodal( + prompt_text, + multi_modal_data, + mm_processor_kwargs, + lora_request=lora_request, + ) + prompt_token_ids = self._tokenize_prompt( prompt_text, request_id=request_id, lora_request=lora_request, ) - multi_modal_data = text_content.get("multi_modal_data") - mm_processor_kwargs = text_content.get("mm_processor_kwargs") return token_inputs( prompt=prompt_text, @@ -299,6 +381,14 @@ async def _prompt_to_llm_inputs_async( multi_modal_data = tokens_content.get("multi_modal_data") mm_processor_kwargs = tokens_content.get("mm_processor_kwargs") + if multi_modal_data is not None and self._can_process_multimodal(): + return await self._process_multimodal_async( + prompt_token_ids, + multi_modal_data, + mm_processor_kwargs, + lora_request=lora_request, + ) + return token_inputs( prompt_token_ids=prompt_token_ids, multi_modal_data=multi_modal_data, @@ -309,13 +399,22 @@ async def _prompt_to_llm_inputs_async( text_content = 
parsed["content"] prompt_text = text_content["prompt"] + multi_modal_data = text_content.get("multi_modal_data") + mm_processor_kwargs = text_content.get("mm_processor_kwargs") + + if multi_modal_data is not None and self._can_process_multimodal(): + return await self._process_multimodal_async( + prompt_text, + multi_modal_data, + mm_processor_kwargs, + lora_request=lora_request, + ) + prompt_token_ids = await self._tokenize_prompt_async( prompt_text, request_id=request_id, lora_request=lora_request, ) - multi_modal_data = text_content.get("multi_modal_data") - mm_processor_kwargs = text_content.get("mm_processor_kwargs") return token_inputs( prompt=prompt_text, @@ -331,7 +430,8 @@ def _build_enc_dec_llm_inputs( encoder_inputs: SingletonInputs, decoder_inputs: Optional[SingletonInputs], ) -> EncoderDecoderInputs: - if encoder_inputs["type"] == "token": + if (encoder_inputs["type"] == "token" + or encoder_inputs["type"] == "multimodal"): pass else: assert_never(encoder_inputs) @@ -340,7 +440,8 @@ def _build_enc_dec_llm_inputs( dec_token_ids = self._prepare_decoder_input_ids_for_generation( None) decoder_inputs = token_inputs(dec_token_ids) - elif decoder_inputs["type"] == "token": + elif (decoder_inputs["type"] == "token" + or decoder_inputs["type"] == "multimodal"): dec_token_ids = self._prepare_decoder_input_ids_for_generation( decoder_inputs["prompt_token_ids"]) decoder_inputs["prompt_token_ids"] = dec_token_ids @@ -361,7 +462,7 @@ def _process_encoder_decoder_prompt( prompt: PromptType, request_id: str, ) -> EncoderDecoderInputs: - ''' + """ For encoder/decoder models only: Process an input prompt into an :class:`EncoderDecoderInputs` instance. @@ -391,8 +492,7 @@ def _process_encoder_decoder_prompt( Returns: * :class:`EncoderDecoderInputs` instance - ''' - + """ encoder_inputs: SingletonInputs decoder_inputs: Optional[SingletonInputs] @@ -460,7 +560,8 @@ def _build_decoder_only_llm_inputs( prompt_inputs: DecoderOnlyInputs, prompt_adapter_request: Optional[PromptAdapterRequest], ) -> DecoderOnlyInputs: - if prompt_inputs["type"] == "token": + if (prompt_inputs["type"] == "token" + or prompt_inputs["type"] == "multimodal"): prompt_inputs["prompt_token_ids"] = self._apply_prompt_adapter( prompt_inputs["prompt_token_ids"], prompt_adapter_request=prompt_adapter_request, @@ -477,7 +578,7 @@ def _process_decoder_only_prompt( lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> DecoderOnlyInputs: - ''' + """ For decoder-only models: Process an input prompt into an :class:`DecoderOnlyInputs` instance. 
@@ -491,7 +592,7 @@ def _process_decoder_only_prompt( Returns: * :class:`DecoderOnlyInputs` instance - ''' + """ prompt_comps = self._prompt_to_llm_inputs( prompt, diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 7d7a797be4f60..68b4756331e6d 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -5,14 +5,17 @@ Optional, Protocol, Type, cast) from torch import nn -from transformers import PretrainedConfig -from typing_extensions import TypeVar +from transformers import PretrainedConfig, ProcessorMixin +from typing_extensions import TypeVar, assert_never from vllm.logger import init_logger +from vllm.transformers_utils.processor import cached_get_processor +from vllm.transformers_utils.tokenizer import AnyTokenizer from vllm.utils import (get_allowed_kwarg_only_overrides, print_warning_once, resolve_mm_processor_kwargs) -from .data import ProcessorInputs +from .data import ProcessorInputs, SingletonInputs +from .parse import is_encoder_decoder_inputs if TYPE_CHECKING: from vllm.config import ModelConfig @@ -61,6 +64,19 @@ def get_hf_image_processor_config(self) -> Dict[str, Any]: return self.model_config.hf_image_processor_config +@dataclass(frozen=True) +class InputProcessingContext(InputContext): + tokenizer: AnyTokenizer + """The tokenizer used to tokenize the inputs.""" + + def get_hf_processor(self) -> ProcessorMixin: + return cached_get_processor( + self.model_config.tokenizer, + tokenizer=self.tokenizer, # Override the tokenizer with ours + trust_remote_code=self.model_config.trust_remote_code, + ) + + N = TypeVar("N", bound=Type[nn.Module]) @@ -94,7 +110,7 @@ def __call__( ... -class _MultiModalCounts(UserDict): +class _MultiModalCounts(UserDict[str, int]): """ Wraps `mm_counts` for a more informative error message when attempting to access a plugin that does not exist. 
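`InputProcessingContext` above extends `InputContext` with a concrete tokenizer so that the HuggingFace processor is built around that same tokenizer. A short usage sketch; `model_config` is assumed to be an existing `vllm.config.ModelConfig` for a multimodal model (constructing one by hand is outside the scope of this patch):

    from vllm.inputs import InputProcessingContext
    from vllm.transformers_utils.tokenizer import get_tokenizer

    # model_config: assumed pre-existing ModelConfig for a multimodal model.
    tokenizer = get_tokenizer(model_config.tokenizer)

    ctx = InputProcessingContext(model_config, tokenizer)
    # Builds the HF processor via cached_get_processor, overriding its
    # tokenizer with the one held by the context.
    hf_processor = ctx.get_hf_processor()
    print(type(hf_processor).__name__)
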
@@ -287,6 +303,21 @@ def _get_model_input_processor(self, model_cls: Type[nn.Module]): return self._input_processors_by_model_type \ .get(model_cls, self._default_input_processor) + def _ensure_mm_kwargs( + self, + inputs: SingletonInputs, + mm_processor_kwargs: Dict[str, Any], + ): + if inputs["type"] == "token": + # In case the input processor for that model fails to set it + if "mm_processor_kwargs" not in inputs: + inputs["mm_processor_kwargs"] = mm_processor_kwargs + elif inputs["type"] == "multimodal": + # Be more strict in V2 + assert "mm_kwargs" in inputs + else: + assert_never(inputs["type"]) + def process_input(self, model_config: "ModelConfig", inputs: ProcessorInputs) -> ProcessorInputs: """ @@ -312,8 +343,21 @@ def process_input(self, model_config: "ModelConfig", processor, ) - return processor(InputContext(model_config), inputs, - **mm_processor_kwargs) + processed_inputs = processor( + InputContext(model_config), + inputs, + **mm_processor_kwargs, + ) + + if is_encoder_decoder_inputs(processed_inputs): + self._ensure_mm_kwargs(processed_inputs["encoder"], + mm_processor_kwargs) + self._ensure_mm_kwargs(processed_inputs["decoder"], + mm_processor_kwargs) + else: + self._ensure_mm_kwargs(processed_inputs, mm_processor_kwargs) + + return processed_inputs def create_input_processor(self, model_config: "ModelConfig"): """ diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 08ed84aa9c71a..6ec2d5a2a3909 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -30,8 +30,8 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.glm4_vision_encoder import EVA2CLIPModel from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.multimodal.base import MultiModalData +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import MultiModalData, MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, SequenceData) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 37f38d4d76671..b39dfe706e0df 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -32,8 +32,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.models.persimmon import PersimmonForCausalLM from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalKwargs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges) diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py index 767171dad7c7b..df7e768fe14d3 100644 --- a/vllm/model_executor/models/h2ovl.py +++ b/vllm/model_executor/models/h2ovl.py @@ -15,8 +15,7 @@ from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext, token_inputs) from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalKwargs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer from 
vllm.utils import is_list_of diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 77efc9a26ef7a..07165ea688f94 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -25,8 +25,7 @@ from vllm.model_executor.models.intern_vit import (InternVisionModel, InternVisionPatchModel) from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalKwargs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index af712bf8f9506..005ae5e03cfed 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -17,7 +17,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import NestedTensors +from vllm.multimodal.inputs import NestedTensors from vllm.sequence import IntermediateTensors from vllm.utils import is_list_of diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index aae534c0b5949..999739ccd98bf 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -51,8 +51,7 @@ from vllm.model_executor.models.qwen2 import Qwen2Model from vllm.model_executor.models.utils import LLMWrapper from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalKwargs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.image import cached_get_image_processor from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors, SequenceData diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index de03d28638cda..4db65edc174f1 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -39,7 +39,7 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import NestedTensors, PlaceholderRange +from vllm.multimodal.inputs import NestedTensors, PlaceholderRange from vllm.multimodal.utils import cached_get_tokenizer, repeat_and_pad_token from vllm.sequence import IntermediateTensors, PoolerOutput from vllm.utils import is_list_of diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 6bd5e119dd2dd..a3e30ea2dd299 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -29,8 +29,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.utils import merge_multimodal_embeddings from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalKwargs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges) from vllm.sequence import 
IntermediateTensors, SequenceData diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 5acd87146c54e..3d26ede722dd1 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -42,8 +42,7 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalKwargs +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors, SequenceData from vllm.utils import is_list_of diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 9a19ccbca3f1e..2335baf459771 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -60,10 +60,10 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.qwen2 import Qwen2Model from vllm.model_executor.pooling_metadata import PoolingMetadata -from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict, - MultiModalKwargs) -from vllm.multimodal.base import MultiModalData +from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import cached_get_image_processor +from vllm.multimodal.inputs import (MultiModalData, MultiModalDataDict, + MultiModalKwargs) from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors, PoolerOutput, SequenceData from vllm.transformers_utils.config import uses_mrope diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index ca4fc8ec952bf..1fc6c1be4b7bb 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -15,7 +15,7 @@ from vllm.logger import init_logger from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models import ModelRegistry -from vllm.multimodal.base import MultiModalPlaceholderMap, NestedTensors +from vllm.multimodal import MultiModalPlaceholderMap, NestedTensors from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from vllm.utils import is_pin_memory_available diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 14911853abc73..03a5f3a91f7a1 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -1,7 +1,8 @@ -from .base import (BatchedTensorInputs, MultiModalDataBuiltins, - MultiModalDataDict, MultiModalKwargs, - MultiModalPlaceholderDict, MultiModalPlaceholderMap, - MultiModalPlugin, NestedTensors) +from .base import MultiModalPlaceholderMap, MultiModalPlugin +from .inputs import (BatchedTensorInputs, MultiModalData, + MultiModalDataBuiltins, MultiModalDataDict, + MultiModalKwargs, MultiModalPlaceholderDict, + NestedTensors) from .registry import MultiModalRegistry MULTIMODAL_REGISTRY = MultiModalRegistry() @@ -15,6 +16,7 @@ __all__ = [ "BatchedTensorInputs", + "MultiModalData", "MultiModalDataBuiltins", "MultiModalDataDict", "MultiModalKwargs", diff --git a/vllm/multimodal/audio.py b/vllm/multimodal/audio.py index e71ae5feec1c6..1a230602966d4 100644 --- a/vllm/multimodal/audio.py +++ b/vllm/multimodal/audio.py @@ -1,5 +1,7 @@ from vllm.inputs.registry import InputContext -from vllm.multimodal.base 
import MultiModalKwargs, MultiModalPlugin + +from .base import MultiModalPlugin +from .inputs import AudioItem, MultiModalData, MultiModalKwargs class AudioPlugin(MultiModalPlugin): @@ -8,8 +10,12 @@ class AudioPlugin(MultiModalPlugin): def get_data_key(self) -> str: return "audio" - def _default_input_mapper(self, ctx: InputContext, data: object, - **mm_processor_kwargs) -> MultiModalKwargs: + def _default_input_mapper( + self, + ctx: InputContext, + data: MultiModalData[AudioItem], + **mm_processor_kwargs, + ) -> MultiModalKwargs: raise NotImplementedError("There is no default audio input mapper") def _default_max_multimodal_tokens(self, ctx: InputContext) -> int: diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index fa514d3fcb3b7..6eec660e42ac4 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -1,180 +1,23 @@ from abc import ABC, abstractmethod -from collections import UserDict, defaultdict -from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Mapping, - NamedTuple, Optional, Tuple, Type, TypedDict, TypeVar, - Union, cast, final) - -import numpy as np -import torch -import torch.types -from PIL import Image +from collections import defaultdict +from typing import (TYPE_CHECKING, Any, Callable, Dict, List, NamedTuple, + Optional, Sequence, Tuple, Type, TypeVar, Union) + from torch import nn -from typing_extensions import TypeAlias from vllm.inputs import InputContext from vllm.logger import init_logger -from vllm.utils import (JSONTree, get_allowed_kwarg_only_overrides, is_list_of, - json_map_leaves, resolve_mm_processor_kwargs) +from vllm.utils import (get_allowed_kwarg_only_overrides, + resolve_mm_processor_kwargs) if TYPE_CHECKING: from vllm.config import ModelConfig from vllm.sequence import SequenceGroupMetadata -logger = init_logger(__name__) - -NestedTensors = Union[List["NestedTensors"], List[torch.Tensor], torch.Tensor] -""" -Uses a list instead of a tensor if the dimensions of each element do not match. -""" - -BatchedTensorInputs: TypeAlias = Dict[str, NestedTensors] -""" -A dictionary containing nested tensors which have been batched via -:meth:`MultiModalKwargs.batch`. -""" - - -class _MultiModalKwargsBase(UserDict[str, NestedTensors]): - pass - - -class MultiModalKwargs(_MultiModalKwargsBase): - """ - A dictionary that represents the keyword arguments to - :meth:`~torch.nn.Module.forward`. - """ - - @staticmethod - def _try_stack(nested_tensors: NestedTensors) -> NestedTensors: - """ - Recursively stacks lists of tensors when they all have the same shape. - """ - if isinstance(nested_tensors, torch.Tensor): - return nested_tensors - - if isinstance(nested_tensors, np.ndarray): - return torch.from_numpy(nested_tensors) - - if isinstance(nested_tensors, (int, float)): - return torch.tensor(nested_tensors) +from .inputs import (MultiModalData, MultiModalDataDict, MultiModalKwargs, + PlaceholderRange) - stacked = [MultiModalKwargs._try_stack(t) for t in nested_tensors] - if not is_list_of(stacked, torch.Tensor, check="all"): - # Only tensors (not lists) can be stacked. - return stacked - - tensors_ = cast(List[torch.Tensor], stacked) - if any(t.shape != tensors_[0].shape for t in tensors_): - # The tensors have incompatible shapes and can't be stacked. - return tensors_ - - return torch.stack(tensors_) - - @staticmethod - def batch(inputs_list: List["MultiModalKwargs"]) -> BatchedTensorInputs: - """ - Batch multiple inputs together into a dictionary. - - The resulting dictionary has the same keys as the inputs. 
- If the corresponding value from each input is a tensor and they all - share the same shape, the output value is a single batched tensor; - otherwise, the output value is a list containing the original value - from each input. - """ - if len(inputs_list) == 0: - return {} - - item_lists: Dict[str, List[NestedTensors]] = defaultdict(list) - - for inputs in inputs_list: - # For models that supports multiple modalities (e.g. Qwen2-VL), - # different modalities will return different data keys, - # so batch() should skip the same key check. - - for k, v in inputs.items(): - item_lists[k].append(v) - - return { - k: MultiModalKwargs._try_stack(item_list) - for k, item_list in item_lists.items() - } - - @staticmethod - def as_kwargs( - batched_inputs: BatchedTensorInputs, - *, - device: torch.types.Device, - ) -> BatchedTensorInputs: - json_inputs = cast(JSONTree[torch.Tensor], batched_inputs) - - json_mapped = json_map_leaves( - lambda x: x.to(device, non_blocking=True), - json_inputs, - ) - - return cast(BatchedTensorInputs, json_mapped) - - -_T = TypeVar("_T") - -MultiModalData: TypeAlias = Union[_T, List[_T]] -""" -Either a single data instance, or a list of data instances. - -The number of data instances allowed per modality is restricted by -`--limit-mm-per-prompt`. -""" - - -@final -class MultiModalDataBuiltins(TypedDict, total=False): - """Modality types that are predefined by vLLM.""" - - image: MultiModalData[Image.Image] - """The input image(s).""" - - audio: MultiModalData[Tuple[np.ndarray, Union[int, float]]] - """The input audio item(s) and corresponding sampling rate(s).""" - - video: MultiModalData[Tuple[np.ndarray]] - """The input video(s).""" - - -MultiModalDataDict = Union[MultiModalDataBuiltins, - Mapping[str, MultiModalData[object]]] -""" -A dictionary containing an item for each modality type to input. - -Note: - This dictionary also accepts modality keys defined outside - :class:`MultiModalDataBuiltins` as long as a customized plugin is registered - through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`. - Read more on that :ref:`here `. -""" - - -class PlaceholderRange(TypedDict): - """ - Placeholder location information for multi-modal data. - - For example: - Prompt: AAAA BBBB What is in these images? - Images A and B will have: - A: { "offset": 0, "length": 4 } - B: { "offset": 5, "length": 4 } - """ - - offset: int - """The start index of the placeholder in the prompt.""" - - length: int - """The length of the placeholder.""" - - -MultiModalPlaceholderDict = Mapping[str, List[PlaceholderRange]] -""" -A dictionary containing placeholder ranges. -""" +logger = init_logger(__name__) MultiModalInputMapper = Callable[[InputContext, MultiModalData[object]], MultiModalKwargs] @@ -192,6 +35,7 @@ class PlaceholderRange(TypedDict): model. This does not include tokens that correspond to the input text. 
""" +_T = TypeVar("_T") N = TypeVar("N", bound=Type[nn.Module]) @@ -224,7 +68,7 @@ def get_data_key(self) -> str: def _default_input_mapper( self, ctx: InputContext, - data: MultiModalData[object], + data: MultiModalData[Any], **mm_processor_kwargs, ) -> MultiModalKwargs: """ @@ -273,8 +117,8 @@ def wrapper(model_cls: N) -> N: def map_input( self, model_config: "ModelConfig", - data: MultiModalData[object], - mm_processor_kwargs: Dict[str, Any], + data: MultiModalData[Any], + mm_processor_kwargs: Optional[Dict[str, Any]], ) -> MultiModalKwargs: """ Transform the data into a dictionary of model inputs using the @@ -289,6 +133,7 @@ def map_input( - :ref:`input_processing_pipeline` - :ref:`enabling_multimodal_inputs` """ + # Avoid circular import from vllm.model_executor.model_loader import get_model_architecture @@ -300,6 +145,9 @@ def map_input( raise KeyError(f"No input mapper in {self} is registered for " f"model class {model_cls.__name__}.") + if mm_processor_kwargs is None: + mm_processor_kwargs = {} + # In the case of the default mapper, we have to get resource # processor through its HuggingFace autoclass; since this goes # through **kwargs, we can't inspect it the same way, so we allow @@ -508,7 +356,7 @@ def append_items_from_seq_group( self, positions: range, multi_modal_items: List[_T], - multi_modal_placeholders: List[PlaceholderRange], + multi_modal_placeholders: Sequence[PlaceholderRange], ) -> List[_T]: """ Adds the multi-modal items that intersect ```positions`` to this diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index 589b46266b08d..97bbce1ce1570 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -3,14 +3,14 @@ import torch from PIL import Image -from transformers.image_processing_base import BatchFeature from vllm.inputs.registry import InputContext from vllm.logger import init_logger from vllm.transformers_utils.processor import get_image_processor from vllm.utils import is_list_of -from .base import MultiModalData, MultiModalKwargs, MultiModalPlugin +from .base import MultiModalPlugin +from .inputs import ImageItem, MultiModalData, MultiModalKwargs if TYPE_CHECKING: from vllm.config import ModelConfig @@ -41,15 +41,11 @@ def _get_hf_image_processor( def _default_input_mapper( self, ctx: InputContext, - data: MultiModalData[object], + data: MultiModalData[ImageItem], **mm_processor_kwargs, ) -> MultiModalKwargs: model_config = ctx.model_config - # Processed by input processor - if isinstance(data, BatchFeature): - return MultiModalKwargs(data.data) - # PIL image if isinstance(data, Image.Image) or is_list_of(data, Image.Image): image_processor = self._get_hf_image_processor( diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py new file mode 100644 index 0000000000000..64a4c58d5509c --- /dev/null +++ b/vllm/multimodal/inputs.py @@ -0,0 +1,225 @@ +from collections import UserDict, defaultdict +from typing import (Any, Dict, List, Literal, Mapping, Sequence, Tuple, + TypedDict, TypeVar, Union, cast, final) + +import numpy as np +import torch +import torch.types +from PIL.Image import Image +from typing_extensions import TypeAlias + +from vllm.utils import JSONTree, is_list_of, json_map_leaves + +_T = TypeVar("_T") + +# yapf: disable +ImageItem: TypeAlias = Union[Image, np.ndarray, torch.Tensor] +""" +A :class:`transformers.image_utils.ImageInput` representing a single image, +which can be passed to a HuggingFace :code:`ImageProcessor`. 
+""" + +VideoItem: TypeAlias = Union[ + List[Image], + np.ndarray, + torch.Tensor, + List[np.ndarray], + List[torch.Tensor], +] +""" + +A :class:`transformers.image_utils.VideoInput` representing a single video, +which can be passed to a HuggingFace :code:`VideoProcessor`. +""" + +AudioItem: TypeAlias = Union[ + np.ndarray, + List[float], + Tuple[np.ndarray, float], # DEPRECATED: Use mm_processor_kwargs instead +] +""" +Represents a single audio that can be inputted to a HuggingFace +:code:`AudioProcessor`. +""" +# yapf: enable + +MultiModalData: TypeAlias = Union[_T, List[_T]] +""" +Either a single data item, or a list of data items. + +The number of data items allowed per modality is restricted by +:code:`--limit-mm-per-prompt`. +""" + + +@final +class MultiModalDataBuiltins(TypedDict, total=False): + """Type annotations for modality types predefined by vLLM.""" + + image: MultiModalData[ImageItem] + """The input image(s).""" + + video: MultiModalData[VideoItem] + """The input video(s).""" + + audio: MultiModalData[AudioItem] + """The input audio(s).""" + + +MultiModalDataDict: TypeAlias = Mapping[str, MultiModalData[Any]] +""" +A dictionary containing an entry for each modality type to input. + +Note: + This dictionary also accepts modality keys defined outside + :class:`MultiModalDataBuiltins` as long as a customized plugin + is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`. + Read more on that :ref:`here `. +""" + + +class PlaceholderRange(TypedDict): + """ + Placeholder location information for multi-modal data. + + For example: + Prompt: AAAA BBBB What is in these images? + Images A and B will have: + A: { "offset": 0, "length": 4 } + B: { "offset": 5, "length": 4 } + """ + + offset: int + """The start index of the placeholder in the prompt.""" + + length: int + """The length of the placeholder.""" + + +NestedTensors = Union[List["NestedTensors"], List[torch.Tensor], torch.Tensor] +""" +Uses a list instead of a tensor if the dimensions of each element do not match. +""" + +BatchedTensorInputs: TypeAlias = Dict[str, NestedTensors] +""" +A dictionary containing nested tensors which have been batched via +:meth:`MultiModalKwargs.batch`. +""" + + +class MultiModalKwargs(UserDict[str, NestedTensors]): + """ + A dictionary that represents the keyword arguments to + :meth:`~torch.nn.Module.forward`. + """ + + @staticmethod + def _try_stack(nested_tensors: NestedTensors) -> NestedTensors: + """ + Stack the inner dimensions that have the same shape in + a nested list of tensors. + + Thus, a dimension represented by a list means that the inner + dimensions are different for each element along that dimension. + """ + if isinstance(nested_tensors, torch.Tensor): + return nested_tensors + + # TODO: Remove these once all models have been migrated + if isinstance(nested_tensors, np.ndarray): + return torch.from_numpy(nested_tensors) + if isinstance(nested_tensors, (int, float)): + return torch.tensor(nested_tensors) + + stacked = [MultiModalKwargs._try_stack(t) for t in nested_tensors] + if not is_list_of(stacked, torch.Tensor, check="all"): + # Only tensors (not lists) can be stacked. + return stacked + + tensors_ = cast(List[torch.Tensor], stacked) + if any(t.shape != tensors_[0].shape for t in tensors_): + # The tensors have incompatible shapes and can't be stacked. + return tensors_ + + return torch.stack(tensors_) + + @staticmethod + def batch(inputs_list: List["MultiModalKwargs"]) -> BatchedTensorInputs: + """ + Batch multiple inputs together into a dictionary. 
+ + The resulting dictionary has the same keys as the inputs. + If the corresponding value from each input is a tensor and they all + share the same shape, the output value is a single batched tensor; + otherwise, the output value is a list containing the original value + from each input. + """ + if len(inputs_list) == 0: + return {} + + # We need to consider the case where each item in the batch + # contains different modalities (i.e. different keys). + item_lists: Dict[str, List[NestedTensors]] = defaultdict(list) + + for inputs in inputs_list: + for k, v in inputs.items(): + item_lists[k].append(v) + + return { + k: MultiModalKwargs._try_stack(item_list) + for k, item_list in item_lists.items() + } + + @staticmethod + def as_kwargs( + batched_inputs: BatchedTensorInputs, + *, + device: torch.types.Device, + ) -> BatchedTensorInputs: + json_inputs = cast(JSONTree[torch.Tensor], batched_inputs) + + json_mapped = json_map_leaves( + lambda x: x.to(device, non_blocking=True), + json_inputs, + ) + + return cast(BatchedTensorInputs, json_mapped) + + +MultiModalPlaceholderDict = Mapping[str, Sequence[PlaceholderRange]] +""" +A dictionary containing placeholder ranges. +""" + + +class MultiModalInputsV2(TypedDict): + """ + Represents the outputs of :class:`vllm.multimodal.MultiModalProcessor`, + ready to be passed to vLLM internals. + """ + + type: Literal["multimodal"] + """The type of inputs.""" + + prompt: str + """ + The original, unprocessed prompt text. + + Note: + Since prompt text is not required by vLLM internals, we leave this + unprocessed to save CPU computation. You can still call + :code:`tokenizer.decode(prompt_token_ids)` to get the processed text. + """ + + prompt_token_ids: List[int] + """The processed token IDs which includes placeholder tokens.""" + + mm_kwargs: MultiModalKwargs + """Keyword arguments to be directly passed to the model after batching.""" + + mm_placeholders: MultiModalPlaceholderDict + """ + For each modality, information about the placeholder tokens in + :code:`prompt_token_ids`. + """ diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py new file mode 100644 index 0000000000000..88a924da174a6 --- /dev/null +++ b/vllm/multimodal/processing.py @@ -0,0 +1,273 @@ +from dataclasses import dataclass +from functools import lru_cache, partial +from typing import (Any, Callable, Collection, Generic, List, Mapping, + Optional, TypedDict, TypeVar, final) + +from transformers import BatchFeature +from typing_extensions import TypeAlias + +from vllm.inputs import InputProcessingContext +from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer +from vllm.utils import is_list_of + +from .inputs import (AudioItem, ImageItem, MultiModalDataDict, + MultiModalInputsV2, MultiModalKwargs, PlaceholderRange, + VideoItem) + +_T = TypeVar("_T") + +ReplacementFunc: TypeAlias = Callable[[_T, BatchFeature, int], List[int]] +""" +Given the original data item, HF-processed data, and index of the processed +item, output the replacement token IDs to be allocated in vLLM. +""" + + +@dataclass +class ModalityProcessingMetadata(Generic[_T]): + placeholder_replacements: Mapping[str, ReplacementFunc] + """ + A dictionary where each item represents the original placeholder in the + prompt text and the corresponding replacement. 
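
As a minimal sketch of the batching semantics documented for MultiModalKwargs.batch above (illustrative only, not part of the patch): inputs whose tensors share a shape are stacked into one tensor, while mismatched shapes fall back to a plain list.

    import torch

    from vllm.multimodal.inputs import MultiModalKwargs

    a = MultiModalKwargs({"pixel_values": torch.zeros(3, 224, 224)})
    b = MultiModalKwargs({"pixel_values": torch.zeros(3, 224, 224)})

    # Same key, same shape -> a single batched (2, 3, 224, 224) tensor.
    batched = MultiModalKwargs.batch([a, b])
    assert batched["pixel_values"].shape == (2, 3, 224, 224)

    # Mismatched shapes cannot be stacked, so the value stays a list.
    c = MultiModalKwargs({"pixel_values": torch.zeros(3, 336, 336)})
    ragged = MultiModalKwargs.batch([a, c])
    assert isinstance(ragged["pixel_values"], list)
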
+ """ + + +class MultiModalProcessingMetadataBuiltins(TypedDict, total=False): + """Type annotations for modality types predefined by vLLM.""" + + image: ModalityProcessingMetadata[ImageItem] + video: ModalityProcessingMetadata[VideoItem] + audio: ModalityProcessingMetadata[AudioItem] + + +MultiModalProcessingMetadata: TypeAlias = \ + Mapping[str, ModalityProcessingMetadata[Any]] +""" +A dictionary containing an entry for each modality type to process. + +Note: + This dictionary also accepts modality keys defined outside + :class:`MultiModalProcessingMetadataBuiltins` as long as a customized plugin + is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`. + Read more on that :ref:`here `. +""" + +MultiModalMultiData: TypeAlias = List[_T] +""" +A list of data items, where the number of data items allowed +per modality is restricted by :code:`--limit-mm-per-prompt`. +""" + + +@final +class MultiModalMultiDataBuiltins(TypedDict, total=False): + """Type annotations for modality types predefined by vLLM.""" + + image: MultiModalMultiData[ImageItem] + """The input images.""" + + video: MultiModalMultiData[VideoItem] + """The input videos.""" + + audio: MultiModalMultiData[AudioItem] + """The input audios.""" + + +MultiModalMultiDataDict: TypeAlias = Mapping[str, MultiModalMultiData[Any]] +""" +A dictionary containing an entry for each modality type to input. + +Note: + This dictionary also accepts modality keys defined outside + :class:`MultiModalMultiDataBuiltins` as long as a customized plugin + is registered through the :class:`~vllm.multimodal.MULTIMODAL_REGISTRY`. + Read more on that :ref:`here `. +""" + + +def to_multi_format(data: MultiModalDataDict) -> MultiModalMultiDataDict: + """ + Convert a :class:`MultiModalDataDict` containing single data items + to a :class:`MultiModalMultiDataDict` containing multiple data items + per entry. + """ + multi_data: Mapping[str, MultiModalMultiData[Any]] = {} + + for k, v in data.items(): + # yapf: disable + if k == "video": + # Special case since even a single item can be a list + multi_data[k] = v if is_list_of(v, list) else [v] # type: ignore[index] + elif k in ("image", "audio"): + multi_data[k] = v if isinstance(v, list) else [v] # type: ignore[index] + else: + multi_data[k] = v if isinstance(v, list) else [v] # type: ignore[index] + # yapf: enable + + return multi_data + + +def encode_no_special_tokens( + tokenizer: AnyTokenizer, + text: str, +) -> List[int]: + """ + Backend-agnostic equivalent of HF's + :code:`tokenizer.encode(text, add_special_tokens=False)`. + """ + if isinstance(tokenizer, MistralTokenizer): + return tokenizer.tokenizer.encode(text, bos=False, eos=False) + + return tokenizer.encode(text, add_special_tokens=False) + + +@lru_cache +def candidate_placeholders( + tokenizer: AnyTokenizer, + placeholder_text: str, +) -> Collection[List[int]]: + """Generate token ID sequences that may represent a placeholder text.""" + # When the placeholder text is not mapped to a special token ID, + # it may be tokenized differently based on whether it is at the start/end + # of the string. 
So, we go through each combination of whether the text + # is at the start and end boundaries of the string + + # Matches the placeholder when it is in the middle of the string + start_id, = encode_no_special_tokens(tokenizer, "a") + end_id, = encode_no_special_tokens(tokenizer, "b") + + candidate_basic = encode_no_special_tokens(tokenizer, placeholder_text) + + start_id_, *candidate_a = encode_no_special_tokens( + tokenizer, + f"a{placeholder_text}", + ) + assert start_id == start_id_ + + start_id_, *candidate_ab, end_id_ = encode_no_special_tokens( + tokenizer, + f"a{placeholder_text}b", + ) + assert start_id == start_id_ and end_id == end_id_ + + *candidate_b, end_id_ = encode_no_special_tokens( + tokenizer, + f"{placeholder_text}b", + ) + assert end_id == end_id_ + + # Remove duplicates (need to convert to tuple to be hashable) + unique_candidates = { + tuple(c) + for c in [candidate_basic, candidate_a, candidate_ab, candidate_b] + } + + # Convert back to list + return [list(c) for c in unique_candidates] + + +def apply_placeholders( + token_ids: List[int], + placeholder_ids: List[int], + get_replacement_ids: Callable[[], List[int]], +) -> Optional[PlaceholderRange]: + """ + Find the first occurrence of :code:`placeholder_ids`, + and replace it with the output of :code:`get_replacement_ids`. + + This function updates :code:`token_ids` in place. + """ + placeholder_length = len(placeholder_ids) + + for start_idx in range(len(token_ids) - placeholder_length + 1): + if token_ids[start_idx:placeholder_length] == placeholder_ids: + token_ids[start_idx:placeholder_length] = get_replacement_ids() + + return PlaceholderRange(offset=start_idx, + length=placeholder_length) + + return None + + +class MultiModalProcessor: + """ + Helper class to process multi-modal inputs to be used in vLLM. 
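
To make the ReplacementFunc contract above concrete, here is a hedged sketch of per-modality metadata. The "<image>" placeholder, the token ID, and the "num_image_tokens" key are assumptions for illustration only; a real model would read whatever its HF processor actually reports.

    from vllm.multimodal.processing import ModalityProcessingMetadata

    IMAGE_TOKEN_ID = 32000  # assumed placeholder token ID for this sketch

    def replace_image(image, hf_inputs, item_idx):
        # ReplacementFunc signature: (original item, HF-processed BatchFeature,
        # index of the item) -> token IDs substituted for one placeholder.
        # "num_image_tokens" is a made-up key standing in for the real
        # processor output that encodes the per-image token count.
        num_tokens = int(hf_inputs["num_image_tokens"][item_idx])
        return [IMAGE_TOKEN_ID] * num_tokens

    image_metadata = ModalityProcessingMetadata(
        placeholder_replacements={"<image>": replace_image})
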
+ """ + + def __init__( + self, + ctx: InputProcessingContext, + metadata: MultiModalProcessingMetadata, + ) -> None: + super().__init__() + + self.ctx = ctx + self.metadata = metadata + + def __call__( + self, + prompt: str, + mm_data: MultiModalDataDict, + mm_processor_kwargs: Mapping[str, object], + ) -> MultiModalInputsV2: + return self.apply(prompt, mm_data, mm_processor_kwargs) + + def apply( + self, + prompt: str, + mm_data: MultiModalDataDict, + mm_processor_kwargs: Mapping[str, object], + ) -> MultiModalInputsV2: + tokenizer = self.ctx.tokenizer + hf_processor = self.ctx.get_hf_processor() + + processed_inputs = hf_processor( + text=prompt, # type: ignore + **mm_data, + **mm_processor_kwargs, + ) + new_token_ids, = processed_inputs.pop("input_ids").tolist() + mm_kwargs = MultiModalKwargs(processed_inputs) + + mm_placeholders: Mapping[str, List[PlaceholderRange]] = {} + + for modality, orig_inputs in to_multi_format(mm_data).items(): + assert isinstance(orig_inputs, list) + + metadata = self.metadata[modality] + placeholder_replacements = metadata.placeholder_replacements + + modality_placeholders: List[PlaceholderRange] = [] + + for item_idx, orig_item in enumerate(orig_inputs): + for match_text, replace_fn in placeholder_replacements.items(): + candidates = candidate_placeholders(tokenizer, match_text) + get_replacement_ids = partial( + replace_fn, + orig_item, + processed_inputs, + item_idx, + ) + + for match_ids in candidates: + # TODO(youkaichao): Don't update new_token_ids + placeholders = apply_placeholders( + new_token_ids, + match_ids, + get_replacement_ids, + ) + + if placeholders is not None: + modality_placeholders.append(placeholders) + + # yapf: disable + mm_placeholders[modality] = modality_placeholders # type: ignore[index] + # yapf: enable + + return MultiModalInputsV2( + type="multimodal", + prompt=prompt, + prompt_token_ids=new_token_ids, + mm_kwargs=mm_kwargs, + mm_placeholders=mm_placeholders, + ) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index b844c9e1c2e89..b992442d3b314 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -1,13 +1,20 @@ import functools from collections import UserDict -from typing import TYPE_CHECKING, Any, Dict, Mapping, Optional, Sequence +from typing import (TYPE_CHECKING, Any, Callable, Dict, Mapping, Optional, + Sequence, Type, TypeVar) +import torch.nn as nn +from typing_extensions import TypeAlias + +from vllm.inputs import InputProcessingContext from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer from .audio import AudioPlugin -from .base import (MultiModalDataDict, MultiModalInputMapper, MultiModalKwargs, - MultiModalPlugin, MultiModalTokensCalc, NestedTensors) +from .base import MultiModalInputMapper, MultiModalPlugin, MultiModalTokensCalc from .image import ImagePlugin +from .inputs import MultiModalDataDict, MultiModalKwargs, NestedTensors +from .processing import MultiModalProcessor from .video import VideoPlugin if TYPE_CHECKING: @@ -15,8 +22,18 @@ logger = init_logger(__name__) +N = TypeVar("N", bound=Type[nn.Module]) + +MultiModalProcessorFactory: TypeAlias = Callable[[InputProcessingContext], + MultiModalProcessor] +""" +Constructs a :class:`MultiModalProcessor` instance from the context. + +The processing metadata should be derived from the context. 
+""" + -class _MultiModalLimits(UserDict): +class _MultiModalLimits(UserDict["ModelConfig", Dict[str, int]]): """ Wraps `_limits_by_model` for a more informative error message when attempting to access a model that does not exist. @@ -45,6 +62,9 @@ def __init__( plugins: Sequence[MultiModalPlugin] = DEFAULT_PLUGINS) -> None: self._plugins = {p.get_data_key(): p for p in plugins} + self._processor_factories: Dict[Type[nn.Module], + MultiModalProcessorFactory] = {} + # This is used for non-multimodal models self._disabled_limits_per_plugin = {k: 0 for k in self._plugins} @@ -243,3 +263,59 @@ def get_mm_limits_per_prompt( This should be called after :meth:`init_mm_limits_per_prompt`. """ return self._limits_by_model[model_config] + + def register_processor( + self, + factory: MultiModalProcessorFactory, + ): + """ + Register a multi-modal processor to a model class. + + When the model receives multi-modal data, the provided function is + invoked to transform the data into a dictionary of model inputs. + + See also: + - :ref:`input_processing_pipeline` + - :ref:`enabling_multimodal_inputs` + """ + + def wrapper(model_cls: N) -> N: + if model_cls in self._processor_factories: + logger.warning( + "Model class %s already has an input mapper " + "registered to %s. It is overwritten by the new one.", + model_cls, self) + + self._processor_factories[model_cls] = factory + + return model_cls + + return wrapper + + def has_processor(self, model_config: "ModelConfig") -> bool: + """ + Test whether a multi-modal processor is defined for a specific model. + """ + # Avoid circular import + from vllm.model_executor.model_loader import get_model_architecture + + model_cls, _ = get_model_architecture(model_config) + return model_cls in self._processor_factories + + def create_processor( + self, + model_config: "ModelConfig", + tokenizer: AnyTokenizer, + ) -> MultiModalProcessor: + """ + Create a multi-modal processor for a specific model and tokenizer. 
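
A minimal sketch of how the two halves fit together, assuming a hypothetical model class and leaving the modality metadata as a placeholder: register_processor attaches a factory to the model class, and create_processor (shown next) instantiates it on demand from the model config and tokenizer.

    import torch.nn as nn

    from vllm.inputs import InputProcessingContext
    from vllm.multimodal import MULTIMODAL_REGISTRY
    from vllm.multimodal.processing import MultiModalProcessor

    def _build_processor(ctx: InputProcessingContext) -> MultiModalProcessor:
        # A real model would derive per-modality ModalityProcessingMetadata
        # from ctx; an empty mapping is used here purely as a placeholder.
        return MultiModalProcessor(ctx, metadata={})

    @MULTIMODAL_REGISTRY.register_processor(_build_processor)
    class MyMultiModalModel(nn.Module):
        ...
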
+ """ + + # Avoid circular import + from vllm.model_executor.model_loader import get_model_architecture + + model_cls, _ = get_model_architecture(model_config) + processor_factory = self._processor_factories[model_cls] + + ctx = InputProcessingContext(model_config, tokenizer) + return processor_factory(ctx) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index bee3c25dbd8dd..40194716bbf94 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -11,9 +11,10 @@ import vllm.envs as envs from vllm.connections import global_http_connection from vllm.logger import init_logger -from vllm.multimodal.base import MultiModalDataDict, PlaceholderRange from vllm.transformers_utils.tokenizer import AnyTokenizer, get_tokenizer +from .inputs import MultiModalDataDict, PlaceholderRange + logger = init_logger(__name__) cached_get_tokenizer = lru_cache(get_tokenizer) diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index a518270974f92..ba9bf58a4a20c 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -1,5 +1,5 @@ from functools import lru_cache -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, Optional import numpy as np @@ -9,8 +9,9 @@ from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.utils import is_list_of -from .base import MultiModalData, MultiModalKwargs +from .base import MultiModalData from .image import ImagePlugin +from .inputs import MultiModalKwargs, VideoItem if TYPE_CHECKING: from vllm.config import ModelConfig @@ -20,17 +21,6 @@ cached_get_video_processor = lru_cache(get_video_processor) cached_get_tokenizer = lru_cache(get_tokenizer) -VideoInput = Union[ - "np.ndarray", # single video input - List["np.ndarray"], - # TODO: support more types - # List[Image.Image], List[List[Image.Image]], - # "torch.Tensor", - # List["torch.Tensor"], - # List[List["np.ndarrray"]], - # List[List["torch.Tensor"]], -] - class VideoPlugin(ImagePlugin): """Plugin for video data.""" @@ -53,13 +43,13 @@ def _get_hf_video_processor( def _default_input_mapper( self, ctx: InputContext, - data: MultiModalData[object], + data: MultiModalData[VideoItem], **mm_processor_kwargs, ) -> MultiModalKwargs: model_config = ctx.model_config if isinstance(data, list) and len(data) == 1: - data = data[0] + data = data[0] # type: ignore if isinstance(data, np.ndarray) or is_list_of(data, np.ndarray): video_processor = self._get_hf_video_processor( diff --git a/vllm/sequence.py b/vllm/sequence.py index 1370cb5c4f9d2..3b41d25a2fe42 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -5,25 +5,21 @@ from array import array from collections import defaultdict from dataclasses import dataclass, field -from functools import cached_property, reduce -from typing import (TYPE_CHECKING, Any, Callable, DefaultDict, Dict, List, - Mapping, Optional) +from functools import reduce +from typing import Any, Callable, DefaultDict, Dict, List, Mapping, Optional from typing import Sequence as GenericSequence from typing import Set, Tuple, Union import msgspec import torch -from typing_extensions import assert_never +from vllm.inputs import SingletonInputs, SingletonInputsAdapter from vllm.lora.request import LoRARequest from vllm.multimodal import MultiModalDataDict, MultiModalPlaceholderDict from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import RequestOutputKind, SamplingParams -if TYPE_CHECKING: - from vllm.inputs 
import SingletonInputs - VLLM_TOKEN_ID_ARRAY_TYPE = "l" VLLM_INVALID_TOKEN_ID = -1 @@ -407,14 +403,14 @@ class Sequence: def __init__( self, seq_id: int, - inputs: "SingletonInputs", + inputs: SingletonInputs, block_size: int, eos_token_id: Optional[int] = None, lora_request: Optional[LoRARequest] = None, prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> None: self.seq_id = seq_id - self.inputs = inputs + self.inputs = SingletonInputsAdapter(inputs) self.block_size = block_size self.eos_token_id = eos_token_id self.lora_request = lora_request @@ -441,59 +437,29 @@ def __init__( def n_blocks(self) -> int: return (self.get_len() + self.block_size - 1) // self.block_size - @cached_property + @property def prompt(self) -> Optional[str]: - inputs = self.inputs - - if inputs["type"] == "token": - return inputs.get("prompt") + return self.inputs.prompt - assert_never(inputs) - - @cached_property + @property def prompt_token_ids(self) -> List[int]: - inputs = self.inputs - - if inputs["type"] == "token": - return inputs.get("prompt_token_ids", []) + return self.inputs.prompt_token_ids - assert_never(inputs) - - @cached_property + @property def prompt_embeds(self) -> Optional[torch.Tensor]: - inputs = self.inputs - - if inputs["type"] == "token": - return None - - assert_never(inputs) + return self.inputs.prompt_embeds - @cached_property + @property def multi_modal_data(self) -> "MultiModalDataDict": - inputs = self.inputs - - if inputs["type"] == "token": - return inputs.get("multi_modal_data", {}) - - assert_never(inputs) - - @cached_property - def mm_processor_kwargs(self) -> Dict[str, Any]: - inputs = self.inputs - - if inputs["type"] == "token": - return inputs.get("mm_processor_kwargs", {}) - - assert_never(inputs) + return self.inputs.multi_modal_data @property def multi_modal_placeholders(self) -> MultiModalPlaceholderDict: - inputs = self.inputs - - if inputs["type"] == "token": - return inputs.get("multi_modal_placeholders", {}) + return self.inputs.multi_modal_placeholders - assert_never(inputs) + @property + def mm_processor_kwargs(self) -> Dict[str, Any]: + return self.inputs.mm_processor_kwargs @property def lora_int_id(self) -> int: diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 2d7c58cfea13b..09bff9655a882 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -6,6 +6,7 @@ from vllm.engine.metrics_types import StatLoggerBase from vllm.engine.protocol import EngineClient from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType +from vllm.inputs.preprocess import InputPreprocessor from vllm.logger import init_logger from vllm.lora.request import LoRARequest from vllm.outputs import EmbeddingRequestOutput, RequestOutput @@ -321,6 +322,9 @@ async def get_model_config(self) -> ModelConfig: async def get_decoding_config(self): raise ValueError("Not Supported on V1 yet.") + async def get_input_preprocessor(self) -> InputPreprocessor: + return self.processor.input_preprocessor + async def get_tokenizer( self, lora_request: Optional[LoRARequest] = None, diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 5b45615a1b85b..4ebfff9584267 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -7,6 +7,7 @@ from vllm.inputs import INPUT_REGISTRY, InputRegistry, PromptType from vllm.logger import init_logger from vllm.lora.request import LoRARequest +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.outputs import RequestOutput from 
vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest @@ -32,6 +33,7 @@ def __init__( usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, input_registry: InputRegistry = INPUT_REGISTRY, + mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, use_cached_outputs: bool = False, multiprocess_mode: bool = False, ) -> None: @@ -50,7 +52,7 @@ def __init__( # Processor (convert Inputs --> EngineCoreRequests) self.processor = Processor(vllm_config.model_config, vllm_config.lora_config, self.tokenizer, - input_registry) + input_registry, mm_registry) # Detokenizer (converts EngineCoreOutputs --> RequestOutput) self.detokenizer = Detokenizer( diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 5f13cbf2e4036..5c1577190c75a 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -2,15 +2,17 @@ from typing import Any, Dict, Mapping, Optional, Tuple, Union from vllm.config import LoRAConfig, ModelConfig -from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, - EncoderDecoderLLMInputs, InputRegistry, PromptType) +from vllm.inputs import (INPUT_REGISTRY, InputRegistry, ProcessorInputs, + PromptType, SingletonInputsAdapter) +from vllm.inputs.parse import is_encoder_decoder_inputs from vllm.inputs.preprocess import InputPreprocessor from vllm.lora.request import LoRARequest +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.pooling_params import PoolingParams from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sampling_params import SamplingParams from vllm.transformers_utils.config import try_get_generation_config -from vllm.transformers_utils.tokenizer_group import AnyTokenizer +from vllm.transformers_utils.tokenizer_group import BaseTokenizerGroup from vllm.v1.engine import DetokenizerRequest, EngineCoreRequest @@ -20,8 +22,9 @@ def __init__( self, model_config: ModelConfig, lora_config: Optional[LoRAConfig], - tokenizer: AnyTokenizer, + tokenizer: BaseTokenizerGroup, input_registry: InputRegistry = INPUT_REGISTRY, + mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ): self.model_config = model_config @@ -31,7 +34,8 @@ def __init__( self.generation_config_fields = _load_generation_config_dict( model_config) self.input_preprocessor = InputPreprocessor(model_config, - self.tokenizer) + self.tokenizer, + mm_registry) self.input_processor = input_registry.create_input_processor( model_config) @@ -73,6 +77,19 @@ def process_inputs( self._validate_model_inputs(processed_inputs) eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request) + if is_encoder_decoder_inputs(processed_inputs): + decoder_inputs = SingletonInputsAdapter( + processed_inputs["decoder"]) + encoder_inputs = SingletonInputsAdapter( + processed_inputs["encoder"]) + else: + decoder_inputs = SingletonInputsAdapter(processed_inputs) + encoder_inputs = None + + # TODO: Impl encoder-decoder + if encoder_inputs is not None: + raise NotImplementedError + assert isinstance(params, SamplingParams) # TODO: can we avoid cloning here in multiproc case sampling_params = params.clone() @@ -81,27 +98,43 @@ def process_inputs( # Make Request for Detokenizer. 
detokenizer_request = DetokenizerRequest( - request_id, processed_inputs.get("prompt"), - processed_inputs.get("prompt_token_ids"), + request_id, + decoder_inputs.prompt, + decoder_inputs.prompt_token_ids, sampling_params.skip_special_tokens, sampling_params.spaces_between_special_tokens, - sampling_params.output_kind, sampling_params.stop, - sampling_params.include_stop_str_in_output) + sampling_params.output_kind, + sampling_params.stop, + sampling_params.include_stop_str_in_output, + ) # Make Request for EngineCore. engine_core_request = EngineCoreRequest( - request_id, processed_inputs.get("prompt"), - processed_inputs.get("prompt_token_ids"), - processed_inputs.get("multi_modal_data"), - processed_inputs.get("multi_modal_placeholders"), - processed_inputs.get("mm_processor_kwargs"), sampling_params, - eos_token_id, arrival_time, lora_request) + request_id, + decoder_inputs.prompt, + decoder_inputs.prompt_token_ids, + decoder_inputs.multi_modal_data, + decoder_inputs.multi_modal_placeholders, + decoder_inputs.mm_processor_kwargs, + sampling_params, + eos_token_id, + arrival_time, + lora_request, + ) return detokenizer_request, engine_core_request - def _validate_model_inputs(self, inputs: Union[DecoderOnlyInputs, - EncoderDecoderLLMInputs]): - prompt_ids = inputs.get("prompt_token_ids") + def _validate_model_inputs(self, inputs: ProcessorInputs): + if is_encoder_decoder_inputs(inputs): + # For encoder-decoder multimodal models, the max_prompt_len + # restricts the decoder prompt length + prompt_inputs = inputs["decoder" if self.model_config. + is_multimodal_model else "encoder"] + else: + prompt_inputs = inputs + + prompt_ids = SingletonInputsAdapter(prompt_inputs).prompt_token_ids + if prompt_ids is None or len(prompt_ids) == 0: raise ValueError("Prompt cannot be empty") @@ -117,6 +150,10 @@ def _validate_model_inputs(self, inputs: Union[DecoderOnlyInputs, "inputs, the number of image tokens depends on the number " "of images, and possibly their aspect ratios as well.") + # TODO: Find out how many placeholder tokens are there so we can + # check that chunked prefill does not truncate them + # max_batch_len = self.scheduler_config.max_num_batched_tokens + def _load_generation_config_dict(model_config: ModelConfig) -> Dict[str, Any]: config = try_get_generation_config( diff --git a/vllm/v1/request.py b/vllm/v1/request.py index f35cf738c89bf..51fb4003e5fe0 100644 --- a/vllm/v1/request.py +++ b/vllm/v1/request.py @@ -1,7 +1,7 @@ import enum -from typing import TYPE_CHECKING, List, Optional, Union +from typing import List, Optional, Union -from vllm.inputs.data import DecoderOnlyInputs +from vllm.inputs import DecoderOnlyInputs, SingletonInputsAdapter, token_inputs from vllm.lora.request import LoRARequest from vllm.multimodal import MultiModalKwargs from vllm.sampling_params import SamplingParams @@ -9,23 +9,20 @@ from vllm.v1.engine import EngineCoreRequest from vllm.v1.utils import ConstantList -if TYPE_CHECKING: - from vllm.inputs import DecoderOnlyInputs - class Request: def __init__( self, request_id: str, - inputs: "DecoderOnlyInputs", + inputs: DecoderOnlyInputs, sampling_params: SamplingParams, eos_token_id: Optional[int], arrival_time: float, lora_request: Optional[LoRARequest] = None, ) -> None: self.request_id = request_id - self.inputs = inputs + self.inputs = SingletonInputsAdapter(inputs) self.sampling_params = sampling_params # Because of LoRA, the eos token id can be different for each request. 
self.eos_token_id = eos_token_id @@ -41,17 +38,17 @@ def __init__( assert sampling_params.max_tokens is not None self.max_tokens = sampling_params.max_tokens - self.prompt = inputs.get("prompt") - self.prompt_token_ids = inputs["prompt_token_ids"] + self.prompt = self.inputs.prompt + self.prompt_token_ids = self.inputs.prompt_token_ids self.num_prompt_tokens = len(self.prompt_token_ids) self._output_token_ids: List[int] = [] self._all_token_ids: List[int] = self.prompt_token_ids.copy() self.num_computed_tokens = 0 # Raw multimodal data before the mm input mapper (e.g., PIL images). - self.mm_data = inputs.get("multi_modal_data") - self.mm_processor_kwargs = inputs.get("mm_processor_kwargs") - mm_positions = inputs.get("multi_modal_placeholders") + self.mm_data = self.inputs.multi_modal_data + self.mm_processor_kwargs = self.inputs.mm_processor_kwargs + mm_positions = self.inputs.multi_modal_placeholders if mm_positions: # FIXME(woosuk): Support other modalities. self.mm_positions = mm_positions.get("image", []) @@ -64,8 +61,7 @@ def __init__( def from_engine_core_request(cls, request: EngineCoreRequest) -> "Request": return cls( request_id=request.request_id, - inputs=DecoderOnlyInputs( - type="token", + inputs=token_inputs( prompt_token_ids=request.prompt_token_ids, prompt=request.prompt, multi_modal_data=request.mm_data, @@ -114,7 +110,7 @@ def get_finished_reason(self) -> Union[str, None]: return RequestStatus.get_finished_reason(self.status) def has_encoder_inputs(self) -> bool: - return self.mm_data is not None + return len(self.mm_data) > 0 @property def num_encoder_inputs(self) -> int: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 81480786a09e1..eebd1de96537f 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -28,7 +28,7 @@ from vllm.v1.sample.metadata import SamplingMetadata if TYPE_CHECKING: - from vllm.multimodal.base import PlaceholderRange + from vllm.multimodal.inputs import PlaceholderRange from vllm.v1.core.scheduler import SchedulerOutput logger = init_logger(__name__) diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index 09c62fbb9875f..d3e1202c15e61 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -148,19 +148,29 @@ def build(self) -> ModelInputForCPU: query_lens=seq_lens, ) - def _compute_multi_modal_input(self, seq_group: SequenceGroupMetadata, - seq_data: SequenceData, computed_len: int, - mm_processor_kwargs: Dict[str, Any]): - + def _compute_multi_modal_input( + self, + seq_data: SequenceData, + computed_len: int, + seq_group_metadata: SequenceGroupMetadata, + ): # NOTE: mm_data only includes the subset of multi-modal items that # intersect with the current prefill positions. mm_data, placeholder_maps = MultiModalPlaceholderMap.from_seq_group( - seq_group, range(computed_len, len(seq_data.get_token_ids()))) + seq_group_metadata, + range(computed_len, len(seq_data.get_token_ids())), + ) if not mm_data: - return + return None, None, None - mm_kwargs = self.multi_modal_input_mapper(mm_data, mm_processor_kwargs) + if self.runner.mm_registry.has_processor(self.runner.model_config): + mm_kwargs = mm_data + else: + mm_kwargs = self.multi_modal_input_mapper( + mm_data, + seq_group_metadata.mm_processor_kwargs, + ) # special processing for mrope position deltas. 
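
The SingletonInputsAdapter pattern used in Sequence and Request above swaps per-field TypedDict access for uniform properties. A minimal sketch, assuming the adapter resolves optional fields to empty containers (which is what lets Request check len(self.mm_data) > 0):

    from vllm.inputs import SingletonInputsAdapter, token_inputs

    inputs = token_inputs(prompt_token_ids=[1, 2, 3], prompt="Hello")
    adapter = SingletonInputsAdapter(inputs)

    assert adapter.prompt == "Hello"
    assert adapter.prompt_token_ids == [1, 2, 3]
    # Optional fields come back as empty containers rather than None.
    assert adapter.multi_modal_data == {}
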
mrope_positions = None @@ -202,7 +212,7 @@ def _prepare_prompt( slot_mapping: List[int] = [] seq_lens: List[int] = [] - multi_model_kwargs_list: List[MultiModalKwargs] = [] + multi_modal_kwargs_list: List[MultiModalKwargs] = [] multi_modal_placeholder_maps: Dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) @@ -223,11 +233,14 @@ def _prepare_prompt( mrope_positions = None if seq_group_metadata.multi_modal_data: - mm_kwargs, placeholder_maps, mrope_positions = self \ - ._compute_multi_modal_input( - seq_group_metadata, seq_data, computed_len, - seq_group_metadata.mm_processor_kwargs) - multi_model_kwargs_list.append(mm_kwargs) + ( + mm_kwargs, + placeholder_maps, + mrope_positions, + ) = self._compute_multi_modal_input(seq_data, computed_len, + seq_group_metadata) + + multi_modal_kwargs_list.append(mm_kwargs) for modality, placeholder_map in placeholder_maps.items(): multi_modal_placeholder_maps[modality].extend( placeholder_map) @@ -302,7 +315,7 @@ def _prepare_prompt( multi_modal_placeholder_index_maps=placeholder_index_maps, ) - multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) return (input_tokens, input_positions, attn_metadata, seq_lens, multi_modal_kwargs) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 92d6552b2f428..1ff30d685c6b1 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -716,7 +716,7 @@ def _prepare_prompt( context_lens: List[int] = [] query_lens: List[int] = [] prefix_block_tables: List[List[int]] = [] - multi_model_kwargs_list: List[MultiModalKwargs] = [] + multi_modal_kwargs_list: List[MultiModalKwargs] = [] if len(seq_group_metadata_list) == 0: return PreparePromptMetadata.empty() @@ -777,7 +777,7 @@ def _prepare_prompt( mm_data = seq_group_metadata.multi_modal_data if mm_data: mm_kwargs = self.multi_modal_input_mapper(mm_data) - multi_model_kwargs_list.append(mm_kwargs) + multi_modal_kwargs_list.append(mm_kwargs) if seq_group_metadata.block_tables is None: # During memory profiling, the block tables are not initialized @@ -876,7 +876,7 @@ def _prepare_prompt( multi_modal_placeholder_index_maps= None # FIXME(kzawora): mutli-modality will not work here ) - multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) return PreparePromptMetadata(input_tokens=input_tokens, input_positions=input_positions, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 2da02f21f8342..042f9f07eace6 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -252,7 +252,7 @@ def __init__( prompt_adapter_request: Optional[PromptAdapterRequest] = None, # Multi-modal inputs. 
- multi_model_kwargs: Optional[MultiModalKwargs] = None, + multi_modal_kwargs: Optional[MultiModalKwargs] = None, multi_modal_placeholder_maps: Optional[Dict[ str, MultiModalPlaceholderMap]] = None, @@ -373,7 +373,7 @@ def __init__( prompt_adapter_prompt_mapping or []) self.prompt_adapter_request = prompt_adapter_request - self.multi_model_kwargs = multi_model_kwargs + self.multi_modal_kwargs = multi_modal_kwargs self.multi_modal_placeholder_maps = multi_modal_placeholder_maps self.prefix_cache_hit = prefix_cache_hit @@ -661,10 +661,15 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, if not mm_data: return - mm_kwargs = self.multi_modal_input_mapper( - mm_data, - mm_processor_kwargs=seq_group_metadata.mm_processor_kwargs) - inter_data.multi_model_kwargs = mm_kwargs + if self.runner.mm_registry.has_processor(self.runner.model_config): + mm_kwargs = mm_data + else: + mm_kwargs = self.multi_modal_input_mapper( + mm_data, + seq_group_metadata.mm_processor_kwargs, + ) + + inter_data.multi_modal_kwargs = mm_kwargs inter_data.multi_modal_placeholder_maps = placeholder_maps # special processing for mrope position deltas. @@ -938,11 +943,11 @@ def build(self) -> ModelInputForGPU: ) # Multi-modal data. - multi_model_kwargs_list = [ - data.multi_model_kwargs for data in self.inter_data_list - if data.multi_model_kwargs is not None + multi_modal_kwargs_list = [ + data.multi_modal_kwargs for data in self.inter_data_list + if data.multi_modal_kwargs is not None ] - multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) return self.model_input_cls( input_tokens=input_tokens_tensor, diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index 0ed33e435aa2f..ae4eb6ba6eaec 100644 --- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -67,7 +67,8 @@ def __init__( self.pin_memory = is_pin_memory_available() # Multi-modal data support - self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \ + self.mm_registry = MULTIMODAL_REGISTRY + self.multi_modal_input_mapper = self.mm_registry \ .create_input_mapper(self.model_config) # Lazy initialization. 
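
The same has_processor dispatch recurs across the CPU, GPU, Neuron, OpenVINO and XPU runners in this patch. A condensed sketch of the shared pattern, with runner standing in for whichever model runner holds the registry and the legacy mapper:

    def _to_mm_kwargs(runner, seq_group_metadata, mm_data):
        """Condensed form of the dispatch added to the model runners."""
        if runner.mm_registry.has_processor(runner.model_config):
            # A multi-modal processor is registered for this model, so the
            # data is expected to already be in kwargs form; skip the
            # legacy input mapper to avoid processing it twice.
            return mm_data
        return runner.multi_modal_input_mapper(
            mm_data,
            seq_group_metadata.mm_processor_kwargs,
        )
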
@@ -122,7 +123,7 @@ def _prepare_prompt( input_block_ids: List[int] = [] seq_lens: List[int] = [] - multi_model_kwargs_list: List[MultiModalKwargs] = [] + multi_modal_kwargs_list: List[MultiModalKwargs] = [] for seq_group_metadata in seq_group_metadata_list: assert seq_group_metadata.is_prompt seq_ids = list(seq_group_metadata.seq_data.keys()) @@ -144,12 +145,15 @@ def _prepare_prompt( mm_data = seq_group_metadata.multi_modal_data if mm_data: - # Process multi-modal data - mm_kwargs = self.multi_modal_input_mapper( - mm_data, - mm_processor_kwargs=seq_group_metadata.mm_processor_kwargs, - ) - multi_model_kwargs_list.append(mm_kwargs) + if self.mm_registry.has_processor(self.model_config): + mm_kwargs = mm_data + else: + mm_kwargs = self.multi_modal_input_mapper( + mm_data, + seq_group_metadata.mm_processor_kwargs, + ) + + multi_modal_kwargs_list.append(mm_kwargs) max_seq_len = max(seq_lens) assert max_seq_len > 0 @@ -167,7 +171,7 @@ def _prepare_prompt( dtype=torch.long, device=self.device) - multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) return (input_tokens, input_positions, input_block_ids, seq_lens, multi_modal_kwargs) diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py index 378e1e06039b2..6000e5dfe4e30 100644 --- a/vllm/worker/openvino_model_runner.py +++ b/vllm/worker/openvino_model_runner.py @@ -70,7 +70,8 @@ def __init__( ) # Multi-modal data support - self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \ + self.mm_registry = MULTIMODAL_REGISTRY + self.multi_modal_input_mapper = self.mm_registry \ .create_input_mapper(self.model_config) # Lazy initialization. @@ -102,7 +103,7 @@ def _prepare_model_input( seq_lens: List[int] = [] past_lens: List[int] = [] query_lens: List[int] = [] - multi_model_kwargs_list: List[MultiModalKwargs] = [] + multi_modal_kwargs_list: List[MultiModalKwargs] = [] multi_modal_placeholder_maps: Dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) @@ -222,11 +223,15 @@ def _prepare_model_input( mm_data, placeholder_maps = MultiModalPlaceholderMap \ .from_seq_group(seq_group_metadata, positions_range) - mm_kwargs = self.multi_modal_input_mapper( - mm_data, - mm_processor_kwargs=seq_group_metadata. 
- mm_processor_kwargs) - multi_model_kwargs_list.append(mm_kwargs) + if self.mm_registry.has_processor(self.model_config): + mm_kwargs = mm_data + else: + mm_kwargs = self.multi_modal_input_mapper( + mm_data, + seq_group_metadata.mm_processor_kwargs, + ) + + multi_modal_kwargs_list.append(mm_kwargs) for modality, placeholder_map in placeholder_maps.items(): multi_modal_placeholder_maps[modality].extend( @@ -275,7 +280,7 @@ def _prepare_model_input( multi_modal_placeholder_index_maps=placeholder_index_maps, ) - multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) return ModelInput( input_tokens, diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index c9e637c057979..e6322e095bbb9 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -160,7 +160,7 @@ def _prepare_prompt( input_positions: List[int] = [] slot_mapping: List[int] = [] seq_lens: List[int] = [] - multi_model_kwargs_list: List[MultiModalKwargs] = [] + multi_modal_kwargs_list: List[MultiModalKwargs] = [] multi_modal_placeholder_maps: Dict[ str, MultiModalPlaceholderMap] = defaultdict(MultiModalPlaceholderMap) @@ -191,8 +191,16 @@ def _prepare_prompt( mm_data, placeholder_maps = MultiModalPlaceholderMap \ .from_seq_group(seq_group_metadata, positions_range) - mm_kwargs = self.runner.multi_modal_input_mapper(mm_data) - multi_model_kwargs_list.append(mm_kwargs) + if self.runner.mm_registry.has_processor( + self.runner.model_config): + mm_kwargs = mm_data + else: + mm_kwargs = self.runner.multi_modal_input_mapper( + mm_data, + seq_group_metadata.mm_processor_kwargs, + ) + + multi_modal_kwargs_list.append(mm_kwargs) for modality, placeholder_map in placeholder_maps.items(): multi_modal_placeholder_maps[modality].extend( @@ -264,7 +272,7 @@ def _prepare_prompt( block_tables=torch.tensor([], device=self.device, dtype=torch.int), ) - multi_modal_kwargs = MultiModalKwargs.batch(multi_model_kwargs_list) + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) return (input_tokens, input_positions, attn_metadata, seq_lens, multi_modal_kwargs) From ac49b59d8b01ffb9979e18e67b252d45410bc1e6 Mon Sep 17 00:00:00 2001 From: HoangCongDuc <55457046+HoangCongDuc@users.noreply.github.com> Date: Thu, 14 Nov 2024 00:56:39 +0800 Subject: [PATCH 127/183] [Bugfix] bitsandbytes models fail to run pipeline parallel (#10200) Signed-off-by: Hoang Cong Duc --- tests/quantization/test_bitsandbytes.py | 30 +++++++++++++++++++++- vllm/model_executor/model_loader/loader.py | 6 +++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py index 0f01f5f819ea4..569fc8dfb6a21 100644 --- a/tests/quantization/test_bitsandbytes.py +++ b/tests/quantization/test_bitsandbytes.py @@ -9,7 +9,7 @@ import torch from tests.quantization.utils import is_quant_method_supported -from tests.utils import fork_new_process_for_each_test +from tests.utils import compare_two_settings, fork_new_process_for_each_test models_4bit_to_test = [ ("facebook/opt-125m", "quantize opt model inflight"), @@ -82,6 +82,34 @@ def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts, vllm_tp_size=2) +@pytest.mark.skipif(torch.cuda.device_count() < 2, + reason='Test requires at least 2 GPUs.') +@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"), + reason='bitsandbytes is not supported on this GPU type.') 
+@pytest.mark.parametrize("model_name, description", models_4bit_to_test) +@fork_new_process_for_each_test +def test_load_pp_4bit_bnb_model(model_name, description) -> None: + common_args = [ + "--disable-log-stats", + "--disable-log-requests", + "--dtype", + "bfloat16", + "--enable-prefix-caching", + "--quantization", + "bitsandbytes", + "--load-format", + "bitsandbytes", + "--gpu-memory-utilization", + "0.7", + ] + pp_args = [ + *common_args, + "--pipeline-parallel-size", + "2", + ] + compare_two_settings(model_name, common_args, pp_args) + + def log_generated_texts(prompts, outputs, runner_name): logged_texts = [] for i, (_, generated_text) in enumerate(outputs): diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 8d3024534734b..715e6c11f86ce 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -991,7 +991,13 @@ def _load_weights(self, model_config: ModelConfig, param_dict = dict(model.named_parameters()) stacked_quant_state_dict: Dict[str, Dict[int, Any]] = {} + # TODO: Change this lazy import to normal import + # after the checks are updated to run on a new version + from vllm.model_executor.models.utils import is_pp_missing_parameter for quant_param_name in quant_state_dict: + if is_pp_missing_parameter(quant_param_name, model): + continue + non_stacked_param_name = quant_param_name shard_index = 0 From 15bb8330aa50ca6ec86f827a0fe79134b1dbac8c Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Thu, 14 Nov 2024 10:54:59 +0800 Subject: [PATCH 128/183] [Bugfix] Fix tensor parallel for qwen2 classification model (#10297) Signed-off-by: Isotr0py <2037008807@qq.com> --- tests/models/embedding/language/test_cls_models.py | 6 +++--- vllm/model_executor/models/qwen2_cls.py | 7 ++++++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/tests/models/embedding/language/test_cls_models.py b/tests/models/embedding/language/test_cls_models.py index d8ca6d361f0e3..40ee49cf60742 100644 --- a/tests/models/embedding/language/test_cls_models.py +++ b/tests/models/embedding/language/test_cls_models.py @@ -21,14 +21,14 @@ def test_classification_models( model: str, dtype: str, ) -> None: + with vllm_runner(model, dtype=dtype) as vllm_model: + vllm_outputs = vllm_model.classify(example_prompts) + with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSequenceClassification) as hf_model: hf_outputs = hf_model.classify(example_prompts) - with vllm_runner(model, dtype=dtype) as vllm_model: - vllm_outputs = vllm_model.classify(example_prompts) - print(hf_outputs, vllm_outputs) # check logits difference diff --git a/vllm/model_executor/models/qwen2_cls.py b/vllm/model_executor/models/qwen2_cls.py index 020af88aadd98..27eb7e8a93975 100644 --- a/vllm/model_executor/models/qwen2_cls.py +++ b/vllm/model_executor/models/qwen2_cls.py @@ -69,9 +69,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.model = Qwen2Model(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) + # hidden_states from Qwen2Model has been reduced, + # the input of score layer is not parallelized. 
self.score = RowParallelLinear(config.hidden_size, config.num_labels, - quant_config=quant_config) + quant_config=quant_config, + input_is_parallel=False, + bias=False, + prefix=maybe_prefix(prefix, "score")) self._pooler = Pooler.from_config_with_defaults( pooler_config, pooling_type=PoolingType.LAST, From 504ac53d18fc057d2a98741fa27d89df9054422d Mon Sep 17 00:00:00 2001 From: youkaichao Date: Wed, 13 Nov 2024 18:55:39 -0800 Subject: [PATCH 129/183] [misc] error early for old-style class (#10304) Signed-off-by: youkaichao --- docs/source/design/class_hierarchy.rst | 39 +++++++++++++++++++ vllm/model_executor/model_loader/loader.py | 17 +++++++- vllm/model_executor/models/arctic.py | 2 +- vllm/model_executor/models/bloom.py | 6 +-- vllm/model_executor/models/chatglm.py | 6 +-- vllm/model_executor/models/dbrx.py | 6 +-- vllm/model_executor/models/eagle.py | 2 +- vllm/model_executor/models/falcon.py | 6 +-- vllm/model_executor/models/fuyu.py | 2 +- vllm/model_executor/models/gpt2.py | 6 +-- vllm/model_executor/models/gpt_bigcode.py | 6 +-- vllm/model_executor/models/gpt_j.py | 6 +-- vllm/model_executor/models/gpt_neox.py | 6 +-- vllm/model_executor/models/internvl.py | 2 +- vllm/model_executor/models/jais.py | 6 +-- vllm/model_executor/models/llava.py | 2 +- vllm/model_executor/models/llava_next.py | 2 +- .../model_executor/models/llava_next_video.py | 2 +- vllm/model_executor/models/llava_onevision.py | 2 +- vllm/model_executor/models/medusa.py | 2 +- vllm/model_executor/models/utils.py | 10 ++--- 21 files changed, 75 insertions(+), 63 deletions(-) diff --git a/docs/source/design/class_hierarchy.rst b/docs/source/design/class_hierarchy.rst index b3404f6b936e7..15f0c8ccf77ee 100644 --- a/docs/source/design/class_hierarchy.rst +++ b/docs/source/design/class_hierarchy.rst @@ -26,6 +26,45 @@ There are several important design choices behind this class hierarchy: 2. **Uniformity**: The model runner needs a unified interface to create and initialize the model. vLLM supports more than 50 types of popular open-source models. Each model has its own initialization logic. If the constructor signature varies with models, the model runner does not know how to call the constructor accordingly, without complicated and error-prone inspection logic. By making the constructor of the model class uniform, the model runner can easily create and initialize the model without knowing the specific model type. This is also useful for composing models. Vision-language models often consist of a vision model and a language model. By making the constructor uniform, we can easily create a vision model and a language model and compose them into a vision-language model. +.. note:: + + To support this change, all vLLM models' signatures have been updated to: + + .. code-block:: python + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + + To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one: + + .. 
code-block:: python + + class MyOldModel(nn.Module): + def __init__( + self, + config, + cache_config: Optional[CacheConfig] = None, + quant_config: Optional[QuantizationConfig] = None, + lora_config: Optional[LoRAConfig] = None, + prefix: str = "", + ) -> None: + ... + + from vllm.config import VllmConfig + class MyNewModel(MyOldModel): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + config = vllm_config.model_config.hf_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + super().__init__(config, cache_config, quant_config, lora_config, prefix) + + if __version__ >= "0.6.4": + MyModel = MyNewModel + else: + MyModel = MyOldModel + + This way, the model can work with both old and new versions of vLLM. + 3. **Sharding and Quantization at Initialization**: Certain features require changing the model weights. For example, tensor parallelism needs to shard the model weights, and quantization needs to quantize the model weights. There are two possible ways to implement this feature. One way is to change the model weights after the model is initialized. The other way is to change the model weights during the model initialization. vLLM chooses the latter. The first approach is not scalable to large models. Suppose we want to run a 405B model (with roughly 810GB weights) with 16 H100 80GB GPUs. Ideally, every GPU should only load 50GB weights. If we change the model weights after the model is initialized, we need to load the full 810GB weights to every GPU and then shard the weights, leading to a huge memory overhead. Instead, if we shard the weights during the model initialization, every layer will only create a shard of the weights it needs, leading to a much smaller memory overhead. The same idea applies to quantization. Note that we also add an additional argument ``prefix`` to the model's constructor so that the model can initialize itself differently based on the prefix. This is useful for non-uniform quantization, where different parts of the model are quantized differently. The ``prefix`` is usually an empty string for the top-level model and a string like ``"vision"`` or ``"language"`` for the sub-models. In general, it matches the name of the module's state dict in the checkpoint file. One disadvantage of this design is that it is hard to write unit tests for individual components in vLLM because every component needs to be initialized by a complete config object. We solve this problem by providing a default initialization function that creates a default config object with all fields set to ``None``. If the component we want to test only cares about a few fields in the config object, we can create a default config object and set the fields we care about. This way, we can test the component in isolation. Note that many tests in vLLM are end-to-end tests that test the whole system, so this is not a big problem. 
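
The loader-side enforcement introduced next relies on keyword-only parameters being visible through inspect.signature. A standalone sketch of the same check, using a hypothetical model class:

    import inspect

    class MyNewModel:
        def __init__(self, *, vllm_config, prefix: str = ""):
            ...

    sig = inspect.signature(MyNewModel.__init__)
    kw_only = [
        param.name for param in sig.parameters.values()
        if param.kind == inspect.Parameter.KEYWORD_ONLY
    ]
    # Both names must be keyword-only; otherwise the loader treats the class
    # as an old-style model and raises an error pointing to the docs above.
    assert "vllm_config" in kw_only and "prefix" in kw_only
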
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 715e6c11f86ce..5bcae37961195 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -4,6 +4,7 @@ import dataclasses import fnmatch import glob +import inspect import json import math import os @@ -88,11 +89,23 @@ def device_loading_context(module: torch.nn.Module, logger = init_logger(__name__) -def _initialize_model(vllm_config: VllmConfig) -> nn.Module: +def _initialize_model(vllm_config: VllmConfig, prefix: str = "") -> nn.Module: """Initialize a model with the given configurations.""" model_config = vllm_config.model_config model_class, _ = get_model_architecture(model_config) - return model_class(vllm_config=vllm_config) + signatures = inspect.signature(model_class.__init__) + # collect all kw-only parameters + kw_only_params = [ + param.name for param in signatures.parameters.values() + if param.kind == inspect.Parameter.KEYWORD_ONLY + ] + assert "vllm_config" in kw_only_params and "prefix" in kw_only_params, \ + ("vLLM model class must accept `vllm_config` and `prefix` as kw-only " + "arguments. Possibly you have an old-style model class registered from " + "out of tree and it is used for new vLLM version. " + "Please check https://docs.vllm.ai/en/latest/design/class_hierarchy.html " + "for the design and update the model class accordingly.") + return model_class(vllm_config=vllm_config, prefix=prefix) class BaseModelLoader(ABC): diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index 7d4b9654b54ab..9ee2a2cc09a24 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -415,7 +415,7 @@ def forward( class ArcticForCausalLM(nn.Module, SupportsPP): - def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 2c14519fb9e0e..84adf574af5e2 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -281,11 +281,7 @@ def forward( class BloomForCausalLM(nn.Module, SupportsPP): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 6ec2d5a2a3909..70e9b607b0642 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -593,11 +593,7 @@ class ChatGLMForCausalLM(nn.Module, SupportsLoRA, SupportsPP, embedding_modules = {} embedding_padding_modules = [] - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index d5f9b903183d4..fff8710f6b475 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -350,11 +350,7 @@ def forward( class DbrxForCausalLM(nn.Module, SupportsPP): - def __init__( - self, - vllm_config: VllmConfig, - 
prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py index c902829994c7c..85c51e8404584 100644 --- a/vllm/model_executor/models/eagle.py +++ b/vllm/model_executor/models/eagle.py @@ -36,7 +36,7 @@ class EAGLE(nn.Module): in the draft checkpoint (using key token_map). Also, the draft config needs to have truncated_vocab_size (=k) as an attribute.""" - def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config self.config = config diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index 562ee5517e7f1..dcfcb6694feb5 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -401,11 +401,7 @@ class FalconForCausalLM(nn.Module, SupportsPP): ".dense_4h_to_h.", ] - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index b39dfe706e0df..50701793b7b83 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -225,7 +225,7 @@ def input_mapper_for_fuyu(ctx: InputContext, data: object): @INPUT_REGISTRY.register_input_processor(input_processor_for_fuyu) class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): - def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index adf2a7a51f737..cc85693f99526 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -242,11 +242,7 @@ def forward( class GPT2LMHeadModel(nn.Module, SupportsPP): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index ae1495ebd7914..ab25c66c3a887 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -257,11 +257,7 @@ class GPTBigCodeForCausalLM(nn.Module, SupportsLoRA, SupportsPP): embedding_padding_modules = [] - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index 610795b084b44..a83d03480dde1 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -229,11 +229,7 @@ def forward( class GPTJForCausalLM(nn.Module, SupportsPP): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ): + def __init__(self, *, 
vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index f5603772e9862..794b141bfa4aa 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -242,11 +242,7 @@ def forward( class GPTNeoXForCausalLM(nn.Module, SupportsPP): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 07165ea688f94..92579e3aae949 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -409,7 +409,7 @@ def dummy_data( @INPUT_REGISTRY.register_input_processor(input_pipeline.input_processor) class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): - def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() config = vllm_config.model_config.hf_config diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index 4dc9271703a8d..65800c44e5a93 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -286,11 +286,7 @@ def forward( class JAISLMHeadModel(nn.Module, SupportsPP): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 005ae5e03cfed..b13bcfa676811 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -259,7 +259,7 @@ def init_vision_tower_for_llava( @INPUT_REGISTRY.register_input_processor(input_processor_for_llava) class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() config = vllm_config.model_config.hf_config diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index 0b621a23ec980..dd2fa6cac969f 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -281,7 +281,7 @@ def input_processor_for_llava_next(ctx: InputContext, class LlavaNextForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index b030c2f5fdc47..5d5598d07bfde 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -253,7 +253,7 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor: class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - 
def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index c129f140d8d12..a5b2108177830 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -404,7 +404,7 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor: class LlavaOnevisionForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): - def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py index 4cb1b4a929b9f..de5b2d89c0962 100644 --- a/vllm/model_executor/models/medusa.py +++ b/vllm/model_executor/models/medusa.py @@ -44,7 +44,7 @@ class Medusa(nn.Module): in the draft checkpoint (using key token_map). Also, the draft config needs to have truncated_vocab_size (=k) as an attribute.""" - def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: config = vllm_config.model_config.hf_config super().__init__() self.config = config diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 1fc6c1be4b7bb..1d51885f9094a 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -14,7 +14,6 @@ from vllm.config import VllmConfig from vllm.logger import init_logger from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.model_executor.models import ModelRegistry from vllm.multimodal import MultiModalPlaceholderMap, NestedTensors from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors @@ -240,12 +239,9 @@ def init_vllm_registered_model( Helper function to initialize an inner model registered to vLLM, based on the arguments passed to the outer vLLM model. """ - model_class, _ = ModelRegistry.resolve_model_cls(hf_config.architectures) - - return model_class( - vllm_config=vllm_config.with_hf_config(hf_config), - prefix=prefix, - ) + from vllm.model_executor.model_loader.loader import _initialize_model + vllm_config = vllm_config.with_hf_config(hf_config) + return _initialize_model(vllm_config, prefix) @overload From e0853b65089b94c9bab9f480970dc73e1e8a0c0d Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Wed, 13 Nov 2024 22:12:35 -0500 Subject: [PATCH 130/183] [Misc] format.sh: Simplify tool_version_check (#10305) Signed-off-by: Russell Bryant --- format.sh | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/format.sh b/format.sh index d06ee62351a21..a57882d2ac3f9 100755 --- a/format.sh +++ b/format.sh @@ -44,18 +44,19 @@ CLANGFORMAT_VERSION=$(clang-format --version | awk '{print $3}') # # params: tool name, tool version, required version tool_version_check() { - if [[ "$2" != "$3" ]]; then - echo "❓❓Wrong $1 version installed: $3 is required, not $2." + expected=$(grep "$1" requirements-lint.txt | cut -d'=' -f3) + if [[ "$2" != "$expected" ]]; then + echo "❓❓Wrong $1 version installed: $expected is required, not $2." 
exit 1 fi } -tool_version_check "yapf" "$YAPF_VERSION" "$(grep yapf requirements-lint.txt | cut -d'=' -f3)" -tool_version_check "ruff" "$RUFF_VERSION" "$(grep "ruff==" requirements-lint.txt | cut -d'=' -f3)" -tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-lint.txt | cut -d'=' -f3)" -tool_version_check "isort" "$ISORT_VERSION" "$(grep isort requirements-lint.txt | cut -d'=' -f3)" -tool_version_check "codespell" "$CODESPELL_VERSION" "$(grep codespell requirements-lint.txt | cut -d'=' -f3)" -tool_version_check "clang-format" "$CLANGFORMAT_VERSION" "$(grep clang-format requirements-lint.txt | cut -d'=' -f3)" +tool_version_check "yapf" "$YAPF_VERSION" +tool_version_check "ruff" "$RUFF_VERSION" +tool_version_check "mypy" "$MYPY_VERSION" +tool_version_check "isort" "$ISORT_VERSION" +tool_version_check "codespell" "$CODESPELL_VERSION" +tool_version_check "clang-format" "$CLANGFORMAT_VERSION" YAPF_FLAGS=( '--recursive' From f67ce05d0b826322f85403f1113f69ca3853aa39 Mon Sep 17 00:00:00 2001 From: Mike Depinet Date: Wed, 13 Nov 2024 20:14:34 -0800 Subject: [PATCH 131/183] [Frontend] Pythonic tool parser (#9859) Signed-off-by: Mike Depinet --- .../serving/openai_compatible_server.md | 76 +++-- ...tool_chat_template_llama3.2_pythonic.jinja | 98 ++++++ examples/tool_chat_template_toolace.jinja | 65 ++++ .../openai/tool_parsers/__init__.py | 0 .../tool_parsers/test_pythonic_tool_parser.py | 160 ++++++++++ .../entrypoints/openai/tool_parsers/utils.py | 123 ++++++++ tests/tool_use/utils.py | 12 +- vllm/entrypoints/openai/serving_chat.py | 5 + .../openai/tool_parsers/__init__.py | 4 +- .../tool_parsers/pythonic_tool_parser.py | 289 ++++++++++++++++++ 10 files changed, 806 insertions(+), 26 deletions(-) create mode 100644 examples/tool_chat_template_llama3.2_pythonic.jinja create mode 100644 examples/tool_chat_template_toolace.jinja create mode 100644 tests/entrypoints/openai/tool_parsers/__init__.py create mode 100644 tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py create mode 100644 tests/entrypoints/openai/tool_parsers/utils.py create mode 100644 vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 200663dac4209..78965813b1213 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -162,7 +162,7 @@ vllm serve --chat-template ./path-to-chat-template.jinja vLLM community provides a set of chat templates for popular models. You can find them in the examples directory [here](https://github.com/vllm-project/vllm/tree/main/examples/) -With the inclusion of multi-modal chat APIs, the OpenAI spec now accepts chat messages in a new format which specifies +With the inclusion of multi-modal chat APIs, the OpenAI spec now accepts chat messages in a new format which specifies both a `type` and a `text` field. An example is provided below: ```python completion = client.chat.completions.create( @@ -172,10 +172,10 @@ completion = client.chat.completions.create( ] ) ``` -Most chat templates for LLMs expect the `content` to be a `string` but there are some newer models like +Most chat templates for LLMs expect the `content` to be a `string` but there are some newer models like `meta-llama/Llama-Guard-3-1B` that expect the content to be parsed with the new OpenAI spec. 
In order to choose which format the content needs to be parsed in by vLLM, please use the `--chat-template-text-format` argument to specify -between `string` or `openai`. The default value is `string` and vLLM internally converts both spec formats to match +between `string` or `openai`. The default value is `string` and vLLM internally converts both spec formats to match this, unless explicitly specified. @@ -191,8 +191,8 @@ this, unless explicitly specified. ### Config file The `serve` module can also accept arguments from a config file in -`yaml` format. The arguments in the yaml must be specified using the -long form of the argument outlined [here](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#command-line-arguments-for-the-server): +`yaml` format. The arguments in the yaml must be specified using the +long form of the argument outlined [here](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#command-line-arguments-for-the-server): For example: @@ -208,7 +208,7 @@ uvicorn-log-level: "info" $ vllm serve SOME_MODEL --config config.yaml ``` --- -**NOTE** +**NOTE** In case an argument is supplied simultaneously using command line and the config file, the value from the commandline will take precedence. The order of priorities is `command line > config file values > defaults`. @@ -222,30 +222,30 @@ Please see below for recommended configuration and chat templates to use when fu ### Named Function Calling -vLLM supports named function calling in the chat completion API by default. It does so using Outlines, so this is -enabled by default, and will work with any supported model. You are guaranteed a validly-parsable function call - not a -high-quality one. +vLLM supports named function calling in the chat completion API by default. It does so using Outlines, so this is +enabled by default, and will work with any supported model. You are guaranteed a validly-parsable function call - not a +high-quality one. vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter. -To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and -specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request. +To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and +specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request. ### Automatic Function Calling To enable this feature, you should set the following flags: -* `--enable-auto-tool-choice` -- **mandatory** Auto tool choice. tells vLLM that you want to enable the model to generate its own tool calls when it +* `--enable-auto-tool-choice` -- **mandatory** Auto tool choice. tells vLLM that you want to enable the model to generate its own tool calls when it deems appropriate. -* `--tool-call-parser` -- select the tool parser to use (listed below). Additional tool parsers +* `--tool-call-parser` -- select the tool parser to use (listed below). Additional tool parsers will continue to be added in the future, and also can register your own tool parsers in the `--tool-parser-plugin`. * `--tool-parser-plugin` -- **optional** tool parser plugin used to register user defined tool parsers into vllm, the registered tool parser name can be specified in `--tool-call-parser`. -* `--chat-template` -- **optional** for auto tool choice. 
the path to the chat template which handles `tool`-role messages and `assistant`-role messages -that contain previously generated tool calls. Hermes, Mistral and Llama models have tool-compatible chat templates in their -`tokenizer_config.json` files, but you can specify a custom template. This argument can be set to `tool_use` if your model has a tool use-specific chat +* `--chat-template` -- **optional** for auto tool choice. the path to the chat template which handles `tool`-role messages and `assistant`-role messages +that contain previously generated tool calls. Hermes, Mistral and Llama models have tool-compatible chat templates in their +`tokenizer_config.json` files, but you can specify a custom template. This argument can be set to `tool_use` if your model has a tool use-specific chat template configured in the `tokenizer_config.json`. In this case, it will be used per the `transformers` specification. More on this [here](https://huggingface.co/docs/transformers/en/chat_templating#why-do-some-models-have-multiple-templates) from HuggingFace; and you can find an example of this in a `tokenizer_config.json` [here](https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B/blob/main/tokenizer_config.json) -If your favorite tool-calling model is not supported, please feel free to contribute a parser & tool use chat template! +If your favorite tool-calling model is not supported, please feel free to contribute a parser & tool use chat template! #### Hermes Models (`hermes`) @@ -256,8 +256,8 @@ All Nous Research Hermes-series models newer than Hermes 2 Pro should be support * `NousResearch/Hermes-3-*` -_Note that the Hermes 2 **Theta** models are known to have degraded tool call quality & capabilities due to the merge -step in their creation_. +_Note that the Hermes 2 **Theta** models are known to have degraded tool call quality & capabilities due to the merge +step in their creation_. Flags: `--tool-call-parser hermes` @@ -269,9 +269,9 @@ Supported models: * Additional mistral function-calling models are compatible as well. Known issues: -1. Mistral 7B struggles to generate parallel tool calls correctly. -2. Mistral's `tokenizer_config.json` chat template requires tool call IDs that are exactly 9 digits, which is -much shorter than what vLLM generates. Since an exception is thrown when this condition +1. Mistral 7B struggles to generate parallel tool calls correctly. +2. Mistral's `tokenizer_config.json` chat template requires tool call IDs that are exactly 9 digits, which is +much shorter than what vLLM generates. Since an exception is thrown when this condition is not met, the following additional chat templates are provided: * `examples/tool_chat_template_mistral.jinja` - this is the "official" Mistral chat template, but tweaked so that @@ -291,11 +291,11 @@ Supported models: * `meta-llama/Meta-Llama-3.1-405B-Instruct` * `meta-llama/Meta-Llama-3.1-405B-Instruct-FP8` -The tool calling that is supported is the [JSON based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). +The tool calling that is supported is the [JSON based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) in Llama-3.2 models, see the `pythonic` tool parser below. 
Other tool calling formats like the built in python tool calling or custom tool calling are not supported. Known issues: -1. Parallel tool calls are not supported. +1. Parallel tool calls are not supported. 2. The model can generate parameters with a wrong format, such as generating an array serialized as string instead of an array. @@ -341,6 +341,34 @@ AI21's Jamba-1.5 models are supported. Flags: `--tool-call-parser jamba` +#### Models with Pythonic Tool Calls (`pythonic`) + +A growing number of models output a python list to represent tool calls instead of using JSON. This has the advantage of inherently supporting parallel tool calls and removing ambiguity around the JSON schema required for tool calls. The `pythonic` tool parser can support such models. + +As a concrete example, these models may look up the weather in San Francisco and Seattle by generating: +```python +[get_weather(city='San Francisco', metric='celsius'), get_weather(city='Seattle', metric='celsius')] +``` + +Limitations: +* The model must not generate both text and tool calls in the same generation. This may not be hard to change for a specific model, but the community currently lacks consensus on which tokens to emit when starting and ending tool calls. (In particular, the Llama 3.2 models emit no such tokens.) +* Llama's smaller models struggle to use tools effectively. + +Example supported models: +* `meta-llama/Llama-3.2-1B-Instruct`\* (use with `examples/tool_chat_template_llama3.2_pythonic.jinja`) +* `meta-llama/Llama-3.2-3B-Instruct`\* (use with `examples/tool_chat_template_llama3.2_pythonic.jinja`) +* `Team-ACE/ToolACE-8B` (use with `examples/tool_chat_template_toolace.jinja`) +* `fixie-ai/ultravox-v0_4-ToolACE-8B` (use with `examples/tool_chat_template_toolace.jinja`) + +Flags: `--tool-call-parser pythonic --chat-template {see_above}` + +--- +**WARNING** +Llama's smaller models frequently fail to emit tool calls in the correct format. Your mileage may vary. + +--- + + ### How to write a tool parser plugin A tool parser plugin is a Python file containing one or more ToolParser implementations. You can write a ToolParser similar to the `Hermes2ProToolParser` in vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py. diff --git a/examples/tool_chat_template_llama3.2_pythonic.jinja b/examples/tool_chat_template_llama3.2_pythonic.jinja new file mode 100644 index 0000000000000..8c38de6c6a907 --- /dev/null +++ b/examples/tool_chat_template_llama3.2_pythonic.jinja @@ -0,0 +1,98 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = false %} +{%- endif %} +{%- if not date_string is defined %} + {%- if strftime_now is defined %} + {%- set date_string = strftime_now("%d %b %Y") %} + {%- else %} + {%- set date_string = "26 Jul 2024" %} + {%- endif %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "You are a helpful assistant with tool calling capabilities. Only reply with a tool call if the function exists in the library provided by the user. If it doesn't exist, just reply directly in natural language. 
When you receive a tool call response, use the output to format an answer to the original user question." %} +{%- endif %} + +{#- System message #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call functions, please respond with a python list of the calls. " }} + {{- 'Respond in the format [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)] ' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} + {%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a python list for function calls " }} + {{- "with their proper arguments to best answer the given prompt.\n\n" }} + {{- 'Respond in the format [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)] ' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n[' -}} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- tool_call.name + '(' -}} + {%- for param in tool_call.arguments %} + {{- param + '=' -}} + {{- "%sr" | format(tool_call.arguments[param]) -}} + {% if not loop.last %}, {% endif %} + {%- endfor %} + {{- ')' -}} + {% if not loop.last %}, {% endif %} + {%- endfor %} + {{- ']<|eot_id|>' -}} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping %} + {{- message.content | tojson }} + {%- else %} + {{- { "output": message.content } | tojson }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} diff --git a/examples/tool_chat_template_toolace.jinja b/examples/tool_chat_template_toolace.jinja new file mode 100644 index 0000000000000..a9b3b7189dddf --- /dev/null +++ b/examples/tool_chat_template_toolace.jinja @@ -0,0 +1,65 @@ +{{- bos_token }} + +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we 
can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "You are a helpful assistant with tool calling capabilities. Only reply with a tool call if the function exists in the library provided by the user. If it doesn't exist, just reply directly in natural language." %} +{%- endif %} + +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You are an expert in composing functions. You are given a question and a set of possible functions. Based on the question, you will need to make one or more function/tool calls to achieve the purpose.\n" }} + {{- "If none of the function can be used, point it out. If the given question lacks the parameters required by the function, also point it out.\n" }} + {{- "You should only return the function call in tools call sections.\n\n" }} + {{- "If you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]\n" }} + {{- "You SHOULD NOT include any other text in the response.\n" }} + {{- "Here is a list of functions in JSON format that you can invoke.\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- "\n" }} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- elif 'tool_calls' in message %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n[' -}} + {%- for tool_call in message.tool_calls %} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- tool_call.name + '(' -}} + {%- for param in tool_call.arguments %} + {{- param + '=' -}} + {{- "%sr" | format(tool_call.arguments[param]) -}} + {% if not loop.last %}, {% endif %} + {%- endfor %} + {{- ')' -}} + {% if not loop.last %}, {% endif %} + {%- endfor %} + {{- ']<|eot_id|>' -}} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping %} + {{- message.content | tojson }} + {%- else %} + {{- { "output": message.content } | tojson }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} + +{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} diff --git a/tests/entrypoints/openai/tool_parsers/__init__.py b/tests/entrypoints/openai/tool_parsers/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py new file mode 100644 index 0000000000000..47b0b6bb80ffe --- /dev/null +++ b/tests/entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py @@ -0,0 +1,160 @@ +from typing import List +from unittest.mock import MagicMock + +import pytest + +from tests.entrypoints.openai.tool_parsers.utils import ( + run_tool_extraction, run_tool_extraction_streaming) +from vllm.entrypoints.openai.protocol import FunctionCall +from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager + +# 
https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#model-response-format-1 +SIMPLE_FUNCTION_OUTPUT = "get_weather(city='San Francisco', metric='celsius')" +SIMPLE_FUNCTION_CALL = FunctionCall( + name="get_weather", + arguments='{"city": "San Francisco", "metric": "celsius"}', +) +MORE_TYPES_FUNCTION_OUTPUT = ( + "register_user(name='John Doe', " + "age=37, " + "address={'city': 'San Francisco', 'state': 'CA'}, " + "role=None, " + "passed_test=True, " + "aliases=['John', 'Johnny'])") +MORE_TYPES_FUNCTION_CALL = FunctionCall( + name="register_user", + arguments='{"name": "John Doe", ' + '"age": 37, ' + '"address": {"city": "San Francisco", "state": "CA"}, ' + '"role": null, ' + '"passed_test": true, ' + '"aliases": ["John", "Johnny"]}', +) +PARAMETERLESS_FUNCTION_OUTPUT = "get_weather()" +PARAMETERLESS_FUNCTION_CALL = FunctionCall( + name="get_weather", + arguments='{}', +) +EMPTY_DICT_FUNCTION_OUTPUT = "do_something_cool(additional_data={})" +EMPTY_DICT_FUNCTION_CALL = FunctionCall( + name="do_something_cool", + arguments='{"additional_data": {}}', +) +EMPTY_LIST_FUNCTION_OUTPUT = "do_something_cool(steps=[])" +EMPTY_LIST_FUNCTION_CALL = FunctionCall( + name="do_something_cool", + arguments='{"steps": []}', +) +ESCAPED_STRING_FUNCTION_OUTPUT = ( + r"get_weather(city='Martha\'s Vineyard', metric='\"cool units\"')") +ESCAPED_STRING_FUNCTION_CALL = FunctionCall( + name="get_weather", + arguments='{"city": "Martha\'s Vineyard", "metric": "\\"cool units\\""}', +) + + +@pytest.mark.parametrize("streaming", [True, False]) +def test_no_tool_call(streaming: bool): + mock_tokenizer = MagicMock() + tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")( + mock_tokenizer) + model_output = "How can I help you today?" 
+ + content, tool_calls = run_tool_extraction(tool_parser, + model_output, + streaming=streaming) + + assert content == model_output + assert len(tool_calls) == 0 + + +TEST_CASES = [ + pytest.param(True, + f"[{SIMPLE_FUNCTION_OUTPUT}]", [SIMPLE_FUNCTION_CALL], + id="simple_streaming"), + pytest.param(False, + f"[{SIMPLE_FUNCTION_OUTPUT}]", [SIMPLE_FUNCTION_CALL], + id="simple_nonstreaming"), + pytest.param(True, + f"[{MORE_TYPES_FUNCTION_OUTPUT}]", [MORE_TYPES_FUNCTION_CALL], + id="more_types_streaming"), + pytest.param(False, + f"[{MORE_TYPES_FUNCTION_OUTPUT}]", [MORE_TYPES_FUNCTION_CALL], + id="more_types_nonstreaming"), + pytest.param(True, + f"[{PARAMETERLESS_FUNCTION_OUTPUT}]", + [PARAMETERLESS_FUNCTION_CALL], + id="parameterless_streaming"), + pytest.param(False, + f"[{PARAMETERLESS_FUNCTION_OUTPUT}]", + [PARAMETERLESS_FUNCTION_CALL], + id="parameterless_nonstreaming"), + pytest.param(True, + f"[{EMPTY_DICT_FUNCTION_OUTPUT}]", [EMPTY_DICT_FUNCTION_CALL], + id="empty_dict_streaming"), + pytest.param(False, + f"[{EMPTY_DICT_FUNCTION_OUTPUT}]", [EMPTY_DICT_FUNCTION_CALL], + id="empty_dict_nonstreaming"), + pytest.param(True, + f"[{EMPTY_LIST_FUNCTION_OUTPUT}]", [EMPTY_LIST_FUNCTION_CALL], + id="empty_list_streaming"), + pytest.param(False, + f"[{EMPTY_LIST_FUNCTION_OUTPUT}]", [EMPTY_LIST_FUNCTION_CALL], + id="empty_list_nonstreaming"), + pytest.param(True, + f"[{ESCAPED_STRING_FUNCTION_OUTPUT}]", + [ESCAPED_STRING_FUNCTION_CALL], + id="escaped_string_streaming"), + pytest.param(False, + f"[{ESCAPED_STRING_FUNCTION_OUTPUT}]", + [ESCAPED_STRING_FUNCTION_CALL], + id="escaped_string_nonstreaming"), + pytest.param(True, + f"[{SIMPLE_FUNCTION_OUTPUT}, {MORE_TYPES_FUNCTION_OUTPUT}]", + [SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL], + id="parallel_calls_streaming"), + pytest.param(False, + f"[{SIMPLE_FUNCTION_OUTPUT}, {MORE_TYPES_FUNCTION_OUTPUT}]", + [SIMPLE_FUNCTION_CALL, MORE_TYPES_FUNCTION_CALL], + id="parallel_calls_nonstreaming"), +] + + +@pytest.mark.parametrize("streaming, model_output, expected_tool_calls", + TEST_CASES) +def test_tool_call(streaming: bool, model_output: str, + expected_tool_calls: List[FunctionCall]): + mock_tokenizer = MagicMock() + tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")( + mock_tokenizer) + + content, tool_calls = run_tool_extraction(tool_parser, + model_output, + streaming=streaming) + + assert content is None + assert len(tool_calls) == len(expected_tool_calls) + for actual, expected in zip(tool_calls, expected_tool_calls): + assert actual.type == "function" + assert actual.function == expected + + +def test_streaming_tool_call_with_large_steps(): + mock_tokenizer = MagicMock() + tool_parser: ToolParser = ToolParserManager.get_tool_parser("pythonic")( + mock_tokenizer) + model_output_deltas = [ + "[get_weather(city='San", + " Francisco', metric='celsius'), " + f"{PARAMETERLESS_FUNCTION_OUTPUT}, " + f"{EMPTY_LIST_FUNCTION_OUTPUT}]", + ] + + reconstructor = run_tool_extraction_streaming( + tool_parser, model_output_deltas, assert_one_tool_per_delta=False) + + assert reconstructor.other_content == "" + assert len(reconstructor.tool_calls) == 3 + assert reconstructor.tool_calls[0].function == SIMPLE_FUNCTION_CALL + assert reconstructor.tool_calls[1].function == PARAMETERLESS_FUNCTION_CALL + assert reconstructor.tool_calls[2].function == EMPTY_LIST_FUNCTION_CALL diff --git a/tests/entrypoints/openai/tool_parsers/utils.py b/tests/entrypoints/openai/tool_parsers/utils.py new file mode 100644 index 0000000000000..f0a2a32c16786 --- 
/dev/null +++ b/tests/entrypoints/openai/tool_parsers/utils.py @@ -0,0 +1,123 @@ +from typing import Iterable, List, Tuple, Union + +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaMessage, + ExtractedToolCallInformation, + FunctionCall, ToolCall) +from vllm.entrypoints.openai.tool_parsers import ToolParser + + +class StreamingToolReconstructor: + + def __init__(self, assert_one_tool_per_delta: bool = True): + self.tool_calls: List[ToolCall] = [] + self.other_content: str = "" + self._assert_one_tool_per_delta = assert_one_tool_per_delta + + def append_delta(self, delta: DeltaMessage): + if delta.content is not None: + self.other_content += delta.content + else: + assert delta.tool_calls, ( + "Streaming results should have either content or tool calls " + "(or both)") + if self._assert_one_tool_per_delta: + # Note: This isn't strictly required by the API and may not be + # possible to adhere to depending on the token space and number of + # tokens per streamed response from the model, but it is required + # by tool_use tests, so we enforce it here by default also. + assert len(delta.tool_calls) < 2, ( + "Streaming should include only one tool call per update.") + for call_delta in delta.tool_calls: + assert call_delta.type == "function", ( + "Streaming tool calls should only emit function calls. Got " + f"{call_delta.type}") + current_tool_call = self.tool_calls[ + call_delta.index] if call_delta.index < len( + self.tool_calls) else None + if current_tool_call: + assert (not call_delta.function.name), ( + "Streaming tool calls should emit the full function name " + f"exactly once. Got {call_delta.function.name}") + assert (not call_delta.id), ( + "Streaming tool calls must emit function id only once. Got " + f"{call_delta.id}") + assert (call_delta.index == len(self.tool_calls) - 1), ( + f"Incorrect index for tool delta. Got {call_delta.index}, " + f"expected {len(self.tool_calls) - 1}") + current_tool_call.function.arguments += ( + call_delta.function.arguments) + else: + assert call_delta.id is not None, ( + "Streaming tool calls must have an id on first appearance") + assert call_delta.function.name is not None, ( + "Streaming tool calls must have a function name on first " + "appearance") + assert call_delta.index == len(self.tool_calls), ( + f"Incorrect index for tool delta. 
Got {call_delta.index}, " + f"expected {len(self.tool_calls)}") + self.tool_calls.append( + ToolCall(id=call_delta.id, + function=FunctionCall( + name=call_delta.function.name, + arguments=call_delta.function.arguments + or ""))) + + +def run_tool_extraction( + tool_parser: ToolParser, + model_output: str, + request: Union[ChatCompletionRequest, None] = None, + streaming: bool = False, + assert_one_tool_per_delta: bool = True, +) -> Tuple[Union[str, None], List[ToolCall]]: + if streaming: + reconstructor = run_tool_extraction_streaming( + tool_parser, + model_output, + request, + assert_one_tool_per_delta=assert_one_tool_per_delta) + return reconstructor.other_content or None, reconstructor.tool_calls + else: + extracted = run_tool_extraction_nonstreaming(tool_parser, model_output, + request) + assert extracted.tools_called == bool(extracted.tool_calls) + return extracted.content, extracted.tool_calls + + +def run_tool_extraction_nonstreaming( + tool_parser: ToolParser, + model_output: str, + request: Union[ChatCompletionRequest, None] = None +) -> ExtractedToolCallInformation: + request = request or ChatCompletionRequest(messages=[], model="test-model") + return tool_parser.extract_tool_calls(model_output, request) + + +def run_tool_extraction_streaming( + tool_parser: ToolParser, + model_deltas: Iterable[str], + request: Union[ChatCompletionRequest, None] = None, + assert_one_tool_per_delta: bool = True, +) -> StreamingToolReconstructor: + request = request or ChatCompletionRequest(messages=[], model="test-model") + reconstructor = StreamingToolReconstructor( + assert_one_tool_per_delta=assert_one_tool_per_delta) + previous_text = "" + previous_tokens: List[int] = [] + for delta in model_deltas: + token_delta = [ + tool_parser.vocab.get(token) + for token in tool_parser.model_tokenizer.tokenize(delta) + if token in tool_parser.vocab + ] + current_text = previous_text + delta + current_tokens = previous_tokens + token_delta + delta_message = tool_parser.extract_tool_calls_streaming( + previous_text, current_text, delta, previous_tokens, + current_tokens, token_delta, request) + if delta_message is not None: + reconstructor.append_delta(delta_message) + previous_text = current_text + previous_tokens = current_tokens + return reconstructor diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py index 576555b368afe..6818ac44b2478 100644 --- a/tests/tool_use/utils.py +++ b/tests/tool_use/utils.py @@ -122,7 +122,17 @@ def ensure_system_prompt(messages: List[Dict[str, Any]], ], "supports_parallel": False, - } + }, + "toolACE": { + "model": + "Team-ACE/ToolACE-8B", + "arguments": [ + "--tool-call-parser", "pythonic", "--chat-template", + str(VLLM_PATH / "examples/tool_chat_template_toolace.jinja") + ], + "supports_parallel": + True, + }, } WEATHER_TOOL: ChatCompletionToolParam = { diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 09edaf98f7d17..07cc9e94bdd03 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -74,6 +74,11 @@ def __init__(self, self.tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None if self.enable_auto_tools: try: + if (tool_parser == "pythonic" and + model_config.model.startswith("meta-llama/Llama-3.2")): + logger.warning( + "Llama3.2 models may struggle to emit valid pythonic" + " tool calls") self.tool_parser = ToolParserManager.get_tool_parser( tool_parser) except Exception as e: diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py 
b/vllm/entrypoints/openai/tool_parsers/__init__.py index 2187862e8380b..2850349a44835 100644 --- a/vllm/entrypoints/openai/tool_parsers/__init__.py +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -6,9 +6,11 @@ from .jamba_tool_parser import JambaToolParser from .llama_tool_parser import Llama3JsonToolParser from .mistral_tool_parser import MistralToolParser +from .pythonic_tool_parser import PythonicToolParser __all__ = [ "ToolParser", "ToolParserManager", "Granite20bFCToolParser", "GraniteToolParser", "Hermes2ProToolParser", "MistralToolParser", - "Internlm2ToolParser", "Llama3JsonToolParser", "JambaToolParser" + "Internlm2ToolParser", "Llama3JsonToolParser", "JambaToolParser", + "PythonicToolParser" ] diff --git a/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py new file mode 100644 index 0000000000000..26da4d689fb8b --- /dev/null +++ b/vllm/entrypoints/openai/tool_parsers/pythonic_tool_parser.py @@ -0,0 +1,289 @@ +import ast +import json +import re +from typing import Any, Sequence, Tuple, Union + +from transformers import PreTrainedTokenizerBase + +from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, + DeltaFunctionCall, DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, ToolCall) +from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( + ToolParser, ToolParserManager) +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +class _UnexpectedAstError(Exception): + pass + + +@ToolParserManager.register_module("pythonic") +class PythonicToolParser(ToolParser): + """ + Tool call parser for models that produce tool calls in a pythonic style, + such as Llama 3.2 models. + + Used when --enable-auto-tool-choice --tool-call-parser pythonic are all set + """ + # TODO(mdepinet): Possible future improvements: + # 1. Support text + tools separated by either <|python_tag|> or \n\n + # 2. Support tools outside of a list (or separated by a semicolon). + # This depends on item 1 for consistent streaming. + # Neither of these are necessary for e.g. ToolACE, but both would help make + # Llama3.2 models more reliable. + + TOOL_CALL_REGEX = re.compile( + r"\[([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s)?\),\s*)*([a-zA-Z]+\w*\(([a-zA-Z]+\w*=.*,\s*)*([a-zA-Z]+\w*=.*\s*)?\)\s*)+\]", + re.DOTALL) + + def __init__(self, tokenizer: PreTrainedTokenizerBase): + super().__init__(tokenizer) + + # Rename for readability. This is NOT a tool id. + @property + def current_tool_index(self) -> int: + return self.current_tool_id + + @current_tool_index.setter + def current_tool_index(self, value: int) -> None: + self.current_tool_id = value + + def extract_tool_calls( + self, model_output: str, + request: ChatCompletionRequest) -> ExtractedToolCallInformation: + """ + Extract the tool calls from a complete model response. 
+ """ + + if not (self.TOOL_CALL_REGEX.match(model_output)): + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + try: + module = ast.parse(model_output) + parsed = getattr(module.body[0], "value", None) + if isinstance(parsed, ast.List) and all( + isinstance(e, ast.Call) for e in parsed.elts): + return ExtractedToolCallInformation( + tools_called=True, + tool_calls=[ + _handle_single_tool(e) # type: ignore + for e in parsed.elts + ], + content=None) + else: + raise _UnexpectedAstError( + "Tool output must be a list of function calls") + except Exception: + logger.exception("Error in extracting tool call from response.") + # Treat as regular text + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + request: ChatCompletionRequest, + ) -> Union[DeltaMessage, None]: + + if not current_text.startswith("["): + return DeltaMessage(content=delta_text) + + try: + valid_and_added_text = _make_valid_python(current_text) + if valid_and_added_text is None: + return None + valid_text, added_text = valid_and_added_text + + module = ast.parse(valid_text) + parsed = getattr(module.body[0], "value", None) + if not isinstance(parsed, ast.List) or not all( + isinstance(e, ast.Call) for e in parsed.elts): + raise _UnexpectedAstError( + "Tool output must be a list of function calls") + tool_calls = [ + _handle_single_tool(e) # type: ignore + for e in parsed.elts + ] + + tool_deltas = [] + for index, new_call in enumerate(tool_calls): + if index < self.current_tool_index: + continue + + self.current_tool_index = index + if len(self.streamed_args_for_tool) == index: + self.streamed_args_for_tool.append("") + + new_call_complete = index < len( + tool_calls) - 1 or ")]" not in added_text + if new_call_complete: + self.current_tool_index += 1 + + withheld_suffix = (added_text[:-2] + if not new_call_complete else "") + if not new_call_complete and added_text[-2] == ")": + # Function call is incomplete. Withhold the closing bracket. + withheld_suffix = withheld_suffix + "}" + # Strings get single quotes in the model-produced string. + # JSON requires double quotes. + withheld_suffix = withheld_suffix.replace("'", '"') + delta = _compute_tool_delta(self.streamed_args_for_tool[index], + new_call, index, withheld_suffix) + + if delta is not None: + tool_deltas.append(delta) + if (delta.function is not None + and delta.function.arguments is not None): + self.streamed_args_for_tool[ + index] += delta.function.arguments + + # HACK: serving_chat.py inspects the internal state of tool parsers + # when determining it's final streaming delta, automatically + # adding autocompleted JSON. + # These two lines avoid that nonsense while ensuring finish_reason + # is set to tool_calls when at least one tool is called. + if tool_deltas and not self.prev_tool_call_arr: + self.prev_tool_call_arr = [{"arguments": {}}] + + if tool_deltas: + return DeltaMessage(tool_calls=tool_deltas) + elif not added_text and self.current_tool_id > 0: + # Return an empty DeltaMessage once the tool calls are all done + # so that finish_reason gets set. 
+ return DeltaMessage(content='') + else: + return None + except Exception: + logger.exception("Error trying to handle streaming tool call.") + logger.debug( + "Skipping chunk as a result of tool streaming extraction " + "error") + return None + + +def _get_parameter_value(val: ast.expr) -> Any: + if isinstance(val, ast.Constant): + return val.value + elif isinstance(val, ast.Dict): + if not all(isinstance(k, ast.Constant) for k in val.keys): + raise _UnexpectedAstError( + "Dict tool call arguments must have literal keys") + return { + k.value: _get_parameter_value(v) # type: ignore + for k, v in zip(val.keys, val.values) + } + elif isinstance(val, ast.List): + return [_get_parameter_value(v) for v in val.elts] + else: + raise _UnexpectedAstError("Tool call arguments must be literals") + + +def _handle_single_tool(call: ast.Call) -> ToolCall: + if not isinstance(call.func, ast.Name): + raise _UnexpectedAstError("Invalid tool call name") + function_name = call.func.id + arguments = {} + for keyword in call.keywords: + arguments[keyword.arg] = _get_parameter_value(keyword.value) + return ToolCall(type="function", + function=FunctionCall(name=function_name, + arguments=json.dumps(arguments))) + + +def _make_valid_python(text: str) -> Union[Tuple[str, str], None]: + bracket_stack = [] + for index, char in enumerate(text): + if char in {"[", "(", "{"}: + bracket_stack.append(char) + elif char == "]": + if not bracket_stack or bracket_stack.pop() != "[": + raise _UnexpectedAstError("Mismatched square brackets") + elif char == ")": + if not bracket_stack or bracket_stack.pop() != "(": + raise _UnexpectedAstError("Mismatched parentheses") + elif char == "}": + if not bracket_stack or bracket_stack.pop() != "{": + raise _UnexpectedAstError("Mismatched curly braces") + elif char in {"'", '"'}: + if bracket_stack and bracket_stack[-1] == char: + if index > 0 and text[index - 1] == "\\": + # Treat an escaped quote as a regular character + pass + else: + bracket_stack.pop() + elif bracket_stack and bracket_stack[-1] in {"'", '"'}: + # Double quote within a single quote string or vice versa. + pass + else: + bracket_stack.append(char) + + text = text.rstrip() + if text.endswith("=") or text.endswith(":"): + # Since we have no type information for this property/parameter value, + # we can't fill in a valid value. 
+ return None + if bracket_stack and bracket_stack[-1] == "{": + trailing_dict_text = text[:text.rfind("{")] + num_keys = trailing_dict_text.count(":") + num_values = trailing_dict_text.count(",") + if num_keys <= num_values: + return None # Incomplete property name within parameter value + if bracket_stack and bracket_stack[-1] == "(": + trailing_params_text = text[:text.rfind("(")] + num_full_param_names = trailing_params_text.count("=") + num_full_param_values = trailing_params_text.count(",") + if num_full_param_names <= num_full_param_values: + return None # Incomplete parameter name + if text.endswith(","): + text = text[:-1] + if bracket_stack and bracket_stack[-1] == "[" and not text.endswith( + "[") and not text.endswith(")"): + return None # Incomplete function name + + added_text = "" + for char in reversed(bracket_stack): + if char == "[": + added_text += "]" + elif char == "(": + added_text += ")" + elif char == "{": + added_text += "}" + elif char == "'": + added_text += "'" + elif char == '"': + added_text += '"' + + return text + added_text, added_text + + +def _compute_tool_delta(previously_sent_args: str, new_call: ToolCall, + index: int, + withheld_suffix: str) -> Union[DeltaToolCall, None]: + new_call_args = new_call.function.arguments + if withheld_suffix: + assert new_call_args.endswith(withheld_suffix) + new_call_args = new_call_args[:-len(withheld_suffix)] + if not previously_sent_args: + return DeltaToolCall(id=new_call.id, + index=index, + function=DeltaFunctionCall( + name=new_call.function.name, + arguments=new_call_args, + )) + + arg_diff = new_call_args[len(previously_sent_args):] + return DeltaToolCall( + id="", index=index, function=DeltaFunctionCall( + arguments=arg_diff)) if arg_diff else None From 52b48c1ead683ec2afe6b0396ece32d73884cd21 Mon Sep 17 00:00:00 2001 From: Guillaume Calmettes Date: Thu, 14 Nov 2024 05:48:16 +0100 Subject: [PATCH 132/183] [BugFix]: properly deserialize `tool_calls` iterator before processing by mistral-common when MistralTokenizer is used (#9951) Signed-off-by: Guillaume Calmettes --- vllm/entrypoints/openai/serving_chat.py | 36 +++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 07cc9e94bdd03..5178481c737b4 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -127,6 +127,42 @@ async def create_chat_completion( return self.create_error_response( "tool_choice = \"required\" is not supported!") + # NOTE: There is currently a bug in pydantic where attributes + # declared as iterables are replaced in in the instances by + # pydantic-core ValidatorIterator instance. In particular, this + # affects tool_calls defined in ChatCompletionAssistantMessageParam + # model: + # see: + # - https://github.com/pydantic/pydantic/issues/9467 + # As a result, tool_calls from assistant messages are never + # deserialized in the request object if the tool_calls iterator is + # not consumed. This affect messages passed to the MistralTokenizer + # since no chat template is applied and therefore the tools_calls + # iterator is not directly consumed. + # Issue is tracked on Pydantic side, with resolution planned for + # v2.11 release. 
In the meantime, the official workaround is to + # consume the iterator so the tool_calls are correctly deserialized + # in the OpenAI ChatCompletionAssistantMessageParam object + # https://github.com/pydantic/pydantic/issues/9467#issuecomment-2442097291 # noqa: E501 + # Official Pydantic Issues: + # - https://github.com/pydantic/pydantic/issues/9541 + # TODO: remove when pydantic v2.11 is released + if isinstance(tokenizer, MistralTokenizer): + for i, message in enumerate(request.messages): + if message.get("role") == 'assistant': + tool_calls_validator = message.get( + "tool_calls", ().__iter__()) + validated_tool_calls = [] + while True: + try: + tool_call = next( + tool_calls_validator) # type: ignore + validated_tool_calls.append(tool_call) + except StopIteration: + break + request.messages[i][ + "tool_calls"] = validated_tool_calls + if (request.tool_choice == "auto" and not (self.enable_auto_tools and tool_parser is not None) and not isinstance(tokenizer, MistralTokenizer)): From 294bf467bacc2c9532cc56d1a512edde01bed947 Mon Sep 17 00:00:00 2001 From: B-201 Date: Thu, 14 Nov 2024 14:31:44 +0800 Subject: [PATCH 133/183] [Model] Add BNB quantization support for Idefics3 (#10310) Signed-off-by: B-201 Co-authored-by: Jee Jee Li --- vllm/model_executor/models/idefics3.py | 68 +++++++++++++++++++++++--- 1 file changed, 61 insertions(+), 7 deletions(-) diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 85f23a1da533b..0cecc754e916f 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -22,6 +22,7 @@ from PIL import Image from torch import nn # Temporary solution for transformers below 4.46.0. +from transformers import PretrainedConfig as Idefics3Config from transformers import ProcessorMixin as Idefics3ImageProcessor from vllm.attention import AttentionMetadata @@ -31,6 +32,7 @@ from vllm.logger import init_logger from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.models.module_mapping import MultiModelKeys @@ -374,12 +376,23 @@ def dummy_data_for_idefics3( class Idefics3SimpleMLP(nn.Module): - def __init__(self, config): + def __init__( + self, + config: Idefics3Config, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): super().__init__() input_size = config.vision_config.hidden_size * (config.scale_factor** 2) output_size = config.text_config.hidden_size - self.proj = ReplicatedLinear(input_size, output_size, bias=False) + self.proj = ReplicatedLinear( + input_size, + output_size, + bias=False, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "proj"), + ) def forward(self, x: torch.Tensor) -> torch.Tensor: out, _ = self.proj(x) @@ -388,10 +401,19 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class Idefics3Connector(nn.Module): - def __init__(self, config): + def __init__( + self, + config: Idefics3Config, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): super().__init__() self.scale_factor = config.scale_factor - self.modality_projection = Idefics3SimpleMLP(config) + self.modality_projection = Idefics3SimpleMLP( + config, + quant_config, + prefix=maybe_prefix(prefix, "modality_projection"), + ) def 
pixel_shuffle(self, x: torch.Tensor, @@ -431,9 +453,15 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.config = config self.padding_idx = self.config.text_config.pad_token_id self.vocab_size = self.config.text_config.vocab_size - self.vision_model = Idefics3VisionTransformer(config.vision_config, - quant_config) - self.connector = Idefics3Connector(config) + self.vision_model = Idefics3VisionTransformer( + config.vision_config, + quant_config=quant_config, + prefix=maybe_prefix(prefix, "vision_model")) + self.connector = Idefics3Connector( + config, + quant_config, + prefix=maybe_prefix(prefix, "connector"), + ) self.text_model = LlamaModel( vllm_config=vllm_config.with_hf_config(config.text_config), prefix=maybe_prefix(prefix, "text_model"), @@ -637,6 +665,32 @@ class Idefics3ForConditionalGeneration(nn.Module, SupportsMultiModal, "gate_up_proj", "down_proj", ] + + # BitandBytes specific attributes + default_bitsandbytes_target_modules = [ + ".gate_proj.", + ".down_proj.", + ".up_proj.", + ".q_proj.", + ".k_proj.", + ".v_proj.", + ".o_proj.", + # vision_model + ".fc1.", + ".fc2.", + ".out_proj.", + # connector + ".proj.", + ] + bitsandbytes_stacked_params_mapping = { + # shard_name, weight_name, index + "q_proj": ("qkv_proj", 0), + "k_proj": ("qkv_proj", 1), + "v_proj": ("qkv_proj", 2), + "gate_proj": ("gate_up_proj", 0), + "up_proj": ("gate_up_proj", 1), + } + embedding_modules = {} embedding_padding_modules = [] From 29f3ef26a38e5afab529fb9f6098704fd106a779 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Thu, 14 Nov 2024 00:23:39 -0800 Subject: [PATCH 134/183] [ci][distributed] disable hanging tests (#10317) Signed-off-by: youkaichao --- tests/distributed/test_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/distributed/test_utils.py b/tests/distributed/test_utils.py index 50444d3abfaf2..686b697c98e03 100644 --- a/tests/distributed/test_utils.py +++ b/tests/distributed/test_utils.py @@ -117,6 +117,7 @@ def allgather_worker(rank, WORLD_SIZE, port1, port2): pg1.barrier() +@pytest.mark.skip(reason="This test is flaky and prone to hang.") @multi_gpu_test(num_gpus=4) @pytest.mark.parametrize( "worker", [cpu_worker, gpu_worker, broadcast_worker, allgather_worker]) From 03025c023f99bea58652e9b5a8a4a8b50af6bdd0 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Thu, 14 Nov 2024 16:45:32 +0800 Subject: [PATCH 135/183] [CI/Build] Fix CPU CI online inference timeout (#10314) Signed-off-by: Isotr0py <2037008807@qq.com> --- .buildkite/run-cpu-test.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index b3771bb268e22..bf0fe29590b54 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -61,7 +61,7 @@ function cpu_tests() { docker exec cpu-test bash -c " set -e export VLLM_CPU_KVCACHE_SPACE=10 - export VLLM_CPU_OMP_THREADS_BIND=$CORE_RANGE + export VLLM_CPU_OMP_THREADS_BIND=$1 python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half & timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 python3 benchmarks/benchmark_serving.py \ @@ -75,4 +75,4 @@ function cpu_tests() { # All of CPU tests are expected to be finished less than 25 mins. 
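+# NOTE: CORE_RANGE is defined above but not exported, so it is passed to the
+# exported function explicitly below and read back as $1 inside the container shell.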
export -f cpu_tests -timeout 25m bash -c "cpu_tests" +timeout 25m bash -c "cpu_tests $CORE_RANGE" From 675d603400616dcb45093ffc9f57c4859c22df76 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Thu, 14 Nov 2024 17:47:53 +0800 Subject: [PATCH 136/183] [CI/Build] Make shellcheck happy (#10285) Signed-off-by: DarkLight1337 --- .buildkite/run-cpu-test.sh | 12 ++++++------ tools/shellcheck.sh | 3 ++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index bf0fe29590b54..a00331abb7d03 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -9,8 +9,8 @@ CORE_RANGE=${CORE_RANGE:-48-95} NUMA_NODE=${NUMA_NODE:-1} # Try building the docker image -numactl -C $CORE_RANGE -N $NUMA_NODE docker build -t cpu-test -f Dockerfile.cpu . -numactl -C $CORE_RANGE -N $NUMA_NODE docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu . +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.cpu . +numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu . # Setup cleanup remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; } @@ -18,10 +18,10 @@ trap remove_docker_container EXIT remove_docker_container # Run the image, setting --shm-size=4g for tensor parallel. -docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=$CORE_RANGE \ - --cpuset-mems=$NUMA_NODE --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test -docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=$CORE_RANGE \ - --cpuset-mems=$NUMA_NODE --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2 +docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ + --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test +docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \ + --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2 function cpu_tests() { set -e diff --git a/tools/shellcheck.sh b/tools/shellcheck.sh index 0bb6fd2eafa14..d99fa77b96351 100755 --- a/tools/shellcheck.sh +++ b/tools/shellcheck.sh @@ -1,4 +1,5 @@ #!/bin/bash +set -e scversion="stable" @@ -18,4 +19,4 @@ if ! [ -x "$(command -v shellcheck)" ]; then fi # TODO - fix warnings in .buildkite/run-amd-test.sh -find . -name "*.sh" -not -path "./.buildkite/run-amd-test.sh" -exec sh -c 'git check-ignore -q $1 || shellcheck $1' _ {} \; +find . 
-name "*.sh" -not -path "./.buildkite/run-amd-test.sh" -print0 | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck "{}"' From 1dbae0329c6d907b72b373667b4d5716bae4415f Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Thu, 14 Nov 2024 08:19:38 -0800 Subject: [PATCH 137/183] [Docs] Publish meetup slides (#10331) Signed-off-by: Woosuk Kwon --- README.md | 10 +--------- docs/source/community/meetups.rst | 1 + 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index b75bfc5c699a7..6530886ed7de2 100644 --- a/README.md +++ b/README.md @@ -15,16 +15,8 @@ Easy, fast, and cheap LLM serving for everyone --- -**vLLM x Snowflake Meetup (Wednesday, November 13th, 5:30-8PM PT) at Snowflake HQ, San Mateo** - -We are excited to announce the last in-person vLLM meetup of the year! -Join the vLLM developers and engineers from Snowflake AI Research to chat about the latest LLM inference optimizations and your 2025 vLLM wishlist! -Register [here](https://lu.ma/h0qvrajz) and be a part of the event! - ---- - - *Latest News* 🔥 +- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing). - [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there! - [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/sessioncatalog?tab.day=20241001&search.sessiontracks=1719251906298001uzJ2) from other vLLM contributors and users! - [2024/09] We hosted [the sixth vLLM meetup](https://lu.ma/87q3nvnh) with NVIDIA! Please find the meetup slides [here](https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing). diff --git a/docs/source/community/meetups.rst b/docs/source/community/meetups.rst index a3962e96e7913..c87f01aa263b3 100644 --- a/docs/source/community/meetups.rst +++ b/docs/source/community/meetups.rst @@ -5,6 +5,7 @@ vLLM Meetups We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: +- `The seventh vLLM meetup `__, with Snowflake, November 14th 2024. `[Slides] `__ - `The sixth vLLM meetup `__, with NVIDIA, September 9th 2024. `[Slides] `__ - `The fifth vLLM meetup `__, with AWS, July 24th 2024. `[Slides] `__ - `The fourth vLLM meetup `__, with Cloudflare and BentoML, June 11th 2024. 
`[Slides] `__ From 4a18fd14ba4a349291c798a16bf62fa8a9af0b6b Mon Sep 17 00:00:00 2001 From: Maximilien de Bayser Date: Thu, 14 Nov 2024 18:23:29 -0300 Subject: [PATCH 138/183] Support Roberta embedding models (#9387) Signed-off-by: Max de Bayser Signed-off-by: Flavia Beo Co-authored-by: Flavia Beo --- csrc/attention/paged_attention_v1.cu | 3 + csrc/attention/paged_attention_v2.cu | 3 + csrc/cpu/attention.cpp | 6 + .../test_model_load_with_params.py | 44 +++++++ .../embedding/language/test_embedding.py | 2 + vllm/attention/ops/ipex_attn.py | 2 +- vllm/attention/ops/paged_attn.py | 2 +- vllm/model_executor/models/bert.py | 35 ++++-- vllm/model_executor/models/registry.py | 2 + vllm/model_executor/models/roberta.py | 117 ++++++++++++++++++ 10 files changed, 202 insertions(+), 14 deletions(-) create mode 100644 vllm/model_executor/models/roberta.py diff --git a/csrc/attention/paged_attention_v1.cu b/csrc/attention/paged_attention_v1.cu index 8b99f0843aaf6..741cd0c82dc89 100644 --- a/csrc/attention/paged_attention_v1.cu +++ b/csrc/attention/paged_attention_v1.cu @@ -98,6 +98,9 @@ void paged_attention_v1_launcher( // NOTE(woosuk): To reduce the compilation time, we only compile for the // head sizes that we use in the model. However, we can easily extend this // to support any head size which is a multiple of 16. + case 32: + LAUNCH_PAGED_ATTENTION_V1(32); + break; case 64: LAUNCH_PAGED_ATTENTION_V1(64); break; diff --git a/csrc/attention/paged_attention_v2.cu b/csrc/attention/paged_attention_v2.cu index 3a7a9dee916aa..6de8d0bdd5b8d 100644 --- a/csrc/attention/paged_attention_v2.cu +++ b/csrc/attention/paged_attention_v2.cu @@ -104,6 +104,9 @@ void paged_attention_v2_launcher( // NOTE(woosuk): To reduce the compilation time, we only compile for the // head sizes that we use in the model. However, we can easily extend this // to support any head size which is a multiple of 16. 
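+    // Head size 32 is needed by some of the smaller encoder-only
+    // embedding models added in this change.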
+ case 32: + LAUNCH_PAGED_ATTENTION_V2(32); + break; case 64: LAUNCH_PAGED_ATTENTION_V2(64); break; diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index e3953c7c45719..e73eca1b345fd 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -385,6 +385,9 @@ void paged_attention_v1_impl_launcher( int* seq_lens_ptr = seq_lens.data_ptr(); switch (head_size) { + case 32: + LAUNCH_V1_ATTENTION_KERNEL(T, 32, BLOCK_SIZE); + break; case 64: LAUNCH_V1_ATTENTION_KERNEL(T, 64, BLOCK_SIZE); break; @@ -702,6 +705,9 @@ void paged_attention_v2_impl_launcher( int* seq_lens_ptr = seq_lens.data_ptr(); switch (head_size) { + case 32: + LAUNCH_V2_ATTENTION_KERNEL(T, 32, BLOCK_SIZE); + break; case 64: LAUNCH_V2_ATTENTION_KERNEL(T, 64, BLOCK_SIZE); break; diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index 7e5e2780d3916..ed321ba9f00c1 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -4,12 +4,17 @@ from vllm.model_executor.layers.pooler import PoolingType from vllm.model_executor.models.bert import BertEmbeddingModel +from vllm.model_executor.models.roberta import RobertaEmbeddingModel from vllm.platforms import current_platform MAX_MODEL_LEN = 128 MODEL_NAME = os.environ.get("MODEL_NAME", "BAAI/bge-base-en-v1.5") REVISION = os.environ.get("REVISION", "main") +MODEL_NAME_ROBERTA = os.environ.get("MODEL_NAME", + "intfloat/multilingual-e5-large") +REVISION_ROBERTA = os.environ.get("REVISION", "main") + @pytest.mark.skipif(current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm.") @@ -48,3 +53,42 @@ def test_model_loading_with_params(vllm_runner): assert model._pooler.normalize # assert output assert output + + +@pytest.mark.skipif(current_platform.is_rocm(), + reason="Xformers backend is not supported on ROCm.") +def test_roberta_model_loading_with_params(vllm_runner): + """ + Test parameter weight loading with tp>1. 
+ """ + with vllm_runner(model_name=MODEL_NAME_ROBERTA, + revision=REVISION_ROBERTA, + dtype="float16", + max_model_len=MAX_MODEL_LEN) as model: + output = model.encode("Write a short story about a robot that" + " dreams for the first time.\n") + + model_config = model.model.llm_engine.model_config + + model_tokenizer = model.model.llm_engine.tokenizer + + # asserts on the bert model config file + assert model_config.encoder_config["max_seq_length"] == 512 + assert not model_config.encoder_config["do_lower_case"] + + # asserts on the pooling config files + assert model_config.pooler_config.pooling_type == PoolingType.MEAN.name + assert model_config.pooler_config.pooling_norm + + # asserts on the tokenizer loaded + assert model_tokenizer.tokenizer_id == "intfloat/multilingual-e5-large" + assert not model_tokenizer.tokenizer_config["do_lower_case"] + + model = model.model.llm_engine.model_executor\ + .driver_worker.model_runner.model + assert isinstance(model, RobertaEmbeddingModel) + assert model._pooler.pooling_type == PoolingType.MEAN + assert model._pooler.normalize + + # assert output + assert output diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index cd920aec6502e..fcdd684168d04 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -13,10 +13,12 @@ "intfloat/e5-mistral-7b-instruct", "BAAI/bge-base-en-v1.5", "BAAI/bge-multilingual-gemma2", + "intfloat/multilingual-e5-large", ] ENCODER_ONLY = [ "BAAI/bge-base-en-v1.5", + "intfloat/multilingual-e5-large", ] diff --git a/vllm/attention/ops/ipex_attn.py b/vllm/attention/ops/ipex_attn.py index 6b270ffd5bc00..8df6d4ced9dc6 100644 --- a/vllm/attention/ops/ipex_attn.py +++ b/vllm/attention/ops/ipex_attn.py @@ -10,7 +10,7 @@ class PagedAttention: @staticmethod def get_supported_head_sizes() -> List[int]: - return [64, 80, 96, 112, 128, 256] + return [32, 64, 80, 96, 112, 128, 256] @staticmethod def get_kv_cache_shape( diff --git a/vllm/attention/ops/paged_attn.py b/vllm/attention/ops/paged_attn.py index 92023d5b75f5a..076f151ffcb61 100644 --- a/vllm/attention/ops/paged_attn.py +++ b/vllm/attention/ops/paged_attn.py @@ -34,7 +34,7 @@ class PagedAttention: @staticmethod def get_supported_head_sizes() -> List[int]: - return [64, 80, 96, 112, 120, 128, 192, 256] + return [32, 64, 80, 96, 112, 120, 128, 192, 256] @staticmethod def get_kv_cache_shape( diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 7dbc7fa0aaba4..42dd6119e76f1 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -5,7 +5,7 @@ from transformers import BertConfig from vllm.attention import Attention, AttentionMetadata, AttentionType -from vllm.config import CacheConfig, VllmConfig +from vllm.config import CacheConfig, PoolerConfig, VllmConfig from vllm.distributed import get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -305,14 +305,16 @@ def forward(self, hidden_states: torch.Tensor, class BertModel(nn.Module): - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + def __init__(self, + *, + vllm_config: VllmConfig, + prefix: str = "", + embedding_class: type = BertEmbedding): super().__init__() - config = vllm_config.model_config.hf_config cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config - - self.embeddings = 
BertEmbedding(config) + self.embeddings = embedding_class(config) self.encoder = BertEncoder(config, cache_config, quant_config, @@ -382,13 +384,9 @@ class BertEmbeddingModel(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() pooler_config = vllm_config.model_config.pooler_config - self.model = BertModel(vllm_config=vllm_config, - prefix=maybe_prefix(prefix, "model")) - self._pooler = Pooler.from_config_with_defaults( - pooler_config, - pooling_type=PoolingType.CLS, - normalize=True, - softmax=False) + self.model = self._build_model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + self._pooler = self._build_pooler(pooler_config) def forward( self, @@ -415,3 +413,16 @@ def pooler( def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): self.model.load_weights(weights) + + def _build_model(self, + vllm_config: VllmConfig, + prefix: str = "") -> BertModel: + return BertModel(vllm_config=vllm_config, + prefix=prefix, + embedding_class=BertEmbedding) + + def _build_pooler(self, pooler_config: PoolerConfig) -> Pooler: + return Pooler.from_config_with_defaults(pooler_config, + pooling_type=PoolingType.CLS, + normalize=True, + softmax=False) diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index f172c06c4a26a..f22d1b04ebf09 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -94,6 +94,8 @@ _EMBEDDING_MODELS = { # [Text-only] "BertModel": ("bert", "BertEmbeddingModel"), + "RobertaModel": ("roberta", "RobertaEmbeddingModel"), + "XLMRobertaModel": ("roberta", "RobertaEmbeddingModel"), "DeciLMForCausalLM": ("decilm", "DeciLMForCausalLM"), "Gemma2Model": ("gemma2", "Gemma2EmbeddingModel"), "LlamaModel": ("llama", "LlamaEmbeddingModel"), diff --git a/vllm/model_executor/models/roberta.py b/vllm/model_executor/models/roberta.py new file mode 100644 index 0000000000000..c1dcdd36ec3de --- /dev/null +++ b/vllm/model_executor/models/roberta.py @@ -0,0 +1,117 @@ +from typing import List, Optional + +import torch +from torch import nn +from transformers import RobertaConfig + +from vllm.attention import AttentionMetadata +from vllm.config import VllmConfig +from vllm.model_executor.layers.vocab_parallel_embedding import ( + VocabParallelEmbedding) +from vllm.model_executor.models.bert import BertEmbeddingModel, BertModel +from vllm.sequence import IntermediateTensors + + +class RobertaEmbedding(nn.Module): + + def __init__(self, config: RobertaConfig): + super().__init__() + self.size = config.hidden_size + self.word_embeddings = VocabParallelEmbedding(config.vocab_size, + config.hidden_size) + self.padding_idx = config.pad_token_id + self.position_embeddings = nn.Embedding(config.max_position_embeddings, + config.hidden_size, + padding_idx=self.padding_idx) + + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, + config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, + eps=config.layer_norm_eps) + self.position_ids = nn.Parameter( + torch.empty((1, config.max_position_embeddings)), ) + + self.position_embedding_type = config.position_embedding_type + if self.position_embedding_type != "absolute": + raise ValueError("Only 'absolute' position_embedding_type" + + " is supported") + + def forward( + self, + input_ids: torch.Tensor, + position_ids: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + input_shape = input_ids.size() + + # Input embeddings. 
+ inputs_embeds = self.word_embeddings(input_ids) + + # TODO: figure out if there is a better way + # to make to make position ids start at padding_idx + 1 + # References: + # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L133 + # - https://github.com/huggingface/transformers/blob/a3d69a8994d673899608a7c17fbf4f953f50474e/src/transformers/models/roberta/modeling_roberta.py#L1669 + position_ids += self.padding_idx + 1 + + # Position embeddings. + position_embeddings = self.position_embeddings(position_ids) + + # Token type embeddings. (TODO: move off hotpath?) + token_type_embeddings = self.token_type_embeddings( + torch.zeros(input_shape, + dtype=torch.long, + device=inputs_embeds.device)) + + embeddings = inputs_embeds + token_type_embeddings + position_embeddings + embeddings = self.LayerNorm(embeddings) + return embeddings + + +class RobertaEmbeddingModel(BertEmbeddingModel): + """A model that uses Roberta to provide embedding functionalities. + + This class encapsulates the BertModel and provides an interface for + embedding operations and customized pooling functions. + + Attributes: + model: An instance of BertModel used for forward operations. + _pooler: An instance of Pooler used for pooling operations. + """ + + def _build_model(self, + vllm_config: VllmConfig, + prefix: str = "") -> BertModel: + return BertModel(vllm_config=vllm_config, + prefix=prefix, + embedding_class=RobertaEmbedding) + + def forward( + self, + input_ids: Optional[torch.Tensor], + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + + # Verify assumption that position are always a sequence from + # 0 to N. (Actually here we just check 0 and N to simplify). + # This is important to fix the position which are assumed to + # start from padding_idx + 1 instead of 0 in the Roberta models. 
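+        # seq_lens_tensor holds the length of each sequence in the flattened
+        # batch; its cumulative sum gives every sequence's start offset, so the
+        # checks below only touch the first and last position of each sequence.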
+ assert hasattr(attn_metadata, "seq_lens_tensor") + cumulative = attn_metadata.seq_lens_tensor.cumsum(dim=0) + start_pos = torch.cat( + (torch.tensor([0], device=attn_metadata.seq_lens_tensor.device), + cumulative[:-1])) + assert len(torch.nonzero(positions[start_pos])) == 0 + end_pos = cumulative - 1 + last_tokens = attn_metadata.seq_lens_tensor - 1 + assert len(torch.nonzero(positions[end_pos] - last_tokens)) == 0 + + return super().forward(input_ids=input_ids, + positions=positions, + kv_caches=kv_caches, + attn_metadata=attn_metadata, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds) From b2e0ad3b598ed0e022cdbd678a20821d411873c2 Mon Sep 17 00:00:00 2001 From: Murali Andoorveedu <37849411+andoorve@users.noreply.github.com> Date: Thu, 14 Nov 2024 16:38:20 -0800 Subject: [PATCH 139/183] [Perf] Reduce peak memory usage of llama (#10339) Signed-off-by: andoorve <37849411+andoorve@users.noreply.github.com> --- vllm/model_executor/models/llama.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 8aed0fead18f9..e53631ef19f31 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -90,8 +90,8 @@ def __init__( self.act_fn = SiluAndMul() def forward(self, x): - gate_up, _ = self.gate_up_proj(x) - x = self.act_fn(gate_up) + x, _ = self.gate_up_proj(x) + x = self.act_fn(x) x, _ = self.down_proj(x) return x From 554af9228df620a63d4736240a8f76a64a675f4d Mon Sep 17 00:00:00 2001 From: Zijin Xiao Date: Fri, 15 Nov 2024 08:38:53 +0800 Subject: [PATCH 140/183] [Bugfix] use AF_INET6 for OpenAI Compatible Server with ipv6 (#9583) Signed-off-by: xiaozijin --- vllm/entrypoints/openai/api_server.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 6a24cdbc6a18f..b13f6a228b4c6 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -12,7 +12,7 @@ from contextlib import asynccontextmanager from functools import partial from http import HTTPStatus -from typing import AsyncIterator, Optional, Set +from typing import AsyncIterator, Optional, Set, Tuple import uvloop from fastapi import APIRouter, FastAPI, Request @@ -57,7 +57,8 @@ from vllm.entrypoints.openai.tool_parsers import ToolParserManager from vllm.logger import init_logger from vllm.usage.usage_lib import UsageContext -from vllm.utils import FlexibleArgumentParser, get_open_zmq_ipc_path +from vllm.utils import (FlexibleArgumentParser, get_open_zmq_ipc_path, + is_valid_ipv6_address) from vllm.version import __version__ as VLLM_VERSION if envs.VLLM_USE_V1: @@ -568,6 +569,18 @@ def init_app_state( ) +def create_server_socket(addr: Tuple[str, int]) -> socket.socket: + family = socket.AF_INET + if is_valid_ipv6_address(addr[0]): + family = socket.AF_INET6 + + sock = socket.socket(family=family, type=socket.SOCK_STREAM) + sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + sock.bind(addr) + + return sock + + async def run_server(args, **uvicorn_kwargs) -> None: logger.info("vLLM API server version %s", VLLM_VERSION) logger.info("args: %s", args) @@ -584,9 +597,8 @@ async def run_server(args, **uvicorn_kwargs) -> None: # workaround to make sure that we bind the port before the engine is set up. # This avoids race conditions with ray. 
# see https://github.com/vllm-project/vllm/issues/8204 - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.bind((args.host or "", args.port)) - sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + sock_addr = (args.host or "", args.port) + sock = create_server_socket(sock_addr) def signal_handler(*_) -> None: # Interrupt server on sigterm while initializing From 11cd1ae6ad6fa7d35060fea35133e08c0a1c227c Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Fri, 15 Nov 2024 01:42:49 +0100 Subject: [PATCH 141/183] [Tool parsing] Improve / correct mistral tool parsing (#10333) --- .../decoder_only/language/test_mistral.py | 93 ++++++++++++++++--- vllm/entrypoints/openai/serving_chat.py | 39 +------- .../tool_parsers/mistral_tool_parser.py | 25 +++-- .../transformers_utils/tokenizers/__init__.py | 4 +- vllm/transformers_utils/tokenizers/mistral.py | 70 +++++++++++++- 5 files changed, 172 insertions(+), 59 deletions(-) diff --git a/tests/models/decoder_only/language/test_mistral.py b/tests/models/decoder_only/language/test_mistral.py index 6ec4b7e7e3f71..99b5d5694f9f7 100644 --- a/tests/models/decoder_only/language/test_mistral.py +++ b/tests/models/decoder_only/language/test_mistral.py @@ -2,9 +2,13 @@ Run `pytest tests/models/test_mistral.py`. """ +import copy + import pytest from vllm import SamplingParams +from vllm.entrypoints.openai.tool_parsers.mistral_tool_parser import ( # noqa + MistralToolParser) from ...utils import check_logprobs_close @@ -58,17 +62,69 @@ }, "required": ["city", "state", "unit"] } + }, +}, { + "type": "function", + "function": { + "name": "rewrite", + "description": "Rewrites text", + "parameters": { + "type": "object", + "required": [], + "properties": { + "text": { + "type": "string", + "description": "The input text to rewrite." + } + } + } } }] -MSGS = [{ - "role": - "user", - "content": ("Can you tell me what the temperate" - " will be in Dallas, in fahrenheit?") -}] -EXPECTED_FUNC_CALL = ( - '[{"name": "get_current_weather", "arguments": ' - '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}}]') +MSGS = [ + { + "role": "system", + "content": "You are an assistant." + }, + { + "role": + "user", + "content": + "Could you please rewrite the below article? \n\n My English needs improvving, maybe I make errors." 
# noqa + }, + { + "role": + "assistant", + "content": + "", + "tool_calls": [{ + "id": "bbc5b7ede", + "type": "function", + "function": { + "name": + "rewrite", + "arguments": + '{\"text\":\"My English needs improvving, maybe I make errors.\"}' # noqa + } + }] + }, + { + "role": "tool", + "content": + "{\"action\":\"rewrite\",\"outcome\":\"My English needs improving, maybe I make errors.\"}", # noqa + "tool_call_id": "bbc5b7ede", + "name": "rewrite" + }, + { + "role": "assistant", + "content": "---\n\nMy English needs improving, maybe I make errors" + }, + { + "role": + "user", + "content": ("Can you tell me what the temperate" + " will be in Dallas, in fahrenheit?") + } +] @pytest.mark.parametrize("model", MODELS) @@ -175,8 +231,23 @@ def test_mistral_function_calling( tokenizer_mode="mistral", config_format="mistral", load_format="mistral") as vllm_model: - outputs = vllm_model.model.chat(MSGS, + + msgs = copy.deepcopy(MSGS) + outputs = vllm_model.model.chat(msgs, tools=TOOLS, sampling_params=SAMPLING_PARAMS) - assert outputs[0].outputs[0].text.strip() == EXPECTED_FUNC_CALL + tokenizer = vllm_model.model.get_tokenizer() + tool_parser = MistralToolParser(tokenizer) + + model_output = outputs[0].outputs[0].text.strip() + assert model_output.startswith(tool_parser.bot_token), model_output + parsed_message = tool_parser.extract_tool_calls(model_output, None) + + assert parsed_message.tools_called + assert parsed_message.tool_calls[0].id == "0UAqFzWsD" + assert parsed_message.tool_calls[ + 0].function.name == "get_current_weather" + assert parsed_message.tool_calls[ + 0].function.arguments == '{"city": "Dallas", "state": "TX", "unit": "fahrenheit"}' # noqa + assert parsed_message.content is None diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 5178481c737b4..77cae00ae827f 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -30,6 +30,7 @@ from vllm.sampling_params import BeamSearchParams, SamplingParams from vllm.sequence import Logprob from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer +from vllm.transformers_utils.tokenizers import maybe_serialize_tool_calls from vllm.utils import iterate_with_cancellation logger = init_logger(__name__) @@ -127,41 +128,11 @@ async def create_chat_completion( return self.create_error_response( "tool_choice = \"required\" is not supported!") - # NOTE: There is currently a bug in pydantic where attributes - # declared as iterables are replaced in in the instances by - # pydantic-core ValidatorIterator instance. In particular, this - # affects tool_calls defined in ChatCompletionAssistantMessageParam - # model: - # see: - # - https://github.com/pydantic/pydantic/issues/9467 - # As a result, tool_calls from assistant messages are never - # deserialized in the request object if the tool_calls iterator is - # not consumed. This affect messages passed to the MistralTokenizer - # since no chat template is applied and therefore the tools_calls - # iterator is not directly consumed. - # Issue is tracked on Pydantic side, with resolution planned for - # v2.11 release. 
In the meantime, the official workaround is to - # consume the iterator so the tool_calls are correctly deserialized - # in the OpenAI ChatCompletionAssistantMessageParam object - # https://github.com/pydantic/pydantic/issues/9467#issuecomment-2442097291 # noqa: E501 - # Official Pydantic Issues: - # - https://github.com/pydantic/pydantic/issues/9541 - # TODO: remove when pydantic v2.11 is released + # because of issues with pydantic we need to potentially + # re-serialize the tool_calls field of the request + # for more info: see comment in `maybe_serialize_tool_calls` if isinstance(tokenizer, MistralTokenizer): - for i, message in enumerate(request.messages): - if message.get("role") == 'assistant': - tool_calls_validator = message.get( - "tool_calls", ().__iter__()) - validated_tool_calls = [] - while True: - try: - tool_call = next( - tool_calls_validator) # type: ignore - validated_tool_calls.append(tool_call) - except StopIteration: - break - request.messages[i][ - "tool_calls"] = validated_tool_calls + maybe_serialize_tool_calls(request) if (request.tool_choice == "auto" and not (self.enable_auto_tools and tool_parser is not None) diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py index f5c0d92f3f9bd..5caac84138e3b 100644 --- a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py @@ -62,7 +62,7 @@ def __init__(self, tokenizer: AnyTokenizer): ] # map what has been streamed for each tool so far to a list self.bot_token = "[TOOL_CALLS]" self.bot_token_id = self.vocab.get(self.bot_token) - self.tool_call_regex = re.compile(r"\[{.*?}\]", re.DOTALL) + self.tool_call_regex = re.compile(r"\[{.*}\]", re.DOTALL) if self.bot_token_id is None: raise RuntimeError( "Mistral Tool Parser could not locate the tool call token in " @@ -84,16 +84,25 @@ def extract_tool_calls( return ExtractedToolCallInformation(tools_called=False, tool_calls=[], content=model_output) + + # first remove the BOT token + tool_content = model_output.replace(self.bot_token, "").strip() + try: - # use a regex to find the tool call. remove the BOT token - # and make sure to replace single quotes with double quotes - raw_tool_call = self.tool_call_regex.findall( - model_output.replace(self.bot_token, ""))[0] + # we first try to directly load the json as parsing very nested + # jsons is difficult + try: + function_call_arr = json.loads(tool_content) + except json.JSONDecodeError: + # use a regex to find the part corresponding to the tool call. + # NOTE: This use case should not happen if the model is trained + # correctly. 
It's a easy possible fix so it's included, but + # can be brittle for very complex / highly nested tool calls + raw_tool_call = self.tool_call_regex.findall(tool_content)[0] + function_call_arr = json.loads(raw_tool_call) - # load the JSON, and then use it to build the Function and # Tool Call - function_call_arr = json.loads(raw_tool_call) tool_calls: List[MistralToolCall] = [ MistralToolCall( type="function", @@ -116,7 +125,7 @@ def extract_tool_calls( # return information to just treat the tool call as regular JSON return ExtractedToolCallInformation(tools_called=False, tool_calls=[], - content=model_output) + content=tool_content) def extract_tool_calls_streaming( self, diff --git a/vllm/transformers_utils/tokenizers/__init__.py b/vllm/transformers_utils/tokenizers/__init__.py index 5f437d414e181..e68ad79b296b8 100644 --- a/vllm/transformers_utils/tokenizers/__init__.py +++ b/vllm/transformers_utils/tokenizers/__init__.py @@ -1,3 +1,3 @@ -from .mistral import MistralTokenizer +from .mistral import MistralTokenizer, maybe_serialize_tool_calls -__all__ = ["MistralTokenizer"] +__all__ = ["MistralTokenizer", "maybe_serialize_tool_calls"] diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 1b273c6b120ea..b1cb9a15b943b 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -7,6 +7,7 @@ import huggingface_hub from huggingface_hub import HfApi, hf_hub_download from mistral_common.protocol.instruct.request import ChatCompletionRequest +from mistral_common.tokens.tokenizers.base import SpecialTokens # yapf: disable from mistral_common.tokens.tokenizers.mistral import ( MistralTokenizer as PublicMistralTokenizer) @@ -29,6 +30,43 @@ class Encoding: input_ids: List[int] +def maybe_serialize_tool_calls(request: ChatCompletionRequest): + # SEE: https://github.com/vllm-project/vllm/pull/9951 + # Credits go to: @gcalmettes + # NOTE: There is currently a bug in pydantic where attributes + # declared as iterables are replaced in in the instances by + # pydantic-core ValidatorIterator instance. In particular, this + # affects tool_calls defined in ChatCompletionAssistantMessageParam + # model: + # see: + # - https://github.com/pydantic/pydantic/issues/9467 + # As a result, tool_calls from assistant messages are never + # deserialized in the request object if the tool_calls iterator is + # not consumed. This affect messages passed to the MistralTokenizer + # since no chat template is applied and therefore the tools_calls + # iterator is not directly consumed. + # Issue is tracked on Pydantic side, with resolution planned for + # v2.11 release. 
In the meantime, the official workaround is to + # consume the iterator so the tool_calls are correctly deserialized + # in the OpenAI ChatCompletionAssistantMessageParam object + # https://github.com/pydantic/pydantic/issues/9467#issuecomment-2442097291 # noqa: E501 + # Official Pydantic Issues: + # - https://github.com/pydantic/pydantic/issues/9541 + # TODO: remove when pydantic v2.11 is released + for i, message in enumerate(request.messages): + if message.get("role") == 'assistant': + tool_calls_validator = message.get("tool_calls", ().__iter__()) + validated_tool_calls = [] + while True: + try: + tool_call = next(tool_calls_validator) # type: ignore + validated_tool_calls.append(tool_call) + except StopIteration: + break + + request.messages[i]["tool_calls"] = validated_tool_calls + + def list_local_repo_files(repo_id: str, revision: Optional[str]) -> List[str]: repo_cache = os.path.join( huggingface_hub.constants.HF_HUB_CACHE, @@ -222,7 +260,8 @@ def convert_tokens_to_string(self, tokens: List[str]) -> str: if self.is_tekken: tokens = [ t for t in tokens - if t not in self.tokenizer._all_special_tokens + if (t is SpecialTokens.tool_calls + or t not in self.tokenizer._all_special_tokens) ] if any(isinstance(t, bytes) for t in tokens): @@ -246,7 +285,27 @@ def _token_to_id(t: str): else: decoded = "".join(tokens) else: - decoded = self.tokenizer.decode(tokens) # type: ignore[arg-type] + # make sure certain special tokens like Tool calls are + # not decoded + special_tokens = {SpecialTokens.tool_calls} + regular_tokens: List[str] = [] + decoded_list = [] + + for token in tokens: + if token in special_tokens: + if regular_tokens: + decoded_list.append( + self.tokenizer.decode(regular_tokens)) + regular_tokens = [] + decoded_list.append(token) + else: + regular_tokens.append(token) + + if regular_tokens: + decoded_list.append( + self.decode(regular_tokens)) # type: ignore + + decoded = ''.join(decoded_list) return decoded @@ -274,8 +333,11 @@ def convert_ids_to_tokens( assert self.is_tekken or self.is_spm, type(self.tokenizer) if self.is_tekken: - # skip special tokens - ids = [i for i in ids if i > self.tokenizer.num_special_tokens] + # skip special tokens except tool call + ids = [ + i for i in ids if i > self.tokenizer.num_special_tokens or i == + self.tokenizer.get_control_token(SpecialTokens.tool_calls) + ] tokens = [self.tokenizer.id_to_piece(id) for id in ids] From 972112d82f00e1396c0376cde78c083208b77127 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 15 Nov 2024 08:55:54 +0800 Subject: [PATCH 142/183] [Bugfix] Fix unable to load some models (#10312) Signed-off-by: DarkLight1337 --- .buildkite/test-pipeline.yaml | 5 +- tests/distributed/test_pipeline_parallel.py | 6 +- tests/models/registry.py | 212 +++++++++++++++++++ tests/models/test_initialization.py | 55 +++++ tests/models/test_registry.py | 10 + vllm/config.py | 36 +++- vllm/engine/arg_utils.py | 8 +- vllm/entrypoints/llm.py | 8 +- vllm/model_executor/models/fuyu.py | 7 +- vllm/model_executor/models/internlm2_ve.py | 8 +- vllm/model_executor/models/minicpmv.py | 32 +-- vllm/model_executor/models/mlp_speculator.py | 2 +- vllm/model_executor/models/registry.py | 8 +- 13 files changed, 339 insertions(+), 58 deletions(-) create mode 100644 tests/models/registry.py create mode 100644 tests/models/test_initialization.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index fbaa427bb7270..baad54eaf6a91 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -313,14 +313,15 
@@ steps: ##### models test ##### -- label: Basic Models Test # 10min +- label: Basic Models Test # 30min source_file_dependencies: - vllm/ - tests/models commands: - pip install -e ./plugins/vllm_add_dummy_model - pytest -v -s models/test_oot_registration.py # it needs a clean process - - pytest -v -s models/*.py --ignore=models/test_oot_registration.py + - pytest -v -s models/test_registry.py + - pytest -v -s models/test_initialization.py - label: Decoder-only Language Models Test (Standard) # 18min #mirror_hardwares: [amd] diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 5d566f8308b70..c49ed9802cde8 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -166,14 +166,14 @@ def iter_params(self, model_name: str): "mistralai/Mixtral-8x7B-Instruct-v0.1": PPTestSettings.fast(tp_base=4), "mosaicml/mpt-7b": PPTestSettings.fast(), "nvidia/Minitron-8B-Base": PPTestSettings.fast(), - "allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(), "allenai/OLMo-1B-hf": PPTestSettings.fast(), + "allenai/OLMoE-1B-7B-0924-Instruct": PPTestSettings.fast(), "facebook/opt-iml-max-1.3b": PPTestSettings.fast(), "OrionStarAI/Orion-14B-Chat": PPTestSettings.fast(trust_remote_code=True), + "adept/persimmon-8b-chat": PPTestSettings.fast(), "microsoft/phi-2": PPTestSettings.fast(), - "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True, load_format="dummy", hf_overrides='{"num_hidden_layers": 4, "hidden_size": 512, "intermediate_size": 800, "num_attention_heads": 4, "num_key_value_heads": 1}'), # noqa: E501 "microsoft/Phi-3-small-8k-instruct": PPTestSettings.fast(trust_remote_code=True), # noqa: E501 - "adept/persimmon-8b-chat": PPTestSettings.fast(), + "microsoft/Phi-3.5-MoE-instruct": PPTestSettings.detailed(trust_remote_code=True, multi_node_only=True, load_format="dummy", hf_overrides='{"num_hidden_layers": 4, "hidden_size": 512, "intermediate_size": 800, "num_attention_heads": 4, "num_key_value_heads": 1}'), # noqa: E501 "Qwen/Qwen-7B-Chat": PPTestSettings.fast(trust_remote_code=True), "Qwen/Qwen2-7B-Instruct": PPTestSettings.fast(), "Qwen/Qwen1.5-MoE-A2.7B-Chat": PPTestSettings.fast(), diff --git a/tests/models/registry.py b/tests/models/registry.py new file mode 100644 index 0000000000000..ec9ff52d112df --- /dev/null +++ b/tests/models/registry.py @@ -0,0 +1,212 @@ +from dataclasses import dataclass, field +from typing import AbstractSet, Mapping, Optional + + +@dataclass(frozen=True) +class _HfExamplesInfo: + default: str + """The default model to use for testing this architecture.""" + + extras: Mapping[str, str] = field(default_factory=dict) + """Extra models to use for testing this architecture.""" + + tokenizer: Optional[str] = None + """Set the tokenizer to load for this architecture.""" + + tokenizer_mode: str = "auto" + """Set the tokenizer type for this architecture.""" + + speculative_model: Optional[str] = None + """ + The default model to use for testing this architecture, which is only used + for speculative decoding. + """ + + is_available_online: bool = True + """ + Set this to ``False`` if the name of this architecture no longer exists on + the HF repo. To maintain backwards compatibility, we have not removed them + from the main model registry, so without this flag the registry tests will + fail. 
+ """ + + trust_remote_code: bool = False + """The ``trust_remote_code`` level required to load the model.""" + + +# yapf: disable +_TEXT_GENERATION_EXAMPLE_MODELS = { + # [Decoder-only] + "AquilaModel": _HfExamplesInfo("BAAI/AquilaChat-7B", + trust_remote_code=True), + "AquilaForCausalLM": _HfExamplesInfo("BAAI/AquilaChat2-7B", + trust_remote_code=True), + "ArcticForCausalLM": _HfExamplesInfo("Snowflake/snowflake-arctic-instruct", + trust_remote_code=True), + "BaiChuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan-7B", + trust_remote_code=True), + "BaichuanForCausalLM": _HfExamplesInfo("baichuan-inc/Baichuan2-7B-chat", + trust_remote_code=True), + "BloomForCausalLM": _HfExamplesInfo("bigscience/bloomz-1b1"), + # ChatGLMModel supports multimodal + "CohereForCausalLM": _HfExamplesInfo("CohereForAI/c4ai-command-r-v01", + trust_remote_code=True), + "DbrxForCausalLM": _HfExamplesInfo("databricks/dbrx-instruct"), + "DeciLMForCausalLM": _HfExamplesInfo("Deci/DeciLM-7B-instruct", + trust_remote_code=True), + "DeepseekForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-llm-7b-chat"), + "DeepseekV2ForCausalLM": _HfExamplesInfo("deepseek-ai/DeepSeek-V2-Lite-Chat", # noqa: E501 + trust_remote_code=True), + "ExaoneForCausalLM": _HfExamplesInfo("LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"), # noqa: E501 + "FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"), + "GemmaForCausalLM": _HfExamplesInfo("google/gemma-2b"), + "Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"), + "GPT2LMHeadModel": _HfExamplesInfo("gpt2"), + "GPTBigCodeForCausalLM": _HfExamplesInfo("bigcode/starcoder"), + "GPTJForCausalLM": _HfExamplesInfo("EleutherAI/gpt-j-6b"), + "GPTNeoXForCausalLM": _HfExamplesInfo("EleutherAI/pythia-160m"), + "GraniteForCausalLM": _HfExamplesInfo("ibm/PowerLM-3b"), + "GraniteMoeForCausalLM": _HfExamplesInfo("ibm/PowerMoE-3b"), + "InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b", + trust_remote_code=True), + "InternLM2ForCausalLM": _HfExamplesInfo("internlm/internlm2-chat-7b", + trust_remote_code=True), + "InternLM2VEForCausalLM": _HfExamplesInfo("OpenGVLab/Mono-InternVL-2B", + trust_remote_code=True), + "JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"), + "JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini"), + "LlamaForCausalLM": _HfExamplesInfo("meta-llama/Meta-Llama-3-8B"), + "LLaMAForCausalLM": _HfExamplesInfo("decapoda-research/llama-7b-hf", + is_available_online=False), + "MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"), + "FalconMambaForCausalLM": _HfExamplesInfo("tiiuae/falcon-mamba-7b-instruct"), # noqa: E501 + "MiniCPMForCausalLM": _HfExamplesInfo("openbmb/MiniCPM-2B-sft-bf16", + trust_remote_code=True), + "MiniCPM3ForCausalLM": _HfExamplesInfo("openbmb/MiniCPM3-4B", + trust_remote_code=True), + "MistralForCausalLM": _HfExamplesInfo("mistralai/Mistral-7B-Instruct-v0.1"), + "MixtralForCausalLM": _HfExamplesInfo("mistralai/Mixtral-8x7B-Instruct-v0.1"), # noqa: E501 + "QuantMixtralForCausalLM": _HfExamplesInfo("mistral-community/Mixtral-8x22B-v0.1-AWQ"), # noqa: E501 + "MptForCausalLM": _HfExamplesInfo("mpt", is_available_online=False), + "MPTForCausalLM": _HfExamplesInfo("mosaicml/mpt-7b"), + "NemotronForCausalLM": _HfExamplesInfo("nvidia/Minitron-8B-Base"), + "OlmoForCausalLM": _HfExamplesInfo("allenai/OLMo-1B-hf"), + "OlmoeForCausalLM": _HfExamplesInfo("allenai/OLMoE-1B-7B-0924-Instruct"), + "OPTForCausalLM": _HfExamplesInfo("facebook/opt-iml-max-1.3b"), + "OrionForCausalLM": _HfExamplesInfo("OrionStarAI/Orion-14B-Chat", + 
trust_remote_code=True), + "PersimmonForCausalLM": _HfExamplesInfo("adept/persimmon-8b-chat"), + "PhiForCausalLM": _HfExamplesInfo("microsoft/phi-2"), + "Phi3ForCausalLM": _HfExamplesInfo("microsoft/Phi-3-mini-4k-instruct"), + "Phi3SmallForCausalLM": _HfExamplesInfo("microsoft/Phi-3-small-8k-instruct", + trust_remote_code=True), + "PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct", + trust_remote_code=True), + # QWenLMHeadModel supports multimodal + "Qwen2ForCausalLM": _HfExamplesInfo("Qwen/Qwen2-7B-Instruct"), + "Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"), + "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b", + is_available_online=False), + "StableLMEpochForCausalLM": _HfExamplesInfo("stabilityai/stablelm-zephyr-3b", # noqa: E501 + is_available_online=False), + "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"), + "Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"), + "SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct"), + "XverseForCausalLM": _HfExamplesInfo("xverse/XVERSE-7B-Chat", + is_available_online=False, + trust_remote_code=True), + # [Encoder-decoder] + "BartModel": _HfExamplesInfo("facebook/bart-base"), + "BartForConditionalGeneration": _HfExamplesInfo("facebook/bart-large-cnn"), + # Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer + # Therefore, we borrow the BartTokenizer from the original Bart model + "Florence2ForConditionalGeneration": _HfExamplesInfo("microsoft/Florence-2-base", # noqa: E501 + tokenizer="facebook/bart-base", + trust_remote_code=True), # noqa: E501 +} + +_EMBEDDING_EXAMPLE_MODELS = { + # [Text-only] + "BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5"), + "Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2"), + "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"), + "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"), + "Qwen2ForSequenceClassification": _HfExamplesInfo("jason9693/Qwen2.5-1.5B-apeach"), # noqa: E501 + # [Multimodal] + "LlavaNextForConditionalGeneration": _HfExamplesInfo("royokong/e5-v"), + "Phi3VForCausalLM": _HfExamplesInfo("TIGER-Lab/VLM2Vec-Full", + trust_remote_code=True), + "Qwen2VLForConditionalGeneration": _HfExamplesInfo("MrLight/dse-qwen2-2b-mrl-v1"), # noqa: E501 +} + +_MULTIMODAL_EXAMPLE_MODELS = { + # [Decoder-only] + "Blip2ForConditionalGeneration": _HfExamplesInfo("Salesforce/blip2-opt-2.7b"), # noqa: E501 + "ChameleonForConditionalGeneration": _HfExamplesInfo("facebook/chameleon-7b"), # noqa: E501 + "ChatGLMModel": _HfExamplesInfo("THUDM/glm-4v-9b", + extras={"text_only": "THUDM/chatglm3-6b"}, + trust_remote_code=True), + "ChatGLMForConditionalGeneration": _HfExamplesInfo("chatglm2-6b", + is_available_online=False), + "FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"), + "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m"), + "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B", + trust_remote_code=True), + "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3"), # noqa: E501 + "LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf", + extras={"mistral": "mistral-community/pixtral-12b"}), # noqa: E501 + "LlavaNextForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-v1.6-mistral-7b-hf"), # noqa: E501 + "LlavaNextVideoForConditionalGeneration": _HfExamplesInfo("llava-hf/LLaVA-NeXT-Video-7B-hf"), # noqa: E501 + "LlavaOnevisionForConditionalGeneration": 
_HfExamplesInfo("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501 + "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5", + trust_remote_code=True), + "MolmoForCausalLM": _HfExamplesInfo("allenai/Molmo-7B-D-0924", + trust_remote_code=True), + "NVLM_D": _HfExamplesInfo("nvidia/NVLM-D-72B", + trust_remote_code=True), + "PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-pt-224"), # noqa: E501 + "Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct", + trust_remote_code=True), + "PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409", # noqa: E501 + tokenizer_mode="mistral"), + "QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-VL-Chat", + extras={"text_only": "Qwen/Qwen-7B-Chat"}, # noqa: E501 + trust_remote_code=True), + "Qwen2AudioForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-Audio-7B-Instruct"), # noqa: E501 + "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501 + "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_3"), + # [Encoder-decoder] + "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501 +} + +_SPECULATIVE_DECODING_EXAMPLE_MODELS = { + "EAGLEModel": _HfExamplesInfo("JackFram/llama-68m", + speculative_model="abhigoyal/vllm-eagle-llama-68m-random"), # noqa: E501 + "MedusaModel": _HfExamplesInfo("JackFram/llama-68m", + speculative_model="abhigoyal/vllm-medusa-llama-68m-random"), # noqa: E501 + "MLPSpeculatorPreTrainedModel": _HfExamplesInfo("JackFram/llama-160m", + speculative_model="ibm-fms/llama-160m-accelerator"), # noqa: E501 +} + +_EXAMPLE_MODELS = { + **_TEXT_GENERATION_EXAMPLE_MODELS, + **_EMBEDDING_EXAMPLE_MODELS, + **_MULTIMODAL_EXAMPLE_MODELS, + **_SPECULATIVE_DECODING_EXAMPLE_MODELS, +} + + +class HfExampleModels: + def __init__(self, hf_models: Mapping[str, _HfExamplesInfo]) -> None: + super().__init__() + + self.hf_models = hf_models + + def get_supported_archs(self) -> AbstractSet[str]: + return self.hf_models.keys() + + def get_hf_info(self, model_arch: str) -> _HfExamplesInfo: + return self.hf_models[model_arch] + + +HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS) diff --git a/tests/models/test_initialization.py b/tests/models/test_initialization.py new file mode 100644 index 0000000000000..b8312c2d9b7cc --- /dev/null +++ b/tests/models/test_initialization.py @@ -0,0 +1,55 @@ +from unittest.mock import patch + +import pytest +import transformers +from transformers import PretrainedConfig + +from vllm import LLM + +from .registry import HF_EXAMPLE_MODELS + + +@pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs()) +def test_can_initialize(model_arch): + if (model_arch == "Idefics3ForConditionalGeneration" + and transformers.__version__ < "4.46.0"): + pytest.skip(reason="Model introduced in HF >= 4.46.0") + + model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch) + if not model_info.is_available_online: + pytest.skip("Model is not available online") + + # Avoid OOM + def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig: + if hasattr(hf_config, "text_config"): + text_config: PretrainedConfig = hf_config.text_config + else: + text_config = hf_config + + text_config.update({ + "num_layers": 1, + "num_hidden_layers": 1, + "num_experts": 2, + "num_experts_per_tok": 2, + "num_local_experts": 2, + }) + + return hf_config + + # Avoid calling model.forward() + def _initialize_kv_caches(self) -> None: + self.cache_config.num_gpu_blocks = 0 + 
self.cache_config.num_cpu_blocks = 0 + + with patch.object(LLM.get_engine_class(), "_initialize_kv_caches", + _initialize_kv_caches): + LLM( + model_info.default, + tokenizer=model_info.tokenizer, + tokenizer_mode=model_info.tokenizer_mode, + speculative_model=model_info.speculative_model, + num_speculative_tokens=1 if model_info.speculative_model else None, + trust_remote_code=model_info.trust_remote_code, + load_format="dummy", + hf_overrides=hf_overrides, + ) diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index a2194fa15f90e..dbc415796ee55 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -14,6 +14,7 @@ from vllm.platforms import current_platform from ..utils import fork_new_process_for_each_test +from .registry import HF_EXAMPLE_MODELS @pytest.mark.parametrize("model_arch", ModelRegistry.get_supported_archs()) @@ -73,3 +74,12 @@ def test_registry_is_pp(model_arch, is_pp, init_cuda): "This model no longer initializes CUDA on import. " "Please test using a different one.", stacklevel=2) + + +def test_hf_registry_coverage(): + untested_archs = (HF_EXAMPLE_MODELS.get_supported_archs() - + set(ModelRegistry.get_supported_archs())) + + assert not untested_archs, ( + "Please add the following architectures to " + f"`tests/models/registry.py`: {untested_archs}") diff --git a/vllm/config.py b/vllm/config.py index 002adb4316969..83b1483eb99e0 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3,8 +3,8 @@ import json import warnings from dataclasses import dataclass, field, replace -from typing import (TYPE_CHECKING, Any, ClassVar, Dict, Final, List, Literal, - Mapping, Optional, Set, Tuple, Type, Union) +from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Dict, Final, List, + Literal, Mapping, Optional, Set, Tuple, Type, Union) import torch from transformers import PretrainedConfig @@ -20,7 +20,7 @@ get_hf_text_config, get_pooling_config, get_sentence_transformer_tokenizer_config, is_encoder_decoder, uses_mrope) from vllm.utils import (GiB_bytes, cuda_device_count_stateless, get_cpu_memory, - print_warning_once) + identity, print_warning_once) if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup @@ -44,6 +44,9 @@ # "draft" is only used internally for speculative decoding _Task = Literal["generate", "embedding", "draft"] +HfOverrides = Union[Dict[str, Any], Callable[[PretrainedConfig], + PretrainedConfig]] + class ModelConfig: """Configuration for the model. @@ -115,7 +118,9 @@ class ModelConfig: can not be gathered from the vllm arguments. config_format: The config format which shall be loaded. Defaults to 'auto' which defaults to 'hf'. - hf_overrides: Arguments to be forwarded to the HuggingFace config. + hf_overrides: If a dictionary, contains arguments to be forwarded to the + HuggingFace config. If a callable, it is called to update the + HuggingFace config. mm_processor_kwargs: Arguments to be forwarded to the model's processor for multi-modal data, e.g., image processor. 
pooling_type: Used to configure the pooling method in the embedding @@ -164,7 +169,7 @@ def __init__( override_neuron_config: Optional[Dict[str, Any]] = None, config_format: ConfigFormat = ConfigFormat.AUTO, chat_template_text_format: str = "string", - hf_overrides: Optional[Dict[str, Any]] = None, + hf_overrides: Optional[HfOverrides] = None, mm_processor_kwargs: Optional[Dict[str, Any]] = None, pooling_type: Optional[str] = None, pooling_norm: Optional[bool] = None, @@ -182,15 +187,23 @@ def __init__( if hf_overrides is None: hf_overrides = {} + + if callable(hf_overrides): + hf_overrides_kw = {} + hf_overrides_fn = hf_overrides + else: + hf_overrides_kw = hf_overrides + hf_overrides_fn = identity + if rope_scaling is not None: hf_override: Dict[str, Any] = {"rope_scaling": rope_scaling} - hf_overrides.update(hf_override) + hf_overrides_kw.update(hf_override) msg = ("`--rope-scaling` will be removed in a future release. " f"'Please instead use `--hf-overrides '{hf_override!r}'`") warnings.warn(DeprecationWarning(msg), stacklevel=2) if rope_theta is not None: hf_override = {"rope_theta": rope_theta} - hf_overrides.update(hf_override) + hf_overrides_kw.update(hf_override) msg = ("`--rope-theta` will be removed in a future release. " f"'Please instead use `--hf-overrides '{hf_override!r}'`") warnings.warn(DeprecationWarning(msg), stacklevel=2) @@ -207,9 +220,12 @@ def __init__( self.max_logprobs = max_logprobs self.disable_sliding_window = disable_sliding_window self.skip_tokenizer_init = skip_tokenizer_init - self.hf_config = get_config(self.model, trust_remote_code, revision, - code_revision, config_format, - **hf_overrides) + + hf_config = get_config(self.model, trust_remote_code, revision, + code_revision, config_format, **hf_overrides_kw) + hf_config = hf_overrides_fn(hf_config) + self.hf_config = hf_config + self.hf_text_config = get_hf_text_config(self.hf_config) self.encoder_config = self._get_encoder_config() self.hf_image_processor_config = get_hf_image_processor_config( diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 31aa8c5908719..244aa09e12552 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -9,9 +9,9 @@ import vllm.envs as envs from vllm.config import (CacheConfig, ConfigFormat, DecodingConfig, - DeviceConfig, LoadConfig, LoadFormat, LoRAConfig, - ModelConfig, ObservabilityConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig, + DeviceConfig, HfOverrides, LoadConfig, LoadFormat, + LoRAConfig, ModelConfig, ObservabilityConfig, + ParallelConfig, PromptAdapterConfig, SchedulerConfig, SpeculativeConfig, TaskOption, TokenizerPoolConfig, VllmConfig) from vllm.executor.executor_base import ExecutorBase @@ -128,7 +128,7 @@ class EngineArgs: code_revision: Optional[str] = None rope_scaling: Optional[Dict[str, Any]] = None rope_theta: Optional[float] = None - hf_overrides: Optional[Dict[str, Any]] = None + hf_overrides: Optional[HfOverrides] = None tokenizer_revision: Optional[str] = None quantization: Optional[str] = None enforce_eager: Optional[bool] = None diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index a15dbd1c45119..63c2bb6097079 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -9,7 +9,7 @@ from vllm import envs from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, BeamSearchSequence, get_beam_search_score) -from vllm.engine.arg_utils import EngineArgs, TaskOption +from vllm.engine.arg_utils import EngineArgs, HfOverrides, TaskOption from vllm.engine.llm_engine import 
LLMEngine from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, apply_hf_chat_template, @@ -101,7 +101,9 @@ class LLM: disable_custom_all_reduce: See :class:`~vllm.config.ParallelConfig` disable_async_output_proc: Disable async output processing. This may result in lower performance. - hf_overrides: Arguments to be forwarded to the HuggingFace config. + hf_overrides: If a dictionary, contains arguments to be forwarded to the + HuggingFace config. If a callable, it is called to update the + HuggingFace config. **kwargs: Arguments for :class:`~vllm.EngineArgs`. (See :ref:`engine_args`) @@ -156,7 +158,7 @@ def __init__( max_seq_len_to_capture: int = 8192, disable_custom_all_reduce: bool = False, disable_async_output_proc: bool = False, - hf_overrides: Optional[dict] = None, + hf_overrides: Optional[HfOverrides] = None, mm_processor_kwargs: Optional[Dict[str, Any]] = None, # After positional args are removed, move this right below `model` task: TaskOption = "auto", diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 50701793b7b83..31fc098a8bb3f 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -41,7 +41,8 @@ from vllm.utils import is_list_of from .interfaces import SupportsMultiModal, SupportsPP -from .utils import AutoWeightsLoader, flatten_bn, merge_multimodal_embeddings +from .utils import (AutoWeightsLoader, flatten_bn, maybe_prefix, + merge_multimodal_embeddings) # Cannot find the following 2 numbers from hf config. _IMAGE_TOKEN_ID = 71011 @@ -245,7 +246,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): gather_output=True, ) self.language_model = PersimmonForCausalLM( - vllm_config.with_hf_config(config.text_config)) + vllm_config=vllm_config.with_hf_config(config.text_config), + prefix=maybe_prefix(prefix, "language_model"), + ) self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/internlm2_ve.py b/vllm/model_executor/models/internlm2_ve.py index 34889d691a934..f1b7c896cadfe 100644 --- a/vllm/model_executor/models/internlm2_ve.py +++ b/vllm/model_executor/models/internlm2_ve.py @@ -161,11 +161,5 @@ class InternLM2VEForCausalLM(InternLM2ForCausalLM): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__(vllm_config=vllm_config, prefix=prefix) - config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config - quant_config = vllm_config.quant_config - - self.model = InternLM2VEModel(config, - cache_config, - quant_config, + self.model = InternLM2VEModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 999739ccd98bf..fd8eda997f76f 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -382,11 +382,7 @@ class MiniCPMVBaseModel(nn.Module, SupportsMultiModal, SupportsPP): instantiated. 
""" - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config multimodal_config = vllm_config.model_config.multimodal_config quant_config = vllm_config.quant_config @@ -699,12 +695,8 @@ def is_default_weight_loading(self, name: str) -> bool: class MiniCPMV2_0(MiniCPMVBaseModel): - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ): - super().__init__(vllm_config) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) assert self.version == (2, 0) def init_llm( @@ -857,12 +849,8 @@ class MiniCPMV2_5(MiniCPMVBaseModel, SupportsLoRA): embedding_modules = {} embedding_padding_modules = [] - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ): - super().__init__(vllm_config) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) assert self.version == (2, 5) def init_llm( @@ -999,12 +987,8 @@ class MiniCPMV2_6(MiniCPMVBaseModel, SupportsLoRA): embedding_modules = {} embedding_padding_modules = [] - def __init__( - self, - vllm_config: VllmConfig, - prefix: str = "", - ): - super().__init__(vllm_config) + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) assert self.version == (2, 6) def init_llm( @@ -1117,7 +1101,7 @@ class MiniCPMV(MiniCPMVBaseModel, SupportsLoRA): embedding_modules = {} embedding_padding_modules = [] - def __new__(cls, vllm_config: VllmConfig, prefix: str = ""): + def __new__(cls, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config if not hasattr(config, "version"): if config.hidden_size == 2304 and config.query_num == 64: diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index 6aa43f22f4c93..4d7e82880041d 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -65,7 +65,7 @@ class MLPSpeculator(nn.Module): https://huggingface.co/ibm-fms and https://huggingface.co/ibm-granite """ - def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() config = vllm_config.model_config.hf_config self.n_predict = config.n_predict diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index f22d1b04ebf09..c0d503a1c5ba2 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -1,3 +1,7 @@ +""" +Whenever you add an architecture to this page, please also update +`tests/models/registry.py` with example HuggingFace models for it. 
+""" import importlib import os import pickle @@ -58,14 +62,14 @@ "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"), "MambaForCausalLM": ("mamba", "MambaForCausalLM"), "FalconMambaForCausalLM": ("mamba", "MambaForCausalLM"), + "MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"), + "MiniCPM3ForCausalLM": ("minicpm3", "MiniCPM3ForCausalLM"), "MistralForCausalLM": ("llama", "LlamaForCausalLM"), "MixtralForCausalLM": ("mixtral", "MixtralForCausalLM"), "QuantMixtralForCausalLM": ("mixtral_quant", "MixtralForCausalLM"), # transformers's mpt class has lower case "MptForCausalLM": ("mpt", "MPTForCausalLM"), "MPTForCausalLM": ("mpt", "MPTForCausalLM"), - "MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"), - "MiniCPM3ForCausalLM": ("minicpm3", "MiniCPM3ForCausalLM"), "NemotronForCausalLM": ("nemotron", "NemotronForCausalLM"), "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"), "OlmoeForCausalLM": ("olmoe", "OlmoeForCausalLM"), From bf2ddc6610094524a61e90441e579d502c7dee06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Luka=20Govedi=C4=8D?= Date: Thu, 14 Nov 2024 20:35:11 -0500 Subject: [PATCH 143/183] [bugfix] Fix static asymmetric quantization case (#10334) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Daniël de Kok Signed-off-by: luka Co-authored-by: Daniël de Kok --- tests/kernels/test_int8_quant.py | 19 ++++++------ tests/quantization/test_compressed_tensors.py | 30 +++++++++++++++++++ vllm/_custom_ops.py | 8 ++++- .../schemes/compressed_tensors_w8a8_int8.py | 11 ++++--- .../layers/quantization/utils/w8a8_utils.py | 5 +++- 5 files changed, 58 insertions(+), 15 deletions(-) diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/test_int8_quant.py index 12c578db0893c..761eb95c423fc 100644 --- a/tests/kernels/test_int8_quant.py +++ b/tests/kernels/test_int8_quant.py @@ -86,10 +86,7 @@ def test_dynamic_scaled_int8_azp_quant(num_tokens: int, hidden_size: int, assert torch_out.min() >= int8_traits.min and torch_out.max( ) <= int8_traits.max - ops_out = torch.empty_like(x, dtype=torch.int8) - scales_out = torch.empty_like(scales, dtype=torch.float32) - azp_out = torch.empty_like(azps, dtype=torch.int32) - torch.ops._C.dynamic_scaled_int8_quant(ops_out, x, scales_out, azp_out) + ops_out, scales_out, azp_out = scaled_int8_quant(x, symmetric=False) if (not torch.allclose(scales_out, scales)): print(torch.argmax(torch.abs(scales_out - scales))) @@ -119,7 +116,8 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int, out1 = (x / scale_arg).round().clamp(int8_traits.min, int8_traits.max).to(torch.int8) - out2, _, _ = scaled_int8_quant(x, scale_arg) + out2, scale2, _ = scaled_int8_quant(x, scale_arg) + assert scale2 is scale_arg # big atol to account for rounding errors torch.testing.assert_close(out1, out2, atol=1, rtol=0.0) @@ -145,11 +143,15 @@ def test_static_scaled_int8_azp_quant(num_tokens: int, hidden_size: int, out1 = ((x / scale).round() + azp).clamp(int8_traits.min, int8_traits.max).to(torch.int8) - out2 = torch.empty_like(x, dtype=torch.int8) scale_arg = torch.tensor([scale], dtype=torch.float32, device="cuda") azp_arg = torch.tensor([azp], dtype=torch.int32, device="cuda") - torch.ops._C.static_scaled_int8_quant(out2, x, scale_arg, azp_arg) + out2, scale2, azp2 = scaled_int8_quant(x, + scale_arg, + azp_arg, + symmetric=False) + assert scale2 is scale_arg + assert azp2 is azp_arg # big atol to account for rounding errors torch.testing.assert_close(out1, out2, atol=1, rtol=0.0) @@ -184,6 +186,5 @@ def 
test_static_scaled_int8_azp_quant_saturating_cast(is_max: bool) -> None: val_i8 = int8_traits.max if is_max else int8_traits.min expected = torch.full((1, 5), val_i8, dtype=torch.int8, device="cuda") - out = torch.empty_like(expected) - torch.ops._C.static_scaled_int8_quant(out, x, scale, azp) + out, _, _ = scaled_int8_quant(x, scale, azp, symmetric=False) torch.testing.assert_close(expected, out, atol=0, rtol=0) diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py index 03097569b2b3b..26add5bf6d90d 100644 --- a/tests/quantization/test_compressed_tensors.py +++ b/tests/quantization/test_compressed_tensors.py @@ -8,6 +8,7 @@ import torch from compressed_tensors.quantization import QuantizationType +from tests.models.utils import check_logprobs_close from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501 CompressedTensorsLinearMethod, CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8, @@ -74,6 +75,35 @@ def zp_valid(zp: Optional[torch.Tensor]): assert output +@pytest.mark.parametrize( + "model_path", + [ + "neuralmagic/Llama-3.2-1B-quantized.w8a8" + # TODO static & asymmetric + ]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [10]) +def test_compressed_tensors_w8a8_logprobs(hf_runner, vllm_runner, + example_prompts, model_path, + max_tokens, num_logprobs): + dtype = "bfloat16" + + with hf_runner(model_path, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy_logprobs_limit( + example_prompts, max_tokens, num_logprobs) + + with vllm_runner(model_path, dtype=dtype) as vllm_model: + vllm_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + def test_compressed_tensors_no_enforce_eager(vllm_runner): model_path = "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change" with vllm_runner(model_path) as llm: diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index 8f331a27a20de..b276b8fc25473 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -510,10 +510,16 @@ def cutlass_scaled_mm_azp(a: torch.Tensor, azp_adj: torch.Tensor, azp: Optional[torch.Tensor] = None, bias: Optional[torch.Tensor] = None) -> torch.Tensor: + """ + :param azp_adj: In the per-tensor case, this should include the azp. + Always per-channel. + :param azp: Only set in the per-token case. Per-token if set. + """ assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0) assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16) assert bias is None or bias.numel( ) == b.shape[1] and bias.dtype == out_dtype + assert azp is None or azp.numel() == a.shape[0] m = a.shape[0] n = b.shape[1] @@ -735,7 +741,7 @@ def scaled_int8_quant( azp is None), "azp must only be provided for asymmetric quantization." torch.ops._C.static_scaled_int8_quant(output, input, scale, azp) - return output, scale, None + return output, scale, azp # dynamic-per-token quantization. 
input_scales = torch.empty((input.numel() // input.shape[-1], 1), diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py index 15d9cdbcbb86b..6cbc58d61e970 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py @@ -82,9 +82,13 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # For more details, see csrc/quantization/cutlass_w8a8/Epilogues.md # https://github.com/vllm-project/vllm/blob/8d59dbb00044a588cab96bcdc028006ed922eb06/csrc/quantization/cutlass_w8a8/Epilogues.md if not self.input_symmetric: - layer.azp_adj = layer.weight.sum(dim=0, - keepdim=True, - dtype=torch.int32) + azp_adj = layer.weight.sum(dim=0, keepdim=True, dtype=torch.int32) + if self.is_static_input_scheme: + # cutlass_w8a8 requires azp to be folded into azp_adj + # in the per-tensor case + azp_adj = layer.input_zero_point * azp_adj + + layer.azp_adj = azp_adj else: layer.azp_adj = None @@ -138,7 +142,6 @@ def create_weights(self, layer: torch.nn.Module, def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor, bias: Optional[torch.Tensor]) -> torch.Tensor: - return apply_int8_linear(input=x, weight=layer.weight, weight_scale=layer.weight_scale, diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index ec73533126ab6..4037bcb963b25 100644 --- a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -211,13 +211,16 @@ def apply_int8_linear( symmetric=symmetric) if x_zp is not None: + # Currently, static is always per-tensor and dynamic is per-token + static = input_zero_point is not None + azp = None if static else x_zp return ops.cutlass_scaled_mm_azp(x_q, weight, scale_a=x_scale, scale_b=weight_scale, out_dtype=input.dtype, azp_adj=azp_adj, - azp=x_zp, + azp=azp, bias=bias) return ops.cutlass_scaled_mm(x_q, weight, From 2885ba0e24e536d0a5b2439be5e96aef504a2e7f Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Thu, 14 Nov 2024 21:44:26 -0500 Subject: [PATCH 144/183] [Misc] Change RedundantReshapesPass and FusionPass logging from info to debug (#10308) Signed-off-by: Tyler Michael Smith --- vllm/compilation/fusion.py | 4 ++-- vllm/compilation/reshapes.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/compilation/fusion.py b/vllm/compilation/fusion.py index 2a0cf0002c9dd..eb43604b1399b 100644 --- a/vllm/compilation/fusion.py +++ b/vllm/compilation/fusion.py @@ -281,11 +281,11 @@ def __call__(self, graph: torch.fx.Graph): self.dump_graph(graph, "before_fusion") count = self.patterns.apply(graph) - logger.info("Replaced %s patterns", count) + logger.debug("Replaced %s patterns", count) self.dump_graph(graph, "after_pattern_match") # Manually process multi-output matches (and run DCE) self.process_matches(graph) - logger.info("Post-processed %s matches", len(self.matches)) + logger.debug("Post-processed %s matches", len(self.matches)) self.dump_graph(graph, "after_fusion") self.matches.clear() diff --git a/vllm/compilation/reshapes.py b/vllm/compilation/reshapes.py index 0d284246d2576..36597e119d2e1 100644 --- a/vllm/compilation/reshapes.py +++ b/vllm/compilation/reshapes.py @@ -53,7 +53,7 @@ 
def __call__(self, graph: torch.fx.Graph): graph.erase_node(node) count += 1 - logger.info("Removed %s no-op reshapes", count) + logger.debug("Removed %s no-op reshapes", count) self.dump_graph(graph, "after_reshapes") From b40cf6402e356a10415e969e648a32911fb9b8ec Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 15 Nov 2024 12:23:09 +0800 Subject: [PATCH 145/183] [Model] Support Qwen2 embeddings and use tags to select model tests (#10184) --- .buildkite/run-cpu-test-ppc64le.sh | 6 +- .buildkite/run-cpu-test.sh | 6 +- .buildkite/test-pipeline.yaml | 48 ++++---- docs/source/models/supported_models.rst | 13 +- .../decoder_only/language/test_jamba.py | 18 +-- .../decoder_only/language/test_mamba.py | 18 +-- .../decoder_only/language/test_models.py | 71 ++++++----- .../embedding/language/test_cls_models.py | 30 ++--- .../embedding/language/test_embedding.py | 42 +++---- .../vision_language/test_llava_next.py | 2 + .../embedding/vision_language/test_phi3v.py | 2 + .../encoder_decoder/language/test_bart.py | 11 +- .../vision_language/test_mllama.py | 3 + tests/models/registry.py | 4 + tests/models/test_registry.py | 4 +- vllm/model_executor/models/qwen2.py | 112 ++++++++++++++++-- vllm/model_executor/models/qwen2_cls.py | 15 +-- vllm/model_executor/models/qwen2_rm.py | 16 +-- vllm/model_executor/models/registry.py | 9 +- 19 files changed, 252 insertions(+), 178 deletions(-) diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh index 79526adef2a79..5d7a0bff90963 100755 --- a/.buildkite/run-cpu-test-ppc64le.sh +++ b/.buildkite/run-cpu-test-ppc64le.sh @@ -27,9 +27,9 @@ function cpu_tests() { decord einops librosa peft Pillow sentence-transformers soundfile \ transformers_stream_generator matplotlib datamodel_code_generator pip install torchvision --index-url https://download.pytorch.org/whl/cpu - pytest -v -s tests/models/embedding/language - pytest -v -s tests/models/encoder_decoder/language - pytest -v -s tests/models/decoder_only/language/test_models.py + pytest -v -s tests/models/decoder_only/language -m cpu_model + pytest -v -s tests/models/embedding/language -m cpu_model + pytest -v -s tests/models/encoder_decoder/language -m cpu_model pytest -v -s tests/models/decoder_only/audio_language -m cpu_model pytest -v -s tests/models/decoder_only/vision_language -m cpu_model" diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index a00331abb7d03..14756b5964aaf 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -38,9 +38,9 @@ function cpu_tests() { decord einops librosa peft Pillow sentence-transformers soundfile \ transformers_stream_generator matplotlib datamodel_code_generator pip install torchvision --index-url https://download.pytorch.org/whl/cpu - pytest -v -s tests/models/embedding/language - pytest -v -s tests/models/encoder_decoder/language - pytest -v -s tests/models/decoder_only/language/test_models.py + pytest -v -s tests/models/decoder_only/language -m cpu_model + pytest -v -s tests/models/embedding/language -m cpu_model + pytest -v -s tests/models/encoder_decoder/language -m cpu_model pytest -v -s tests/models/decoder_only/audio_language -m cpu_model pytest -v -s tests/models/decoder_only/vision_language -m cpu_model" diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index baad54eaf6a91..24bf223fb12c0 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -323,62 +323,60 @@ steps: - pytest -v -s models/test_registry.py - pytest -v -s 
models/test_initialization.py -- label: Decoder-only Language Models Test (Standard) # 18min +- label: Language Models Test (Standard) # 42min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ - tests/models/decoder_only/language + - tests/models/embedding/language + - tests/models/encoder_decoder/language commands: - - pytest -v -s models/decoder_only/language -m core_model - - pytest -v -s models/decoder_only/language -m quant_model + - pytest -v -s models/decoder_only/language -m 'core_model or quant_model' + - pytest -v -s models/embedding/language -m core_model + - pytest -v -s models/embedding/vision_language -m core_model -- label: Decoder-only Language Models Test (Extended) # 46min +- label: Language Models Test (Extended) # 50min nightly: true source_file_dependencies: - vllm/ - tests/models/decoder_only/language + - tests/models/embedding/language + - tests/models/encoder_decoder/language commands: - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model' + - pytest -v -s models/embedding/language -m 'not core_model' + - pytest -v -s models/embedding/vision_language -m 'not core_model' -- label: Decoder-only Multi-Modal Models Test (Standard) # 22min +- label: Multi-Modal Models Test (Standard) # 26min #mirror_hardwares: [amd] source_file_dependencies: - vllm/ - tests/models/decoder_only/audio_language - tests/models/decoder_only/vision_language + - tests/models/embedding/vision_language + - tests/models/encoder_decoder/vision_language commands: - - pytest -v -s models/decoder_only/audio_language -m core_model - - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m core_model - # No tests under this group for now - # - pytest -v -s models/decoder_only/audio_language -m quant_model - - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m quant_model + - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model' + - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model' + - pytest -v -s models/encoder_decoder/language -m core_model + - pytest -v -s models/encoder_decoder/vision_language -m core_model -- label: Decoder-only Multi-Modal Models Test (Extended) # 1h10m +- label: Multi-Modal Models Test (Extended) # 1h15m nightly: true source_file_dependencies: - vllm/ - tests/models/decoder_only/audio_language - tests/models/decoder_only/vision_language + - tests/models/embedding/vision_language + - tests/models/encoder_decoder/vision_language commands: - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model' # HACK - run phi3v tests separately to sidestep this transformers bug # https://github.com/huggingface/transformers/issues/34307 - pytest -v -s models/decoder_only/vision_language/test_phi3v.py - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model' - -- label: Other Models Test # 20min - #mirror_hardwares: [amd] - source_file_dependencies: - - vllm/ - - tests/models/embedding/language - - tests/models/embedding/vision_language - - tests/models/encoder_decoder/language - - tests/models/encoder_decoder/vision_language - commands: - - pytest -v -s models/embedding/language - - pytest -v -s models/embedding/vision_language - - pytest -v -s models/encoder_decoder/language - - pytest -v -s 
models/encoder_decoder/vision_language + - pytest -v -s models/encoder_decoder/language -m 'not core_model' + - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model' # This test is used only in PR development phase to test individual models and should never run on main - label: Custom Models Test diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 161733c049bbe..a76bb775c6ee6 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -330,11 +330,16 @@ Text Embedding - :code:`BAAI/bge-multilingual-gemma2`, etc. - - ✅︎ - * - :code:`MistralModel` - - Mistral-based + * - :code:`LlamaModel`, :code:`LlamaForCausalLM`, :code:`MistralModel`, etc. + - Llama-based - :code:`intfloat/e5-mistral-7b-instruct`, etc. - ✅︎ - ✅︎ + * - :code:`Qwen2Model`, :code:`Qwen2ForCausalLM` + - Qwen2-based + - :code:`ssmits/Qwen2-7B-Instruct-embed-base`, :code:`Alibaba-NLP/gte-Qwen2-1.5B-instruct`, etc. + - ✅︎ + - ✅︎ .. important:: Some model architectures support both generation and embedding tasks. @@ -355,7 +360,7 @@ Reward Modeling * - :code:`Qwen2ForRewardModel` - Qwen2-based - :code:`Qwen/Qwen2.5-Math-RM-72B`, etc. - - + - ✅︎ - ✅︎ .. note:: @@ -376,7 +381,7 @@ Classification * - :code:`Qwen2ForSequenceClassification` - Qwen2-based - :code:`jason9693/Qwen2.5-1.5B-apeach`, etc. - - + - ✅︎ - ✅︎ .. note:: diff --git a/tests/models/decoder_only/language/test_jamba.py b/tests/models/decoder_only/language/test_jamba.py index 384ec77e5455a..6542689c3f277 100644 --- a/tests/models/decoder_only/language/test_jamba.py +++ b/tests/models/decoder_only/language/test_jamba.py @@ -33,6 +33,10 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. + model_runner.model) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] @@ -293,17 +297,3 @@ def test_jamba_distributed_produces_identical_generation( name_0="vllm_tp_1", name_1="vllm_tp_2", ) - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -def test_model_print( - vllm_runner, - model: str, - dtype: str, -) -> None: - with vllm_runner(model, dtype=dtype) as vllm_model: - # This test is for verifying whether the model's extra_repr - # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) diff --git a/tests/models/decoder_only/language/test_mamba.py b/tests/models/decoder_only/language/test_mamba.py index 2dc231c595ffa..78eab8d5354fd 100644 --- a/tests/models/decoder_only/language/test_mamba.py +++ b/tests/models/decoder_only/language/test_mamba.py @@ -51,6 +51,10 @@ def test_models( with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. 
+ model_runner.model) for i in range(len(example_prompts)): hf_output_ids, hf_output_str = hf_outputs[i] @@ -279,17 +283,3 @@ def test_state_cleanup( except ValueError: pytest.fail("Mamba inner state wasn't cleaned up between states, " "could be related to finished_requests_ids") - - -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -def test_model_print( - vllm_runner, - model: str, - dtype: str, -) -> None: - with vllm_runner(model, dtype=dtype) as vllm_model: - # This test is for verifying whether the model's extra_repr - # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index beb1ffb18436e..2a7ed8826d2f3 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -4,37 +4,52 @@ """ import pytest -from vllm.platforms import current_platform - from ...utils import check_logprobs_close -MODELS = [ - "facebook/opt-125m", # opt - "openai-community/gpt2", # gpt2 - # "Milos/slovak-gpt-j-405M", # gptj - # "bigcode/tiny_starcoder_py", # gpt_bigcode - # "EleutherAI/pythia-70m", # gpt_neox - "bigscience/bloom-560m", # bloom - testing alibi slopes - "microsoft/phi-2", # phi - # "stabilityai/stablelm-3b-4e1t", # stablelm - # "bigcode/starcoder2-3b", # starcoder2 - "google/gemma-1.1-2b-it", # gemma - "Qwen/Qwen2.5-0.5B-Instruct", # qwen2 - "meta-llama/Llama-3.2-1B-Instruct", # llama -] - -if not current_platform.is_cpu(): - MODELS += [ - # fused_moe which not supported on CPU - "openbmb/MiniCPM3-4B", - ] - -target_dtype = "half" - -@pytest.mark.core_model -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize( + "model", + [ + pytest.param( + "bigscience/bloom-560m", # bloom - testing alibi slopes + marks=[pytest.mark.core_model, pytest.mark.cpu_model], + ), + pytest.param( + "openai-community/gpt2", # gpt2 + marks=[pytest.mark.core_model, pytest.mark.cpu_model], + ), + pytest.param("Milos/slovak-gpt-j-405M"), # gptj + pytest.param("bigcode/tiny_starcoder_py"), # gpt_bigcode + pytest.param("EleutherAI/pythia-70m"), # gpt_neox + pytest.param( + "google/gemma-1.1-2b-it", # gemma + marks=[pytest.mark.core_model, pytest.mark.cpu_model], + ), + pytest.param( + "meta-llama/Llama-3.2-1B-Instruct", # llama + marks=[pytest.mark.core_model, pytest.mark.cpu_model], + ), + pytest.param( + "openbmb/MiniCPM3-4B", + # fused_moe not supported on CPU + marks=[pytest.mark.core_model], + ), + pytest.param( + "facebook/opt-125m", # opt + marks=[pytest.mark.core_model, pytest.mark.cpu_model], + ), + pytest.param( + "microsoft/phi-2", # phi + marks=[pytest.mark.core_model], + ), + pytest.param( + "Qwen/Qwen2.5-0.5B-Instruct", # qwen2 + marks=[pytest.mark.core_model], + ), + pytest.param("stabilityai/stablelm-3b-4e1t"), # stablelm + pytest.param("bigcode/starcoder2-3b"), # starcoder2 + ]) +@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [32]) @pytest.mark.parametrize("num_logprobs", [5]) def test_models( diff --git a/tests/models/embedding/language/test_cls_models.py b/tests/models/embedding/language/test_cls_models.py index 40ee49cf60742..6321503e7b248 100644 --- a/tests/models/embedding/language/test_cls_models.py +++ b/tests/models/embedding/language/test_cls_models.py @@ -9,10 +9,14 @@ import torch from transformers import 
AutoModelForSequenceClassification -CLASSIFICATION_MODELS = ["jason9693/Qwen2.5-1.5B-apeach"] - -@pytest.mark.parametrize("model", CLASSIFICATION_MODELS) +@pytest.mark.parametrize( + "model", + [ + pytest.param("jason9693/Qwen2.5-1.5B-apeach", + marks=[pytest.mark.core_model, pytest.mark.cpu_model]), + ], +) @pytest.mark.parametrize("dtype", ["float"]) def test_classification_models( hf_runner, @@ -23,31 +27,19 @@ def test_classification_models( ) -> None: with vllm_runner(model, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.classify(example_prompts) + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. + model_runner.model) with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSequenceClassification) as hf_model: hf_outputs = hf_model.classify(example_prompts) - print(hf_outputs, vllm_outputs) - # check logits difference for hf_output, vllm_output in zip(hf_outputs, vllm_outputs): hf_output = torch.tensor(hf_output) vllm_output = torch.tensor(vllm_output) assert torch.allclose(hf_output, vllm_output, 1e-3) - - -@pytest.mark.parametrize("model", CLASSIFICATION_MODELS) -@pytest.mark.parametrize("dtype", ["float"]) -def test_classification_model_print( - vllm_runner, - model: str, - dtype: str, -) -> None: - with vllm_runner(model, dtype=dtype) as vllm_model: - # This test is for verifying whether the model's extra_repr - # can be printed correctly. - print(vllm_model.model.llm_engine.model_executor.driver_worker. - model_runner.model) diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py index fcdd684168d04..c3f351ef707be 100644 --- a/tests/models/embedding/language/test_embedding.py +++ b/tests/models/embedding/language/test_embedding.py @@ -4,25 +4,25 @@ """ import pytest -from vllm.utils import current_platform - from ..utils import check_embeddings_close -# Model, Guard -MODELS = [ - "intfloat/e5-mistral-7b-instruct", - "BAAI/bge-base-en-v1.5", - "BAAI/bge-multilingual-gemma2", - "intfloat/multilingual-e5-large", -] - -ENCODER_ONLY = [ - "BAAI/bge-base-en-v1.5", - "intfloat/multilingual-e5-large", -] - -@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize( + "model", + [ + # [Encoder-only] + pytest.param("BAAI/bge-base-en-v1.5", + marks=[pytest.mark.core_model, pytest.mark.cpu_model]), + pytest.param("intfloat/multilingual-e5-large"), + # [Encoder-decoder] + pytest.param("intfloat/e5-mistral-7b-instruct", + marks=[pytest.mark.core_model, pytest.mark.cpu_model]), + pytest.param("BAAI/bge-multilingual-gemma2", + marks=[pytest.mark.core_model]), + pytest.param("ssmits/Qwen2-7B-Instruct-embed-base"), + pytest.param("Alibaba-NLP/gte-Qwen2-1.5B-instruct"), + ], +) @pytest.mark.parametrize("dtype", ["half"]) def test_models( hf_runner, @@ -31,9 +31,6 @@ def test_models( model, dtype: str, ) -> None: - if model not in ENCODER_ONLY and current_platform.is_cpu(): - pytest.skip("Skip large embedding models test on CPU.") - # The example_prompts has ending "\n", for example: # "Write a short story about a robot that dreams for the first time.\n" # sentence_transformers will strip the input texts, see: @@ -46,8 +43,13 @@ def test_models( is_sentence_transformer=True) as hf_model: hf_outputs = hf_model.encode(example_prompts) - with vllm_runner(model, dtype=dtype, max_model_len=None) as vllm_model: + with vllm_runner(model, task="embedding", dtype=dtype, + max_model_len=None) as vllm_model: vllm_outputs = 
vllm_model.encode(example_prompts) + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + print(vllm_model.model.llm_engine.model_executor.driver_worker. + model_runner.model) check_embeddings_close( embeddings_0_lst=hf_outputs, diff --git a/tests/models/embedding/vision_language/test_llava_next.py b/tests/models/embedding/vision_language/test_llava_next.py index 9fab5898a06ba..329c6ba279f89 100644 --- a/tests/models/embedding/vision_language/test_llava_next.py +++ b/tests/models/embedding/vision_language/test_llava_next.py @@ -88,6 +88,7 @@ def _run_test( @pytest.mark.skipif(transformers.__version__.startswith("4.46"), reason="Model broken with changes in transformers 4.46") +@pytest.mark.core_model @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_models_text( @@ -112,6 +113,7 @@ def test_models_text( @large_gpu_test(min_gb=48) +@pytest.mark.core_model @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_models_image( diff --git a/tests/models/embedding/vision_language/test_phi3v.py b/tests/models/embedding/vision_language/test_phi3v.py index ee411472ba284..6145aff1a5ea2 100644 --- a/tests/models/embedding/vision_language/test_phi3v.py +++ b/tests/models/embedding/vision_language/test_phi3v.py @@ -74,6 +74,7 @@ def _run_test( ) +@pytest.mark.core_model @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_models_text( @@ -98,6 +99,7 @@ def test_models_text( @large_gpu_test(min_gb=48) +@pytest.mark.core_model @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half"]) def test_models_image( diff --git a/tests/models/encoder_decoder/language/test_bart.py b/tests/models/encoder_decoder/language/test_bart.py index 8e8862fadbf04..10aba8427944f 100644 --- a/tests/models/encoder_decoder/language/test_bart.py +++ b/tests/models/encoder_decoder/language/test_bart.py @@ -14,8 +14,6 @@ from ....utils import multi_gpu_test from ...utils import check_logprobs_close -MODELS = ["facebook/bart-base", "facebook/bart-large-cnn"] - def vllm_to_hf_output( vllm_output: Tuple[List[int], str, Optional[SampleLogprobs]], @@ -170,7 +168,14 @@ def run_test( ) -@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize( + "model", + [ + pytest.param("facebook/bart-base", + marks=[pytest.mark.core_model, pytest.mark.cpu_model]), + pytest.param("facebook/bart-large-cnn"), + ], +) @pytest.mark.parametrize("dtype", ["float", "bfloat16"]) @pytest.mark.parametrize("max_tokens", [64]) @pytest.mark.parametrize("num_logprobs", [5]) diff --git a/tests/models/encoder_decoder/vision_language/test_mllama.py b/tests/models/encoder_decoder/vision_language/test_mllama.py index a3b1c0950d9a2..77dd1d81f84d7 100644 --- a/tests/models/encoder_decoder/vision_language/test_mllama.py +++ b/tests/models/encoder_decoder/vision_language/test_mllama.py @@ -233,6 +233,7 @@ def clear_cache(): @large_gpu_test(min_gb=48) +@pytest.mark.core_model @pytest.mark.parametrize("model", models) @pytest.mark.parametrize( "sizes", @@ -278,6 +279,7 @@ def test_models_single_leading_image(hf_runner, vllm_runner, image_assets, @large_gpu_test(min_gb=48) +@pytest.mark.core_model @pytest.mark.parametrize("model", models) @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [128]) @@ -326,6 +328,7 @@ def test_models_multi_leading_images(hf_runner, vllm_runner, image_assets, @large_gpu_test(min_gb=48) +@pytest.mark.core_model 
@pytest.mark.parametrize("model", models) @pytest.mark.parametrize("dtype", ["bfloat16"]) @pytest.mark.parametrize("max_tokens", [128]) diff --git a/tests/models/registry.py b/tests/models/registry.py index ec9ff52d112df..3848367b6126c 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -129,9 +129,13 @@ class _HfExamplesInfo: # [Text-only] "BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5"), "Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2"), + "LlamaModel": _HfExamplesInfo("llama", is_available_online=False), "MistralModel": _HfExamplesInfo("intfloat/e5-mistral-7b-instruct"), + "Qwen2Model": _HfExamplesInfo("ssmits/Qwen2-7B-Instruct-embed-base"), "Qwen2ForRewardModel": _HfExamplesInfo("Qwen/Qwen2.5-Math-RM-72B"), "Qwen2ForSequenceClassification": _HfExamplesInfo("jason9693/Qwen2.5-1.5B-apeach"), # noqa: E501 + "RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2"), # noqa: E501 + "XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-large"), # [Multimodal] "LlavaNextForConditionalGeneration": _HfExamplesInfo("royokong/e5-v"), "Phi3VForCausalLM": _HfExamplesInfo("TIGER-Lab/VLM2Vec-Full", diff --git a/tests/models/test_registry.py b/tests/models/test_registry.py index dbc415796ee55..e462dae3dc688 100644 --- a/tests/models/test_registry.py +++ b/tests/models/test_registry.py @@ -77,8 +77,8 @@ def test_registry_is_pp(model_arch, is_pp, init_cuda): def test_hf_registry_coverage(): - untested_archs = (HF_EXAMPLE_MODELS.get_supported_archs() - - set(ModelRegistry.get_supported_archs())) + untested_archs = (ModelRegistry.get_supported_archs() - + HF_EXAMPLE_MODELS.get_supported_archs()) assert not untested_archs, ( "Please add the following architectures to " diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index b623c576bb673..431e397e1e10d 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -37,6 +37,7 @@ QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.pooler import Pooler, PoolingType from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import SamplerOutput, get_sampler @@ -44,8 +45,9 @@ ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) +from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors +from vllm.sequence import IntermediateTensors, PoolerOutput from .interfaces import SupportsLoRA, SupportsPP from .utils import (AutoWeightsLoader, PPMissingLayer, is_pp_missing_parameter, @@ -247,6 +249,18 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config + # TODO (@robertgshaw2): see if this can be moved out + if (cache_config.sliding_window is not None + and hasattr(config, "max_window_layers")): + raise ValueError("Sliding window for some but all layers is not " + "supported. This model uses sliding window " + "but `max_window_layers` = {} is less than " + "`num_hidden_layers` = {}. 
Please open an issue " + "to discuss this feature.".format( + config.max_window_layers, + config.num_hidden_layers, + )) + self.config = config self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size @@ -405,20 +419,9 @@ class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config - # TODO (@robertgshaw2): see if this can be moved out - if (cache_config.sliding_window is not None - and hasattr(config, "max_window_layers")): - raise ValueError("Sliding window for some but all layers is not " - "supported. This model uses sliding window " - "but `max_window_layers` = {} is less than " - "`num_hidden_layers` = {}. Please open an issue " - "to discuss this feature.".format( - config.max_window_layers, - config.num_hidden_layers, - )) + pooler_config = vllm_config.model_config.pooler_config self.config = config self.lora_config = lora_config @@ -438,6 +441,15 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = get_sampler() + + # The same model class supports both language generation and embedding + # because the architecture name is the same + self._pooler = Pooler.from_config_with_defaults( + pooler_config, + pooling_type=PoolingType.LAST, + normalize=True, + softmax=False) + self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) @@ -475,6 +487,13 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens + def pooler( + self, + hidden_states: torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> Optional[PoolerOutput]: + return self._pooler(hidden_states, pooling_metadata) + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): loader = AutoWeightsLoader( self, @@ -482,3 +501,70 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if self.config.tie_word_embeddings else None), ) loader.load_weights(weights) + + +class Qwen2EmbeddingModel(nn.Module, SupportsLoRA, SupportsPP): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": [ + "gate_proj", + "up_proj", + ], + } + + # LoRA specific attributes + supported_lora_modules = [ + "qkv_proj", + "o_proj", + "gate_up_proj", + "down_proj", + ] + embedding_modules = {} + embedding_padding_modules = [] + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + lora_config = vllm_config.lora_config + pooler_config = vllm_config.model_config.pooler_config + + self.config = config + self.lora_config = lora_config + + self.quant_config = quant_config + self.model = Qwen2Model(vllm_config=vllm_config, + prefix=maybe_prefix(prefix, "model")) + + self._pooler = Pooler.from_config_with_defaults( + pooler_config, + pooling_type=PoolingType.MEAN, + normalize=True, + softmax=False) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + ) -> torch.Tensor: + return self.model(input_ids, positions, kv_caches, attn_metadata, + intermediate_tensors) + + def pooler( + self, + hidden_states: 
torch.Tensor, + pooling_metadata: PoolingMetadata, + ) -> Optional[PoolerOutput]: + return self._pooler(hidden_states, pooling_metadata) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + loader = AutoWeightsLoader(self, + ignore_unexpected_prefixes=["lm_head."]) + loader.load_weights(weights) diff --git a/vllm/model_executor/models/qwen2_cls.py b/vllm/model_executor/models/qwen2_cls.py index 27eb7e8a93975..120403e948686 100644 --- a/vllm/model_executor/models/qwen2_cls.py +++ b/vllm/model_executor/models/qwen2_cls.py @@ -17,10 +17,11 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors, PoolerOutput +from .interfaces import SupportsLoRA, SupportsPP from .utils import AutoWeightsLoader, maybe_prefix -class Qwen2ForSequenceClassification(nn.Module): +class Qwen2ForSequenceClassification(nn.Module, SupportsLoRA, SupportsPP): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -46,21 +47,9 @@ class Qwen2ForSequenceClassification(nn.Module): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config pooler_config = vllm_config.model_config.pooler_config - # TODO (@robertgshaw2): see if this can be moved out - if (cache_config.sliding_window is not None - and hasattr(config, "max_window_layers")): - raise ValueError("Sliding window for some but all layers is not " - "supported. This model uses sliding window " - "but `max_window_layers` = {} is less than " - "`num_hidden_layers` = {}. Please open an issue " - "to discuss this feature.".format( - config.max_window_layers, - config.num_hidden_layers, - )) self.config = config self.lora_config = lora_config diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 89768ec9dff37..55843d8325348 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -16,7 +16,7 @@ from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.sequence import IntermediateTensors, PoolerOutput -from .interfaces import SupportsPP +from .interfaces import SupportsLoRA, SupportsPP from .qwen2 import Qwen2Model from .utils import AutoWeightsLoader, maybe_prefix @@ -32,7 +32,7 @@ def forward(self, input): return self.activation(input) -class Qwen2ForRewardModel(nn.Module, SupportsPP): +class Qwen2ForRewardModel(nn.Module, SupportsLoRA, SupportsPP): packed_modules_mapping = { "qkv_proj": [ "q_proj", @@ -58,21 +58,9 @@ class Qwen2ForRewardModel(nn.Module, SupportsPP): def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() config = vllm_config.model_config.hf_config - cache_config = vllm_config.cache_config quant_config = vllm_config.quant_config lora_config = vllm_config.lora_config pooler_config = vllm_config.model_config.pooler_config - # TODO (@robertgshaw2): see if this can be moved out - if (cache_config.sliding_window is not None - and hasattr(config, "max_window_layers")): - raise ValueError("Sliding window for some but all layers is not " - "supported. This model uses sliding window " - "but `max_window_layers` = {} is less than " - "`num_hidden_layers` = {}. 
Please open an issue " - "to discuss this feature.".format( - config.max_window_layers, - config.num_hidden_layers, - )) self.config = config self.lora_config = lora_config diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index c0d503a1c5ba2..22c2e328bfb65 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -11,7 +11,8 @@ from abc import ABC, abstractmethod from dataclasses import dataclass, field from functools import lru_cache -from typing import Callable, Dict, List, Optional, Tuple, Type, TypeVar, Union +from typing import (AbstractSet, Callable, Dict, List, Optional, Tuple, Type, + TypeVar, Union) import cloudpickle import torch.nn as nn @@ -110,6 +111,8 @@ }, "MistralModel": ("llama", "LlamaEmbeddingModel"), "Phi3ForCausalLM": ("phi3", "Phi3ForCausalLM"), + "Qwen2Model": ("qwen2", "Qwen2EmbeddingModel"), + "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"), "Qwen2ForSequenceClassification": ("qwen2_cls", "Qwen2ForSequenceClassification"), # noqa: E501 # [Multimodal] @@ -301,8 +304,8 @@ class _ModelRegistry: # Keyed by model_arch models: Dict[str, _BaseRegisteredModel] = field(default_factory=dict) - def get_supported_archs(self) -> List[str]: - return list(self.models.keys()) + def get_supported_archs(self) -> AbstractSet[str]: + return self.models.keys() def register_model( self, From 2ec88272881a49d40d91ae0cd858b19d22996c70 Mon Sep 17 00:00:00 2001 From: Sky Lee <46676799+skylee-01@users.noreply.github.com> Date: Fri, 15 Nov 2024 13:40:10 +0800 Subject: [PATCH 146/183] [Bugfix] Qwen-vl output is inconsistent in speculative decoding (#10350) --- vllm/spec_decode/batch_expansion.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py index 6a7929d9d8f9c..25ef27b8378f0 100644 --- a/vllm/spec_decode/batch_expansion.py +++ b/vllm/spec_decode/batch_expansion.py @@ -353,6 +353,7 @@ def _create_single_target_seq_group_metadata( seq_data = seq_group_metadata.seq_data[seq_id] prompt_token_ids = seq_data.prompt_token_ids_array new_output_token_ids = [*seq_data.get_output_token_ids(), *token_ids] + mrope_position_delta = seq_data.mrope_position_delta new_seq_data_dict = { target_seq_id: @@ -368,6 +369,7 @@ def _create_single_target_seq_group_metadata( # the kv cache is filled by a previous batch in the batch expansion. 
for data in new_seq_data_dict.values(): data.update_num_computed_tokens(data.get_len() - 1) + data.mrope_position_delta = mrope_position_delta return SequenceGroupMetadata( request_id=seq_group_metadata.request_id, From 2ac6d0e75bc846998da56b50bf4f8853cb36d484 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 15 Nov 2024 14:59:00 +0800 Subject: [PATCH 147/183] [Misc] Consolidate pooler config overrides (#10351) Signed-off-by: DarkLight1337 --- docs/source/models/supported_models.rst | 10 ++- tests/engine/test_arg_utils.py | 9 +- tests/test_config.py | 50 +++++------ vllm/config.py | 112 ++++++++++++------------ vllm/engine/arg_utils.py | 85 ++++-------------- vllm/entrypoints/llm.py | 15 +--- vllm/model_executor/layers/pooler.py | 54 +++++++----- 7 files changed, 143 insertions(+), 192 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index a76bb775c6ee6..96a513d42753b 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -345,6 +345,9 @@ Text Embedding Some model architectures support both generation and embedding tasks. In this case, you have to pass :code:`--task embedding` to run the model in embedding mode. +.. tip:: + You can override the model's pooling method by passing :code:`--override-pooler-config`. + Reward Modeling --------------- @@ -364,7 +367,7 @@ Reward Modeling - ✅︎ .. note:: - As an interim measure, these models are supported via Embeddings API. See `this RFC `_ for upcoming changes. + As an interim measure, these models are supported in both offline and online inference via Embeddings API. Classification --------------- @@ -385,7 +388,7 @@ Classification - ✅︎ .. note:: - As an interim measure, these models are supported via Embeddings API. It will be supported via Classification API in the future (no reference APIs exist now). + As an interim measure, these models are supported in both offline and online inference via Embeddings API. Multimodal Language Models @@ -600,6 +603,9 @@ Multimodal Embedding Some model architectures support both generation and embedding tasks. In this case, you have to pass :code:`--task embedding` to run the model in embedding mode. +.. tip:: + You can override the model's pooling method by passing :code:`--override-pooler-config`. 
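(For context, a minimal offline-API sketch of the same pooling override; the checkpoint name is illustrative, and it assumes ``override_pooler_config`` is forwarded from ``LLM(**kwargs)`` to ``EngineArgs`` like the other engine arguments:)

.. code-block:: python

    from vllm import LLM
    from vllm.config import PoolerConfig

    # Roughly equivalent to the CLI form used in tests/engine/test_arg_utils.py:
    #   --task embedding --override-pooler-config '{"pooling_type": "MEAN"}'
    llm = LLM(
        model="intfloat/e5-mistral-7b-instruct",  # illustrative embedding model
        task="embedding",
        override_pooler_config=PoolerConfig(pooling_type="MEAN", normalize=True),
    )

    # Each result carries the pooled embedding for one prompt.
    outputs = llm.encode(["vLLM pooling override example"])
    print(len(outputs[0].outputs.embedding))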
+ Model Support Policy ===================== diff --git a/tests/engine/test_arg_utils.py b/tests/engine/test_arg_utils.py index e92e2588d01cb..7b1be5a9802fd 100644 --- a/tests/engine/test_arg_utils.py +++ b/tests/engine/test_arg_utils.py @@ -2,6 +2,7 @@ import pytest +from vllm.config import PoolerConfig from vllm.engine.arg_utils import EngineArgs, nullable_kvs from vllm.utils import FlexibleArgumentParser @@ -32,9 +33,13 @@ def test_limit_mm_per_prompt_parser(arg, expected): def test_valid_pooling_config(): parser = EngineArgs.add_cli_args(FlexibleArgumentParser()) - args = parser.parse_args(["--pooling-type=MEAN"]) + args = parser.parse_args([ + '--override-pooler-config', + '{"pooling_type": "MEAN"}', + ]) engine_args = EngineArgs.from_cli_args(args=args) - assert engine_args.pooling_type == 'MEAN' + assert engine_args.override_pooler_config == PoolerConfig( + pooling_type="MEAN", ) @pytest.mark.parametrize( diff --git a/tests/test_config.py b/tests/test_config.py index df382d22d83ec..3cf90297ce177 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,6 +1,8 @@ +from dataclasses import asdict + import pytest -from vllm.config import ModelConfig +from vllm.config import ModelConfig, PoolerConfig from vllm.model_executor.layers.pooler import PoolingType from vllm.platforms import current_platform @@ -108,7 +110,7 @@ def test_get_sliding_window(): reason="Xformers backend is not supported on ROCm.") def test_get_pooling_config(): model_id = "sentence-transformers/all-MiniLM-L12-v2" - minilm_model_config = ModelConfig( + model_config = ModelConfig( model_id, task="auto", tokenizer=model_id, @@ -119,39 +121,31 @@ def test_get_pooling_config(): revision=None, ) - minilm_pooling_config = minilm_model_config._init_pooler_config( - pooling_type=None, - pooling_norm=None, - pooling_returned_token_ids=None, - pooling_softmax=None, - pooling_step_tag_id=None) + pooling_config = model_config._init_pooler_config(None) + assert pooling_config is not None - assert minilm_pooling_config.pooling_norm - assert minilm_pooling_config.pooling_type == PoolingType.MEAN.name + assert pooling_config.normalize + assert pooling_config.pooling_type == PoolingType.MEAN.name @pytest.mark.skipif(current_platform.is_rocm(), reason="Xformers backend is not supported on ROCm.") def test_get_pooling_config_from_args(): model_id = "sentence-transformers/all-MiniLM-L12-v2" - minilm_model_config = ModelConfig(model_id, - task="auto", - tokenizer=model_id, - tokenizer_mode="auto", - trust_remote_code=False, - seed=0, - dtype="float16", - revision=None) - - minilm_pooling_config = minilm_model_config._init_pooler_config( - pooling_type='CLS', - pooling_norm=True, - pooling_returned_token_ids=None, - pooling_softmax=None, - pooling_step_tag_id=None) - - assert minilm_pooling_config.pooling_norm - assert minilm_pooling_config.pooling_type == PoolingType.CLS.name + model_config = ModelConfig(model_id, + task="auto", + tokenizer=model_id, + tokenizer_mode="auto", + trust_remote_code=False, + seed=0, + dtype="float16", + revision=None) + + override_config = PoolerConfig(pooling_type='CLS', normalize=True) + + pooling_config = model_config._init_pooler_config(override_config) + assert pooling_config is not None + assert asdict(pooling_config) == asdict(override_config) @pytest.mark.skipif(current_platform.is_rocm(), diff --git a/vllm/config.py b/vllm/config.py index 83b1483eb99e0..1c190da1d327e 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -112,10 +112,6 @@ class ModelConfig: the model name will be the same as 
`model`. limit_mm_per_prompt: Maximum number of data items per modality per prompt. Only applicable for multimodal models. - override_neuron_config: Initialize non default neuron config or - override default neuron config that are specific to Neuron devices, - this argument will be used to configure the neuron config that - can not be gathered from the vllm arguments. config_format: The config format which shall be loaded. Defaults to 'auto' which defaults to 'hf'. hf_overrides: If a dictionary, contains arguments to be forwarded to the @@ -123,20 +119,12 @@ class ModelConfig: HuggingFace config. mm_processor_kwargs: Arguments to be forwarded to the model's processor for multi-modal data, e.g., image processor. - pooling_type: Used to configure the pooling method in the embedding - model. - pooling_norm: Used to determine whether to normalize the pooled - data in the embedding model. - pooling_softmax: Used to determine whether to softmax the pooled - data in the embedding model. - pooling_step_tag_id: When pooling_step_tag_id is not -1, it indicates - that the score corresponding to the pooling_step_tag_id in the - generated sentence should be returned. Otherwise, it returns - the scores for all tokens. - pooling_returned_token_ids: pooling_returned_token_ids represents a - list of indices for the vocabulary dimensions to be extracted, - such as the token IDs of good_token and bad_token in the - math-shepherd-mistral-7b-prm model. + override_neuron_config: Initialize non default neuron config or + override default neuron config that are specific to Neuron devices, + this argument will be used to configure the neuron config that + can not be gathered from the vllm arguments. + override_pooling_config: Initialize non default pooling config or + override default pooling config for the embedding model. 
""" def __init__( @@ -166,16 +154,12 @@ def __init__( served_model_name: Optional[Union[str, List[str]]] = None, limit_mm_per_prompt: Optional[Mapping[str, int]] = None, use_async_output_proc: bool = True, - override_neuron_config: Optional[Dict[str, Any]] = None, config_format: ConfigFormat = ConfigFormat.AUTO, chat_template_text_format: str = "string", hf_overrides: Optional[HfOverrides] = None, mm_processor_kwargs: Optional[Dict[str, Any]] = None, - pooling_type: Optional[str] = None, - pooling_norm: Optional[bool] = None, - pooling_softmax: Optional[bool] = None, - pooling_step_tag_id: Optional[int] = None, - pooling_returned_token_ids: Optional[List[int]] = None) -> None: + override_neuron_config: Optional[Dict[str, Any]] = None, + override_pooler_config: Optional["PoolerConfig"] = None) -> None: self.model = model self.tokenizer = tokenizer self.tokenizer_mode = tokenizer_mode @@ -280,13 +264,7 @@ def __init__( supported_tasks, task = self._resolve_task(task, self.hf_config) self.supported_tasks = supported_tasks self.task: Final = task - self.pooler_config = self._init_pooler_config( - pooling_type, - pooling_norm, - pooling_softmax, - pooling_step_tag_id, - pooling_returned_token_ids, - ) + self.pooler_config = self._init_pooler_config(override_pooler_config) self._verify_quantization() self._verify_cuda_graph() @@ -311,27 +289,21 @@ def _get_encoder_config(self): def _init_pooler_config( self, - pooling_type: Optional[str] = None, - pooling_norm: Optional[bool] = None, - pooling_softmax: Optional[bool] = None, - pooling_step_tag_id: Optional[int] = None, - pooling_returned_token_ids: Optional[List[int]] = None + override_pooler_config: Optional["PoolerConfig"], ) -> Optional["PoolerConfig"]: + if self.task == "embedding": - pooling_config = get_pooling_config(self.model, self.revision) - if pooling_config is not None: - # override if user does not - # specifies pooling_type and/or pooling_norm - if pooling_type is None: - pooling_type = pooling_config["pooling_type"] - if pooling_norm is None: - pooling_norm = pooling_config["normalize"] - return PoolerConfig( - pooling_type=pooling_type, - pooling_norm=pooling_norm, - pooling_softmax=pooling_softmax, - pooling_step_tag_id=pooling_step_tag_id, - pooling_returned_token_ids=pooling_returned_token_ids) + user_config = override_pooler_config or PoolerConfig() + + base_config = get_pooling_config(self.model, self.revision) + if base_config is not None: + # Only set values that are not overridden by the user + for k, v in base_config.items(): + if getattr(user_config, k) is None: + setattr(user_config, k, v) + + return user_config + return None def _init_attention_free(self) -> bool: @@ -1786,13 +1758,43 @@ class MultiModalConfig: @dataclass class PoolerConfig: - """Controls the behavior of pooler in embedding model""" + """Controls the behavior of output pooling in embedding models.""" pooling_type: Optional[str] = None - pooling_norm: Optional[bool] = None - pooling_softmax: Optional[bool] = None - pooling_step_tag_id: Optional[int] = None - pooling_returned_token_ids: Optional[List[int]] = None + """ + The pooling method of the embedding model. This should be a key in + :class:`vllm.model_executor.layers.pooler.PoolingType`. + """ + + normalize: Optional[bool] = None + """ + Whether to normalize the pooled outputs. Usually, this should be set to + ``True`` for embedding outputs. + """ + + softmax: Optional[bool] = None + """ + Whether to apply softmax to the pooled outputs. 
Usually, this should be set + to ``True`` for classification outputs. + """ + + step_tag_id: Optional[int] = None + """ + If set, only the score corresponding to the ``step_tag_id`` in the + generated sentence should be returned. Otherwise, the scores for all tokens + are returned. + """ + + returned_token_ids: Optional[List[int]] = None + """ + A list of indices for the vocabulary dimensions to be extracted, + such as the token IDs of ``good_token`` and ``bad_token`` in the + ``math-shepherd-mistral-7b-prm`` model. + """ + + @staticmethod + def from_json(json_str: str) -> "PoolerConfig": + return PoolerConfig(**json.loads(json_str)) _STR_DTYPE_TO_TORCH_DTYPE = { diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 244aa09e12552..4afc61c8d0c4c 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -11,12 +11,11 @@ from vllm.config import (CacheConfig, ConfigFormat, DecodingConfig, DeviceConfig, HfOverrides, LoadConfig, LoadFormat, LoRAConfig, ModelConfig, ObservabilityConfig, - ParallelConfig, PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig, TaskOption, TokenizerPoolConfig, - VllmConfig) + ParallelConfig, PoolerConfig, PromptAdapterConfig, + SchedulerConfig, SpeculativeConfig, TaskOption, + TokenizerPoolConfig, VllmConfig) from vllm.executor.executor_base import ExecutorBase from vllm.logger import init_logger -from vllm.model_executor.layers.pooler import PoolingType from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.platforms import current_platform from vllm.transformers_utils.utils import check_gguf_file @@ -187,15 +186,10 @@ class EngineArgs: otlp_traces_endpoint: Optional[str] = None collect_detailed_traces: Optional[str] = None disable_async_output_proc: bool = False - override_neuron_config: Optional[Dict[str, Any]] = None scheduling_policy: Literal["fcfs", "priority"] = "fcfs" - # Pooling configuration. - pooling_type: Optional[str] = None - pooling_norm: Optional[bool] = None - pooling_softmax: Optional[bool] = None - pooling_step_tag_id: Optional[int] = None - pooling_returned_token_ids: Optional[List[int]] = None + override_neuron_config: Optional[Dict[str, Any]] = None + override_pooler_config: Optional[PoolerConfig] = None def __post_init__(self): if not self.tokenizer: @@ -859,12 +853,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: default=EngineArgs.disable_async_output_proc, help="Disable async output processing. This may result in " "lower performance.") - parser.add_argument( - '--override-neuron-config', - type=json.loads, - default=None, - help="Override or set neuron device configuration. " - "e.g. {\"cast_logits_dtype\": \"bloat16\"}.'") parser.add_argument( '--scheduling-policy', @@ -877,56 +865,17 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'arrival deciding any ties).') parser.add_argument( - '--pooling-type', - choices=[pt.name for pt in PoolingType], - default=None, - help='Used to configure the pooling method in the embedding model.' 
- ) - - parser.add_argument('--pooling-norm', - default=None, - action='store_true', - help="Used to determine whether to normalize " - "the pooled data in the embedding model.") - - parser.add_argument('--no-pooling-norm', - default=None, - action='store_false', - dest='pooling_norm', - help="Used to determine whether to normalize " - "the pooled data in the embedding model.") - - parser.add_argument('--pooling-softmax', - default=None, - action='store_true', - help="Used to determine whether to softmax " - "the pooled data in the embedding model.") - - parser.add_argument('--no-pooling-softmax', - default=None, - action='store_false', - dest='pooling_softmax', - help="Used to determine whether to softmax " - "the pooled data in the embedding model.") - - parser.add_argument( - '--pooling-step-tag-id', - type=int, + '--override-neuron-config', + type=json.loads, default=None, - help="When pooling-step-tag-id is not -1, it indicates " - "that the score corresponding to the step-tag-ids in the " - "generated sentence should be returned. Otherwise, it " - "returns the scores for all tokens.") - + help="Override or set neuron device configuration. " + "e.g. {\"cast_logits_dtype\": \"bloat16\"}.'") parser.add_argument( - '--pooling-returned-token-ids', - nargs='+', - type=int, + '--override-pooler-config', + type=PoolerConfig.from_json, default=None, - help="pooling-returned-token-ids represents a list of " - "indices for the vocabulary dimensions to be extracted, " - "such as the token IDs of good_token and bad_token in " - "the math-shepherd-mistral-7b-prm model.") + help="Override or set the pooling method in the embedding model. " + "e.g. {\"pooling_type\": \"mean\", \"normalize\": false}.'") return parser @@ -967,14 +916,10 @@ def create_model_config(self) -> ModelConfig: served_model_name=self.served_model_name, limit_mm_per_prompt=self.limit_mm_per_prompt, use_async_output_proc=not self.disable_async_output_proc, - override_neuron_config=self.override_neuron_config, config_format=self.config_format, mm_processor_kwargs=self.mm_processor_kwargs, - pooling_type=self.pooling_type, - pooling_norm=self.pooling_norm, - pooling_softmax=self.pooling_softmax, - pooling_step_tag_id=self.pooling_step_tag_id, - pooling_returned_token_ids=self.pooling_returned_token_ids, + override_neuron_config=self.override_neuron_config, + override_pooler_config=self.override_pooler_config, ) def create_load_config(self) -> LoadConfig: diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 63c2bb6097079..3ab467e649b57 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -9,7 +9,8 @@ from vllm import envs from vllm.beam_search import (BeamSearchInstance, BeamSearchOutput, BeamSearchSequence, get_beam_search_score) -from vllm.engine.arg_utils import EngineArgs, HfOverrides, TaskOption +from vllm.engine.arg_utils import (EngineArgs, HfOverrides, PoolerConfig, + TaskOption) from vllm.engine.llm_engine import LLMEngine from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, apply_hf_chat_template, @@ -162,11 +163,7 @@ def __init__( mm_processor_kwargs: Optional[Dict[str, Any]] = None, # After positional args are removed, move this right below `model` task: TaskOption = "auto", - pooling_type: Optional[str] = None, - pooling_norm: Optional[bool] = None, - pooling_softmax: Optional[bool] = None, - pooling_step_tag_id: Optional[int] = None, - pooling_returned_token_ids: Optional[List[int]] = None, + override_pooler_config: Optional[PoolerConfig] = None, **kwargs, ) -> None: ''' 
@@ -202,11 +199,7 @@ def __init__( disable_async_output_proc=disable_async_output_proc, hf_overrides=hf_overrides, mm_processor_kwargs=mm_processor_kwargs, - pooling_type=pooling_type, - pooling_norm=pooling_norm, - pooling_softmax=pooling_softmax, - pooling_step_tag_id=pooling_step_tag_id, - pooling_returned_token_ids=pooling_returned_token_ids, + override_pooler_config=override_pooler_config, **kwargs, ) # Logic to switch between engines is done at runtime instead of import diff --git a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 024badbc17b96..6fee57a0a03eb 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -63,14 +63,14 @@ def from_config_with_defaults( return cls( pooling_type=PoolingType[pooler_config.pooling_type] if pooler_config.pooling_type is not None else pooling_type, - normalize=pooler_config.pooling_norm - if pooler_config.pooling_norm is not None else normalize, - softmax=pooler_config.pooling_softmax - if pooler_config.pooling_softmax is not None else softmax, - step_tag_id=pooler_config.pooling_step_tag_id - if pooler_config.pooling_step_tag_id is not None else step_tag_id, - returned_token_ids=pooler_config.pooling_returned_token_ids - if pooler_config.pooling_returned_token_ids is not None else + normalize=pooler_config.normalize + if pooler_config.normalize is not None else normalize, + softmax=pooler_config.softmax + if pooler_config.softmax is not None else softmax, + step_tag_id=pooler_config.step_tag_id + if pooler_config.step_tag_id is not None else step_tag_id, + returned_token_ids=pooler_config.returned_token_ids + if pooler_config.returned_token_ids is not None else returned_token_ids, ) @@ -94,10 +94,14 @@ def forward( pooled_data = hidden_states[last_token_flat_indices] elif self.pooling_type == PoolingType.ALL: offset = 0 - pooled_data = [] + pooled_data_lst = [] for prompt_len in prompt_lens: - pooled_data.append(hidden_states[offset:offset + prompt_len]) + pooled_data_i = hidden_states[offset:offset + prompt_len] + + pooled_data_lst.append(pooled_data_i) offset += prompt_len + + pooled_data = torch.stack(pooled_data_lst) elif self.pooling_type == PoolingType.MEAN: # Calculate mean pooling cumsum = torch.cumsum(hidden_states, dim=0) @@ -110,24 +114,26 @@ def forward( cumsum[end_indices - 1] - cumsum[start_indices] + hidden_states[start_indices]) / prompt_lens.unsqueeze(1) elif self.pooling_type == PoolingType.STEP: - if self.returned_token_ids is not None and len( - self.returned_token_ids) > 0: - logits = hidden_states[:, - self.returned_token_ids].softmax(dim=-1) - else: - logits = hidden_states.softmax(dim=-1) + returned_token_ids = self.returned_token_ids + if returned_token_ids is not None and len(returned_token_ids) > 0: + hidden_states = hidden_states[:, returned_token_ids] + + logits = hidden_states.softmax(dim=-1) + step_tag_id = self.step_tag_id + offset = 0 - pooled_data = [] + pooled_data_lst = [] for prompt_len, seq_data_i in zip( prompt_lens, pooling_metadata.seq_data.values()): - if self.step_tag_id is None: - pooled_data.append(logits[offset:offset + prompt_len]) - else: - step_idxs = torch.tensor( - seq_data_i.prompt_token_ids) == self.step_tag_id - pooled_data.append(logits[offset:offset + - prompt_len][step_idxs]) + pooled_data_i = logits[offset:offset + prompt_len] + if step_tag_id is not None: + token_ids = torch.tensor(seq_data_i.prompt_token_ids) + pooled_data_i = pooled_data_i[token_ids == step_tag_id] + offset += prompt_len + 
pooled_data_lst.append(pooled_data_i) + + pooled_data = torch.stack(pooled_data_lst) else: raise ValueError(f"Invalid pooling type: {self.pooling_type}") From 02dbf30e9a4389b41d95dd595bfe1224592dd404 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Thu, 14 Nov 2024 23:31:52 -0800 Subject: [PATCH 148/183] [Build] skip renaming files for release wheels pipeline (#9671) Signed-off-by: simon-mo --- .buildkite/release-pipeline.yaml | 21 +++++++----------- .buildkite/upload-wheels.sh | 38 ++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 13 deletions(-) create mode 100644 .buildkite/upload-wheels.sh diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index 3b7fa0f2d94b3..f78e360b7afd3 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -6,28 +6,23 @@ steps: - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ." - "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - # rename the files to change linux -> manylinux1 - - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done" - - "mv artifacts/dist/$(ls artifacts/dist) artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" - - "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/$BUILDKITE_COMMIT/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" - - "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl" + - "bash .buildkite/upload-wheels.sh" env: DOCKER_BUILDKIT: "1" - - block: "Build CUDA 11.8 wheel" - key: block-build-cu118-wheel - + # Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working. + # However, this block can be uncommented to save some compute hours. + # - block: "Build CUDA 11.8 wheel" + # key: block-build-cu118-wheel + - label: "Build wheel - CUDA 11.8" - depends_on: block-build-cu118-wheel + # depends_on: block-build-cu118-wheel agents: queue: cpu_queue commands: - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ." 
- "mkdir artifacts" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - # rename the files to change linux -> manylinux1 - - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done" - - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/" - - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/" + - "bash .buildkite/upload-wheels.sh" env: DOCKER_BUILDKIT: "1" diff --git a/.buildkite/upload-wheels.sh b/.buildkite/upload-wheels.sh new file mode 100644 index 0000000000000..541b395eddbe7 --- /dev/null +++ b/.buildkite/upload-wheels.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +set -ex + +# Assume wheels are in artifacts/dist/*.whl +wheel_files=(artifacts/dist/*.whl) + +# Check that exactly one wheel is found +if [[ ${#wheel_files[@]} -ne 1 ]]; then + echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}" + exit 1 +fi + +# Get the single wheel file +wheel="${wheel_files[0]}" + +# Rename 'linux' to 'manylinux1' in the wheel filename +new_wheel="${wheel/linux/manylinux1}" +mv -- "$wheel" "$new_wheel" +wheel="$new_wheel" + +# Extract the version from the wheel +version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) +echo "Version: $version" + +# If the version contains "dev", rename it to v1.0.0.dev for consistency +if [[ $version == *dev* ]]; then + new_version="1.0.0.dev" + new_wheel="${wheel/$version/$new_version}" + mv -- "$wheel" "$new_wheel" + wheel="$new_wheel" + version="$new_version" +fi + +# Upload the wheel to S3 +aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/" +aws s3 cp "$wheel" "s3://vllm-wheels/nightly/" +aws s3 cp "$wheel" "s3://vllm-wheels/$version/" \ No newline at end of file From 3d158cdc8dad62dfed45d5d808ae9f14f16e4dae Mon Sep 17 00:00:00 2001 From: wchen61 <183351030@qq.com> Date: Fri, 15 Nov 2024 16:52:20 +0800 Subject: [PATCH 149/183] Add default value to avoid Falcon crash (#5363) (#10347) Signed-off-by: wchen61 --- vllm/model_executor/models/falcon.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index dcfcb6694feb5..b3dbf063ac298 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -250,6 +250,9 @@ def __init__( self.mlp = FalconMLP(config, quant_config) self.config = config + if (not hasattr(config, "num_ln_in_parallel_attn")): + config.num_ln_in_parallel_attn = None + if (config.num_ln_in_parallel_attn is None and config.new_decoder_architecture): config.num_ln_in_parallel_attn = 2 From b311efd0bd84faffcb1fe47aaa27ffd8c53688be Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Fri, 15 Nov 2024 17:34:17 +0800 Subject: [PATCH 150/183] [Misc] Fix import error in tensorizer tests and cleanup some code (#10349) Signed-off-by: DarkLight1337 --- tests/tensorizer_loader/test_tensorizer.py | 70 ++++++++++--------- vllm/engine/llm_engine.py | 3 - vllm/entrypoints/llm.py | 3 - .../tool_parsers/abstract_tool_parser.py | 17 +++-- vllm/inputs/preprocess.py | 9 +-- vllm/utils.py | 20 ++++++ vllm/v1/engine/llm_engine.py | 3 - 7 files changed, 67 insertions(+), 58 deletions(-) diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index 32591ecfe6774..edd079bc7a389 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -8,10 +8,12 @@ import 
openai import pytest import torch +from huggingface_hub import snapshot_download from tensorizer import EncryptionParams from vllm import SamplingParams from vllm.engine.arg_utils import EngineArgs +# yapf conflicts with isort for this docstring # yapf: disable from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig, TensorSerializer, @@ -20,13 +22,14 @@ open_stream, serialize_vllm_model, tensorize_vllm_model) +# yapf: enable +from vllm.utils import import_from_path from ..conftest import VllmRunner -from ..utils import RemoteOpenAIServer +from ..utils import VLLM_PATH, RemoteOpenAIServer from .conftest import retry_until_skip -# yapf conflicts with isort for this docstring - +EXAMPLES_PATH = VLLM_PATH / "examples" prompts = [ "Hello, my name is", @@ -94,8 +97,8 @@ def test_can_deserialize_s3(vllm_runner): num_readers=1, s3_endpoint="object.ord1.coreweave.com", )) as loaded_hf_model: - deserialized_outputs = loaded_hf_model.generate(prompts, - sampling_params) + deserialized_outputs = loaded_hf_model.generate( + prompts, sampling_params) # noqa: E501 assert deserialized_outputs @@ -111,23 +114,21 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs( outputs = vllm_model.generate(prompts, sampling_params) - config_for_serializing = TensorizerConfig( - tensorizer_uri=model_path, - encryption_keyfile=key_path - ) + config_for_serializing = TensorizerConfig(tensorizer_uri=model_path, + encryption_keyfile=key_path) serialize_vllm_model(get_torch_model(vllm_model), config_for_serializing) config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path, encryption_keyfile=key_path) - with vllm_runner( - model_ref, - load_format="tensorizer", - model_loader_extra_config=config_for_deserializing) as loaded_vllm_model: # noqa: E501 + with vllm_runner(model_ref, + load_format="tensorizer", + model_loader_extra_config=config_for_deserializing + ) as loaded_vllm_model: # noqa: E501 - deserialized_outputs = loaded_vllm_model.generate(prompts, - sampling_params) + deserialized_outputs = loaded_vllm_model.generate( + prompts, sampling_params) # noqa: E501 assert outputs == deserialized_outputs @@ -156,14 +157,14 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner, def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): - from huggingface_hub import snapshot_download - - from examples.multilora_inference import (create_test_prompts, - process_requests) + multilora_inference = import_from_path( + "examples.multilora_inference", + EXAMPLES_PATH / "multilora_inference.py", + ) model_ref = "meta-llama/Llama-2-7b-hf" lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test") - test_prompts = create_test_prompts(lora_path) + test_prompts = multilora_inference.create_test_prompts(lora_path) # Serialize model before deserializing and binding LoRA adapters with vllm_runner(model_ref, ) as vllm_model: @@ -186,7 +187,8 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path): max_num_seqs=50, max_model_len=1000, ) as loaded_vllm_model: - process_requests(loaded_vllm_model.model.llm_engine, test_prompts) + multilora_inference.process_requests( + loaded_vllm_model.model.llm_engine, test_prompts) assert loaded_vllm_model @@ -217,8 +219,11 @@ def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path): ## Start OpenAI API server openai_args = [ - "--dtype", "float16", "--load-format", - "tensorizer", "--model-loader-extra-config", + "--dtype", + "float16", + "--load-format", + "tensorizer", + "--model-loader-extra-config", 
json.dumps(model_loader_extra_config), ] @@ -251,8 +256,7 @@ def test_raise_value_error_on_invalid_load_format(vllm_runner): torch.cuda.empty_cache() -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Requires 2 GPUs") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs") def test_tensorizer_with_tp_path_without_template(vllm_runner): with pytest.raises(ValueError): model_ref = "EleutherAI/pythia-1.4b" @@ -271,10 +275,9 @@ def test_tensorizer_with_tp_path_without_template(vllm_runner): ) -@pytest.mark.skipif(torch.cuda.device_count() < 2, - reason="Requires 2 GPUs") -def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner, - tmp_path): +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires 2 GPUs") +def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs( + vllm_runner, tmp_path): model_ref = "EleutherAI/pythia-1.4b" # record outputs from un-sharded un-tensorized model with vllm_runner( @@ -313,13 +316,12 @@ def test_deserialized_encrypted_vllm_model_with_tp_has_same_outputs(vllm_runner, disable_custom_all_reduce=True, enforce_eager=True, model_loader_extra_config=tensorizer_config) as loaded_vllm_model: - deserialized_outputs = loaded_vllm_model.generate(prompts, - sampling_params) + deserialized_outputs = loaded_vllm_model.generate( + prompts, sampling_params) assert outputs == deserialized_outputs - @retry_until_skip(3) def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path): gc.collect() @@ -337,8 +339,8 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path): with vllm_runner(model_ref, load_format="tensorizer", model_loader_extra_config=config) as loaded_vllm_model: - deserialized_outputs = loaded_vllm_model.generate(prompts, - sampling_params) + deserialized_outputs = loaded_vllm_model.generate( + prompts, sampling_params) # noqa: E501 assert outputs == deserialized_outputs diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index f5299746d845d..aa9c7893c4cfe 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -2002,9 +2002,6 @@ def create_trace_span(self, seq_group: SequenceGroup) -> None: SpanAttributes.LLM_LATENCY_TIME_IN_MODEL_EXECUTE, metrics.model_execute_time) - def is_encoder_decoder_model(self): - return self.input_preprocessor.is_encoder_decoder_model() - def _validate_model_inputs(self, inputs: ProcessorInputs, lora_request: Optional[LoRARequest]): if is_encoder_decoder_inputs(inputs): diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 3ab467e649b57..4b33fc1458ee3 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -964,6 +964,3 @@ def _run_engine( # This is necessary because some requests may be finished earlier than # its previous requests. 
return sorted(outputs, key=lambda x: int(x.request_id)) - - def _is_encoder_decoder_model(self): - return self.llm_engine.is_encoder_decoder_model() diff --git a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py index 5ce31bd4d941b..aa7c201098935 100644 --- a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py @@ -1,5 +1,3 @@ -import importlib -import importlib.util import os from functools import cached_property from typing import Callable, Dict, List, Optional, Sequence, Type, Union @@ -9,7 +7,7 @@ ExtractedToolCallInformation) from vllm.logger import init_logger from vllm.transformers_utils.tokenizer import AnyTokenizer -from vllm.utils import is_list_of +from vllm.utils import import_from_path, is_list_of logger = init_logger(__name__) @@ -149,13 +147,14 @@ def _register(module): @classmethod def import_tool_parser(cls, plugin_path: str) -> None: """ - Import a user defined tool parser by the path of the tool parser define + Import a user-defined tool parser by the path of the tool parser define file. """ module_name = os.path.splitext(os.path.basename(plugin_path))[0] - spec = importlib.util.spec_from_file_location(module_name, plugin_path) - if spec is None or spec.loader is None: - logger.error("load %s from %s failed.", module_name, plugin_path) + + try: + import_from_path(module_name, plugin_path) + except Exception: + logger.exception("Failed to load module '%s' from %s.", + module_name, plugin_path) return - module = importlib.util.module_from_spec(spec) - spec.loader.exec_module(module) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index fdf28615fda10..aacff87df6d79 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -67,7 +67,7 @@ def get_decoder_start_token_id(self) -> Optional[int]: model config is unavailable. 
''' - if not self.is_encoder_decoder_model(): + if not self.model_config.is_encoder_decoder: print_warning_once("Using None for decoder start token id because " "this is not an encoder/decoder model.") return None @@ -632,7 +632,7 @@ def preprocess( prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> ProcessorInputs: """Preprocess the input prompt.""" - if self.is_encoder_decoder_model(): + if self.model_config.is_encoder_decoder: # Encoder-decoder model requires special mapping of # input prompts to encoder & decoder return self._process_encoder_decoder_prompt( @@ -660,7 +660,7 @@ async def preprocess_async( prompt_adapter_request: Optional[PromptAdapterRequest] = None, ) -> ProcessorInputs: """Async version of :meth:`preprocess`.""" - if self.is_encoder_decoder_model(): + if self.model_config.is_encoder_decoder: # Encoder-decoder model requires special mapping of # input prompts to encoder & decoder return await self._process_encoder_decoder_prompt_async( @@ -679,6 +679,3 @@ async def preprocess_async( lora_request=lora_request, prompt_adapter_request=prompt_adapter_request, ) - - def is_encoder_decoder_model(self): - return self.model_config.is_encoder_decoder diff --git a/vllm/utils.py b/vllm/utils.py index 1b02cbff79f78..111460a29de47 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -5,6 +5,7 @@ import enum import gc import getpass +import importlib.util import inspect import ipaddress import os @@ -1539,6 +1540,25 @@ def is_in_doc_build() -> bool: return False +def import_from_path(module_name: str, file_path: Union[str, os.PathLike]): + """ + Import a Python file according to its file path. + + Based on the official recipe: + https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly + """ + spec = importlib.util.spec_from_file_location(module_name, file_path) + if spec is None: + raise ModuleNotFoundError(f"No module named '{module_name}'") + + assert spec.loader is not None + + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + spec.loader.exec_module(module) + return module + + # create a library to hold the custom op vllm_lib = Library("vllm", "FRAGMENT") # noqa diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 4ebfff9584267..75a77be750acd 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -163,9 +163,6 @@ def step(self) -> List[RequestOutput]: def get_model_config(self): pass - def is_encoder_decoder_model(self): - pass - def start_profile(self): pass From 26908554b2ecc8f76fa57942566629ec5713ef5b Mon Sep 17 00:00:00 2001 From: Xin Yang <105740670+xyang16@users.noreply.github.com> Date: Fri, 15 Nov 2024 02:22:57 -0800 Subject: [PATCH 151/183] [Doc] Remove float32 choice from --lora-dtype (#10348) Signed-off-by: Xin Yang --- vllm/engine/arg_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 4afc61c8d0c4c..dbbcd6e95b791 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -601,7 +601,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: '--lora-dtype', type=str, default=EngineArgs.lora_dtype, - choices=['auto', 'float16', 'bfloat16', 'float32'], + choices=['auto', 'float16', 'bfloat16'], help=('Data type for LoRA. 
If auto, will default to ' 'base model dtype.')) parser.add_argument( From 1d65ec7eeb35f03eb87ed080094f1aa5ff2ae3d3 Mon Sep 17 00:00:00 2001 From: Jee Jee Li Date: Fri, 15 Nov 2024 18:34:58 +0800 Subject: [PATCH 152/183] [Bugfix] Fix fully sharded LoRA bug (#10352) Signed-off-by: Jee Jee Li --- vllm/lora/fully_sharded_layers.py | 23 ++++++++++++----------- vllm/lora/layers.py | 15 ++++++++------- vllm/worker/worker.py | 2 +- 3 files changed, 21 insertions(+), 19 deletions(-) diff --git a/vllm/lora/fully_sharded_layers.py b/vllm/lora/fully_sharded_layers.py index 04fc635828d4d..3443c3feb4d2a 100644 --- a/vllm/lora/fully_sharded_layers.py +++ b/vllm/lora/fully_sharded_layers.py @@ -165,15 +165,14 @@ class MergedColumnParallelLinearWithShardedLoRA( def slice_lora_a( self, lora_a: List[Union[torch.Tensor, None]] ) -> List[Union[torch.Tensor, None]]: - if lora_a[0] is None or lora_a[1] is None: - return lora_a + #NOTE: lora_a contains 2 subloras, and each sublora could be None. output_shard_size = self.lora_a_stacked[0].shape[2] output_start_idx = self.tp_rank * output_shard_size lora_a = [ - lora_a[0][:, - output_start_idx:output_start_idx + output_shard_size], - lora_a[1][:, - output_start_idx:output_start_idx + output_shard_size], + lora_a[0][:, output_start_idx:output_start_idx + + output_shard_size] if lora_a[0] is not None else None, + lora_a[1][:, output_start_idx:output_start_idx + + output_shard_size] if lora_a[1] is not None else None, ] return lora_a @@ -261,14 +260,16 @@ class MergedQKVParallelLinearWithShardedLora(MergedQKVParallelLinearWithLora): def slice_lora_a( self, lora_a: List[Union[torch.Tensor, None]] ) -> List[Union[torch.Tensor, None]]: - if lora_a[0] is None or lora_a[1] is None or lora_a[2] is None: - return lora_a + # NOTE: lora_a contains 3 subloras, and each sublora could be None. shard_size = [self.lora_a_stacked[i].shape[2] for i in range(3)] start_idx = [self.tp_rank * shard_size[i] for i in range(3)] lora_a = [ - lora_a[0][:, start_idx[0]:start_idx[0] + shard_size[0]], - lora_a[1][:, start_idx[1]:start_idx[1] + shard_size[1]], - lora_a[2][:, start_idx[2]:start_idx[2] + shard_size[2]], + lora_a[0][:, start_idx[0]:start_idx[0] + + shard_size[0]] if lora_a[0] is not None else None, + lora_a[1][:, start_idx[1]:start_idx[1] + + shard_size[1]] if lora_a[1] is not None else None, + lora_a[2][:, start_idx[2]:start_idx[2] + + shard_size[2]] if lora_a[2] is not None else None, ] return lora_a diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py index 7429c60e0222d..6afe80219fe07 100644 --- a/vllm/lora/layers.py +++ b/vllm/lora/layers.py @@ -685,26 +685,27 @@ def slice_lora_a( def slice_lora_b( self, lora_b: List[Union[torch.Tensor, None]] ) -> List[Union[torch.Tensor, None]]: - if lora_b[0] is None or lora_b[1] is None: - return lora_b + #NOTE: lora_b contains 2 subloras, and each sublora could be None. shard_size = self.output_dim start_idx = self.tp_rank * shard_size end_idx = (self.tp_rank + 1) * shard_size lora_b = [ - lora_b[0][:, start_idx:end_idx], - lora_b[1][:, start_idx:end_idx], + lora_b[0][:, start_idx:end_idx] if lora_b[0] is not None else None, + lora_b[1][:, start_idx:end_idx] if lora_b[1] is not None else None, ] return lora_b def slice_bias( self, bias: List[Union[torch.Tensor, None]]) -> List[Union[torch.Tensor, None]]: - if bias[0] is None or bias[1] is None: - return bias + # NOTE : each bias could be None. 
shard_size = self.output_dim start_idx = self.tp_rank * shard_size end_idx = (self.tp_rank + 1) * shard_size - bias = [bias[0][start_idx:end_idx], bias[1][start_idx:end_idx]] + bias = [ + bias[0][start_idx:end_idx] if bias[0] is not None else None, + bias[1][start_idx:end_idx] if bias[1] is not None else None + ] return bias def set_lora( diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index d8c8011a585d8..d3ca6d9d0b17e 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -232,7 +232,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]: logger.info( "Memory profiling results: total_gpu_memory=%.2fGiB" " initial_memory_usage=%.2fGiB peak_torch_memory=%.2fGiB" - " memory_usage_post_profile=%.2fGib" + " memory_usage_post_profile=%.2fGiB" " non_torch_memory=%.2fGiB kv_cache_size=%.2fGiB" " gpu_memory_utilization=%.2f", total_gpu_memory / (1024**3), (total_gpu_memory - free_memory_pre_profile) / (1024**3), From f2056f726d9b0f257bc0e79938a9a6f483ce9e2d Mon Sep 17 00:00:00 2001 From: shangmingc Date: Fri, 15 Nov 2024 20:40:30 +0800 Subject: [PATCH 153/183] [Misc] Fix some help info of arg_utils to improve readability (#10362) --- vllm/engine/arg_utils.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index dbbcd6e95b791..d73f95f59c71f 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -272,10 +272,10 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument( '--allowed-local-media-path', type=str, - help="Allowing API requests to read local images or videos" - "from directories specified by the server file system." - "This is a security risk." - "Should only be enabled in trusted environments") + help="Allowing API requests to read local images or videos " + "from directories specified by the server file system. " + "This is a security risk. " + "Should only be enabled in trusted environments.") parser.add_argument('--download-dir', type=nullable_str, default=EngineArgs.download_dir, @@ -340,7 +340,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'scaling factors. This should generally be supplied, when ' 'KV cache dtype is FP8. Otherwise, KV cache scaling factors ' 'default to 1.0, which may cause accuracy issues. ' - 'FP8_E5M2 (without scaling) is only supported on cuda version' + 'FP8_E5M2 (without scaling) is only supported on cuda version ' 'greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead ' 'supported for common inference criteria.') parser.add_argument('--max-model-len', @@ -446,9 +446,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'this argument can be seen as a virtual way to increase ' 'the GPU memory size. For example, if you have one 24 GB ' 'GPU and set this to 10, virtually you can think of it as ' - 'a 34 GB GPU. Then you can load a 13B model with BF16 weight,' + 'a 34 GB GPU. Then you can load a 13B model with BF16 weight, ' 'which requires at least 26GB GPU memory. Note that this ' - 'requires fast CPU-GPU interconnect, as part of the model is' + 'requires fast CPU-GPU interconnect, as part of the model is ' 'loaded from CPU memory to GPU memory on the fly in each ' 'model forward pass.') parser.add_argument( @@ -468,7 +468,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: type=int, default=None, help='If specified, ignore GPU profiling result and use this number' - 'of GPU blocks. 
Used for testing preemption.') + ' of GPU blocks. Used for testing preemption.') parser.add_argument('--max-num-batched-tokens', type=int, default=EngineArgs.max_num_batched_tokens, @@ -514,7 +514,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: parser.add_argument('--hf-overrides', type=json.loads, default=EngineArgs.hf_overrides, - help='Extra arguments for the HuggingFace config.' + help='Extra arguments for the HuggingFace config. ' 'This should be a JSON string that will be ' 'parsed into a dictionary.') parser.add_argument('--enforce-eager', @@ -572,7 +572,7 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: '--mm-processor-kwargs', default=None, type=json.loads, - help=('Overrides for the multimodal input mapping/processing,' + help=('Overrides for the multimodal input mapping/processing, ' 'e.g., image processor. For example: {"num_crops": 4}.')) # LoRA related configs @@ -822,9 +822,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: "of the provided names. The model name in the model " "field of a response will be the first name in this " "list. If not specified, the model name will be the " - "same as the `--model` argument. Noted that this name(s)" + "same as the `--model` argument. Noted that this name(s) " "will also be used in `model_name` tag content of " - "prometheus metrics, if multiple names provided, metrics" + "prometheus metrics, if multiple names provided, metrics " "tag will take the first one.") parser.add_argument('--qlora-adapter-name-or-path', type=str, From 3a763ba0c3a92fdde78e855ded94f9ff29e02088 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 15 Nov 2024 05:55:51 -0800 Subject: [PATCH 154/183] [core][misc] keep compatibility for old-style classes (#10356) Signed-off-by: youkaichao --- vllm/model_executor/model_loader/loader.py | 40 +++++++++++++++------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 5bcae37961195..140b61fe6d56a 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -94,18 +94,34 @@ def _initialize_model(vllm_config: VllmConfig, prefix: str = "") -> nn.Module: model_config = vllm_config.model_config model_class, _ = get_model_architecture(model_config) signatures = inspect.signature(model_class.__init__) - # collect all kw-only parameters - kw_only_params = [ - param.name for param in signatures.parameters.values() - if param.kind == inspect.Parameter.KEYWORD_ONLY - ] - assert "vllm_config" in kw_only_params and "prefix" in kw_only_params, \ - ("vLLM model class must accept `vllm_config` and `prefix` as kw-only " - "arguments. Possibly you have an old-style model class registered from " - "out of tree and it is used for new vLLM version. " - "Please check https://docs.vllm.ai/en/latest/design/class_hierarchy.html " - "for the design and update the model class accordingly.") - return model_class(vllm_config=vllm_config, prefix=prefix) + all_params = [param.name for param in signatures.parameters.values()] + if "vllm_config" in all_params and "prefix" in all_params: + # new-style model class + return model_class(vllm_config=vllm_config, prefix=prefix) + msg = ("vLLM model class should accept `vllm_config` and `prefix` as " + "input arguments. Possibly you have an old-style model class" + " registered from out of tree and it is used for new vLLM version. 
" + "Check https://docs.vllm.ai/en/latest/design/class_hierarchy.html " + "for the design and update the model class accordingly.") + logger.warning(msg) + logger.warning( + "Trying to guess the arguments for old-style model class %s", + model_class) + # try to be compatible with old-style model class + kwargs = {} + if "prefix" in all_params: + kwargs["prefix"] = prefix + if "config" in all_params: + kwargs["config"] = model_config.hf_config + if "cache_config" in all_params: + kwargs["cache_config"] = vllm_config.cache_config + if "quant_config" in all_params: + kwargs["quant_config"] = vllm_config.quant_config + if "lora_config" in all_params: + kwargs["lora_config"] = vllm_config.lora_config + if "scheduler_config" in all_params: + kwargs["scheduler_config"] = vllm_config.scheduler_config + return model_class(**kwargs) class BaseModelLoader(ABC): From 691a3ec0475ba1fe4255bc975d02cc7a4392bf2c Mon Sep 17 00:00:00 2001 From: Guillaume Calmettes Date: Fri, 15 Nov 2024 15:50:40 +0100 Subject: [PATCH 155/183] [Bugfix] Ensure special tokens are properly filtered out for guided structured output with MistralTokenizer (#10363) Signed-off-by: Guillaume Calmettes --- requirements-common.txt | 4 ++-- vllm/transformers_utils/tokenizers/mistral.py | 19 +++++++++++++++---- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/requirements-common.txt b/requirements-common.txt index acb766d25a2d9..c68004d27626b 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -17,7 +17,7 @@ pillow # Required for image processing prometheus_client >= 0.18.0 prometheus-fastapi-instrumentator >= 7.0.0 tiktoken >= 0.6.0 # Required for DBRX tokenizer -lm-format-enforcer == 0.10.6 +lm-format-enforcer >= 0.10.9, < 0.11 outlines >= 0.0.43, < 0.1 typing_extensions >= 4.10 filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 @@ -31,4 +31,4 @@ pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 einops # Required for Qwen2-VL. -compressed-tensors == 0.8.0 # required for compressed-tensors \ No newline at end of file +compressed-tensors == 0.8.0 # required for compressed-tensors diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index b1cb9a15b943b..83b3c37d6f04c 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -174,18 +174,29 @@ def _download_mistral_tokenizer_from_hf(tokenizer_name: str, revision=revision) return tokenizer_file - # the following attributes are set to fit VLLM's design + # the following attributes are set to fit VLLM's design and are used + # by the guided structured output backends. 
@property def all_special_tokens_extended(self) -> List[str]: - return [] + # tekken defines its own extended special tokens list + if hasattr(self.tokenizer, "SPECIAL_TOKENS"): + special_tokens = self.tokenizer.SPECIAL_TOKENS + else: + special_tokens = list(SpecialTokens) + return [ + s.value if isinstance(s, SpecialTokens) else s + for s in special_tokens + ] @property def all_special_tokens(self) -> List[str]: - return [] + return self.all_special_tokens_extended @property def all_special_ids(self) -> List[int]: - return [] + return [ + self.all_special_tokens.index(t) for t in self.all_special_tokens + ] @property def bos_token_id(self) -> int: From 79ee45b42822d750ead6121c8c741c8a947bfeaf Mon Sep 17 00:00:00 2001 From: ElizaWszola Date: Fri, 15 Nov 2024 17:31:18 +0100 Subject: [PATCH 156/183] [Misc] Bump up test_fused_moe tolerance (#10364) Signed-off-by: ElizaWszola --- tests/kernels/test_moe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/kernels/test_moe.py b/tests/kernels/test_moe.py index 17428ebfc2e28..8b23b62826053 100644 --- a/tests/kernels/test_moe.py +++ b/tests/kernels/test_moe.py @@ -45,7 +45,7 @@ def test_fused_moe( score = torch.randn((m, e), device="cuda", dtype=dtype) triton_output = fused_moe(a, w1, w2, score, topk, renormalize=False) torch_output = torch_moe(a, w1, w2, score, topk) - torch.testing.assert_close(triton_output, torch_output, atol=1e-2, rtol=0) + torch.testing.assert_close(triton_output, torch_output, atol=2e-2, rtol=0) @pytest.mark.parametrize("dtype", From a6221a144af772fd1a68fe7e627935dc53e81738 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Fri, 15 Nov 2024 09:48:07 -0800 Subject: [PATCH 157/183] [Misc] bump mistral common version (#10367) Signed-off-by: simon-mo --- requirements-common.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-common.txt b/requirements-common.txt index c68004d27626b..f62ad66a1ecc4 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -26,7 +26,7 @@ pyzmq msgspec gguf == 0.10.0 importlib_metadata -mistral_common[opencv] >= 1.4.4 +mistral_common[opencv] >= 1.5.0 pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 From c76ac49d266e27aa3fea84ef2df1f813d24c91c7 Mon Sep 17 00:00:00 2001 From: Simon Mo Date: Fri, 15 Nov 2024 12:47:40 -0800 Subject: [PATCH 158/183] [Docs] Add Nebius as sponsors (#10371) Signed-off-by: simon-mo --- README.md | 1 + docs/source/community/sponsors.md | 1 + 2 files changed, 2 insertions(+) diff --git a/README.md b/README.md index 6530886ed7de2..0ef073210d070 100644 --- a/README.md +++ b/README.md @@ -100,6 +100,7 @@ vLLM is a community project. Our compute resources for development and testing a - Dropbox - Google Cloud - Lambda Lab +- Nebius - NVIDIA - Replicate - Roblox diff --git a/docs/source/community/sponsors.md b/docs/source/community/sponsors.md index 52fbf9a577c7e..c6f83b3a92ca0 100644 --- a/docs/source/community/sponsors.md +++ b/docs/source/community/sponsors.md @@ -15,6 +15,7 @@ vLLM is a community project. 
Our compute resources for development and testing a - Dropbox - Google Cloud - Lambda Lab +- Nebius - NVIDIA - Replicate - Roblox From a067f85e08f6604b328a16efe3ead4629e0ead5b Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Fri, 15 Nov 2024 16:13:53 -0500 Subject: [PATCH 159/183] [Frontend] Add --version flag to CLI (#10369) Signed-off-by: Russell Bryant --- vllm/scripts.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm/scripts.py b/vllm/scripts.py index 4e4c071784287..a51c21cfa29e7 100644 --- a/vllm/scripts.py +++ b/vllm/scripts.py @@ -9,6 +9,7 @@ from openai import OpenAI from openai.types.chat import ChatCompletionMessageParam +import vllm.version from vllm.engine.arg_utils import EngineArgs from vllm.entrypoints.openai.api_server import run_server from vllm.entrypoints.openai.cli_args import (make_arg_parser, @@ -143,6 +144,11 @@ def main(): env_setup() parser = FlexibleArgumentParser(description="vLLM CLI") + parser.add_argument('-v', + '--version', + action='version', + version=vllm.version.__version__) + subparsers = parser.add_subparsers(required=True, dest="subparser") serve_parser = subparsers.add_parser( From 3e8d14d8a1e3e54655f79d7bb3481cde02943281 Mon Sep 17 00:00:00 2001 From: Russell Bryant Date: Fri, 15 Nov 2024 16:20:20 -0500 Subject: [PATCH 160/183] [Doc] Move PR template content to docs (#10159) Signed-off-by: Russell Bryant --- .github/PULL_REQUEST_TEMPLATE.md | 71 +--------------- .github/scripts/cleanup_pr_body.sh | 25 +++++- docs/source/contributing/overview.rst | 114 +++++++++++++++++++++++--- 3 files changed, 126 insertions(+), 84 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index be0afc6305044..51a73c857ccb2 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -2,73 +2,4 @@ FILL IN THE PR DESCRIPTION HERE FIX #xxxx (*link existing issues this PR will resolve*) -**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE** - ---- - -
-<details>
-<summary> PR Checklist (Click to Expand) </summary>
-
-<p>Thank you for your contribution to vLLM! Before submitting the pull request, please ensure the PR meets the following criteria. This helps vLLM maintain the code quality and improve the efficiency of the review process.</p>
-
-<h3>PR Title and Classification</h3>
-<p>Only specific types of PRs will be reviewed. The PR title is prefixed appropriately to indicate the type of change. Please use one of the following:</p>
-<ul>
-    <li>[Bugfix] for bug fixes.</li>
-    <li>[CI/Build] for build or continuous integration improvements.</li>
-    <li>[Doc] for documentation fixes and improvements.</li>
-    <li>[Model] for adding a new model or improving an existing model. Model name should appear in the title.</li>
-    <li>[Frontend] For changes on the vLLM frontend (e.g., OpenAI API server, LLM class, etc.)</li>
-    <li>[Kernel] for changes affecting CUDA kernels or other compute kernels.</li>
-    <li>[Core] for changes in the core vLLM logic (e.g., LLMEngine, AsyncLLMEngine, Scheduler, etc.)</li>
-    <li>[Hardware][Vendor] for hardware-specific changes. Vendor name should appear in the prefix (e.g., [Hardware][AMD]).</li>
-    <li>[Misc] for PRs that do not fit the above categories. Please use this sparingly.</li>
-</ul>
-
-<p>Note: If the PR spans more than one category, please include all relevant prefixes.</p>
-
-<h3>Code Quality</h3>
-
-<p>The PR need to meet the following code quality standards:</p>
-
-<ul>
-    <li>We adhere to Google Python style guide and Google C++ style guide.</li>
-    <li>Pass all linter checks. Please use format.sh to format your code.</li>
-    <li>The code need to be well-documented to ensure future contributors can easily understand the code.</li>
-    <li>Include sufficient tests to ensure the project to stay correct and robust. This includes both unit tests and integration tests.</li>
-    <li>Please add documentation to docs/source/ if the PR modifies the user-facing behaviors of vLLM. It helps vLLM user understand and utilize the new features or changes.</li>
-</ul>
-
-<h3>Adding or changing kernels</h3>
-<p>Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.</p>
-<ul>
-    <li>Make sure custom ops are registered following PyTorch guidelines: Custom C++ and CUDA Operators and The Custom Operators Manual</li>
-    <li>Custom operations that return Tensors require meta-functions. Meta-functions should be implemented and registered in python so that dynamic dims can be handled automatically. See above documents for a description of meta-functions.</li>
-    <li>Use torch.libary.opcheck() to test the function registration and meta-function for any registered ops. See tests/kernels for examples.</li>
-    <li>When changing the C++ signature of an existing op, the schema must be updated to reflect the changes.</li>
-    <li>If a new custom type is needed, see the following document: Custom Class Support in PT2.</li>
-</ul>
-
-<h3>Notes for Large Changes</h3>
-<p>Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with rfc-required and might not go through the PR.</p>
-
-<h3>What to Expect for the Reviews</h3>
-
-<p>The goal of the vLLM team is to be a transparent reviewing machine. We would like to make the review process transparent and efficient and make sure no contributor feel confused or frustrated. However, the vLLM team is small, so we need to prioritize some PRs over others. Here is what you can expect from the review process:</p>
-
-<ul>
-    <li>After the PR is submitted, the PR will be assigned to a reviewer. Every reviewer will pick up the PRs based on their expertise and availability.</li>
-    <li>After the PR is assigned, the reviewer will provide status update every 2-3 days. If the PR is not reviewed within 7 days, please feel free to ping the reviewer or the vLLM team.</li>
-    <li>After the review, the reviewer will put an action-required label on the PR if there are changes required. The contributor should address the comments and ping the reviewer to re-review the PR.</li>
-    <li>Please respond to all comments within a reasonable time frame. If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion.</li>
-</ul>
-
-<h3>Thank You</h3>
-
-<p>Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. Your contributions make vLLM a great tool for everyone!</p>
-
-</details>
+**BEFORE SUBMITTING, PLEASE READ https://docs.vllm.ai/en/latest/contributing/overview.html **
diff --git a/.github/scripts/cleanup_pr_body.sh b/.github/scripts/cleanup_pr_body.sh
index 3b2da7b9f8966..3246c6f9bc4b7 100755
--- a/.github/scripts/cleanup_pr_body.sh
+++ b/.github/scripts/cleanup_pr_body.sh
@@ -15,19 +15,36 @@ NEW=/tmp/new_pr_body.txt
 gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
 cp "${OLD}" "${NEW}"
 
-# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
-sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE\*\*/,$d' "${NEW}"
-
 # Remove "FIX #xxxx (*link existing issues this PR will resolve*)"
 sed -i '/FIX #xxxx.*$/d' "${NEW}"
 
 # Remove "FILL IN THE PR DESCRIPTION HERE"
 sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}"
 
+# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
+sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"
+
+# Remove HTML
section that includes text of "PR Checklist (Click to Expand)" +python3 - <.*?.*?PR Checklist \(Click to Expand\).*?.*?
', re.DOTALL) +content = re.sub(pattern, '', content) + +with open("${NEW}", "w") as file: + file.write(content) +EOF + # Run this only if ${NEW} is different than ${OLD} if ! cmp -s "${OLD}" "${NEW}"; then - echo "Updating PR body" gh pr edit --body-file "${NEW}" "${PR_NUMBER}" + echo + echo "Updated PR body:" + echo + cat "${NEW}" else echo "No changes needed" fi diff --git a/docs/source/contributing/overview.rst b/docs/source/contributing/overview.rst index ac2d2b2fe4103..4cea0afdaea74 100644 --- a/docs/source/contributing/overview.rst +++ b/docs/source/contributing/overview.rst @@ -41,15 +41,6 @@ Testing Contribution Guidelines ======================= -DCO and Signed-off-by ----------------------- - -When contributing changes to this project, you must agree to the `DCO `_. -Commits must include a ``Signed-off-by:`` header which certifies agreement with -the terms of the `DCO `_. - -Using ``-s`` with ``git commit`` will automatically add this header. - Issues ------ @@ -61,7 +52,110 @@ If you encounter a bug or have a feature request, please `search existing issues Pull Requests & Code Reviews ---------------------------- -Please check the PR checklist in the `PR template `_ for a detailed guide for contribution. +Thank you for your contribution to vLLM! Before submitting the pull request, +please ensure the PR meets the following criteria. This helps vLLM maintain the +code quality and improve the efficiency of the review process. + +DCO and Signed-off-by +^^^^^^^^^^^^^^^^^^^^^ + +When contributing changes to this project, you must agree to the `DCO `_. +Commits must include a ``Signed-off-by:`` header which certifies agreement with +the terms of the `DCO `_. + +Using ``-s`` with ``git commit`` will automatically add this header. + +PR Title and Classification +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Only specific types of PRs will be reviewed. The PR title is prefixed +appropriately to indicate the type of change. Please use one of the following: + +- ``[Bugfix]`` for bug fixes. +- ``[CI/Build]`` for build or continuous integration improvements. +- ``[Doc]`` for documentation fixes and improvements. +- ``[Model]`` for adding a new model or improving an existing model. Model name + should appear in the title. +- ``[Frontend]`` For changes on the vLLM frontend (e.g., OpenAI API server, + ``LLM`` class, etc.) +- ``[Kernel]`` for changes affecting CUDA kernels or other compute kernels. +- ``[Core]`` for changes in the core vLLM logic (e.g., ``LLMEngine``, + ``AsyncLLMEngine``, ``Scheduler``, etc.) +- ``[Hardware][Vendor]`` for hardware-specific changes. Vendor name should + appear in the prefix (e.g., ``[Hardware][AMD]``). +- ``[Misc]`` for PRs that do not fit the above categories. Please use this + sparingly. + +.. note:: + If the PR spans more than one category, please include all relevant prefixes. + +Code Quality +^^^^^^^^^^^^ + +The PR needs to meet the following code quality standards: + +- We adhere to `Google Python style guide + `_ and `Google C++ style guide + `_. +- Pass all linter checks. Please use `format.sh + `_ to format your + code. +- The code needs to be well-documented to ensure future contributors can easily + understand the code. +- Include sufficient tests to ensure the project stays correct and robust. This + includes both unit tests and integration tests. +- Please add documentation to ``docs/source/`` if the PR modifies the + user-facing behaviors of vLLM. It helps vLLM users understand and utilize the + new features or changes. 
+ +Adding or Changing Kernels +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Each custom kernel needs a schema and one or more implementations to be registered with PyTorch. + +- Make sure custom ops are registered following PyTorch guidelines: + `Custom C++ and CUDA Operators `_ + and `The Custom Operators Manual `_. +- Custom operations that return ``Tensors`` require meta-functions. + Meta-functions should be implemented and registered in Python so that dynamic + dims can be handled automatically. See above documents for a description of + meta-functions. +- Use `torch.library.opcheck() `_ + to test the function registration and meta-function for any registered ops. + See ``tests/kernels`` for examples. +- When changing the C++ signature of an existing op, the schema must be updated + to reflect the changes. +- If a new custom type is needed, see the following document: + `Custom Class Support in PT2 `_. + +Notes for Large Changes +^^^^^^^^^^^^^^^^^^^^^^^ + +Please keep the changes as concise as possible. For major architectural changes +(>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue +(RFC) discussing the technical design and justification. Otherwise, we will tag +it with ``rfc-required`` and might not go through the PR. + +What to Expect for the Reviews +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The goal of the vLLM team is to be a *transparent reviewing machine*. We would +like to make the review process transparent and efficient and make sure no +contributor feels confused or frustrated. However, the vLLM team is small, so we +need to prioritize some PRs over others. Here is what you can expect from the +review process: + +- After the PR is submitted, the PR will be assigned to a reviewer. Every + reviewer will pick up the PRs based on their expertise and availability. +- After the PR is assigned, the reviewer will provide status updates every 2-3 + days. If the PR is not reviewed within 7 days, please feel free to ping the + reviewer or the vLLM team. +- After the review, the reviewer will put an ``action-required`` label on the PR + if there are changes required. The contributor should address the comments and + ping the reviewer to re-review the PR. +- Please respond to all comments within a reasonable time frame. If a comment + isn't clear or you disagree with a suggestion, feel free to ask for + clarification or discuss the suggestion. Thank You --------- From 4f168f69a3e856bda3f30e02fcee7db2a01ff32b Mon Sep 17 00:00:00 2001 From: Michael Green <59619482+mikegre-google@users.noreply.github.com> Date: Fri, 15 Nov 2024 21:26:17 +0000 Subject: [PATCH 161/183] [Docs] Misc updates to TPU installation instructions (#10165) --- .../getting_started/tpu-installation.rst | 54 ++++++++++++------- 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/docs/source/getting_started/tpu-installation.rst b/docs/source/getting_started/tpu-installation.rst index 75ab2b6ba02dc..22cc684a1c778 100644 --- a/docs/source/getting_started/tpu-installation.rst +++ b/docs/source/getting_started/tpu-installation.rst @@ -44,15 +44,18 @@ Requirements Provision Cloud TPUs ==================== -You can provision Cloud TPUs using the `Cloud TPU API `_` -or the `queued resources `_` -API. This section shows how to create TPUs using the queued resource API. -For more information about using the Cloud TPU API, see `Create a Cloud TPU using the Create Node API `_. -`Queued resources `_ -enable you to request Cloud TPU resources in a queued manner. 
When you request -queued resources, the request is added to a queue maintained by the Cloud TPU -service. When the requested resource becomes available, it's assigned to your -Google Cloud project for your immediate exclusive use. +You can provision Cloud TPUs using the `Cloud TPU API `_ +or the `queued resources `_ +API. This section shows how to create TPUs using the queued resource API. For +more information about using the Cloud TPU API, see `Create a Cloud TPU using the Create Node API `_. +Queued resources enable you to request Cloud TPU resources in a queued manner. +When you request queued resources, the request is added to a queue maintained by +the Cloud TPU service. When the requested resource becomes available, it's +assigned to your Google Cloud project for your immediate exclusive use. + +.. note:: + In all of the following commands, replace the ALL CAPS parameter names with + appropriate values. See the parameter descriptions table for more information. Provision a Cloud TPU with the queued resource API -------------------------------------------------- @@ -68,6 +71,7 @@ Create a TPU v5e with 4 TPU chips: --runtime-version RUNTIME_VERSION \ --service-account SERVICE_ACCOUNT + .. list-table:: Parameter descriptions :header-rows: 1 @@ -81,12 +85,13 @@ Create a TPU v5e with 4 TPU chips: * - PROJECT_ID - Your Google Cloud project * - ZONE - - The `zone `_ where you - want to create your Cloud TPU. + - The GCP zone where you want to create your Cloud TPU. The value you use + depends on the version of TPUs you are using. For more information, see + `TPU regions and zones `_ * - ACCELERATOR_TYPE - - The TPU version you want to use. Specify the TPU version, followed by a - '-' and the number of TPU cores. For example `v5e-4` specifies a v5e TPU - with 4 cores. For more information, see `TPU versions `_. + - The TPU version you want to use. Specify the TPU version, for example + `v5litepod-4` specifies a v5e TPU with 4 cores. For more information, + see `TPU versions `_. * - RUNTIME_VERSION - The TPU VM runtime version to use. For more information see `TPU VM images `_. * - SERVICE_ACCOUNT @@ -98,7 +103,15 @@ Connect to your TPU using SSH: .. code-block:: bash - gcloud compute tpus tpu-vm ssh TPU_NAME + gcloud compute tpus tpu-vm ssh TPU_NAME --zone ZONE + +Install Miniconda + +.. code-block:: bash + + wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh + bash Miniconda3-latest-Linux-x86_64.sh + source ~/.bashrc Create and activate a Conda environment for vLLM: @@ -162,9 +175,11 @@ Run the Docker image with the following command: .. note:: - Since TPU relies on XLA which requires static shapes, vLLM bucketizes the possible input shapes and compiles an XLA graph for each different shape. - The compilation time may take 20~30 minutes in the first run. - However, the compilation time reduces to ~5 minutes afterwards because the XLA graphs are cached in the disk (in :code:`VLLM_XLA_CACHE_PATH` or :code:`~/.cache/vllm/xla_cache` by default). + Since TPU relies on XLA which requires static shapes, vLLM bucketizes the + possible input shapes and compiles an XLA graph for each shape. The + compilation time may take 20~30 minutes in the first run. However, the + compilation time reduces to ~5 minutes afterwards because the XLA graphs are + cached in the disk (in :code:`VLLM_XLA_CACHE_PATH` or :code:`~/.cache/vllm/xla_cache` by default). .. tip:: @@ -173,7 +188,8 @@ Run the Docker image with the following command: .. 
code-block:: console from torch._C import * # noqa: F403 - ImportError: libopenblas.so.0: cannot open shared object file: No such file or directory + ImportError: libopenblas.so.0: cannot open shared object file: No such + file or directory Install OpenBLAS with the following command: From 32e46e000f77499f4dd7c0bed194e33856f2df24 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 16 Nov 2024 13:35:40 +0800 Subject: [PATCH 162/183] [Frontend] Automatic detection of chat content format from AST (#9919) Signed-off-by: DarkLight1337 --- .../serving/openai_compatible_server.md | 18 +- tests/entrypoints/openai/test_serving_chat.py | 3 +- tests/entrypoints/test_chat_utils.py | 619 +++++++++++------- vllm/config.py | 2 - vllm/engine/arg_utils.py | 10 - vllm/engine/llm_engine.py | 4 +- vllm/entrypoints/chat_utils.py | 246 ++++++- vllm/entrypoints/llm.py | 44 +- vllm/entrypoints/openai/api_server.py | 13 +- vllm/entrypoints/openai/cli_args.py | 17 +- vllm/entrypoints/openai/protocol.py | 71 +- vllm/entrypoints/openai/run_batch.py | 2 + vllm/entrypoints/openai/serving_chat.py | 40 +- vllm/entrypoints/openai/serving_embedding.py | 12 +- vllm/entrypoints/openai/serving_engine.py | 17 +- .../openai/serving_tokenization.py | 20 +- 16 files changed, 788 insertions(+), 350 deletions(-) diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index 78965813b1213..79d032bf8b211 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -172,12 +172,20 @@ completion = client.chat.completions.create( ] ) ``` -Most chat templates for LLMs expect the `content` to be a `string` but there are some newer models like -`meta-llama/Llama-Guard-3-1B` that expect the content to be parsed with the new OpenAI spec. In order to choose which -format the content needs to be parsed in by vLLM, please use the `--chat-template-text-format` argument to specify -between `string` or `openai`. The default value is `string` and vLLM internally converts both spec formats to match -this, unless explicitly specified. +Most chat templates for LLMs expect the `content` field to be a string, but there are some newer models like +`meta-llama/Llama-Guard-3-1B` that expect the content to be formatted according to the OpenAI schema in the +request. vLLM provides best-effort support to detect this automatically, which is logged as a string like +*"Detected the chat template content format to be..."*, and internally converts incoming requests to match +the detected format, which can be one of: + +- `"string"`: A string. + - Example: `"Hello world"` +- `"openai"`: A list of dictionaries, similar to OpenAI schema. + - Example: `[{"type": "text", "text": "Hello world!"}]` + +If the result is not what you expect, you can set the `--chat-template-content-format` CLI argument +to override which format to use. 
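For example, the following minimal sketch sends the same user message in both content styles. The server address and model name are assumptions for illustration only (any chat model served by a running vLLM OpenAI-compatible server works); either request carries the same text, and vLLM converts the incoming messages to whichever format the detected (or overridden) chat template expects.

```python
from openai import OpenAI

# Assumptions for illustration: a vLLM OpenAI-compatible server is already
# running on localhost:8000 and serving this model name.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
model = "meta-llama/Llama-Guard-3-1B"

# "string" content: the content field is a plain string.
string_messages = [{"role": "user", "content": "Who are you?"}]

# "openai" content: the content field is a list of typed parts.
openai_messages = [{
    "role": "user",
    "content": [{"type": "text", "text": "Who are you?"}],
}]

# Both payloads are accepted; the server normalizes them internally.
for messages in (string_messages, openai_messages):
    completion = client.chat.completions.create(model=model, messages=messages)
    print(completion.choices[0].message.content)
```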
## Command line arguments for the server diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index e969d33775d86..93660e6118ca8 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -26,7 +26,6 @@ class MockModelConfig: tokenizer = MODEL_NAME trust_remote_code = False tokenizer_mode = "auto" - chat_template_text_format = "string" max_model_len = 100 tokenizer_revision = None multimodal_config = MultiModalConfig() @@ -49,6 +48,7 @@ async def _async_serving_chat_init(): BASE_MODEL_PATHS, response_role="assistant", chat_template=CHAT_TEMPLATE, + chat_template_content_format="auto", lora_modules=None, prompt_adapters=None, request_logger=None) @@ -70,6 +70,7 @@ def test_serving_chat_should_set_correct_max_tokens(): BASE_MODEL_PATHS, response_role="assistant", chat_template=CHAT_TEMPLATE, + chat_template_content_format="auto", lora_modules=None, prompt_adapters=None, request_logger=None) diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 5fa466f8f041f..72477e048eafa 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -6,15 +6,24 @@ from vllm.assets.image import ImageAsset from vllm.config import ModelConfig -from vllm.entrypoints.chat_utils import (parse_chat_messages, - parse_chat_messages_futures) +from vllm.entrypoints.chat_utils import (_try_extract_ast, load_chat_template, + parse_chat_messages, + parse_chat_messages_futures, + resolve_chat_template_content_format) from vllm.entrypoints.llm import apply_hf_chat_template from vllm.multimodal import MultiModalDataDict from vllm.multimodal.utils import encode_image_base64 from vllm.transformers_utils.tokenizer_group import TokenizerGroup +from ..utils import VLLM_PATH + +EXAMPLES_DIR = VLLM_PATH / "examples" + PHI3V_MODEL_ID = "microsoft/Phi-3.5-vision-instruct" +ULTRAVOX_MODEL_ID = "fixie-ai/ultravox-v0_3" +QWEN2VL_MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct" MLLAMA_MODEL_ID = "meta-llama/Llama-3.2-11B-Vision-Instruct" +LLAMA_GUARD_MODEL_ID = "meta-llama/Llama-Guard-3-1B" @pytest.fixture(scope="function") @@ -26,7 +35,6 @@ def phi3v_model_config(): trust_remote_code=True, dtype="bfloat16", seed=0, - chat_template_text_format="string", limit_mm_per_prompt={ "image": 2, }) @@ -94,19 +102,24 @@ def test_parse_chat_messages_single_image( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What's in the image?" - }] - }], phi3v_model_config, phi3v_tokenizer) + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in the image?" + }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [{ "role": "user", @@ -121,19 +134,24 @@ async def test_parse_chat_messages_single_image_async( phi3v_tokenizer, image_url, ): - conversation, mm_future = parse_chat_messages_futures([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What's in the image?" 
- }] - }], phi3v_model_config, phi3v_tokenizer) + conversation, mm_future = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in the image?" + }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [{ "role": "user", @@ -147,24 +165,29 @@ def test_parse_chat_messages_multiple_images( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What's in these images?" - }] - }], phi3v_model_config, phi3v_tokenizer) + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in these images?" + }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [{ "role": @@ -181,24 +204,29 @@ async def test_parse_chat_messages_multiple_images_async( phi3v_tokenizer, image_url, ): - conversation, mm_future = parse_chat_messages_futures([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What's in these images?" - }] - }], phi3v_model_config, phi3v_tokenizer) + conversation, mm_future = parse_chat_messages_futures( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in these images?" + }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [{ "role": @@ -214,27 +242,31 @@ def test_parse_chat_messages_placeholder_already_in_prompt( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": - "text", - "text": - "What's in <|image_1|> and how does it compare to <|image_2|>?" - }] - }], phi3v_model_config, phi3v_tokenizer) - + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": + "text", + "text": + "What's in <|image_1|> and how does it compare to <|image_2|>?" + }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [{ "role": "user", @@ -249,26 +281,35 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": - "text", - "text": - "What's in <|image_1|> and how does it compare to the other one?" 
- }] - }], phi3v_model_config, phi3v_tokenizer) + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": "image_url", + "image_url": { + "url": image_url + } + }, + { + "type": + "text", + "text": + "What's in <|image_1|> and how does it compare to the other one?" # noqa: E501 + } + ] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [{ "role": @@ -285,34 +326,39 @@ def test_parse_chat_messages_multiple_images_across_messages( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in this image?" + }] }, { - "type": "text", - "text": "What's in this image?" - }] - }, { - "role": "assistant", - "content": "Some stuff." - }, { - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } + "role": "assistant", + "content": "Some stuff." }, { - "type": "text", - "text": "What about this one?" - }] - }], phi3v_model_config, phi3v_tokenizer) + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What about this one?" + }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [ { @@ -335,7 +381,6 @@ def test_parse_chat_messages_context_text_format( phi3v_model_config, phi3v_tokenizer, ): - phi3v_model_config.chat_template_text_format = "openai" conversation, mm_data = parse_chat_messages( [{ "role": "user", @@ -349,7 +394,11 @@ def test_parse_chat_messages_context_text_format( }, { "role": "user", "content": "What about this one?" - }], phi3v_model_config, phi3v_tokenizer) + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="openai", + ) assert conversation == [ { @@ -389,29 +438,34 @@ def test_parse_chat_messages_rejects_too_many_images_in_one_message( ValueError, match="At most 2 image\\(s\\) may be provided in one request\\." ): - parse_chat_messages([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What's in these images?" - }] - }], phi3v_model_config, phi3v_tokenizer) + parse_chat_messages( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in these images?" + }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) def test_parse_chat_messages_rejects_too_many_images_across_messages( @@ -427,39 +481,44 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages( ValueError, match="At most 2 image\\(s\\) may be provided in one request\\." 
): - parse_chat_messages([{ - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } + parse_chat_messages( + [{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in this image?" + }] }, { - "type": "text", - "text": "What's in this image?" - }] - }, { - "role": "assistant", - "content": "Some stuff." - }, { - "role": - "user", - "content": [{ - "type": "image_url", - "image_url": { - "url": image_url - } + "role": "assistant", + "content": "Some stuff." }, { - "type": "image_url", - "image_url": { - "url": image_url - } - }, { - "type": "text", - "text": "What about these two?" - }] - }], phi3v_model_config, phi3v_tokenizer) + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What about these two?" + }] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) def test_parse_chat_messages_multiple_images_uncommon_input( @@ -467,17 +526,22 @@ def test_parse_chat_messages_multiple_images_uncommon_input( phi3v_tokenizer, image_url, ): - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [ - "What's in these images?", { - "image_url": image_url - }, { - "image_url": image_url - } - ] - }], phi3v_model_config, phi3v_tokenizer) + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [ + "What's in these images?", { + "image_url": image_url + }, { + "image_url": image_url + } + ] + }], + phi3v_model_config, + phi3v_tokenizer, + content_format="string", + ) assert conversation == [{ "role": @@ -495,16 +559,21 @@ def test_mllama_single_image( image_url, ): """Ensures that a single image is parsed correctly mllama.""" - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [{ - 'type': 'text', - 'text': 'The content of this image is:' - }, { - "image_url": image_url - }] - }], mllama_model_config, mllama_tokenizer) + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [{ + 'type': 'text', + 'text': 'The content of this image is:' + }, { + "image_url": image_url + }] + }], + mllama_model_config, + mllama_tokenizer, + content_format="openai", + ) _assert_mm_data_is_image_input(mm_data, 1) assert conversation == [{ 'role': @@ -524,26 +593,31 @@ def test_mllama_interleaved_images( image_url, ): """Ensures that multiple image are parsed as interleaved dicts.""" - conversation, mm_data = parse_chat_messages([{ - "role": - "user", - "content": [ - { - 'type': 'text', - 'text': 'The content of the first image is:' - }, - { - "image_url": image_url - }, - { - 'type': 'text', - 'text': 'The content of the second image is:' - }, - { - "image_url": image_url - }, - ] - }], mllama_model_config, mllama_tokenizer) + conversation, mm_data = parse_chat_messages( + [{ + "role": + "user", + "content": [ + { + 'type': 'text', + 'text': 'The content of the first image is:' + }, + { + "image_url": image_url + }, + { + 'type': 'text', + 'text': 'The content of the second image is:' + }, + { + "image_url": image_url + }, + ] + }], + mllama_model_config, + mllama_tokenizer, + content_format="openai", + ) _assert_mm_data_is_image_input(mm_data, 2) assert conversation == [{ 'role': @@ -626,6 +700,7 @@ def get_conversation(is_hf: bool): vllm_conversation, model_config, tokenizer_group, + content_format="openai", 
) vllm_result = apply_hf_chat_template( @@ -636,3 +711,89 @@ def get_conversation(is_hf: bool): ) assert hf_result == vllm_result + + +# yapf: disable +@pytest.mark.parametrize( + ("model", "expected_format"), + [(PHI3V_MODEL_ID, "string"), + (QWEN2VL_MODEL_ID, "openai"), + (ULTRAVOX_MODEL_ID, "string"), + (MLLAMA_MODEL_ID, "openai"), + (LLAMA_GUARD_MODEL_ID, "openai")], +) +# yapf: enable +def test_resolve_content_format_hf_defined(model, expected_format): + tokenizer_group = TokenizerGroup( + model, + enable_lora=False, + max_num_seqs=5, + max_input_length=None, + ) + tokenizer = tokenizer_group.tokenizer + + chat_template = tokenizer.chat_template + assert isinstance(chat_template, str) + + print("[TEXT]") + print(chat_template) + print("[AST]") + print(_try_extract_ast(chat_template)) + + resolved_format = resolve_chat_template_content_format( + None, # Test detecting the tokenizer's chat_template + "auto", + tokenizer, + ) + + assert resolved_format == expected_format + + +# yapf: disable +@pytest.mark.parametrize( + ("template_path", "expected_format"), + [("template_alpaca.jinja", "string"), + ("template_baichuan.jinja", "string"), + ("template_blip2.jinja", "string"), + ("template_chatglm.jinja", "string"), + ("template_chatglm2.jinja", "string"), + ("template_chatml.jinja", "string"), + ("template_falcon_180b.jinja", "string"), + ("template_falcon.jinja", "string"), + ("template_inkbot.jinja", "string"), + ("template_llava.jinja", "string"), + ("template_vlm2vec.jinja", "openai"), + ("tool_chat_template_granite_20b_fc.jinja", "string"), + ("tool_chat_template_hermes.jinja", "string"), + ("tool_chat_template_internlm2_tool.jinja", "string"), + ("tool_chat_template_llama3.1_json.jinja", "string"), + ("tool_chat_template_llama3.2_json.jinja", "string"), + ("tool_chat_template_mistral_parallel.jinja", "string"), + ("tool_chat_template_mistral.jinja", "string")], +) +# yapf: enable +def test_resolve_content_format_examples(template_path, expected_format): + tokenizer_group = TokenizerGroup( + PHI3V_MODEL_ID, + enable_lora=False, + max_num_seqs=5, + max_input_length=None, + ) + dummy_tokenizer = tokenizer_group.tokenizer + dummy_tokenizer.chat_template = None + + chat_template = load_chat_template(EXAMPLES_DIR / template_path) + assert isinstance(chat_template, str) + + print("[TEXT]") + print(chat_template) + print("[AST]") + print(_try_extract_ast(chat_template)) + + resolved_format = resolve_chat_template_content_format( + chat_template, + "auto", + dummy_tokenizer, + ) + + assert resolved_format == expected_format diff --git a/vllm/config.py b/vllm/config.py index 1c190da1d327e..64b2f75e092de 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -155,7 +155,6 @@ def __init__( limit_mm_per_prompt: Optional[Mapping[str, int]] = None, use_async_output_proc: bool = True, config_format: ConfigFormat = ConfigFormat.AUTO, - chat_template_text_format: str = "string", hf_overrides: Optional[HfOverrides] = None, mm_processor_kwargs: Optional[Dict[str, Any]] = None, override_neuron_config: Optional[Dict[str, Any]] = None, @@ -216,7 +215,6 @@ def __init__( self.model, revision) self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype) self.use_async_output_proc = use_async_output_proc - self.chat_template_text_format = chat_template_text_format self.mm_processor_kwargs = mm_processor_kwargs # Set enforce_eager to False if the value is unset. 
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index d73f95f59c71f..92fa87c7fa45b 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -90,7 +90,6 @@ class EngineArgs: task: TaskOption = "auto" skip_tokenizer_init: bool = False tokenizer_mode: str = 'auto' - chat_template_text_format: str = 'string' trust_remote_code: bool = False allowed_local_media_path: str = "" download_dir: Optional[str] = None @@ -258,14 +257,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: 'fast tokenizer if available.\n* "slow" will ' 'always use the slow tokenizer. \n* ' '"mistral" will always use the `mistral_common` tokenizer.') - parser.add_argument( - '--chat-template-text-format', - type=str, - default=EngineArgs.chat_template_text_format, - choices=['string', 'openai'], - help='The format to render text content within a chat template. ' - '"string" will keep the content field as a string whereas ' - '"openai" will parse content in the current OpenAI format.') parser.add_argument('--trust-remote-code', action='store_true', help='Trust remote code from huggingface.') @@ -894,7 +885,6 @@ def create_model_config(self) -> ModelConfig: # We know this is not None because we set it in __post_init__ tokenizer=cast(str, self.tokenizer), tokenizer_mode=self.tokenizer_mode, - chat_template_text_format=self.chat_template_text_format, trust_remote_code=self.trust_remote_code, allowed_local_media_path=self.allowed_local_media_path, dtype=self.dtype, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index aa9c7893c4cfe..9a2d73a020c8f 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -262,8 +262,7 @@ def __init__( "num_scheduler_steps=%d, chunked_prefill_enabled=%s " "multi_step_stream_outputs=%s, enable_prefix_caching=%s, " "use_async_output_proc=%s, use_cached_outputs=%s, " - "chat_template_text_format=%s, mm_processor_kwargs=%s, " - "pooler_config=%r)", + "mm_processor_kwargs=%s, pooler_config=%r)", VLLM_VERSION, model_config.model, speculative_config, @@ -296,7 +295,6 @@ def __init__( cache_config.enable_prefix_caching, model_config.use_async_output_proc, use_cached_outputs, - model_config.chat_template_text_format, model_config.mm_processor_kwargs, model_config.pooler_config, ) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index 3ca460c47c3bd..abee5ac46391c 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -2,12 +2,14 @@ import codecs import json from abc import ABC, abstractmethod -from collections import defaultdict +from collections import defaultdict, deque from functools import lru_cache, partial from pathlib import Path from typing import (Any, Awaitable, Callable, Dict, Generic, Iterable, List, Literal, Mapping, Optional, Tuple, TypeVar, Union, cast) +import jinja2.nodes +import transformers.utils.chat_template_utils as hf_chat_utils # yapf conflicts with isort for this block # yapf: disable from openai.types.chat import (ChatCompletionAssistantMessageParam, @@ -153,6 +155,199 @@ class ConversationMessage(TypedDict, total=False): """The tool calls generated by the model, such as function calls.""" +# Passed in by user +ChatTemplateContentFormatOption = Literal["auto", "string", "openai"] + +# Used internally +_ChatTemplateContentFormat = Literal["string", "openai"] + + +def _is_var_access(node: jinja2.nodes.Node, varname: str) -> bool: + if isinstance(node, jinja2.nodes.Name): + return node.ctx == "load" and node.name == varname + + return 
False + + +def _is_attr_access(node: jinja2.nodes.Node, varname: str, key: str) -> bool: + if isinstance(node, jinja2.nodes.Getitem): + return (_is_var_access(node.node, varname) + and isinstance(node.arg, jinja2.nodes.Const) + and node.arg.value == key) + + if isinstance(node, jinja2.nodes.Getattr): + return _is_var_access(node.node, varname) and node.attr == key + + return False + + +def _is_var_or_elems_access( + node: jinja2.nodes.Node, + varname: str, + key: Optional[str] = None, +) -> bool: + if isinstance(node, jinja2.nodes.Filter): + return (node.node is not None + and _is_var_or_elems_access(node.node, varname, key)) + if isinstance(node, jinja2.nodes.Test): + return _is_var_or_elems_access(node.node, varname, key) + + if (isinstance(node, jinja2.nodes.Getitem) + and isinstance(node.arg, jinja2.nodes.Slice)): + return _is_var_or_elems_access(node.node, varname, key) + + # yapf: disable + return ( + _is_attr_access(node, varname, key) if key + else _is_var_access(node, varname) + ) # yapf: enable + + +def _iter_nodes_assign_var_or_elems(root: jinja2.nodes.Node, varname: str): + # Global variable that is implicitly defined at the root + yield root, varname + + # Iterative BFS + related_varnames = deque([varname]) + while related_varnames: + related_varname = related_varnames.popleft() + + for assign_ast in root.find_all(jinja2.nodes.Assign): + lhs = assign_ast.target + rhs = assign_ast.node + + if _is_var_or_elems_access(rhs, related_varname): + assert isinstance(lhs, jinja2.nodes.Name) + yield assign_ast, lhs.name + + # Avoid infinite looping for self-assignment + if lhs.name != related_varname: + related_varnames.append(lhs.name) + + +# NOTE: The proper way to handle this is to build a CFG so that we can handle +# the scope in which each variable is defined, but that is too complicated +def _iter_nodes_assign_messages_item(root: jinja2.nodes.Node): + messages_varnames = [ + varname + for _, varname in _iter_nodes_assign_var_or_elems(root, "messages") + ] + + # Search for {%- for message in messages -%} loops + for loop_ast in root.find_all(jinja2.nodes.For): + loop_iter = loop_ast.iter + loop_target = loop_ast.target + + for varname in messages_varnames: + if _is_var_or_elems_access(loop_iter, varname): + assert isinstance(loop_target, jinja2.nodes.Name) + yield loop_ast, loop_target.name + break + + +def _iter_nodes_assign_content_item(root: jinja2.nodes.Node): + message_varnames = [ + varname for _, varname in _iter_nodes_assign_messages_item(root) + ] + + # Search for {%- for content in message['content'] -%} loops + for loop_ast in root.find_all(jinja2.nodes.For): + loop_iter = loop_ast.iter + loop_target = loop_ast.target + + for varname in message_varnames: + if _is_var_or_elems_access(loop_iter, varname, "content"): + assert isinstance(loop_target, jinja2.nodes.Name) + yield loop_ast, loop_target.name + break + + +def _try_extract_ast(chat_template: str) -> Optional[jinja2.nodes.Template]: + try: + jinja_compiled = hf_chat_utils._compile_jinja_template(chat_template) + return jinja_compiled.environment.parse(chat_template) + except Exception: + logger.exception("Error when compiling Jinja template") + return None + + +def _detect_content_format( + chat_template: str, + *, + default: _ChatTemplateContentFormat, +) -> _ChatTemplateContentFormat: + jinja_ast = _try_extract_ast(chat_template) + if jinja_ast is None: + return default + + try: + next(_iter_nodes_assign_content_item(jinja_ast)) + except StopIteration: + return "string" + except Exception: + logger.exception("Error 
when parsing AST of Jinja template") + return default + else: + return "openai" + + +def _resolve_chat_template_content_format( + chat_template: Optional[str], + given_format: ChatTemplateContentFormatOption, + tokenizer: AnyTokenizer, +) -> _ChatTemplateContentFormat: + if isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)): + tokenizer_chat_template = tokenizer.chat_template + else: + tokenizer_chat_template = None + + jinja_text: Optional[str] + if isinstance(tokenizer_chat_template, str) and chat_template is None: + jinja_text = tokenizer_chat_template + elif (isinstance(tokenizer_chat_template, dict) + and chat_template in tokenizer_chat_template): + jinja_text = tokenizer_chat_template[chat_template] + else: + jinja_text = load_chat_template(chat_template, is_literal=True) + + detected_format = ("string" if jinja_text is None else + _detect_content_format(jinja_text, default="string")) + + return detected_format if given_format == "auto" else given_format + + +@lru_cache +def resolve_chat_template_content_format( + chat_template: Optional[str], + given_format: ChatTemplateContentFormatOption, + tokenizer: AnyTokenizer, +) -> _ChatTemplateContentFormat: + detected_format = _resolve_chat_template_content_format( + chat_template, + given_format, + tokenizer, + ) + + logger.info( + "Detected the chat template content format to be '%s'. " + "You can set `--chat-template-content-format` to override this.", + detected_format, + ) + + if given_format != "auto" and given_format != detected_format: + logger.warning( + "You specified `--chat-template-content-format %s` " + "which is different from the detected format '%s'. " + "If our automatic detection is incorrect, please consider " + "opening a GitHub issue so that we can improve it: " + "https://github.com/vllm-project/vllm/issues/new/choose", + given_format, + detected_format, + ) + + return detected_format + + ModalityStr = Literal["image", "audio", "video"] _T = TypeVar("_T") @@ -407,12 +602,23 @@ def validate_chat_template(chat_template: Optional[Union[Path, str]]): def load_chat_template( - chat_template: Optional[Union[Path, str]]) -> Optional[str]: + chat_template: Optional[Union[Path, str]], + *, + is_literal: bool = False, +) -> Optional[str]: if chat_template is None: return None + + if is_literal: + if isinstance(chat_template, Path): + raise TypeError("chat_template is expected to be read directly " + "from its value") + + return codecs.decode(chat_template, "unicode_escape") + try: with open(chat_template) as f: - resolved_chat_template = f.read() + return f.read() except OSError as e: if isinstance(chat_template, Path): raise @@ -426,10 +632,7 @@ def load_chat_template( # If opening a file fails, set chat template to be args to # ensure we decode so our escape are interpreted correctly - resolved_chat_template = codecs.decode(chat_template, "unicode_escape") - - logger.info("Using supplied chat template:\n%s", resolved_chat_template) - return resolved_chat_template + return load_chat_template(chat_template, is_literal=True) # TODO: Let user specify how to insert multimodal tokens into prompt @@ -464,7 +667,6 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int], _AudioParser = partial(cast, ChatCompletionContentPartAudioParam) _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam) _VideoParser = partial(cast, ChatCompletionContentPartVideoParam) -MODEL_KEEP_MULTI_MODAL_CONTENT = {'mllama'} # Define a mapping from part types to their corresponding parsing functions. 
MM_PARSER_MAP: Dict[str, Callable[[ChatCompletionContentPartParam], str]] = { @@ -542,18 +744,12 @@ def _parse_chat_message_content_parts( role: str, parts: Iterable[ChatCompletionContentPartParam], mm_tracker: BaseMultiModalItemTracker, - chat_template_text_format: str, + *, + wrap_dicts: bool, ) -> List[ConversationMessage]: content: List[Union[str, Dict[str, str]]] = [] mm_parser = mm_tracker.create_parser() - model_config = mm_tracker.model_config - - wrap_dicts = (chat_template_text_format == "openai" - or (model_config.task == "embedding" - and model_config.is_multimodal_model) - or (model_config.hf_config.model_type - in MODEL_KEEP_MULTI_MODAL_CONTENT)) for part in parts: parse_res = _parse_chat_message_content_part( @@ -578,9 +774,11 @@ def _parse_chat_message_content_parts( def _parse_chat_message_content_part( - part: ChatCompletionContentPartParam, - mm_parser: BaseMultiModalContentParser, - wrap_dicts: bool) -> Optional[Union[str, Dict[str, str]]]: + part: ChatCompletionContentPartParam, + mm_parser: BaseMultiModalContentParser, + *, + wrap_dicts: bool, +) -> Optional[Union[str, Dict[str, str]]]: """Parses a single part of a conversation. If wrap_dicts is True, structured dictionary pieces for texts and images will be wrapped in dictionaries, i.e., {"type": "text", "text", ...} and @@ -629,7 +827,7 @@ def _parse_chat_message_content_part( def _parse_chat_message_content( message: ChatCompletionMessageParam, mm_tracker: BaseMultiModalItemTracker, - chat_template_text_format: str, + content_format: _ChatTemplateContentFormat, ) -> List[ConversationMessage]: role = message["role"] content = message.get("content") @@ -645,7 +843,7 @@ def _parse_chat_message_content( role, content, # type: ignore mm_tracker, - chat_template_text_format, + wrap_dicts=(content_format == "openai"), ) for result_msg in result: @@ -684,6 +882,7 @@ def parse_chat_messages( messages: List[ChatCompletionMessageParam], model_config: ModelConfig, tokenizer: AnyTokenizer, + content_format: _ChatTemplateContentFormat, ) -> Tuple[List[ConversationMessage], Optional[MultiModalDataDict]]: conversation: List[ConversationMessage] = [] mm_tracker = MultiModalItemTracker(model_config, tokenizer) @@ -692,7 +891,7 @@ def parse_chat_messages( sub_messages = _parse_chat_message_content( msg, mm_tracker, - model_config.chat_template_text_format, + content_format, ) conversation.extend(sub_messages) @@ -706,6 +905,7 @@ def parse_chat_messages_futures( messages: List[ChatCompletionMessageParam], model_config: ModelConfig, tokenizer: AnyTokenizer, + content_format: _ChatTemplateContentFormat, ) -> Tuple[List[ConversationMessage], Awaitable[Optional[MultiModalDataDict]]]: conversation: List[ConversationMessage] = [] mm_tracker = AsyncMultiModalItemTracker(model_config, tokenizer) @@ -714,7 +914,7 @@ def parse_chat_messages_futures( sub_messages = _parse_chat_message_content( msg, mm_tracker, - model_config.chat_template_text_format, + content_format, ) conversation.extend(sub_messages) diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 4b33fc1458ee3..86b0b6893f1d9 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -13,9 +13,11 @@ TaskOption) from vllm.engine.llm_engine import LLMEngine from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, + ChatTemplateContentFormatOption, apply_hf_chat_template, apply_mistral_chat_template, - parse_chat_messages) + parse_chat_messages, + resolve_chat_template_content_format) from vllm.inputs import PromptType, TextPrompt, TokensPrompt from 
vllm.inputs.parse import parse_and_batch_prompt from vllm.logger import init_logger @@ -523,6 +525,7 @@ def chat( use_tqdm: bool = True, lora_request: Optional[LoRARequest] = None, chat_template: Optional[str] = None, + chat_template_content_format: ChatTemplateContentFormatOption = "auto", add_generation_prompt: bool = True, continue_final_message: bool = False, tools: Optional[List[Dict[str, Any]]] = None, @@ -539,9 +542,11 @@ def chat( to the OpenAI API. Args: - messages: A list of conversations or a single conversation. - - Each conversation is represented as a list of messages. - - Each message is a dictionary with 'role' and 'content' keys. + messages: A list of conversations or a single conversation. + + - Each conversation is represented as a list of messages. + - Each message is a dictionary with 'role' and 'content' keys. + sampling_params: The sampling parameters for text generation. If None, we use the default sampling parameters. When it is a single value, it is applied to every prompt. When it @@ -551,11 +556,19 @@ def chat( lora_request: LoRA request to use for generation, if any. chat_template: The template to use for structuring the chat. If not provided, the model's default chat template will be used. + chat_template_content_format: The format to render message content. + + - "string" will render the content as a string. + Example: ``"Who are you?"`` + - "openai" will render the content as a list of dictionaries, + similar to OpenAI schema. + Example: ``[{"type": "text", "text": "Who are you?"}]`` + add_generation_prompt: If True, adds a generation template to each message. continue_final_message: If True, continues the final message in - the conversation instead of starting a new one. Cannot be `True` - if `add_generation_prompt` is also `True`. + the conversation instead of starting a new one. Cannot be + ``True`` if ``add_generation_prompt`` is also ``True``. mm_processor_kwargs: Multimodal processor kwarg overrides for this chat request. Only used for offline requests. @@ -576,17 +589,26 @@ def chat( cast(List[ChatCompletionMessageParam], messages) ] + tokenizer = self.get_tokenizer() + model_config = self.llm_engine.get_model_config() + resolved_content_format = resolve_chat_template_content_format( + chat_template, + chat_template_content_format, + tokenizer, + ) + prompts: List[Union[TokensPrompt, TextPrompt]] = [] for msgs in list_of_messages: - tokenizer = self.get_tokenizer() - model_config = self.llm_engine.get_model_config() - # NOTE: _parse_chat_message_content_parts() currently doesn't # handle mm_processor_kwargs, since there is no implementation in # the chat message parsing for it. conversation, mm_data = parse_chat_messages( - msgs, model_config, tokenizer) + msgs, + model_config, + tokenizer, + content_format=resolved_content_format, + ) prompt_data: Union[str, List[int]] if isinstance(tokenizer, MistralTokenizer): @@ -737,7 +759,7 @@ def encode( generation, if any. Returns: - A list of `EmbeddingRequestOutput` objects containing the + A list of ``EmbeddingRequestOutput`` objects containing the generated embeddings in the same order as the input prompts. 
Note: diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index b13f6a228b4c6..b0fe061f5db4a 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -29,6 +29,7 @@ from vllm.engine.multiprocessing.client import MQLLMEngineClient from vllm.engine.multiprocessing.engine import run_mp_engine from vllm.engine.protocol import EngineClient +from vllm.entrypoints.chat_utils import load_chat_template from vllm.entrypoints.launcher import serve_http from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.cli_args import (make_arg_parser, @@ -529,6 +530,9 @@ def init_app_state( state.engine_client = engine_client state.log_stats = not args.disable_log_stats + resolved_chat_template = load_chat_template(args.chat_template) + logger.info("Using supplied chat template:\n%s", resolved_chat_template) + state.openai_serving_chat = OpenAIServingChat( engine_client, model_config, @@ -537,7 +541,8 @@ def init_app_state( lora_modules=args.lora_modules, prompt_adapters=args.prompt_adapters, request_logger=request_logger, - chat_template=args.chat_template, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, return_tokens_as_token_ids=args.return_tokens_as_token_ids, enable_auto_tools=args.enable_auto_tool_choice, tool_parser=args.tool_call_parser, @@ -557,7 +562,8 @@ def init_app_state( model_config, base_model_paths, request_logger=request_logger, - chat_template=args.chat_template, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, ) if model_config.task == "embedding" else None state.openai_serving_tokenization = OpenAIServingTokenization( engine_client, @@ -565,7 +571,8 @@ def init_app_state( base_model_paths, lora_modules=args.lora_modules, request_logger=request_logger, - chat_template=args.chat_template, + chat_template=resolved_chat_template, + chat_template_content_format=args.chat_template_content_format, ) diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index eb08a89293370..24c206a1261f2 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -7,10 +7,11 @@ import argparse import json import ssl -from typing import List, Optional, Sequence, Union +from typing import List, Optional, Sequence, Union, get_args from vllm.engine.arg_utils import AsyncEngineArgs, nullable_str -from vllm.entrypoints.chat_utils import validate_chat_template +from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption, + validate_chat_template) from vllm.entrypoints.openai.serving_engine import (LoRAModulePath, PromptAdapterPath) from vllm.entrypoints.openai.tool_parsers import ToolParserManager @@ -132,6 +133,18 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: help="The file path to the chat template, " "or the template in single-line form " "for the specified model") + parser.add_argument( + '--chat-template-content-format', + type=str, + default="auto", + choices=get_args(ChatTemplateContentFormatOption), + help='The format to render message content within a chat template.' + '\n\n' + '* "string" will render the content as a string. ' + 'Example: "Hello World"\n' + '* "openai" will render the content as a list of dictionaries, ' + 'similar to OpenAI schema. 
' + 'Example: [{"type": "text", "text": "Hello world!"}]') parser.add_argument("--response-role", type=nullable_str, default="assistant", diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 820aefd8800d9..b7b064ae01f05 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -5,9 +5,8 @@ from typing import Any, Dict, List, Literal, Optional, Union import torch -from openai.types.chat import ChatCompletionContentPartParam from pydantic import BaseModel, ConfigDict, Field, model_validator -from typing_extensions import Annotated, Required, TypedDict +from typing_extensions import Annotated from vllm.entrypoints.chat_utils import ChatCompletionMessageParam from vllm.pooling_params import PoolingParams @@ -35,26 +34,6 @@ assert _LONG_INFO.max == _MOCK_LONG_INFO.max -class CustomChatCompletionMessageParam(TypedDict, total=False): - """Enables custom roles in the Chat Completion API.""" - role: Required[str] - """The role of the message's author.""" - - content: Union[str, List[ChatCompletionContentPartParam]] - """The contents of the message.""" - - name: str - """An optional name for the participant. - - Provides the model information to differentiate between participants of the - same role. - """ - - tool_call_id: Optional[str] - - tool_calls: Optional[List[dict]] - - class OpenAIBaseModel(BaseModel): # OpenAI API does not allow extra fields model_config = ConfigDict(extra="forbid") @@ -1054,16 +1033,56 @@ class TokenizeCompletionRequest(OpenAIBaseModel): model: str prompt: str - add_special_tokens: bool = Field(default=True) + add_special_tokens: bool = Field( + default=True, + description=( + "If true (the default), special tokens (e.g. BOS) will be added to " + "the prompt."), + ) class TokenizeChatRequest(OpenAIBaseModel): model: str messages: List[ChatCompletionMessageParam] - add_generation_prompt: bool = Field(default=True) - continue_final_message: bool = Field(default=False) - add_special_tokens: bool = Field(default=False) + add_generation_prompt: bool = Field( + default=True, + description= + ("If true, the generation prompt will be added to the chat template. " + "This is a parameter used by chat template in tokenizer config of the " + "model."), + ) + continue_final_message: bool = Field( + default=False, + description= + ("If this is set, the chat will be formatted so that the final " + "message in the chat is open-ended, without any EOS tokens. The " + "model will continue this message rather than starting a new one. " + "This allows you to \"prefill\" part of the model's response for it. " + "Cannot be used at the same time as `add_generation_prompt`."), + ) + add_special_tokens: bool = Field( + default=False, + description=( + "If true, special tokens (e.g. BOS) will be added to the prompt " + "on top of what is added by the chat template. " + "For most models, the chat template takes care of adding the " + "special tokens so this should be set to false (as is the " + "default)."), + ) + chat_template: Optional[str] = Field( + default=None, + description=( + "A Jinja template to use for this conversion. " + "As of transformers v4.44, default chat template is no longer " + "allowed, so you must provide a chat template if the tokenizer " + "does not define one."), + ) + chat_template_kwargs: Optional[Dict[str, Any]] = Field( + default=None, + description=("Additional kwargs to pass to the template renderer. 
" + "Will be accessible by the chat template."), + ) @model_validator(mode="before") @classmethod diff --git a/vllm/entrypoints/openai/run_batch.py b/vllm/entrypoints/openai/run_batch.py index 1b422a93263b2..00cdb3b6839f5 100644 --- a/vllm/entrypoints/openai/run_batch.py +++ b/vllm/entrypoints/openai/run_batch.py @@ -222,6 +222,7 @@ async def main(args): prompt_adapters=None, request_logger=request_logger, chat_template=None, + chat_template_content_format="auto", enable_prompt_tokens_details=args.enable_prompt_tokens_details, ) if model_config.task == "generate" else None openai_serving_embedding = OpenAIServingEmbedding( @@ -230,6 +231,7 @@ async def main(args): base_model_paths, request_logger=request_logger, chat_template=None, + chat_template_content_format="auto", ) if model_config.task == "embedding" else None tracker = BatchProgressTracker() diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 77cae00ae827f..2eef909eb9319 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -10,7 +10,8 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient -from vllm.entrypoints.chat_utils import ConversationMessage, load_chat_template +from vllm.entrypoints.chat_utils import (ChatTemplateContentFormatOption, + ConversationMessage) from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.protocol import ( ChatCompletionLogProb, ChatCompletionLogProbs, @@ -38,20 +39,23 @@ class OpenAIServingChat(OpenAIServing): - def __init__(self, - engine_client: EngineClient, - model_config: ModelConfig, - base_model_paths: List[BaseModelPath], - response_role: str, - *, - lora_modules: Optional[List[LoRAModulePath]], - prompt_adapters: Optional[List[PromptAdapterPath]], - request_logger: Optional[RequestLogger], - chat_template: Optional[str], - return_tokens_as_token_ids: bool = False, - enable_auto_tools: bool = False, - tool_parser: Optional[str] = None, - enable_prompt_tokens_details: bool = False): + def __init__( + self, + engine_client: EngineClient, + model_config: ModelConfig, + base_model_paths: List[BaseModelPath], + response_role: str, + *, + lora_modules: Optional[List[LoRAModulePath]], + prompt_adapters: Optional[List[PromptAdapterPath]], + request_logger: Optional[RequestLogger], + chat_template: Optional[str], + chat_template_content_format: ChatTemplateContentFormatOption, + return_tokens_as_token_ids: bool = False, + enable_auto_tools: bool = False, + tool_parser: Optional[str] = None, + enable_prompt_tokens_details: bool = False, + ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, base_model_paths=base_model_paths, @@ -61,8 +65,8 @@ def __init__(self, return_tokens_as_token_ids=return_tokens_as_token_ids) self.response_role = response_role - self.use_tool_use_model_template = False - self.chat_template = load_chat_template(chat_template) + self.chat_template = chat_template + self.chat_template_content_format: Final = chat_template_content_format # set up tool use self.enable_auto_tools: bool = enable_auto_tools @@ -120,6 +124,7 @@ async def create_chat_completion( ) = self._maybe_get_adapters(request) tokenizer = await self.engine_client.get_tokenizer(lora_request) + tool_parser = self.tool_parser # validation for OpenAI tools @@ -157,6 +162,7 @@ async def create_chat_completion( tokenizer, request.messages, chat_template=request.chat_template or self.chat_template, + 
chat_template_content_format=self.chat_template_content_format, add_generation_prompt=request.add_generation_prompt, continue_final_message=request.continue_final_message, tool_dicts=tool_dicts, diff --git a/vllm/entrypoints/openai/serving_embedding.py b/vllm/entrypoints/openai/serving_embedding.py index bbe7db8f13231..74ad7389784fc 100644 --- a/vllm/entrypoints/openai/serving_embedding.py +++ b/vllm/entrypoints/openai/serving_embedding.py @@ -1,7 +1,7 @@ import asyncio import base64 import time -from typing import AsyncGenerator, List, Literal, Optional, Union, cast +from typing import AsyncGenerator, Final, List, Literal, Optional, Union, cast import numpy as np from fastapi import Request @@ -9,7 +9,7 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient -from vllm.entrypoints.chat_utils import load_chat_template +from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.protocol import (EmbeddingChatRequest, EmbeddingRequest, @@ -77,7 +77,8 @@ def __init__( *, request_logger: Optional[RequestLogger], chat_template: Optional[str], - ): + chat_template_content_format: ChatTemplateContentFormatOption, + ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, base_model_paths=base_model_paths, @@ -85,7 +86,8 @@ def __init__( prompt_adapters=None, request_logger=request_logger) - self.chat_template = load_chat_template(chat_template) + self.chat_template = chat_template + self.chat_template_content_format: Final = chat_template_content_format async def create_embedding( self, @@ -144,6 +146,8 @@ async def create_embedding( tokenizer, request.messages, chat_template=request.chat_template or self.chat_template, + chat_template_content_format=self. 
+ chat_template_content_format, add_generation_prompt=request.add_generation_prompt, continue_final_message=request.continue_final_message, truncate_prompt_tokens=truncate_prompt_tokens, diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index fa315fa516632..cae2877ea7e99 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -11,14 +11,16 @@ from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient +# yapf conflicts with isort for this block +# yapf: disable from vllm.entrypoints.chat_utils import (ChatCompletionMessageParam, + ChatTemplateContentFormatOption, ConversationMessage, apply_hf_chat_template, apply_mistral_chat_template, - parse_chat_messages_futures) + parse_chat_messages_futures, + resolve_chat_template_content_format) from vllm.entrypoints.logger import RequestLogger -# yapf conflicts with isort for this block -# yapf: disable from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, CompletionRequest, DetokenizeRequest, @@ -426,7 +428,8 @@ async def _preprocess_chat( request: ChatLikeRequest, tokenizer: AnyTokenizer, messages: List[ChatCompletionMessageParam], - chat_template: Optional[str] = None, + chat_template: Optional[str], + chat_template_content_format: ChatTemplateContentFormatOption, add_generation_prompt: bool = True, continue_final_message: bool = False, tool_dicts: Optional[List[Dict[str, Any]]] = None, @@ -437,10 +440,16 @@ async def _preprocess_chat( add_special_tokens: bool = False, ) -> Tuple[List[ConversationMessage], Sequence[RequestPrompt], List[TokensPrompt]]: + resolved_content_format = resolve_chat_template_content_format( + chat_template, + chat_template_content_format, + tokenizer, + ) conversation, mm_data_future = parse_chat_messages_futures( messages, self.model_config, tokenizer, + content_format=resolved_content_format, ) _chat_template_kwargs: Dict[str, Any] = dict( diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index 1fd82304f7a4d..59b3b1311f881 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -1,8 +1,8 @@ -from typing import List, Optional, Union +from typing import Final, List, Optional, Union from vllm.config import ModelConfig from vllm.engine.protocol import EngineClient -from vllm.entrypoints.chat_utils import load_chat_template +from vllm.entrypoints.chat_utils import ChatTemplateContentFormatOption from vllm.entrypoints.logger import RequestLogger # yapf conflicts with isort for this block # yapf: disable @@ -33,7 +33,8 @@ def __init__( lora_modules: Optional[List[LoRAModulePath]], request_logger: Optional[RequestLogger], chat_template: Optional[str], - ): + chat_template_content_format: ChatTemplateContentFormatOption, + ) -> None: super().__init__(engine_client=engine_client, model_config=model_config, base_model_paths=base_model_paths, @@ -41,12 +42,8 @@ def __init__( prompt_adapters=None, request_logger=request_logger) - # If this is None we use the tokenizer's default chat template - # the list of commonly-used chat template names for HF named templates - hf_chat_templates: List[str] = ['default', 'tool_use'] - self.chat_template = chat_template \ - if chat_template in hf_chat_templates \ - else load_chat_template(chat_template) + self.chat_template = chat_template + self.chat_template_content_format: Final = chat_template_content_format async def create_tokenize( 
self, @@ -75,9 +72,12 @@ async def create_tokenize( request, tokenizer, request.messages, - chat_template=self.chat_template, + chat_template=request.chat_template or self.chat_template, + chat_template_content_format=self. + chat_template_content_format, add_generation_prompt=request.add_generation_prompt, continue_final_message=request.continue_final_message, + chat_template_kwargs=request.chat_template_kwargs, add_special_tokens=request.add_special_tokens, ) else: From 755b85359be910fabe39a75299439fc11beb57d4 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Fri, 15 Nov 2024 21:46:27 -0800 Subject: [PATCH 163/183] [doc] add doc for the plugin system (#10372) Signed-off-by: youkaichao --- docs/source/design/class_hierarchy.rst | 2 + docs/source/design/plugin_system.rst | 62 ++++++++++++++++++++++++++ docs/source/index.rst | 1 + docs/source/models/adding_model.rst | 25 +++-------- vllm/plugins/__init__.py | 16 +++++-- 5 files changed, 84 insertions(+), 22 deletions(-) create mode 100644 docs/source/design/plugin_system.rst diff --git a/docs/source/design/class_hierarchy.rst b/docs/source/design/class_hierarchy.rst index 15f0c8ccf77ee..58a888b17ba53 100644 --- a/docs/source/design/class_hierarchy.rst +++ b/docs/source/design/class_hierarchy.rst @@ -1,3 +1,5 @@ +.. _class_hierarchy: + vLLM's Class Hierarchy ======================= diff --git a/docs/source/design/plugin_system.rst b/docs/source/design/plugin_system.rst new file mode 100644 index 0000000000000..bfca702b9267a --- /dev/null +++ b/docs/source/design/plugin_system.rst @@ -0,0 +1,62 @@ +.. _plugin_system: + +vLLM's Plugin System +==================== + +The community frequently requests the ability to extend vLLM with custom features. To facilitate this, vLLM includes a plugin system that allows users to add custom features without modifying the vLLM codebase. This document explains how plugins work in vLLM and how to create a plugin for vLLM. + +How Plugins Work in vLLM +------------------------ + +Plugins are user-registered code that vLLM executes. Given vLLM's architecture (see :ref:`class_hierarchy`), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. This is done by the `load_general_plugins `__ function in the ``vllm.plugins`` module. This function is called for every process created by vLLM before it starts any work. + +How vLLM Discovers Plugins +-------------------------- + +vLLM's plugin system uses the standard Python ``entry_points`` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin: + +.. code-block:: python + + # inside `setup.py` file + from setuptools import setup + + setup(name='vllm_add_dummy_model', + version='0.1', + packages=['vllm_add_dummy_model'], + entry_points={ + 'vllm.general_plugins': + ["register_dummy_model = vllm_add_dummy_model:register"] + }) + + # inside `vllm_add_dummy_model.py` file + def register(): + from vllm import ModelRegistry + + if "MyLlava" not in ModelRegistry.get_supported_archs(): + ModelRegistry.register_model("MyLlava", + "vllm_add_dummy_model.my_llava:MyLlava") + +For more information on adding entry points to your package, please check the `official documentation `__. + +Every plugin has three parts: + +1. **Plugin group**: The name of the entry point group. 
vLLM uses the entry point group ``vllm.general_plugins`` to register general plugins. This is the key of ``entry_points`` in the ``setup.py`` file. Always use ``vllm.general_plugins`` for vLLM's general plugins. + +2. **Plugin name**: The name of the plugin. This is the value in the dictionary of the ``entry_points`` dictionary. In the example above, the plugin name is ``register_dummy_model``. Plugins can be filtered by their names using the ``VLLM_PLUGINS`` environment variable. To load only a specific plugin, set ``VLLM_PLUGINS`` to the plugin name. + +3. **Plugin value**: The fully qualified name of the function to register in the plugin system. In the example above, the plugin value is ``vllm_add_dummy_model:register``, which refers to a function named ``register`` in the ``vllm_add_dummy_model`` module. + +What Can Plugins Do? +-------------------- + +Currently, the primary use case for plugins is to register custom, out-of-the-tree models into vLLM. This is done by calling ``ModelRegistry.register_model`` to register the model. In the future, the plugin system may be extended to support more features, such as swapping in custom implementations for certain classes in vLLM. + +Guidelines for Writing Plugins +------------------------------ + +- **Being re-entrant**: The function specified in the entry point should be re-entrant, meaning it can be called multiple times without causing issues. This is necessary because the function might be called multiple times in some processes. + +Compatibility Guarantee +----------------------- + +vLLM guarantees the interface of documented plugins, such as ``ModelRegistry.register_model``, will always be available for plugins to register models. However, it is the responsibility of plugin developers to ensure their plugins are compatible with the version of vLLM they are targeting. For example, ``"vllm_add_dummy_model.my_llava:MyLlava"`` should be compatible with the version of vLLM that the plugin targets. The interface for the model may change during vLLM's development. \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst index a2abd2995b1cc..3b2698a8845ed 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -158,6 +158,7 @@ Documentation design/class_hierarchy design/huggingface_integration + design/plugin_system design/input_processing/model_inputs_index design/kernel/paged_attention design/multimodal/multimodal_index diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst index c6d88cc38e99b..a70ebf99c746f 100644 --- a/docs/source/models/adding_model.rst +++ b/docs/source/models/adding_model.rst @@ -102,11 +102,11 @@ This method should load the weights from the HuggingFace's checkpoint file and a Finally, register your :code:`*ForCausalLM` class to the :code:`_VLLM_MODELS` in `vllm/model_executor/models/registry.py `_. 6. Out-of-Tree Model Integration --------------------------------------------- +-------------------------------- -We also provide a way to integrate a model without modifying the vLLM codebase. Step 2, 3, 4 are still required, but you can skip step 1 and 5. +You can integrate a model without modifying the vLLM codebase. Steps 2, 3, and 4 are still required, but you can skip steps 1 and 5. Instead, write a plugin to register your model. For general introduction of the plugin system, see :ref:`plugin_system`. -Just add the following lines in your code: +To register the model, use the following code: .. 
code-block:: python @@ -114,7 +114,7 @@ Just add the following lines in your code: from your_code import YourModelForCausalLM ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) -If your model imports modules that initialize CUDA, consider instead lazy-importing it to avoid an error like :code:`RuntimeError: Cannot re-initialize CUDA in forked subprocess`: +If your model imports modules that initialize CUDA, consider lazy-importing it to avoid errors like :code:`RuntimeError: Cannot re-initialize CUDA in forked subprocess`: .. code-block:: python @@ -123,19 +123,8 @@ If your model imports modules that initialize CUDA, consider instead lazy-import ModelRegistry.register_model("YourModelForCausalLM", "your_code:YourModelForCausalLM") .. important:: - If your model is a multimodal model, make sure the model class implements the :class:`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. + If your model is a multimodal model, ensure the model class implements the :class:`~vllm.model_executor.models.interfaces.SupportsMultiModal` interface. Read more about that :ref:`here `. -If you are running api server with :code:`vllm serve `, you can wrap the entrypoint with the following code: - -.. code-block:: python - - from vllm import ModelRegistry - from your_code import YourModelForCausalLM - ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM) - - if __name__ == '__main__': - import runpy - runpy.run_module('vllm.entrypoints.openai.api_server', run_name='__main__') - -Save the above code in a file and run it with :code:`python your_file.py `. +.. note:: + Although you can directly put these code snippets in your script using ``vllm.LLM``, the recommended way is to place these snippets in a vLLM plugin. This ensures compatibility with various vLLM features like distributed inference and the API server. 
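For illustration, a minimal sketch of such a plugin (the package name ``my_vllm_plugin`` and the model class ``MyModelForCausalLM`` are placeholders, not part of this change): the package exposes a ``register`` function through the ``vllm.general_plugins`` entry point group, mirroring the dummy-model example in the plugin system document above.

.. code-block:: python

    # my_vllm_plugin/__init__.py -- illustrative sketch, not shipped with vLLM
    def register():
        # Keep this re-entrant: it may be called once per vLLM process.
        from vllm import ModelRegistry

        if "MyModelForCausalLM" not in ModelRegistry.get_supported_archs():
            ModelRegistry.register_model(
                "MyModelForCausalLM",
                "my_vllm_plugin.model:MyModelForCausalLM")

With the package installed, setting ``VLLM_PLUGINS`` to the chosen entry point name (say ``register_my_model`` in ``setup.py``) restricts loading to this plugin; leaving ``VLLM_PLUGINS`` unset loads every discovered plugin.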
diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 8373e11cfff9f..9fca724599012 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -27,16 +27,24 @@ def load_general_plugins(): allowed_plugins = envs.VLLM_PLUGINS discovered_plugins = entry_points(group='vllm.general_plugins') + logger.info("Available plugins:") + for plugin in discovered_plugins: + logger.info("name=%s, value=%s, group=%s", plugin.name, plugin.value, + plugin.group) + if allowed_plugins is None: + logger.info("all available plugins will be loaded.") + logger.info("set environment variable VLLM_PLUGINS to control" + " which plugins to load.") + else: + logger.info("plugins to load: %s", allowed_plugins) for plugin in discovered_plugins: - logger.info("Found general plugin: %s", plugin.name) if allowed_plugins is None or plugin.name in allowed_plugins: try: func = plugin.load() func() - logger.info("Loaded general plugin: %s", plugin.name) + logger.info("plugin %s loaded.", plugin.name) except Exception: - logger.exception("Failed to load general plugin: %s", - plugin.name) + logger.exception("Failed to load plugin %s", plugin.name) _torch_compile_backend: Optional[Union[Callable, str]] = None From 2f427c2d163b5c6d5923a8808e9d786e170944ce Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 16 Nov 2024 01:23:20 -0800 Subject: [PATCH 164/183] [misc][plugin] improve log messages (#10386) Signed-off-by: youkaichao --- vllm/plugins/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 9fca724599012..7b1bbb14c5302 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -27,6 +27,9 @@ def load_general_plugins(): allowed_plugins = envs.VLLM_PLUGINS discovered_plugins = entry_points(group='vllm.general_plugins') + if len(discovered_plugins) == 0: + logger.info("No plugins found.") + return logger.info("Available plugins:") for plugin in discovered_plugins: logger.info("name=%s, value=%s, group=%s", plugin.name, plugin.value, From 1d754726265d52773653e53e1a18f6eb63122480 Mon Sep 17 00:00:00 2001 From: rasmith Date: Sat, 16 Nov 2024 03:55:05 -0600 Subject: [PATCH 165/183] [BugFix] [Kernel] Fix GPU SEGV occuring in fused_moe kernel (#10385) Signed-off-by: Randall Smith --- vllm/model_executor/layers/fused_moe/fused_moe.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 340da32263c1c..e6f9f01ef0f74 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -105,16 +105,18 @@ def fused_moe_kernel( num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr) if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded: return - offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to( + tl.int64) offs_token = tl.load(sorted_token_ids_ptr + offs_token_id) token_mask = offs_token < num_valid_tokens - offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N + offs_bn = (pid_n * BLOCK_SIZE_N + + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)) % N offs_k = tl.arange(0, BLOCK_SIZE_K) a_ptrs = a_ptr + (offs_token[:, None] // top_k * stride_am + offs_k[None, :] * stride_ak) - off_experts = tl.load(expert_ids_ptr + pid_m) + off_experts = tl.load(expert_ids_ptr + pid_m).to(tl.int64) b_ptrs = b_ptr + off_experts * stride_be + (offs_k[:, None] * stride_bk + 
offs_bn[None, :] * stride_bn) if use_int8_w8a16: From 8b6725b0cf4ee5f363218f4bc341970c80297ccf Mon Sep 17 00:00:00 2001 From: Jaehyun An Date: Sat, 16 Nov 2024 19:15:40 +0900 Subject: [PATCH 166/183] [Misc] Update benchmark to support image_url file or http (#10287) Signed-off-by: rbbang --- benchmarks/benchmark_serving.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index bdb8ea8e2a5dc..e9fc037a46965 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -251,6 +251,19 @@ def sample_hf_requests( "url": f"data:image/jpeg;base64,{image_base64}" }, } + elif "image" in data and isinstance(data["image"], str): + if (data["image"].startswith("http://") or \ + data["image"].startswith("file://")): + image_url = data["image"] + else: + image_url = f"file://{data['image']}" + + mm_content = { + "type": "image_url", + "image_url": { + "url": image_url + }, + } else: mm_content = None From b98d89efd4b1a09c11c4d0cf30c9af0e93514764 Mon Sep 17 00:00:00 2001 From: Sky Lee <46676799+skylee-01@users.noreply.github.com> Date: Sun, 17 Nov 2024 00:33:01 +0800 Subject: [PATCH 167/183] [Misc] Medusa supports custom bias (#10361) --- vllm/model_executor/models/medusa.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py index de5b2d89c0962..b05360b55466b 100644 --- a/vllm/model_executor/models/medusa.py +++ b/vllm/model_executor/models/medusa.py @@ -14,11 +14,14 @@ class ResidualBlock(nn.Module): - def __init__(self, hidden_size: int, num_layers: int) -> None: + def __init__(self, config: VllmConfig, hidden_size: int, + num_layers: int) -> None: super().__init__() self.layers = nn.ModuleList([ - nn.Linear(hidden_size, hidden_size, bias=False) + nn.Linear(hidden_size, + hidden_size, + bias=getattr(config, "medusa_fc_bias", False)) for _ in range(num_layers) ]) self.act = nn.SiLU() @@ -49,7 +52,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() self.config = config self.blocks = nn.ModuleList([ - ResidualBlock(hidden_size=self.config.hidden_size, + ResidualBlock(config=config, + hidden_size=self.config.hidden_size, num_layers=self.config.num_hidden_layers) for _ in range(self.config.num_heads) ]) From 361c29e1740e0b2186f8cca3ed96ad235a8a960a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=B5=E8=84=91=E6=98=9F=E4=BA=BA?= Date: Sun, 17 Nov 2024 02:10:00 +0800 Subject: [PATCH 168/183] [Bugfix] Fix M-RoPE position calculation when chunked prefill is enabled (#10388) Signed-off-by: imkero --- .../vision_language/test_qwen2_vl.py | 136 +++++++++++++++++- .../model_executor/layers/rotary_embedding.py | 3 +- vllm/worker/model_runner.py | 1 + 3 files changed, 135 insertions(+), 5 deletions(-) diff --git a/tests/models/decoder_only/vision_language/test_qwen2_vl.py b/tests/models/decoder_only/vision_language/test_qwen2_vl.py index 718c675b86fb4..71b6ba4dca435 100644 --- a/tests/models/decoder_only/vision_language/test_qwen2_vl.py +++ b/tests/models/decoder_only/vision_language/test_qwen2_vl.py @@ -18,6 +18,7 @@ IMAGE_PLACEHOLDER = "<|vision_start|><|image_pad|><|vision_end|>" VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>" +MODEL_HIDDEN_SIZE = 1536 def qwen2_vl_chat_template(*query): @@ -230,7 +231,7 @@ def batch_make_video_embeddings( return result -def run_test( +def run_embedding_input_test( vllm_runner: Type[VllmRunner], inputs: 
List[Tuple[List[str], PromptImageInput, PromptVideoInput]], model: str, @@ -326,7 +327,7 @@ def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model, [], ) for image, prompt in zip(images, IMAGE_PROMPTS)] - run_test( + run_embedding_input_test( vllm_runner, inputs_per_case, model, @@ -371,7 +372,7 @@ def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets, [], )] - run_test( + run_embedding_input_test( vllm_runner, inputs_per_case, model, @@ -416,7 +417,134 @@ def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model, [rescale_video_size(video, factor) for factor in size_factors], ) for video, prompt in zip(sampled_vids, VIDEO_PROMPTS)] - run_test( + run_embedding_input_test( + vllm_runner, + inputs_per_case, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + mm_limit=1, + tensor_parallel_size=1, + ) + + +def run_chunked_prefill_test( + vllm_runner: Type[VllmRunner], + inputs: List[Tuple[List[str], PromptImageInput, PromptVideoInput]], + model: str, + *, + dtype: str, + max_tokens: int, + num_logprobs: int, + mm_limit: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +): + """Compare inference result between + chunked prefill disabled and chunked prefill enabled + """ + + # NOTE: + # max_model_len should be greater than image_feature_size + with vllm_runner(model, + task="generate", + max_model_len=4000, + max_num_seqs=4, + dtype=dtype, + limit_mm_per_prompt={ + "image": mm_limit, + "video": mm_limit + }, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend + ) as vllm_model: + + outputs_per_case = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images or None, + videos=videos or None) + for prompts, images, videos in inputs + ] + + with vllm_runner( + model, + task="generate", + max_model_len=4000, + max_num_seqs=4, + dtype=dtype, + limit_mm_per_prompt={ + "image": mm_limit, + "video": mm_limit + }, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enable_chunked_prefill=True, + # should be small enough to ensure prefilling is chunked + max_num_batched_tokens=32, + mm_processor_kwargs={ + "max_pixels": 16 * 28 * 28, + }) as vllm_model_chunked: + outputs_per_case_chunked = [ + vllm_model_chunked.generate_greedy_logprobs( + prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images or None, + videos=videos or None) for prompts, images, videos in inputs + ] + + for outputs, \ + outputs_chunked \ + in zip(outputs_per_case, + outputs_per_case_chunked): + check_logprobs_close( + outputs_0_lst=outputs, + outputs_1_lst=outputs_chunked, + name_0="non_chunked", + name_1="chunked", + ) + + +@pytest.mark.core_model +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [1]) +@pytest.mark.parametrize("num_logprobs", [10]) +def test_qwen2_vl_mrope_chunked_prefill(vllm_runner, example_prompts, + model: str, dtype: str, + max_tokens: int, + num_logprobs: int) -> None: + """ + Test Qwen2-VL's chunked prefill with M-RoPE + """ + prompts = [ + qwen2_vl_chat_template(IMAGE_PLACEHOLDER, prompt) + for prompt in example_prompts[:1] + ] + + # 1. Qwen2-VL's M-RoPE works only when there are some multi-modal inputs, + # so an image is included in the inputs + # 2. 
however, Qwen2-VL currently won't work properly + # when chunked prefill is enabled and there are some multi-modal inputs, + # here use a hacky way: provide a **zero-length** image to make it happy + # + # and finally we achieved: + # (1) chunked_prefill enabled; (2) M-RoPE works; to continue our tests + zero_len_image = { + "image_embeds": torch.empty((0, MODEL_HIDDEN_SIZE)), + "image_grid_thw": torch.tensor([[0, 0, 0]]) + } + images = [zero_len_image] * len(prompts) + + inputs_per_case: List[Tuple[List[str], PromptImageInput, + PromptVideoInput]] = [ + (prompts, images, []), + ] + + run_chunked_prefill_test( vllm_runner, inputs_per_case, model, diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index 63ceec63e8317..b01e4c61fe101 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -847,6 +847,7 @@ def get_input_positions( vision_end_token_id: int, spatial_merge_size: int, context_len: int = 0, + seq_len: Optional[int] = None, ) -> Tuple[List[List[int]], int]: """Get mrope input positions and delta value.""" @@ -921,7 +922,7 @@ def get_input_positions( torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) - llm_positions = llm_positions[:, context_len:] + llm_positions = llm_positions[:, context_len:seq_len] mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item() diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 042f9f07eace6..22ee3f9f863e4 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -700,6 +700,7 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, spatial_merge_size=hf_config.vision_config. 
spatial_merge_size, context_len=inter_data.context_lens[seq_idx], + seq_len=inter_data.seq_lens[seq_idx], ) seq_data.mrope_position_delta = mrope_position_delta From 661a34fd4fdd700a29b2db758e23e4e243e7ff18 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sat, 16 Nov 2024 10:45:26 -0800 Subject: [PATCH 169/183] [V1] Add code owners for V1 (#10397) Signed-off-by: Woosuk Kwon --- .github/CODEOWNERS | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index cd721971d01d6..3cb91fc0f8232 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -3,13 +3,16 @@ # This lists cover the "core" components of vLLM that require careful review /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/core @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/engine/llm_engine.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/executor/executor_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/worker/worker_base.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/worker/worker.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -/vllm/model_executor/layers/sampler.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill -CMakeLists.txt @tlrmchlsmth @WoosukKwon +/vllm/core @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill +CMakeLists.txt @tlrmchlsmth + +# vLLM V1 +/vllm/v1 @WoosukKwon @robertgshaw2-neuralmagic @njhill @ywang96 @comaniac @alexm-neuralmagic # Test ownership /tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo From 4fd937502827a7e06c54ded1f9d9b70ff640e222 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 16 Nov 2024 18:02:14 -0800 Subject: [PATCH 170/183] [2/N][torch.compile] make compilation cfg part of vllm cfg (#10383) Signed-off-by: youkaichao --- tests/compile/piecewise/test_simple.py | 8 +- tests/compile/piecewise/test_toy_llama.py | 22 +- tests/compile/test_basic_correctness.py | 2 +- tests/compile/test_full_graph.py | 2 +- tests/compile/test_fusion.py | 2 +- tests/compile/test_wrapper.py | 4 +- tests/compile/utils.py | 2 +- .../model_executor/test_enabled_custom_ops.py | 52 ++--- tests/tpu/test_compilation.py | 2 +- tests/tpu/test_custom_dispatcher.py | 2 +- vllm/compilation/backends.py | 20 +- vllm/compilation/config.py | 159 --------------- vllm/compilation/decorators.py | 10 +- vllm/compilation/fusion.py | 2 +- vllm/compilation/inductor_pass.py | 2 +- vllm/compilation/levels.py | 8 - vllm/compilation/wrapper.py | 11 +- vllm/config.py | 189 ++++++++++++++++++ vllm/envs.py | 13 -- vllm/model_executor/custom_op.py | 27 +-- vllm/model_executor/model_loader/loader.py | 7 +- vllm/platforms/interface.py | 20 +- vllm/platforms/tpu.py | 21 +- vllm/plugins/__init__.py | 30 ++- vllm/v1/worker/gpu_model_runner.py | 10 +- vllm/worker/model_runner.py | 7 +- vllm/worker/tpu_model_runner.py | 8 +- 27 files changed, 359 
insertions(+), 283 deletions(-) delete mode 100644 vllm/compilation/config.py delete mode 100644 vllm/compilation/levels.py diff --git a/tests/compile/piecewise/test_simple.py b/tests/compile/piecewise/test_simple.py index c631850ecdedb..45f56cbbd4b16 100644 --- a/tests/compile/piecewise/test_simple.py +++ b/tests/compile/piecewise/test_simple.py @@ -11,8 +11,8 @@ from vllm.compilation.compile_context import set_compile_context from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile -from vllm.compilation.levels import CompilationLevel -from vllm.config import VllmConfig +from vllm.config import CompilationLevel, VllmConfig +from vllm.plugins import set_current_vllm_config from vllm.utils import direct_register_custom_op global_counter = 0 @@ -82,7 +82,9 @@ def test_simple_piecewise_compile(): os.environ["VLLM_TORCH_COMPILE_CONFIG"] = config os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.PIECEWISE) - model = SillyModel(vllm_config=VllmConfig(), prefix='') + vllm_config = VllmConfig() + with set_current_vllm_config(vllm_config): + model = SillyModel(vllm_config=vllm_config, prefix='') inputs = torch.randn(100).cuda() diff --git a/tests/compile/piecewise/test_toy_llama.py b/tests/compile/piecewise/test_toy_llama.py index c363a587a818e..8032304e95806 100644 --- a/tests/compile/piecewise/test_toy_llama.py +++ b/tests/compile/piecewise/test_toy_llama.py @@ -15,12 +15,10 @@ from torch.library import Library from vllm.compilation.compile_context import set_compile_context -from vllm.compilation.config import CompilationConfig from vllm.compilation.counter import compilation_counter from vllm.compilation.decorators import support_torch_compile -from vllm.compilation.levels import CompilationLevel -from vllm.config import VllmConfig -from vllm.plugins import set_compilation_config +from vllm.config import CompilationConfig, CompilationLevel, VllmConfig +from vllm.plugins import set_compilation_config, set_current_vllm_config from vllm.utils import direct_register_custom_op # create a library to hold the custom op @@ -272,9 +270,11 @@ def run_model(llama_config, CompilationLevel.NO_COMPILATION) set_compilation_config(None) - model = LlamaModel(config=llama_config, - vllm_config=VllmConfig(), - prefix="").eval().cuda() + vllm_config = VllmConfig() + with set_current_vllm_config(vllm_config): + model = LlamaModel(config=llama_config, + vllm_config=vllm_config, + prefix="").eval().cuda() B = 16 # max batch size input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda() @@ -395,9 +395,11 @@ def benchmark(): else: set_compilation_config(None) - model = LlamaModel(config=llama_config, - vllm_config=VllmConfig(), - prefix="").eval().cuda().to(torch.bfloat16) + vllm_config = VllmConfig() + with set_current_vllm_config(vllm_config): + model = LlamaModel(config=llama_config, + vllm_config=vllm_config, + prefix="").eval().cuda().to(torch.bfloat16) B = 256 # max batch size input_ids = torch.randint(0, llama_config.vocab_size, (B, )).cuda() diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py index 833589ba5dc9f..08747ebc58b75 100644 --- a/tests/compile/test_basic_correctness.py +++ b/tests/compile/test_basic_correctness.py @@ -3,7 +3,7 @@ import pytest -from vllm.compilation.levels import CompilationLevel +from vllm.config import CompilationLevel from vllm.utils import cuda_device_count_stateless from ..utils import compare_all_settings diff --git a/tests/compile/test_full_graph.py 
b/tests/compile/test_full_graph.py index f00334934cb46..4dfdfe21a67df 100644 --- a/tests/compile/test_full_graph.py +++ b/tests/compile/test_full_graph.py @@ -1,6 +1,6 @@ import pytest -from vllm.compilation.levels import CompilationLevel +from vllm.config import CompilationLevel from ..utils import fork_new_process_for_each_test from .utils import TEST_MODELS, check_full_graph_support diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index e4d3defafb951..4db79b070fd8d 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -3,10 +3,10 @@ from compressed_tensors.quantization import FP8_DTYPE import vllm.envs as envs -from vllm.compilation.config import CompilationConfig from vllm.compilation.fusion import (FusionPass, find_auto_fn, find_auto_fn_maybe) from vllm.compilation.reshapes import RedundantReshapesPass +from vllm.config import CompilationConfig from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( apply_fp8_linear) diff --git a/tests/compile/test_wrapper.py b/tests/compile/test_wrapper.py index 3668c1fab6b89..74f66baaa5ea1 100644 --- a/tests/compile/test_wrapper.py +++ b/tests/compile/test_wrapper.py @@ -3,6 +3,7 @@ import torch from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher +from vllm.config import CompilationLevel class MyMod(torch.nn.Module): @@ -18,7 +19,8 @@ class MyWrapper(TorchCompileWrapperWithCustomDispatcher): def __init__(self, model): self.model = model compiled_callable = torch.compile(self.forward, backend="eager") - super().__init__(compiled_callable) + super().__init__(compiled_callable, + compilation_level=CompilationLevel.DYNAMO_ONCE) def forward(self, x: torch.Tensor, cache: Optional[torch.Tensor] = None): # this is the function to be compiled diff --git a/tests/compile/utils.py b/tests/compile/utils.py index 222c63a342a4b..729f10676888b 100644 --- a/tests/compile/utils.py +++ b/tests/compile/utils.py @@ -4,7 +4,7 @@ from tests.quantization.utils import is_quant_method_supported from vllm import LLM, SamplingParams -from vllm.compilation.levels import CompilationLevel +from vllm.config import CompilationLevel from vllm.platforms import current_platform TEST_MODELS = [ diff --git a/tests/model_executor/test_enabled_custom_ops.py b/tests/model_executor/test_enabled_custom_ops.py index af267f804ffa7..c3219bc50646b 100644 --- a/tests/model_executor/test_enabled_custom_ops.py +++ b/tests/model_executor/test_enabled_custom_ops.py @@ -3,11 +3,13 @@ import pytest +from vllm.config import CompilationConfig, VllmConfig from vllm.model_executor.custom_op import CustomOp from vllm.model_executor.layers.activation import (GeluAndMul, ReLUSquaredActivation, SiluAndMul) from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.plugins import set_current_vllm_config # Registered subclass for test @@ -51,42 +53,40 @@ class Relu3(ReLUSquaredActivation): ]) def test_enabled_ops(env: str, torch_level: int, ops_enabled: List[int], default_on: bool): - os.environ["VLLM_CUSTOM_OPS"] = env os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(torch_level) + vllm_config = VllmConfig(compilation_config=CompilationConfig( + custom_ops=env.split(","))) + with set_current_vllm_config(vllm_config): + assert CustomOp.default_on() == default_on - # Reset default_on (computed once): - CustomOp.default_on.cache_clear() + ops_enabled = [bool(x) for x in ops_enabled] - assert CustomOp.default_on() == default_on + assert RMSNorm(1024).enabled() == 
ops_enabled[0] + assert CustomOp.op_registry["rms_norm"].enabled() == ops_enabled[0] - ops_enabled = [bool(x) for x in ops_enabled] + assert SiluAndMul().enabled() == ops_enabled[1] + assert CustomOp.op_registry["silu_and_mul"].enabled() == ops_enabled[1] - assert RMSNorm(1024).enabled() == ops_enabled[0] - assert CustomOp.op_registry["rms_norm"].enabled() == ops_enabled[0] + assert GeluAndMul().enabled() == ops_enabled[2] + assert CustomOp.op_registry["gelu_and_mul"].enabled() == ops_enabled[2] - assert SiluAndMul().enabled() == ops_enabled[1] - assert CustomOp.op_registry["silu_and_mul"].enabled() == ops_enabled[1] + # If registered, subclasses should follow their own name + assert Relu3().enabled() == ops_enabled[3] + assert CustomOp.op_registry["relu3"].enabled() == ops_enabled[3] - assert GeluAndMul().enabled() == ops_enabled[2] - assert CustomOp.op_registry["gelu_and_mul"].enabled() == ops_enabled[2] + # Unregistered subclass + class SiluAndMul2(SiluAndMul): + pass - # If registered, subclasses should follow their own name - assert Relu3().enabled() == ops_enabled[3] - assert CustomOp.op_registry["relu3"].enabled() == ops_enabled[3] - - # Unregistered subclass - class SiluAndMul2(SiluAndMul): - pass - - # Subclasses should not require registration - assert SiluAndMul2().enabled() == SiluAndMul().enabled() + # Subclasses should not require registration + assert SiluAndMul2().enabled() == SiluAndMul().enabled() @pytest.mark.parametrize( "env", ["all,none", "all,+rms_norm,all", "+rms_norm,-rms_norm"]) def test_enabled_ops_invalid(env: str): - os.environ["VLLM_CUSTOM_OPS"] = env - CustomOp.default_on.cache_clear() - - with pytest.raises(AssertionError): - RMSNorm(1024).enabled() + with pytest.raises(Exception): # noqa + vllm_config = VllmConfig(compilation_config=CompilationConfig( + custom_ops=env.split(","))) + with set_current_vllm_config(vllm_config): + RMSNorm(1024).enabled() diff --git a/tests/tpu/test_compilation.py b/tests/tpu/test_compilation.py index 86d9af88e49ea..941abe17a3378 100644 --- a/tests/tpu/test_compilation.py +++ b/tests/tpu/test_compilation.py @@ -5,7 +5,7 @@ import depyf -from vllm.compilation.levels import CompilationLevel +from vllm.config import CompilationLevel # disable custom dispatcher, let Dynamo takes over # all the control diff --git a/tests/tpu/test_custom_dispatcher.py b/tests/tpu/test_custom_dispatcher.py index 923d0f1680802..53b10c06135a1 100644 --- a/tests/tpu/test_custom_dispatcher.py +++ b/tests/tpu/test_custom_dispatcher.py @@ -1,6 +1,6 @@ import os -from vllm.compilation.levels import CompilationLevel +from vllm.config import CompilationLevel from ..utils import compare_two_settings diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 5682faa158069..22c613931f082 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -10,13 +10,12 @@ import torch.fx as fx import vllm.envs as envs +from vllm.config import CompilationConfig, CompilationLevel from vllm.logger import init_logger from vllm.utils import combine_fx_passes, weak_ref_tensors -from .config import CompilationConfig from .counter import compilation_counter from .fusion import FusionPass -from .levels import CompilationLevel from .reshapes import RedundantReshapesPass logger = init_logger(__name__) @@ -392,7 +391,10 @@ class VllmBackend: sym_tensor_indices: List[int] input_buffers: List[torch.Tensor] - def __init__(self, post_grad_passes: Sequence[Callable] = ()): + def __init__( + self, + compilation_configs: CompilationConfig, + ): global 
global_graph_pool if global_graph_pool is None: global_graph_pool = torch.cuda.graph_pool_handle() @@ -401,11 +403,13 @@ def __init__(self, post_grad_passes: Sequence[Callable] = ()): # streams, it might not be safe to share a global pool. # only investigate this when we use multiple streams self.graph_pool = global_graph_pool - self.post_grad_passes = post_grad_passes + self.post_grad_passes = [] self.sym_tensor_indices = [] self.input_buffers = [] + self.compilation_configs = compilation_configs + # `torch.compile` is JIT compiled, so we don't need to # do anything here @@ -437,10 +441,10 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable: assert not self._called, "VllmBackend can only be called once" self.graph = graph - # config is read now, because only here can + # config is updated now, because only here can # we get the sizes to capture for cudagraph # from compilation context - self.compilation_configs = CompilationConfig.select_and_init_config() + self.compilation_configs.init_during_runtime() self.add_passes_to_config() self.split_gm, self.piecewise_graphs = split_graph( @@ -688,4 +692,6 @@ def select_default_backend(level: int) -> Union[str, Callable]: return backend_str assert level == CompilationLevel.PIECEWISE - return VllmBackend() + from vllm.plugins import get_current_vllm_config + compilation_config = get_current_vllm_config().compilation_config + return VllmBackend(compilation_config) diff --git a/vllm/compilation/config.py b/vllm/compilation/config.py deleted file mode 100644 index 3e663505c627d..0000000000000 --- a/vllm/compilation/config.py +++ /dev/null @@ -1,159 +0,0 @@ -import copy -from pathlib import Path -from typing import Any, Dict, List, Optional - -from pydantic import BaseModel, Field, PrivateAttr - -import vllm.envs as envs -from vllm.logger import init_logger - -from .compile_context import get_compile_context - -logger = init_logger(__name__) - - -class CompilationConfig(BaseModel): - """ - Configuration for compilation. - It has two parts: - - CudaGraph capture: - - use_cudagraph: whether to use cudagraph inside compilation. - - False: cudagraph inside compilation is not used. - - True: cudagraph inside compilation is used. It requires - that all input buffers have fixed addresses. - Note that this is orthogonal to the cudagraph capture out - side of compilation. - TODO: move outside cudagraph logic into compilation. - torch.compile will handle cudagraph capture logic in the future. - - cudagraph_capture_sizes: sizes to capture cudagraph. - - None: capture sizes are inferred from compilation context. - - List[int]: capture sizes are specified. - - cudagraph_num_of_warmups: number of warmup runs for cudagraph. - It means the first several runs will be treated as warmup runs. - Only after that, the execution will be recorded, and the recorded - cudagraph will be used for subsequent runs. - - cudagraph_copy_inputs: whether to copy input tensors for - cudagraph. If the caller can guarantee that the same input buffers - are always used, it can set this to False. Otherwise, it should - set this to True, and the compiler will copy the input to an - internally managed buffer. Default is False. - - Inductor compilation: - - use_inductor: whether to use inductor compilation. - - False: inductor compilation is not used. graph runs in eager. - - True: inductor compilation is used. one graph for symbolic shape - is compiled. 
In addition, compile for different sizes specified - in inductor_compile_sizes, using configurations - in inductor_compile_config. - - inductor_compile_sizes: sizes to compile for inductor. - - inductor_specialize_for_cudagraph_no_more_than: an optional integer - to specialize inductor for cudagraph sizes no more than the - specified size. It is useful when we want to specialize inductor - with a subset of cudagraph sizes. - - inductor_compile_config: additional configurations for inductor. - - None: use default configurations. - - inductor_passes: additional passes for inductor. It is a dictionary - from pass name to pass function qualified name. We use function - name because the config uses json format. If we pass the config - from Python, functions can also be passed directly via Python object - constructor, e.g. `CompilationConfig(inductor_passes={"a": func})` - - Custom inductor passes: - - dump_graph_stages: list of stages for which we want to dump the graph. - Each pass defines its own stages (before, after, maybe in-between). - - dump_graph_dir: directory to dump the graph. Default is . - - enable_fusion: whether to enable the custom fusion pass. - TODO better pass enabling system. - - Why we have different sizes for cudagraph and inductor: - - cudagraph: a cudagraph captured for a specific size can only be used - for the same size. We need to capture all the sizes we want to use. - - inductor: a graph compiled by inductor for a general shape can be used - for different sizes. Inductor can also compile for specific sizes, - where it can have more information to optimize the graph with fully - static shapes. However, we find the general shape compilation is - sufficient for most cases. It might be beneficial to compile for - certain small batchsizes, where inductor is good at optimizing. - """ - use_inductor: bool = True - inductor_specialize_for_cudagraph_no_more_than: Optional[int] = None - inductor_compile_sizes: Optional[List[int]] = Field(default_factory=dict) - inductor_compile_config: Dict = Field(default_factory=dict) - inductor_passes: Dict[str, str] = Field(default_factory=dict) - - use_cudagraph: bool = False - non_cudagraph_ops: List[str] = Field(default_factory=list) - cudagraph_num_of_warmups: int = 0 - cudagraph_capture_sizes: Optional[List[int]] = None - cudagraph_copy_inputs: bool = False - - dump_graph_stages: List[str] = Field(default_factory=list) - dump_graph_dir: Path = Field(default=Path(".")) - enable_fusion: bool = True - - # not configurable, computed after init - compile_sizes: List[int] = PrivateAttr - capture_sizes: List[int] = PrivateAttr - - def model_post_init(self, __context: Any) -> None: - for k, v in self.inductor_passes.items(): - if not isinstance(v, str): - assert callable(v), ( - f"pass {k} should be a function or a qualified name") - self.inductor_compile_config[k] = v - continue - - # resolve function from qualified name - names = v.split(".") - module = ".".join(names[:-1]) - func_name = names[-1] - func = __import__(module).__dict__[func_name] - self.inductor_compile_config[k] = func - - def init_during_runtime(self): - """To complete the initialization of config, - we need to know the compile context, which is only available - during the first run of the model. 
- """ - context = get_compile_context() - context = copy.deepcopy(context) if context is not None else [] - sizes_to_specialize: List[int] = context - if self.cudagraph_capture_sizes is None: - self.capture_sizes = sizes_to_specialize - else: - self.capture_sizes = self.cudagraph_capture_sizes - logger.info(("cudagraph sizes specified by model runner" - " %s is overridden by config %s"), - sizes_to_specialize, self.cudagraph_capture_sizes) - if self.inductor_specialize_for_cudagraph_no_more_than is not None: - assert self.inductor_compile_sizes is None, ( - "inductor_compile_sizes should be None when " - "inductor_specialize_for_cudagraph_no_more_than is not None") - self.compile_sizes = [ - x for x in self.capture_sizes - if x <= self.inductor_specialize_for_cudagraph_no_more_than - ] - else: - assert self.inductor_compile_sizes is not None, ( - "inductor_compile_sizes should not be None when " - "inductor_specialize_for_cudagraph_no_more_than is None") - self.compile_sizes = self.inductor_compile_sizes - - @staticmethod - def select_and_init_config() -> "CompilationConfig": - """The order of selecting config is: - 1. Use the config specified in environment variable. - 2. Use the config specified in plugins. - 3. Use the default config. - """ - config_path = envs.VLLM_TORCH_COMPILE_CONFIG - if config_path is not None: - with open(config_path) as json_file: - config = CompilationConfig.model_validate_json( - json_file.read()) - else: - from vllm.plugins import get_compilation_config - predefined_config = get_compilation_config() - config = predefined_config if predefined_config is not None else ( - CompilationConfig()) - - config.init_during_runtime() - return config diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index ca1e96a33c014..4b78491bc5a48 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -3,10 +3,8 @@ import torch -import vllm.envs as envs -from vllm.compilation.levels import CompilationLevel from vllm.compilation.wrapper import TorchCompileWrapperWithCustomDispatcher -from vllm.config import VllmConfig +from vllm.config import CompilationLevel, VllmConfig from vllm.logger import init_logger from vllm.sequence import IntermediateTensors from vllm.utils import supports_dynamo @@ -126,12 +124,14 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = '', **kwargs): old_init(self, vllm_config=vllm_config, prefix=prefix, **kwargs) # for CompilationLevel.DYNAMO_AS_IS , the upper level model runner # will handle the compilation, so we don't need to do anything here. 
- self.do_not_compile = envs.VLLM_TORCH_COMPILE_LEVEL in [ + self.do_not_compile = \ + vllm_config.compilation_config.level in [ CompilationLevel.NO_COMPILATION, CompilationLevel.DYNAMO_AS_IS ] or not supports_dynamo() if self.do_not_compile: return - TorchCompileWrapperWithCustomDispatcher.__init__(self) + TorchCompileWrapperWithCustomDispatcher.__init__( + self, compilation_level=vllm_config.compilation_config.level) cls.__init__ = __init__ # type: ignore diff --git a/vllm/compilation/fusion.py b/vllm/compilation/fusion.py index eb43604b1399b..e6a3afef85e1b 100644 --- a/vllm/compilation/fusion.py +++ b/vllm/compilation/fusion.py @@ -6,8 +6,8 @@ from torch._inductor.pattern_matcher import (Match, PatternMatcherPass, fwd_only, register_replacement) -from vllm.compilation.config import CompilationConfig from vllm.compilation.inductor_pass import InductorPass +from vllm.config import CompilationConfig from vllm.logger import init_logger logger = init_logger(__name__) diff --git a/vllm/compilation/inductor_pass.py b/vllm/compilation/inductor_pass.py index b23351fa19759..8082a08b40019 100644 --- a/vllm/compilation/inductor_pass.py +++ b/vllm/compilation/inductor_pass.py @@ -2,7 +2,7 @@ import torch -from vllm.compilation.config import CompilationConfig +from vllm.config import CompilationConfig # yapf: disable from vllm.distributed import get_tensor_model_parallel_rank as get_tp_rank from vllm.distributed import ( diff --git a/vllm/compilation/levels.py b/vllm/compilation/levels.py deleted file mode 100644 index 19a3a2b526870..0000000000000 --- a/vllm/compilation/levels.py +++ /dev/null @@ -1,8 +0,0 @@ -# constants for the levels of the compilation process - - -class CompilationLevel: - NO_COMPILATION = 0 - DYNAMO_AS_IS = 1 - DYNAMO_ONCE = 2 - PIECEWISE = 3 diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index 7366ed4d16b0b..2a1aecc11ce26 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -8,8 +8,7 @@ import torch import vllm.envs as envs - -from .levels import CompilationLevel +from vllm.config import CompilationLevel class TorchCompileWrapperWithCustomDispatcher: @@ -25,7 +24,9 @@ class TorchCompileWrapperWithCustomDispatcher: `torch.compile` over the forward method. """ - def __init__(self, compiled_callable: Optional[Callable] = None): + def __init__(self, + compiled_callable: Optional[Callable] = None, + compilation_level: int = 0): if compiled_callable is None: # default compilation settings @@ -38,7 +39,7 @@ def __init__(self, compiled_callable: Optional[Callable] = None): backend = get_torch_compile_backend() if backend is None: from vllm.compilation.backends import select_default_backend - backend = select_default_backend(envs.VLLM_TORCH_COMPILE_LEVEL) + backend = select_default_backend(compilation_level) compiled_callable = torch.compile( self.forward, @@ -54,7 +55,7 @@ def __init__(self, compiled_callable: Optional[Callable] = None): # subclasses can use this to switch between the custom dispatcher # and the default Dynamo guard mechanism. self.use_custom_dispatcher: bool = \ - envs.VLLM_TORCH_COMPILE_LEVEL >= CompilationLevel.DYNAMO_ONCE + compilation_level >= CompilationLevel.DYNAMO_ONCE def __call__(self, *args, **kwargs): """Implement the dispatch logic here, beyond the torch.compile level. 
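Taken together, a minimal usage sketch of the new configuration flow, mirroring the updated tests in this patch (the model-construction step is only indicated, not spelled out):

# Sketch: compilation settings now travel with VllmConfig, and model code
# reads them via the current vllm config context.
import os

from vllm.config import CompilationConfig, CompilationLevel, VllmConfig
from vllm.plugins import set_current_vllm_config

# The env var is still consulted in model_post_init, so set it before
# constructing the config.
os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.PIECEWISE)

vllm_config = VllmConfig(compilation_config=CompilationConfig(
    custom_ops=["none", "+rms_norm"]))  # enable only the rms_norm custom op
with set_current_vllm_config(vllm_config):
    # Construct the model here: support_torch_compile and CustomOp subclasses
    # pick up the compilation config from the current context.
    ...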
diff --git a/vllm/config.py b/vllm/config.py index 64b2f75e092de..7e37edbe594b1 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -3,10 +3,12 @@ import json import warnings from dataclasses import dataclass, field, replace +from pathlib import Path from typing import (TYPE_CHECKING, Any, Callable, ClassVar, Dict, Final, List, Literal, Mapping, Optional, Set, Tuple, Type, Union) import torch +from pydantic import BaseModel, Field, PrivateAttr from transformers import PretrainedConfig import vllm.envs as envs @@ -2052,6 +2054,185 @@ def __post_init__(self): f"installed. Original error:\n{otel_import_error_traceback}") +class CompilationLevel: + # constants for the levels of the compilation process + NO_COMPILATION = 0 + DYNAMO_AS_IS = 1 + DYNAMO_ONCE = 2 + PIECEWISE = 3 + + +class CompilationConfig(BaseModel): + """ + Configuration for compilation. + It has three parts: + - Top-level Compilation control: + - level: the level of compilation. + - 0: no compilation. + - 1: dynamo as is. + - 2: dynamo once. + - 3: piecewise compilation. + - custom_ops: fine-grained control over which custom ops to enable/disable. + Use 'all' to enable all, 'none' to disable all. + Also specify a list of custom op names to enable (prefixed with a '+'), + or disable (prefixed with a '-'). + Examples: + - 'all,-op1' to enable all except op1 + - 'none,+op1,+op2' to enable only op1 and op2 + By default, all custom ops are enabled when running without Inductor + and disabled when running with Inductor (compile_level >= Inductor). + - CudaGraph capture: + - use_cudagraph: whether to use cudagraph inside compilation. + - False: cudagraph inside compilation is not used. + - True: cudagraph inside compilation is used. It requires + that all input buffers have fixed addresses. + Note that this is orthogonal to the cudagraph capture out + side of compilation. + TODO: move outside cudagraph logic into compilation. + torch.compile will handle cudagraph capture logic in the future. + - cudagraph_capture_sizes: sizes to capture cudagraph. + - None: capture sizes are inferred from compilation context. + - List[int]: capture sizes are specified. + - cudagraph_num_of_warmups: number of warmup runs for cudagraph. + It means the first several runs will be treated as warmup runs. + Only after that, the execution will be recorded, and the recorded + cudagraph will be used for subsequent runs. + - cudagraph_copy_inputs: whether to copy input tensors for + cudagraph. If the caller can guarantee that the same input buffers + are always used, it can set this to False. Otherwise, it should + set this to True, and the compiler will copy the input to an + internally managed buffer. Default is False. + - Inductor compilation: + - use_inductor: whether to use inductor compilation. + - False: inductor compilation is not used. graph runs in eager. + - True: inductor compilation is used. one graph for symbolic shape + is compiled. In addition, compile for different sizes specified + in inductor_compile_sizes, using configurations + in inductor_compile_config. + - inductor_compile_sizes: sizes to compile for inductor. + - inductor_specialize_for_cudagraph_no_more_than: an optional integer + to specialize inductor for cudagraph sizes no more than the + specified size. It is useful when we want to specialize inductor + with a subset of cudagraph sizes. + - inductor_compile_config: additional configurations for inductor. + - None: use default configurations. + - inductor_passes: additional passes for inductor. 
It is a dictionary + from pass name to pass function qualified name. We use function + name because the config uses json format. If we pass the config + from Python, functions can also be passed directly via Python object + constructor, e.g. `CompilationConfig(inductor_passes={"a": func})` + - custom inductor passes: + - dump_graph_stages: list of stages for which we want to dump the graph. + Each pass defines its own stages (before, after, maybe in-between). + - dump_graph_dir: directory to dump the graph. Default is . + - enable_fusion: whether to enable the custom fusion pass. + TODO better pass enabling system. + + Why we have different sizes for cudagraph and inductor: + - cudagraph: a cudagraph captured for a specific size can only be used + for the same size. We need to capture all the sizes we want to use. + - inductor: a graph compiled by inductor for a general shape can be used + for different sizes. Inductor can also compile for specific sizes, + where it can have more information to optimize the graph with fully + static shapes. However, we find the general shape compilation is + sufficient for most cases. It might be beneficial to compile for + certain small batchsizes, where inductor is good at optimizing. + """ # noqa + level: int = 0 + custom_ops: List[str] = Field(default_factory=list) + + use_inductor: bool = True + inductor_specialize_for_cudagraph_no_more_than: Optional[int] = None + inductor_compile_sizes: Optional[List[int]] = Field(default_factory=dict) + inductor_compile_config: Dict = Field(default_factory=dict) + inductor_passes: Dict[str, str] = Field(default_factory=dict) + + use_cudagraph: bool = False + non_cudagraph_ops: List[str] = Field(default_factory=list) + cudagraph_num_of_warmups: int = 0 + cudagraph_capture_sizes: Optional[List[int]] = None + cudagraph_copy_inputs: bool = False + + dump_graph_stages: List[str] = Field(default_factory=list) + dump_graph_dir: Path = Field(default=Path(".")) + enable_fusion: bool = True + + # not configurable, computed after init + compile_sizes: List[int] = PrivateAttr + capture_sizes: List[int] = PrivateAttr + + def model_post_init(self, __context: Any) -> None: + self.level = envs.VLLM_TORCH_COMPILE_LEVEL + + count_none = self.custom_ops.count("none") + count_all = self.custom_ops.count("all") + assert count_none + count_all <= 1, "Can only specify 'none' or 'all'" + + for k, v in self.inductor_passes.items(): + if not isinstance(v, str): + assert callable(v), ( + f"pass {k} should be a function or a qualified name") + self.inductor_compile_config[k] = v + continue + + # resolve function from qualified name + names = v.split(".") + module = ".".join(names[:-1]) + func_name = names[-1] + func = __import__(module).__dict__[func_name] + self.inductor_compile_config[k] = func + + def init_during_runtime(self): + """To complete the initialization of config, + we need to know the compile context, which is only available + during the first run of the model. 
+ """ + from vllm.compilation.compile_context import get_compile_context + context = get_compile_context() + context = copy.deepcopy(context) if context is not None else [] + sizes_to_specialize: List[int] = context + if self.cudagraph_capture_sizes is None: + self.capture_sizes = sizes_to_specialize + else: + self.capture_sizes = self.cudagraph_capture_sizes + logger.info(("cudagraph sizes specified by model runner" + " %s is overridden by config %s"), + sizes_to_specialize, self.cudagraph_capture_sizes) + if self.inductor_specialize_for_cudagraph_no_more_than is not None: + assert self.inductor_compile_sizes is None, ( + "inductor_compile_sizes should be None when " + "inductor_specialize_for_cudagraph_no_more_than is not None") + self.compile_sizes = [ + x for x in self.capture_sizes + if x <= self.inductor_specialize_for_cudagraph_no_more_than + ] + else: + assert self.inductor_compile_sizes is not None, ( + "inductor_compile_sizes should not be None when " + "inductor_specialize_for_cudagraph_no_more_than is None") + self.compile_sizes = self.inductor_compile_sizes + + @staticmethod + def select_and_init_config() -> "CompilationConfig": + """The order of selecting config is: + 1. Use the config specified in environment variable. + 2. Use the config specified in plugins. + 3. Use the default config. + """ + config_path = envs.VLLM_TORCH_COMPILE_CONFIG + if config_path is not None: + with open(config_path) as json_file: + config = CompilationConfig.model_validate_json( + json_file.read()) + else: + from vllm.plugins import get_compilation_config + predefined_config = get_compilation_config() + config = predefined_config if predefined_config is not None else ( + CompilationConfig()) + + return config + + @dataclass class VllmConfig: """Dataclass which contains all vllm-related configuration. This @@ -2073,6 +2254,8 @@ class VllmConfig: observability_config: Optional[ObservabilityConfig] = None prompt_adapter_config: Optional[PromptAdapterConfig] = None quant_config: Optional[QuantizationConfig] = None + compilation_config: CompilationConfig = field(default=None, + init=True) # type: ignore @staticmethod def _get_quantization_config( @@ -2133,6 +2316,12 @@ def __post_init__(self): self.quant_config = VllmConfig._get_quantization_config( self.model_config, self.load_config) + if self.compilation_config is None: + self.compilation_config = CompilationConfig.select_and_init_config( + ) + + current_platform.check_and_update_config(self) + def __str__(self): return ("model=%r, speculative_config=%r, tokenizer=%r, " "skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, " diff --git a/vllm/envs.py b/vllm/envs.py index f320e35971f94..716e835a555f1 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -69,7 +69,6 @@ VLLM_SKIP_P2P_CHECK: bool = False VLLM_TORCH_COMPILE_LEVEL: int = 0 VLLM_TORCH_COMPILE_CONFIG: Optional[str] = None - VLLM_CUSTOM_OPS: List[str] = [] VLLM_DISABLED_KERNELS: List[str] = [] VLLM_USE_V1: bool = False VLLM_ENABLE_V1_MULTIPROCESSING: bool = False @@ -217,18 +216,6 @@ def get_default_config_root(): "VLLM_TORCH_COMPILE_CONFIG": lambda: os.environ.get("VLLM_TORCH_COMPILE_CONFIG", None), - # Fine-grained control over which custom ops to enable/disable. - # Use 'all' to enable all, 'none' to disable all. - # Also specify a list of custom op names to enable (prefixed with a '+'), - # or disable (prefixed with a '-'). 
- # Examples: - # - 'all,-op1' to enable all except op1 - # - 'none,+op1,+op2' to enable only op1 and op2 - # By default, all custom ops are enabled when running without Inductor - # and disabled when running with Inductor (compile_level >= Inductor). - "VLLM_CUSTOM_OPS": - lambda: os.environ.get("VLLM_CUSTOM_OPS", "").replace(" ", "").split(","), - # local rank of the process in the distributed setting, used to determine # the GPU device id "LOCAL_RANK": diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 24d75f4df4e02..6ae7d7cf6964f 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -1,12 +1,10 @@ -from functools import lru_cache from typing import Dict, Type import torch.nn as nn -import vllm.envs as envs -from vllm.compilation.levels import CompilationLevel from vllm.logger import init_logger from vllm.platforms import current_platform +from vllm.plugins import get_current_vllm_config from vllm.utils import print_warning_once logger = init_logger(__name__) @@ -87,6 +85,8 @@ def dispatch_forward(self): @classmethod def enabled(cls) -> bool: # if no name, then it was not registered + compilation_config = get_current_vllm_config().compilation_config + custom_ops = compilation_config.custom_ops if not hasattr(cls, "name"): print_warning_once( f"Custom op {cls.__name__} was not registered, " @@ -94,22 +94,25 @@ def enabled(cls) -> bool: f"It will be enabled/disabled based on the global settings.") return CustomOp.default_on() - enabled = f"+{cls.name}" in envs.VLLM_CUSTOM_OPS - disabled = f"-{cls.name}" in envs.VLLM_CUSTOM_OPS + enabled = f"+{cls.name}" in custom_ops + disabled = f"-{cls.name}" in custom_ops assert not (enabled and disabled), f"Cannot enable and disable {cls.name}" return (CustomOp.default_on() or enabled) and not disabled - # On by default if VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.PIECEWISE - # Specifying 'all' or 'none' in VLLM_CUSTOM_OPS takes precedence. @staticmethod - @lru_cache def default_on() -> bool: - count_none = envs.VLLM_CUSTOM_OPS.count("none") - count_all = envs.VLLM_CUSTOM_OPS.count("all") - assert count_none + count_all <= 1, "Can only specify 'none' or 'all'" - return envs.VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.PIECEWISE and \ + """ + On by default if level < CompilationLevel.PIECEWISE + Specifying 'all' or 'none' in custom_ops takes precedence. + """ + from vllm.config import CompilationLevel + compilation_config = get_current_vllm_config().compilation_config + custom_ops = compilation_config.custom_ops + count_none = custom_ops.count("none") + count_all = custom_ops.count("all") + return compilation_config.level < CompilationLevel.PIECEWISE and \ not count_none > 0 or count_all > 0 # Dictionary of all custom ops (classes, indexed by registered name).
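With VLLM_CUSTOM_OPS removed, the same toggles now travel through CompilationConfig.custom_ops and are read back from the current vLLM config while the model is being constructed. A rough sketch of that flow, assuming the helpers introduced in this patch; the op name and the concrete values are illustrative only, not a recommended configuration.

    from vllm.config import CompilationConfig
    from vllm.plugins import set_compilation_config

    # Illustrative config: disable every custom op except rms_norm and
    # capture CUDA graphs inside the compiled model.
    set_compilation_config(
        CompilationConfig(
            use_cudagraph=True,
            custom_ops=["none", "+rms_norm"],
        ))

    # During model construction the loader (see the loader.py change below)
    # wraps initialization in set_current_vllm_config(vllm_config), so
    # CustomOp.enabled() can consult
    # get_current_vllm_config().compilation_config.custom_ops to choose
    # between the custom kernel and the native PyTorch implementation.
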
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 140b61fe6d56a..0f8b81c3ef40c 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -42,6 +42,7 @@ safetensors_weights_iterator) from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform +from vllm.plugins import set_current_vllm_config from vllm.utils import is_pin_memory_available @@ -97,7 +98,8 @@ def _initialize_model(vllm_config: VllmConfig, prefix: str = "") -> nn.Module: all_params = [param.name for param in signatures.parameters.values()] if "vllm_config" in all_params and "prefix" in all_params: # new-style model class - return model_class(vllm_config=vllm_config, prefix=prefix) + with set_current_vllm_config(vllm_config): + return model_class(vllm_config=vllm_config, prefix=prefix) msg = ("vLLM model class should accept `vllm_config` and `prefix` as " "input arguments. Possibly you have an old-style model class" " registered from out of tree and it is used for new vLLM version. " @@ -121,7 +123,8 @@ def _initialize_model(vllm_config: VllmConfig, prefix: str = "") -> nn.Module: kwargs["lora_config"] = vllm_config.lora_config if "scheduler_config" in all_params: kwargs["scheduler_config"] = vllm_config.scheduler_config - return model_class(**kwargs) + with set_current_vllm_config(vllm_config): + return model_class(**kwargs) class BaseModelLoader(ABC): diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index 81d8bdae2383c..970c0d1be617e 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -1,10 +1,15 @@ import enum import random -from typing import NamedTuple, Optional, Tuple, Union +from typing import TYPE_CHECKING, NamedTuple, Optional, Tuple, Union import numpy as np import torch +if TYPE_CHECKING: + from vllm.config import VllmConfig +else: + VllmConfig = None + class PlatformEnum(enum.Enum): CUDA = enum.auto() @@ -129,6 +134,19 @@ def seed_everything(cls, seed: int) -> None: np.random.seed(seed) torch.manual_seed(seed) + @classmethod + def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + """ + Check and update the configuration for the current platform. + + It can raise an exception if the configuration is not compatible with + the current platform, or it can update the configuration to make it + compatible with the current platform. + + The config is passed by reference, so it can be modified in place. + """ + pass + class UnspecifiedPlatform(Platform): _enum = PlatformEnum.UNSPECIFIED diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index 8d0ce47df4040..c2e22bfc09f22 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -1,18 +1,16 @@ import os +from typing import TYPE_CHECKING import torch -import vllm.envs as envs -from vllm.compilation.levels import CompilationLevel from vllm.plugins import set_torch_compile_backend from .interface import Platform, PlatformEnum -if "VLLM_TORCH_COMPILE_LEVEL" not in os.environ: - os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(CompilationLevel.DYNAMO_ONCE) - -assert envs.VLLM_TORCH_COMPILE_LEVEL < CompilationLevel.PIECEWISE,\ - "TPU does not support Inductor." 
+if TYPE_CHECKING: + from vllm.config import VllmConfig +else: + VllmConfig = None set_torch_compile_backend("openxla") @@ -31,3 +29,12 @@ def get_device_total_memory(cls, device_id: int = 0) -> int: @classmethod def inference_mode(cls): return torch.no_grad() + + @classmethod + def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + from vllm.config import CompilationLevel + compilation_config = vllm_config.compilation_config + if "VLLM_TORCH_COMPILE_LEVEL" not in os.environ: + compilation_config.level = CompilationLevel.DYNAMO_ONCE + assert compilation_config.level < CompilationLevel.PIECEWISE,\ + "TPU does not support Inductor." diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index 7b1bbb14c5302..c20b9ec891d5d 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -1,11 +1,11 @@ import logging +from contextlib import contextmanager from typing import TYPE_CHECKING, Callable, Optional, Union import vllm.envs as envs if TYPE_CHECKING: - from vllm.compilation.config import CompilationConfig - from vllm.config import VllmConfig + from vllm.config import CompilationConfig, VllmConfig else: CompilationConfig = None VllmConfig = None @@ -72,3 +72,29 @@ def set_compilation_config(config: Optional[CompilationConfig]): def get_compilation_config() -> Optional[CompilationConfig]: return _compilation_config + + +_current_vllm_config: Optional[VllmConfig] = None + + +@contextmanager +def set_current_vllm_config(vllm_config: VllmConfig): + """ + Temporarily set the current VLLM config. + Used during model initialization. + We save the current VLLM config in a global variable, + so that all modules can access it, e.g. custom ops + can access the VLLM config to determine how to dispatch. + """ + global _current_vllm_config + old_vllm_config = _current_vllm_config + try: + _current_vllm_config = vllm_config + yield + finally: + _current_vllm_config = old_vllm_config + + +def get_current_vllm_config() -> VllmConfig: + assert _current_vllm_config is not None, "Current VLLM config is not set." + return _current_vllm_config diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index eebd1de96537f..d60f93a44f6dd 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1,4 +1,3 @@ -import os import time from dataclasses import dataclass from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple @@ -8,11 +7,8 @@ import torch.distributed import torch.nn as nn -from vllm import envs from vllm.compilation.compile_context import set_compile_context -from vllm.compilation.config import CompilationConfig -from vllm.compilation.levels import CompilationLevel -from vllm.config import VllmConfig +from vllm.config import CompilationConfig, CompilationLevel, VllmConfig from vllm.forward_context import set_forward_context from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import init_logger @@ -99,7 +95,7 @@ def __init__( pin_memory=self.pin_memory, ) - self.use_cuda_graph = (envs.VLLM_TORCH_COMPILE_LEVEL + self.use_cuda_graph = (self.vllm_config.compilation_config.level == CompilationLevel.PIECEWISE and not self.model_config.enforce_eager) # TODO(woosuk): Provide an option to tune the max cudagraph batch size. @@ -517,9 +513,9 @@ def load_model(self) -> None: # CUDA graphs do not work properly with the custom CUDA kernels. # FIXME(woosuk): Disable inductor to reduce the compilation time # and avoid any potential issues with the inductor. 
- os.environ["VLLM_CUSTOM_OPS"] = "none" set_compilation_config( CompilationConfig( + custom_ops=["none"], use_cudagraph=True, non_cudagraph_ops=["vllm.unified_v1_flash_attention"], use_inductor=True, diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 22ee3f9f863e4..fd89f95445565 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -19,8 +19,7 @@ from vllm.attention.backends.abstract import AttentionState from vllm.attention.backends.utils import CommonAttentionState from vllm.compilation.compile_context import set_compile_context -from vllm.compilation.levels import CompilationLevel -from vllm.config import VllmConfig +from vllm.config import CompilationLevel, VllmConfig from vllm.core.scheduler import SchedulerOutputs from vllm.distributed import get_pp_group from vllm.distributed.parallel_state import graph_capture @@ -1142,8 +1141,8 @@ def load_model(self) -> None: "provided. Defaulting to scaling factors of 1.0. " "This may lead to less accurate results!") - if envs.VLLM_TORCH_COMPILE_LEVEL == CompilationLevel.DYNAMO_AS_IS \ - and supports_dynamo(): + if self.vllm_config.compilation_config.level ==\ + CompilationLevel.DYNAMO_AS_IS and supports_dynamo(): from vllm.plugins import get_torch_compile_backend backend = get_torch_compile_backend() or "eager" self.model = torch.compile( diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index a721186137328..d7a641857a613 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -140,7 +140,7 @@ def load_model(self) -> None: model = get_model(vllm_config=self.vllm_config) model = model.eval() xm.wait_device_ops() - self.model = ModelWrapper(model) + self.model = ModelWrapper(model, self.vllm_config) def _dummy_run( self, @@ -669,13 +669,15 @@ def execute_model( class ModelWrapper(TorchCompileWrapperWithCustomDispatcher): - def __init__(self, model: nn.Module): + def __init__(self, model: nn.Module, vllm_config: VllmConfig): self.model = model compiled_callable = torch.compile(self.forward, backend="openxla", fullgraph=True, dynamic=False) - super().__init__(compiled_callable) + super().__init__( + compiled_callable, + compilation_level=vllm_config.compilation_config.level) def __call__(self, *args, is_prompt: bool, **kwargs): if len(self.compiled_codes) < 3 or not self.use_custom_dispatcher: From 643ecf7b11a3e74c838f438cfc1b3e59c018853b Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Sat, 16 Nov 2024 21:18:46 -0800 Subject: [PATCH 171/183] [V1] Refactor model executable interface for all text-only language models (#10374) Signed-off-by: Roger Wang --- vllm/model_executor/models/arctic.py | 16 ++++++++++++++-- vllm/model_executor/models/baichuan.py | 16 ++++++++++++++-- vllm/model_executor/models/bloom.py | 17 ++++++++++++++--- vllm/model_executor/models/commandr.py | 16 ++++++++++++++-- vllm/model_executor/models/dbrx.py | 16 ++++++++++++++-- vllm/model_executor/models/deepseek.py | 16 ++++++++++++++-- vllm/model_executor/models/deepseek_v2.py | 16 ++++++++++++++-- vllm/model_executor/models/eagle.py | 13 ++++++++++--- vllm/model_executor/models/exaone.py | 7 ++++++- vllm/model_executor/models/falcon.py | 16 ++++++++++++++-- vllm/model_executor/models/gemma.py | 7 ++++++- vllm/model_executor/models/gemma2.py | 12 ++++++++++-- vllm/model_executor/models/gpt2.py | 7 +++++-- vllm/model_executor/models/gpt_bigcode.py | 17 +++++++++++++---- vllm/model_executor/models/gpt_j.py | 16 ++++++++++++++-- 
vllm/model_executor/models/gpt_neox.py | 16 ++++++++++++++-- vllm/model_executor/models/granite.py | 7 ++++++- vllm/model_executor/models/granitemoe.py | 16 ++++++++++++++-- vllm/model_executor/models/internlm2.py | 9 +++++++-- vllm/model_executor/models/jais.py | 14 ++++++++++++-- vllm/model_executor/models/jamba.py | 16 ++++++++++++++-- vllm/model_executor/models/mamba.py | 15 +++++++++++++-- vllm/model_executor/models/minicpm.py | 7 ++++++- vllm/model_executor/models/mixtral.py | 16 ++++++++++++++-- vllm/model_executor/models/mixtral_quant.py | 16 ++++++++++++++-- vllm/model_executor/models/mpt.py | 16 ++++++++++++++-- vllm/model_executor/models/nemotron.py | 7 ++++++- vllm/model_executor/models/olmo.py | 19 +++++++++++++------ vllm/model_executor/models/olmoe.py | 16 ++++++++++++++-- vllm/model_executor/models/orion.py | 16 ++++++++++++++-- vllm/model_executor/models/persimmon.py | 8 +++++++- vllm/model_executor/models/phi.py | 16 ++++++++++++++-- vllm/model_executor/models/phi3_small.py | 19 +++++++++++-------- vllm/model_executor/models/phimoe.py | 16 ++++++++++++++-- vllm/model_executor/models/qwen.py | 16 ++++++++++++++-- vllm/model_executor/models/qwen2.py | 2 +- vllm/model_executor/models/qwen2_cls.py | 7 ++++++- vllm/model_executor/models/qwen2_moe.py | 16 ++++++++++++++-- vllm/model_executor/models/qwen2_rm.py | 7 ++++++- vllm/model_executor/models/solar.py | 4 +++- vllm/model_executor/models/stablelm.py | 16 ++++++++++++++-- vllm/model_executor/models/starcoder2.py | 16 ++++++++++++++-- vllm/model_executor/models/xverse.py | 16 ++++++++++++++-- 43 files changed, 483 insertions(+), 90 deletions(-) diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index 9ee2a2cc09a24..d52418ee0f6f1 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -389,6 +389,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -396,9 +399,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) else: assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] @@ -439,6 +446,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -446,9 +456,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git 
a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index aabbd31192a40..01ce7c42cd391 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -284,6 +284,9 @@ def __init__( make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -291,9 +294,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) residual = None else: assert intermediate_tensors is not None @@ -363,6 +370,9 @@ def __init__( self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -370,9 +380,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index 84adf574af5e2..cf2eee8172769 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -251,6 +251,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.word_embeddings_layernorm(self.word_embeddings(input_ids)) + def forward( self, input_ids: torch.Tensor, @@ -258,10 +261,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.word_embeddings(input_ids) - hidden_states = self.word_embeddings_layernorm(hidden_states) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) else: assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] @@ -301,6 +307,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.transformer.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -308,9 +317,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, 
IntermediateTensors]: hidden_states = self.transformer(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index cd5c1d6844716..fbb09a64cde9b 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -280,6 +280,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -287,9 +290,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) residual = None else: assert intermediate_tensors is not None @@ -354,6 +361,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + @torch.no_grad() def forward( self, @@ -362,9 +372,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index fff8710f6b475..3952ff31e5cec 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -321,6 +321,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], config.d_model)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.wte(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -328,9 +331,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.wte(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) else: assert intermediate_tensors hidden_states = intermediate_tensors["hidden_states"] @@ -376,6 +383,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.transformer.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -383,9 +393,11 @@ def forward( kv_caches: 
List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.transformer(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index a9bf1440c4d60..36dfea5a65656 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -353,6 +353,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -360,9 +363,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) residual = None else: hidden_states = intermediate_tensors["hidden_states"] @@ -401,6 +408,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -408,9 +418,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 4fb1eed15a2e7..1e32fe60c7a5b 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -445,6 +445,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -452,9 +455,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) residual = None else: assert intermediate_tensors is not None @@ -495,6 +502,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( 
self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -502,9 +512,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py index 85c51e8404584..f138d13630263 100644 --- a/vllm/model_executor/models/eagle.py +++ b/vllm/model_executor/models/eagle.py @@ -78,6 +78,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): def sampler(self): return self.model.sampler + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -86,11 +89,14 @@ def forward( attn_metadata: AttentionMetadata, previous_hidden_states: torch.Tensor, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - tok_embeds = self.model.model.embed_tokens(input_ids) + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings(input_ids) + inputs_embeds = self.fc( - torch.cat([tok_embeds, previous_hidden_states], dim=-1)) + torch.cat([inputs_embeds, previous_hidden_states], dim=-1)) inputs_embeds[positions == 0] = 0 # masking inputs at position=0 @@ -100,7 +106,8 @@ def forward( positions=positions, kv_caches=kv_caches, attn_metadata=attn_metadata, - intermediate_tensors=intermediate_tensors) + intermediate_tensors=intermediate_tensors, + ) return hidden_states def compute_logits(self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index cd3e7da657e0e..52dd603ca558d 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -479,6 +479,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -486,9 +489,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: model_output = self.transformer(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return model_output def compute_logits( diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index b3dbf063ac298..e97abe949ccdb 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -367,6 +367,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return 
self.word_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -374,9 +377,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.word_embeddings(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) else: hidden_states = intermediate_tensors["hidden_states"] for i in range(self.start_layer, self.end_layer): @@ -432,6 +439,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.transformer.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.LongTensor, @@ -439,9 +449,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: hidden_states = self.transformer(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index 55baba809e58f..ace13664c6ea6 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -390,6 +390,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -397,9 +400,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index eeb3fd98a7eac..a60b4e73a76d4 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -272,6 +272,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: Optional[torch.Tensor], @@ -285,7 +288,7 @@ def forward( if inputs_embeds is not None: hidden_states = inputs_embeds else: - hidden_states = self.embed_tokens(input_ids) + hidden_states = self.get_input_embeddings(input_ids) hidden_states *= self.normalizer residual = None else: @@ -414,6 +417,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return 
self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -421,9 +427,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index cc85693f99526..fa0fdad28d161 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -209,6 +209,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], config.n_embd)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.wte(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -220,7 +223,7 @@ def forward( ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: if inputs_embeds is None: - inputs_embeds = self.wte(input_ids) + inputs_embeds = self.get_input_embeddings(input_ids) position_embeds = self.wpe(position_ids) hidden_states = inputs_embeds + position_embeds else: @@ -262,7 +265,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.transformer.make_empty_intermediate_tensors) def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.transformer.wte(input_ids) + return self.transformer.get_input_embeddings(input_ids) def forward( self, diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index ab25c66c3a887..b2fc79d0d36dc 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -218,6 +218,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], config.n_embd)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.wte(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -225,11 +228,12 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - inputs_embeds = self.wte(input_ids) - position_embeds = self.wpe(position_ids) - hidden_states = inputs_embeds + position_embeds + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings(input_ids) + hidden_states = inputs_embeds + self.wpe(position_ids) else: hidden_states = intermediate_tensors["hidden_states"] @@ -285,6 +289,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.transformer.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -292,9 +299,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.transformer(input_ids, positions, kv_caches, - 
attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index a83d03480dde1..cec3fd12a67d6 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -201,6 +201,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], config.n_embd)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.wte(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -208,9 +211,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.wte(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) else: hidden_states = intermediate_tensors["hidden_states"] for i in range(self.start_layer, self.end_layer): @@ -250,6 +257,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.transformer.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -257,9 +267,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.transformer(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 794b141bfa4aa..11f286d6bcba0 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -214,6 +214,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_in(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -221,9 +224,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_in(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) else: hidden_states = intermediate_tensors["hidden_states"] for i in range(self.start_layer, self.end_layer): @@ -262,6 +269,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.gpt_neox.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.gpt_neox.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -269,9 +279,11 @@ def forward( kv_caches: List[torch.Tensor], 
attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.gpt_neox(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index d1e6e31f2b8d1..cb2583e69d88d 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -409,6 +409,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): else: self.lm_head = PPMissingLayer() + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -416,9 +419,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: model_output = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return model_output def compute_logits( diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index 2ed115c56af45..f437dd521a7d5 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -277,6 +277,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -284,9 +287,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) hidden_states *= self.embedding_multiplier residual = None else: @@ -366,6 +373,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.sampler = get_sampler() + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -373,9 +383,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 21fa6983063b8..19bfe16e4d5fc 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -290,7 +290,7 @@ def forward( if inputs_embeds is not None: hidden_states = inputs_embeds else: - hidden_states = self.tok_embeddings(input_ids) + hidden_states = self.get_input_embeddings(input_ids) residual = None else: 
assert intermediate_tensors is not None @@ -335,6 +335,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -342,9 +345,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index 65800c44e5a93..ee49ffb3cd87f 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -250,6 +250,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], config.n_embd)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.wte(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -257,9 +260,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[IntermediateTensors, torch.Tensor]: if get_pp_group().is_first_rank: - inputs_embeds = self.wte(input_ids) + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings(input_ids) if self.wpe is not None: position_embeds = self.wpe(position_ids) hidden_states = inputs_embeds + position_embeds @@ -311,6 +316,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.transformer.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -318,9 +326,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[IntermediateTensors, torch.Tensor]: hidden_states = self.transformer(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 88fb8d5cf555a..5612dd6886385 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -292,6 +292,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.final_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -299,8 +302,12 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, mamba_cache_params: MambaCacheParams, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = 
self.get_input_embeddings(input_ids) residual = None for i in range(len(self.layers)): layer = self.layers[i] @@ -381,12 +388,16 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.vocab_size) self.sampler = get_sampler() + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[KVCache], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs): if self.mamba_cache is None: max_batch_size = (_get_graph_batch_size( @@ -409,7 +420,8 @@ def forward(self, mamba_cache_tensors[1], state_indices_tensor) hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, mamba_cache_params) + attn_metadata, mamba_cache_params, + inputs_embeds) return hidden_states def copy_inputs_before_cuda_graphs(self, input_buffers, **kwargs): diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index 55c575e22a0f6..ac0d265a961f0 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -106,15 +106,22 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.norm_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, positions: torch.Tensor, attn_metadata: AttentionMetadata, mamba_cache_params: MambaCacheParams, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: - hidden_states = self.embeddings(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) residual = None for i in range(len(self.layers)): @@ -168,12 +175,16 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config.vocab_size) self.sampler = get_sampler() + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.backbone.get_input_embeddings(input_ids) + def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[KVCache], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, **kwargs): if self.mamba_cache is None: max_batch_size = (_get_graph_batch_size( @@ -194,7 +205,7 @@ def forward(self, state_indices_tensor) hidden_states = self.backbone(input_ids, positions, attn_metadata, - mamba_cache_params) + mamba_cache_params, inputs_embeds) return hidden_states diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 2db953329fd91..6b67266c53362 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -504,6 +504,9 @@ def _init_model(self, *, vllm_config: VllmConfig, prefix: str = ""): self.model = MiniCPMModel(vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model")) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -511,9 +514,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: 
hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index 3eb2f60fd4fc7..eebf5bab5a288 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -281,6 +281,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -288,9 +291,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) residual = None else: assert intermediate_tensors is not None @@ -363,6 +370,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -370,9 +380,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index 95cfb6f54dc10..af2e9586988df 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -318,6 +318,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -325,9 +328,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) residual = None else: assert intermediate_tensors is not None @@ -368,6 +375,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -375,9 +385,11 @@ def forward( kv_caches: 
List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index e15c0fe8db060..3c74ef2448abb 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -237,6 +237,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], config.d_model)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.wte(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -244,9 +247,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.wte(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) else: assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] @@ -283,6 +290,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.transformer.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.transformer.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -290,9 +300,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.transformer(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index e09d7088a69ce..eb45beae7d21a 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -440,6 +440,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -447,9 +450,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: model_output = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return model_output def compute_logits( diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 3467ae5896494..98d4e1ec320a4 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -248,6 +248,9 @@ def __init__(self, *, 
vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -255,17 +258,16 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: """ :param input_ids: A tensor of shape `(batch_size, seq_len)`. """ if get_pp_group().is_first_rank: - # Get embeddings of input. - # shape: (batch_size, seq_len, d_model) - inputs_embeds = self.embed_tokens(input_ids) - - # embed positions - hidden_states = inputs_embeds + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) else: assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] @@ -315,6 +317,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -322,6 +327,7 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model( input_ids=input_ids, @@ -329,6 +335,7 @@ def forward( kv_caches=kv_caches, attn_metadata=attn_metadata, intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, ) return hidden_states diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index 3d31919edd862..f4eebab8c98dd 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -269,6 +269,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -276,9 +279,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) residual = None else: assert intermediate_tensors is not None @@ -326,6 +333,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -333,9 +343,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = 
self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits(self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 38821c8288347..39d659c49cbcf 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -237,6 +237,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): "hidden_states", ], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -244,9 +247,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) else: assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] @@ -286,6 +293,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -293,9 +303,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index 2e34a7cc30873..62c509153a111 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -235,6 +235,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -248,7 +251,7 @@ def forward( if inputs_embeds is not None: hidden_states = inputs_embeds else: - hidden_states = self.embed_tokens(input_ids) + hidden_states = self.get_input_embeddings(input_ids) else: assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] @@ -282,6 +285,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index 262f6996fc374..a2ab0d74c48db 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -218,6 +218,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): 
make_empty_intermediate_tensors_factory(["hidden_states"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -225,9 +228,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) else: assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] @@ -303,6 +310,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -310,9 +320,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py index 8a5fb6d303e60..2139cec441807 100644 --- a/vllm/model_executor/models/phi3_small.py +++ b/vllm/model_executor/models/phi3_small.py @@ -324,11 +324,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], config.hidden_size)) - def get_input_embeddings(self): - return self.embed_tokens - - def set_input_embeddings(self, value): - self.embed_tokens = value + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) def forward( self, @@ -337,9 +334,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor], ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) if (self.mup_embedding_multiplier is not None and self.mup_embedding_multiplier > 0.0): hidden_states = hidden_states * self.mup_embedding_multiplier @@ -397,8 +398,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): else: self.dummy_token_indices = None - def get_input_embeddings(self): - return self.model.embed_tokens + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) def set_input_embeddings(self, value): self.model.embed_tokens = value @@ -433,6 +434,7 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: output_hidden_states = self.model( input_ids=input_ids, @@ -440,6 +442,7 @@ def 
forward( kv_caches=kv_caches, attn_metadata=attn_metadata, intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, ) output_hidden_states = output_hidden_states return output_hidden_states diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 6d71a8949111b..b7e70f8fa2c6d 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -465,6 +465,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -472,9 +475,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) residual = None else: assert intermediate_tensors is not None @@ -560,6 +567,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -567,9 +577,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits(self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 3d26ede722dd1..447632cefcd9a 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -578,6 +578,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): quant_config=quant_config) if hasattr( config, "visual") else None + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.wte(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -586,6 +589,7 @@ def forward( attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], pixel_values: Optional[QwenImageInputs], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: img_pos = None # If pixel / visual embeddings are provided, this is a visual model @@ -606,6 +610,10 @@ def forward( ) if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) hidden_states = self.wte(input_ids) # Merge the image embeddings into the hidden states if actually have # visual features and the corresponding image tokens @@ -915,6 +923,9 @@ def _get_image_input_type( ) return None + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.transformer.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ 
-922,7 +933,8 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, - pixel_values: Optional[torch.Tensor] = None + pixel_values: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if intermediate_tensors is not None: input_ids = None @@ -932,7 +944,7 @@ def forward( hidden_states = self.transformer(input_ids, positions, kv_caches, attn_metadata, intermediate_tensors, - pixel_values) + pixel_values, inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 431e397e1e10d..8f10df808c216 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -309,7 +309,7 @@ def forward( if inputs_embeds is not None: hidden_states = inputs_embeds else: - hidden_states = self.embed_tokens(input_ids) + hidden_states = self.get_input_embeddings(input_ids) residual = None else: assert intermediate_tensors is not None diff --git a/vllm/model_executor/models/qwen2_cls.py b/vllm/model_executor/models/qwen2_cls.py index 120403e948686..07eb330620a43 100644 --- a/vllm/model_executor/models/qwen2_cls.py +++ b/vllm/model_executor/models/qwen2_cls.py @@ -72,6 +72,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): normalize=False, softmax=True) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -79,9 +82,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) logits, _ = self.score(hidden_states) return logits diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 51c0cd5664fd2..249d94b5d95e9 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -344,6 +344,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -351,9 +354,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) residual = None else: assert intermediate_tensors is not None @@ -395,6 +402,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -402,9 +412,11 @@ def forward( kv_caches: List[torch.Tensor], 
attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 55843d8325348..6db467af334f5 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -85,6 +85,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -92,9 +95,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) logits, _ = self.score(hidden_states) return logits diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index 4f03ca501fb68..affb2c975ce4a 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -456,9 +456,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: model_output = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return model_output def compute_logits(self, hidden_states: torch.Tensor, diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 1125f9e9f9617..99acce596602e 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -218,6 +218,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -225,9 +228,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) else: assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] @@ -265,6 +272,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, 
@@ -272,9 +282,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index ce7a7957f52c4..0ef940acebb93 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -221,6 +221,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory(["hidden_states"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -228,9 +231,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) else: assert intermediate_tensors is not None hidden_states = intermediate_tensors["hidden_states"] @@ -273,6 +280,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -280,9 +290,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( diff --git a/vllm/model_executor/models/xverse.py b/vllm/model_executor/models/xverse.py index 153527da20d75..51172d8782a70 100644 --- a/vllm/model_executor/models/xverse.py +++ b/vllm/model_executor/models/xverse.py @@ -252,6 +252,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): make_empty_intermediate_tensors_factory( ["hidden_states", "residual"], config.hidden_size)) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -259,9 +262,13 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: if get_pp_group().is_first_rank: - hidden_states = self.embed_tokens(input_ids) + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.get_input_embeddings(input_ids) residual = None else: hidden_states = intermediate_tensors["hidden_states"] @@ -335,6 +342,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): 
self.make_empty_intermediate_tensors = ( self.model.make_empty_intermediate_tensors) + def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.get_input_embeddings(input_ids) + def forward( self, input_ids: torch.Tensor, @@ -342,9 +352,11 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + inputs_embeds) return hidden_states def compute_logits( From 905d0f0af4e2c07893e36778da9ab02bde01ace8 Mon Sep 17 00:00:00 2001 From: "Chendi.Xue" Date: Sun, 17 Nov 2024 00:58:22 -0600 Subject: [PATCH 172/183] [CI/Build] Fix IDC hpu [Device not found] issue (#10384) Signed-off-by: Chendi Xue --- .buildkite/run-hpu-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/run-hpu-test.sh b/.buildkite/run-hpu-test.sh index 4505dc7a9373c..fa4f74fca7a11 100644 --- a/.buildkite/run-hpu-test.sh +++ b/.buildkite/run-hpu-test.sh @@ -13,4 +13,4 @@ trap remove_docker_container EXIT remove_docker_container # Run the image and launch offline inference -docker run --runtime=habana --name=hpu-test --network=host -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py \ No newline at end of file +docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py \ No newline at end of file From cf349c4a97adb36354bdc2b14448ea55279d1575 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Sun, 17 Nov 2024 15:12:04 +0800 Subject: [PATCH 173/183] [Bugfix][CPU] Fix CPU embedding runner with tensor parallel (#10394) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/worker/cpu_embedding_model_runner.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vllm/worker/cpu_embedding_model_runner.py b/vllm/worker/cpu_embedding_model_runner.py index 7053075bf4d8f..d0b8fec48d74f 100644 --- a/vllm/worker/cpu_embedding_model_runner.py +++ b/vllm/worker/cpu_embedding_model_runner.py @@ -66,6 +66,10 @@ def execute_model( hidden_states = model_executable(**execute_model_kwargs) + # Only perform pooling in the driver worker. 
+ if not self.is_driver_worker: + return [] + return [ self.model.pooler(hidden_states=hidden_states, pooling_metadata=model_input.pooling_metadata) From 8d74b5aee9e780852de870c936b59707835e84f5 Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sat, 16 Nov 2024 23:14:23 -0800 Subject: [PATCH 174/183] [platforms] refactor cpu code (#10402) Signed-off-by: youkaichao --- vllm/executor/cpu_executor.py | 68 +---------------------------------- vllm/platforms/cpu.py | 60 +++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 67 deletions(-) diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index 4ceb5a837dd7f..1542a2ae367eb 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -2,9 +2,6 @@ from functools import partial from typing import Any, Awaitable, List, Optional, Set, Tuple, Union -import vllm.envs as envs -from vllm.config import (CacheConfig, ModelConfig, ParallelConfig, - SchedulerConfig) from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper, ResultHandler, WorkerMonitor) @@ -13,7 +10,7 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.prompt_adapter.request import PromptAdapterRequest from vllm.sequence import ExecuteModelRequest -from vllm.utils import (GiB_bytes, get_distributed_init_method, get_open_port, +from vllm.utils import (get_distributed_init_method, get_open_port, get_vllm_instance_id, make_async) from vllm.worker.worker_base import WorkerWrapperBase @@ -57,13 +54,6 @@ def _init_executor(self) -> None: os.environ["LOCAL_WORLD_SIZE"] = str( self.parallel_config.tensor_parallel_size) - self.model_config = _verify_and_get_model_config(self.model_config) - self.cache_config = _verify_and_get_cache_config(self.cache_config) - self.scheduler_config = _verify_and_get_scheduler_config( - self.scheduler_config) - self.parallel_config = _verify_and_get_parallel_config( - self.parallel_config) - # Multiprocessing-based executor does not support multi-node setting. # Since it only works for single node, we can use the loopback address # 127.0.0.1 for communication. 
@@ -313,62 +303,6 @@ async def check_health_async(self) -> None: self.check_health() -def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig: - # Reminder: Please update docs/source/serving/compatibility_matrix.rst - # If the feature combo become valid - if not config.enforce_eager: - logger.warning( - "CUDA graph is not supported on CPU, fallback to the eager " - "mode.") - config.enforce_eager = True - return config - - -def _verify_and_get_scheduler_config( - config: SchedulerConfig) -> SchedulerConfig: - # Reminder: Please update docs/source/serving/compatibility_matrix.rst - # If the feature combo become valid - if config.chunked_prefill_enabled: - logger.warning("Chunked prefill is not supported on CPU, disable it.") - config.chunked_prefill_enabled = False - - return config - - -def _verify_and_get_cache_config(config: CacheConfig) -> CacheConfig: - # Reminder: Please update docs/source/serving/compatibility_matrix.rst - # If the feature combo become valid - if config.enable_prefix_caching: - logger.warning("Prefix caching is not supported on CPU, disable it.") - config.enable_prefix_caching = False - - kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE - - if kv_cache_space >= 0: - if kv_cache_space == 0: - config.cpu_kvcache_space_bytes = 4 * GiB_bytes # type: ignore - logger.warning("Environment variable VLLM_CPU_KVCACHE_SPACE (GB) " - "for CPU backend is not set, using 4 by default.") - else: - config.cpu_kvcache_space_bytes = kv_cache_space * GiB_bytes # type: ignore - else: - raise RuntimeError( - "Invalid environment variable VLLM_CPU_KVCACHE_SPACE" - f" {kv_cache_space}, expect a positive integer value.") - - return config - - -def _verify_and_get_parallel_config(config: ParallelConfig) -> ParallelConfig: - if (config.distributed_executor_backend is not None - and config.distributed_executor_backend != "mp"): - logger.warning( - "%s is not supported on CPU, fallback to mp distributed executor " - "backend.", config.distributed_executor_backend) - config.distributed_executor_backend = "mp" - return config - - def _driver_method_invoker(driver, method: str, *args, **kwargs): return getattr(driver, method)(*args, **kwargs) diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index 5243f59203afc..42bee31dfb0e9 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -1,8 +1,19 @@ +from typing import TYPE_CHECKING + import psutil import torch +from vllm.logger import init_logger + from .interface import Platform, PlatformEnum +if TYPE_CHECKING: + from vllm.config import VllmConfig +else: + VllmConfig = None + +logger = init_logger(__name__) + class CpuPlatform(Platform): _enum = PlatformEnum.CPU @@ -18,3 +29,52 @@ def get_device_total_memory(cls, device_id: int = 0) -> int: @classmethod def inference_mode(cls): return torch.no_grad() + + @classmethod + def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + import vllm.envs as envs + from vllm.utils import GiB_bytes + model_config = vllm_config.model_config + # Reminder: Please update docs/source/serving/compatibility_matrix.rst + # If the feature combo become valid + if not model_config.enforce_eager: + logger.warning( + "CUDA graph is not supported on CPU, fallback to the eager " + "mode.") + model_config.enforce_eager = True + + cache_config = vllm_config.cache_config + + if cache_config.enable_prefix_caching: + logger.warning( + "Prefix caching is not supported on CPU, disable it.") + cache_config.enable_prefix_caching = False + + kv_cache_space = envs.VLLM_CPU_KVCACHE_SPACE + + if 
kv_cache_space >= 0: + if kv_cache_space == 0: + cache_config.cpu_kvcache_space_bytes = 4 * GiB_bytes # type: ignore + logger.warning( + "Environment variable VLLM_CPU_KVCACHE_SPACE (GB) " + "for CPU backend is not set, using 4 by default.") + else: + cache_config.cpu_kvcache_space_bytes = kv_cache_space * GiB_bytes # type: ignore # noqa + else: + raise RuntimeError( + "Invalid environment variable VLLM_CPU_KVCACHE_SPACE" + f" {kv_cache_space}, expect a positive integer value.") + + scheduler_config = vllm_config.scheduler_config + if scheduler_config.chunked_prefill_enabled: + logger.warning( + "Chunked prefill is not supported on CPU, disable it.") + scheduler_config.chunked_prefill_enabled = False + + parallel_config = vllm_config.parallel_config + if (parallel_config.distributed_executor_backend is not None + and parallel_config.distributed_executor_backend != "mp"): + logger.warning(("%s is not supported on CPU, fallback to mp " + "distributed executor backend."), + parallel_config.distributed_executor_backend) + parallel_config.distributed_executor_backend = "mp" From 76aab90ab68476c353ad58019fd51fd18622056a Mon Sep 17 00:00:00 2001 From: Kunshang Ji Date: Sun, 17 Nov 2024 16:44:44 +0800 Subject: [PATCH 175/183] [Hardware] [HPU]add `mark_step` for hpu (#10239) Signed-off-by: Kunshang Ji --- vllm/worker/hpu_model_runner.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/vllm/worker/hpu_model_runner.py b/vllm/worker/hpu_model_runner.py index 1ff30d685c6b1..99cf9a7e67256 100644 --- a/vllm/worker/hpu_model_runner.py +++ b/vllm/worker/hpu_model_runner.py @@ -272,6 +272,19 @@ def precompute_indices_and_offsets(block_size, slot_mapping, is_prompt): return indices, offsets +def modify_decoder_layer(module: torch.nn.Module, suffix="DecoderLayer"): + if module.__class__.__name__.endswith(suffix): + + def forward_hook(module, args, output): + htorch.core.mark_step() + return output + + module.register_forward_hook(forward_hook) + + for child_name, child_module in module.named_children(): + modify_decoder_layer(child_module) + + class HpuModelAdapter: def __init__(self, model, block_size, dtype, enforce_eager): @@ -636,6 +649,7 @@ def load_model(self) -> None: else: self.model = self.model.to("hpu") htcore.mark_step() + modify_decoder_layer(self.model) torch.hpu.synchronize() with HabanaMemoryProfiler() as m_wrap: From 80d85c5d7bc33ce0ae210ebad3c45e4361b57640 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=94=B5=E8=84=91=E6=98=9F=E4=BA=BA?= Date: Sun, 17 Nov 2024 16:50:24 +0800 Subject: [PATCH 176/183] [Bugfix] Fix mrope_position_delta in non-last prefill chunk (#10403) Signed-off-by: imkero --- vllm/model_executor/layers/rotary_embedding.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index b01e4c61fe101..117fe086e5e87 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -922,9 +922,9 @@ def get_input_positions( torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) - llm_positions = llm_positions[:, context_len:seq_len] mrope_position_delta = (llm_positions.max() + 1 - len(input_tokens)).item() + llm_positions = llm_positions[:, context_len:seq_len] return llm_positions.tolist(), mrope_position_delta From d1557e66d3227355e5aed8018a945a5e6a733147 Mon Sep 17 00:00:00 2001 From: wchen61 Date: Sun, 17 Nov 2024 19:32:40 +0800 Subject: 
[PATCH 177/183] [Misc] Enhance offline_inference to support user-configurable paramet… (#10392)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: wchen61 
---
 examples/offline_inference.py | 98 ++++++++++++++++++++++++++++-------
 1 file changed, 78 insertions(+), 20 deletions(-)

diff --git a/examples/offline_inference.py b/examples/offline_inference.py
index 9b758fa2479f6..391ac6b9b6b03 100644
--- a/examples/offline_inference.py
+++ b/examples/offline_inference.py
@@ -1,22 +1,80 @@
+from dataclasses import asdict
+
 from vllm import LLM, SamplingParams
+from vllm.engine.arg_utils import EngineArgs
+from vllm.utils import FlexibleArgumentParser
+
+
+def get_prompts(num_prompts: int):
+    # The default sample prompts.
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+
+    if num_prompts != len(prompts):
+        prompts = (prompts * ((num_prompts // len(prompts)) + 1))[:num_prompts]
+
+    return prompts
+
+
+def main(args):
+    # Create prompts
+    prompts = get_prompts(args.num_prompts)
+
+    # Create a sampling params object.
+    sampling_params = SamplingParams(n=args.n,
+                                     temperature=args.temperature,
+                                     top_p=args.top_p,
+                                     top_k=args.top_k,
+                                     max_tokens=args.max_tokens)
+
+    # Create an LLM.
+    # The default model is 'facebook/opt-125m'
+    engine_args = EngineArgs.from_cli_args(args)
+    llm = LLM(**asdict(engine_args))
+
+    # Generate texts from the prompts.
+    # The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
+    outputs = llm.generate(prompts, sampling_params)
+    # Print the outputs.
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+
+if __name__ == '__main__':
+    parser = FlexibleArgumentParser()
+    parser = EngineArgs.add_cli_args(parser)
+    group = parser.add_argument_group("SamplingParams options")
+    group.add_argument("--num-prompts",
+                       type=int,
+                       default=4,
+                       help="Number of prompts used for inference")
+    group.add_argument("--max-tokens",
+                       type=int,
+                       default=16,
+                       help="Generated output length for sampling")
+    group.add_argument('--n',
+                       type=int,
+                       default=1,
+                       help='Number of generated sequences per prompt')
+    group.add_argument('--temperature',
+                       type=float,
+                       default=0.8,
+                       help='Temperature for text generation')
+    group.add_argument('--top-p',
+                       type=float,
+                       default=0.95,
+                       help='top_p for text generation')
+    group.add_argument('--top-k',
+                       type=int,
+                       default=-1,
+                       help='top_k for text generation')
-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-# Create a sampling params object.
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-
-# Create an LLM.
-llm = LLM(model="facebook/opt-125m")
-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-outputs = llm.generate(prompts, sampling_params)
-# Print the outputs.
-for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + args = parser.parse_args() + main(args) From c4e464333eac5a46e1cc2701e095a44057c82927 Mon Sep 17 00:00:00 2001 From: Isotr0py <2037008807@qq.com> Date: Mon, 18 Nov 2024 09:07:46 +0800 Subject: [PATCH 178/183] [Misc] Add uninitialized params tracking for `AutoWeightsLoader` (#10327) Signed-off-by: Isotr0py <2037008807@qq.com> --- vllm/model_executor/model_loader/loader.py | 12 +++++++++++- vllm/model_executor/models/arctic.py | 8 ++++++-- vllm/model_executor/models/baichuan.py | 8 ++++++-- vllm/model_executor/models/bert.py | 8 ++++++-- vllm/model_executor/models/blip.py | 12 ++++++++---- vllm/model_executor/models/blip2.py | 7 ++++--- vllm/model_executor/models/bloom.py | 8 ++++++-- vllm/model_executor/models/chameleon.py | 8 ++++++-- vllm/model_executor/models/chatglm.py | 10 ++++++++-- vllm/model_executor/models/clip.py | 11 ++++++++--- vllm/model_executor/models/commandr.py | 4 +++- vllm/model_executor/models/dbrx.py | 8 ++++++-- vllm/model_executor/models/decilm.py | 8 ++++++-- vllm/model_executor/models/deepseek.py | 8 ++++++-- vllm/model_executor/models/deepseek_v2.py | 8 ++++++-- vllm/model_executor/models/exaone.py | 9 +++++++-- vllm/model_executor/models/falcon.py | 8 ++++++-- vllm/model_executor/models/florence2.py | 17 +++++++++++------ vllm/model_executor/models/fuyu.py | 8 +++++--- vllm/model_executor/models/gemma.py | 4 +++- vllm/model_executor/models/gemma2.py | 9 ++++++--- vllm/model_executor/models/gpt2.py | 8 ++++++-- vllm/model_executor/models/gpt_bigcode.py | 8 ++++++-- vllm/model_executor/models/gpt_j.py | 8 ++++++-- vllm/model_executor/models/gpt_neox.py | 8 ++++++-- vllm/model_executor/models/granite.py | 9 +++++++-- vllm/model_executor/models/granitemoe.py | 8 +++++--- .../models/idefics2_vision_model.py | 11 ++++++++--- vllm/model_executor/models/idefics3.py | 7 ++++--- vllm/model_executor/models/intern_vit.py | 8 ++++++-- vllm/model_executor/models/internlm2.py | 8 ++++++-- vllm/model_executor/models/internvl.py | 7 ++++--- vllm/model_executor/models/jais.py | 8 ++++++-- vllm/model_executor/models/jamba.py | 8 ++++++-- vllm/model_executor/models/llama.py | 15 ++++++++++----- vllm/model_executor/models/llava.py | 7 ++++--- vllm/model_executor/models/llava_next.py | 7 ++++--- vllm/model_executor/models/llava_next_video.py | 7 ++++--- vllm/model_executor/models/llava_onevision.py | 7 ++++--- vllm/model_executor/models/mamba.py | 8 ++++++-- vllm/model_executor/models/medusa.py | 9 +++++++-- vllm/model_executor/models/minicpm.py | 8 ++++++-- vllm/model_executor/models/minicpmv.py | 14 +++++++++----- vllm/model_executor/models/mixtral.py | 8 ++++++-- vllm/model_executor/models/mixtral_quant.py | 8 ++++++-- vllm/model_executor/models/mllama.py | 9 ++++++--- vllm/model_executor/models/mlp_speculator.py | 8 ++++++-- vllm/model_executor/models/mpt.py | 8 ++++++-- vllm/model_executor/models/nemotron.py | 8 ++++++-- vllm/model_executor/models/olmo.py | 8 ++++++-- vllm/model_executor/models/olmoe.py | 8 ++++++-- vllm/model_executor/models/opt.py | 8 ++++++-- vllm/model_executor/models/orion.py | 8 ++++++-- vllm/model_executor/models/paligemma.py | 7 ++++--- vllm/model_executor/models/persimmon.py | 8 ++++++-- vllm/model_executor/models/phi.py | 8 ++++++-- vllm/model_executor/models/phi3_small.py | 8 ++++++-- vllm/model_executor/models/phi3v.py | 9 ++++++--- vllm/model_executor/models/phimoe.py | 8 ++++++-- 
vllm/model_executor/models/pixtral.py | 12 ++++++++---- vllm/model_executor/models/qwen.py | 8 ++++++-- vllm/model_executor/models/qwen2.py | 18 ++++++++++++------ vllm/model_executor/models/qwen2_audio.py | 9 +++++++-- vllm/model_executor/models/qwen2_cls.py | 7 ++++--- vllm/model_executor/models/qwen2_moe.py | 8 ++++++-- vllm/model_executor/models/qwen2_rm.py | 7 ++++--- vllm/model_executor/models/qwen2_vl.py | 8 ++++++-- vllm/model_executor/models/siglip.py | 11 ++++++++--- vllm/model_executor/models/solar.py | 9 +++++++-- vllm/model_executor/models/stablelm.py | 8 ++++++-- vllm/model_executor/models/starcoder2.py | 8 ++++++-- vllm/model_executor/models/ultravox.py | 7 ++++--- vllm/model_executor/models/utils.py | 11 ++++++----- vllm/model_executor/models/xverse.py | 8 ++++++-- 74 files changed, 454 insertions(+), 185 deletions(-) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 0f8b81c3ef40c..d9ce85949e4ee 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -334,7 +334,17 @@ def load_model(self, vllm_config: VllmConfig) -> nn.Module: with target_device: model = _initialize_model(vllm_config=vllm_config) - model.load_weights(self._get_all_weights(model_config, model)) + weights_to_load = {name for name, _ in model.named_parameters()} + loaded_weights = model.load_weights( + self._get_all_weights(model_config, model)) + # We only enable strict check for non-quantiized models + # that have loaded weights tracking currently. + if model_config.quantization is None and loaded_weights is not None: + weights_not_loaded = weights_to_load - loaded_weights + if weights_not_loaded: + raise ValueError( + "Following weights were not initialized from " + f"checkpoint: {weights_not_loaded}") for _, module in model.named_modules(): quant_method = getattr(module, "quant_method", None) diff --git a/vllm/model_executor/models/arctic.py b/vllm/model_executor/models/arctic.py index d52418ee0f6f1..e58ad19cab54c 100644 --- a/vllm/model_executor/models/arctic.py +++ b/vllm/model_executor/models/arctic.py @@ -1,5 +1,5 @@ """Inference-only Snowflake Arctic model.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -480,7 +480,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -518,6 +519,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("ws", f"experts.{expert_id}.w3.weight", expert_id)) params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() logger.info( "It will take ~10 minutes loading from the 16-bit weights. " @@ -573,3 +575,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py index 01ce7c42cd391..3749a16a38994 100644 --- a/vllm/model_executor/models/baichuan.py +++ b/vllm/model_executor/models/baichuan.py @@ -18,7 +18,7 @@ # limitations under the License. 
"""Inference-only BaiChuan model compatible with HuggingFace weights.""" import math -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -404,13 +404,15 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("gate_up_proj", "gate_proj", 0), ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -449,6 +451,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params class BaichuanForCausalLM(BaiChuanBaseForCausalLM): diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py index 42dd6119e76f1..d8301a36acb01 100644 --- a/vllm/model_executor/models/bert.py +++ b/vllm/model_executor/models/bert.py @@ -1,4 +1,4 @@ -from typing import Iterable, List, Optional, Tuple +from typing import Iterable, List, Optional, Set, Tuple import torch from torch import nn @@ -337,7 +337,8 @@ def forward( return self.encoder(hidden_states, kv_caches, attn_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "query", "q"), @@ -346,6 +347,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "pooler" in name: continue @@ -368,6 +370,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params class BertEmbeddingModel(nn.Module): diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index e612010677364..6db6462e97f3f 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -1,6 +1,6 @@ """Minimal implementation of BlipVisionModel intended to be only used within a vision language model.""" -from typing import Iterable, Optional, Tuple, Union +from typing import Iterable, Optional, Set, Tuple, Union import torch import torch.nn as nn @@ -415,7 +415,8 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return self.post_layernorm(hidden_states) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -423,6 +424,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "v_proj", "v"), ] if self.shard_weight else [] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() layer_count = len(self.encoder.layers) for name, loaded_weight in weights: @@ -440,8 +442,8 @@ def load_weights(self, weights: Iterable[Tuple[str, 
torch.Tensor]]): for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue - - param = params_dict[name.replace(weight_name, param_name)] + name = name.replace(weight_name, param_name) + param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) break @@ -450,3 +452,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 03dc1d15ab697..7d7639b4a92ce 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -1,5 +1,5 @@ from functools import cached_property -from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, +from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) import torch @@ -692,6 +692,7 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) - loader.load_weights(weights) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py index cf2eee8172769..1060d418474ef 100644 --- a/vllm/model_executor/models/bloom.py +++ b/vllm/model_executor/models/bloom.py @@ -16,7 +16,7 @@ # limitations under the License. """Inference-only BLOOM model compatible with HuggingFace weights.""" import math -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -341,8 +341,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if name == "lm_head.weight": continue @@ -371,3 +373,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/chameleon.py b/vllm/model_executor/models/chameleon.py index 7b59c818e0b60..8f91abffaea90 100644 --- a/vllm/model_executor/models/chameleon.py +++ b/vllm/model_executor/models/chameleon.py @@ -1,5 +1,5 @@ from functools import cached_property -from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, +from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) import torch @@ -1034,7 +1034,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -1044,6 +1045,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): (".gate_up_proj", 
".up_proj", 1), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -1111,3 +1113,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py index 70e9b607b0642..81e56381eabd8 100644 --- a/vllm/model_executor/models/chatglm.py +++ b/vllm/model_executor/models/chatglm.py @@ -3,7 +3,8 @@ """Inference-only ChatGLM model compatible with THUDM weights.""" from argparse import Namespace from array import array -from typing import Dict, Iterable, List, Mapping, Optional, Tuple, TypedDict +from typing import (Dict, Iterable, List, Mapping, Optional, Set, Tuple, + TypedDict) import torch from PIL import Image @@ -645,7 +646,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: # Merge two ColumnParallelLinear into one MergedColumnParallelLinear merged_weights_dict: Dict[str, Dict[str, Optional[torch.Tensor]]] = { "transformer.vision.linear_proj.merged_proj.weight": { @@ -655,6 +657,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): } params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() for name, loaded_weight in weights: is_weight_to_be_merge = False for _, merged_weight_dict in merged_weights_dict.items(): @@ -677,6 +680,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) for combined_name, merged_weight_dict in merged_weights_dict.items(): if combined_name in params_dict: @@ -686,3 +690,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, combined_weight) + loaded_params.add(combined_name) + return loaded_params diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 2d81b9266826b..184758f4a8a45 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -1,6 +1,6 @@ """Minimal implementation of CLIPVisionModel intended to be only used within a vision language model.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import numpy as np import torch @@ -483,7 +483,8 @@ def device(self): # (TODO) Add prefix argument for filtering out weights to be loaded # ref: https://github.com/vllm-project/vllm/pull/7186#discussion_r1734163986 - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -491,6 +492,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "v_proj", "v"), ] if self.shard_weight else [] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() layer_count = len(self.vision_model.encoder.layers) for name, loaded_weight in 
weights: @@ -508,8 +510,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue + name = name.replace(weight_name, param_name) - param = params_dict[name.replace(weight_name, param_name)] + param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) break @@ -518,3 +521,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index fbb09a64cde9b..9fd083e5a02a9 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -402,7 +402,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -447,3 +448,4 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): default_weight_loader) weight_loader(param, loaded_weight) loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 3952ff31e5cec..eab338800249e 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -1,4 +1,4 @@ -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch import torch.nn as nn @@ -417,13 +417,15 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: expert_params_mapping = [( "w13_weight" if weight_name in ["w1", "v1"] else "w2_weight", f"mlp.{weight_name}", ) for weight_name in ["w1", "v1", "w2"]] params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() for name, loaded_weight in weights: for param_name, weight_name in expert_params_mapping: if weight_name not in name: @@ -447,3 +449,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/decilm.py b/vllm/model_executor/models/decilm.py index b38fd9fa49c21..c551853956b92 100644 --- a/vllm/model_executor/models/decilm.py +++ b/vllm/model_executor/models/decilm.py @@ -22,7 +22,7 @@ # limitations under the License. 
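# The dozens of per-model edits in this commit all follow one shape: build
# a loaded_params set while consuming the checkpoint, remapping shard names
# (q_proj/k_proj/v_proj -> qkv_proj, gate_proj/up_proj -> gate_up_proj)
# *before* recording them, so the returned names match named_parameters().
# A condensed, hedged sketch with a toy params dict; the real code also
# calls each parameter's weight_loader, which is elided here.
from typing import Dict, Iterable, Set, Tuple

import torch


def load_weights_sketch(
        params_dict: Dict[str, torch.nn.Parameter],
        weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]:
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
    ]
    loaded_params: Set[str] = set()
    for name, loaded_weight in weights:
        for param_name, weight_name, shard_id in stacked_params_mapping:
            if weight_name not in name:
                continue
            # Remap first, then record: the tracked entry must be the
            # model's fused parameter name, not the checkpoint shard name.
            name = name.replace(weight_name, param_name)
            loaded_params.add(name)
            break
        else:
            if name in params_dict:
                loaded_params.add(name)
    return loaded_params


params = {"attn.qkv_proj.weight": torch.nn.Parameter(torch.empty(3))}
ckpt = [("attn.q_proj.weight", torch.zeros(1)),
        ("attn.k_proj.weight", torch.zeros(1)),
        ("attn.v_proj.weight", torch.zeros(1))]
print(load_weights_sketch(params, ckpt))  # {'attn.qkv_proj.weight'}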
"""Inference-only DeciLM model compatible with HuggingFace weights.""" -from typing import Iterable, Tuple +from typing import Iterable, Set, Tuple import torch @@ -57,7 +57,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): delattr(config, "num_key_value_heads_per_layer") super().__init__(vllm_config=vllm_config) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -67,6 +68,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -97,6 +99,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params def _degroup_weight(self, loaded_weight: torch.Tensor) -> torch.Tensor: hidden_size = self.config.hidden_size diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py index 36dfea5a65656..8c5ad9904e925 100644 --- a/vllm/model_executor/models/deepseek.py +++ b/vllm/model_executor/models/deepseek.py @@ -20,7 +20,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Deepseek model.""" -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -442,7 +442,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -453,6 +454,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -487,3 +489,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index 1e32fe60c7a5b..d2c4ca0bf85e9 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -20,7 +20,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only DeepseekV2 model.""" -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -550,7 +550,8 @@ def make_empty_intermediate_tensors( device=device), }) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("gate_up_proj", "gate_proj", 0), @@ -566,6 +567,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): num_experts=self.config.n_routed_experts) params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -623,3 +625,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index 52dd603ca558d..9d739d0479548 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -22,7 +22,7 @@ # limitations under the License. """Inference-only Exaone model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -513,7 +513,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -523,6 +524,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): (".gate_up_proj", ".c_fc_1", 1), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -543,6 +545,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): default_weight_loader) loaded_weight = loaded_weight[0] weight_loader(param, loaded_weight) + loaded_params.add(scale_name) continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: @@ -576,6 +579,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params # If this function is called, it should always initialize KV cache scale # factors (or else raise an exception). 
Thus, handled exceptions should diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py index e97abe949ccdb..2aa4b67d99894 100644 --- a/vllm/model_executor/models/falcon.py +++ b/vllm/model_executor/models/falcon.py @@ -18,7 +18,7 @@ """PyTorch Falcon model.""" import math -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -473,7 +473,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: total_num_heads = self.config.num_attention_heads if self.config.new_decoder_architecture: total_num_kv_heads = self.config.num_kv_heads @@ -483,6 +484,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): total_num_kv_heads = total_num_heads num_query_heads_per_kv_head = total_num_heads // total_num_kv_heads params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if name == "lm_head.weight" and self.tie_word_embeddings: # Falcon uses tied embeddings except Falcon-11b. @@ -519,3 +521,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/florence2.py b/vllm/model_executor/models/florence2.py index 971a71180164b..d3a9ff6915b84 100644 --- a/vllm/model_executor/models/florence2.py +++ b/vllm/model_executor/models/florence2.py @@ -1,5 +1,5 @@ import math -from typing import Iterable, List, Optional, Tuple +from typing import Iterable, List, Optional, Set, Tuple import torch import torch.nn as nn @@ -156,7 +156,8 @@ def sample(self, logits: torch.Tensor, next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -165,12 +166,13 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue - - param = params_dict[name.replace(weight_name, param_name)] + name = name.replace(weight_name, param_name) + param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) break @@ -183,6 +185,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params class Florence2ForConditionalGeneration(nn.Module): @@ -248,10 +252,11 @@ def sample( ) -> SamplerOutput: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: skip_prefixes = [ 'image_projection', "vision_tower", "image_proj_norm", "image_pos_embed", "visual_temporal_embed" ] 
loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) - loader.load_weights(weights) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/fuyu.py b/vllm/model_executor/models/fuyu.py index 31fc098a8bb3f..7b46907ac83ab 100644 --- a/vllm/model_executor/models/fuyu.py +++ b/vllm/model_executor/models/fuyu.py @@ -16,7 +16,8 @@ """ PyTorch Fuyu model.""" import math from array import array -from typing import Iterable, List, Literal, Mapping, Optional, Tuple, TypedDict +from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, + TypedDict) import torch import torch.nn as nn @@ -354,6 +355,7 @@ def sample( next_tokens = self.language_model.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) - loader.load_weights(weights) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py index ace13664c6ea6..64e03b30bf2f1 100644 --- a/vllm/model_executor/models/gemma.py +++ b/vllm/model_executor/models/gemma.py @@ -424,7 +424,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -469,3 +470,4 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): logger.warning( "Some weights are not initialized from checkpoints: %s", unloaded_params) + return loaded_params diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index a60b4e73a76d4..4ba39223cc07f 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -312,7 +312,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -354,6 +355,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): logger.warning( "Some weights are not initialized from checkpoints: %s", unloaded_params) + return loaded_params class Gemma2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): @@ -451,13 +453,14 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), ) - loader.load_weights(weights) + return loader.load_weights(weights) class Gemma2EmbeddingModel(nn.Module, SupportsPP): diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py index fa0fdad28d161..1c61408ae1dd9 100644 --- a/vllm/model_executor/models/gpt2.py +++ b/vllm/model_executor/models/gpt2.py @@ -16,7 +16,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
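# Wrapper models (BLIP-2, Florence-2, Fuyu, Gemma2, the LLaVA family, ...)
# already delegate to AutoWeightsLoader; their only change in this commit
# is to return its result instead of discarding it. A hedged sketch of
# that shape; StubAutoWeightsLoader below only mimics the return-value
# contract and is not the real class from vllm/model_executor/models/utils.py.
from typing import Iterable, Optional, Sequence, Set, Tuple

import torch
from torch import nn


class StubAutoWeightsLoader:

    def __init__(self, module: nn.Module,
                 skip_prefixes: Optional[Sequence[str]] = None) -> None:
        self.module = module
        self.skip_prefixes = tuple(skip_prefixes or ())

    def load_weights(
            self, weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]:
        params = dict(self.module.named_parameters())
        loaded: Set[str] = set()
        for name, loaded_weight in weights:
            if self.skip_prefixes and name.startswith(self.skip_prefixes):
                continue
            if name in params:
                params[name].data.copy_(loaded_weight)
                loaded.add(name)
        return loaded


class ToyWrapperModel(nn.Module):

    def __init__(self) -> None:
        super().__init__()
        self.language_model = nn.Linear(2, 2)

    def load_weights(self, weights: Iterable[Tuple[str,
                                                   torch.Tensor]]) -> Set[str]:
        loader = StubAutoWeightsLoader(self, skip_prefixes=["lm_head."])
        # Propagating the returned set is the whole per-model change here.
        return loader.load_weights(weights)


wrapper = ToyWrapperModel()
print(wrapper.load_weights([("language_model.weight", torch.zeros(2, 2)),
                            ("language_model.bias", torch.zeros(2))]))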
"""Inference-only GPT-2 model compatible with HuggingFace weights.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -298,8 +298,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "lm_head.weight" in name: # GPT-2 ties the weights of the embedding layer and the final @@ -328,3 +330,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py index b2fc79d0d36dc..50a143cb1b600 100644 --- a/vllm/model_executor/models/gpt_bigcode.py +++ b/vllm/model_executor/models/gpt_bigcode.py @@ -17,7 +17,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only GPTBigCode model compatible with HuggingFace weights.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -323,8 +323,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "lm_head.weight" in name: continue @@ -344,3 +346,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader(param, loaded_weight, 'v') else: weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index cec3fd12a67d6..d5defc60764e6 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -15,7 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only GPT-J model compatible with HuggingFace weights.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -291,7 +291,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -301,6 +302,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "attn.bias" in name or "attn.masked_bias" in name: continue @@ -330,3 +332,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index 11f286d6bcba0..0bb5e2f9b95f9 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -15,7 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only GPT-NeoX model compatible with HuggingFace weights.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -303,8 +303,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if ("attention.bias" in name or "attention.masked_bias" in name or "rotary_emb.inv_freq" in name): @@ -337,3 +339,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index cb2583e69d88d..c1e2e87f08ec3 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -20,7 +20,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only IBM Granite model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -455,7 +455,8 @@ def make_empty_intermediate_tensors( device=device), }) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -465,6 +466,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -485,6 +487,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): default_weight_loader) loaded_weight = loaded_weight[0] weight_loader(param, loaded_weight) + loaded_params.add(scale_name) continue for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: @@ -518,6 +521,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params # If this function is called, it should always initialize KV cache scale # factors (or else raise an exception). Thus, handled exceptions should diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index f437dd521a7d5..a91a18816995f 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -20,7 +20,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only GraniteMoe model.""" -from typing import Iterable, List, Optional, Tuple +from typing import Iterable, List, Optional, Set, Tuple import torch from torch import nn @@ -419,7 +419,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: new_weights = {} for n, p in weights: if n.endswith('.block_sparse_moe.input_linear.weight'): @@ -452,4 +453,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): pass else: new_weights[n] = p - mixtral.MixtralForCausalLM.load_weights(self, new_weights.items()) + return mixtral.MixtralForCausalLM.load_weights(self, + new_weights.items()) diff --git a/vllm/model_executor/models/idefics2_vision_model.py b/vllm/model_executor/models/idefics2_vision_model.py index b21bc2a3f9ce1..16192928beb1f 100644 --- a/vllm/model_executor/models/idefics2_vision_model.py +++ b/vllm/model_executor/models/idefics2_vision_model.py @@ -15,7 +15,7 @@ # limitations under the License. 
"""PyTorch Idefics2 model.""" -from typing import Iterable, Optional, Tuple +from typing import Iterable, Optional, Set, Tuple import torch from torch import nn @@ -331,7 +331,8 @@ def forward( last_hidden_state = self.post_layernorm(encoder_outputs) return last_hidden_state - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -339,11 +340,13 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue - param = params_dict[name.replace(weight_name, param_name)] + name = name.replace(weight_name, param_name) + param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) break @@ -352,3 +355,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/idefics3.py b/vllm/model_executor/models/idefics3.py index 0cecc754e916f..5d176b2a4e416 100644 --- a/vllm/model_executor/models/idefics3.py +++ b/vllm/model_executor/models/idefics3.py @@ -15,7 +15,7 @@ import math from typing import (Dict, Iterable, List, Literal, Mapping, NamedTuple, - Optional, Tuple, TypedDict, Union) + Optional, Set, Tuple, TypedDict, Union) import torch import torch.utils.checkpoint @@ -751,9 +751,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) - loader.load_weights(weights) + return loader.load_weights(weights) def get_mm_mapping(self) -> MultiModelKeys: """ diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 9761635d2a6c2..bd91a0806ae5c 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -5,7 +5,7 @@ # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- from functools import partial -from typing import Iterable, Optional, Tuple +from typing import Iterable, Optional, Set, Tuple import torch import torch.nn as nn @@ -469,10 +469,14 @@ def forward( return encoder_outputs - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 19bfe16e4d5fc..94b819b5d9366 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -1,5 +1,5 @@ from functools import partial -from typing import Any, Dict, Iterable, List, 
Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -369,13 +369,15 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("gate_up_proj", "w1", 0), ("gate_up_proj", "w3", 1), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -402,3 +404,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 92579e3aae949..7ea2f9be2191d 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -6,7 +6,7 @@ # -------------------------------------------------------- import re from functools import cached_property, partial -from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, +from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) import torch @@ -663,6 +663,7 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) - loader.load_weights(weights) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py index ee49ffb3cd87f..41db85b678456 100644 --- a/vllm/model_executor/models/jais.py +++ b/vllm/model_executor/models/jais.py @@ -19,7 +19,7 @@ """Inference-only Jais model compatible with HuggingFace weights.""" import math -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -350,8 +350,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "lm_head.weight" in name: # GPT-2 ties the weights of the embedding layer and the final @@ -382,3 +384,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index 5612dd6886385..f83f0fce7275f 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -1,5 +1,5 @@ """Inference-only Jamba model.""" -from typing import Iterable, List, Optional, Tuple +from typing import Iterable, List, Optional, Set, Tuple import torch from torch import nn @@ -462,7 +462,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def 
load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -479,6 +480,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): num_experts=self.config.num_experts) params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -534,6 +536,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params def _is_moe_layer(name: str): diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index e53631ef19f31..2b40e9ec73fad 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -20,7 +20,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only LLaMA model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -350,7 +350,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -360,6 +361,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -375,6 +377,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): default_weight_loader) loaded_weight = loaded_weight[0] weight_loader(param, loaded_weight) + loaded_params.add(scale_name) continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: @@ -390,7 +393,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) - break else: # Skip loading extra bias for GPTQ models. @@ -408,6 +410,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params # If this function is called, it should always initialize KV cache scale # factors (or else raise an exception). 
Thus, handled exceptions should @@ -577,13 +581,14 @@ def sample(self, logits: torch.Tensor, next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), ) - loader.load_weights( + return loader.load_weights( self.maybe_remap_mistral(name, loaded_weight) for name, loaded_weight in weights) diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index b13bcfa676811..e7d3161a7cb2d 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -1,5 +1,5 @@ from functools import cached_property -from typing import (Iterable, List, Literal, Mapping, Optional, Protocol, +from typing import (Iterable, List, Literal, Mapping, Optional, Protocol, Set, Tuple, TypedDict, Union) import torch @@ -547,6 +547,7 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) - loader.load_weights(weights) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index dd2fa6cac969f..37e2227a52dcd 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -1,5 +1,5 @@ from functools import cached_property -from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, +from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) import torch @@ -654,6 +654,7 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) - loader.load_weights(weights) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/llava_next_video.py b/vllm/model_executor/models/llava_next_video.py index 5d5598d07bfde..e2880c76cf43d 100644 --- a/vllm/model_executor/models/llava_next_video.py +++ b/vllm/model_executor/models/llava_next_video.py @@ -1,6 +1,6 @@ import math from functools import cached_property -from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, +from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) import numpy as np @@ -445,10 +445,11 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( self, # This model doesn't support images for now ignore_unexpected_prefixes=["image_newline"], ) - loader.load_weights(weights) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py index a5b2108177830..705ca1e4ab6e6 100644 --- a/vllm/model_executor/models/llava_onevision.py +++ b/vllm/model_executor/models/llava_onevision.py @@ -1,6 +1,6 @@ import math from functools import cached_property 
-from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, +from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) import numpy as np @@ -887,6 +887,7 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) - loader.load_weights(weights) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/mamba.py b/vllm/model_executor/models/mamba.py index ac0d265a961f0..405b8f7787ba8 100644 --- a/vllm/model_executor/models/mamba.py +++ b/vllm/model_executor/models/mamba.py @@ -1,5 +1,5 @@ """PyTorch MAMBA model.""" -from typing import Iterable, List, Optional, Tuple +from typing import Iterable, List, Optional, Set, Tuple import torch from torch import nn @@ -243,8 +243,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "A_log" in name: name = name.replace("A_log", "A") @@ -256,3 +258,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/medusa.py b/vllm/model_executor/models/medusa.py index b05360b55466b..b4ed6538bddac 100644 --- a/vllm/model_executor/models/medusa.py +++ b/vllm/model_executor/models/medusa.py @@ -1,4 +1,4 @@ -from typing import Iterable, List, Optional, Tuple +from typing import Iterable, List, Optional, Set, Tuple import torch import torch.nn as nn @@ -156,8 +156,10 @@ def generate_proposals( sampling_metadata=sampling_metadata, ) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() weights_map = {} @@ -181,9 +183,12 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) if self.token_map is not None: self.token_map.to(device=self.lm_heads[0].weight.device) assert (self.truncated_vocab_size == self.orig_vocab_size) or (self.token_map is not None) + + return loaded_params diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index 6b67266c53362..b92bff4d7c28c 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -21,7 +21,7 @@ # limitations under the License. 
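# Models that rename checkpoint keys on the fly (e.g. Mamba's A_log -> A
# just above) must add the *renamed* key to loaded_params; recording the
# raw checkpoint name would make the loader's strict check report the
# parameter as uninitialized even though it was loaded. A tiny sketch,
# with a made-up alias table standing in for the per-model renames:
from typing import Dict, Iterable, Set, Tuple

import torch

ALIASES = {"A_log": "A"}  # illustrative only


def load_with_aliases(params_dict: Dict[str, torch.nn.Parameter],
                      weights: Iterable[Tuple[str, torch.Tensor]]) -> Set[str]:
    loaded_params: Set[str] = set()
    for name, loaded_weight in weights:
        for old, new in ALIASES.items():
            if old in name:
                name = name.replace(old, new)
        if name not in params_dict:
            continue
        params_dict[name].data.copy_(loaded_weight)
        loaded_params.add(name)  # post-rename name, matching the model
    return loaded_params


params = {"layers.0.mixer.A": torch.nn.Parameter(torch.zeros(3))}
print(load_with_aliases(params, [("layers.0.mixer.A_log", torch.ones(3))]))
# -> {'layers.0.mixer.A'}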
"""Inference-only MiniCPM model compatible with HuggingFace weights.""" import math -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -539,7 +539,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -556,6 +557,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for weight_name in ["w1", "w2", "w3"] ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -606,3 +608,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index fd8eda997f76f..99bf1d42d0355 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -24,7 +24,7 @@ import re from functools import partial from typing import (Any, Callable, Iterable, List, Literal, Mapping, Optional, - Tuple, TypedDict, Union) + Set, Tuple, TypedDict, Union) import torch import torch.types @@ -602,7 +602,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -612,6 +613,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): if key_to_modify in name: @@ -630,10 +632,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue - if is_pp_missing_parameter( - name.replace(weight_name, param_name), self): + name = name.replace(weight_name, param_name) + if is_pp_missing_parameter(name, self): continue - param = params_dict[name.replace(weight_name, param_name)] + param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) break @@ -646,6 +648,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params def get_mm_mapping(self) -> MultiModelKeys: """ diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index eebf5bab5a288..0faffb4f1b00c 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -20,7 +20,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Mixtral model.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -404,7 +404,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -421,6 +422,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): num_experts=self.config.num_local_experts) params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -478,3 +480,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py index af2e9586988df..ddd6afcf6a1b6 100644 --- a/vllm/model_executor/models/mixtral_quant.py +++ b/vllm/model_executor/models/mixtral_quant.py @@ -20,7 +20,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Mixtral model.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import numpy as np import torch @@ -409,7 +409,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -418,6 +419,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -448,3 +450,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/mllama.py b/vllm/model_executor/models/mllama.py index db7ee7b2d8537..41f62b37f3bd9 100644 --- a/vllm/model_executor/models/mllama.py +++ b/vllm/model_executor/models/mllama.py @@ -13,7 +13,7 @@ # limitations under the License. 
"""PyTorch Mllama model.""" import math -from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, +from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) import numpy as np @@ -1427,7 +1427,8 @@ def forward( return outputs - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -1437,7 +1438,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) - updated_params = set() + updated_params: Set[str] = set() for name, loaded_weight in weights: if 'patch_embedding.weight' in name: name = name.replace('patch_embedding.weight', @@ -1457,6 +1458,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + updated_params.add(name) + return updated_params def skip_attention_mask(sparse_mask: List[List[int]]) -> bool: diff --git a/vllm/model_executor/models/mlp_speculator.py b/vllm/model_executor/models/mlp_speculator.py index 4d7e82880041d..f2aa2653c4f5c 100644 --- a/vllm/model_executor/models/mlp_speculator.py +++ b/vllm/model_executor/models/mlp_speculator.py @@ -1,5 +1,5 @@ import math -from typing import Iterable, List, Tuple +from typing import Iterable, List, Set, Tuple import torch import torch.nn as nn @@ -188,11 +188,15 @@ def generate_proposals( return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: param = params_dict.get(name.replace("speculator.", "")) if param is not None: weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py index 3c74ef2448abb..8716e92b0f1c2 100644 --- a/vllm/model_executor/models/mpt.py +++ b/vllm/model_executor/models/mpt.py @@ -1,6 +1,6 @@ # Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main import math -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch import torch.nn as nn @@ -324,8 +324,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() for name, loaded_weight in weights: # Skip loading extra bias for GPTQ models. 
if name.endswith(".bias") and name not in params_dict: @@ -336,3 +338,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index eb45beae7d21a..ceab299a7950a 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -20,7 +20,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Nemotron model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -474,7 +474,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -482,6 +483,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): (".qkv_proj", ".v_proj", "v"), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -522,3 +524,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py index 98d4e1ec320a4..dc138e2e636ad 100644 --- a/vllm/model_executor/models/olmo.py +++ b/vllm/model_executor/models/olmo.py @@ -20,7 +20,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only OLMo model compatible with HuggingFace weights.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -356,7 +356,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -366,6 +367,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -402,3 +404,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index f4eebab8c98dd..ab87695d8e650 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -10,7 +10,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only OLMoE model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -364,7 +364,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -383,6 +384,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): num_experts=self.config.num_experts) params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -455,3 +457,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py index 997fe642439e6..db85a494980a7 100644 --- a/vllm/model_executor/models/opt.py +++ b/vllm/model_executor/models/opt.py @@ -16,7 +16,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only OPT model compatible with HuggingFace weights.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -394,7 +394,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -402,6 +403,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "v_proj", "v"), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "lm_head.weight" in name and self.config.tie_word_embeddings: continue @@ -431,3 +433,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py index 39d659c49cbcf..b01734af8ddd8 100644 --- a/vllm/model_executor/models/orion.py +++ b/vllm/model_executor/models/orion.py @@ -3,7 +3,7 @@ # Copyright (c) OrionStar Inc. # LICENSE: https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/LICENSE """Inference-only Orion-14B model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -327,7 +327,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -337,6 +338,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -368,3 +370,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index eea229359255e..dd5256eb87ab3 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -1,4 +1,4 @@ -from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, +from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) import torch @@ -295,6 +295,7 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self) - loader.load_weights(weights) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/persimmon.py b/vllm/model_executor/models/persimmon.py index 
62c509153a111..3b8199f4f1661 100644 --- a/vllm/model_executor/models/persimmon.py +++ b/vllm/model_executor/models/persimmon.py @@ -19,7 +19,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only persimmon model compatible with HuggingFace weights.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -324,8 +324,10 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -358,3 +360,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py index a2ab0d74c48db..0a117bf16c9b3 100644 --- a/vllm/model_executor/models/phi.py +++ b/vllm/model_executor/models/phi.py @@ -34,7 +34,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """Inference-only Phi-1.5 model compatible with HuggingFace weights.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -345,7 +345,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -353,6 +354,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "v_proj", "v") ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: @@ -383,3 +385,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/phi3_small.py b/vllm/model_executor/models/phi3_small.py index 2139cec441807..a78e4d355a314 100644 --- a/vllm/model_executor/models/phi3_small.py +++ b/vllm/model_executor/models/phi3_small.py @@ -1,5 +1,5 @@ import math -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -457,9 +457,11 @@ def sample( sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -471,3 +473,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = 
getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 4db65edc174f1..2e583bb08e87a 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -15,7 +15,7 @@ import itertools import re from functools import cached_property, lru_cache -from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, +from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union) import numpy as np @@ -744,7 +744,8 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ "model.vision_embed_tokens.wte": "embed_tokens", @@ -759,5 +760,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # The HF config doesn't specify whether these are tied, # so we detect it this way - if "embed_tokens" not in autoloaded_weights: + if "embed_tokens.weight" not in autoloaded_weights: self.embed_tokens = self.language_model.model.embed_tokens + autoloaded_weights.add("embed_tokens.weight") + return autoloaded_weights diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index b7e70f8fa2c6d..e475d286bd7ea 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -20,7 +20,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only PhiMoE model.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -598,7 +598,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -613,6 +614,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): num_experts=self.config.num_local_experts) params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -666,3 +668,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index a3e30ea2dd299..307febde7eef0 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -1,7 +1,7 @@ from dataclasses import dataclass, fields from functools import cached_property from itertools import tee -from typing import Iterable, List, Mapping, Optional, Tuple, Union +from typing import Iterable, List, Mapping, Optional, Set, Tuple, Union import numpy import torch @@ -1053,7 +1053,8 @@ def forward( # (TODO) Add prefix argument for filtering out weights to be loaded # ref: https://github.com/vllm-project/vllm/pull/7186#discussion_r1734163986 - def 
load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -1063,6 +1064,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() layer_count = len(self.transformer.layers) for name, loaded_weight in weights: @@ -1075,8 +1077,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue - - param = params_dict[name.replace(weight_name, param_name)] + name = name.replace(weight_name, param_name) + param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) break @@ -1085,3 +1087,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 447632cefcd9a..3978c176a2144 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -8,7 +8,7 @@ import re from functools import partial from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, - Optional, Tuple, TypedDict, Union) + Optional, Set, Tuple, TypedDict, Union) import numpy as np import torch @@ -964,13 +964,15 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("gate_up_proj", "w2", 0), ("gate_up_proj", "w1", 1), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -999,6 +1001,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params class QWenLLM(QWenBaseModel): diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index 8f10df808c216..370cff5fa153f 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -21,7 +21,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
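# Illustrative sketch (not part of the patch): why the pixtral and minicpmv
# hunks above rewrite `name` in place before the params_dict lookup. The
# checkpoint ships per-shard names (q_proj/k_proj/v_proj) while the model owns
# one fused parameter (qkv_proj); renaming first means both the lookup and the
# later `loaded_params.add(name)` see the fused name. Values are placeholders.
stacked_params_mapping = [
    ("qkv_proj", "q_proj", "q"),
    ("qkv_proj", "k_proj", "k"),
    ("qkv_proj", "v_proj", "v"),
]


def fused_name(name: str) -> str:
    for param_name, weight_name, _shard_id in stacked_params_mapping:
        if weight_name in name:
            return name.replace(weight_name, param_name)
    return name


assert fused_name("layers.0.self_attn.q_proj.weight") == \
    "layers.0.self_attn.qkv_proj.weight"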
"""Inference-only Qwen2 model compatible with HuggingFace weights.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -332,7 +332,8 @@ def forward( hidden_states, _ = self.norm(hidden_states, residual) return hidden_states - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -342,6 +343,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -372,6 +374,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params class Qwen2ForCausalLM(nn.Module, SupportsLoRA, SupportsPP): @@ -494,13 +498,14 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader( self, skip_prefixes=(["lm_head."] if self.config.tie_word_embeddings else None), ) - loader.load_weights(weights) + return loader.load_weights(weights) class Qwen2EmbeddingModel(nn.Module, SupportsLoRA, SupportsPP): @@ -564,7 +569,8 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self, ignore_unexpected_prefixes=["lm_head."]) - loader.load_weights(weights) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/qwen2_audio.py b/vllm/model_executor/models/qwen2_audio.py index d30950361ad89..a4965f34b1ca8 100644 --- a/vllm/model_executor/models/qwen2_audio.py +++ b/vllm/model_executor/models/qwen2_audio.py @@ -20,7 +20,8 @@ # limitations under the License. 
"""Inference-only Qwen2-Audio model compatible with HuggingFace weights.""" from functools import lru_cache -from typing import Iterable, List, Mapping, Optional, Tuple, TypedDict, Union +from typing import (Iterable, List, Mapping, Optional, Set, Tuple, TypedDict, + Union) import librosa import numpy as np @@ -420,7 +421,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -430,6 +432,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -463,3 +466,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/qwen2_cls.py b/vllm/model_executor/models/qwen2_cls.py index 07eb330620a43..dc5dabf6fc38b 100644 --- a/vllm/model_executor/models/qwen2_cls.py +++ b/vllm/model_executor/models/qwen2_cls.py @@ -4,7 +4,7 @@ # Copyright 2024 The Qwen team. # Copyright 2023 The vLLM team. """Inference-only Qwen2-Classification model compatible with HF weights.""" -from typing import Iterable, List, Optional, Tuple +from typing import Iterable, List, Optional, Set, Tuple import torch from torch import nn @@ -97,7 +97,8 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self, ignore_unexpected_prefixes=["lm_head."]) - loader.load_weights(weights) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py index 249d94b5d95e9..96a9bc451f4df 100644 --- a/vllm/model_executor/models/qwen2_moe.py +++ b/vllm/model_executor/models/qwen2_moe.py @@ -21,7 +21,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Inference-only Qwen2MoE model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import torch import torch.nn.functional as F @@ -436,7 +436,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -455,6 +456,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): num_experts=self.config.num_experts) params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -532,3 +534,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/qwen2_rm.py b/vllm/model_executor/models/qwen2_rm.py index 6db467af334f5..988d682d36be3 100644 --- a/vllm/model_executor/models/qwen2_rm.py +++ b/vllm/model_executor/models/qwen2_rm.py @@ -3,7 +3,7 @@ # Copyright 2024 The Qwen team. # Copyright 2023 The vLLM team. """Inference-only Qwen2-RM model compatible with HuggingFace weights.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -110,7 +110,8 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: loader = AutoWeightsLoader(self, ignore_unexpected_prefixes=["lm_head."]) - loader.load_weights(weights) + return loader.load_weights(weights) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 2335baf459771..ef6b52db6e17d 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -23,7 +23,7 @@ """Inference-only Qwen2-VL model compatible with HuggingFace weights.""" from functools import partial from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, - Optional, Tuple, Type, TypedDict, Union) + Optional, Set, Tuple, Type, TypedDict, Union) import torch import torch.nn as nn @@ -1333,7 +1333,8 @@ def pooler( ) -> Optional[PoolerOutput]: return self._pooler(hidden_states, pooling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -1343,6 +1344,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("gate_up_proj", "gate_proj", 0), ] params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -1392,3 +1394,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return 
loaded_params diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index acaf4afdecfe5..c9e09b879843a 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -2,7 +2,7 @@ within a vision language model.""" import math -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import numpy as np import torch @@ -594,7 +594,8 @@ def forward( interpolate_pos_encoding=interpolate_pos_encoding, ) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -602,6 +603,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "v_proj", "v"), ] if self.shard_weight else [] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() layer_count = len(self.vision_model.encoder.layers) for name, loaded_weight in weights: @@ -619,8 +621,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue + name = name.replace(weight_name, param_name) - param = params_dict[name.replace(weight_name, param_name)] + param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) break @@ -629,3 +632,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index affb2c975ce4a..6d6fafc5ab0eb 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -21,7 +21,7 @@ # limitations under the License. 
"""Inference-only Solar model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -477,7 +477,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -487,6 +488,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): (".gate_up_proj", ".up_proj", 1), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -502,6 +504,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): default_weight_loader) loaded_weight = loaded_weight[0] weight_loader(param, loaded_weight) + loaded_params.add(scale_name) continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: @@ -535,6 +538,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params # If this function is called, it should always initialize KV cache scale # factors (or else raise an exception). Thus, handled exceptions should diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py index 99acce596602e..e11d2e916730a 100644 --- a/vllm/model_executor/models/stablelm.py +++ b/vllm/model_executor/models/stablelm.py @@ -18,7 +18,7 @@ # https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/config.json """Inference-only StabeLM (https://github.com/Stability-AI/StableLM) model compatible with HuggingFace weights.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -306,7 +306,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -316,6 +317,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -347,3 +349,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py index 0ef940acebb93..74c66042226de 100644 --- a/vllm/model_executor/models/starcoder2.py +++ b/vllm/model_executor/models/starcoder2.py @@ -17,7 +17,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
""" PyTorch Starcoder2 model.""" -from typing import Iterable, List, Optional, Tuple, Union +from typing import Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -314,7 +314,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -323,6 +324,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ] params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue @@ -346,3 +348,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 9fde22c016de0..512adbc7db35e 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -3,7 +3,7 @@ import math from functools import cached_property, lru_cache -from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, +from typing import (Iterable, List, Literal, Mapping, Optional, Set, Tuple, TypedDict, Union, cast) import numpy as np @@ -504,10 +504,11 @@ def sample( ) -> Optional[SamplerOutput]: return self.language_model.sample(logits, sampling_metadata) - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={"audio_tower.model.encoder.": "audio_tower."}) loader = AutoWeightsLoader(self, ignore_unexpected_prefixes=["audio_tower."]) - loader.load_weights(weights, mapper=hf_to_vllm_mapper) + return loader.load_weights(weights, mapper=hf_to_vllm_mapper) diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 1d51885f9094a..7a4fcce95603d 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -1,7 +1,7 @@ import itertools from dataclasses import dataclass, field from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, - Optional, Protocol, Tuple, Union, overload) + Optional, Protocol, Set, Tuple, Union, overload) import torch import torch.nn as nn @@ -172,8 +172,9 @@ def _load_module( if module != self.module: module_load_weights = getattr(module, "load_weights", None) if callable(module_load_weights): - module_load_weights(weights) - return + loaded_params = module_load_weights(weights) + yield from map(lambda x: self._get_qualname(base_prefix, x), + loaded_params) child_modules = dict(module.named_children()) child_params = dict(module.named_parameters(recurse=False)) @@ -222,11 +223,11 @@ def load_weights( weights: Iterable[Tuple[str, torch.Tensor]], *, mapper: Optional[WeightsMapper] = None, - ) -> List[str]: + ) -> Set[str]: if mapper is not None: weights = mapper.apply(weights) - autoloaded_weights = list(self._load_module("", self.module, weights)) + autoloaded_weights = set(self._load_module("", self.module, weights)) return autoloaded_weights diff --git a/vllm/model_executor/models/xverse.py b/vllm/model_executor/models/xverse.py index 
51172d8782a70..bc37a997eabb5 100644 --- a/vllm/model_executor/models/xverse.py +++ b/vllm/model_executor/models/xverse.py @@ -19,7 +19,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Inference-only Xverse model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union import torch from torch import nn @@ -376,7 +376,8 @@ def sample( next_tokens = self.sampler(logits, sampling_metadata) return next_tokens - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + def load_weights(self, weights: Iterable[Tuple[str, + torch.Tensor]]) -> Set[str]: stacked_params_mapping = [ ("qkv_proj", "q_proj", "q"), ("qkv_proj", "k_proj", "k"), @@ -385,6 +386,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("gate_up_proj", "up_proj", 1), ] params_dict = dict(self.named_parameters()) + loaded_params: Set[str] = set() for name, loaded_weight in weights: if ("rotary_emb.inv_freq" in name or "rotary_emb.cos_cached" in name @@ -413,3 +415,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params From 47826cacf0e037b4e109f0b2d8d594e47def500e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=84=8D=F0=9D=95=A0=F0=9D=95=9D=F0=9D=95=9D=F0=9D=95=A0?= =?UTF-8?q?=F0=9D=95=A8=20=F0=9D=95=84=F0=9D=95=92=F0=9D=95=9F?= Date: Mon, 18 Nov 2024 05:29:26 +0200 Subject: [PATCH 179/183] [Bugfix] Ignore ray reinit error when current platform is ROCm or XPU (#10375) Signed-off-by: Hollow Man --- vllm/executor/ray_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/executor/ray_utils.py b/vllm/executor/ray_utils.py index 41dd59bc65ec5..4f28efd639084 100644 --- a/vllm/executor/ray_utils.py +++ b/vllm/executor/ray_utils.py @@ -234,7 +234,7 @@ def initialize_ray_cluster( if current_platform.is_rocm() or current_platform.is_xpu(): # Try to connect existing ray instance and create a new one if not found try: - ray.init("auto") + ray.init("auto", ignore_reinit_error=True) except ConnectionError: logger.warning( "No existing RAY instance detected. 
" From 51bb12d17b374d5c4521cd01e5b066fd2419a8fa Mon Sep 17 00:00:00 2001 From: youkaichao Date: Sun, 17 Nov 2024 23:57:20 -0800 Subject: [PATCH 180/183] [4/N][torch.compile] clean up set_torch_compile_backend (#10401) Signed-off-by: youkaichao --- vllm/compilation/backends.py | 16 ++-------------- vllm/compilation/wrapper.py | 11 +++-------- vllm/config.py | 31 ++++++++++++++++++++++++++++++- vllm/platforms/tpu.py | 7 +++---- vllm/plugins/__init__.py | 14 +------------- vllm/utils.py | 9 +++++++++ vllm/worker/model_runner.py | 3 +-- 7 files changed, 49 insertions(+), 42 deletions(-) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 22c613931f082..0cf1e3a95fcba 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -2,15 +2,14 @@ import dataclasses import operator from contextlib import ExitStack -from typing import (Any, Callable, Dict, List, Optional, Sequence, Set, Tuple, - Union) +from typing import Any, Callable, Dict, List, Optional, Sequence, Set, Tuple from unittest.mock import patch import torch import torch.fx as fx import vllm.envs as envs -from vllm.config import CompilationConfig, CompilationLevel +from vllm.config import CompilationConfig from vllm.logger import init_logger from vllm.utils import combine_fx_passes, weak_ref_tensors @@ -684,14 +683,3 @@ def __call__(self, *args) -> Any: entry.cudagraph.replay() return entry.output - - -def select_default_backend(level: int) -> Union[str, Callable]: - if level in [CompilationLevel.DYNAMO_AS_IS, CompilationLevel.DYNAMO_ONCE]: - backend_str = "eager" - return backend_str - assert level == CompilationLevel.PIECEWISE - - from vllm.plugins import get_current_vllm_config - compilation_config = get_current_vllm_config().compilation_config - return VllmBackend(compilation_config) diff --git a/vllm/compilation/wrapper.py b/vllm/compilation/wrapper.py index 2a1aecc11ce26..0143d0301ca1a 100644 --- a/vllm/compilation/wrapper.py +++ b/vllm/compilation/wrapper.py @@ -32,14 +32,9 @@ def __init__(self, # default compilation settings # compiling the forward method - # choose the compile backend - - # if the user has set the backend, use it - from vllm.plugins import get_torch_compile_backend - backend = get_torch_compile_backend() - if backend is None: - from vllm.compilation.backends import select_default_backend - backend = select_default_backend(compilation_level) + from vllm.plugins import get_current_vllm_config + backend = get_current_vllm_config( + ).compilation_config.init_backend() compiled_callable = torch.compile( self.forward, diff --git a/vllm/config.py b/vllm/config.py index 7e37edbe594b1..14017bbdb3cf2 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -22,7 +22,7 @@ get_hf_text_config, get_pooling_config, get_sentence_transformer_tokenizer_config, is_encoder_decoder, uses_mrope) from vllm.utils import (GiB_bytes, cuda_device_count_stateless, get_cpu_memory, - identity, print_warning_once) + identity, print_warning_once, resolve_obj_by_qualname) if TYPE_CHECKING: from ray.util.placement_group import PlacementGroup @@ -2072,6 +2072,13 @@ class CompilationConfig(BaseModel): - 1: dynamo as is. - 2: dynamo once. - 3: piecewise compilation. + - backend: the backend for compilation. It needs to be a string. + - "" (empty string): use the default backend. + - "eager"/"openxla"/...: use the specified backend registered in PyTorch. + - "full.module.name": a qualified name which can be used to import the backend function. 
+ We use string to avoid serialization issues when using compilation in a distributed setting. + When the compilation level is 1 or 2, the backend is used for the compilation directly (it sees the whole graph). + When the compilation level is 3, the backend is used for the piecewise compilation (it sees a part of the graph). - custom_ops: fine-grained control over which custom ops to enable/disable. Use 'all' to enable all, 'none' to disable all. Also specify a list of custom op names to enable (prefixed with a '+'), @@ -2139,6 +2146,7 @@ class CompilationConfig(BaseModel): certain small batchsizes, where inductor is good at optimizing. """ # noqa level: int = 0 + backend: str = "" custom_ops: List[str] = Field(default_factory=list) use_inductor: bool = True @@ -2182,6 +2190,27 @@ def model_post_init(self, __context: Any) -> None: func = __import__(module).__dict__[func_name] self.inductor_compile_config[k] = func + def init_backend(self) -> Union[str, Callable]: + if self.level == CompilationLevel.NO_COMPILATION: + raise ValueError("No compilation level is set.") + + from torch._dynamo.backends.registry import list_backends + torch_backends = list_backends(exclude_tags=tuple()) + if self.level in [ + CompilationLevel.DYNAMO_AS_IS, CompilationLevel.DYNAMO_ONCE + ]: + if self.backend == "": + return "eager" + if self.backend in torch_backends: + return self.backend + return resolve_obj_by_qualname(self.backend) + + # TODO: pass user-specified backend to piecewise compilation + # merge with the config use_inductor + assert self.level == CompilationLevel.PIECEWISE + from vllm.compilation.backends import VllmBackend + return VllmBackend(self) + def init_during_runtime(self): """To complete the initialization of config, we need to know the compile context, which is only available diff --git a/vllm/platforms/tpu.py b/vllm/platforms/tpu.py index c2e22bfc09f22..643db835c85ff 100644 --- a/vllm/platforms/tpu.py +++ b/vllm/platforms/tpu.py @@ -3,8 +3,6 @@ import torch -from vllm.plugins import set_torch_compile_backend - from .interface import Platform, PlatformEnum if TYPE_CHECKING: @@ -12,8 +10,6 @@ else: VllmConfig = None -set_torch_compile_backend("openxla") - class TpuPlatform(Platform): _enum = PlatformEnum.TPU @@ -38,3 +34,6 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None: compilation_config.level = CompilationLevel.DYNAMO_ONCE assert compilation_config.level < CompilationLevel.PIECEWISE,\ "TPU does not support Inductor." 
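# Illustrative sketch (not part of the patch): how the new string-valued
# CompilationConfig.backend is meant to be used. A plain name ("eager",
# "openxla", ...) selects a backend already registered with torch._dynamo,
# while a dotted "full.module.name" is imported at runtime -- the same idea
# resolve_obj_by_qualname() implements. The custom backend path below is a
# placeholder for illustration only.
import importlib
from typing import Any


def resolve_by_qualname(qualname: str) -> Any:
    # Minimal stand-in for vllm.utils.resolve_obj_by_qualname.
    module_name, obj_name = qualname.rsplit(".", 1)
    return getattr(importlib.import_module(module_name), obj_name)


# e.g. CompilationConfig(level=2, backend="my_pkg.compile.my_backend")
# would end up doing roughly:
#   backend = resolve_by_qualname("my_pkg.compile.my_backend")
#   torch.compile(fn, backend=backend)
print(resolve_by_qualname("math.sqrt")(4.0))  # -> 2.0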
+ + if compilation_config.backend == "": + compilation_config.backend = "openxla" diff --git a/vllm/plugins/__init__.py b/vllm/plugins/__init__.py index c20b9ec891d5d..a0c73a752b5e8 100644 --- a/vllm/plugins/__init__.py +++ b/vllm/plugins/__init__.py @@ -1,6 +1,6 @@ import logging from contextlib import contextmanager -from typing import TYPE_CHECKING, Callable, Optional, Union +from typing import TYPE_CHECKING, Optional import vllm.envs as envs @@ -50,18 +50,6 @@ def load_general_plugins(): logger.exception("Failed to load plugin %s", plugin.name) -_torch_compile_backend: Optional[Union[Callable, str]] = None - - -def set_torch_compile_backend(backend: Union[Callable, str]): - global _torch_compile_backend - _torch_compile_backend = backend - - -def get_torch_compile_backend() -> Optional[Union[Callable, str]]: - return _torch_compile_backend - - _compilation_config: Optional[CompilationConfig] = None diff --git a/vllm/utils.py b/vllm/utils.py index 111460a29de47..5d0514cd9d168 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -1600,3 +1600,12 @@ def direct_register_custom_op( my_lib.impl(op_name, op_func, "CUDA") if fake_impl is not None: my_lib._register_fake(op_name, fake_impl) + + +def resolve_obj_by_qualname(qualname: str) -> Any: + """ + Resolve an object by its fully qualified name. + """ + module_name, obj_name = qualname.rsplit(".", 1) + module = importlib.import_module(module_name) + return getattr(module, obj_name) diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index fd89f95445565..fb5813651680b 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -1143,8 +1143,7 @@ def load_model(self) -> None: if self.vllm_config.compilation_config.level ==\ CompilationLevel.DYNAMO_AS_IS and supports_dynamo(): - from vllm.plugins import get_torch_compile_backend - backend = get_torch_compile_backend() or "eager" + backend = self.vllm_config.compilation_config.init_backend() self.model = torch.compile( self.model, fullgraph=envs.VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE, From c7dec926f6f1beaed759b8689373926e68867358 Mon Sep 17 00:00:00 2001 From: lkchen Date: Mon, 18 Nov 2024 00:06:16 -0800 Subject: [PATCH 181/183] [VLM] Report multi_modal_placeholders in output (#10407) Signed-off-by: Linkun Chen --- .../vision_language/test_pixtral.py | 79 ++++++++++++++++++- vllm/model_executor/models/pixtral.py | 16 +++- vllm/outputs.py | 30 +++++-- 3 files changed, 115 insertions(+), 10 deletions(-) diff --git a/tests/models/decoder_only/vision_language/test_pixtral.py b/tests/models/decoder_only/vision_language/test_pixtral.py index d8a98a0f84d3b..6233860747b9c 100644 --- a/tests/models/decoder_only/vision_language/test_pixtral.py +++ b/tests/models/decoder_only/vision_language/test_pixtral.py @@ -8,13 +8,17 @@ from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple import pytest +from mistral_common.multimodal import download_image from mistral_common.protocol.instruct.messages import ImageURLChunk from mistral_common.protocol.instruct.request import ChatCompletionRequest from mistral_common.tokens.tokenizers.mistral import MistralTokenizer from mistral_common.tokens.tokenizers.multimodal import image_from_chunk +from transformers import AutoProcessor -from vllm import EngineArgs, LLMEngine, SamplingParams, TokensPrompt +from vllm import (EngineArgs, LLMEngine, RequestOutput, SamplingParams, + TextPrompt, TokensPrompt) from vllm.multimodal import MultiModalDataBuiltins +from vllm.multimodal.inputs import PlaceholderRange from vllm.sequence import 
Logprob, SampleLogprobs from ....utils import VLLM_PATH, large_gpu_test @@ -49,6 +53,20 @@ def _create_msg_format(urls: List[str]) -> List[Dict[str, Any]]: }] +def _create_msg_format_hf(urls: List[str]) -> List[Dict[str, Any]]: + return [{ + "role": + "user", + "content": [{ + "type": "text", + "content": PROMPT, + }, *({ + "type": "image", + "image": download_image(url) + } for url in urls)], + }] + + def _create_engine_inputs(urls: List[str]) -> TokensPrompt: msg = _create_msg_format(urls) @@ -70,6 +88,23 @@ def _create_engine_inputs(urls: List[str]) -> TokensPrompt: return engine_inputs +def _create_engine_inputs_hf(urls: List[str]) -> TextPrompt: + msg = _create_msg_format_hf(urls) + + tokenizer = AutoProcessor.from_pretrained("mistral-community/pixtral-12b") + prompt = tokenizer.apply_chat_template(msg) + + images = [] + for chunk in msg[0]["content"]: + if chunk["type"] == "image": + images.append(chunk["image"]) + + mm_data = MultiModalDataBuiltins(image=images) + engine_inputs = TextPrompt(prompt=prompt, multi_modal_data=mm_data) + + return engine_inputs + + MSGS = [ _create_msg_format(IMG_URLS[:1]), _create_msg_format(IMG_URLS[:2]), @@ -191,3 +226,45 @@ def test_model_engine(vllm_runner, model: str, dtype: str) -> None: outputs_1_lst=logprobs, name_0="h100_ref", name_1="output") + + +@large_gpu_test(min_gb=24) +@pytest.mark.parametrize( + "prompt,expected_ranges", + [(_create_engine_inputs_hf(IMG_URLS[:1]), [{ + "offset": 10, + "length": 494 + }]), + (_create_engine_inputs_hf(IMG_URLS[1:4]), [{ + "offset": 10, + "length": 266 + }, { + "offset": 276, + "length": 1056 + }, { + "offset": 1332, + "length": 418 + }])]) +def test_multi_modal_placeholders( + vllm_runner, prompt, expected_ranges: list[PlaceholderRange]) -> None: + with vllm_runner( + "mistral-community/pixtral-12b", + max_model_len=8192, + limit_mm_per_prompt=LIMIT_MM_PER_PROMPT, + ) as vllm_model: + outputs = vllm_model.model.generate(prompt) + + assert len(outputs) == 1, f"{len(outputs)=}" + output: RequestOutput = outputs[0] + assert hasattr(output, + "multi_modal_placeholders"), f"{output.__dict__=}" + assert "image" in output.multi_modal_placeholders, \ + f"{output.multi_modal_placeholders.keys()=}" + image_placeholder_ranges: list[ + PlaceholderRange] = output.multi_modal_placeholders["image"] + assert len(image_placeholder_ranges) == len( + expected_ranges), f"{image_placeholder_ranges=}" + for real_range, expected_range in zip(image_placeholder_ranges, + expected_ranges): + assert real_range == expected_range, \ + f"{real_range=} {expected_range=}" diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index 307febde7eef0..d44a538d56b8c 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -30,6 +30,7 @@ from vllm.model_executor.models.utils import merge_multimodal_embeddings from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs +from vllm.multimodal.inputs import PlaceholderRange from vllm.multimodal.utils import (cached_get_tokenizer, consecutive_placeholder_ranges) from vllm.sequence import IntermediateTensors, SequenceData @@ -773,15 +774,28 @@ def input_processor_for_pixtral_hf( replace_tokens[-1] = image_end_id replace_tokens_list.append(replace_tokens) + reverse_offsets: List[int] = [] # Backward iteration for replacement without affecting known indices for placeholder_idx, replace_tokens in zip(reversed(placeholder_indices), 
reversed(replace_tokens_list)): + reverse_offsets.append( + len(new_token_ids) - placeholder_idx + len(replace_tokens)) new_token_ids[placeholder_idx:placeholder_idx + 1] = replace_tokens + placeholder_ranges: List[PlaceholderRange] = [] + for reverse_offset, replace_tokens in zip(reversed(reverse_offsets), + replace_tokens_list): + placeholder_ranges.append( + PlaceholderRange( + offset=len(new_token_ids) - reverse_offset, + length=len(replace_tokens), + )) + # NOTE: Create a defensive copy of the original inputs return token_inputs(prompt_token_ids=new_token_ids, prompt=new_prompt, - multi_modal_data=multi_modal_data) + multi_modal_data=multi_modal_data, + multi_modal_placeholders={"image": placeholder_ranges}) class PixtralHFMLP(nn.Module): diff --git a/vllm/outputs.py b/vllm/outputs.py index badf50d0602d6..4ae9b377ae693 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -5,6 +5,7 @@ from typing import Union from vllm.lora.request import LoRARequest +from vllm.multimodal.inputs import MultiModalPlaceholderDict from vllm.sampling_params import RequestOutputKind from vllm.sequence import (PromptLogprobs, RequestMetrics, SampleLogprobs, SequenceGroup, SequenceGroupBase, SequenceStatus) @@ -103,10 +104,13 @@ def __init__( encoder_prompt: Optional[str] = None, encoder_prompt_token_ids: Optional[List[int]] = None, num_cached_tokens: Optional[int] = None, + *, + multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None, ) -> None: self.request_id = request_id self.prompt = prompt self.prompt_token_ids = prompt_token_ids + self.multi_modal_placeholders = multi_modal_placeholders or {} self.prompt_logprobs = prompt_logprobs self.outputs = outputs self.finished = finished @@ -275,17 +279,26 @@ def from_seq_group( finished_time = time.time() if finished else None seq_group.set_finished_time(finished_time) - init_args = (seq_group.request_id, prompt, prompt_token_ids, - prompt_logprobs, outputs, finished, seq_group.metrics, - seq_group.lora_request, encoder_prompt, - encoder_prompt_token_ids, num_cached_tokens) + init_kwargs = { + "request_id": seq_group.request_id, + "prompt": prompt, + "prompt_token_ids": prompt_token_ids, + "prompt_logprobs": prompt_logprobs, + "outputs": outputs, + "finished": finished, + "metrics": seq_group.metrics, + "lora_request": seq_group.lora_request, + "encoder_prompt": encoder_prompt, + "encoder_prompt_token_ids": encoder_prompt_token_ids, + "num_cached_tokens": num_cached_tokens, + "multi_modal_placeholders": seq_group.multi_modal_placeholders + } if use_cache: request_output = seq_group.cached_request_output - request_output.__init__(*init_args) # type: ignore - + request_output.__init__(**init_kwargs) # type: ignore else: - request_output = cls(*init_args) + request_output = cls(**init_kwargs) # type: ignore return request_output @@ -300,7 +313,8 @@ def __repr__(self) -> str: f"finished={self.finished}, " f"metrics={self.metrics}, " f"lora_request={self.lora_request}, " - f"num_cached_tokens={self.num_cached_tokens})") + f"num_cached_tokens={self.num_cached_tokens}, " + f"multi_modal_placeholders={self.multi_modal_placeholders})") class EmbeddingRequestOutput: From 01aae1cc68d6013dd91e87418a6d82fa02c58457 Mon Sep 17 00:00:00 2001 From: Maybewuss <38156589+Maybewuss@users.noreply.github.com> Date: Mon, 18 Nov 2024 18:05:36 +0800 Subject: [PATCH 182/183] [Model] Remove redundant softmax when using PoolingType.STEP (#10415) --- vllm/model_executor/layers/pooler.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git 
a/vllm/model_executor/layers/pooler.py b/vllm/model_executor/layers/pooler.py index 6fee57a0a03eb..bfe2d7d0f382e 100644 --- a/vllm/model_executor/layers/pooler.py +++ b/vllm/model_executor/layers/pooler.py @@ -118,14 +118,13 @@ def forward( if returned_token_ids is not None and len(returned_token_ids) > 0: hidden_states = hidden_states[:, returned_token_ids] - logits = hidden_states.softmax(dim=-1) step_tag_id = self.step_tag_id offset = 0 pooled_data_lst = [] for prompt_len, seq_data_i in zip( prompt_lens, pooling_metadata.seq_data.values()): - pooled_data_i = logits[offset:offset + prompt_len] + pooled_data_i = hidden_states[offset:offset + prompt_len] if step_tag_id is not None: token_ids = torch.tensor(seq_data_i.prompt_token_ids) pooled_data_i = pooled_data_i[token_ids == step_tag_id] From 340018048d3536a14e5ad9b3cf583424eff188b7 Mon Sep 17 00:00:00 2001 From: Konrad Zawora Date: Mon, 18 Nov 2024 13:46:58 +0200 Subject: [PATCH 183/183] format.sh --- vllm/lora/models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/lora/models.py b/vllm/lora/models.py index f449879340c6c..313761c37a979 100644 --- a/vllm/lora/models.py +++ b/vllm/lora/models.py @@ -4,8 +4,8 @@ import os import re from dataclasses import dataclass, field -from typing import (Any, Callable, Dict, List, Optional, - Sequence, Tuple, Type, Union) +from typing import (Any, Callable, Dict, List, Optional, Sequence, Tuple, Type, + Union) import safetensors.torch import torch
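# Illustrative sketch (not part of the patch): what the pooler.py hunk above
# does for PoolingType.STEP -- the tensor is sliced per prompt without the
# earlier softmax, and when a step_tag_id is configured only the positions
# holding that token are kept. Shapes and ids below are placeholders.
import torch

hidden_states = torch.randn(7, 16)           # 7 prompt tokens, hidden size 16
prompt_token_ids = torch.tensor([1, 5, 9, 5, 2, 5, 3])
step_tag_id = 5

pooled = hidden_states[:7]                   # per-prompt slice (offset=0, len=7)
pooled = pooled[prompt_token_ids == step_tag_id]
assert pooled.shape == (3, 16)               # one row per step-tag occurrence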