diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh
index cd2bfd8bb5bf4..b17540633225f 100755
--- a/.buildkite/run-cpu-test-ppc64le.sh
+++ b/.buildkite/run-cpu-test-ppc64le.sh
@@ -25,8 +25,7 @@ function cpu_tests() {
       decord einops librosa peft Pillow sentence-transformers soundfile \
       transformers_stream_generator matplotlib datamodel_code_generator
     pip install torchvision --index-url https://download.pytorch.org/whl/cpu
-    # Embedding models are not supported for CPU yet
-    # pytest -v -s tests/models/embedding/language
+    pytest -v -s tests/models/embedding/language
     pytest -v -s tests/models/encoder_decoder/language
     pytest -v -s tests/models/decoder_only/language/test_models.py
     pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 8d4f4d1a681f2..7a0c9dc902bae 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -32,8 +32,7 @@ function cpu_tests() {
       decord einops librosa peft Pillow sentence-transformers soundfile \
       transformers_stream_generator matplotlib datamodel_code_generator
     pip install torchvision --index-url https://download.pytorch.org/whl/cpu
-    # Embedding models are not supported for CPU yet
-    # pytest -v -s tests/models/embedding/language
+    pytest -v -s tests/models/embedding/language
     pytest -v -s tests/models/encoder_decoder/language
     pytest -v -s tests/models/decoder_only/language/test_models.py
     pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
diff --git a/tests/models/embedding/language/test_embedding.py b/tests/models/embedding/language/test_embedding.py
index 39b6bbaf43180..cd920aec6502e 100644
--- a/tests/models/embedding/language/test_embedding.py
+++ b/tests/models/embedding/language/test_embedding.py
@@ -4,6 +4,8 @@
 """
 import pytest
 
+from vllm.utils import current_platform
+
 from ..utils import check_embeddings_close
 
 # Model, Guard
@@ -21,15 +23,14 @@
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 def test_models(
-    monkeypatch,
     hf_runner,
     vllm_runner,
     example_prompts,
     model,
     dtype: str,
 ) -> None:
-    if model in ENCODER_ONLY:
-        monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
+    if model not in ENCODER_ONLY and current_platform.is_cpu():
+        pytest.skip("Skip large embedding models test on CPU.")
 
     # The example_prompts has ending "\n", for example:
     # "Write a short story about a robot that dreams for the first time.\n"
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py
index f985f70728a60..563178d3ab60d 100644
--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
@@ -158,7 +158,8 @@ def get_seq_lens(
         * Appropriate sequence lengths tensor for key & value
         '''

-        if attn_type == AttentionType.DECODER:
+        if (attn_type == AttentionType.DECODER
+                or attn_type == AttentionType.ENCODER_ONLY):
             seq_lens_q = self.seq_lens
             seq_lens_kv = self.seq_lens
         elif attn_type == AttentionType.ENCODER:
@@ -189,7 +190,8 @@ def get_attn_bias(
         * Appropriate attention bias value given the attention type
         '''

-        if attn_type == AttentionType.DECODER:
+        if (attn_type == AttentionType.DECODER
+                or attn_type == AttentionType.ENCODER_ONLY):
             return self.attn_bias
         elif attn_type == AttentionType.ENCODER:
             return self.encoder_attn_bias
@@ -215,7 +217,8 @@ def set_attn_bias(
         encoder/decoder cross-attention
         '''

-        if attn_type == AttentionType.DECODER:
+        if (attn_type == AttentionType.DECODER
+                or attn_type == AttentionType.ENCODER_ONLY):
             self.attn_bias = attn_bias
         elif attn_type == AttentionType.ENCODER:
             self.encoder_attn_bias = attn_bias
@@ -252,7 +255,8 @@ def get_seq_len_block_table_args(
         * Appropriate block tables (or None)
         '''

-        if attn_type == AttentionType.DECODER:
+        if (attn_type == AttentionType.DECODER
+                or attn_type == AttentionType.ENCODER_ONLY):
             # Decoder self-attention
             # Choose max_seq_len based on whether we are in prompt_run
             return (self.seq_lens_tensor, self.max_decode_seq_len,
@@ -420,6 +424,8 @@ def forward(
                     "Torch SDPA backend doesn't support prefix decoding.")

         if decode_meta := attn_metadata.decode_metadata:
+            assert attn_type != AttentionType.ENCODER_ONLY, (
+                "Encoder-only models should not have decode metadata.")
             # Decoding run.
             (
                 seq_lens_arg,
diff --git a/vllm/model_executor/models/bert.py b/vllm/model_executor/models/bert.py
index 614d2db8ccff6..7dbc7fa0aaba4 100644
--- a/vllm/model_executor/models/bert.py
+++ b/vllm/model_executor/models/bert.py
@@ -5,7 +5,6 @@
 from transformers import BertConfig

 from vllm.attention import Attention, AttentionMetadata, AttentionType
-from vllm.attention.backends.xformers import XFormersImpl
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
@@ -218,11 +217,6 @@ def __init__(
             quant_config=quant_config,
             prefix=f"{prefix}.attn")

-        if not isinstance(self.attn.impl, XFormersImpl):
-            raise ValueError(
-                "Encoder-only models currently require XFORMERS attention "
-                "backend. Set VLLM_ATTENTION_BACKEND=XFORMERS to use BERT.")
-
     def forward(
         self,
         hidden_states: torch.Tensor,
diff --git a/vllm/worker/cpu_embedding_model_runner.py b/vllm/worker/cpu_embedding_model_runner.py
new file mode 100644
index 0000000000000..86918fee65c5e
--- /dev/null
+++ b/vllm/worker/cpu_embedding_model_runner.py
@@ -0,0 +1,122 @@
+import dataclasses
+from typing import Any, Dict, List, Optional, Tuple, Type, Union
+
+import torch
+
+from vllm.model_executor.pooling_metadata import PoolingMetadata
+from vllm.multimodal import MultiModalKwargs
+from vllm.pooling_params import PoolingParams
+from vllm.sequence import (IntermediateTensors, PoolerOutput, SequenceData,
+                           SequenceGroupMetadata)
+from vllm.worker.cpu_model_runner import (CPUModelRunnerBase, ModelInputForCPU,
+                                          ModelInputForCPUBuilder)
+
+
+@dataclasses.dataclass(frozen=True)
+class ModelInputForCPUWithPoolingMetadata(ModelInputForCPU):
+    """
+    Used by the CPUEmbeddingModelRunner.
+    """
+    pooling_metadata: Optional["PoolingMetadata"] = None
+
+
+class CPUEmbeddingModelRunner(
+        CPUModelRunnerBase[ModelInputForCPUWithPoolingMetadata]):
+    _model_input_cls: Type[ModelInputForCPUWithPoolingMetadata] = (
+        ModelInputForCPUWithPoolingMetadata)
+    _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder
+
+    @torch.inference_mode()
+    def execute_model(
+        self,
+        model_input: ModelInputForCPUWithPoolingMetadata,
+        kv_caches: List[torch.Tensor],
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        num_steps: int = 1,
+    ) -> Optional[Union[List[PoolerOutput], IntermediateTensors]]:
+        if num_steps > 1:
+            raise ValueError(
+                "CPU worker does not support multi-step execution.")
+
+        num_layers = self.model_config.get_num_layers(self.parallel_config)
+        # use an empty tensor instead of `None` to force Dynamo to pass
+        # it by reference, rather than by specializing on the value `None`.
+        # the `dtype` argument does not matter, and we use `float32` as
+        # a placeholder (it has wide hardware support).
+        kv_caches = [
+            torch.tensor([], dtype=torch.float32, device=self.device)
+            for _ in range(num_layers)
+        ]
+
+        model_executable = self.model
+        execute_model_kwargs = {
+            "input_ids":
+            model_input.input_tokens,
+            "positions":
+            model_input.input_positions,
+            "kv_caches":
+            kv_caches,
+            "attn_metadata":
+            model_input.attn_metadata,
+            **MultiModalKwargs.as_kwargs(model_input.multi_modal_kwargs or {},
+                                         device=self.device),
+            "intermediate_tensors":
+            intermediate_tensors,
+        }
+
+        hidden_states = model_executable(**execute_model_kwargs)
+
+        return [
+            self.model.pooler(hidden_states=hidden_states,
+                              pooling_metadata=model_input.pooling_metadata)
+        ]
+
+    def make_model_input_from_broadcasted_tensor_dict(
+        self,
+        tensor_dict: Dict[str,
+                          Any]) -> ModelInputForCPUWithPoolingMetadata:
+        return ModelInputForCPUWithPoolingMetadata.from_broadcasted_tensor_dict(
+            tensor_dict,
+            attn_backend=self.attn_backend,
+        )
+
+    def prepare_model_input(
+        self,
+        seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
+        virtual_engine: int = 0,
+        finished_requests_ids: Optional[List[str]] = None
+    ) -> ModelInputForCPUWithPoolingMetadata:
+        assert seq_group_metadata_list is not None
+        model_input = self._prepare_model_input_tensors(
+            seq_group_metadata_list, finished_requests_ids)
+        # Prepare PoolingMetadata.
+        assert model_input.seq_lens is not None
+        pooling_metadata = self._prepare_pooling(seq_group_metadata_list,
+                                                 model_input.seq_lens)
+
+        return dataclasses.replace(model_input,
+                                   pooling_metadata=pooling_metadata)
+
+    def _prepare_pooling(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        prompt_lens: List[int],
+    ) -> PoolingMetadata:
+        """Prepare PoolingMetadata for the sequence group metadata list."""
+        seq_groups: List[Tuple[List[int], PoolingParams]] = []
+        for i, seq_group_metadata in enumerate(seq_group_metadata_list):
+            seq_ids = list(seq_group_metadata.seq_data.keys())
+            pooling_params = seq_group_metadata.pooling_params
+            seq_groups.append((seq_ids, pooling_params))
+
+        seq_data: Dict[int, SequenceData] = {}
+        for seq_group_metadata in seq_group_metadata_list:
+            seq_data.update(seq_group_metadata.seq_data)
+
+        pooling_metadata = PoolingMetadata(
+            seq_groups=seq_groups,
+            seq_data=seq_data,
+            prompt_lens=prompt_lens,
+        )
+
+        return pooling_metadata
diff --git a/vllm/worker/cpu_enc_dec_model_runner.py b/vllm/worker/cpu_enc_dec_model_runner.py
index 994af7c5a455f..896e948948c7a 100644
--- a/vllm/worker/cpu_enc_dec_model_runner.py
+++ b/vllm/worker/cpu_enc_dec_model_runner.py
@@ -8,7 +8,7 @@
 from vllm.multimodal import MultiModalKwargs
 from vllm.sequence import IntermediateTensors, SequenceGroupMetadata
 from vllm.utils import make_tensor_with_pad
-from vllm.worker.cpu_model_runner import (CPUModelRunner,
+from vllm.worker.cpu_model_runner import (CPUModelRunnerBase,
                                           ModelInputForCPUBuilder,
                                           ModelInputForCPUWithSamplingMetadata)
 from vllm.worker.model_runner_base import (
@@ -50,7 +50,8 @@ def from_broadcasted_tensor_dict(
             super().from_broadcasted_tensor_dict(tensor_dict, attn_backend))


-class CPUEncoderDecoderModelRunner(CPUModelRunner):
+class CPUEncoderDecoderModelRunner(
+        CPUModelRunnerBase[EncoderDecoderModelInputForCPU]):
     _model_input_cls: Type[EncoderDecoderModelInputForCPU] = (
         EncoderDecoderModelInputForCPU)
     _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder
@@ -87,10 +88,8 @@ def prepare_model_input(
         virtual_engine: int = 0,
         finished_requests_ids: Optional[List[str]] = None
     ) -> EncoderDecoderModelInputForCPU:
-        model_input = super().prepare_model_input(seq_group_metadata_list,
-                                                  virtual_engine,
-                                                  finished_requests_ids)
-        model_input = cast(EncoderDecoderModelInputForCPU, model_input)
+        model_input = self._prepare_model_input_tensors(
+            seq_group_metadata_list, finished_requests_ids)
         (
             attn_metadata,
             encoder_input_tokens_tensor,
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index 1590184d6f831..09c62fbb9875f 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -2,7 +2,8 @@
 import weakref
 from collections import defaultdict
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union
+from typing import (TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type,
+                    TypeVar, Union)

 import torch
 from torch import nn
@@ -31,6 +32,7 @@

 logger = init_logger(__name__)

+TModelInputForCPU = TypeVar('TModelInputForCPU', bound="ModelInputForCPU")
 _PAD_SLOT_ID = -1


@@ -60,10 +62,10 @@ def as_broadcastable_tensor_dict(

     @classmethod
     def from_broadcasted_tensor_dict(
-        cls: Type["ModelInputForCPU"],
+        cls: Type[TModelInputForCPU],
         tensor_dict: Dict[str, Any],
         attn_backend: Optional["AttentionBackend"] = None
-    ) -> "ModelInputForCPU":
+    ) -> TModelInputForCPU:
         if attn_backend is not None:
             tensor_dict = _init_attn_metadata_from_tensor_dict(
                 attn_backend, tensor_dict)
@@ -255,11 +257,14 @@ def _prepare_prompt(
                     slot_mapping.append(_PAD_SLOT_ID)
                     continue

-                block_number = block_table[i //
-                                           self.block_size]  # type: ignore
-                block_offset = i % self.block_size  # type: ignore
-                slot = block_number * self.block_size + block_offset
-                slot_mapping.append(slot)
+                # For encoder-only models, the block_table is None,
+                # and there is no need to initialize the slot_mapping.
+                if block_table is not None:
+                    block_number = block_table[i //
+                                               self.block_size]  # type: ignore
+                    block_offset = i % self.block_size  # type: ignore
+                    slot = block_number * self.block_size + block_offset
+                    slot_mapping.append(slot)

             if any(input_mrope_positions):
                 input_positions = None  # type: ignore
@@ -402,10 +407,12 @@ def _prepare_decode(
         )


-class CPUModelRunner(ModelRunnerBase[ModelInputForCPU]):
-    _model_input_cls: Type[ModelInputForCPUWithSamplingMetadata] = (
-        ModelInputForCPUWithSamplingMetadata)
-    _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder
+class CPUModelRunnerBase(ModelRunnerBase[TModelInputForCPU]):
+    """
+    Helper class for shared methods between CPU model runners.
+    """
+    _model_input_cls: Type[TModelInputForCPU]
+    _builder_cls: Type[ModelInputForCPUBuilder]

     def __init__(
         self,
@@ -448,20 +455,11 @@ def __init__(
     def load_model(self) -> None:
         self.model = get_model(vllm_config=self.vllm_config)

-    def make_model_input_from_broadcasted_tensor_dict(
-        self,
-        tensor_dict: Dict[str, Any],
-    ) -> ModelInputForCPUWithSamplingMetadata:
-        return ModelInputForCPUWithSamplingMetadata.from_broadcasted_tensor_dict(  # noqa: E501
-            tensor_dict,
-            attn_backend=self.attn_backend,
-        )
-
     def _prepare_model_input_tensors(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
         finished_requests_ids: Optional[List[str]] = None
-    ) -> ModelInputForCPUWithSamplingMetadata:
+    ) -> TModelInputForCPU:
         """Helper method to prepare the model input based on a given
         sequence group. Prepares metadata needed for the base model forward
         pass but not metadata for possible additional steps, e.g., sampling.
@@ -473,6 +471,21 @@ def _prepare_model_input_tensors(

         return builder.build()  # type: ignore

+
+class CPUModelRunner(CPUModelRunnerBase[ModelInputForCPUWithSamplingMetadata]):
+    _model_input_cls: Type[ModelInputForCPUWithSamplingMetadata] = (
+        ModelInputForCPUWithSamplingMetadata)
+    _builder_cls: Type[ModelInputForCPUBuilder] = ModelInputForCPUBuilder
+
+    def make_model_input_from_broadcasted_tensor_dict(
+        self,
+        tensor_dict: Dict[str, Any],
+    ) -> ModelInputForCPUWithSamplingMetadata:
+        return ModelInputForCPUWithSamplingMetadata.from_broadcasted_tensor_dict(  # noqa: E501
+            tensor_dict,
+            attn_backend=self.attn_backend,
+        )
+
     def prepare_model_input(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index 162e1e4be873b..bc9164bd9d5df 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -14,8 +14,9 @@
 from vllm.model_executor import set_random_seed
 from vllm.sequence import ExecuteModelRequest
 from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.worker.cpu_embedding_model_runner import CPUEmbeddingModelRunner
 from vllm.worker.cpu_enc_dec_model_runner import CPUEncoderDecoderModelRunner
-from vllm.worker.cpu_model_runner import CPUModelRunner
+from vllm.worker.cpu_model_runner import CPUModelRunner, CPUModelRunnerBase
 from vllm.worker.worker_base import (LocalOrDistributedWorkerBase,
                                      LoraNotSupportedWorkerBase, WorkerBase,
                                      WorkerInput)
@@ -150,21 +151,20 @@ def __init__(
         else:
             self.local_omp_cpuid = omp_cpuids.split("|")[rank]

-        ModelRunnerClass: Type[CPUModelRunner] = CPUModelRunner
+        ModelRunnerClass: Type[CPUModelRunnerBase] = CPUModelRunner
         if self.model_config.task == "embedding":
-            raise NotImplementedError(
-                "Embedding models are not supported for CPU backend")
-            # ModelRunnerClass = CPUEmbeddingModelRunner
+            ModelRunnerClass = CPUEmbeddingModelRunner
         elif self.model_config.is_encoder_decoder:
             ModelRunnerClass = CPUEncoderDecoderModelRunner
-        self.model_runner: CPUModelRunner = ModelRunnerClass(
+        self.model_runner: CPUModelRunnerBase = ModelRunnerClass(
             vllm_config=vllm_config,
             kv_cache_dtype=kv_cache_dtype,
             is_driver_worker=is_driver_worker)
         # Uninitialized cache engine. Will be initialized by
         # initialize_cache.
         self.cache_engine: List[CPUCacheEngine]
-        self.cpu_cache: List[List[torch.Tensor]]
+        # Initialize cpu_cache as embedding models don't initialize kv_caches
+        self.cpu_cache: Optional[List[List[torch.Tensor]]] = None

         # Torch profiler. Enabled and configured through env vars:
         # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
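
Note: with `CPUWorker` now dispatching `task == "embedding"` to the new `CPUEmbeddingModelRunner`, embedding models can be exercised on the CPU backend through the usual offline API. Below is a minimal smoke-test sketch, not part of the diff; the model name is only an illustrative choice, and it assumes the `task="embedding"` constructor argument and `LLM.encode()` behave as in the vLLM release this change targets.

```python
# Minimal sketch (not part of the diff): pool embeddings on the CPU backend.
# The model name is illustrative; any supported encoder-only embedding model
# should exercise the same CPUEmbeddingModelRunner path added above.
from vllm import LLM

llm = LLM(model="BAAI/bge-base-en-v1.5", task="embedding", dtype="float32")
outputs = llm.encode(["vLLM now runs embedding models on the CPU backend."])
print(len(outputs[0].outputs.embedding))  # embedding dimension of the model
```

This is the same code path that the re-enabled `tests/models/embedding/language` suite drives in the CPU CI scripts at the top of this diff.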