
Commit

Merge remote-tracking branch 'origin/habana_main' into HEAD
kzawora-intel committed Sep 20, 2024
2 parents b2653ab + bc39baa commit 82960d8
Showing 38 changed files with 639 additions and 679 deletions.
34 changes: 34 additions & 0 deletions .github/workflows/cpu-test.yml
@@ -0,0 +1,34 @@
name: cpu-test

on:
  # Trigger the workflow on push or pull request,
  # but only for the habana_main branch
  push:
    branches:
      - habana_main
  pull_request:
    branches:
      - habana_main


jobs:
  cputest:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.10"]
    steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install torch --extra-index-url https://download.pytorch.org/whl/cpu
        pip install -r requirements-hpu.txt
        VLLM_TARGET_DEVICE=hpu python setup.py develop
    - name: cpu-test
      run: |
        VLLM_SKIP_WARMUP=true VLLM_PROMPT_SEQ_BUCKET_MAX=128 VLLM_USE_FAKE_HPU=1 python examples/offline_inference_fakehpu.py
30 changes: 19 additions & 11 deletions .github/workflows/mypy.yaml
@@ -32,15 +32,23 @@ jobs:
pip install types-setuptools
- name: Mypy
run: |
- mypy
- mypy tests --follow-imports skip
- mypy vllm/attention --follow-imports skip
- mypy vllm/distributed --follow-imports skip
- mypy vllm/engine --follow-imports skip
- mypy vllm/executor --follow-imports skip
- mypy vllm/lora --follow-imports skip
- mypy vllm/model_executor --follow-imports skip
- mypy vllm/prompt_adapter --follow-imports skip
- mypy vllm/spec_decode --follow-imports skip
- mypy vllm/worker --follow-imports skip
+ mypy tests --config-file pyproject.toml
+ mypy vllm/*.py --config-file pyproject.toml
+ mypy vllm/attention --config-file pyproject.toml
+ mypy vllm/core --config-file pyproject.toml
+ mypy vllm/distributed --config-file pyproject.toml
+ mypy vllm/engine --config-file pyproject.toml
+ mypy vllm/entrypoints --config-file pyproject.toml
+ mypy vllm/executor --config-file pyproject.toml
+ mypy vllm/inputs --config-file pyproject.toml
+ mypy vllm/logging --config-file pyproject.toml
+ mypy vllm/lora --config-file pyproject.toml
+ mypy vllm/model_executor --config-file pyproject.toml
+ mypy vllm/multimodal --config-file pyproject.toml
+ mypy vllm/platforms --config-file pyproject.toml
+ mypy vllm/spec_decode --config-file pyproject.toml
+ mypy vllm/transformers_utils --config-file pyproject.toml
+ mypy vllm/usage --config-file pyproject.toml
+ mypy vllm/worker --config-file pyproject.toml
18 changes: 18 additions & 0 deletions Dockerfile.hpu
@@ -0,0 +1,18 @@
FROM vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest

COPY ./ /workspace/vllm

WORKDIR /workspace/vllm

RUN pip install -v -r requirements-hpu.txt

ENV no_proxy=localhost,127.0.0.1
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true

RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install

WORKDIR /workspace/

RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
9 changes: 5 additions & 4 deletions README_GAUDI.md
@@ -81,14 +81,15 @@ Supported Features
- Inference with [HPU
Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html)
for accelerating low-batch latency and throughput
+ - INC quantization

Unsupported Features
====================

- Beam search
- LoRA adapters
- Attention with Linear Biases (ALiBi)
- - Quantization (AWQ, FP8 E5M2, FP8 E4M3)
+ - AWQ quantization
- Prefill chunking (mixed-batch inferencing)

Supported Configurations
@@ -315,9 +316,9 @@ mark 90% of free device memory at that point as usable. Next, KV cache
gets allocated, model is warmed up, and HPU Graphs are captured.
Environment variable `VLLM_GRAPH_RESERVED_MEM` defines the ratio of
memory reserved for HPU Graphs capture. With its default value
- (`VLLM_GRAPH_RESERVED_MEM=0.4`), 40% of usable memory will be reserved
+ (`VLLM_GRAPH_RESERVED_MEM=0.1`), 10% of usable memory will be reserved
for graph capture (later referred to as \"usable graph memory\"), and
- the remaining 60% will be utilized for KV cache. Environment variable
+ the remaining 90% will be utilized for KV cache. Environment variable
`VLLM_GRAPH_PROMPT_RATIO` determines the ratio of usable graph memory
reserved for prefill and decode graphs. By default
(`VLLM_GRAPH_PROMPT_RATIO=0.5`), both stages have equal memory
@@ -445,7 +446,7 @@ Environment variables
- `VLLM_SKIP_WARMUP`: if `true`, warmup will be skipped, `false` by
default
- `VLLM_GRAPH_RESERVED_MEM`: percentage of memory dedicated for
- HPUGraph capture, `0.4` by default
+ HPUGraph capture, `0.1` by default
- `VLLM_GRAPH_PROMPT_RATIO`: percentage of reserved graph memory
dedicated for prompt graphs, `0.5` by default
- `VLLM_GRAPH_PROMPT_STRATEGY`: strategy determining order of prompt
15 changes: 8 additions & 7 deletions docs/source/getting_started/gaudi-installation.rst
@@ -76,14 +76,15 @@ Supported Features
- Tensor parallelism support for multi-card inference
- Inference with `HPU Graphs <https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html>`__
for accelerating low-batch latency and throughput
+ - INC quantization

Unsupported Features
====================

- Beam search
- LoRA adapters
- Attention with Linear Biases (ALiBi)
- - Quantization (AWQ, FP8 E5M2, FP8 E4M3)
+ - AWQ quantization
- Prefill chunking (mixed-batch inferencing)

Supported Configurations
@@ -243,7 +244,7 @@ Before KV cache gets allocated, model weights are loaded onto the device, and a
Only after that, ``gpu_memory_utilization`` flag is utilized - at its default value, will mark 90% of free device memory at that point as usable.
Next, KV cache gets allocated, model is warmed up, and HPU Graphs are captured.
Environment variable ``VLLM_GRAPH_RESERVED_MEM`` defines the ratio of memory reserved for HPU Graphs capture.
- With its default value (``VLLM_GRAPH_RESERVED_MEM=0.4``), 40% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 60% will be utilized for KV cache.
+ With its default value (``VLLM_GRAPH_RESERVED_MEM=0.1``), 10% of usable memory will be reserved for graph capture (later referred to as "usable graph memory"), and the remaining 90% will be utilized for KV cache.
Environment variable ``VLLM_GRAPH_PROMPT_RATIO`` determines the ratio of usable graph memory reserved for prefill and decode graphs. By default (``VLLM_GRAPH_PROMPT_RATIO=0.5``), both stages have equal memory constraints.
Lower value corresponds to less usable graph memory reserved for prefill stage, e.g. ``VLLM_GRAPH_PROMPT_RATIO=0.2`` will reserve 20% of usable graph memory for prefill graphs, and 80% of usable graph memory for decode graphs.
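
As a rough worked example of the split described above (a minimal sketch: only the knob names and their defaults come from this documentation, while the 96 GiB free-memory figure and the variable names in the snippet are illustrative assumptions):

# Hypothetical worked example of the memory split described above.
# The 96 GiB free-memory figure is an assumption for illustration only.
free_device_mem_gib = 96.0
gpu_memory_utilization = 0.9   # default: 90% of free memory becomes usable
graph_reserved_mem = 0.1       # VLLM_GRAPH_RESERVED_MEM default
graph_prompt_ratio = 0.5       # VLLM_GRAPH_PROMPT_RATIO default

usable_mem = free_device_mem_gib * gpu_memory_utilization   # 86.4 GiB
usable_graph_mem = usable_mem * graph_reserved_mem          # ~8.6 GiB
kv_cache_mem = usable_mem - usable_graph_mem                # ~77.8 GiB
prompt_graph_mem = usable_graph_mem * graph_prompt_ratio    # ~4.3 GiB
decode_graph_mem = usable_graph_mem - prompt_graph_mem      # ~4.3 GiB
print(f"KV cache: {kv_cache_mem:.1f} GiB, "
      f"prompt graphs: {prompt_graph_mem:.1f} GiB, "
      f"decode graphs: {decode_graph_mem:.1f} GiB")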

@@ -322,14 +323,14 @@ Environment variables
**Performance tuning knobs:**

- ``VLLM_SKIP_WARMUP``: if ``true``, warmup will be skipped, ``false`` by default
- - ``VLLM_GRAPH_RESERVED_MEM``: percentage of memory dedicated for HPUGraph capture, ``0.4`` by default
+ - ``VLLM_GRAPH_RESERVED_MEM``: percentage of memory dedicated for HPUGraph capture, ``0.1`` by default
- ``VLLM_GRAPH_PROMPT_RATIO``: percentage of reserved graph memory dedicated for prompt graphs, ``0.5`` by default
- ``VLLM_GRAPH_PROMPT_STRATEGY``: strategy determining order of prompt graph capture, ``min_tokens`` or ``max_bs``, ``min_tokens`` by default
- ``VLLM_GRAPH_DECODE_STRATEGY``: strategy determining order of decode graph capture, ``min_tokens`` or ``max_bs``, ``max_bs`` by default
- ``VLLM_{phase}_{dim}_BUCKET_{param}`` - collection of 12 environment variables configuring ranges of bucketing mechanism

- ``{phase}`` is either ``PROMPT`` or ``DECODE``
- - ``{dim}`` is either ``BS`` or ``SEQ``
+ - ``{dim}`` is either ``BS``, ``SEQ`` or ``BLOCK``
- ``{param}`` is either ``MIN``, ``STEP`` or ``MAX``
- Default values:

@@ -345,9 +346,9 @@ Environment variables
- batch size min (``VLLM_DECODE_BS_BUCKET_MIN``): ``min(max_num_seqs, 32)``
- batch size step (``VLLM_DECODE_BS_BUCKET_STEP``): ``min(max_num_seqs, 32)``
- batch size max (``VLLM_DECODE_BS_BUCKET_MAX``): ``max_num_seqs``
- - sequence length min (``VLLM_DECODE_SEQ_BUCKET_MIN``): ``128``
- - sequence length step (``VLLM_DECODE_SEQ_BUCKET_STEP``): ``128``
- - sequence length max (``VLLM_DECODE_SEQ_BUCKET_MAX``): ``max(128, (max_num_seqs*max_model_len)/block_size)``
+ - sequence length min (``VLLM_DECODE_BLOCK_BUCKET_MIN``): ``128``
+ - sequence length step (``VLLM_DECODE_BLOCK_BUCKET_STEP``): ``128``
+ - sequence length max (``VLLM_DECODE_BLOCK_BUCKET_MAX``): ``max(128, (max_num_seqs*max_model_len)/block_size)``
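
As an illustration of how the ``VLLM_{phase}_{dim}_BUCKET_{param}`` names compose and how the decode defaults above are derived, here is a minimal sketch; the ``bucket_var`` helper and the example values of ``max_num_seqs``, ``max_model_len`` and ``block_size`` are assumptions for illustration, not vLLM defaults:

# Hypothetical sketch: expand the bucketing variable names and compute the
# decode defaults listed above for one assumed configuration.
max_num_seqs, max_model_len, block_size = 64, 2048, 128

def bucket_var(phase: str, dim: str, param: str) -> str:
    assert phase in ("PROMPT", "DECODE")
    assert dim in ("BS", "SEQ", "BLOCK")
    assert param in ("MIN", "STEP", "MAX")
    return f"VLLM_{phase}_{dim}_BUCKET_{param}"

decode_defaults = {
    bucket_var("DECODE", "BS", "MIN"): min(max_num_seqs, 32),    # 32
    bucket_var("DECODE", "BS", "STEP"): min(max_num_seqs, 32),   # 32
    bucket_var("DECODE", "BS", "MAX"): max_num_seqs,             # 64
    bucket_var("DECODE", "BLOCK", "MIN"): 128,
    bucket_var("DECODE", "BLOCK", "STEP"): 128,
    bucket_var("DECODE", "BLOCK", "MAX"):
        max(128, (max_num_seqs * max_model_len) // block_size),  # 1024
}
for name, value in decode_defaults.items():
    print(f"{name}={value}")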


Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM execution:
38 changes: 38 additions & 0 deletions examples/offline_inference_fakehpu.py
@@ -0,0 +1,38 @@
import os

from vllm import LLM, SamplingParams

if os.environ.get('VLLM_USE_FAKE_HPU', '0') != '0':
    from vllm.utils import migrate_to_cpu
    migrate_to_cpu()

# Sample prompts.
prompts = [
"Berlin is the capital city of ",
"Louvre is located in the city of ",
"Barack Obama was the 44th president of ",
"Warsaw is the capital city of ",
"Gniezno is a city in ",
"San Francisco is located in the state of ",
"Llanfairpwllgwyngyll is located in country of ",
]
ref_answers = [
"Germany", "Paris", "United States", "Poland", "Poland", "California",
"Wales"
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0, n=1, use_beam_search=False)

# Create an LLM.
llm = LLM(model="facebook/opt-125m", max_model_len=32, max_num_seqs=4)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output, answer in zip(outputs, ref_answers):
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    assert answer in generated_text, (
        f"The generated text does not contain the correct answer: {answer}")
print('PASSED')
29 changes: 17 additions & 12 deletions format.sh
@@ -96,18 +96,23 @@ echo 'vLLM yapf: Done'

# Run mypy
echo 'vLLM mypy:'
- mypy --follow-imports skip # Note that this is less strict than CI
- mypy tests --follow-imports skip
- mypy vllm/attention --follow-imports skip
- mypy vllm/distributed --follow-imports skip
- mypy vllm/engine --follow-imports skip
- mypy vllm/executor --follow-imports skip
- mypy vllm/lora --follow-imports skip
- mypy vllm/model_executor --follow-imports skip
- mypy vllm/prompt_adapter --follow-imports skip
- mypy vllm/spec_decode --follow-imports skip
- mypy vllm/worker --follow-imports skip
- echo 'vLLM mypy: Done'
+ mypy tests --config-file pyproject.toml
+ mypy vllm/*.py --config-file pyproject.toml
+ mypy vllm/attention --config-file pyproject.toml
+ mypy vllm/core --config-file pyproject.toml
+ mypy vllm/distributed --config-file pyproject.toml
+ mypy vllm/engine --config-file pyproject.toml
+ mypy vllm/entrypoints --config-file pyproject.toml
+ mypy vllm/executor --config-file pyproject.toml
+ mypy vllm/logging --config-file pyproject.toml
+ mypy vllm/lora --config-file pyproject.toml
+ mypy vllm/model_executor --config-file pyproject.toml
+ mypy vllm/multimodal --config-file pyproject.toml
+ mypy vllm/prompt_adapter --config-file pyproject.toml
+ mypy vllm/spec_decode --config-file pyproject.toml
+ mypy vllm/transformers_utils --config-file pyproject.toml
+ mypy vllm/usage --config-file pyproject.toml
+ mypy vllm/worker --config-file pyproject.toml


# If git diff returns a file that is in the skip list, the file may be checked anyway:
1 change: 1 addition & 0 deletions requirements-hpu.txt
@@ -6,3 +6,4 @@ ray == 2.32.0
triton
pandas
tabulate
+ vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@30ee2d1
2 changes: 1 addition & 1 deletion tests/lora/test_lora_hpu.py
@@ -1,6 +1,6 @@
import pytest
import torch
- from vllm.hpu.ops import LoraMask
+ from vllm_hpu_extension.ops import LoraMask

from vllm.hpu.punica_hpu import GaudiPunicaWrapper

61 changes: 60 additions & 1 deletion tests/samplers/test_sampler.py
@@ -9,7 +9,7 @@
from transformers import GenerationConfig, GenerationMixin

import vllm.envs as envs
- from vllm.model_executor.layers.sampler import Sampler
+ from vllm.model_executor.layers.sampler import ApplyToppTopkScalar, Sampler
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.model_executor.utils import set_random_seed
from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, SamplingParams,
@@ -781,3 +781,62 @@ def test_sampler_include_gpu_probs_tensor(device: str):
assert sampler_output.sampled_token_probs is not None
assert sampler_output.logprobs is not None
assert sampler_output.sampled_token_ids is not None

@pytest.mark.parametrize("device", CUDA_DEVICES)
def test_topk_topk_scalar():
    obj1 = ApplyToppTopkScalar(2)
    assert ApplyToppTopkScalar._padded_k == 0
    x = torch.tensor([[9, 9, 8, 8, 8, 8, 7, 7, 7.0],
                      [10, 10, 9, 9, 9, 8, 5, 5, 5]])

    retval1 = obj1(x, p=0.9, k=5)
    ninf = -float("inf")
    expected1 = torch.tensor([[9., 9., 8., 8., 8., 8., ninf, ninf, ninf],
                              [10., 10., 9., 9., 9., ninf, ninf, ninf, ninf]])
    assert torch.all(retval1 == expected1).item()
    assert ApplyToppTopkScalar._padded_k == 9

    obj2 = ApplyToppTopkScalar(2)
    assert obj2._padded_k == 9

    x = torch.tensor([[2, 2, 9, 9, 2, 2, 1, 1, 1.0],
                      [10, 9, 9, 5, 9, 9, 5, 9, 10]])
    retval2 = obj2(x, p=0.9, k=5)
    expected2 = torch.tensor(
        [[ninf, ninf, 9., 9., ninf, ninf, ninf, ninf, ninf],
         [10., ninf, 9., ninf, 9., 9., ninf, 9., 10.]])
    assert torch.all(retval2 == expected2).item()
    assert obj2._padded_k == 9

    retval3 = obj2(x, p=1.0, k=5)
    expected3 = torch.tensor([[2., 2., 9., 9., 2., 2., ninf, ninf, ninf],
                              [10., 9., 9., ninf, 9., 9., ninf, 9., 10.]])

    assert torch.all(retval3 == expected3).item()

    # this should not be done in general, doing it here for testing purposes
    ApplyToppTopkScalar._padded_k = 0
    x = torch.tensor([[1, 1, 1, 9, 8, 1, 1, 1, 1.0],
                      [2, 1, 2, 2, 1, 1, 1, 1, 1]])
    obj3 = ApplyToppTopkScalar(2)
    retval4 = obj3(x, p=0.9, k=2)
    expected4 = torch.tensor(
        [[ninf, ninf, ninf, 9., 8., ninf, ninf, ninf, ninf],
         [2., ninf, 2., 2., ninf, ninf, ninf, ninf, ninf]])
    assert torch.all(retval4 == expected4).item()
    assert obj3._padded_k == 4
    y = torch.tensor([[8, 8, 8, 9, 8, 1, 1, 1, 1.0],
                      [2, 1, 2, 2, 1, 1, 1, 1, 1]])
    retval5 = obj3(y, p=0.9, k=2)
    assert obj3._padded_k == 8
    expected5 = torch.tensor([[8., 8., 8., 9., 8., ninf, ninf, ninf, ninf],
                              [2., ninf, 2., 2., ninf, ninf, ninf, ninf,
                               ninf]])
    assert torch.all(retval5 == expected5).item()
    y = torch.tensor([[8, 8, 8, 9, 8, 8, 1, 1, 1.0],
                      [2, 1, 2, 2, 3, 1, 1, 1, 1]])
    retval6 = obj3(y, p=0.9, k=2)
    expected6 = torch.tensor([[8., 8., 8., 9., 8., 8., ninf, ninf, ninf],
                              [2., ninf, 2., 2., 3., ninf, ninf, ninf, ninf]])
    assert torch.all(retval6 == expected6).item()
    assert obj3._padded_k == 8
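
For readers unfamiliar with the operation these tests exercise, the following is a rough, generic sketch of combined top-k and top-p (nucleus) masking of logits, assuming standard semantics. It is not the padded ApplyToppTopkScalar implementation from the sampler, and the topk_topp_mask helper name is made up for illustration.

import torch

def topk_topp_mask(logits: torch.Tensor, k: int, p: float) -> torch.Tensor:
    """Keep logits at least as large as the k-th largest in their row, then
    drop the tail whose cumulative softmax probability exceeds p; masked
    entries become -inf."""
    ninf = float("-inf")
    kth = torch.topk(logits, k, dim=-1).values[..., -1:]
    masked = torch.where(logits < kth, torch.full_like(logits, ninf), logits)
    sorted_vals, sorted_idx = torch.sort(masked, descending=True, dim=-1)
    probs = torch.softmax(sorted_vals, dim=-1)
    cum = torch.cumsum(probs, dim=-1)
    sorted_vals[(cum - probs) > p] = ninf  # always keeps the top token
    return torch.full_like(logits, ninf).scatter(-1, sorted_idx, sorted_vals)

print(topk_topp_mask(torch.tensor([[9, 9, 8, 8, 8, 8, 7, 7, 7.0]]), k=5, p=0.9))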
4 changes: 4 additions & 0 deletions vllm/__init__.py
@@ -1,4 +1,8 @@
"""vLLM: a high-throughput and memory-efficient inference engine for LLMs"""
from vllm.utils import is_fake_hpu, migrate_to_cpu

if is_fake_hpu():
    migrate_to_cpu()

from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
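
A minimal usage sketch of this new fake-HPU hook, assuming only what the diff and examples/offline_inference_fakehpu.py show; the environment variable can equally be set from the shell, as .github/workflows/cpu-test.yml does.

# Enable the fake-HPU path before vllm is imported, so that is_fake_hpu()
# is true and migrate_to_cpu() runs at import time.
import os
os.environ["VLLM_USE_FAKE_HPU"] = "1"

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m", max_model_len=32, max_num_seqs=4)
outputs = llm.generate(["Berlin is the capital city of "],
                       SamplingParams(temperature=0, n=1))
print(outputs[0].outputs[0].text)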
(Diffs for the remaining changed files are not shown in this view.)
