diff --git a/Dockerfile.hpu b/Dockerfile.hpu
index d18fc016387bf..aa1502cc5ee8b 100644
--- a/Dockerfile.hpu
+++ b/Dockerfile.hpu
@@ -1,4 +1,4 @@
-FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+FROM vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
 
 COPY ./ /workspace/vllm
 
diff --git a/README_GAUDI.md b/README_GAUDI.md
index 22e4320eec384..08be33d54abea 100644
--- a/README_GAUDI.md
+++ b/README_GAUDI.md
@@ -11,7 +11,7 @@ Please follow the instructions provided in the [Gaudi Installation Guide](https:
 - OS: Ubuntu 22.04 LTS
 - Python: 3.10
 - Intel Gaudi accelerator
-- Intel Gaudi software version 1.18.0
+- Intel Gaudi software version 1.19.0
 
 ## Quick start using Dockerfile
 ```
@@ -44,8 +44,8 @@ It is highly recommended to use the latest Docker image from Intel Gaudi vault.
 Use the following commands to run a Docker image:
 
 ```{.console}
-$ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
-$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+$ docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
+$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
 ```
 
 ### Build and Install vLLM-fork
@@ -55,7 +55,7 @@ Currently, the latest features and performance optimizations are developed in Ga
 ```{.console}
 $ git clone https://github.com/HabanaAI/vllm-fork.git
 $ cd vllm-fork
-$ git checkout habana_main
+$ git checkout v1.19.0
 $ pip install -r requirements-hpu.txt
 $ python setup.py develop
 ```
@@ -71,11 +71,11 @@ $ python setup.py develop
 
 - Inference with [HPU Graphs](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) for accelerating low-batch latency and throughput
 - Attention with Linear Biases (ALiBi)
 - INC quantization
+- LoRA adapters
 
 # Unsupported Features
 
 - Beam search
-- LoRA adapters
 - AWQ quantization
 - Prefill chunking (mixed-batch inferencing)
@@ -112,7 +112,7 @@ Currently in vLLM for HPU we support four execution modes, depending on selected
 | 1 | 1 | PyTorch lazy mode |
 
 > [!WARNING]
-> In 1.18.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode.
+> In 1.19.0, all modes utilizing `PT_HPU_LAZY_MODE=0` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.19.0, please use HPU Graphs, or PyTorch lazy mode.
 
 ## Bucketing mechanism
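Note on the image bump: the quick-start flow is unchanged; only the base image moves to 1.19.0 / PyTorch 2.5.1. A minimal sketch of building and running the updated `Dockerfile.hpu` (the local tag `vllm-hpu-env` is illustrative, not part of this diff):

```console
# Build the vLLM HPU image from the updated Dockerfile; the tag name is a placeholder
$ docker build -f Dockerfile.hpu -t vllm-hpu-env .
# Run it with the Habana runtime, mirroring the docker run flags shown in the README
$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all --cap-add=sys_nice --net=host --ipc=host vllm-hpu-env
```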
diff --git a/docs/source/getting_started/gaudi-installation.rst b/docs/source/getting_started/gaudi-installation.rst
index 79d40293fd470..0c409d56d2ca1 100644
--- a/docs/source/getting_started/gaudi-installation.rst
+++ b/docs/source/getting_started/gaudi-installation.rst
@@ -18,7 +18,7 @@ Requirements
 - OS: Ubuntu 22.04 LTS
 - Python: 3.10
 - Intel Gaudi accelerator
-- Intel Gaudi software version 1.18.0
+- Intel Gaudi software version 1.19.0
 
 
 Quick start using Dockerfile
@@ -63,8 +63,8 @@ Use the following commands to run a Docker image:
 
 .. code:: console
 
-   $ docker pull vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
-   $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+   $ docker pull vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
+   $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
 
 Build and Install vLLM
 ~~~~~~~~~~~~~~~~~~~~~~
@@ -85,7 +85,7 @@ Currently, the latest features and performance optimizations are developed in Ga
 
    $ git clone https://github.com/HabanaAI/vllm-fork.git
    $ cd vllm-fork
-   $ git checkout habana_main
+   $ git checkout v1.19.0
    $ pip install -r requirements-hpu.txt
    $ python setup.py develop
 
@@ -107,12 +107,12 @@ Supported Features
   for accelerating low-batch latency and throughput
 - Attention with Linear Biases (ALiBi)
 - INC quantization
+- LoRA adapters
 
 Unsupported Features
 ====================
 
 - Beam search
-- LoRA adapters
 - AWQ quantization
 - Prefill chunking (mixed-batch inferencing)
 
@@ -186,7 +186,7 @@ Currently in vLLM for HPU we support four execution modes, depending on selected
      - PyTorch lazy mode
 
 .. warning::
-   In 1.18.0, all modes utilizing ``PT_HPU_LAZY_MODE=0`` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.18.0, please use HPU Graphs, or PyTorch lazy mode.
+   In 1.19.0, all modes utilizing ``PT_HPU_LAZY_MODE=0`` are highly experimental and should be only used for validating functional correctness. Their performance will be improved in the next releases. For obtaining the best performance in 1.19.0, please use HPU Graphs, or PyTorch lazy mode.
 
 Bucketing mechanism
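The warning above steers users toward HPU Graphs or PyTorch lazy mode for best performance in 1.19.0. A hedged sketch of launching the OpenAI-compatible server in those two modes, assuming the usual mapping where `PT_HPU_LAZY_MODE=1` with HPU Graphs enabled (the default) gives HPU Graphs mode, and adding `--enforce-eager` falls back to plain PyTorch lazy mode (the 1/1 row in the execution-modes table); the model name is a placeholder:

```console
# HPU Graphs mode: lazy backend with HPU Graphs (recommended by the warning above)
$ PT_HPU_LAZY_MODE=1 python -m vllm.entrypoints.openai.api_server --model <model>
# PyTorch lazy mode: same backend, graphs disabled via enforce_eager
$ PT_HPU_LAZY_MODE=1 python -m vllm.entrypoints.openai.api_server --model <model> --enforce-eager
```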
diff --git a/docs/source/serving/compatibility_matrix.rst b/docs/source/serving/compatibility_matrix.rst
index f629b3ca78318..7c0a34efd30ce 100644
--- a/docs/source/serving/compatibility_matrix.rst
+++ b/docs/source/serving/compatibility_matrix.rst
@@ -305,6 +305,7 @@ Feature x Hardware
      - Hopper
      - CPU
      - AMD
+     - Gaudi
    * - :ref:`CP `
      - `✗ `__
      - ✅
@@ -313,6 +314,7 @@ Feature x Hardware
      - ✅
      - ✗
      - ✅
+     - ✗
    * - :ref:`APC `
      - `✗ `__
      - ✅
@@ -321,6 +323,7 @@ Feature x Hardware
      - ✅
      - ✗
      - ✅
+     - ✅
    * - :ref:`LoRA `
      - ✅
      - ✅
@@ -329,6 +332,7 @@ Feature x Hardware
      - ✅
      - `✗ `__
      - ✅
+     - ✅
    * - :abbr:`prmpt adptr (Prompt Adapter)`
      - ✅
      - ✅
@@ -337,6 +341,7 @@ Feature x Hardware
      - ✅
      - `✗ `__
      - ✅
+     - ✗
    * - :ref:`SD `
      - ✅
      - ✅
@@ -345,6 +350,7 @@ Feature x Hardware
      - ✅
      - ✅
      - ✅
+     - ✅
    * - CUDA graph
      - ✅
      - ✅
@@ -353,6 +359,7 @@ Feature x Hardware
      - ✅
      - ✗
      - ✅
+     - ✗
    * - :abbr:`enc-dec (Encoder-Decoder Models)`
      - ✅
      - ✅
@@ -361,6 +368,7 @@ Feature x Hardware
      - ✅
      - ✅
      - ✗
+     - ✅
    * - :abbr:`logP (Logprobs)`
      - ✅
      - ✅
@@ -369,6 +377,7 @@ Feature x Hardware
      - ✅
      - ✅
      - ✅
+     - ✅
    * - :abbr:`prmpt logP (Prompt Logprobs)`
      - ✅
      - ✅
@@ -377,6 +386,7 @@ Feature x Hardware
      - ✅
      - ✅
      - ✅
+     - ✅
    * - :abbr:`async output (Async Output Processing)`
      - ✅
      - ✅
@@ -385,6 +395,7 @@ Feature x Hardware
      - ✅
      - ✗
      - ✗
+     - ✅
    * - multi-step
      - ✅
      - ✅
@@ -393,6 +404,7 @@ Feature x Hardware
      - ✅
      - `✗ `__
      - ✅
+     - ✅
    * - :abbr:`MM (Multimodal)`
      - ✅
      - ✅
@@ -401,6 +413,7 @@ Feature x Hardware
      - ✅
      - ✅
      - ✅
+     - ✅
    * - best-of
      - ✅
      - ✅
@@ -409,6 +422,7 @@ Feature x Hardware
      - ✅
      - ✅
      - ✅
+     - ✅
    * - beam-search
      - ✅
      - ✅
@@ -417,6 +431,7 @@ Feature x Hardware
      - ✅
      - ✅
      - ✅
+     - ✗
    * - :abbr:`guided dec (Guided Decoding)`
      - ✅
      - ✅
@@ -425,3 +440,4 @@ Feature x Hardware
      - ✅
      - ✅
      - ✅
+     - ✅
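Since the matrix now marks LoRA as supported on Gaudi, a usage note may help reviewers. A minimal sketch using vLLM's standard LoRA serving flags (base model, adapter name, and adapter path are placeholders; nothing here is Gaudi-specific beyond the lazy-mode variable):

```console
# Serve a base model with a LoRA adapter on HPU; names and paths are illustrative
$ PT_HPU_LAZY_MODE=1 python -m vllm.entrypoints.openai.api_server \
    --model <base-model> \
    --enable-lora \
    --lora-modules my-adapter=/path/to/adapter
```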