Nov 6 rebase (sans vllm-project#6143) (#468)
This PR adds all commits before vllm-project#6143 without vllm-project#6143.
kzawora-intel authored Nov 6, 2024
2 parents 5812cb6 + 8e62377 commit 5eb7f3d
Showing 164 changed files with 1,313 additions and 922 deletions.
@@ -56,7 +56,7 @@

 def read_markdown(file):
     if os.path.exists(file):
-        with open(file, "r") as f:
+        with open(file) as f:
             return f.read() + "\n"
     else:
         return f"{file} not found.\n"
@@ -75,14 +75,14 @@ def results_to_json(latency, throughput, serving):
 # collect results
 for test_file in results_folder.glob("*.json"):
 
-    with open(test_file, "r") as f:
+    with open(test_file) as f:
         raw_result = json.loads(f.read())
 
     if "serving" in str(test_file):
         # this result is generated via `benchmark_serving.py`
 
         # attach the benchmarking command to raw_result
-        with open(test_file.with_suffix(".commands"), "r") as f:
+        with open(test_file.with_suffix(".commands")) as f:
            command = json.loads(f.read())
         raw_result.update(command)

@@ -97,7 +97,7 @@ def results_to_json(latency, throughput, serving):
         # this result is generated via `benchmark_latency.py`
 
         # attach the benchmarking command to raw_result
-        with open(test_file.with_suffix(".commands"), "r") as f:
+        with open(test_file.with_suffix(".commands")) as f:
            command = json.loads(f.read())
         raw_result.update(command)

@@ -119,7 +119,7 @@ def results_to_json(latency, throughput, serving):
         # this result is generated via `benchmark_throughput.py`
 
         # attach the benchmarking command to raw_result
-        with open(test_file.with_suffix(".commands"), "r") as f:
+        with open(test_file.with_suffix(".commands")) as f:
            command = json.loads(f.read())
         raw_result.update(command)

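These scripts pair every result JSON with a `.commands` sidecar file via `pathlib.Path.with_suffix`, which swaps only the final extension. A small sketch (file names are hypothetical):

```python
from pathlib import Path

test_file = Path("results/serving_llama8B_tp1.json")  # hypothetical result file
commands_file = test_file.with_suffix(".commands")    # replaces only ".json"

print(commands_file)  # results/serving_llama8B_tp1.commands
```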
@@ -72,15 +72,15 @@ def main(args):

 # collect results
 for test_file in results_folder.glob("*_nightly_results.json"):
-    with open(test_file, "r") as f:
+    with open(test_file) as f:
         results = results + json.loads(f.read())
 
 # generate markdown table
 df = pd.DataFrame.from_dict(results)
 
 md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
 
-with open(args.description, "r") as f:
+with open(args.description) as f:
     description = f.read()
 
 description = description.format(
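For context on the `md_table` line above: `tabulate` accepts a pandas DataFrame directly, and `tablefmt='pipe'` emits a GitHub-flavored Markdown table. A sketch with made-up data (output shown approximately):

```python
import pandas as pd
from tabulate import tabulate

# Hypothetical nightly-benchmark rows.
df = pd.DataFrame({"Test name": ["llama8B_tp1"], "Tput (req/s)": [12.3]})

print(tabulate(df, headers="keys", tablefmt="pipe", showindex=False))
# | Test name   |   Tput (req/s) |
# |:------------|---------------:|
# | llama8B_tp1 |           12.3 |
```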
@@ -36,11 +36,11 @@
 # collect results
 for test_file in results_folder.glob("*.json"):
 
-    with open(test_file, "r") as f:
+    with open(test_file) as f:
         raw_result = json.loads(f.read())
 
     # attach the benchmarking command to raw_result
-    with open(test_file.with_suffix(".commands"), "r") as f:
+    with open(test_file.with_suffix(".commands")) as f:
         command = json.loads(f.read())
     raw_result.update(command)
2 changes: 1 addition & 1 deletion .buildkite/run-openvino-test.sh
@@ -11,4 +11,4 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py
+docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py
4 changes: 2 additions & 2 deletions .buildkite/test-pipeline.yaml
@@ -321,15 +321,14 @@ steps:
   - tests/models/decoder_only/language
   commands:
   - pytest -v -s models/decoder_only/language/test_models.py
-  - pytest -v -s models/decoder_only/language/test_big_models.py
 
 - label: Decoder-only Language Models Test (Extended) # 1h20min
   nightly: true
   source_file_dependencies:
   - vllm/
   - tests/models/decoder_only/language
   commands:
-  - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py --ignore=models/decoder_only/language/test_big_models.py
+  - pytest -v -s models/decoder_only/language --ignore=models/decoder_only/language/test_models.py
 
 - label: Decoder-only Multi-Modal Models Test (Standard)
   #mirror_hardwares: [amd]
@@ -511,6 +510,7 @@ steps:
   # NOTE: don't test llama model here, it seems hf implementation is buggy
   # see https://github.com/vllm-project/vllm/pull/5689 for details
   - pytest -v -s distributed/test_custom_all_reduce.py
+  - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
   - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus
   - pytest -v -s -x lora/test_mixtral.py

2 changes: 2 additions & 0 deletions .github/workflows/actionlint.yml
@@ -6,12 +6,14 @@ on:
     paths:
       - '.github/workflows/*.ya?ml'
       - '.github/workflows/actionlint.*'
+      - '.github/workflows/matchers/actionlint.json'
   pull_request:
     branches:
       - "main"
     paths:
       - '.github/workflows/*.ya?ml'
       - '.github/workflows/actionlint.*'
+      - '.github/workflows/matchers/actionlint.json'
 
 env:
   LC_ALL: en_US.UTF-8
12 changes: 12 additions & 0 deletions .github/workflows/clang-format.yml
@@ -6,9 +6,21 @@ on:
   push:
     branches:
       - habana_main
+    paths:
+      - '**/*.h'
+      - '**/*.cpp'
+      - '**/*.cu'
+      - '**/*.cuh'
+      - '.github/workflows/clang-format.yml'
   pull_request:
     branches:
       - habana_main
+    paths:
+      - '**/*.h'
+      - '**/*.cpp'
+      - '**/*.cu'
+      - '**/*.cuh'
+      - '.github/workflows/clang-format.yml'
 
 jobs:
   clang-format:
19 changes: 18 additions & 1 deletion .github/workflows/mypy.yaml
@@ -5,17 +5,34 @@ on:
   # but only for the habana_main branch
   push:
     branches:
+<<<<<<< HEAD
       - habana_main
   pull_request:
     branches:
       - habana_main
+=======
+      - main
+    paths:
+      - '**/*.py'
+      - '.github/workflows/mypy.yaml'
+      - 'tools/mypy.sh'
+      - 'pyproject.toml'
+  pull_request:
+    branches:
+      - main
+    paths:
+      - '**/*.py'
+      - '.github/workflows/mypy.yaml'
+      - 'tools/mypy.sh'
+      - 'pyproject.toml'
+>>>>>>> a5fda50a10641e47c0c290907f30ef2add6d4e7a
 
 jobs:
   mypy:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.9", "3.10", "3.11", "3.12"]
     steps:
       - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
       - name: Set up Python ${{ matrix.python-version }}
2 changes: 1 addition & 1 deletion .github/workflows/publish.yml
@@ -48,7 +48,7 @@ jobs:
       fail-fast: false
       matrix:
         os: ['ubuntu-20.04']
-        python-version: ['3.8', '3.9', '3.10', '3.11', '3.12']
+        python-version: ['3.9', '3.10', '3.11', '3.12']
         pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt.
         cuda-version: ['11.8', '12.1']
49 changes: 29 additions & 20 deletions .github/workflows/ruff.yml
@@ -6,33 +6,42 @@ on:
   push:
     branches:
       - habana_main
+    paths:
+      - "**/*.py"
+      - pyproject.toml
+      - requirements-lint.txt
+      - .github/workflows/matchers/ruff.json
+      - .github/workflows/ruff.yml
   pull_request:
     branches:
       - habana_main
+    paths:
+      - "**/*.py"
+      - pyproject.toml
+      - requirements-lint.txt
+      - .github/workflows/matchers/ruff.json
+      - .github/workflows/ruff.yml
 
 jobs:
   ruff:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.12"]
     steps:
-      - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
-        with:
-          python-version: ${{ matrix.python-version }}
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install -r requirements-lint.txt
-      - name: Analysing the code with ruff
-        run: |
-          echo "::add-matcher::.github/workflows/matchers/ruff.json"
-          ruff check --output-format github .
-      - name: Spelling check with codespell
-        run: |
-          codespell --toml pyproject.toml
-      - name: Run isort
-        run: |
-          isort . --check-only
+      - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements-lint.txt
+      - name: Analysing the code with ruff
+        run: |
+          echo "::add-matcher::.github/workflows/matchers/ruff.json"
+          ruff check --output-format github .
+      - name: Run isort
+        run: |
+          isort . --check-only
34 changes: 20 additions & 14 deletions .github/workflows/yapf.yml
@@ -6,27 +6,33 @@ on:
   push:
     branches:
      - habana_main
+    paths:
+      - "**/*.py"
+      - .github/workflows/yapf.yml
   pull_request:
     branches:
       - habana_main
+    paths:
+      - "**/*.py"
+      - .github/workflows/yapf.yml
 
 jobs:
   yapf:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.12"]
     steps:
-      - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
-        with:
-          python-version: ${{ matrix.python-version }}
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install yapf==0.32.0
-          pip install toml==0.10.2
-      - name: Running yapf
-        run: |
-          yapf --diff --recursive .
+      - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install yapf==0.32.0
+          pip install toml==0.10.2
+      - name: Running yapf
+        run: |
+          yapf --diff --recursive .
20 changes: 8 additions & 12 deletions .jenkins/lm-eval-harness/test_lm_eval_correctness.py
@@ -76,18 +76,14 @@ def report_performance(task, input_lens, output_lens, time, record_property):
     context_lens = [i + o for i, o in zip(input_lens, output_lens)]
     gen_tput = sum(output_lens) / time
     all_lens = [input_lens, output_lens, context_lens]
-    min_input_tokens, min_output_tokens, min_context_tokens = [
-        min(x) for x in all_lens
-    ]
-    max_input_tokens, max_output_tokens, max_context_tokens = [
-        max(x) for x in all_lens
-    ]
-    mean_input_tokens, mean_output_tokens, mean_context_tokens = [
-        statistics.mean(x) for x in all_lens
-    ]
-    stddev_input_tokens, stddev_output_tokens, stddev_context_tokens = [
-        statistics.stdev(x) for x in all_lens
-    ]
+    min_input_tokens, min_output_tokens, min_context_tokens = (
+        min(x) for x in all_lens)
+    max_input_tokens, max_output_tokens, max_context_tokens = (
+        max(x) for x in all_lens)
+    mean_input_tokens, mean_output_tokens, mean_context_tokens = (
+        statistics.mean(x) for x in all_lens)
+    stddev_input_tokens, stddev_output_tokens, stddev_context_tokens = (
+        statistics.stdev(x) for x in all_lens)
     msg = (
         f'{task} | estimated average generation throughput: {gen_tput:.2f} tokens/s \n'  # noqa: G004, E501
         f'{task} | input_tokens | min: {min_input_tokens} | max: {max_input_tokens} | mean: {mean_input_tokens:.2f} | stddev: {stddev_input_tokens:.2f}\n'  # noqa: E501
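The refactor above replaces four throwaway list comprehensions with generator expressions: tuple unpacking consumes the generator directly, so the intermediate lists are never materialized. A sketch with hypothetical token counts:

```python
import statistics

input_lens, output_lens = [10, 20, 30], [5, 7, 9]
context_lens = [i + o for i, o in zip(input_lens, output_lens)]  # [15, 27, 39]
all_lens = [input_lens, output_lens, context_lens]

# Unpacking a 3-element generator works exactly like unpacking a 3-element list,
# but skips building the intermediate list.
mean_in, mean_out, mean_ctx = (statistics.mean(x) for x in all_lens)
print(mean_in, mean_out, mean_ctx)  # 20 7 27
```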
11 changes: 5 additions & 6 deletions .readthedocs.yaml
@@ -6,17 +6,16 @@ version: 2
 build:
   os: ubuntu-22.04
   tools:
-    python: "3.8"
+    python: '3.9'
 
 sphinx:
-  configuration: docs/source/conf.py
-  fail_on_warning: true
+  configuration: docs/source/conf.py
+  fail_on_warning: true
 
 # If using Sphinx, optionally build your docs in additional formats such as PDF
 formats: []
 
 # Optionally declare the Python requirements required to build your docs
 python:
-  install:
-    - requirements: docs/requirements-docs.txt
-
+  install:
+    - requirements: docs/requirements-docs.txt
