diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval_iree.yaml
similarity index 58%
rename from .github/workflows/ci_eval.yaml
rename to .github/workflows/ci_eval_iree.yaml
index f7d4a8e47..ed0f7340d 100644
--- a/.github/workflows/ci_eval.yaml
+++ b/.github/workflows/ci_eval_iree.yaml
@@ -4,7 +4,7 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-name: CI - Perplexity
+name: CI - Perplexity IREE
 
 on:
   workflow_dispatch:
@@ -21,9 +21,9 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  test_perplexity_vmfb:
+  test_perplexity_iree:
     timeout-minutes: 1000
-    name: "IREE/vmfb"
+    name: "Perplexity IREE"
     strategy:
       matrix:
         version: [3.11]
@@ -71,51 +71,5 @@ jobs:
             iree-base-compiler \
             iree-base-runtime \
             "numpy<2.0"
-      - name: Run perplexity test with vmfb
-        run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py --longrun --iree-device='hip://7' --iree-hip-target=gfx942 --iree-hal-target-backends=rocm --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json
-
-  test_perplexity_torch:
-    timeout-minutes: 1000
-    name: "Torch/eager mode"
-    strategy:
-      matrix:
-        version: [3.11]
-        runs-on: [llama-mi300x-3]
-      fail-fast: false
-    runs-on: ${{matrix.runs-on}}
-    defaults:
-      run:
-        shell: bash
-    env:
-      PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache"
-      SHARK_PLATFORM_REPO_ROOT: ${{ github.workspace }}
-    steps:
-      - name: "Setting up Python"
-        id: setup_python
-        uses: actions/setup-python@v3
-        with:
-          python-version: ${{matrix.version}}
-
-      - name: "Checkout Code"
-        uses: actions/checkout@v3
-
-      - name: Cache Pip Packages
-        uses: actions/cache@v4
-        id: cache-pip
-        with:
-          path: ${{ env.PIP_CACHE_DIR }}
-          key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements.txt') }}
-
-      - name: Install sharktank deps
-        run: |
-          python -m pip install --no-compile --upgrade pip
-          # Note: We install in three steps in order to satisfy requirements
-          # from non default locations first. Installing the PyTorch CPU
-          # wheels saves multiple minutes and a lot of bandwidth on runner setup.
-          pip install --no-compile -r pytorch-cpu-requirements.txt
-          pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
-            -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
-          pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/
-
-      - name: Run perplexity test in eager mode
-        run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json
+      - name: Run perplexity test with IREE
+        run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_iree_test.py --longrun --iree-device='hip://7' --iree-hip-target=gfx942 --iree-hal-target-backends=rocm --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json
diff --git a/.github/workflows/ci_eval_torch.yaml b/.github/workflows/ci_eval_torch.yaml
new file mode 100644
index 000000000..7bda47978
--- /dev/null
+++ b/.github/workflows/ci_eval_torch.yaml
@@ -0,0 +1,68 @@
+# Copyright 2024 Advanced Micro Devices, Inc.
+#
+# Licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+name: CI - Perplexity Torch
+
+on:
+  workflow_dispatch:
+  schedule:
+    # Weekdays nightly at 07:00 UTC = 23:00 PST / 00:00 PDT.
+    - cron: "0 7 * * 1-5"
+
+concurrency:
+  # A PR number if a pull request and otherwise the commit hash. This cancels
+  # queued and in-progress runs for the same PR (presubmit) or commit
+  # (postsubmit). The workflow name is prepended to avoid conflicts between
+  # different workflows.
+  group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  test_perplexity_torch:
+    timeout-minutes: 1000
+    name: "Perplexity Torch"
+    strategy:
+      matrix:
+        version: [3.11]
+        runs-on: [llama-mi300x-3]
+      fail-fast: false
+    runs-on: ${{matrix.runs-on}}
+    defaults:
+      run:
+        shell: bash
+    env:
+      PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache"
+      SHARK_PLATFORM_REPO_ROOT: ${{ github.workspace }}
+    steps:
+      - name: "Setting up Python"
+        id: setup_python
+        uses: actions/setup-python@v3
+        with:
+          python-version: ${{matrix.version}}
+
+      - name: "Checkout Code"
+        uses: actions/checkout@v3
+
+      - name: Cache Pip Packages
+        uses: actions/cache@v4
+        id: cache-pip
+        with:
+          path: ${{ env.PIP_CACHE_DIR }}
+          key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements.txt') }}
+
+      - name: Install sharktank deps
+        run: |
+          python -m pip install --no-compile --upgrade pip
+          # Note: We install in three steps in order to satisfy requirements
+          # from non default locations first. Installing the PyTorch CPU
+          # wheels saves multiple minutes and a lot of bandwidth on runner setup.
+          pip install --no-compile -r pytorch-cpu-requirements.txt
+          pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
+            -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
+          pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/
+
+      - name: Run perplexity test with Torch
+        run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json
diff --git a/sharktank/README.md b/sharktank/README.md
index f7212333e..7d06b51b1 100644
--- a/sharktank/README.md
+++ b/sharktank/README.md
@@ -12,7 +12,7 @@ tooling.
 
 ## Project Status
 
-[![CI - Perplexity](https://github.com/nod-ai/SHARK-Platform/actions/workflows/ci_eval.yaml/badge.svg?branch=main&event=schedule)](https://github.com/nod-ai/SHARK-Platform/actions/workflows/ci_eval.yaml)
+[![CI - Perplexity Torch](https://github.com/nod-ai/SHARK-Platform/actions/workflows/ci_eval_torch.yaml/badge.svg?branch=main&event=schedule)](https://github.com/nod-ai/SHARK-Platform/actions/workflows/ci_eval_torch.yaml) [![CI - Perplexity IREE](https://github.com/nod-ai/SHARK-Platform/actions/workflows/ci_eval_iree.yaml/badge.svg?branch=main&event=schedule)](https://github.com/nod-ai/SHARK-Platform/actions/workflows/ci_eval_iree.yaml)
 
 ## Examples
 
diff --git a/sharktank/sharktank/evaluate/README.md b/sharktank/sharktank/evaluate/README.md
index 784bb24fd..640d5e630 100644
--- a/sharktank/sharktank/evaluate/README.md
+++ b/sharktank/sharktank/evaluate/README.md
@@ -9,16 +9,31 @@ pip install -r sharktank/requirements-tests.txt
 
 ### Perplexity
 
+Perplexity measures how well a language model predicts the next token in a sequence; a lower score indicates that the model has higher confidence in its predictions. Perplexity is an intrinsic evaluation metric of model quality, independent of any downstream task.
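+
+For intuition, perplexity is the exponential of the mean negative log-likelihood the model assigns to each token. A minimal sketch (not the SHARK-Platform implementation), assuming `token_log_probs` holds per-token log-probabilities produced by a model:
+
+```python
+import math
+
+def perplexity(token_log_probs: list[float]) -> float:
+    """Perplexity is exp of the mean negative log-likelihood."""
+    nll = -sum(token_log_probs) / len(token_log_probs)
+    return math.exp(nll)
+
+# Higher-confidence predictions (log-probs closer to 0) yield lower perplexity.
+print(perplexity([-0.1, -0.2, -0.15]))  # ~1.16
+print(perplexity([-2.3, -1.9, -2.7]))   # ~9.97
+```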
+
+In SHARK-Platform, we use perplexity to track code regressions and quality loss in quantized models, with FP16 as the baseline. We evaluate 100 prompts from the Wikitext-2 test set and report the mean perplexities below. These scores are not comparable across models with different tokenizers, nor with other projects, because implementations vary.
+
 Test perplexity for Llama3.1 8B & 405B (FP16 & FP8) models:
 
 ```bash
 pytest sharktank/tests/evaluate/perplexity_test.py --longrun
 ```
 
-Get perplexity for a new model:
+Calculate the perplexity for a new model:
 
 ```bash
 python -m sharktank.evaluate.perplexity \
   --gguf-file=llama3_70b_f16.gguf \
   --tokenizer-config-json=tokenizer_config.json
 ```
+
+### LLaMA 3.1 Scoreboard
+
+| CPU            | GPU    |
+|:---------------|:-------|
+| AMD EPYC 9554  | MI300X |
+
+
+| Models  | Model size (GB) | Torch perplexity | IREE perplexity |
+|:--------|:----------------|:-----------------|:----------------|
+| 8B f16  | 16.07           | 14.930181        | 14.991893       |
diff --git a/sharktank/sharktank/evaluate/perplexity_vmfb.py b/sharktank/sharktank/evaluate/perplexity_iree.py
similarity index 100%
rename from sharktank/sharktank/evaluate/perplexity_vmfb.py
rename to sharktank/sharktank/evaluate/perplexity_iree.py
diff --git a/sharktank/tests/evaluate/perplexity_vmfb_test.py b/sharktank/tests/evaluate/perplexity_iree_test.py
similarity index 95%
rename from sharktank/tests/evaluate/perplexity_vmfb_test.py
rename to sharktank/tests/evaluate/perplexity_iree_test.py
index 93ffbe61c..06ec76158 100644
--- a/sharktank/tests/evaluate/perplexity_vmfb_test.py
+++ b/sharktank/tests/evaluate/perplexity_iree_test.py
@@ -8,7 +8,7 @@
 import pytest
 import json
 
-from sharktank.evaluate import perplexity_vmfb
+from sharktank.evaluate import perplexity_iree
 
 longrun = pytest.mark.skipif("not config.getoption('longrun')")
 
@@ -35,7 +35,7 @@ def test_llama3_8B_f16_decomposed(self):
         model_name = "llama3_8B_f16_decomposed_vmfb"
         baseline_perplexity = self.baseline_perplexity[model_name]
 
-        current_perplexity = perplexity_vmfb.main(
+        current_perplexity = perplexity_iree.main(
             [
                 f"--irpa-file={self.llama3_8b_f16_model}",
                 f"--tokenizer-config-json={self.llama3_8b_tokenizer}",
@@ -70,7 +70,7 @@ def test_llama3_8B_f16(self):
         model_name = "llama3_8B_f16_vmfb"
         baseline_perplexity = self.baseline_perplexity[model_name]
 
-        current_perplexity = perplexity_vmfb.main(
+        current_perplexity = perplexity_iree.main(
             [
                 f"--irpa-file={self.llama3_8b_f16_model}",
                 f"--tokenizer-config-json={self.llama3_8b_tokenizer}",
@@ -105,7 +105,7 @@ def test_llama3_8B_fp8_decomposed(self):
         model_name = "llama3_8B_fp8_decomposed_vmfb"
         baseline_perplexity = self.baseline_perplexity[model_name]
 
-        current_perplexity = perplexity_vmfb.main(
+        current_perplexity = perplexity_iree.main(
             [
                 f"--irpa-file={self.llama3_8b_fp8_model}",
                 f"--tokenizer-config-json={self.llama3_8b_tokenizer}",
@@ -140,7 +140,7 @@ def test_llama3_8B_fp8(self):
         model_name = "llama3_8B_fp8_vmfb"
         baseline_perplexity = self.baseline_perplexity[model_name]
 
-        current_perplexity = perplexity_vmfb.main(
+        current_perplexity = perplexity_iree.main(
             [
                 f"--irpa-file={self.llama3_8b_fp8_model}",
                 f"--tokenizer-config-json={self.llama3_8b_tokenizer}",
@@ -175,7 +175,7 @@ def test_llama3_405B_f16_decomposed(self):
         model_name = "llama3_405B_f16_decomposed_vmfb"
         baseline_perplexity = self.baseline_perplexity[model_name]
 
-        current_perplexity = perplexity_vmfb.main(
+        current_perplexity = perplexity_iree.main(
[ f"--irpa-file={self.llama3_405b_f16_model}", f"--tokenizer-config-json={self.llama3_405b_tokenizer}", @@ -210,7 +210,7 @@ def test_llama3_405B_f16(self): model_name = "llama3_405B_f16_vmfb" baseline_perplexity = self.baseline_perplexity[model_name] - current_perplexity = perplexity_vmfb.main( + current_perplexity = perplexity_iree.main( [ f"--irpa-file={self.llama3_405b_f16_model}", f"--tokenizer-config-json={self.llama3_405b_tokenizer}", @@ -245,7 +245,7 @@ def test_llama3_405B_fp8_decomposed(self): model_name = "llama3_405B_fp8_decomposed_vmfb" baseline_perplexity = self.baseline_perplexity[model_name] - current_perplexity = perplexity_vmfb.main( + current_perplexity = perplexity_iree.main( [ f"--irpa-file={self.llama3_405b_fp8_model}", f"--tokenizer-config-json={self.llama3_405b_tokenizer}", @@ -280,7 +280,7 @@ def test_llama3_405B_fp8(self): model_name = "llama3_405B_fp8_vmfb" baseline_perplexity = self.baseline_perplexity[model_name] - current_perplexity = perplexity_vmfb.main( + current_perplexity = perplexity_iree.main( [ f"--irpa-file={self.llama3_405b_fp8_model}", f"--tokenizer-config-json={self.llama3_405b_tokenizer}",