[sharktank] Split Perplexity CI #452

Open: wants to merge 7 commits into main
@@ -4,7 +4,7 @@
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

name: CI - Perplexity
name: CI - Perplexity IREE

on:
workflow_dispatch:
@@ -21,9 +21,9 @@ concurrency:
cancel-in-progress: true

jobs:
test_perplexity_vmfb:
test_perplexity_iree:
timeout-minutes: 1000
name: "IREE/vmfb"
name: "Perplexity IREE"
strategy:
matrix:
version: [3.11]
@@ -71,51 +71,5 @@ jobs:
iree-base-compiler \
iree-base-runtime \
"numpy<2.0"
- name: Run perplexity test with vmfb
run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py --longrun --iree-device='hip://7' --iree-hip-target=gfx942 --iree-hal-target-backends=rocm --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json

test_perplexity_torch:
timeout-minutes: 1000
name: "Torch/eager mode"
strategy:
matrix:
version: [3.11]
runs-on: [llama-mi300x-3]
fail-fast: false
runs-on: ${{matrix.runs-on}}
defaults:
run:
shell: bash
env:
PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache"
SHARK_PLATFORM_REPO_ROOT: ${{ github.workspace }}
steps:
- name: "Setting up Python"
id: setup_python
uses: actions/setup-python@v3
with:
python-version: ${{matrix.version}}

- name: "Checkout Code"
uses: actions/checkout@v3

- name: Cache Pip Packages
uses: actions/cache@v4
id: cache-pip
with:
path: ${{ env.PIP_CACHE_DIR }}
key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements.txt') }}

- name: Install sharktank deps
run: |
python -m pip install --no-compile --upgrade pip
# Note: We install in three steps in order to satisfy requirements
# from non default locations first. Installing the PyTorch CPU
# wheels saves multiple minutes and a lot of bandwidth on runner setup.
pip install --no-compile -r pytorch-cpu-requirements.txt
pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
-e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/

- name: Run perplexity test in eager mode
run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json
- name: Run perplexity test with IREE
run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_iree_test.py --longrun --iree-device='hip://7' --iree-hip-target=gfx942 --iree-hal-target-backends=rocm --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json
68 changes: 68 additions & 0 deletions .github/workflows/ci_eval_torch.yaml
@@ -0,0 +1,68 @@
# Copyright 2024 Advanced Micro Devices, Inc.
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

name: CI - Perplexity Torch

on:
workflow_dispatch:
schedule:
# Weekdays nightly at 07:00 UTC = 23:00 PST / 00:00 PDT.
- cron: "0 7 * * 1-5"

concurrency:
# A PR number if a pull request and otherwise the commit hash. This cancels
# queued and in-progress runs for the same PR (presubmit) or commit
# (postsubmit). The workflow name is prepended to avoid conflicts between
# different workflows.
group: ${{ github.workflow }}-${{ github.event.number || github.sha }}
cancel-in-progress: true

jobs:
test_perplexity_torch:
timeout-minutes: 1000
name: "Perplexity Torch"
strategy:
matrix:
version: [3.11]
runs-on: [llama-mi300x-3]
fail-fast: false
runs-on: ${{matrix.runs-on}}
defaults:
run:
shell: bash
env:
PIP_CACHE_DIR: "${{ github.workspace }}/.pip-cache"
SHARK_PLATFORM_REPO_ROOT: ${{ github.workspace }}
steps:
- name: "Setting up Python"
id: setup_python
uses: actions/setup-python@v3
with:
python-version: ${{matrix.version}}

- name: "Checkout Code"
uses: actions/checkout@v3

- name: Cache Pip Packages
uses: actions/cache@v4
id: cache-pip
with:
path: ${{ env.PIP_CACHE_DIR }}
key: pip-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('*requirements.txt') }}

- name: Install sharktank deps
run: |
python -m pip install --no-compile --upgrade pip
# Note: We install in three steps in order to satisfy requirements
# from non default locations first. Installing the PyTorch CPU
# wheels saves multiple minutes and a lot of bandwidth on runner setup.
pip install --no-compile -r pytorch-cpu-requirements.txt
pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
-e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"
pip install --no-compile -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/
Comment on lines +56 to +65
Member:
As this workflow runs nightly, we could also switch from an explicit full project build with the latest deps of all packages to a nightly release build: https://github.com/nod-ai/SHARK-Platform/blob/main/docs/nightly_releases.md#quickstart---sharktank. For a number of these workflows I think we should be testing with both the stable versions of dependencies (iree-base-compiler, iree-base-runtime, iree-turbine) and the latest nightly / source versions of each.
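
A minimal sketch of the stable-deps variant (package names as published on PyPI; the exact pins, and whether to pull sharktank itself from the nightly wheels in the linked quickstart, are assumptions):

```bash
# Illustrative only: stable IREE dependencies from PyPI instead of a full
# source build, with sharktank still installed from this checkout.
pip install iree-base-compiler iree-base-runtime iree-turbine
pip install -r requirements.txt -r sharktank/requirements-tests.txt -e sharktank/
```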

Collaborator Author:
I feel this should be a separate PR where all CIs are switched to the latest nightly release build. If not, I can update it here.

As for testing both stable and nightly versions of IREE, this CI might not be the right candidate. The 8B fp16 run currently takes about 3 hours, and we plan to add more models, quantizations, and decomposed/non-decomposed variants, stretching a single set of models to more than 12 hours.


- name: Run perplexity test with Torch
run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json
Comment on lines +67 to +68
Member:
> Split Perplexity CI nightly workflow to Torch and IREE to be able to fetch/read their status separately, providing more clarity on which repo regressed.

Can you clarify what you mean by "which repo regressed"? We should generally only be testing things we control here.

What about the logs at https://github.com/nod-ai/SHARK-Platform/actions/runs/11659531084 isn't clear?

I want to trend towards a smaller number of workflow files, not a larger one. I'm already confused enough by the list at https://github.com/nod-ai/SHARK-Platform/actions and https://github.com/nod-ai/SHARK-Platform/tree/main/.github/workflows. We have a mix of workflows defined by subproject (e.g. sharktank, shortfin, tuner), model (e.g. llama, sdxl), or by test category (e.g. perplexity, eval). There is quite a bit of overlap there, and as long as it isn't obvious which workflow a given test belongs in, people will just add a new workflow uniquely suited to that purpose. I've been refactoring workflows lately as part of rolling out packaging, and we have a substantial amount of copy/paste and eventual drift between workflows that I'm having to navigate.

Collaborator Author:
I agree that there is a better way to categorize the workflows.

The sole reason to split ci_eval.yaml is to have two workflow badges in sharktank/README.md, one for Torch and one for IREE. That way, when IREE fails we know it's an IREE regression and not a sharktank one.

I understand the logs are clear, but for our devs, workflow badges offer an easier way to stay informed about CI regressions, if any, without having to dig through the workflow logs. Like you said, there are quite a few of them.

Let me know if there is an alternate way to get two workflow badges, one per job, from a single workflow YAML without splitting it. I couldn't find one.

Member:
Thanks. I'll respond in more detail tomorrow. Re-requested review to keep it in my queue.

Member:
Ah, so https://github.com/nod-ai/SHARK-Platform/blob/main/sharktank/sharktank/evaluate/perplexity_torch.py stays in native PyTorch to test our model implementations, while https://github.com/nod-ai/SHARK-Platform/blob/main/sharktank/sharktank/evaluate/perplexity_vmfb.py exports to IREE then compiles and runs, right?

> The sole reason to split ci_eval.yaml is to have two workflow badges in sharktank/README.md, one for Torch and one for IREE. That way, when IREE fails we know it's an IREE regression and not a sharktank one.

I'm not sure I buy this argument. Taken to the extreme, we could have one workflow per test, so we know exactly which test failed based on the workflow badges.

The possible technical reasons to split the workflow are:

- finer control over workflow triggers, e.g. run IREE eval on every commit and PyTorch eval nightly, or use workflow_dispatch to run one job but not the other (see the trigger sketch below)
- monitoring via workflow badges

For the monitoring side, regressions should be rare enough and addressed quickly enough that a bit of clicking through to see which sub-job failed is pretty reasonable in my opinion.

As these are long running jobs, I do actually like splitting for the other reason.
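
For illustration, split triggers could look something like the sketch below (the cadence shown is hypothetical, not what this PR configures):

```yaml
# ci_eval_iree.yaml (hypothetical cadence): run on every push to main
on:
  push:
    branches: [main]
  workflow_dispatch:
---
# ci_eval_torch.yaml: nightly only
on:
  schedule:
    - cron: "0 7 * * 1-5"
  workflow_dispatch:
```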

Collaborator Author:
That's correct about the 2 perplexity scripts.

There are a lot of fast-moving pieces across IREE, codegen, and sharktank, which is why we feel the need for separate badges to track regressions faster. As you may have noticed, yesterday's nightly broke due to an iree-turbine regression.

2 changes: 1 addition & 1 deletion sharktank/README.md
@@ -12,7 +12,7 @@ tooling.

## Project Status

[![CI - Perplexity](https://github.com/nod-ai/SHARK-Platform/actions/workflows/ci_eval.yaml/badge.svg?branch=main&event=schedule)](https://github.com/nod-ai/SHARK-Platform/actions/workflows/ci_eval.yaml)
[![CI - Perplexity Torch](https://github.com/nod-ai/SHARK-Platform/actions/workflows/ci_eval_torch.yaml/badge.svg?branch=main&event=schedule)](https://github.com/nod-ai/SHARK-Platform/actions/workflows/ci_eval_torch.yaml) [![CI - Perplexity IREE](https://github.com/nod-ai/SHARK-Platform/actions/workflows/ci_eval_iree.yaml/badge.svg?branch=main&event=schedule)](https://github.com/nod-ai/SHARK-Platform/actions/workflows/ci_eval_iree.yaml)

## Examples

17 changes: 16 additions & 1 deletion sharktank/sharktank/evaluate/README.md
@@ -9,16 +9,31 @@ pip install -r sharktank/requirements-tests.txt

### Perplexity

The perplexity score measures the ability of a language model to predict the next token in a sequence. A lower score indicates that the model has higher certainty in its predictions. Perplexity acts as an intrinsic evaluation metric of model quality, independent of any downstream task.

In SHARK-Platform, we use perplexity to track code regressions and quality loss across quantized models (with FP16 as baseline). We use 100 prompts from the Wikitext-2 test set and calculate the mean perplexities shown below. These numbers are neither comparable between models with different tokenizers nor with other projects due to varying implementations.
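
Concretely, the perplexity reported here follows the standard definition, the exponential of the mean negative log-likelihood over the evaluated tokens (the exact prompt masking and averaging in the evaluation scripts may differ in detail):

```math
\mathrm{PPL}(x_{1 \dots N}) = \exp\left(-\frac{1}{N} \sum_{i=1}^{N} \log p_\theta\left(x_i \mid x_{<i}\right)\right)
```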

Test perplexity for Llama3.1 8B & 405B (FP16 & FP8) models:

```bash
pytest sharktank/tests/evaluate/perplexity_test.py --longrun
```

Get perplexity for a new model:
Calculate the perplexity for a new model:

```bash
python -m sharktank.evaluate.perplexity \
--gguf-file=llama3_70b_f16.gguf \
--tokenizer-config-json=tokenizer_config.json
```

### LLaMA 3.1 Scoreboard

| CPU | GPU |
|:---------------|:-----------|
| AMD EPYC 9554 | MI300X |


|Models |Model size (GB) |Torch |IREE |
|:--------|:---------------|:----------|:----------|
|8B f16 |16.07 |14.930181 |14.991893 |
@@ -8,7 +8,7 @@
import pytest
import json

from sharktank.evaluate import perplexity_vmfb
from sharktank.evaluate import perplexity_iree

longrun = pytest.mark.skipif("not config.getoption('longrun')")

@@ -35,7 +35,7 @@ def test_llama3_8B_f16_decomposed(self):
model_name = "llama3_8B_f16_decomposed_vmfb"
baseline_perplexity = self.baseline_perplexity[model_name]

current_perplexity = perplexity_vmfb.main(
current_perplexity = perplexity_iree.main(
[
f"--irpa-file={self.llama3_8b_f16_model}",
f"--tokenizer-config-json={self.llama3_8b_tokenizer}",
@@ -70,7 +70,7 @@ def test_llama3_8B_f16(self):
model_name = "llama3_8B_f16_vmfb"
baseline_perplexity = self.baseline_perplexity[model_name]

current_perplexity = perplexity_vmfb.main(
current_perplexity = perplexity_iree.main(
[
f"--irpa-file={self.llama3_8b_f16_model}",
f"--tokenizer-config-json={self.llama3_8b_tokenizer}",
@@ -105,7 +105,7 @@ def test_llama3_8B_fp8_decomposed(self):
model_name = "llama3_8B_fp8_decomposed_vmfb"
baseline_perplexity = self.baseline_perplexity[model_name]

current_perplexity = perplexity_vmfb.main(
current_perplexity = perplexity_iree.main(
[
f"--irpa-file={self.llama3_8b_fp8_model}",
f"--tokenizer-config-json={self.llama3_8b_tokenizer}",
@@ -140,7 +140,7 @@ def test_llama3_8B_fp8(self):
model_name = "llama3_8B_fp8_vmfb"
baseline_perplexity = self.baseline_perplexity[model_name]

current_perplexity = perplexity_vmfb.main(
current_perplexity = perplexity_iree.main(
[
f"--irpa-file={self.llama3_8b_fp8_model}",
f"--tokenizer-config-json={self.llama3_8b_tokenizer}",
@@ -175,7 +175,7 @@ def test_llama3_405B_f16_decomposed(self):
model_name = "llama3_405B_f16_decomposed_vmfb"
baseline_perplexity = self.baseline_perplexity[model_name]

current_perplexity = perplexity_vmfb.main(
current_perplexity = perplexity_iree.main(
[
f"--irpa-file={self.llama3_405b_f16_model}",
f"--tokenizer-config-json={self.llama3_405b_tokenizer}",
@@ -210,7 +210,7 @@ def test_llama3_405B_f16(self):
model_name = "llama3_405B_f16_vmfb"
baseline_perplexity = self.baseline_perplexity[model_name]

current_perplexity = perplexity_vmfb.main(
current_perplexity = perplexity_iree.main(
[
f"--irpa-file={self.llama3_405b_f16_model}",
f"--tokenizer-config-json={self.llama3_405b_tokenizer}",
@@ -245,7 +245,7 @@ def test_llama3_405B_fp8_decomposed(self):
model_name = "llama3_405B_fp8_decomposed_vmfb"
baseline_perplexity = self.baseline_perplexity[model_name]

current_perplexity = perplexity_vmfb.main(
current_perplexity = perplexity_iree.main(
[
f"--irpa-file={self.llama3_405b_fp8_model}",
f"--tokenizer-config-json={self.llama3_405b_tokenizer}",
@@ -280,7 +280,7 @@ def test_llama3_405B_fp8(self):
model_name = "llama3_405B_fp8_vmfb"
baseline_perplexity = self.baseline_perplexity[model_name]

current_perplexity = perplexity_vmfb.main(
current_perplexity = perplexity_iree.main(
[
f"--irpa-file={self.llama3_405b_fp8_model}",
f"--tokenizer-config-json={self.llama3_405b_tokenizer}",