Cl mod benchmark (ModelCloud#944)
* add torch inference benchmark

* add torch benchmark test

* code review

* code review

* update model path

* code clean up

* update torch value

* code clean

* code clean

* code opt

* code opt

* code refactor

* code opt

* add other test

* update model

* add d_type

* fix torch dtype

* rm dtype

* cleanup

* cleanup

* code opt

* code clean up

* update model id

* update code

* update score

---------

Co-authored-by: LRL-ModelCloud <[email protected]>
CL-ModelCloud and LRL-ModelCloud authored Dec 20, 2024
1 parent 33791ce commit 94a3911
Showing 2 changed files with 21 additions and 3 deletions.
18 changes: 18 additions & 0 deletions tests/benchmark/benchmark.py
@@ -0,0 +1,18 @@
from benchmark_test import BenchmarkTest
from gptqmodel import BACKEND
from parameterized import parameterized # noqa: E402


class TestInference(BenchmarkTest):
    @parameterized.expand(
        [
            (BACKEND.TORCH, 'cuda', 292.50),
            (BACKEND.TORCH, 'cpu', 5.50),
            (BACKEND.TORCH, 'xpu', 58.20),
            (BACKEND.TORCH, 'mps', 3.40),
        ]
    )
    def test_inference(self, backend, device, tokens_per_second):
        if device == 'mps':
            self.skipTest(f"MacOS env skip")
        self.benchmark(backend=backend, device=device, tokens_per_second=tokens_per_second)
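
Each tuple above pins an expected tokens-per-second baseline for one backend/device pair, and parameterized.expand turns each tuple into its own test case; the mps case is skipped outright (the skip message indicates a non-macOS environment). Assuming pytest is the runner (the commit itself does not show how the suite is invoked), a single device's benchmark could be selected by keyword:

    pytest tests/benchmark/benchmark.py -k cuda -s   # hypothetical invocation, not part of this commit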
6 changes: 3 additions & 3 deletions tests/benchmark/benchmark_test.py
@@ -10,8 +10,8 @@


 class BenchmarkTest(unittest.TestCase):
-    MODEL_id = "/monster/data/model/gptq_4bits_11-21_15-47-09_maxlen2048_ns2048_descFalse_damp0.1"
-    MIN_NEW_TOEKNS = 100
+    MODEL_id = "/monster/data/model/Llama-3.2-1B-Instruct-gptqmodel-4bit-vortext-v1"
+    MIN_NEW_TOEKNS = 10
     NUM_RUNS = 10
     PROMPTS = [
         "I am in Paris and I",
@@ -25,7 +25,7 @@ class BenchmarkTest(unittest.TestCase):
         "Which is the most widely used Internet search engine in the world?",
         "What is the official language of France?",
     ]
-    MAX_DELTA_FLOOR_PERCENT = 0.15
+    MAX_DELTA_FLOOR_PERCENT = 0.25
     MAX_POSITIVE_DELTA_CEIL_PERCENT = 1.0

     def benchmark(self, backend, device, tokens_per_second):
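
The benchmark helper called by test_inference is defined below this hunk and is not shown in the diff. Based on the constants visible above (MIN_NEW_TOEKNS, NUM_RUNS, MAX_DELTA_FLOOR_PERCENT, MAX_POSITIVE_DELTA_CEIL_PERCENT), a minimal sketch of such a throughput check might look like the following; the GPTQModel.load call, the tokenizer attribute, and the generate interface are assumptions, not the commit's actual implementation:

    # Hypothetical sketch of a tokens-per-second check; not the actual benchmark_test.py body.
    import time

    from gptqmodel import GPTQModel  # assumed loading entry point; may differ from this commit


    def measure_tokens_per_second(model_path, backend, device, prompts,
                                  min_new_tokens=10, num_runs=10):
        # Load the quantized model on the requested backend/device (API assumed).
        model = GPTQModel.load(model_path, backend=backend, device=device)
        tokenizer = model.tokenizer  # assumption: the loaded wrapper exposes its tokenizer
        total_tokens = 0
        total_seconds = 0.0
        for _ in range(num_runs):
            for prompt in prompts:
                inputs = tokenizer(prompt, return_tensors="pt").to(device)
                start = time.time()
                output = model.generate(**inputs,
                                        min_new_tokens=min_new_tokens,
                                        max_new_tokens=min_new_tokens)
                total_seconds += time.time() - start
                total_tokens += output.shape[-1] - inputs["input_ids"].shape[-1]
        return total_tokens / total_seconds


    def assert_throughput(measured, expected, floor_pct=0.25, ceil_pct=1.0):
        # Measured throughput may fall up to floor_pct below or ceil_pct above the baseline.
        assert expected * (1 - floor_pct) <= measured <= expected * (1 + ceil_pct)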
