tests: refactor model tests #1078

Merged 2 commits on Jan 6, 2025
27 changes: 24 additions & 3 deletions aphrodite/transformers_utils/tokenizers/mistral.py
@@ -177,10 +177,27 @@ def apply_chat_template(self,

def convert_tokens_to_string(self, tokens: List[str]) -> str:
if isinstance(self.tokenizer, Tekkenizer):
return "".join(t for t in tokens
if t not in self.tokenizer._all_special_tokens)
tokens = [
t for t in tokens
if t not in self.tokenizer._all_special_tokens
]
if any(isinstance(t, bytes) for t in tokens):
# we need to encode and decode all tokens again
shift = self.tokenizer.num_special_tokens
byte_tokens = [
t.encode("utf-8") if not isinstance(t, bytes) else t
for t in tokens
]
ids = [
self.tokenizer._tekken_token2id_nospecial[t] + shift
for t in byte_tokens
]
decoded = self.tokenizer.decode(ids)
else:
decoded = "".join(tokens)
else:
return self.tokenizer.decode(tokens) # type: ignore[arg-type]
decoded = self.tokenizer.decode(tokens) # type: ignore[arg-type]
return decoded

def decode(self, ids: Union[List[int], int]) -> str:
if isinstance(ids, int):
@@ -204,4 +221,8 @@ def convert_ids_to_tokens(
self.tokenizer)

tokens = [self.tokenizer.id_to_piece(id) for id in ids]
if any(t.strip() == "�" for t in tokens):
# if any stripped decoded token is undefined
# because it's invalid unicode then pass bytes
tokens = [self.tokenizer.id_to_byte_piece(id) for id in ids]
return tokens
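The fallback in `convert_ids_to_tokens` keys on U+FFFD ("�"), the replacement character that UTF-8 decoding emits for byte sequences that are not valid unicode on their own (for example, one half of a multi-byte character). A minimal, self-contained sketch of that detect-and-fall-back pattern, using plain Python rather than the Tekkenizer API (the `pieces_or_bytes` helper and its inputs are hypothetical stand-ins for `id_to_piece`/`id_to_byte_piece`):

```python
from typing import List, Union


def pieces_or_bytes(pieces: List[str],
                    byte_pieces: List[bytes]) -> Union[List[str], List[bytes]]:
    # A lone U+FFFD means a piece did not decode to valid unicode, so fall
    # back to the raw byte pieces and let the caller re-join and decode them.
    if any(p.strip() == "\ufffd" for p in pieces):
        return byte_pieces
    return pieces


# Example: the two bytes of "é" (0xC3 0xA9) split across two tokens.
raw = "é".encode("utf-8")
byte_pieces = [raw[:1], raw[1:]]
pieces = [b.decode("utf-8", errors="replace") for b in byte_pieces]
print(pieces)                                 # ['�', '�']
print(pieces_or_bytes(pieces, byte_pieces))   # [b'\xc3', b'\xa9']
```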
69 changes: 67 additions & 2 deletions tests/basic_correctness/test_basic_correctness.py
@@ -15,12 +15,15 @@
from aphrodite.worker.model_runner import ModelInputForGPUWithSamplingMetadata

from ..models.utils import check_outputs_equal
from ..utils import multi_gpu_test

MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-2-7b-hf",
]

TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")


def test_aphrodite_gc_ed():
"""Verify aphrodite instance is GC'ed when it is deleted"""
@@ -60,8 +63,68 @@ def test_models(
dtype=dtype,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7) as aphrodite_model:
aphrodite_outputs = aphrodite_model.generate_greedy(example_prompts,
max_tokens)
aphrodite_outputs = aphrodite_model.generate_greedy(
example_prompts, max_tokens)

check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=aphrodite_outputs,
name_0="hf",
name_1="aphrodite",
)


@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
"model, distributed_executor_backend, attention_backend, "
"test_suite", [
("facebook/opt-125m", "ray", "", "L4"),
("facebook/opt-125m", "mp", "", "L4"),
("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
("facebook/opt-125m", "ray", "", "A100"),
("facebook/opt-125m", "mp", "", "A100"),
("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
])
def test_models_distributed(
hf_runner,
aphrodite_runner,
example_prompts,
model: str,
distributed_executor_backend: str,
attention_backend: str,
test_suite: str,
) -> None:

if test_suite != TARGET_TEST_SUITE:
pytest.skip(f"Skip test for {test_suite}")

if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
# test ray adag
os.environ['APHRODITE_USE_RAY_SPMD_WORKER'] = "1"
os.environ['APHRODITE_USE_RAY_COMPILED_DAG'] = "1"

if attention_backend:
os.environ["APHRODITE_ATTENTION_BACKEND"] = attention_backend

dtype = "half"
max_tokens = 5

# NOTE: take care of the order. run Aphrodite first, and then run HF.
# Aphrodite needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
with aphrodite_runner(model,
dtype=dtype,
tensor_parallel_size=2,
distributed_executor_backend=distributed_executor_backend
) as aphrodite_model:
aphrodite_outputs = aphrodite_model.generate_greedy(
example_prompts, max_tokens)

with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

check_outputs_equal(
outputs_0_lst=hf_outputs,
@@ -84,8 +147,10 @@ def test_model_with_failure(aphrodite_runner) -> None:
str(exc_info.value))
assert matches is not None
filename = f"{matches.group(1)}.pkl"

with open(filename, "rb") as filep:
inputs = pickle.load(filep)

if any(key not in inputs for key in ("arg_1", "arg_2", "arg_3")):
raise AssertionError("Missing keys in dumped inputs. Dumped keys: "
f"{list(inputs.keys())}")
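The NOTE in `test_models_distributed` about running Aphrodite before HF comes down to a CUDA constraint: a CUDA context created in the parent process cannot be reused by children started with the default `fork` start method. A rough standalone sketch of the failure mode being avoided (an illustration only, not Aphrodite code; assumes `torch` is installed and a GPU is visible):

```python
import multiprocessing

import torch


def _worker(rank: int) -> None:
    # Each forked worker must create its own CUDA context.
    x = torch.ones(1, device="cuda")
    print(f"worker {rank} ok: {x.item()}")


def spawn_fork_workers(num_workers: int) -> None:
    # If the parent already initialized CUDA (as running the HF model first
    # would), fork-based workers inherit an unusable context and fail.
    assert not torch.cuda.is_initialized(), (
        "CUDA already initialized in the parent; run the fork-based engine first")
    ctx = multiprocessing.get_context("fork")
    procs = [ctx.Process(target=_worker, args=(i, )) for i in range(num_workers)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()


if __name__ == "__main__":
    spawn_fork_workers(2)
```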
106 changes: 68 additions & 38 deletions tests/basic_correctness/test_chunked_prefill.py
@@ -6,28 +6,18 @@

Run `pytest tests/models/test_chunked_prefill.py`.
"""
import os
from contextlib import nullcontext

import pytest

from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import multi_gpu_test

MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-2-7b-hf",
]
E5M2_KV_MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-2-7b-chat-hf",
]
E4M3_KV_MODELS = [
"meta-llama/Llama-2-7b-chat-hf", "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
]
KV_CACHE_QUANTIZATION_PATHS = {
"meta-llama/Llama-2-7b-chat-hf":
"./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json"
}


@pytest.mark.parametrize("model", MODELS)
@@ -68,8 +58,62 @@ def test_models(
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
) as aphrodite_model:
aphrodite_outputs = aphrodite_model.generate_greedy(example_prompts,
max_tokens)
aphrodite_outputs = aphrodite_model.generate_greedy(
example_prompts, max_tokens)

check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=aphrodite_outputs,
name_0="hf",
name_1="aphrodite",
)


@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", MODELS)
def test_models_distributed(
hf_runner,
aphrodite_runner,
example_prompts,
model: str,
distributed_executor_backend: str,
) -> None:
if (model == "meta-llama/Llama-2-7b-hf"
and distributed_executor_backend == "ray"):
# test ray adag
os.environ['APHRODITE_USE_RAY_SPMD_WORKER'] = "1"
os.environ['APHRODITE_USE_RAY_COMPILED_DAG'] = "1"

dtype = "half"
max_tokens = 5
chunked_prefill_token_size = 16

# Add a chunked prefill config.
max_num_seqs = min(chunked_prefill_token_size, 256)
assert chunked_prefill_token_size != -1
enable_chunked_prefill = True
max_num_batched_tokens = chunked_prefill_token_size

# NOTE: take care of the order. run Aphrodite first, and then run HF.
# Aphrodite needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).

with aphrodite_runner(
model,
dtype=dtype,
tensor_parallel_size=2,
max_num_seqs=max_num_seqs,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
) as aphrodite_model:
aphrodite_outputs = aphrodite_model.generate_greedy(
example_prompts, max_tokens)

with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

check_outputs_equal(
outputs_0_lst=hf_outputs,
@@ -79,13 +123,13 @@ def test_models(
)


@pytest.mark.parametrize("kv_cache_dtype,model",
[("fp8_e5m2", m)
for m in E5M2_KV_MODELS] + [("fp8_e4m3", m)
for m in E4M3_KV_MODELS])
@pytest.mark.parametrize(
"kv_cache_dtype,model",
[("fp8_e4m3",
"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme")])
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
@pytest.mark.parametrize("chunked_prefill_token_size", [4, 16])
@pytest.mark.parametrize("enforce_eager", [False, True])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@@ -105,35 +149,22 @@ def test_models_with_fp8_kv_cache(
disable_async_output_proc: bool,
) -> None:
"""
Only checks log probs match between chunked-prefill and
non-chunked-prefill version of Aphrodite model runner.

This test is used when there is discrepancy in kernels
/ numerics (e.g. when using lower-precision types like FP8).
Check output logprobs match between no_chunked_prefill and chunked_prefill
with fp8 kv cache. General fp8 kv-cache tests are covered in test_fp8.py,
so here we only check chunked prefill.
"""
NUM_LOG_PROBS = 8

if model == "facebook/opt-125m":
pytest.skip(
"#7378: CUDA illegal memory access (undiagnosed) facebook/opt-125m"
)

max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size

extra_kwargs = {}
if model in KV_CACHE_QUANTIZATION_PATHS:
extra_kwargs["quantization_param_path"] = KV_CACHE_QUANTIZATION_PATHS[
model]

with aphrodite_runner(
model,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
kv_cache_dtype=kv_cache_dtype,
disable_async_output_proc=disable_async_output_proc,
**extra_kwargs,
) as aphrodite_model:
no_chunked_prefill_outputs = aphrodite_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
@@ -147,7 +178,6 @@ def test_models_with_fp8_kv_cache(
max_num_seqs=max_num_seqs,
kv_cache_dtype=kv_cache_dtype,
disable_async_output_proc=disable_async_output_proc,
**extra_kwargs,
) as aphrodite_model:
chunked_prefill_outputs = aphrodite_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
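Because FP8 KV-cache numerics can legitimately diverge from the unquantized path, the test compares the top `NUM_LOG_PROBS` logprobs rather than exact outputs. A rough sketch of that style of tolerance check (an illustration of the idea only, not the actual `check_logprobs_close` implementation; it assumes each run exposes, per position, a dict of candidate token ids to logprobs):

```python
from typing import Dict, List


def tokens_within_topk(tokens_a: List[int],
                       logprobs_b: List[Dict[int, float]]) -> bool:
    # Run A "matches" run B at a position as long as A's sampled token is
    # among B's top-k candidates, tolerating small numerical divergence.
    return all(tok in cands for tok, cands in zip(tokens_a, logprobs_b))


# Tiny worked example with k=2 candidate sets per position.
assert tokens_within_topk([5, 9], [{5: -0.1, 7: -2.3}, {8: -0.2, 9: -0.4}])
```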
@@ -212,8 +242,8 @@ def test_with_prefix_caching(
# Send the request one-by-one to ensure the cache is populated.
with pytest.raises(ValueError) if should_fail else nullcontext():
for prompt in full_prompts:
outputs[enable] += aphrodite_model.generate_greedy(
[prompt], max_tokens)
outputs[enable] += aphrodite_model.generate_greedy([prompt],
max_tokens)

# Check results only if we did not expect a failure.
if check_result:
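In the chunked-prefill tests above, `max_num_batched_tokens` is set to `chunked_prefill_token_size`, which bounds how many prompt tokens are prefilled per scheduler step, so smaller sizes split each prompt across more steps. A back-of-the-envelope helper to make that concrete (hypothetical, not part of the test suite):

```python
import math


def prefill_steps(prompt_len: int, max_num_batched_tokens: int) -> int:
    # With chunked prefill, each scheduler step processes at most
    # max_num_batched_tokens prompt tokens, so long prompts take several steps.
    return math.ceil(prompt_len / max_num_batched_tokens)


assert prefill_steps(100, 16) == 7    # chunked_prefill_token_size = 16
assert prefill_steps(100, 4) == 25    # chunked_prefill_token_size = 4
```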
25 changes: 14 additions & 11 deletions tests/basic_correctness/test_preemption.py
@@ -9,8 +9,8 @@
import pytest
from prometheus_client import REGISTRY

import aphrodite.common.envs as envs
from aphrodite import SamplingParams
from aphrodite.executor.ray_gpu_executor import APHRODITE_USE_RAY_SPMD_WORKER
from aphrodite.processing.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
ENABLE_ARTIFICIAL_PREEMPT)

@@ -20,17 +20,20 @@
"facebook/opt-125m",
]

assert ENABLE_ARTIFICIAL_PREEMPT is True, (
"Use an env var APHRODITE_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
"`APHRODITE_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
"tests/basic_correctness/test_preemption.py`")

@pytest.fixture(scope="module", autouse=True)
def check_settings():
assert ENABLE_ARTIFICIAL_PREEMPT is True, (
"Use an env var APHRODITE_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
"`APHRODITE_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
"tests/basic_correctness/test_preemption.py`")


@pytest.fixture
def worker_use_ray() -> bool:
# When SPMD worker is used, use ray_use_worker=True
# to test delta input optimization works with preemption.
return APHRODITE_USE_RAY_SPMD_WORKER
return envs.APHRODITE_USE_RAY_SPMD_WORKER


@pytest.mark.parametrize("model", MODELS)
@@ -65,9 +68,10 @@ def test_chunked_prefill_recompute(
enable_chunked_prefill=enable_chunked_prefill,
max_num_seqs=max_num_seqs,
worker_use_ray=worker_use_ray,
disable_log_stats=False,
) as aphrodite_model:
aphrodite_outputs = aphrodite_model.generate_greedy(example_prompts,
max_tokens)
aphrodite_outputs = aphrodite_model.generate_greedy(
example_prompts, max_tokens)
assert (
aphrodite_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
@@ -106,8 +110,8 @@ def test_preemption(
disable_log_stats=False,
worker_use_ray=worker_use_ray,
) as aphrodite_model:
aphrodite_outputs = aphrodite_model.generate_greedy(example_prompts,
max_tokens)
aphrodite_outputs = aphrodite_model.generate_greedy(
example_prompts, max_tokens)
assert (
aphrodite_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
@@ -216,7 +220,6 @@ def test_swap_infeasible(
prefill_blocks = 2
decode_blocks = max_tokens // BLOCK_SIZE
example_prompts = example_prompts[:1]

with aphrodite_runner(
model,
dtype=dtype,
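The preemption change swaps a module-level assert for an autouse, module-scoped fixture. The practical difference is when the check runs: an import-time assert can break collection of the whole test session, while a fixture only fails the tests in this module. A minimal sketch of the pattern (reading the env var directly here is an assumption; the real check goes through `ENABLE_ARTIFICIAL_PREEMPT`):

```python
import os

import pytest


@pytest.fixture(scope="module", autouse=True)
def check_settings():
    # Runs once per module, before any of its tests; a failure here errors
    # the module's tests instead of aborting collection of the entire suite.
    assert os.environ.get("APHRODITE_TEST_ENABLE_ARTIFICIAL_PREEMPT") == "1", (
        "Run with APHRODITE_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
        "tests/basic_correctness/test_preemption.py")
```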