tests: refactor model tests #1078

Merged 2 commits on Jan 6, 2025
27 changes: 24 additions & 3 deletions aphrodite/transformers_utils/tokenizers/mistral.py
@@ -177,10 +177,27 @@ def apply_chat_template(self,

def convert_tokens_to_string(self, tokens: List[str]) -> str:
if isinstance(self.tokenizer, Tekkenizer):
return "".join(t for t in tokens
if t not in self.tokenizer._all_special_tokens)
tokens = [
t for t in tokens
if t not in self.tokenizer._all_special_tokens
]
if any(isinstance(t, bytes) for t in tokens):
# we need to encode and decode all tokens again
shift = self.tokenizer.num_special_tokens
byte_tokens = [
t.encode("utf-8") if not isinstance(t, bytes) else t
for t in tokens
]
ids = [
self.tokenizer._tekken_token2id_nospecial[t] + shift
for t in byte_tokens
]
decoded = self.tokenizer.decode(ids)
else:
decoded = "".join(tokens)
else:
return self.tokenizer.decode(tokens) # type: ignore[arg-type]
decoded = self.tokenizer.decode(tokens) # type: ignore[arg-type]
return decoded

def decode(self, ids: Union[List[int], int]) -> str:
if isinstance(ids, int):
@@ -204,4 +221,8 @@ def convert_ids_to_tokens(
self.tokenizer)

tokens = [self.tokenizer.id_to_piece(id) for id in ids]
if any(t.strip() == "�" for t in tokens):
# if any stripped decoded token is undefined
# because it's invalid unicode then pass bytes
tokens = [self.tokenizer.id_to_byte_piece(id) for id in ids]
return tokens
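The fallback in `convert_ids_to_tokens` keys on U+FFFD ("�"), the replacement character that UTF-8 decoding emits for byte sequences that are not valid unicode on their own (for example, one half of a multi-byte character). A minimal, self-contained sketch of that detect-and-fall-back pattern, using plain Python rather than the Tekkenizer API (the `pieces_or_bytes` helper and its inputs are hypothetical stand-ins for `id_to_piece`/`id_to_byte_piece`):

```python
from typing import List, Union


def pieces_or_bytes(pieces: List[str],
                    byte_pieces: List[bytes]) -> Union[List[str], List[bytes]]:
    # A lone U+FFFD means a piece did not decode to valid unicode, so fall
    # back to the raw byte pieces and let the caller re-join and decode them.
    if any(p.strip() == "\ufffd" for p in pieces):
        return byte_pieces
    return pieces


# Example: the two bytes of "é" (0xC3 0xA9) split across two tokens.
raw = "é".encode("utf-8")
byte_pieces = [raw[:1], raw[1:]]
pieces = [b.decode("utf-8", errors="replace") for b in byte_pieces]
print(pieces)                                 # ['�', '�']
print(pieces_or_bytes(pieces, byte_pieces))   # [b'\xc3', b'\xa9']
```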
69 changes: 67 additions & 2 deletions tests/basic_correctness/test_basic_correctness.py
@@ -15,12 +15,15 @@
from aphrodite.worker.model_runner import ModelInputForGPUWithSamplingMetadata

from ..models.utils import check_outputs_equal
from ..utils import multi_gpu_test

MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-2-7b-hf",
]

TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")


def test_aphrodite_gc_ed():
"""Verify aphrodite instance is GC'ed when it is deleted"""
@@ -60,8 +63,68 @@ def test_models(
dtype=dtype,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7) as aphrodite_model:
aphrodite_outputs = aphrodite_model.generate_greedy(example_prompts,
max_tokens)
aphrodite_outputs = aphrodite_model.generate_greedy(
example_prompts, max_tokens)

check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=aphrodite_outputs,
name_0="hf",
name_1="aphrodite",
)


@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
"model, distributed_executor_backend, attention_backend, "
"test_suite", [
("facebook/opt-125m", "ray", "", "L4"),
("facebook/opt-125m", "mp", "", "L4"),
("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
("facebook/opt-125m", "ray", "", "A100"),
("facebook/opt-125m", "mp", "", "A100"),
("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
])
def test_models_distributed(
hf_runner,
aphrodite_runner,
example_prompts,
model: str,
distributed_executor_backend: str,
attention_backend: str,
test_suite: str,
) -> None:

if test_suite != TARGET_TEST_SUITE:
pytest.skip(f"Skip test for {test_suite}")

if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
# test ray adag
os.environ['APHRODITE_USE_RAY_SPMD_WORKER'] = "1"
os.environ['APHRODITE_USE_RAY_COMPILED_DAG'] = "1"

if attention_backend:
os.environ["APHRODITE_ATTENTION_BACKEND"] = attention_backend

dtype = "half"
max_tokens = 5

# NOTE: take care of the order. run Aphrodite first, and then run HF.
# Aphrodite needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
with aphrodite_runner(model,
dtype=dtype,
tensor_parallel_size=2,
distributed_executor_backend=distributed_executor_backend
) as aphrodite_model:
aphrodite_outputs = aphrodite_model.generate_greedy(
example_prompts, max_tokens)

with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

check_outputs_equal(
outputs_0_lst=hf_outputs,
@@ -84,8 +147,10 @@ def test_model_with_failure(aphrodite_runner) -> None:
str(exc_info.value))
assert matches is not None
filename = f"{matches.group(1)}.pkl"

with open(filename, "rb") as filep:
inputs = pickle.load(filep)

if any(key not in inputs for key in ("arg_1", "arg_2", "arg_3")):
raise AssertionError("Missing keys in dumped inputs. Dumped keys: "
f"{list(inputs.keys())}")
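The NOTE in `test_models_distributed` about running Aphrodite before HF comes down to a CUDA constraint: a CUDA context created in the parent process cannot be reused by children started with the default `fork` start method. A rough standalone sketch of the failure mode being avoided (an illustration only, not Aphrodite code; assumes `torch` is installed and a GPU is visible):

```python
import multiprocessing

import torch


def _worker(rank: int) -> None:
    # Each forked worker must create its own CUDA context.
    x = torch.ones(1, device="cuda")
    print(f"worker {rank} ok: {x.item()}")


def spawn_fork_workers(num_workers: int) -> None:
    # If the parent already initialized CUDA (as running the HF model first
    # would), fork-based workers inherit an unusable context and fail.
    assert not torch.cuda.is_initialized(), (
        "CUDA already initialized in the parent; run the fork-based engine first")
    ctx = multiprocessing.get_context("fork")
    procs = [ctx.Process(target=_worker, args=(i, )) for i in range(num_workers)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()


if __name__ == "__main__":
    spawn_fork_workers(2)
```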
106 changes: 68 additions & 38 deletions tests/basic_correctness/test_chunked_prefill.py
@@ -6,28 +6,18 @@

Run `pytest tests/models/test_chunked_prefill.py`.
"""
import os
from contextlib import nullcontext

import pytest

from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import multi_gpu_test

MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-2-7b-hf",
]
E5M2_KV_MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-2-7b-chat-hf",
]
E4M3_KV_MODELS = [
"meta-llama/Llama-2-7b-chat-hf", "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme"
]
KV_CACHE_QUANTIZATION_PATHS = {
"meta-llama/Llama-2-7b-chat-hf":
"./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json"
}


@pytest.mark.parametrize("model", MODELS)
@@ -68,8 +58,62 @@ def test_models(
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
) as aphrodite_model:
aphrodite_outputs = aphrodite_model.generate_greedy(example_prompts,
max_tokens)
aphrodite_outputs = aphrodite_model.generate_greedy(
example_prompts, max_tokens)

check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=aphrodite_outputs,
name_0="hf",
name_1="aphrodite",
)


@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", MODELS)
def test_models_distributed(
hf_runner,
aphrodite_runner,
example_prompts,
model: str,
distributed_executor_backend: str,
) -> None:
if (model == "meta-llama/Llama-2-7b-hf"
and distributed_executor_backend == "ray"):
# test ray adag
os.environ['APHRODITE_USE_RAY_SPMD_WORKER'] = "1"
os.environ['APHRODITE_USE_RAY_COMPILED_DAG'] = "1"

dtype = "half"
max_tokens = 5
chunked_prefill_token_size = 16

# Add a chunked prefill config.
max_num_seqs = min(chunked_prefill_token_size, 256)
assert chunked_prefill_token_size != -1
enable_chunked_prefill = True
max_num_batched_tokens = chunked_prefill_token_size

# NOTE: take care of the order. run Aphrodite first, and then run HF.
# Aphrodite needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).

with aphrodite_runner(
model,
dtype=dtype,
tensor_parallel_size=2,
max_num_seqs=max_num_seqs,
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
) as aphrodite_model:
aphrodite_outputs = aphrodite_model.generate_greedy(
example_prompts, max_tokens)

with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

check_outputs_equal(
outputs_0_lst=hf_outputs,
@@ -79,13 +123,13 @@ def test_models(
)


@pytest.mark.parametrize("kv_cache_dtype,model",
[("fp8_e5m2", m)
for m in E5M2_KV_MODELS] + [("fp8_e4m3", m)
for m in E4M3_KV_MODELS])
@pytest.mark.parametrize(
"kv_cache_dtype,model",
[("fp8_e4m3",
"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme")])
# Due to low-precision numerical divergence, we only test logprob of 4 tokens
@pytest.mark.parametrize("max_tokens", [4])
@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
@pytest.mark.parametrize("chunked_prefill_token_size", [4, 16])
@pytest.mark.parametrize("enforce_eager", [False, True])
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
@@ -105,35 +149,22 @@ def test_models_with_fp8_kv_cache(
disable_async_output_proc: bool,
) -> None:
"""
Only checks log probs match between chunked-prefill and
non-chunked-prefill version of Aphrodite model runner.

This test is used when there is discrepancy in kernels
/ numerics (e.g. when using lower-precision types like FP8).
Check output logprobs match between no_chunked_prefill and chunked_prefill
with fp8 kv cache. General fp8 kv-cache tests are covered in test_fp8.py,
so here we only check chunked prefill.
"""
NUM_LOG_PROBS = 8

if model == "facebook/opt-125m":
pytest.skip(
"#7378: CUDA illegal memory access (undiagnosed) facebook/opt-125m"
)

max_num_seqs = chunked_prefill_token_size
max_num_batched_tokens = chunked_prefill_token_size

extra_kwargs = {}
if model in KV_CACHE_QUANTIZATION_PATHS:
extra_kwargs["quantization_param_path"] = KV_CACHE_QUANTIZATION_PATHS[
model]

with aphrodite_runner(
model,
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
kv_cache_dtype=kv_cache_dtype,
disable_async_output_proc=disable_async_output_proc,
**extra_kwargs,
) as aphrodite_model:
no_chunked_prefill_outputs = aphrodite_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
@@ -147,7 +178,6 @@ def test_models_with_fp8_kv_cache(
max_num_seqs=max_num_seqs,
kv_cache_dtype=kv_cache_dtype,
disable_async_output_proc=disable_async_output_proc,
**extra_kwargs,
) as aphrodite_model:
chunked_prefill_outputs = aphrodite_model.generate_greedy_logprobs(
example_prompts, max_tokens, NUM_LOG_PROBS)
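Because FP8 KV-cache numerics can legitimately diverge from the unquantized path, the test compares the top `NUM_LOG_PROBS` logprobs rather than exact outputs. A rough sketch of that style of tolerance check (an illustration of the idea only, not the actual `check_logprobs_close` implementation; it assumes each run exposes, per position, a dict of candidate token ids to logprobs):

```python
from typing import Dict, List


def tokens_within_topk(tokens_a: List[int],
                       logprobs_b: List[Dict[int, float]]) -> bool:
    # Run A "matches" run B at a position as long as A's sampled token is
    # among B's top-k candidates, tolerating small numerical divergence.
    return all(tok in cands for tok, cands in zip(tokens_a, logprobs_b))


# Tiny worked example with k=2 candidate sets per position.
assert tokens_within_topk([5, 9], [{5: -0.1, 7: -2.3}, {8: -0.2, 9: -0.4}])
```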
@@ -212,8 +242,8 @@ def test_with_prefix_caching(
# Send the request one-by-one to ensure the cache is populated.
with pytest.raises(ValueError) if should_fail else nullcontext():
for prompt in full_prompts:
outputs[enable] += aphrodite_model.generate_greedy(
[prompt], max_tokens)
outputs[enable] += aphrodite_model.generate_greedy([prompt],
max_tokens)

# Check results only if we did not expect a failure.
if check_result:
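In the chunked-prefill tests above, `max_num_batched_tokens` is set to `chunked_prefill_token_size`, which bounds how many prompt tokens are prefilled per scheduler step, so smaller sizes split each prompt across more steps. A back-of-the-envelope helper to make that concrete (hypothetical, not part of the test suite):

```python
import math


def prefill_steps(prompt_len: int, max_num_batched_tokens: int) -> int:
    # With chunked prefill, each scheduler step processes at most
    # max_num_batched_tokens prompt tokens, so long prompts take several steps.
    return math.ceil(prompt_len / max_num_batched_tokens)


assert prefill_steps(100, 16) == 7    # chunked_prefill_token_size = 16
assert prefill_steps(100, 4) == 25    # chunked_prefill_token_size = 4
```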
25 changes: 14 additions & 11 deletions tests/basic_correctness/test_preemption.py
@@ -9,8 +9,8 @@
import pytest
from prometheus_client import REGISTRY

import aphrodite.common.envs as envs
from aphrodite import SamplingParams
from aphrodite.executor.ray_gpu_executor import APHRODITE_USE_RAY_SPMD_WORKER
from aphrodite.processing.scheduler import (ARTIFICIAL_PREEMPTION_MAX_CNT,
ENABLE_ARTIFICIAL_PREEMPT)

@@ -20,17 +20,20 @@
"facebook/opt-125m",
]

assert ENABLE_ARTIFICIAL_PREEMPT is True, (
"Use an env var APHRODITE_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
"`APHRODITE_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
"tests/basic_correctness/test_preemption.py`")

@pytest.fixture(scope="module", autouse=True)
def check_settings():
assert ENABLE_ARTIFICIAL_PREEMPT is True, (
"Use an env var APHRODITE_TEST_ENABLE_ARTIFICIAL_PREEMPT=1. "
"`APHRODITE_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
"tests/basic_correctness/test_preemption.py`")


@pytest.fixture
def worker_use_ray() -> bool:
# When SPMD worker is used, use ray_use_worker=True
# to test delta input optimization works with preemption.
return APHRODITE_USE_RAY_SPMD_WORKER
return envs.APHRODITE_USE_RAY_SPMD_WORKER


@pytest.mark.parametrize("model", MODELS)
@@ -65,9 +68,10 @@ def test_chunked_prefill_recompute(
enable_chunked_prefill=enable_chunked_prefill,
max_num_seqs=max_num_seqs,
worker_use_ray=worker_use_ray,
disable_log_stats=False,
) as aphrodite_model:
aphrodite_outputs = aphrodite_model.generate_greedy(example_prompts,
max_tokens)
aphrodite_outputs = aphrodite_model.generate_greedy(
example_prompts, max_tokens)
assert (
aphrodite_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
@@ -106,8 +110,8 @@ def test_preemption(
disable_log_stats=False,
worker_use_ray=worker_use_ray,
) as aphrodite_model:
aphrodite_outputs = aphrodite_model.generate_greedy(example_prompts,
max_tokens)
aphrodite_outputs = aphrodite_model.generate_greedy(
example_prompts, max_tokens)
assert (
aphrodite_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
@@ -216,7 +220,6 @@ def test_swap_infeasible(
prefill_blocks = 2
decode_blocks = max_tokens // BLOCK_SIZE
example_prompts = example_prompts[:1]

with aphrodite_runner(
model,
dtype=dtype,
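The preemption change swaps a module-level assert for an autouse, module-scoped fixture. The practical difference is when the check runs: an import-time assert can break collection of the whole test session, while a fixture only fails the tests in this module. A minimal sketch of the pattern (reading the env var directly here is an assumption; the real check goes through `ENABLE_ARTIFICIAL_PREEMPT`):

```python
import os

import pytest


@pytest.fixture(scope="module", autouse=True)
def check_settings():
    # Runs once per module, before any of its tests; a failure here errors
    # the module's tests instead of aborting collection of the entire suite.
    assert os.environ.get("APHRODITE_TEST_ENABLE_ARTIFICIAL_PREEMPT") == "1", (
        "Run with APHRODITE_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest "
        "tests/basic_correctness/test_preemption.py")
```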