Commit bd8380b
linter fixes
ldurejko committed Nov 6, 2024
1 parent d5c12fd commit bd8380b
Showing 7 changed files with 27 additions and 20 deletions.
3 changes: 2 additions & 1 deletion tests/basic_correctness/test_basic_correctness.py
@@ -84,7 +84,8 @@ def test_models(
         ("facebook/opt-125m", "ray", "", "A100"),
         ("facebook/opt-125m", "mp", "", "A100"),
         ("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
-        ("/mnt/weka/data/pytorch/llama3/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
+        ("/mnt/weka/data/pytorch/llama3/Meta-Llama-3-8B", "ray", "FLASHINFER",
+         "A100"),
     ])
 def test_models_distributed(
     hf_runner,
4 changes: 2 additions & 2 deletions tests/compile/test_basic_correctness.py
@@ -13,8 +13,8 @@
 @pytest.mark.parametrize(
     "model, model_args, pp_size, tp_size, attn_backend, method, fullgraph",
     [
-        ("/mnt/weka/data/pytorch/llama3/Meta-Llama-3-8B", [], 2, 2, "FLASH_ATTN", "generate",
-         True),
+        ("/mnt/weka/data/pytorch/llama3/Meta-Llama-3-8B", [], 2, 2,
+         "FLASH_ATTN", "generate", True),
         ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples",
          ["--quantization", "compressed-tensors"
           ], 1, 1, "FLASH_ATTN", "generate", True),
6 changes: 4 additions & 2 deletions tests/lora/conftest.py
@@ -274,10 +274,12 @@ def get_model_patched(*, model_config, device_config, **kwargs):
     if current_platform.is_hpu():
         with patch("vllm.worker.hpu_model_runner.get_model",
                    get_model_patched):
-            engine = vllm.LLM("/mnt/weka/data/pytorch/llama2/Llama-2-7b-hf", enable_lora=False)
+            engine = vllm.LLM("/mnt/weka/data/pytorch/llama2/Llama-2-7b-hf",
+                              enable_lora=False)
     else:
         with patch("vllm.worker.model_runner.get_model", get_model_patched):
-            engine = vllm.LLM("/mnt/weka/data/pytorch/llama2/Llama-2-7b-hf", enable_lora=False)
+            engine = vllm.LLM("/mnt/weka/data/pytorch/llama2/Llama-2-7b-hf",
+                              enable_lora=False)
 
     yield engine.llm_engine
     del engine
15 changes: 8 additions & 7 deletions tests/lora/test_multilora_hpu.py
@@ -91,13 +91,14 @@ def process_requests(engine: LLMEngine,
 
 def _test_llama_multilora(sql_lora_files, tp_size):
     """Main function that sets up and runs the prompt processing."""
-    engine_args = EngineArgs(model="/mnt/weka/data/pytorch/llama2/Llama-2-7b-hf",
-                             enable_lora=True,
-                             max_loras=2,
-                             max_lora_rank=8,
-                             max_num_seqs=256,
-                             dtype='float32',
-                             tensor_parallel_size=tp_size)
+    engine_args = EngineArgs(
+        model="/mnt/weka/data/pytorch/llama2/Llama-2-7b-hf",
+        enable_lora=True,
+        max_loras=2,
+        max_lora_rank=8,
+        max_num_seqs=256,
+        dtype='float32',
+        tensor_parallel_size=tp_size)
     engine = LLMEngine.from_engine_args(engine_args)
     test_prompts = create_test_prompts(sql_lora_files)
     results = process_requests(engine, test_prompts)
3 changes: 2 additions & 1 deletion
@@ -16,7 +16,8 @@ def test_models(hf_runner, vllm_runner, image_assets,
     num_logprobs = 5
     tensor_parallel_size = 2
 
-    if model.startswith("/mnt/weka/data/pytorch/llama3.2/Llama-3.2-11B-Vision-Instruct"):
+    if model.startswith(
+            "/mnt/weka/data/pytorch/llama3.2/Llama-3.2-11B-Vision-Instruct"):
         from .test_mllama import models, run_test
     else:
         raise NotImplementedError(f"Unsupported model: {model}")
9 changes: 5 additions & 4 deletions tests/quantization/test_cpu_offload.py
@@ -12,10 +12,11 @@
                     reason="fp8 is not supported on this GPU type.")
 def test_cpu_offload_fp8():
     # Test quantization of an unquantized checkpoint
-    compare_two_settings("/mnt/weka/data/pytorch/llama3/Meta-Llama-3-8B-Instruct",
-                         ["--quantization", "fp8"],
-                         ["--quantization", "fp8", "--cpu-offload-gb", "2"],
-                         max_wait_seconds=480)
+    compare_two_settings(
+        "/mnt/weka/data/pytorch/llama3/Meta-Llama-3-8B-Instruct",
+        ["--quantization", "fp8"],
+        ["--quantization", "fp8", "--cpu-offload-gb", "2"],
+        max_wait_seconds=480)
     # Test loading a quantized checkpoint
     compare_two_settings("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", [],
                          ["--cpu-offload-gb", "2"],
7 changes: 4 additions & 3 deletions tests/test_sharded_state_loader.py
@@ -46,9 +46,10 @@ def test_filter_subtensors():
 @pytest.fixture(scope="module")
 def llama_2_7b_files():
     with TemporaryDirectory() as cache_dir:
-        input_dir = snapshot_download("/mnt/weka/data/pytorch/llama2/Llama-2-7b-hf",
-                                      cache_dir=cache_dir,
-                                      ignore_patterns="*.bin*")
+        input_dir = snapshot_download(
+            "/mnt/weka/data/pytorch/llama2/Llama-2-7b-hf",
+            cache_dir=cache_dir,
+            ignore_patterns="*.bin*")
         yield input_dir


