diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 8278101c14a3b..09140d6c152c5 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -84,7 +84,8 @@ def test_models(
         ("facebook/opt-125m", "ray", "", "A100"),
         ("facebook/opt-125m", "mp", "", "A100"),
         ("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
-        ("/mnt/weka/data/pytorch/llama3/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
+        ("/mnt/weka/data/pytorch/llama3/Meta-Llama-3-8B", "ray", "FLASHINFER",
+         "A100"),
     ])
 def test_models_distributed(
     hf_runner,
diff --git a/tests/compile/test_basic_correctness.py b/tests/compile/test_basic_correctness.py
index 887b08b6f578a..b406abfb71af1 100644
--- a/tests/compile/test_basic_correctness.py
+++ b/tests/compile/test_basic_correctness.py
@@ -13,8 +13,8 @@
 @pytest.mark.parametrize(
     "model, model_args, pp_size, tp_size, attn_backend, method, fullgraph",
     [
-        ("/mnt/weka/data/pytorch/llama3/Meta-Llama-3-8B", [], 2, 2, "FLASH_ATTN", "generate",
-         True),
+        ("/mnt/weka/data/pytorch/llama3/Meta-Llama-3-8B", [], 2, 2,
+         "FLASH_ATTN", "generate", True),
         ("nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dyn-Per-Token-2048-Samples",
          ["--quantization", "compressed-tensors"
           ], 1, 1, "FLASH_ATTN", "generate", True),
diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index 061086feb7db2..58239a2e3033a 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -274,10 +274,12 @@ def get_model_patched(*, model_config, device_config, **kwargs):
     if current_platform.is_hpu():
         with patch("vllm.worker.hpu_model_runner.get_model",
                    get_model_patched):
-            engine = vllm.LLM("/mnt/weka/data/pytorch/llama2/Llama-2-7b-hf", enable_lora=False)
+            engine = vllm.LLM("/mnt/weka/data/pytorch/llama2/Llama-2-7b-hf",
+                              enable_lora=False)
     else:
         with patch("vllm.worker.model_runner.get_model", get_model_patched):
-            engine = vllm.LLM("/mnt/weka/data/pytorch/llama2/Llama-2-7b-hf", enable_lora=False)
+            engine = vllm.LLM("/mnt/weka/data/pytorch/llama2/Llama-2-7b-hf",
+                              enable_lora=False)
 
     yield engine.llm_engine
     del engine
diff --git a/tests/lora/test_multilora_hpu.py b/tests/lora/test_multilora_hpu.py
index 5a321d044ed11..b0766fbe1c732 100644
--- a/tests/lora/test_multilora_hpu.py
+++ b/tests/lora/test_multilora_hpu.py
@@ -91,13 +91,14 @@ def process_requests(engine: LLMEngine,
 
 def _test_llama_multilora(sql_lora_files, tp_size):
     """Main function that sets up and runs the prompt processing."""
-    engine_args = EngineArgs(model="/mnt/weka/data/pytorch/llama2/Llama-2-7b-hf",
-                             enable_lora=True,
-                             max_loras=2,
-                             max_lora_rank=8,
-                             max_num_seqs=256,
-                             dtype='float32',
-                             tensor_parallel_size=tp_size)
+    engine_args = EngineArgs(
+        model="/mnt/weka/data/pytorch/llama2/Llama-2-7b-hf",
+        enable_lora=True,
+        max_loras=2,
+        max_lora_rank=8,
+        max_num_seqs=256,
+        dtype='float32',
+        tensor_parallel_size=tp_size)
     engine = LLMEngine.from_engine_args(engine_args)
     test_prompts = create_test_prompts(sql_lora_files)
     results = process_requests(engine, test_prompts)
diff --git a/tests/models/encoder_decoder/vision_language/test_broadcast.py b/tests/models/encoder_decoder/vision_language/test_broadcast.py
index 798ff6f833200..63766b5bfe5c2 100644
--- a/tests/models/encoder_decoder/vision_language/test_broadcast.py
+++ b/tests/models/encoder_decoder/vision_language/test_broadcast.py
@@ -16,7 +16,8 @@ def test_models(hf_runner, vllm_runner, image_assets,
     num_logprobs = 5
     tensor_parallel_size = 2
 
-    if model.startswith("/mnt/weka/data/pytorch/llama3.2/Llama-3.2-11B-Vision-Instruct"):
+    if model.startswith(
+            "/mnt/weka/data/pytorch/llama3.2/Llama-3.2-11B-Vision-Instruct"):
         from .test_mllama import models, run_test
     else:
         raise NotImplementedError(f"Unsupported model: {model}")
diff --git a/tests/quantization/test_cpu_offload.py b/tests/quantization/test_cpu_offload.py
index df7821037e9a7..99276906e03a7 100644
--- a/tests/quantization/test_cpu_offload.py
+++ b/tests/quantization/test_cpu_offload.py
@@ -12,10 +12,11 @@
                     reason="fp8 is not supported on this GPU type.")
 def test_cpu_offload_fp8():
     # Test quantization of an unquantized checkpoint
-    compare_two_settings("/mnt/weka/data/pytorch/llama3/Meta-Llama-3-8B-Instruct",
-                         ["--quantization", "fp8"],
-                         ["--quantization", "fp8", "--cpu-offload-gb", "2"],
-                         max_wait_seconds=480)
+    compare_two_settings(
+        "/mnt/weka/data/pytorch/llama3/Meta-Llama-3-8B-Instruct",
+        ["--quantization", "fp8"],
+        ["--quantization", "fp8", "--cpu-offload-gb", "2"],
+        max_wait_seconds=480)
     # Test loading a quantized checkpoint
     compare_two_settings("neuralmagic/Meta-Llama-3-8B-Instruct-FP8", [],
                          ["--cpu-offload-gb", "2"],
diff --git a/tests/test_sharded_state_loader.py b/tests/test_sharded_state_loader.py
index dd88976747a1e..abc7130ea4d9d 100644
--- a/tests/test_sharded_state_loader.py
+++ b/tests/test_sharded_state_loader.py
@@ -46,9 +46,10 @@ def test_filter_subtensors():
 @pytest.fixture(scope="module")
 def llama_2_7b_files():
     with TemporaryDirectory() as cache_dir:
-        input_dir = snapshot_download("/mnt/weka/data/pytorch/llama2/Llama-2-7b-hf",
-                                      cache_dir=cache_dir,
-                                      ignore_patterns="*.bin*")
+        input_dir = snapshot_download(
+            "/mnt/weka/data/pytorch/llama2/Llama-2-7b-hf",
+            cache_dir=cache_dir,
+            ignore_patterns="*.bin*")
 
         yield input_dir