From 28df6fd061a47b988dd1acd83f6d0e3cfeb521f8 Mon Sep 17 00:00:00 2001
From: Konrad Zawora
Date: Mon, 23 Sep 2024 17:41:45 +0300
Subject: [PATCH] prune more easy parts

---
 .github/workflows/cpu-test.yml        |  34 ----
 examples/lora_inference_hpu.py        |  47 -----
 examples/offline_inference_fakehpu.py |  38 ----
 tests/lora/test_llama_hpu.py          | 100 ----------
 tests/lora/test_lora_hpu.py           | 260 --------------------------
 tests/lora/test_multilora_hpu.py      | 130 -------------
 6 files changed, 609 deletions(-)
 delete mode 100644 .github/workflows/cpu-test.yml
 delete mode 100644 examples/lora_inference_hpu.py
 delete mode 100644 examples/offline_inference_fakehpu.py
 delete mode 100644 tests/lora/test_llama_hpu.py
 delete mode 100644 tests/lora/test_lora_hpu.py
 delete mode 100644 tests/lora/test_multilora_hpu.py

diff --git a/.github/workflows/cpu-test.yml b/.github/workflows/cpu-test.yml
deleted file mode 100644
index 89a702f9751d9..0000000000000
--- a/.github/workflows/cpu-test.yml
+++ /dev/null
@@ -1,34 +0,0 @@
-name: cpu-test
-
-on:
-  # Trigger the workflow on push or pull request,
-  # but only for the habana_main branch
-  push:
-    branches:
-      - habana_main
-  pull_request:
-    branches:
-      - habana_main
-
-
-jobs:
-  cputest:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.10"]
-    steps:
-    - uses: actions/checkout@v2
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v2
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install torch --extra-index-url https://download.pytorch.org/whl/cpu
-        pip install -r requirements-hpu.txt
-        VLLM_TARGET_DEVICE=hpu python setup.py develop
-    - name: cpu-test
-      run: |
-        VLLM_SKIP_WARMUP=true VLLM_PROMPT_SEQ_BUCKET_MAX=128 VLLM_USE_FAKE_HPU=1 python examples/offline_inference_fakehpu.py
diff --git a/examples/lora_inference_hpu.py b/examples/lora_inference_hpu.py
deleted file mode 100644
index b8154a29a82bb..0000000000000
--- a/examples/lora_inference_hpu.py
+++ /dev/null
@@ -1,47 +0,0 @@
-from huggingface_hub import snapshot_download
-
-from vllm import LLM, SamplingParams
-from vllm.lora.request import LoRARequest
-
-sql_lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
-
-llm = LLM(model="meta-llama/Llama-2-7b-hf",
-          enable_lora=True,
-          max_num_seqs=2,
-          dtype='bfloat16')
-
-sampling_params = SamplingParams(temperature=0,
-                                 max_tokens=1024,
-                                 stop=["[/assistant]"])
-
-prompts = [
-    "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
-    "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",  # noqa: E501
-    "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]",  # noqa: E501
-    "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]",  # noqa: E501
-    "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]",  # noqa: E501
-    "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]"  # noqa: E501
-]
-
-expected_output = [
-    " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ",  # noqa: E501
-    " SELECT nationality FROM table_name_11 WHERE elector = 'Anchero Pantaleone' ",  # noqa: E501
-    " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ",  # noqa: E501
-    " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ",  # noqa: E501
-    " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ",  # noqa: E501
-    " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' "  # noqa: E501
-]
-
-outputs = llm.generate(prompts,
-                       sampling_params,
-                       lora_request=LoRARequest("sql_adapter", 1,
-                                                sql_lora_path))
-
-for i, output in enumerate(outputs):
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    match = expected_output[i] == generated_text
-    if not match:
-        print(
-            f"Comparison failed for request_id::{i}\n\t[PROMPT]{prompt!r}\n\t[GENERATED]{generated_text!r}\n\t[EXPECTED]{expected_output[i]!r}"  # noqa: E501
-        )
diff --git a/examples/offline_inference_fakehpu.py b/examples/offline_inference_fakehpu.py
deleted file mode 100644
index 972d84b60b318..0000000000000
--- a/examples/offline_inference_fakehpu.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import os
-
-from vllm import LLM, SamplingParams
-
-if os.environ.get('VLLM_USE_FAKE_HPU', '0') != '0':
-    from vllm.utils import migrate_to_cpu
-    migrate_to_cpu()
-
-# Sample prompts.
-prompts = [
-    "Berlin is the capital city of ",
-    "Louvre is located in the city of ",
-    "Barack Obama was the 44th president of ",
-    "Warsaw is the capital city of ",
-    "Gniezno is a city in ",
-    "San Francisco is located in the state of ",
-    "Llanfairpwllgwyngyll is located in country of ",
-]
-ref_answers = [
-    "Germany", "Paris", "United States", "Poland", "Poland", "California",
-    "Wales"
-]
-# Create a sampling params object.
-sampling_params = SamplingParams(temperature=0, n=1, use_beam_search=False)
-
-# Create an LLM.
-llm = LLM(model="facebook/opt-125m", max_model_len=32, max_num_seqs=4)
-# Generate texts from the prompts. The output is a list of RequestOutput objects
-# that contain the prompt, generated text, and other information.
-outputs = llm.generate(prompts, sampling_params)
-# Print the outputs.
-for output, answer in zip(outputs, ref_answers):
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-    assert answer in generated_text, (
-        f"The generated text does not contain the correct answer: {answer}")
-print('PASSED')
diff --git a/tests/lora/test_llama_hpu.py b/tests/lora/test_llama_hpu.py
deleted file mode 100644
index dfd551f2ca043..0000000000000
--- a/tests/lora/test_llama_hpu.py
+++ /dev/null
@@ -1,100 +0,0 @@
-from multiprocessing import Process
-from typing import List
-
-from conftest import cleanup
-
-import vllm
-from vllm.lora.request import LoRARequest
-
-MODEL_PATH = "meta-llama/Llama-2-7b-hf"
-
-
-def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
-    prompts = [
-        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
-        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",  # noqa: E501
-        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_95 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a low tone mora with a gloss of /˩okiru/ [òkìɽɯ́]? [/user] [assistant]",  # noqa: E501
-        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. [/user] [assistant]",  # noqa: E501
-        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? [/user] [assistant]",  # noqa: E501
-        "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]"  # noqa: E501
-    ]
-    sampling_params = vllm.SamplingParams(temperature=0,
-                                          max_tokens=256,
-                                          stop=["[/assistant]"])
-    outputs = llm.generate(
-        prompts,
-        sampling_params,
-        lora_request=LoRARequest(str(lora_id), lora_id, lora_path)
-        if lora_id else None)
-    # Print the outputs.
-    generated_texts: List[str] = []
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        generated_texts.append(generated_text)
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-    return generated_texts
-
-
-def _test_llama_lora(sql_lora_files, tp_size):
-    llm = vllm.LLM(MODEL_PATH,
-                   enable_lora=True,
-                   max_num_seqs=16,
-                   max_loras=4,
-                   dtype='float32',
-                   tensor_parallel_size=tp_size)
-
-    expected_no_lora_output = [
-        "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_75 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_76 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_77 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_78 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user]",  # noqa: E501
-        " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? ",  # noqa: E501
-        "\n\n answer: 1\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_96 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_97 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one mora for a high tone mora with a gloss of /˧kot/ [kòt]? [/user] [assistant]\n\n answer: 2\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_98 (one_mora VARCHAR, gloss VARCHAR, accented_mora VARCHAR)\n\n question: What is the one m",  # noqa: E501
-        " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE candidate (people_id VARCHAR, unsure_rate INTEGER); CREATE TABLE people (sex VARCHAR, people_id VARCHAR)\n\n question: which gender got the highest average uncertain ratio. ",  # noqa: E501
-        " Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_60 (pick INTEGER, former_wnba_team VARCHAR)\n\n question: What pick was a player that previously played for the Minnesota Lynx? ",  # noqa: E501
-        "\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_28138035_4 (womens_doubles VARCHAR, mens_singles VARCHAR)\n\n question: Name the women's doubles for werner schlager [/user] [assistant]\n\n [user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE",  # noqa: E501
-    ]
-    expected_lora_output = [
-        " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ",  # noqa: E501
-        " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ",  # noqa: E501
-        " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ",  # noqa: E501
-        " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ",  # noqa: E501
-        " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ",  # noqa: E501
-        " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' "  # noqa: E501
-    ]
-
-    print("lora adapter created")
-    assert do_sample(llm, sql_lora_files, lora_id=0) == expected_no_lora_output
-
-    print("lora 1")
-    assert do_sample(llm, sql_lora_files, lora_id=1) == expected_lora_output
-
-    print("no lora")
-    assert do_sample(llm, sql_lora_files, lora_id=0) == expected_no_lora_output
-
-    print("lora 2")
-    assert do_sample(llm, sql_lora_files, lora_id=2) == expected_lora_output
-
-    print("removing lora")
-    cleanup()
-
-
-def test_llama_lora_1x(sql_lora_files):
-    p = Process(target=_test_llama_lora, args=(sql_lora_files, 1))
-    p.start()
-    p.join()
-    assert p.exitcode == 0
-
-
-def test_llama_lora_2x(sql_lora_files):
-    # Work-around to resolve stalling issue in multi-card scenario
-    p = Process(target=_test_llama_lora, args=(sql_lora_files, 2))
-    p.start()
-    p.join()
-    assert p.exitcode == 0
-
-
-def test_llama_lora_4x(sql_lora_files):
-    # Work-around to resolve stalling issue in multi-card scenario
-    p = Process(target=_test_llama_lora, args=(sql_lora_files, 4))
-    p.start()
-    p.join()
-    assert p.exitcode == 0
diff --git a/tests/lora/test_lora_hpu.py b/tests/lora/test_lora_hpu.py
deleted file mode 100644
index a59cfe875ef9c..0000000000000
--- a/tests/lora/test_lora_hpu.py
+++ /dev/null
@@ -1,260 +0,0 @@
-import pytest
-import torch
-from vllm_hpu_extension.ops import LoraMask
-
-from vllm.hpu.punica_hpu import GaudiPunicaWrapper
-
-from .utils import DummyLoRAManager
-
-TENSOR_SIZES = [128, 1024, 2048, 4096, 8192, 11008, 11008 // 2, 11008 // 4]
-QKV_TENSOR_SIZES = [
-    (8192, 1024, 1024),
-    (8192 // 8, 1024 // 8, 1024 // 8),
-    (4096, 4096, 4096),
-    (4096 // 2, 4096 // 2, 4096 // 2),
-]
-BATCH_SIZES = [8, 32, 256]
-RANKS = [8]
-DTYPES = [torch.bfloat16]
-TOLERANCES = {
-    torch.float16: (5e-3, 5e-3),
-    torch.bfloat16: (3e-2, 2e-2),
-}
-
-
-def createLoraMask(indices, batch_size, seq_len, max_loras, max_lora_rank,
-                   lora_dtype):
-    indices = indices.view(-1, 1)
-    mask = torch.arange(max_loras * max_lora_rank, device=indices.device)
-    mask = mask.view(1, -1)
-    mask = ((mask >= ((indices) * max_lora_rank)) *
-            (mask < ((indices + 1) * max_lora_rank))).to(dtype=lora_dtype)
-    mask = mask.view(batch_size, 1,
-                     -1).expand(batch_size, seq_len,
-                                -1).reshape(batch_size * seq_len, -1)
-    return mask
-
-
-@pytest.mark.parametrize("m", TENSOR_SIZES)
-@pytest.mark.parametrize("n", TENSOR_SIZES)
-@pytest.mark.parametrize("k", BATCH_SIZES)
-@pytest.mark.parametrize("rank", RANKS)
-@pytest.mark.parametrize("dtype", DTYPES)
-def test_apply_lora(m, n, k, rank, dtype) -> None:
-    manager = DummyLoRAManager()
-
-    module_name = "module"
-    weight = torch.rand([m, n], device="hpu", dtype=dtype)
-
-    manager.init_random_lora(module_name, weight, rank=rank)
-    lora = manager.get_module_lora(module_name)
-
-    input = torch.rand(k, n, device="hpu", dtype=dtype)
-    expected = input @ lora.lora_a @ lora.lora_b * lora.scaling
-
-    lora_a_stack = torch.zeros(8,
-                               1,
-                               lora.lora_a.shape[1],
-                               lora.lora_a.shape[0],
-                               device="hpu",
-                               dtype=dtype)
-    lora_b_stack = torch.zeros(8,
-                               1,
-                               lora.lora_b.shape[1],
-                               lora.lora_b.shape[0],
-                               device="hpu",
-                               dtype=dtype)
-    for i in range(lora_a_stack.shape[0]):
-        lora_a_stack[i][0] = lora.lora_a.T
-        lora_b_stack[i][0] = (lora.lora_b * lora.scaling).T
-
-    output = torch.zeros(k, m, device="hpu", dtype=dtype)
-    indices = torch.randint(0,
-                            lora_a_stack.shape[0], (len(input), ),
-                            device="hpu")
-    mask = createLoraMask(indices, k, 1, 8, rank, dtype)
-    LoraMask.setLoraMask(mask)
-    punica_wrapper = GaudiPunicaWrapper(4096, max_batches=256, device="hpu")
-
-    punica_wrapper.add_lora(output, input, lora_a_stack, lora_b_stack, 1.0)
-
-    rtol, atol = TOLERANCES[dtype]
-    assert torch.allclose(expected, output, rtol=rtol, atol=atol)
-
-    output[:] = 0
-    indices = torch.full((len(input), ), -1, device="hpu")
-    mask = createLoraMask(indices, k, 1, 8, rank, dtype)
-    LoraMask.setLoraMask(mask)
-
-    punica_wrapper.add_lora(output, input, lora_a_stack, lora_b_stack, 1.0)
-    assert torch.allclose(torch.zeros_like(output), output)
-
-    manager.reset_lora()
-
-
-@pytest.mark.parametrize("m", TENSOR_SIZES)
-@pytest.mark.parametrize("n", TENSOR_SIZES)
-@pytest.mark.parametrize("k", BATCH_SIZES)
-@pytest.mark.parametrize("rank", RANKS)
-@pytest.mark.parametrize("dtype", DTYPES)
-def test_apply_lora_packed_2slice(m, n, k, rank, dtype) -> None:
-    if m % 2 != 0:
-        pytest.skip("m must be divisible by 2")
-    if m // 2 not in TENSOR_SIZES:
-        pytest.skip("m//2 must be in TENSOR_SIZES")
-
-    manager = DummyLoRAManager()
-
-    module_name = "module"
-    weight = torch.rand([m // 2, n], device="hpu", dtype=dtype)
-
-    manager.init_random_lora(module_name + "1", weight, rank=rank)
-    lora_1 = manager.get_module_lora(module_name + "1")
-    manager.init_random_lora(module_name + "2", weight, rank=rank)
-    lora_2 = manager.get_module_lora(module_name + "2")
-
-    input = torch.rand(k, n, device="hpu", dtype=dtype)
-    expected = torch.cat([
-        input @ lora_1.lora_a @ lora_1.lora_b * lora_1.scaling,
-        input @ lora_2.lora_a @ lora_2.lora_b * lora_2.scaling
-    ],
-                         dim=1)
-
-    lora_a_stacks = [
-        torch.zeros(8,
-                    1,
-                    lora_1.lora_a.shape[1],
-                    lora_1.lora_a.shape[0],
-                    device="hpu",
-                    dtype=dtype) for i in range(2)
-    ]
-    lora_b_stacks = [
-        torch.zeros(8,
-                    1,
-                    lora_1.lora_b.shape[1],
-                    lora_1.lora_b.shape[0],
-                    device="hpu",
-                    dtype=dtype) for i in range(2)
-    ]
-    for i in range(lora_a_stacks[0].shape[0]):
-        lora_a_stacks[0][i][0] = lora_1.lora_a.T
-        lora_b_stacks[0][i][0] = (lora_1.lora_b * lora_1.scaling).T
-        lora_a_stacks[1][i][0] = lora_2.lora_a.T
-        lora_b_stacks[1][i][0] = (lora_2.lora_b * lora_2.scaling).T
-
-    output = torch.zeros(k, m, device="hpu", dtype=dtype)
-    indices = torch.randint(0,
-                            lora_a_stacks[0].shape[0], (len(input), ),
-                            device="hpu")
-    mask = createLoraMask(indices, k, 1, 8, rank, dtype)
-    LoraMask.setLoraMask(mask)
-
-    punica_wrapper = GaudiPunicaWrapper(4096, max_batches=256, device="hpu")
-    punica_wrapper.add_lora_packed_nslice(output, input, lora_a_stacks,
-                                          lora_b_stacks, 1.0, (m // 2, m // 2))
-
-    rtol, atol = TOLERANCES[dtype]
-    assert torch.allclose(expected, output, rtol=rtol, atol=atol)
-
-    output[:] = 0
-    indices = torch.full((len(input), ), -1, device="hpu")
-    mask = createLoraMask(indices, k, 1, 8, rank, dtype)
-    LoraMask.setLoraMask(mask)
-
-    punica_wrapper.add_lora_packed_nslice(output, input, lora_a_stacks,
-                                          lora_b_stacks, 1.0, (m // 2, m // 2))
-    assert torch.allclose(torch.zeros_like(output), output)
-
-    manager.reset_lora()
-
-
-@pytest.mark.parametrize("qkv", QKV_TENSOR_SIZES)
-@pytest.mark.parametrize("n", TENSOR_SIZES)
-@pytest.mark.parametrize("k", BATCH_SIZES)
-@pytest.mark.parametrize("rank", RANKS)
-@pytest.mark.parametrize("dtype", DTYPES)
-def test_apply_lora_packed_3slice(qkv, n, k, rank, dtype) -> None:
-    manager = DummyLoRAManager()
-
-    module_name = "module"
-    weight_q = torch.empty(qkv[0], n, device="hpu", dtype=dtype)
-    weight_kv = torch.empty(qkv[1], n, device="hpu", dtype=dtype)
-
-    manager.init_random_lora(module_name + "q", weight_q, rank=rank)
-    lora_q = manager.get_module_lora(module_name + "q")
-    manager.init_random_lora(module_name + "k", weight_kv, rank=rank)
-    lora_k = manager.get_module_lora(module_name + "k")
-    manager.init_random_lora(module_name + "v", weight_kv, rank=rank)
-    lora_v = manager.get_module_lora(module_name + "v")
-
-    input = torch.rand(k, n, device="hpu", dtype=dtype)
-    expected = torch.cat([
-        input @ lora_q.lora_a @ lora_q.lora_b * lora_q.scaling,
-        input @ lora_k.lora_a @ lora_k.lora_b * lora_k.scaling,
-        input @ lora_v.lora_a @ lora_v.lora_b * lora_v.scaling
-    ],
-                         dim=1)
-
-    lora_a_stacks = [
-        torch.zeros(8,
-                    1,
-                    lora_q.lora_a.shape[1],
-                    lora_q.lora_a.shape[0],
-                    device="hpu",
-                    dtype=dtype)
-    ] + [
-        torch.zeros(8,
-                    1,
-                    lora_k.lora_a.shape[1],
-                    lora_k.lora_a.shape[0],
-                    device="hpu",
-                    dtype=dtype) for i in range(2)
-    ]
-    lora_b_stacks = [
-        torch.zeros(8,
-                    1,
-                    lora_q.lora_b.shape[1],
-                    lora_q.lora_b.shape[0],
-                    device="hpu",
-                    dtype=dtype)
-    ] + [
-        torch.zeros(8,
-                    1,
-                    lora_k.lora_b.shape[1],
-                    lora_k.lora_b.shape[0],
-                    device="hpu",
-                    dtype=dtype) for i in range(2)
-    ]
-    for i in range(lora_a_stacks[0].shape[0]):
-        lora_a_stacks[0][i][0] = lora_q.lora_a.T
-        lora_b_stacks[0][i][0] = (lora_q.lora_b * lora_q.scaling).T
-        lora_a_stacks[1][i][0] = lora_k.lora_a.T
-        lora_b_stacks[1][i][0] = (lora_k.lora_b * lora_k.scaling).T
-        lora_a_stacks[2][i][0] = lora_v.lora_a.T
-        lora_b_stacks[2][i][0] = (lora_v.lora_b * lora_v.scaling).T
-
-    output = torch.zeros(k, sum(qkv), device="hpu", dtype=dtype)
-    indices = torch.randint(0,
-                            lora_a_stacks[0].shape[0], (len(input), ),
-                            device="hpu")
-    mask = createLoraMask(indices, k, 1, 8, rank, dtype)
-    LoraMask.setLoraMask(mask)
-
-    punica_wrapper = GaudiPunicaWrapper(4096, max_batches=256, device="hpu")
-    qkvs = (qkv[0], qkv[1], qkv[2])
-    punica_wrapper.add_lora_packed_nslice(output, input, lora_a_stacks,
-                                          lora_b_stacks, 1.0, qkvs)
-
-    rtol, atol = TOLERANCES[dtype]
-    assert torch.allclose(expected, output, rtol=rtol, atol=atol)
-
-    output[:] = 0
-    indices = torch.full((len(input), ), -1, device="hpu")
-    mask = createLoraMask(indices, k, 1, 8, rank, dtype)
-    LoraMask.setLoraMask(mask)
-    qkvs = (qkv[0], qkv[1], qkv[2])
-    punica_wrapper.add_lora_packed_nslice(output, input, lora_a_stacks,
-                                          lora_b_stacks, 1.0, qkvs)
-    assert torch.allclose(torch.zeros_like(output), output)
-
-    manager.reset_lora()
diff --git a/tests/lora/test_multilora_hpu.py b/tests/lora/test_multilora_hpu.py
deleted file mode 100644
index 64eda037ff059..0000000000000
--- a/tests/lora/test_multilora_hpu.py
+++ /dev/null
@@ -1,130 +0,0 @@
-from multiprocessing import Process
-from typing import List, Optional, Tuple
-
-from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
-from vllm.lora.request import LoRARequest
-
-
-def create_test_prompts(
-        lora_path: str
-) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]:
-    """Create a list of test prompts with their sampling parameters.
-
-    2 requests for base model, 4 requests for the LoRA. We define 2
-    different LoRA adapters (using the same model for demo purposes).
-    """
-    return [
-        ("A robot may not injure a human being",
-         SamplingParams(temperature=0.0,
-                        logprobs=1,
-                        prompt_logprobs=1,
-                        max_tokens=128), None),
-        ("To be or not to be,",
-         SamplingParams(temperature=0.8,
-                        top_k=5,
-                        presence_penalty=0.2,
-                        max_tokens=128), None),
-        (
-            "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
-            SamplingParams(temperature=0.0,
-                           logprobs=1,
-                           prompt_logprobs=1,
-                           max_tokens=128,
-                           stop_token_ids=[32003]),
-            LoRARequest("sql-lora", 1, lora_path)),
-        (
-            "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",  # noqa: E501
-            SamplingParams(temperature=0,
-                           max_tokens=128,
-                           stop_token_ids=[32003]),
-            LoRARequest("sql-lora", 1, lora_path)),
-        (
-            "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",  # noqa: E501
-            SamplingParams(temperature=0.0,
-                           logprobs=1,
-                           prompt_logprobs=1,
-                           max_tokens=128,
-                           stop_token_ids=[32003]),
-            LoRARequest("sql-lora2", 2, lora_path)),
-        (
-            "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",  # noqa: E501
-            SamplingParams(temperature=0,
-                           max_tokens=128,
-                           stop_token_ids=[32003]),
-            LoRARequest("sql-lora", 1, lora_path)),
-    ]
-
-
-def process_requests(engine: LLMEngine,
-                     test_prompts: List[Tuple[str, SamplingParams,
-                                              Optional[LoRARequest]]]):
-    """Continuously process a list of prompts and handle the outputs."""
-    request_id = 0
-    result = {}
-
-    while test_prompts or engine.has_unfinished_requests():
-        if test_prompts:
-            prompt, sampling_params, lora_request = test_prompts.pop(0)
-            engine.add_request(str(request_id),
-                               prompt,
-                               sampling_params,
-                               lora_request=lora_request)
-            request_id += 1
-
-        request_outputs: List[RequestOutput] = engine.step()
-
-        for request_output in request_outputs:
-            if request_output.finished:
-                result[
-                    request_output.request_id] = request_output.outputs[0].text
-    return result
-
-
-expected_output = [
-    " or, through inaction, allow a human being to come to harm.\nA robot must obey the orders given it by human beings except where such orders would conflict with the First Law.\nA robot must protect its own existence as long as such protection does not conflict with the First or Second Law.\nThe Three Laws of Robotics were created by Isaac Asimov in 1942. They are the foundation of robotics and artificial intelligence.\nThe Three Laws of Robotics are the foundation of robotics and artificial intelligence. They were created by Isaac Asimov in 194",  # noqa: E501
-    " that is the question.\nIt is the most famous line in all of Shakespeare's plays and one of the most famous in English literature. The question is not whether or not to be, but rather the question of who to be.\nIn Hamlet's case, the question is whether or not to be a good person. He is torn between the goodness of his father and the evil of his mother.\nThe question is a difficult one, and one that has been asked many times before. In Hamlet's case, the question is whether or not to be a good person, and he is torn between the",  # noqa: E501
-    " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ",  # noqa: E501
-    " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ",  # noqa: E501
-    " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ",  # noqa: E501
-    " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' "  # noqa: E501
-]
-
-
-def _test_llama_multilora(sql_lora_files, tp_size):
-    """Main function that sets up and runs the prompt processing."""
-    engine_args = EngineArgs(model="meta-llama/Llama-2-7b-hf",
-                             enable_lora=True,
-                             max_loras=2,
-                             max_lora_rank=8,
-                             max_num_seqs=256,
-                             dtype='float32',
-                             tensor_parallel_size=tp_size)
-    engine = LLMEngine.from_engine_args(engine_args)
-    test_prompts = create_test_prompts(sql_lora_files)
-    results = process_requests(engine, test_prompts)
-    generated_texts = [results[key] for key in sorted(results)]
-    assert generated_texts == expected_output
-
-
-def test_llama_multilora_1x(sql_lora_files):
-    # Work-around to resolve stalling issue in multi-card scenario
-    p = Process(target=_test_llama_multilora, args=(sql_lora_files, 1))
-    p.start()
-    p.join()
-    assert p.exitcode == 0
-
-
-def test_llama_multilora_2x(sql_lora_files):
-    # Work-around to resolve stalling issue in multi-card scenario
-    p = Process(target=_test_llama_multilora, args=(sql_lora_files, 2))
-    p.start()
-    p.join()
-    assert p.exitcode == 0
-
-
-def test_llama_multilora_4x(sql_lora_files):
-    # Work-around to resolve stalling issue in multi-card scenario
-    p = Process(target=_test_llama_multilora, args=(sql_lora_files, 4))
-    p.start()
-    p.join()
-    assert p.exitcode == 0