Add max-batch-size to benchmark_throughput.py #122

Open · wants to merge 1 commit into main
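This PR renames the --hf-max-batch-size flag to --max-batch-size and lets the vLLM backend honor it as well: when the flag is set, run_vllm() forwards it to the LLM constructor as max_num_seqs; when it is unset, vLLM keeps its own default. The HF backend uses the same flag exactly as before.

Below is a minimal standalone sketch of the conditional-kwargs pattern the diff uses; the helper name build_llm and the model string are illustrative only and not part of the change:

    from typing import Optional

    from vllm import LLM


    def build_llm(model: str, max_batch_size: Optional[int]) -> LLM:
        # Only forward max_num_seqs when --max-batch-size was actually given,
        # so vLLM's built-in default applies otherwise.
        extra_kwargs = ({'max_num_seqs': max_batch_size}
                        if max_batch_size is not None else {})
        return LLM(model=model, **extra_kwargs)


    # e.g. build_llm("facebook/opt-125m", max_batch_size=64)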
22 changes: 11 additions & 11 deletions benchmarks/benchmark_throughput.py
@@ -79,12 +79,14 @@ def run_vllm(
     enable_prefix_caching: bool,
     enable_chunked_prefill: bool,
     max_num_batched_tokens: int,
+    max_batch_size: Optional[int],
     distributed_executor_backend: Optional[str],
     gpu_memory_utilization: float = 0.9,
     worker_use_ray: bool = False,
     download_dir: Optional[str] = None,
 ) -> float:
     from vllm import LLM, SamplingParams
+    max_num_seqs = { 'max_num_seqs' : max_batch_size } if max_batch_size is not None else {}
     llm = LLM(
         model=model,
         tokenizer=tokenizer,
@@ -106,6 +108,7 @@ def run_vllm(
         enable_chunked_prefill=enable_chunked_prefill,
         max_num_batched_tokens=max_num_batched_tokens,
         distributed_executor_backend=distributed_executor_backend,
+        **max_num_seqs,
     )

     # Add the requests to the engine.
@@ -231,13 +234,13 @@ def main(args: argparse.Namespace):
             args.max_model_len, args.enforce_eager, args.kv_cache_dtype,
             args.quantization_param_path, args.device,
             args.enable_prefix_caching, args.enable_chunked_prefill,
-            args.max_num_batched_tokens, args.distributed_executor_backend,
+            args.max_num_batched_tokens, args.max_batch_size, args.distributed_executor_backend,
             args.gpu_memory_utilization, args.worker_use_ray,
             args.download_dir)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
-                              args.use_beam_search, args.hf_max_batch_size,
+                              args.use_beam_search, args.max_batch_size,
                               args.trust_remote_code)
     elif args.backend == "mii":
         elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
@@ -298,10 +301,10 @@ def main(args: argparse.Namespace):
                         default=1000,
                         help="Number of prompts to process.")
     parser.add_argument("--seed", type=int, default=0)
-    parser.add_argument("--hf-max-batch-size",
+    parser.add_argument("--max-batch-size",
                         type=int,
                         default=None,
-                        help="Maximum batch size for HF backend.")
+                        help="Maximum batch size for vLLM or HF.")
     parser.add_argument('--trust-remote-code',
                         action='store_true',
                         help='trust remote code from huggingface')
@@ -404,11 +407,8 @@ def main(args: argparse.Namespace):
     else:
         assert args.input_len is None

-    if args.backend == "vllm":
-        if args.hf_max_batch_size is not None:
-            raise ValueError("HF max batch size is only for HF backend.")
-    elif args.backend == "hf":
-        if args.hf_max_batch_size is None:
+    if args.backend == "hf":
+        if args.max_batch_size is None:
             raise ValueError("HF max batch size is required for HF backend.")
         if args.quantization is not None:
             raise ValueError("Quantization is only for vLLM backend.")
@@ -421,8 +421,8 @@ def main(args: argparse.Namespace):
             raise ValueError("Beam search is not supported for MII backend.")
         if args.quantization is not None:
             raise ValueError("Quantization is only for vLLM backend.")
-        if args.hf_max_batch_size is not None:
-            raise ValueError("HF max batch size is only for HF backend.")
+        if args.max_batch_size is not None:
+            raise ValueError("Max batch size is only for HF or vLLM backends")
         if args.tokenizer != args.model:
             raise ValueError("Tokenizer must be the same as the model for MII "
                              "backend.")