From db6ed5d1dbf494186aee4a6de7690ee1fcb99ed2 Mon Sep 17 00:00:00 2001
From: sam <1211977+sambarnes@users.noreply.github.com>
Date: Fri, 8 Mar 2024 07:48:54 -0700
Subject: [PATCH] feat: add support for h100s & bump backlog limit (#74)

---
 modal/runner/containers/vllm_unified.py | 27 +++++++++++++------------
 modal/runner/engines/vllm.py            |  3 ++-
 modal/runner/shared/common.py           |  2 +-
 modal/shared/protocol.py                |  4 ++++
 4 files changed, 21 insertions(+), 15 deletions(-)

diff --git a/modal/runner/containers/vllm_unified.py b/modal/runner/containers/vllm_unified.py
index 6f55952..30a0b56 100644
--- a/modal/runner/containers/vllm_unified.py
+++ b/modal/runner/containers/vllm_unified.py
@@ -17,16 +17,19 @@
 
 def _make_container(
     name: str,
-    num_gpus: int = 1,
-    memory: int = 0,
+    gpu: modal.gpu = modal.gpu.A100(count=1, memory=40),
     concurrent_inputs: int = 8,
     max_containers: int = None,
 ):
     """Helper function to create a container with the given GPU configuration."""
 
-    assert memory in {0, 40, 80}, "Modal only supports 40 & 80 GB"
-    gpu = modal.gpu.A100(count=num_gpus, memory=memory)
-    gpu_type = GPUType.A100_80G if memory == 80 else GPUType.A100_40G
+    num_gpus = gpu.count
+    if isinstance(gpu, modal.gpu.A100):
+        gpu_type = GPUType.A100_80G if gpu.memory == 80 else GPUType.A100_40G
+    elif isinstance(gpu, modal.gpu.H100):
+        gpu_type = GPUType.H100_80G
+    else:
+        raise ValueError(f"Unknown GPU type: {gpu}")
 
     class _VllmContainer(VllmEngine):
         def __init__(
@@ -94,35 +97,33 @@ def __init__(
 
 VllmContainer_MicrosoftPhi2 = _make_container(
     name="VllmContainer_MicrosoftPhi2",
-    num_gpus=1,
+    gpu=modal.gpu.A100(count=1, memory=40),
     concurrent_inputs=120,
 )
 VllmContainer_IntelNeuralChat7B = _make_container(
     name="VllmContainer_IntelNeuralChat7B",
-    num_gpus=1,
+    gpu=modal.gpu.A100(count=1, memory=40),
     concurrent_inputs=100,
 )
 VllmContainer_JebCarterPsyfighter13B = _make_container(
     "VllmContainer_JebCarterPsyfighter13B",
-    num_gpus=1,
+    gpu=modal.gpu.A100(count=1, memory=40),
     concurrent_inputs=32,
 )
 VllmContainer_KoboldAIPsyfighter2 = _make_container(
     name="VllmContainer_KoboldAIPsyfighter2",
-    num_gpus=1,
+    gpu=modal.gpu.A100(count=1, memory=40),
     concurrent_inputs=32,
 )
 VllmContainer_NeverSleepNoromaidMixtral8x7B = _make_container(
     name="VllmContainer_NeverSleepNoromaidMixtral8x7B",
-    num_gpus=2,
-    memory=80,
+    gpu=modal.gpu.A100(count=2, memory=80),
     concurrent_inputs=4,
     max_containers=3,
 )
 VllmContainer_JohnDurbinBagel34B = _make_container(
     name="VllmContainer_JohnDurbinBagel34B",
-    num_gpus=2,
-    memory=80,
+    gpu=modal.gpu.A100(count=2, memory=80),
     concurrent_inputs=4,
     max_containers=1,
 )
diff --git a/modal/runner/engines/vllm.py b/modal/runner/engines/vllm.py
index d88664c..4776ee2 100644
--- a/modal/runner/engines/vllm.py
+++ b/modal/runner/engines/vllm.py
@@ -160,7 +160,8 @@ async def generate(self, payload: CompletionPayload, params):
                 extra={
                     "model": self.engine_args.model,
                     "tokens": resp.usage.completion_tokens,
-                    "tps": resp.usage.completion_tokens / t_start_inference,
+                    "tps": resp.usage.completion_tokens
+                    / (time.time() - t_start_inference),
                     "duration": resp.usage.duration,
                     "cost": resp.usage.duration * self.cost_per_second,
                 },
diff --git a/modal/runner/shared/common.py b/modal/runner/shared/common.py
index f761cf6..95bf4b4 100644
--- a/modal/runner/shared/common.py
+++ b/modal/runner/shared/common.py
@@ -7,6 +7,6 @@
     api_key_id="RUNNER_API_KEY",
 )
 
-BACKLOG_THRESHOLD = 30
+BACKLOG_THRESHOLD = 100
 
 stub = Stub(config.name)
diff --git a/modal/shared/protocol.py b/modal/shared/protocol.py
index ab801a9..603c07e 100644
--- a/modal/shared/protocol.py
+++ b/modal/shared/protocol.py
@@ -6,11 +6,13 @@
 
 _COST_PER_SECOND_A100_40G: Final[float] = 0.001036
 _COST_PER_SECOND_A100_80G: Final[float] = 0.001553
+_COST_PER_SECOND_H100_80G: Final[float] = 0.002125
 
 
 class GPUType(Enum):
     A100_40G = "A100_40G"
     A100_80G = "A100_80G"
+    H100_80G = "H100_80G"
 
     @property
     def cost_per_second(self) -> float:
@@ -19,6 +21,8 @@ def cost_per_second(self) -> float:
                 return _COST_PER_SECOND_A100_40G
             case GPUType.A100_80G:
                 return _COST_PER_SECOND_A100_80G
+            case GPUType.H100_80G:
+                return _COST_PER_SECOND_H100_80G
 
 
 # https://github.com/vllm-project/vllm/blob/320a622ec4d098f2da5d097930f4031517e7327b/vllm/sampling_params.py#L7-L52