Skip to content

Commit

Permalink
feat: add support for h100s & bump backlog limit (#74)
Browse files Browse the repository at this point in the history
  • Loading branch information
sambarnes authored Mar 8, 2024
1 parent 01b0e37 commit db6ed5d
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 15 deletions.
27 changes: 14 additions & 13 deletions modal/runner/containers/vllm_unified.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,19 @@

def _make_container(
name: str,
num_gpus: int = 1,
memory: int = 0,
gpu: modal.gpu = modal.gpu.A100(count=1, memory=40),
concurrent_inputs: int = 8,
max_containers: int = None,
):
"""Helper function to create a container with the given GPU configuration."""

assert memory in {0, 40, 80}, "Modal only supports 40 & 80 GB"
gpu = modal.gpu.A100(count=num_gpus, memory=memory)
gpu_type = GPUType.A100_80G if memory == 80 else GPUType.A100_40G
num_gpus = gpu.count
if isinstance(gpu, modal.gpu.A100):
gpu_type = GPUType.A100_80G if gpu.memory == 80 else GPUType.A100_40G
elif isinstance(gpu, modal.gpu.H100):
gpu_type = GPUType.H100_80G
else:
raise ValueError(f"Unknown GPU type: {gpu}")

class _VllmContainer(VllmEngine):
def __init__(
Expand Down Expand Up @@ -94,35 +97,33 @@ def __init__(

# Concrete vLLM container classes, one per hosted model.
# Each call to _make_container supplies the GPU configuration the model
# needs; `concurrent_inputs` is the per-container input concurrency and
# `max_containers` (where given) presumably caps scale-out — confirm
# against _make_container's implementation.
#
# NOTE(review): the original diff text retained the pre-change keyword
# lines (num_gpus=..., memory=...) alongside the new gpu=... lines; with
# the new _make_container signature those would raise TypeError
# (unexpected keyword argument). Only the gpu= form is kept here.
VllmContainer_MicrosoftPhi2 = _make_container(
    name="VllmContainer_MicrosoftPhi2",
    gpu=modal.gpu.A100(count=1, memory=40),
    concurrent_inputs=120,
)
VllmContainer_IntelNeuralChat7B = _make_container(
    name="VllmContainer_IntelNeuralChat7B",
    gpu=modal.gpu.A100(count=1, memory=40),
    concurrent_inputs=100,
)
# name= passed by keyword for consistency with the sibling definitions
# (the original passed it positionally).
VllmContainer_JebCarterPsyfighter13B = _make_container(
    name="VllmContainer_JebCarterPsyfighter13B",
    gpu=modal.gpu.A100(count=1, memory=40),
    concurrent_inputs=32,
)
VllmContainer_KoboldAIPsyfighter2 = _make_container(
    name="VllmContainer_KoboldAIPsyfighter2",
    gpu=modal.gpu.A100(count=1, memory=40),
    concurrent_inputs=32,
)
# Larger (multi-GPU) models: 2x A100-80GB, low concurrency, capped
# container count.
VllmContainer_NeverSleepNoromaidMixtral8x7B = _make_container(
    name="VllmContainer_NeverSleepNoromaidMixtral8x7B",
    gpu=modal.gpu.A100(count=2, memory=80),
    concurrent_inputs=4,
    max_containers=3,
)
VllmContainer_JohnDurbinBagel34B = _make_container(
    name="VllmContainer_JohnDurbinBagel34B",
    gpu=modal.gpu.A100(count=2, memory=80),
    concurrent_inputs=4,
    max_containers=1,
)
3 changes: 2 additions & 1 deletion modal/runner/engines/vllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,8 @@ async def generate(self, payload: CompletionPayload, params):
extra={
"model": self.engine_args.model,
"tokens": resp.usage.completion_tokens,
"tps": resp.usage.completion_tokens / t_start_inference,
"tps": resp.usage.completion_tokens
/ (time.time() - t_start_inference),
"duration": resp.usage.duration,
"cost": resp.usage.duration * self.cost_per_second,
},
Expand Down
2 changes: 1 addition & 1 deletion modal/runner/shared/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@
api_key_id="RUNNER_API_KEY",
)

# Maximum request backlog the runner tolerates before (presumably)
# shedding/deferring work — confirm usage at the call site.
# Raised from 30 to 100 in this change; the original diff text left the
# old dead assignment (= 30) in place above the new one, so only the
# effective value is kept here.
BACKLOG_THRESHOLD = 100

stub = Stub(config.name)
4 changes: 4 additions & 0 deletions modal/shared/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@

# Per-second cost rate for each supported GPU type, used by
# GPUType.cost_per_second (presumably USD per GPU-second — confirm
# against the provider's current pricing).
_COST_PER_SECOND_A100_40G: Final[float] = 0.001036
_COST_PER_SECOND_A100_80G: Final[float] = 0.001553
_COST_PER_SECOND_H100_80G: Final[float] = 0.002125


class GPUType(Enum):
A100_40G = "A100_40G"
A100_80G = "A100_80G"
H100_80G = "H100_80G"

@property
def cost_per_second(self) -> float:
Expand All @@ -19,6 +21,8 @@ def cost_per_second(self) -> float:
return _COST_PER_SECOND_A100_40G
case GPUType.A100_80G:
return _COST_PER_SECOND_A100_80G
case GPUType.H100_80G:
return _COST_PER_SECOND_H100_80G


# https://github.com/vllm-project/vllm/blob/320a622ec4d098f2da5d097930f4031517e7327b/vllm/sampling_params.py#L7-L52
Expand Down

0 comments on commit db6ed5d

Please sign in to comment.