Skip to content

Commit

Permalink
feat: add support for h100s & bump backlog limit (#74)
Browse files Browse the repository at this point in the history
  • Loading branch information
sambarnes authored Mar 8, 2024
1 parent 01b0e37 commit db6ed5d
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 15 deletions.
27 changes: 14 additions & 13 deletions modal/runner/containers/vllm_unified.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,19 @@

def _make_container(
name: str,
num_gpus: int = 1,
memory: int = 0,
gpu: modal.gpu = modal.gpu.A100(count=1, memory=40),
concurrent_inputs: int = 8,
max_containers: int = None,
):
"""Helper function to create a container with the given GPU configuration."""

assert memory in {0, 40, 80}, "Modal only supports 40 & 80 GB"
gpu = modal.gpu.A100(count=num_gpus, memory=memory)
gpu_type = GPUType.A100_80G if memory == 80 else GPUType.A100_40G
num_gpus = gpu.count
if isinstance(gpu, modal.gpu.A100):
gpu_type = GPUType.A100_80G if gpu.memory == 80 else GPUType.A100_40G
elif isinstance(gpu, modal.gpu.H100):
gpu_type = GPUType.H100_80G
else:
raise ValueError(f"Unknown GPU type: {gpu}")

class _VllmContainer(VllmEngine):
def __init__(
Expand Down Expand Up @@ -94,35 +97,33 @@ def __init__(

# Concrete vLLM container classes, one per hosted model.
# Each call to _make_container supplies the GPU configuration the model
# needs; `concurrent_inputs` is the per-container input concurrency and
# `max_containers` (where given) presumably caps scale-out — confirm
# against _make_container's implementation.
#
# NOTE(review): the original diff text retained the pre-change keyword
# lines (num_gpus=..., memory=...) alongside the new gpu=... lines; with
# the new _make_container signature those would raise TypeError
# (unexpected keyword argument). Only the gpu= form is kept here.
VllmContainer_MicrosoftPhi2 = _make_container(
    name="VllmContainer_MicrosoftPhi2",
    gpu=modal.gpu.A100(count=1, memory=40),
    concurrent_inputs=120,
)
VllmContainer_IntelNeuralChat7B = _make_container(
    name="VllmContainer_IntelNeuralChat7B",
    gpu=modal.gpu.A100(count=1, memory=40),
    concurrent_inputs=100,
)
# name= passed by keyword for consistency with the sibling definitions
# (the original passed it positionally).
VllmContainer_JebCarterPsyfighter13B = _make_container(
    name="VllmContainer_JebCarterPsyfighter13B",
    gpu=modal.gpu.A100(count=1, memory=40),
    concurrent_inputs=32,
)
VllmContainer_KoboldAIPsyfighter2 = _make_container(
    name="VllmContainer_KoboldAIPsyfighter2",
    gpu=modal.gpu.A100(count=1, memory=40),
    concurrent_inputs=32,
)
# Larger (multi-GPU) models: 2x A100-80GB, low concurrency, capped
# container count.
VllmContainer_NeverSleepNoromaidMixtral8x7B = _make_container(
    name="VllmContainer_NeverSleepNoromaidMixtral8x7B",
    gpu=modal.gpu.A100(count=2, memory=80),
    concurrent_inputs=4,
    max_containers=3,
)
VllmContainer_JohnDurbinBagel34B = _make_container(
    name="VllmContainer_JohnDurbinBagel34B",
    gpu=modal.gpu.A100(count=2, memory=80),
    concurrent_inputs=4,
    max_containers=1,
)
3 changes: 2 additions & 1 deletion modal/runner/engines/vllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,8 @@ async def generate(self, payload: CompletionPayload, params):
extra={
"model": self.engine_args.model,
"tokens": resp.usage.completion_tokens,
"tps": resp.usage.completion_tokens / t_start_inference,
"tps": resp.usage.completion_tokens
/ (time.time() - t_start_inference),
"duration": resp.usage.duration,
"cost": resp.usage.duration * self.cost_per_second,
},
Expand Down
2 changes: 1 addition & 1 deletion modal/runner/shared/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@
api_key_id="RUNNER_API_KEY",
)

# Maximum request backlog the runner tolerates before (presumably)
# shedding/deferring work — confirm usage at the call site.
# Raised from 30 to 100 in this change; the original diff text left the
# old dead assignment (= 30) in place above the new one, so only the
# effective value is kept here.
BACKLOG_THRESHOLD = 100

stub = Stub(config.name)
4 changes: 4 additions & 0 deletions modal/shared/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@

# Per-second cost rate for each supported GPU type, used by
# GPUType.cost_per_second (presumably USD per GPU-second — confirm
# against the provider's current pricing).
_COST_PER_SECOND_A100_40G: Final[float] = 0.001036
_COST_PER_SECOND_A100_80G: Final[float] = 0.001553
_COST_PER_SECOND_H100_80G: Final[float] = 0.002125


class GPUType(Enum):
A100_40G = "A100_40G"
A100_80G = "A100_80G"
H100_80G = "H100_80G"

@property
def cost_per_second(self) -> float:
Expand All @@ -19,6 +21,8 @@ def cost_per_second(self) -> float:
return _COST_PER_SECOND_A100_40G
case GPUType.A100_80G:
return _COST_PER_SECOND_A100_80G
case GPUType.H100_80G:
return _COST_PER_SECOND_H100_80G


# https://github.com/vllm-project/vllm/blob/320a622ec4d098f2da5d097930f4031517e7327b/vllm/sampling_params.py#L7-L52
Expand Down

0 comments on commit db6ed5d

Please sign in to comment.