Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(loadtest): add loadtest tools #906

Merged
merged 5 commits into from
Nov 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,10 +1,3 @@
loadtest:
ifdef TABBY_API_HOST
k6 run tests/*.loadtest.js
else
$(error TABBY_API_HOST is undefined)
endif

fix:
cargo machete --fix || true
cargo +nightly fmt
Expand Down
6 changes: 6 additions & 0 deletions python/tabby-loadtest/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Run loadtest with tabby on modal GPUs

Steps:
1. Adjust `TABBY_API_HOST` in `run.sh` to match your modal deployment URL.
2. Add the models you want to benchmark at the end of `run.sh`.
3. Run `run.sh`; its output will be appended to `record.csv`.
48 changes: 48 additions & 0 deletions python/tabby-loadtest/loadtest.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import http from "k6/http";
import { check, sleep } from "k6";
import { textSummary } from "https://jslib.k6.io/k6-utils/1.4.0/index.js";

// Number of concurrent virtual users, supplied via the PARALLELISM env var.
// Pass radix 10 explicitly — never call parseInt without one.
const PARALLELISM = parseInt(__ENV.PARALLELISM, 10);

// k6 scenario: ramp up to PARALLELISM VUs within 1s, then hold that load
// for 30s. The run fails if any threshold below is violated.
export const options = {
  stages: [
    { duration: "1s", target: PARALLELISM },  // ramp-up
    { duration: "30s", target: PARALLELISM }, // steady-state measurement window
  ],
  // Below thresholds are tested against TabbyML/StarCoder-1B served by NVIDIA T4 GPU.
  thresholds: {
    http_req_failed: ['rate<0.001'],          // <0.1% request failures allowed
    http_req_duration: ["med<1800", "avg<1800", "p(90)<2500", "p(95)<3000"], // ms
  },
};

// One VU iteration: request a code completion for a fixed Python prefix,
// check the HTTP status, then idle briefly before the next iteration.
export default () => {
  const body = JSON.stringify({
    language: "python",
    segments: {
      prefix: "def binarySearch(arr, left, right, x):\n mid = (left +",
    },
  });
  const params = {
    headers: { "Content-Type": "application/json" },
  };
  const response = http.post(`${__ENV.TABBY_API_HOST}/v1/completions`, body, params);
  check(response, { success: (r) => r.status === 200 });
  sleep(0.5);
};

// Reduce the k6 run to a single CSV line written to metrics.txt:
// QPS, then avg/med/p90/p95 latency in seconds, all rounded to 2 decimals.
export function handleSummary(data) {
  const stats = data.metrics.http_req_duration.values;
  const avgSec = stats.avg / 1000;
  const medSec = stats.med / 1000;
  const p90Sec = stats["p(90)"] / 1000;
  const p95Sec = stats["p(95)"] / 1000;
  // Throughput derived from Little's-law style estimate: VUs / avg latency.
  const qps = PARALLELISM / avgSec;

  const line = [qps, avgSec, medSec, p90Sec, p95Sec].map(rounded).join(",");
  return { "metrics.txt": line };
}

// Round a number to two decimal places (e.g. 1.238 -> 1.24).
function rounded(x) {
  const hundredths = Math.round(x * 100);
  return hundredths / 100;
}
59 changes: 59 additions & 0 deletions python/tabby-loadtest/record.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
GPU,Model,Parallelism,QPS,Latency (Avg),Latency (Med),Latency (p90),Latency (p95),Passed
T4,TabbyML/StarCoder-1B,7,4.14,1.69,"1.58","2","2.05","SUCCESS"
T4,TabbyML/StarCoder-1B,10,4.85,2.06,"1.98","2.49","2.63","FAILED"
T4,TabbyML/StarCoder-1B,8,4.22,1.9,"1.85","2.29","2.38","FAILED"
A10G,TabbyML/StarCoder-1B,17,12.01,1.42,"1.35","1.66","1.9","SUCCESS"
A10G,TabbyML/StarCoder-1B,25,14.61,1.71,"1.68","1.97","2.07","SUCCESS"
A10G,TabbyML/StarCoder-1B,29,15.94,1.82,"1.79","2.11","2.17","FAILED"
A10G,TabbyML/StarCoder-1B,27,14.89,1.81,"1.79","2.03","2.37","FAILED"
A10G,TabbyML/StarCoder-1B,26,14.79,1.76,"1.72","2.08","2.22","SUCCESS"
A100,TabbyML/StarCoder-1B,33,13.16,2.51,"1.57","2.58","11.52","FAILED"
A100,TabbyML/StarCoder-1B,17,12.94,1.31,"1.29","1.46","1.53","SUCCESS"
A100,TabbyML/StarCoder-1B,25,16.98,1.47,"1.43","1.69","1.86","SUCCESS"
A100,TabbyML/StarCoder-1B,29,10.46,2.77,"2.8","3.11","3.18","FAILED"
A100,TabbyML/StarCoder-1B,27,13.58,1.99,"1.96","2.37","2.42","FAILED"
A100,TabbyML/StarCoder-1B,26,16.52,1.57,"1.47","1.96","2.19","SUCCESS"
T4,TabbyML/DeepseekCoder-1.3B,7,3.82,1.83,"1.86","1.94","1.96","FAILED"
T4,TabbyML/DeepseekCoder-1.3B,4,3.05,1.31,"1.32","1.45","1.49","SUCCESS"
T4,TabbyML/DeepseekCoder-1.3B,5,3.18,1.57,"1.54","1.92","1.97","SUCCESS"
T4,TabbyML/DeepseekCoder-1.3B,6,3.33,1.8,"1.84","1.92","1.95","FAILED"
A10G,TabbyML/DeepseekCoder-1.3B,17,11.66,1.46,1.46,1.65,1.77,SUCCESS
A10G,TabbyML/DeepseekCoder-1.3B,21,6.83,3.07,1.63,14.32,14.56,FAILED
A10G,TabbyML/DeepseekCoder-1.3B,19,12.63,1.5,1.5,1.69,1.78,SUCCESS
A10G,TabbyML/DeepseekCoder-1.3B,20,12.88,1.55,1.53,1.79,1.92,SUCCESS
A100,TabbyML/DeepseekCoder-1.3B,33,20.78,1.59,1.55,1.84,1.9,SUCCESS
A100,TabbyML/DeepseekCoder-1.3B,49,18.2,2.69,2.62,3.15,3.38,FAILED
A100,TabbyML/DeepseekCoder-1.3B,41,16.44,2.49,2.36,2.9,3.49,FAILED
A100,TabbyML/DeepseekCoder-1.3B,37,21.15,1.75,1.67,2.07,2.18,SUCCESS
A100,TabbyML/DeepseekCoder-1.3B,39,14.78,2.64,2.6,3.02,3.13,FAILED
A100,TabbyML/DeepseekCoder-1.3B,38,20.88,1.82,1.76,2.08,2.19,FAILED
T4,TabbyML/StarCoder-3B,7,1.89,3.7,3.68,3.89,3.96,FAILED
T4,TabbyML/StarCoder-3B,4,1.53,2.62,2.62,2.75,2.78,FAILED
T4,TabbyML/StarCoder-3B,2,0.86,2.32,2.34,2.43,2.47,FAILED
A10G,TabbyML/StarCoder-3B,17,5.42,3.14,3.16,3.47,3.84,FAILED
A10G,TabbyML/StarCoder-3B,9,4.31,2.09,2.05,2.39,2.69,FAILED
A10G,TabbyML/StarCoder-3B,5,2.85,1.75,1.73,1.95,2.28,SUCCESS
A10G,TabbyML/StarCoder-3B,7,3.61,1.94,1.94,2.18,2.29,FAILED
A10G,TabbyML/StarCoder-3B,6,3.17,1.89,1.91,2.04,2.06,FAILED
A100,TabbyML/StarCoder-3B,33,8.12,4.07,4.12,4.53,4.61,FAILED
A100,TabbyML/StarCoder-3B,17,7.9,2.15,2.1,2.34,2.81,FAILED
A100,TabbyML/StarCoder-3B,9,4.47,2.01,1.99,2.19,2.26,FAILED
A100,TabbyML/StarCoder-3B,5,3.21,1.56,1.56,1.68,1.72,SUCCESS
A100,TabbyML/StarCoder-3B,7,4.29,1.63,1.61,1.78,1.8,SUCCESS
A100,TabbyML/StarCoder-3B,8,4.63,1.73,1.73,1.92,2.03,SUCCESS
A10G,TabbyML/DeepseekCoder-6.7B,5,1.3,3.85,3.83,4.25,4.31,FAILED
A10G,TabbyML/DeepseekCoder-6.7B,3,1.14,2.63,2.6,2.81,2.86,FAILED
A10G,TabbyML/DeepseekCoder-6.7B,2,0.83,2.4,2.4,2.48,2.5,FAILED
A100,TabbyML/DeepseekCoder-6.7B,9,3.14,2.87,2.85,3.08,3.13,FAILED
A100,TabbyML/DeepseekCoder-6.7B,5,2.08,2.4,2.46,2.58,2.63,FAILED
A100,TabbyML/DeepseekCoder-6.7B,3,1.32,2.27,2.3,2.54,2.69,FAILED
A100,TabbyML/DeepseekCoder-6.7B,2,1.2,1.67,1.66,1.84,1.93,SUCCESS
A100,TabbyML/CodeLlama-7B,9,3.69,2.44,2.45,2.59,2.63,FAILED
A100,TabbyML/CodeLlama-7B,5,2.14,2.34,2.31,2.61,3.26,FAILED
A100,TabbyML/CodeLlama-7B,3,1.52,1.97,2.02,2.3,2.37,FAILED
A100,TabbyML/CodeLlama-7B,5,2.37,2.11,2.13,2.24,2.26,FAILED
A100,TabbyML/CodeLlama-7B,3,1.59,1.89,1.95,2.04,2.07,FAILED
A100,TabbyML/CodeLlama-7B,2,1.45,1.38,1.39,1.54,1.56,SUCCESS
A100,TabbyML/CodeLlama-13B,5,1.21,4.14,4.15,4.38,4.5,FAILED
A100,TabbyML/CodeLlama-13B,3,0.89,3.36,3.4,3.71,3.73,FAILED
A100,TabbyML/CodeLlama-13B,2,0.73,2.75,2.73,2.92,3.06,FAILED
107 changes: 107 additions & 0 deletions python/tabby-loadtest/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
#!/bin/bash

# Append one CSV row to record.csv: GPU,model,parallelism,<metrics from $1>.
# Expects GPU_CONFIG, MODEL_ID and PARALLELISM to be exported by the caller.
# Quoted to prevent word splitting / globbing of the expanded values.
record() {
    echo "$GPU_CONFIG,$MODEL_ID,$PARALLELISM,$1" >> record.csv
}

# Stop the deployed modal loadtest app, if one is currently running.
cleanup() {
    MODAL_APP_ID=$(modal app list | grep tabby-server-loadtest | grep deployed | awk '{print $2}')

    # Only stop when an app id was actually found. The original used `-z`,
    # which inverted the check: it ran `modal app stop` with an empty id and
    # never stopped a live deployment.
    if [ -n "$MODAL_APP_ID" ]; then
        modal app stop "$MODAL_APP_ID"
    fi
}

# Deploy a tabby server on modal with the given GPU/model/parallelism, wait
# until it responds on /v1/health, run the k6 loadtest, record the metrics,
# then tear the deployment down.
#   $1 = GPU config (T4 / A10G / A100)
#   $2 = model id   (e.g. TabbyML/StarCoder-1B)
#   $3 = number of concurrent requests
# Returns k6's exit status (0 = all thresholds met), or 1 on warmup timeout.
loadtest() {
    export GPU_CONFIG=$1
    export MODEL_ID=$2
    export PARALLELISM=$3

    >&2 modal deploy server.py

    # NOTE: the original captured `$!` here, but `modal deploy` runs in the
    # foreground, so `$!` held a stale (or empty) PID and was never used.
    export TABBY_API_HOST=https://wsxiaoys--tabby-server-loadtest-app.modal.run

    # Wait for the server to come up: probe every 10s, give up after 5 tries.
    >&2 echo "Waiting for warmup..."

    n=0
    while [[ "$(curl -s -o /dev/null -w '%{http_code}' "$TABBY_API_HOST/v1/health")" != "200" ]]; do
        if [ "$n" -ge 5 ]; then
            # error after 5 retries.
            return 1
        fi

        sleep 10
        n=$((n+1))
    done

    >&2 echo "Start load testing..."

    >&2 k6 run loadtest.js
    SUCCESS=$?
    METRICS=$(cat metrics.txt)
    rm metrics.txt

    if [ "$SUCCESS" -ne 0 ]; then
        record "$METRICS,FAILED"
    else
        record "$METRICS,SUCCESS"
    fi

    cleanup

    return "$SUCCESS"
}

# Binary-search the largest value in [min, max] for which `$command value`
# succeeds, assuming the predicate is monotonic (true up to some threshold,
# then false). The result is left in the global variable `min`.
function dichotomic_search {
    min=$1
    max=$2
    command=$3

    while (( min < max )); do
        # Midpoint rounded UP so the loop always makes progress when
        # min and max are adjacent.
        current=$(( (min + max + 1) / 2 ))

        if $command $current; then
            min=$current
        else
            max=$((current - 1))
        fi
    done
}

# Per-GPU wrappers around loadtest: run one test at parallelism $1 against
# the exported MODEL_ID. Their exit status makes them usable directly as the
# predicate for dichotomic_search (success == k6 thresholds met).
test_t4() {
    loadtest T4 $MODEL_ID $1
}

test_a10g() {
    loadtest A10G $MODEL_ID $1
}

test_a100() {
    loadtest A100 $MODEL_ID $1
}

# Binary-search the maximum sustainable parallelism for a 1B-3B model on
# each GPU tier; the search bounds reflect each tier's expected capacity.
test_1b3b_model() {
    export MODEL_ID="$1"

    dichotomic_search 1 12 test_t4
    dichotomic_search 1 32 test_a10g
    dichotomic_search 1 64 test_a100
}

# 7B models are only expected to fit usefully on an A100.
test_7b_model() {
    export MODEL_ID="$1"

    dichotomic_search 1 8 test_a100
}

# 13B models are likewise only benchmarked on an A100.
test_13b_model() {
    export MODEL_ID="$1"

    dichotomic_search 1 8 test_a100
}

# Entry point: uncomment/add the model families to benchmark on this run.
# test_7b_model TabbyML/CodeLlama-7B
test_13b_model TabbyML/CodeLlama-13B
97 changes: 97 additions & 0 deletions python/tabby-loadtest/server.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
"""Usage:
modal deploy server.py
"""

import os
from modal import Image, Stub, asgi_app

# Benchmark knobs, overridable via the environment at deploy time (run.sh
# exports these before calling `modal deploy server.py`).
GPU_CONFIG = os.environ.get("GPU_CONFIG", "T4")  # modal GPU type (T4/A10G/A100)
IMAGE_NAME = "tabbyml/tabby:0.6.0"  # base docker image providing the tabby binary
MODEL_ID = os.environ.get("MODEL_ID", "TabbyML/StarCoder-1B")  # model to serve
PARALLELISM = os.environ.get("PARALLELISM", "4")  # concurrent requests per container


def download_model():
    """Pre-download the model weights during the image build.

    Runs inside the image build container; MODEL_ID is injected into the
    container environment before this function executes.

    Raises:
        subprocess.CalledProcessError: if the tabby download command fails,
            so a broken download fails the image build instead of surfacing
            later at serving time.
    """
    import os
    import subprocess

    model_id = os.environ.get("MODEL_ID")
    subprocess.run(
        [
            "/opt/tabby/bin/tabby",
            "download",
            "--model",
            model_id,
        ],
        # check=True so the build aborts loudly on a failed download; the
        # original silently ignored the subprocess exit status.
        check=True,
    )


# Build the serving image: start from the official tabby release image,
# bake the model weights in at build time, and install the ASGI proxy
# used by app() below.
image = (
    Image.from_registry(
        IMAGE_NAME,
        add_python="3.11",  # modal requires a python interpreter in the image
    )
    .env({"MODEL_ID": MODEL_ID})  # must precede run_function: download_model reads it
    .dockerfile_commands("ENTRYPOINT []")  # neutralize the base image's entrypoint
    .run_function(download_model)  # cache model weights inside the image layer
    .pip_install("asgi-proxy-lib")
    .env({"PARALLELISM": PARALLELISM})
)

stub = Stub("tabby-server-loadtest", image=image)


@stub.function(
    gpu=GPU_CONFIG,
    # Let modal feed up to PARALLELISM requests to one container concurrently,
    # matching tabby's own --parallelism setting below.
    allow_concurrent_inputs=int(PARALLELISM),
    container_idle_timeout=120,
    timeout=360,
)
@asgi_app()
def app():
    """Launch a tabby server in this container and proxy ASGI traffic to it.

    Starts `tabby serve` as a subprocess on port 8000, blocks until the
    server accepts TCP connections, then returns an ASGI app that forwards
    every request to it.

    Raises:
        RuntimeError: if the tabby subprocess exits before becoming ready.
    """
    import os
    import socket
    import subprocess
    import time
    from asgi_proxy import asgi_proxy

    model_id = os.environ.get("MODEL_ID")
    parallelism = os.environ.get("PARALLELISM")

    env = os.environ.copy()
    env["TABBY_DISABLE_USAGE_COLLECTION"] = "1"  # opt out of telemetry for benchmarks

    # Start the tabby HTTP server; it keeps running for the container's lifetime.
    launcher = subprocess.Popen(
        [
            "/opt/tabby/bin/tabby",
            "serve",
            "--model",
            model_id,
            "--port",
            "8000",
            "--device",
            "cuda",
            "--parallelism",
            parallelism,
        ],
        env=env
    )

    # Poll until webserver at 127.0.0.1:8000 accepts connections before running inputs.
    def tabby_ready():
        try:
            socket.create_connection(("127.0.0.1", 8000), timeout=1).close()
            return True
        except (socket.timeout, ConnectionRefusedError):
            # Check if launcher webserving process has exited.
            # If so, a connection can never be made.
            retcode = launcher.poll()
            if retcode is not None:
                raise RuntimeError(f"launcher exited unexpectedly with code {retcode}")
            return False

    while not tabby_ready():
        time.sleep(1.0)

    print("Tabby server ready!")
    # Hand modal an ASGI app that forwards every request to the local server.
    return asgi_proxy("http://localhost:8000")
30 changes: 0 additions & 30 deletions tests/default.loadtest.js

This file was deleted.