server: bench: minor fixes #10765

Open: wants to merge 2 commits into master
6 changes: 3 additions & 3 deletions examples/server/bench/README.md
@@ -6,10 +6,10 @@ Benchmark is using [k6](https://k6.io/).

SSE is not supported by default in k6; you have to build k6 with the [xk6-sse](https://github.com/phymbert/xk6-sse) extension.

-Example:
+Example (assuming golang >= 1.21 is installed):
```shell
go install go.k6.io/xk6/cmd/xk6@latest
-xk6 build master \
+$GOPATH/bin/xk6 build master \
--with github.com/phymbert/xk6-sse
```

@@ -33,7 +33,7 @@ The server must answer OAI Chat completion requests on `http://localhost:8080/v1`

Example:
```shell
-server --host localhost --port 8080 \
+llama-server --host localhost --port 8080 \
--model ggml-model-q4_0.gguf \
--cont-batching \
--metrics \
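As a quick sanity check before a benchmark run, the endpoint above can be exercised directly. A minimal sketch in Python (assuming the server from the example is running on localhost:8080 and the `requests` package is installed; the prompt and model name are placeholders):

```python
import requests

# Hypothetical smoke test: confirm the server answers an OAI-style chat
# completion request before launching the k6 scenario.
resp = requests.post(
    "http://localhost:8080/v1/chat/completions",
    json={
        "model": "ggml-model-q4_0.gguf",  # placeholder; llama-server serves whatever model it loaded
        "messages": [{"role": "user", "content": "Say hello."}],
        "max_tokens": 16,
    },
    timeout=120,
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```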
30 changes: 21 additions & 9 deletions examples/server/bench/bench.py
@@ -189,12 +189,12 @@ def main(args_in: list[str] | None = None) -> None:
"pp": {
"p95": round(data['metrics']["llamacpp_prompt_processing_second"]["p(95)"], 2),
"avg": round(data['metrics']["llamacpp_prompt_processing_second"]["avg"], 2),
"0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2),
"0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2) if 'prompt_tokens_seconds' in prometheus_metrics else 0,
},
"tg": {
"p95": round(data['metrics']["llamacpp_tokens_second"]["p(95)"], 2),
"avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2),
"0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2),
"0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2) if 'predicted_tokens_seconds' in prometheus_metrics else 0,
},
}
with open("results.github.env", 'a') as github_env:
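The two `if ... in prometheus_metrics else 0` guards above make the report tolerant of a metric that was never scraped (for instance when the server exposed no `/metrics` data during the run). A small helper in the same spirit, shown here only as a sketch and not part of the PR:

```python
from statistics import mean

def metric_mean_or_zero(prometheus_metrics: dict, name: str) -> float:
    # Return the rounded mean of a scraped metric series, or 0 when the
    # series is missing or empty.
    values = prometheus_metrics.get(name)
    if not values:
        return 0
    return round(mean(values), 2)

# e.g. metric_mean_or_zero(prometheus_metrics, 'prompt_tokens_seconds')
```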
@@ -214,11 +214,14 @@ def start_benchmark(args):
k6_args = [
'run', args.scenario,
'--no-color',
+'--no-connection-reuse',
+'--no-vu-connection-reuse',
]
k6_args.extend(['--duration', args.duration])
k6_args.extend(['--iterations', args.n_prompts])
k6_args.extend(['--vus', args.parallel])
k6_args.extend(['--summary-export', 'k6-results.json'])
+k6_args.extend(['--out', 'csv=k6-results.csv'])
args = f"SERVER_BENCH_N_PROMPTS={args.n_prompts} SERVER_BENCH_MAX_PROMPT_TOKENS={args.max_prompt_tokens} SERVER_BENCH_MAX_CONTEXT={args.max_tokens} "
args = args + ' '.join([str(arg) for arg in [k6_path, *k6_args]])
print(f"bench: starting k6 with: {args}")
@@ -231,7 +234,7 @@ def start_server(args):
server_process = start_server_background(args)

attempts = 0
-max_attempts = 20
+max_attempts = 600
if 'GITHUB_ACTIONS' in os.environ:
max_attempts *= 2

@@ -242,7 +245,15 @@
print(f"bench: waiting for server to start ...")
time.sleep(0.5)

print("bench: server started.")
attempts = 0
while not is_server_ready(args.host, args.port):
attempts += 1
if attempts > max_attempts:
assert False, "server not ready"
print(f"bench: waiting for server to be ready ...")
time.sleep(0.5)

print("bench: server started and ready.")
return server_process
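Startup detection is now two-phase: the script first waits until the TCP port is open (`is_server_listening`) and then until `GET /health` returns 200 (`is_server_ready`, added further down in this diff), so the benchmark only starts once the model has actually finished loading. A sketch of what the TCP-level check can look like; the real `is_server_listening` in bench.py is only partially visible in this diff, so treat this as an illustration:

```python
import socket

def is_server_listening(server_fqdn: str, server_port: int) -> bool:
    # TCP-level check only: an open port means the process accepted the
    # connection, not that the model is loaded -- hence the separate /health poll.
    try:
        with socket.create_connection((server_fqdn, int(server_port)), timeout=1.0):
            return True
    except OSError:
        return False
```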


@@ -255,11 +266,6 @@ def start_server_background(args):
'--host', args.host,
'--port', args.port,
]
-model_file = args.model_path_prefix + os.path.sep + args.hf_file
-model_dir = os.path.dirname(model_file)
-if not os.path.exists(model_dir):
-os.makedirs(model_dir)
-server_args.extend(['--model', model_file])
server_args.extend(['--hf-repo', args.hf_repo])
server_args.extend(['--hf-file', args.hf_file])
server_args.extend(['--n-gpu-layers', args.n_gpu_layers])
@@ -303,6 +309,12 @@ def is_server_listening(server_fqdn, server_port):
return _is_server_listening


+def is_server_ready(server_fqdn, server_port):
+url = f"http://{server_fqdn}:{server_port}/health"
+response = requests.get(url)
+return response.status_code == 200


def escape_metric_name(metric_name):
return re.sub('[^A-Z0-9]', '_', metric_name.upper())
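`escape_metric_name` (unchanged by this PR) normalises metric names into environment-variable-style keys before values are appended to `results.github.env`. A usage sketch with illustrative metric names:

```python
import re

def escape_metric_name(metric_name):
    # Same as in bench.py above: uppercase, then replace anything that is
    # not A-Z or 0-9 (colons, dots, dashes, parentheses, ...) with '_'.
    return re.sub('[^A-Z0-9]', '_', metric_name.upper())

print(escape_metric_name("llamacpp:prompt_tokens_seconds"))  # LLAMACPP_PROMPT_TOKENS_SECONDS
print(escape_metric_name("iteration_duration{p(95)}"))       # ITERATION_DURATION_P_95__
```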

18 changes: 15 additions & 3 deletions examples/server/bench/script.js
@@ -56,6 +56,7 @@ const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens')

const llamacpp_tokens_second = new Trend('llamacpp_tokens_second')
const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second')
+const llamacpp_emit_first_token_second = new Trend('llamacpp_emit_first_token_second')

const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter')
const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter')
@@ -89,6 +90,9 @@ export default function () {
],
"model": model,
"stream": true,
"stream_options": {
"include_usage": true, // False to be supported in llama.cpp server
Collaborator (review comment): Not sure what you mean here, but in llama.cpp we ignore include_usage and always include the usage info.

+},
"seed": 42,
"max_tokens": max_tokens,
"stop": ["<|im_end|>"] // This is temporary for phi-2 base (i.e. not instructed) since the server expects that the model always to emit BOS
@@ -105,12 +109,20 @@
client.on('event', function (event) {
if (promptEvalEndTime == null) {
promptEvalEndTime = new Date()
+llamacpp_emit_first_token_second.add((promptEvalEndTime - startTime) / 1.e3)
}

+if (event.data === '[DONE]' || event.data === '') {
+return
+}

let chunk = JSON.parse(event.data)
-let choice = chunk.choices[0]
-if (choice.finish_reason) {
-finish_reason = choice.finish_reason

+if (chunk.choices && chunk.choices.length > 0) {
+let choice = chunk.choices[0]
+if (choice.finish_reason) {
+finish_reason = choice.finish_reason
}
}

if (chunk.usage) {
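The new `llamacpp_emit_first_token_second` trend records the delay between sending the request and receiving the first SSE event, i.e. the time to first token, which approximates prompt-processing latency. A sketch of the same measurement in Python; `stream_events` is a hypothetical generator standing in for the xk6-sse client used in script.js:

```python
import time

def time_to_first_token(stream_events) -> float | None:
    # Mirrors (promptEvalEndTime - startTime) / 1.e3 in script.js:
    # take a timestamp up front, then again on the first streamed event.
    start = time.monotonic()
    first_token_s = None
    for event in stream_events():
        if first_token_s is None:
            first_token_s = time.monotonic() - start
        # ... parse the chunk, update token counters, etc. ...
    return first_token_s
```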