Skip to content

Commit

Permalink
server : Add option to return token pieces in /tokenize endpoint (#9108)
Browse files Browse the repository at this point in the history
* server : added with_pieces functionality to /tokenize endpoint

* server : Add tokenize with pieces tests to server.feature

* Handle case if tokenizer splits along utf8 continuation bytes

* Add example of token splitting

* Remove trailing ws

* Fix trailing ws

* Maybe fix ci

* maybe this fix windows ci?

---------

Co-authored-by: Xuan Son Nguyen <[email protected]>
  • Loading branch information
mathijshenquet and ngxson authored Sep 12, 2024
1 parent e6b7801 commit 7820364
Show file tree
Hide file tree
Showing 6 changed files with 139 additions and 6 deletions.
1 change: 1 addition & 0 deletions .github/workflows/server.yml
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,7 @@ jobs:
if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
run: |
cd examples/server/tests
$env:PYTHONIOENCODING = ":replace"
behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
- name: Slow tests
Expand Down
39 changes: 37 additions & 2 deletions examples/server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -407,9 +407,44 @@ Notice that each `probs` is an array of length `n_probs`.

*Options:*

`content`: Set the text to tokenize.
`content`: (Required) The text to tokenize.

`add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
`add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`

`with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs. Default: `false`

**Response:**

Returns a JSON object with a `tokens` field containing the tokenization result. The `tokens` array contains either just token IDs or objects with `id` and `piece` fields, depending on the `with_pieces` parameter. The piece field is a string if the piece is valid unicode or a list of bytes otherwise.


If `with_pieces` is `false`:
```json
{
"tokens": [123, 456, 789]
}
```

If `with_pieces` is `true`:
```json
{
"tokens": [
{"id": 123, "piece": "Hello"},
{"id": 456, "piece": " world"},
{"id": 789, "piece": "!"}
]
}
```

With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k
```json
{
"tokens": [
{"id": 198, "piece": [195]}, // hex C3
{"id": 164, "piece": [161]} // hex A1
]
}
```

### POST `/detokenize`: Convert tokens to text

Expand Down
33 changes: 30 additions & 3 deletions examples/server/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3013,12 +3013,39 @@ int main(int argc, char ** argv) {
const auto handle_tokenize = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) {
const json body = json::parse(req.body);

std::vector<llama_token> tokens;
json tokens_response = json::array();
if (body.count("content") != 0) {
const bool add_special = json_value(body, "add_special", false);
tokens = ctx_server.tokenize(body.at("content"), add_special);
const bool with_pieces = json_value(body, "with_pieces", false);
std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special);

if (with_pieces) {
for (const auto& token : tokens) {
std::string piece = llama_token_to_piece(ctx_server.ctx, token);
json piece_json;

// Check if the piece is valid UTF-8
if (is_valid_utf8(piece)) {
piece_json = piece;
} else {
// If not valid UTF-8, store as array of byte values
piece_json = json::array();
for (unsigned char c : piece) {
piece_json.push_back(static_cast<int>(c));
}
}

tokens_response.push_back({
{"id", token},
{"piece", piece_json}
});
}
} else {
tokens_response = tokens;
}
}
const json data = format_tokenizer_response(tokens);

const json data = format_tokenizer_response(tokens_response);
res_ok(res, data);
};

Expand Down
8 changes: 8 additions & 0 deletions examples/server/tests/features/server.feature
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,14 @@ Feature: llama.cpp server
Given first token is removed
Then tokens can be detokenized

Scenario: Tokenize with pieces
When tokenizing with pieces:
"""
What is the capital of Germany?
"""
Then tokens are given with pieces

Scenario: Models available
Given available models
Then 1 models are supported
Expand Down
29 changes: 29 additions & 0 deletions examples/server/tests/features/steps/steps.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import asyncio
import json
import os
Expand Down Expand Up @@ -697,6 +700,32 @@ def step_tokenize_set_add_special(context):
context.tokenize_add_special = True


@step("tokenizing with pieces")
@async_run_until_complete
async def step_tokenize_with_pieces(context):
context.tokenized_text = context_text(context)
async with aiohttp.ClientSession() as session:
tokenize_args = {"content": context.tokenized_text, "with_pieces": True}
if getattr(context, "tokenize_add_special", None) is not None:
tokenize_args["add_special"] = context.tokenize_add_special

async with session.post(
f"{context.base_url}/tokenize", json=tokenize_args
) as response:
assert response.status == 200
tokenize_json = await response.json()
context.tokens_with_pieces = tokenize_json["tokens"]


@step("tokens are given with pieces")
@async_run_until_complete
async def step_tokenize_with_pieces(context):
# Verify that the response contains both token IDs and pieces
assert all(
"id" in token and "piece" in token for token in context.tokens_with_pieces
)


@step('tokenizing')
@async_run_until_complete
async def step_tokenize(context):
Expand Down
35 changes: 34 additions & 1 deletion examples/server/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -616,7 +616,40 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
return res;
}

static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
static bool is_valid_utf8(const std::string & str) {
const unsigned char* bytes = reinterpret_cast<const unsigned char*>(str.data());
const unsigned char* end = bytes + str.length();

while (bytes < end) {
if (*bytes <= 0x7F) {
// 1-byte sequence (0xxxxxxx)
bytes++;
} else if ((*bytes & 0xE0) == 0xC0) {
// 2-byte sequence (110xxxxx 10xxxxxx)
if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80)
return false;
bytes += 2;
} else if ((*bytes & 0xF0) == 0xE0) {
// 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80)
return false;
bytes += 3;
} else if ((*bytes & 0xF8) == 0xF0) {
// 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 ||
(bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80)
return false;
bytes += 4;
} else {
// Invalid UTF-8 lead byte
return false;
}
}

return true;
}

static json format_tokenizer_response(const json & tokens) {
return json {
{"tokens", tokens}
};
Expand Down

0 comments on commit 7820364

Please sign in to comment.