
Commit

Add llg_tokenize_bytes_prefix function for tokenization with prefix support
mmoskal committed Nov 29, 2024
1 parent a3efce0 commit f432d4e
Showing 2 changed files with 35 additions and 0 deletions.
12 changes: 12 additions & 0 deletions parser/llguidance.h
@@ -315,6 +315,18 @@ size_t llg_tokenize_bytes(const struct LlgTokenizer *tok,
                          uint32_t *output_tokens,
                          size_t output_tokens_len);

/**
 * Tokenize the given bytes and return the tokens.
 * Special tokens will be tokenized if they follow a 0xFF byte prefix.
 * Always returns the number of tokens that would be written to output_tokens
 * if output_tokens_len was large enough.
 */
size_t llg_tokenize_bytes_prefix(const struct LlgTokenizer *tok,
                                 const uint8_t *bytes,
                                 size_t bytes_len,
                                 uint32_t *output_tokens,
                                 size_t output_tokens_len);

/**
 * Return a string representation of the tokens, useful for debugging.
 * The output is null-terminated.
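
A minimal caller-side sketch (not part of this commit) of the size-query pattern the doc comment describes: call llg_tokenize_bytes_prefix once with an empty output buffer to learn the token count, then allocate and call again. The helper name tokenize_all and the dummy buffer are illustrative only.

#include <stdint.h>
#include <stdlib.h>
#include "llguidance.h"

/* Tokenize bytes_len bytes and return a malloc'd array of *n_out tokens. */
static uint32_t *tokenize_all(const struct LlgTokenizer *tok,
                              const uint8_t *bytes, size_t bytes_len,
                              size_t *n_out) {
    uint32_t dummy;
    /* First call with output_tokens_len == 0: nothing is copied, but the
       full token count is returned. A non-NULL dummy pointer is passed to
       stay on the safe side of the FFI contract. */
    size_t n = llg_tokenize_bytes_prefix(tok, bytes, bytes_len, &dummy, 0);
    uint32_t *toks = malloc(n * sizeof *toks);
    /* Second call with a buffer that is large enough copies all tokens. */
    llg_tokenize_bytes_prefix(tok, bytes, bytes_len, toks, n);
    *n_out = n;
    return toks;
}
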
23 changes: 23 additions & 0 deletions parser/src/ffi.rs
@@ -657,6 +657,29 @@ pub extern "C" fn llg_tokenize_bytes(
    n_toks
}

/// Tokenize the given bytes and return the tokens.
/// Special tokens will be tokenized if they follow a 0xFF byte prefix.
/// Always returns the number of tokens that would be written to output_tokens
/// if output_tokens_len was large enough.
#[no_mangle]
pub extern "C" fn llg_tokenize_bytes_prefix(
    tok: &LlgTokenizer,
    bytes: *const u8,
    bytes_len: usize,
    output_tokens: *mut u32,
    output_tokens_len: usize,
) -> usize {
    let tokens = tok
        .token_env
        .tokenize_bytes_prefix(unsafe { std::slice::from_raw_parts(bytes, bytes_len) });
    let n_toks = tokens.len();
    let to_copy = std::cmp::min(n_toks, output_tokens_len);
    unsafe {
        std::ptr::copy_nonoverlapping(tokens.as_ptr(), output_tokens, to_copy);
    }
    n_toks
}

/// Return a string representation of the tokens, useful for debugging.
/// The output is null-terminated.
/// Returns the number of bytes that would be written to output if output_len was large enough.
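
The 0xFF prefix mentioned in the doc comment lets callers splice special tokens into an otherwise plain byte stream. Below is a sketch of what such an input could look like, assuming (as the doc comment suggests) that the marker is a single 0xFF byte placed immediately before the special token's text; the token name <|endoftext|> and the helper name are illustrative only and depend on the tokenizer in use.

#include <stdint.h>
#include <string.h>
#include "llguidance.h"

/* Build "Hello world" followed by a special token introduced by the 0xFF
   marker byte, then tokenize it. The special-token name "<|endoftext|>" is
   hypothetical; use one that the tokenizer actually defines. */
static size_t tokenize_with_special(const struct LlgTokenizer *tok,
                                    uint32_t *out, size_t out_len) {
    const char *text = "Hello world";
    const char *special = "<|endoftext|>";
    size_t text_len = strlen(text);
    size_t special_len = strlen(special);
    uint8_t buf[64];                       /* 11 + 1 + 13 bytes fit easily */
    memcpy(buf, text, text_len);
    buf[text_len] = 0xFF;                  /* special-token marker */
    memcpy(buf + text_len + 1, special, special_len);
    /* Returns the full token count; at most out_len tokens are written. */
    return llg_tokenize_bytes_prefix(tok, buf, text_len + 1 + special_len,
                                     out, out_len);
}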
