Skip to content

Commit

Permalink
Comment about 'maxtokens' in parser.rs (#24)
Browse files Browse the repository at this point in the history
Co-authored-by: Jeffrey Kegler <[email protected]>
  • Loading branch information
v-jkegler and Jeffrey Kegler authored Oct 4, 2024
1 parent 48bcba4 commit edde0d5
Showing 1 changed file with 33 additions and 0 deletions.
33 changes: 33 additions & 0 deletions parser/src/earley/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,12 @@ struct Scratch {

items: Vec<Item>,
item_props: Vec<ItemProps>,

// Is this Earley table in "definitive" mode?
// 'definitive' is set when the LLM is adding new tokens --
// when new tokens are being 'defined'. The opposite of
// definitive mode is "speculative" mode, which is used
// for computing the token mask.
definitive: bool,
}

Expand All @@ -179,6 +185,12 @@ struct RowInfo {
lexeme: Lexeme,
token_idx_start: usize,
token_idx_stop: usize,

// A hash whose key is a lexeme index, and whose value is
// the maximum number of tokens allowed for that lexeme.
// A missing hash key means the number of tokens is
// unlimited; unlimited can also be indicated explicitly
// by setting the value of the hash entry to usize::MAX.
max_tokens: Arc<HashMap<LexemeIdx, usize>>,
}

Expand Down Expand Up @@ -1273,6 +1285,8 @@ impl ParserState {
if let Some(lx) = sym_data.lexeme {
allowed_lexemes.set(lx.as_usize(), true);
if self.scratch.definitive {
// In definitive mode, set 'maxtokens' for
// the postdot symbol.
max_tokens.push((lx, sym_data.props.max_tokens));
}
}
Expand Down Expand Up @@ -1335,10 +1349,21 @@ impl ParserState {
}

if self.scratch.definitive {

// Clear all row info data after the
// working row.
if self.row_infos.len() > idx {
self.row_infos.drain(idx..);
}

// We collect and prune the information in
// 'max_tokens' into a new hash, 'max_tokens_map',
// which will replace 'max_tokens'.
let mut max_tokens_map = HashMap::default();

// If a lexeme is allowed more than once, set its value
// in the 'max_tokens_map' to the maximum of the
// max_tokens values specified.
for (lx, mx) in max_tokens {
if let Some(ex) = max_tokens_map.get(&lx) {
if *ex < mx {
Expand All @@ -1348,6 +1373,13 @@ impl ParserState {
max_tokens_map.insert(lx, mx);
}
}

// Some entries in 'max_tokens_map' will
// explicitly indicate that the number of tokens is
// unlimited with a value of usize::MAX. Since this
// is the default for non-existent hash entries, we
// save space by removing entries whose value is
// usize::MAX.
let mut to_remove = vec![];
for (lx, mx) in max_tokens_map.iter() {
if *mx == usize::MAX {
Expand All @@ -1357,6 +1389,7 @@ impl ParserState {
for lx in to_remove {
max_tokens_map.remove(&lx);
}

self.row_infos.push(RowInfo {
lexeme: Lexeme::bogus(),
token_idx_start: self.token_idx,
Expand Down

0 comments on commit edde0d5

Please sign in to comment.