add no_forcing and allow_initial_skip options on Grammar
mmoskal committed Jul 24, 2024
1 parent 4291533 commit d8d4fb3
Showing 6 changed files with 35 additions and 6 deletions.
parser/Cargo.toml (1 addition & 1 deletion)

@@ -4,7 +4,7 @@ version = "0.1.4"
 edition = "2021"
 
 [dependencies]
-toktrie = { git = "https://github.com/microsoft/toktrie", rev = "158139aff4aa0ec0c049a2870492b000bbc375b9" }
+toktrie = { git = "https://github.com/microsoft/toktrie", rev = "ae506a08efc9d41928155d7e447011965e172aa6" }
 derivre = { git = "https://github.com/microsoft/derivre", rev = "e83d8fb3cd92d2c6dd0437e98bfa9b64d8d8284b" }
 serde = { version = "1.0.192", features = ["derive"] }
 serde_json = "1.0.108"
parser/src/api.rs (12 additions & 0 deletions)

@@ -34,6 +34,18 @@ pub struct GrammarWithLexer {
     /// When set, the regexps can be referenced by their id (position in this list).
     #[serde(default)]
     pub rx_nodes: Vec<RegexNode>,
+
+    /// Normally, when a sequence of bytes is forced by the grammar, it is
+    /// tokenized canonically and forced as tokens.
+    /// With `no_forcing`, we let the model decide on tokenization.
+    /// This generally reduces both quality and speed, so it should not be
+    /// used outside of testing.
+    #[serde(default)]
+    pub no_forcing: bool,
+
+    /// If set, the grammar will allow skip_rx as the first lexeme.
+    #[serde(default)]
+    pub allow_initial_skip: bool,
 }
 
 #[derive(Serialize, Deserialize)]
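Since `GrammarWithLexer` derives the serde traits, both flags land directly in the serialized grammar format. A minimal sketch of such a payload, assuming JSON transport; only `nodes`, `rx_nodes`, and the two new flags are taken from this commit, and everything else (including whether an empty `nodes` list would validate) is hypothetical:

```rust
use serde_json::json; // serde_json is already a dependency of the parser crate

fn main() {
    // Hypothetical GrammarWithLexer payload; only the field names visible in
    // this commit (`nodes`, `rx_nodes`, `no_forcing`, `allow_initial_skip`)
    // are used here. Both flags default to false via #[serde(default)], so
    // they can be omitted entirely when not needed.
    let grammar = json!({
        "nodes": [],                // grammar nodes elided for brevity
        "rx_nodes": [],
        "no_forcing": true,         // let the model decide on tokenization (testing only)
        "allow_initial_skip": true  // permit skip_rx (e.g. whitespace) as the first lexeme
    });
    println!("{grammar}");
}
```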
parser/src/earley/from_guidance.rs (6 additions & 0 deletions)

@@ -71,6 +71,12 @@ fn grammar_from_json(input: GrammarWithLexer) -> Result<(LexerSpec, Grammar)> {
         _ => RegexAst::NoMatch,
     };
     let mut lexer_spec = LexerSpec::new(builder, skip)?;
+    if input.no_forcing {
+        lexer_spec.no_forcing = true;
+    }
+    if input.allow_initial_skip {
+        lexer_spec.allow_initial_skip = true;
+    }
     let mut grm = Grammar::new();
     let node_map = input
         .nodes
parser/src/earley/lexerspec.rs (4 additions & 0 deletions)

@@ -9,6 +9,8 @@ use super::regexvec::RegexVec;
 pub struct LexerSpec {
     pub lexemes: Vec<LexemeSpec>,
     pub regex_builder: RegexBuilder,
+    pub no_forcing: bool,
+    pub allow_initial_skip: bool,
 }
 
 #[derive(Clone)]
@@ -69,6 +71,8 @@ impl LexerSpec {
         let mut r = LexerSpec {
             lexemes: Vec::new(),
             regex_builder,
+            no_forcing: false,
+            allow_initial_skip: false,
         };
         let skip = r.add_lexeme_spec(LexemeSpec {
             name: "SKIP".to_string(),
parser/src/earley/parser.rs (6 additions & 4 deletions)

@@ -361,10 +361,12 @@ impl Parser {
         );
         assert!(r.lexer_stack.len() == 1);
         // set the correct initial lexer state
-        // the initial state, shall not allow the SKIP lexeme
-        r.rows[0]
-            .allowed_lexemes
-            .set(LexemeIdx::SKIP.as_usize(), false);
+        if !r.grammar.lexer_spec().allow_initial_skip {
+            // disallow the initial SKIP lexeme unless explicitly allowed
+            r.rows[0]
+                .allowed_lexemes
+                .set(LexemeIdx::SKIP.as_usize(), false);
+        }
         r.lexer_stack[0].lexer_state = r.lexer.start_state(&r.rows[0].allowed_lexemes, None);
         r.assert_definitive();
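To make the gating concrete, here is a self-contained sketch under illustrative names (the real parser operates on a bit-set of lexeme indices, not strings): SKIP is filtered out of row 0's allowed lexemes unless `allow_initial_skip` is set.

```rust
// Simplified model of the initial-state gating above; not the actual
// Parser type, just the decision it encodes.
fn initial_allowed_lexemes(all: &[&str], allow_initial_skip: bool) -> Vec<String> {
    all.iter()
        .filter(|name| allow_initial_skip || **name != "SKIP")
        .map(|s| s.to_string())
        .collect()
}

fn main() {
    let lexemes = ["SKIP", "IDENT", "NUMBER"];
    // Default: the first lexeme must be a real lexeme, not skipped input.
    assert_eq!(initial_allowed_lexemes(&lexemes, false), ["IDENT", "NUMBER"]);
    // With allow_initial_skip: input may start with skip_rx (e.g. whitespace).
    assert_eq!(initial_allowed_lexemes(&lexemes, true), ["SKIP", "IDENT", "NUMBER"]);
}
```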
parser/src/tokenparser.rs (6 additions & 1 deletion)

@@ -438,7 +438,9 @@ impl TokenParser {
         let new_forced = grm_bytes[self.llm_bytes.len()..].to_vec();
         let mut token_prefix = Vec::new();
 
-        if new_forced.len() > 0 || backtrack > 0 {
+        let do_force = new_forced.len() > 0 && !self.parser.grammar().lexer_spec().no_forcing;
+
+        if do_force || backtrack > 0 {
             let mut grm_tokens = self.token_env.tokenize_bytes(&new_forced);
             infoln!(
                 self,
@@ -476,6 +478,9 @@
             } else {
                 infoln!(self, "no fixed tokens");
             }
+        } else if new_forced.len() > 0 {
+            token_prefix.extend_from_slice(&new_forced);
+            infoln!(self, "no-forced bytes:{:?}", new_forced);
+        }
 
         if token_prefix.is_empty() {
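Distilled from this change, the routing of grammar-forced bytes looks as follows; `ForcedBytes` and `route_forced` are illustrative stand-ins, not the crate's actual API.

```rust
/// Where a run of grammar-forced bytes goes (illustrative only).
enum ForcedBytes {
    /// Default path: tokenize the bytes canonically and force them as tokens.
    ForceTokens(Vec<u8>),
    /// With no_forcing: hand the bytes to sampling as a token prefix, so the
    /// model decides on tokenization itself.
    TokenPrefix(Vec<u8>),
    /// Nothing forced and nothing to backtrack.
    None,
}

fn route_forced(new_forced: Vec<u8>, backtrack: u32, no_forcing: bool) -> ForcedBytes {
    // Mirrors the diff: forcing applies only when there are forced bytes
    // and the lexer spec does not opt out of it.
    let do_force = !new_forced.is_empty() && !no_forcing;
    if do_force || backtrack > 0 {
        ForcedBytes::ForceTokens(new_forced)
    } else if !new_forced.is_empty() {
        ForcedBytes::TokenPrefix(new_forced)
    } else {
        ForcedBytes::None
    }
}

fn main() {
    // With no_forcing set and no backtracking, forced bytes become a prefix.
    match route_forced(b"true".to_vec(), 0, true) {
        ForcedBytes::TokenPrefix(bytes) => assert_eq!(bytes, b"true"),
        _ => unreachable!(),
    }
}
```

Note that with `no_forcing` the bytes still constrain sampling, as a token prefix, but the model is free to split them across tokens however it likes, which is exactly the behavior the new doc comment warns is slower and lower quality outside of testing.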
