add no_forcing and allow_initial_skip options on Grammar
mmoskal committed Jul 24, 2024
1 parent 4291533 commit d8d4fb3
Showing 6 changed files with 35 additions and 6 deletions.
parser/Cargo.toml (1 addition & 1 deletion)

@@ -4,7 +4,7 @@ version = "0.1.4"
 edition = "2021"
 
 [dependencies]
-toktrie = { git = "https://github.com/microsoft/toktrie", rev = "158139aff4aa0ec0c049a2870492b000bbc375b9" }
+toktrie = { git = "https://github.com/microsoft/toktrie", rev = "ae506a08efc9d41928155d7e447011965e172aa6" }
 derivre = { git = "https://github.com/microsoft/derivre", rev = "e83d8fb3cd92d2c6dd0437e98bfa9b64d8d8284b" }
 serde = { version = "1.0.192", features = ["derive"] }
 serde_json = "1.0.108"
parser/src/api.rs (12 additions & 0 deletions)

@@ -34,6 +34,18 @@ pub struct GrammarWithLexer {
     /// When set, the regexps can be referenced by their id (position in this list).
     #[serde(default)]
     pub rx_nodes: Vec<RegexNode>,
+
+    /// Normally, when a sequence of bytes is forced by the grammar, it is
+    /// tokenized canonically and forced as tokens.
+    /// With `no_forcing`, we let the model decide on tokenization.
+    /// This generally reduces both quality and speed, so it should not be
+    /// used outside of testing.
+    #[serde(default)]
+    pub no_forcing: bool,
+
+    /// If set, the grammar will allow skip_rx as the first lexeme.
+    #[serde(default)]
+    pub allow_initial_skip: bool,
 }
 
 #[derive(Serialize, Deserialize)]
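Since `GrammarWithLexer` derives the serde traits, both flags land directly in the serialized grammar format. A minimal sketch of such a payload, assuming JSON transport; only `nodes`, `rx_nodes`, and the two new flags are taken from this commit, and everything else (including whether an empty `nodes` list would validate) is hypothetical:

```rust
use serde_json::json; // serde_json is already a dependency of the parser crate

fn main() {
    // Hypothetical GrammarWithLexer payload; only the field names visible in
    // this commit (`nodes`, `rx_nodes`, `no_forcing`, `allow_initial_skip`)
    // are used here. Both flags default to false via #[serde(default)], so
    // they can be omitted entirely when not needed.
    let grammar = json!({
        "nodes": [],                // grammar nodes elided for brevity
        "rx_nodes": [],
        "no_forcing": true,         // let the model decide on tokenization (testing only)
        "allow_initial_skip": true  // permit skip_rx (e.g. whitespace) as the first lexeme
    });
    println!("{grammar}");
}
```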
parser/src/earley/from_guidance.rs (6 additions & 0 deletions)

@@ -71,6 +71,12 @@ fn grammar_from_json(input: GrammarWithLexer) -> Result<(LexerSpec, Grammar)> {
         _ => RegexAst::NoMatch,
     };
     let mut lexer_spec = LexerSpec::new(builder, skip)?;
+    if input.no_forcing {
+        lexer_spec.no_forcing = true;
+    }
+    if input.allow_initial_skip {
+        lexer_spec.allow_initial_skip = true;
+    }
     let mut grm = Grammar::new();
     let node_map = input
         .nodes
parser/src/earley/lexerspec.rs (4 additions & 0 deletions)

@@ -9,6 +9,8 @@ use super::regexvec::RegexVec;
 pub struct LexerSpec {
     pub lexemes: Vec<LexemeSpec>,
     pub regex_builder: RegexBuilder,
+    pub no_forcing: bool,
+    pub allow_initial_skip: bool,
 }
 
 #[derive(Clone)]
@@ -69,6 +71,8 @@ impl LexerSpec {
         let mut r = LexerSpec {
             lexemes: Vec::new(),
             regex_builder,
+            no_forcing: false,
+            allow_initial_skip: false,
         };
         let skip = r.add_lexeme_spec(LexemeSpec {
             name: "SKIP".to_string(),
parser/src/earley/parser.rs (6 additions & 4 deletions)

@@ -361,10 +361,12 @@ impl Parser {
         );
         assert!(r.lexer_stack.len() == 1);
         // set the correct initial lexer state
-        // the initial state, shall not allow the SKIP lexeme
-        r.rows[0]
-            .allowed_lexemes
-            .set(LexemeIdx::SKIP.as_usize(), false);
+        if !r.grammar.lexer_spec().allow_initial_skip {
+            // disallow the initial SKIP lexeme unless explicitly allowed
+            r.rows[0]
+                .allowed_lexemes
+                .set(LexemeIdx::SKIP.as_usize(), false);
+        }
         r.lexer_stack[0].lexer_state = r.lexer.start_state(&r.rows[0].allowed_lexemes, None);
         r.assert_definitive();
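To make the gating concrete, here is a self-contained sketch under illustrative names (the real parser operates on a bit-set of lexeme indices, not strings): SKIP is filtered out of row 0's allowed lexemes unless `allow_initial_skip` is set.

```rust
// Simplified model of the initial-state gating above; not the actual
// Parser type, just the decision it encodes.
fn initial_allowed_lexemes(all: &[&str], allow_initial_skip: bool) -> Vec<String> {
    all.iter()
        .filter(|name| allow_initial_skip || **name != "SKIP")
        .map(|s| s.to_string())
        .collect()
}

fn main() {
    let lexemes = ["SKIP", "IDENT", "NUMBER"];
    // Default: the first lexeme must be a real lexeme, not skipped input.
    assert_eq!(initial_allowed_lexemes(&lexemes, false), ["IDENT", "NUMBER"]);
    // With allow_initial_skip: input may start with skip_rx (e.g. whitespace).
    assert_eq!(initial_allowed_lexemes(&lexemes, true), ["SKIP", "IDENT", "NUMBER"]);
}
```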
parser/src/tokenparser.rs (6 additions & 1 deletion)

@@ -438,7 +438,9 @@ impl TokenParser {
         let new_forced = grm_bytes[self.llm_bytes.len()..].to_vec();
         let mut token_prefix = Vec::new();
 
-        if new_forced.len() > 0 || backtrack > 0 {
+        let do_force = new_forced.len() > 0 && !self.parser.grammar().lexer_spec().no_forcing;
+
+        if do_force || backtrack > 0 {
             let mut grm_tokens = self.token_env.tokenize_bytes(&new_forced);
             infoln!(
                 self,
@@ -476,6 +478,9 @@
             } else {
                 infoln!(self, "no fixed tokens");
             }
+        } else if new_forced.len() > 0 {
+            token_prefix.extend_from_slice(&new_forced);
+            infoln!(self, "no-forced bytes:{:?}", new_forced);
+        }
 
         if token_prefix.is_empty() {
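Distilled from this change, the routing of grammar-forced bytes looks as follows; `ForcedBytes` and `route_forced` are illustrative stand-ins, not the crate's actual API.

```rust
/// Where a run of grammar-forced bytes goes (illustrative only).
enum ForcedBytes {
    /// Default path: tokenize the bytes canonically and force them as tokens.
    ForceTokens(Vec<u8>),
    /// With no_forcing: hand the bytes to sampling as a token prefix, so the
    /// model decides on tokenization itself.
    TokenPrefix(Vec<u8>),
    /// Nothing forced and nothing to backtrack.
    None,
}

fn route_forced(new_forced: Vec<u8>, backtrack: u32, no_forcing: bool) -> ForcedBytes {
    // Mirrors the diff: forcing applies only when there are forced bytes
    // and the lexer spec does not opt out of it.
    let do_force = !new_forced.is_empty() && !no_forcing;
    if do_force || backtrack > 0 {
        ForcedBytes::ForceTokens(new_forced)
    } else if !new_forced.is_empty() {
        ForcedBytes::TokenPrefix(new_forced)
    } else {
        ForcedBytes::None
    }
}

fn main() {
    // With no_forcing set and no backtracking, forced bytes become a prefix.
    match route_forced(b"true".to_vec(), 0, true) {
        ForcedBytes::TokenPrefix(bytes) => assert_eq!(bytes, b"true"),
        _ => unreachable!(),
    }
}
```

Note that with `no_forcing` the bytes still constrain sampling, as a token prefix, but the model is free to split them across tokens however it likes, which is exactly the behavior the new doc comment warns is slower and lower quality outside of testing.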
