From 0f6638e77625c94fc3d5c53d267cba960038bfa7 Mon Sep 17 00:00:00 2001
From: Michal Moskal <michal@moskal.me>
Date: Wed, 24 Jul 2024 15:05:13 -0700
Subject: [PATCH] support for allow_invalid_utf8 option in GrammarWithLexer

---
 parser/src/api.rs                  |  9 +++++++--
 parser/src/earley/from_guidance.rs | 11 +++++++++--
 2 files changed, 16 insertions(+), 4 deletions(-)
diff --git a/parser/src/api.rs b/parser/src/api.rs
index a9094f50..4009dac1 100644
--- a/parser/src/api.rs
+++ b/parser/src/api.rs
@@ -35,6 +35,10 @@ pub struct GrammarWithLexer {
     #[serde(default)]
     pub rx_nodes: Vec<RegexNode>,
 
+    /// If set, the grammar will allow skip_rx as the first lexeme.
+    #[serde(default)]
+    pub allow_initial_skip: bool,
+
     /// Normally, when a sequence of bytes is forced by grammar, it is tokenized
     /// canonically and forced as tokens.
     /// With `no_forcing`, we let the model decide on tokenization.
@@ -43,9 +47,10 @@ pub struct GrammarWithLexer {
     #[serde(default)]
     pub no_forcing: bool,
 
-    /// If set, the grammar will allow skip_rx as the first lexeme.
+    /// If set, the grammar will allow invalid UTF-8 byte sequences.
+    /// In this mode, any Unicode regex will cause an error.
     #[serde(default)]
-    pub allow_initial_skip: bool,
+    pub allow_invalid_utf8: bool,
 }
 
 #[derive(Serialize, Deserialize)]
diff --git a/parser/src/earley/from_guidance.rs b/parser/src/earley/from_guidance.rs
index 5911d0f8..cb5e61cb 100644
--- a/parser/src/earley/from_guidance.rs
+++ b/parser/src/earley/from_guidance.rs
@@ -32,8 +32,15 @@ fn map_rx_refs(rx_refs: &[ExprRef], ids: Vec<RegexId>) -> Result<Vec<RegexAst>>
     ids.into_iter().map(|id| map_rx_ref(rx_refs, id)).collect()
 }
 
-fn map_rx_nodes(rx_nodes: Vec<RegexNode>) -> Result<(RegexBuilder, Vec<ExprRef>)> {
+fn map_rx_nodes(
+    rx_nodes: Vec<RegexNode>,
+    allow_invalid_utf8: bool,
+) -> Result<(RegexBuilder, Vec<ExprRef>)> {
     let mut builder = RegexBuilder::new();
+    if allow_invalid_utf8 {
+        builder.utf8(false);
+        builder.unicode(false);
+    }
     let mut rx_refs = vec![];
     for node in rx_nodes {
         rx_refs.push(builder.mk(&map_node(&rx_refs, node)?)?);
@@ -64,7 +71,7 @@ fn map_rx_nodes(rx_nodes: Vec<RegexNode>) -> Result<(RegexBuilder, Vec<ExprRef>)
 }
 
 fn grammar_from_json(input: GrammarWithLexer) -> Result<(LexerSpec, Grammar)> {
-    let (builder, rx_nodes) = map_rx_nodes(input.rx_nodes)?;
+    let (builder, rx_nodes) = map_rx_nodes(input.rx_nodes, input.allow_invalid_utf8)?;
 
     let skip = match input.greedy_skip_rx {
         Some(rx) => resolve_rx(&rx_nodes, &rx)?,