diff --git a/parser/src/api.rs b/parser/src/api.rs index a9094f50..4009dac1 100644 --- a/parser/src/api.rs +++ b/parser/src/api.rs @@ -35,6 +35,10 @@ pub struct GrammarWithLexer { #[serde(default)] pub rx_nodes: Vec, + /// If set, the grammar will allow skip_rx as the first lexeme. + #[serde(default)] + pub allow_initial_skip: bool, + /// Normally, when a sequence of bytes is forced by grammar, it is tokenized /// canonically and forced as tokens. /// With `no_forcing`, we let the model decide on tokenization. @@ -43,9 +47,10 @@ pub struct GrammarWithLexer { #[serde(default)] pub no_forcing: bool, - /// If set, the grammar will allow skip_rx as the first lexeme. + /// If set, the grammar will allow invalid utf8 byte sequences. + /// Any Unicode regex will cause an error. #[serde(default)] - pub allow_initial_skip: bool, + pub allow_invalid_utf8: bool, } #[derive(Serialize, Deserialize)] diff --git a/parser/src/earley/from_guidance.rs b/parser/src/earley/from_guidance.rs index 5911d0f8..cb5e61cb 100644 --- a/parser/src/earley/from_guidance.rs +++ b/parser/src/earley/from_guidance.rs @@ -32,8 +32,15 @@ fn map_rx_refs(rx_refs: &[ExprRef], ids: Vec) -> Result> ids.into_iter().map(|id| map_rx_ref(rx_refs, id)).collect() } -fn map_rx_nodes(rx_nodes: Vec) -> Result<(RegexBuilder, Vec)> { +fn map_rx_nodes( + rx_nodes: Vec, + allow_invalid_utf8: bool, +) -> Result<(RegexBuilder, Vec)> { let mut builder = RegexBuilder::new(); + if allow_invalid_utf8 { + builder.utf8(false); + builder.unicode(false); + } let mut rx_refs = vec![]; for node in rx_nodes { rx_refs.push(builder.mk(&map_node(&rx_refs, node)?)?); @@ -64,7 +71,7 @@ fn map_rx_nodes(rx_nodes: Vec) -> Result<(RegexBuilder, Vec) } fn grammar_from_json(input: GrammarWithLexer) -> Result<(LexerSpec, Grammar)> { - let (builder, rx_nodes) = map_rx_nodes(input.rx_nodes)?; + let (builder, rx_nodes) = map_rx_nodes(input.rx_nodes, input.allow_invalid_utf8)?; let skip = match input.greedy_skip_rx { Some(rx) => resolve_rx(&rx_nodes, &rx)?,