Skip to content

Commit

Permalink
support for allow_invalid_utf8 option in GrammarWithLexer
Browse files Browse the repository at this point in the history
  • Loading branch information
mmoskal committed Jul 24, 2024
1 parent d8d4fb3 commit 0f6638e
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 4 deletions.
9 changes: 7 additions & 2 deletions parser/src/api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ pub struct GrammarWithLexer {
#[serde(default)]
pub rx_nodes: Vec<RegexNode>,

+ /// If set, the grammar will allow skip_rx as the first lexeme.
+ #[serde(default)]
+ pub allow_initial_skip: bool,
+
/// Normally, when a sequence of bytes is forced by grammar, it is tokenized
/// canonically and forced as tokens.
/// With `no_forcing`, we let the model decide on tokenization.
Expand All @@ -43,9 +47,10 @@ pub struct GrammarWithLexer {
#[serde(default)]
pub no_forcing: bool,

- /// If set, the grammar will allow skip_rx as the first lexeme.
+ /// If set, the grammar will allow invalid utf8 byte sequences.
+ /// Any Unicode regex will cause an error.
  #[serde(default)]
- pub allow_initial_skip: bool,
+ pub allow_invalid_utf8: bool,
}

#[derive(Serialize, Deserialize)]
Expand Down
11 changes: 9 additions & 2 deletions parser/src/earley/from_guidance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,15 @@ fn map_rx_refs(rx_refs: &[ExprRef], ids: Vec<RegexId>) -> Result<Vec<RegexAst>>
ids.into_iter().map(|id| map_rx_ref(rx_refs, id)).collect()
}

- fn map_rx_nodes(rx_nodes: Vec<RegexNode>) -> Result<(RegexBuilder, Vec<ExprRef>)> {
+ fn map_rx_nodes(
+ rx_nodes: Vec<RegexNode>,
+ allow_invalid_utf8: bool,
+ ) -> Result<(RegexBuilder, Vec<ExprRef>)> {
  let mut builder = RegexBuilder::new();
+ if allow_invalid_utf8 {
+ builder.utf8(false);
+ builder.unicode(false);
+ }
let mut rx_refs = vec![];
for node in rx_nodes {
rx_refs.push(builder.mk(&map_node(&rx_refs, node)?)?);
Expand Down Expand Up @@ -64,7 +71,7 @@ fn map_rx_nodes(rx_nodes: Vec<RegexNode>) -> Result<(RegexBuilder, Vec<ExprRef>)
}

fn grammar_from_json(input: GrammarWithLexer) -> Result<(LexerSpec, Grammar)> {
- let (builder, rx_nodes) = map_rx_nodes(input.rx_nodes)?;
+ let (builder, rx_nodes) = map_rx_nodes(input.rx_nodes, input.allow_invalid_utf8)?;

let skip = match input.greedy_skip_rx {
Some(rx) => resolve_rx(&rx_nodes, &rx)?,
Expand Down

0 comments on commit 0f6638e

Please sign in to comment.