Skip to content

Commit

Permalink
Statement parse errors now refer to spans (#80)
Browse files Browse the repository at this point in the history
* Statement parse errors now refer to spans

* Prefer using `UndefinedToken`
  • Loading branch information
tirix authored Apr 2, 2022
1 parent d39372e commit 25f1f55
Show file tree
Hide file tree
Showing 3 changed files with 154 additions and 59 deletions.
20 changes: 10 additions & 10 deletions src/diag.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1190,19 +1190,19 @@ impl Diagnostic {
#[derive(Debug, Clone, Eq, PartialEq)]
#[allow(missing_docs)]
pub enum StmtParseError {
ParsedStatementTooShort(Option<Token>),
ParsedStatementTooShort(Span, Option<Token>),
ParsedStatementNoTypeCode,
ParsedStatementWrongTypeCode(Token),
UnknownToken(TokenIndex),
UnparseableStatement(TokenIndex),
UnknownToken(Span),
UnparseableStatement(Span),
}

impl StmtParseError {
/// The diagnostic's label
#[must_use]
pub fn label<'a>(&self) -> Cow<'a, str> {
match self {
StmtParseError::ParsedStatementTooShort(_) => "Parsed statement too short",
StmtParseError::ParsedStatementTooShort(_, _) => "Parsed statement too short",
StmtParseError::ParsedStatementWrongTypeCode(_) => {
"Parsed statement has wrong typecode"
}
Expand All @@ -1226,7 +1226,7 @@ impl StmtParseError {
}
let severity = self.severity();
let info = match self {
StmtParseError::ParsedStatementTooShort(ref opt_tok) => (
StmtParseError::ParsedStatementTooShort(span, ref opt_tok) => (
severity,
match opt_tok {
Some(tok) => format!(
Expand All @@ -1239,7 +1239,7 @@ impl StmtParseError {
}
},
stmt,
stmt.span(),
*span,
),
StmtParseError::ParsedStatementWrongTypeCode(ref found) => (
severity,
Expand All @@ -1251,17 +1251,17 @@ impl StmtParseError {
stmt,
stmt.span(),
),
StmtParseError::UnknownToken(index) => (
StmtParseError::UnknownToken(span) => (
severity,
"This token was not declared in any $v or $c statement".into(),
stmt,
stmt.math_span(*index),
*span,
),
StmtParseError::UnparseableStatement(index) => (
StmtParseError::UnparseableStatement(span) => (
severity,
"Could not parse this statement".into(),
stmt,
stmt.math_span(*index),
*span,
),
StmtParseError::ParsedStatementNoTypeCode => (
AnnotationType::Error,
Expand Down
147 changes: 108 additions & 39 deletions src/grammar.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use crate::segment::Segment;
use crate::segment_set::SegmentSet;
use crate::statement::{CommandToken, SegmentId, StatementAddress, SymbolType, TokenRef};
use crate::util::HashMap;
use crate::{as_str, Database, StatementRef, StatementType};
use crate::{as_str, Database, Span, StatementRef, StatementType};
use log::{debug, warn};
use std::collections::hash_map::Entry;
use std::fmt;
Expand Down Expand Up @@ -401,8 +401,8 @@ impl Default for Grammar {
}
}

const fn undefined(token: TokenRef<'_>) -> StmtParseError {
StmtParseError::UnknownToken(token.address.token_index)
fn undefined(token: TokenRef<'_>, sref: &StatementRef<'_>) -> Diagnostic {
Diagnostic::UndefinedToken(sref.math_span(token.index()), token.slice.into())
}

fn undefined_cmd(token: &CommandToken, buf: &[u8]) -> Diagnostic {
Expand Down Expand Up @@ -466,8 +466,13 @@ impl Grammar {
}
}

fn too_short(map: &HashMap<(SymbolType, Atom), NextNode>, nset: &Nameset) -> StmtParseError {
fn too_short(
last_token: FormulaToken,
map: &HashMap<(SymbolType, Atom), NextNode>,
nset: &Nameset,
) -> StmtParseError {
StmtParseError::ParsedStatementTooShort(
last_token.span,
map.keys()
.find(|k| k.0 == SymbolType::Constant)
.map(|(_, expected_symbol)| nset.atom_name(*expected_symbol).into()),
Expand Down Expand Up @@ -587,7 +592,7 @@ impl Grammar {
while let Some(token) = tokens.next() {
let symbol = names
.lookup_symbol(token.slice)
.ok_or_else(|| undefined(token))?;
.ok_or_else(|| undefined(token, sref))?;
let atom = match symbol.stype {
SymbolType::Constant => symbol.atom,
SymbolType::Variable => {
Expand All @@ -596,7 +601,7 @@ impl Grammar {
// Ideally this information would be included in the LookupSymbol
names
.lookup_float(token.slice)
.ok_or_else(|| undefined(token))?
.ok_or_else(|| undefined(token, sref))?
.typecode_atom
}
};
Expand Down Expand Up @@ -1156,10 +1161,10 @@ impl Grammar {
Ok(())
}

fn do_shift(&self, symbol_iter: &mut dyn Iterator<Item = (usize, Symbol)>, nset: &Nameset) {
if let Some((_ix, symbol)) = symbol_iter.next() {
fn do_shift(&self, symbol_iter: &mut dyn Iterator<Item = FormulaToken>, nset: &Nameset) {
if let Some(token) = symbol_iter.next() {
if self.debug {
debug!(" SHIFT {:?}", as_str(nset.atom_name(symbol)));
debug!(" SHIFT {:?}", as_str(nset.atom_name(token.symbol)));
}
}
}
Expand All @@ -1184,7 +1189,7 @@ impl Grammar {
/// Parses the given list of symbols into a formula syntax tree.
pub fn parse_formula(
&self,
symbol_iter: &mut impl Iterator<Item = Symbol>,
symbol_iter: &mut impl Iterator<Item = FormulaToken>,
expected_typecodes: &[TypeCode],
nset: &Nameset,
) -> Result<Formula, StmtParseError> {
Expand All @@ -1194,8 +1199,10 @@ impl Grammar {
}

let mut formula_builder = FormulaBuilder::default();
let mut symbol_enum = symbol_iter.enumerate().peekable();
let mut ix = 0;
let mut symbol_enum = symbol_iter.peekable();
let mut last_token = *symbol_enum
.peek()
.ok_or(StmtParseError::ParsedStatementNoTypeCode)?;
let mut e = StackElement {
node_id: self.root,
expected_typecodes: expected_typecodes.to_vec().into_boxed_slice(),
Expand Down Expand Up @@ -1241,7 +1248,7 @@ impl Grammar {
// There are still symbols to parse, continue from root
let (next_node_id, leaf_label) = self
.next_var_node(self.root, typecode)
.ok_or(StmtParseError::UnparseableStatement(ix))?;
.ok_or(StmtParseError::UnparseableStatement(last_token.span))?;
for &reduce in leaf_label {
Self::do_reduce(&mut formula_builder, reduce, nset);
}
Expand All @@ -1264,22 +1271,22 @@ impl Grammar {
debug!(" ++ Wrong type obtained, continue.");
let (next_node_id, leaf_label) = self
.next_var_node(self.root, typecode)
.ok_or(StmtParseError::UnparseableStatement(ix))?;
.ok_or(StmtParseError::UnparseableStatement(last_token.span))?;
for &reduce in leaf_label {
Self::do_reduce(&mut formula_builder, reduce, nset);
}
e.node_id = next_node_id;
}
}
GrammarNode::Branch { ref map } => {
if let Some(&(index, symbol)) = symbol_enum.peek() {
ix = index as i32;
debug!(" {:?}", as_str(nset.atom_name(symbol)));
if let Some(&token) = symbol_enum.peek() {
last_token = token;
debug!(" {:?}", as_str(nset.atom_name(token.symbol)));

if let Some(NextNode {
next_node_id,
leaf_label,
}) = map.get(&(SymbolType::Constant, symbol))
}) = map.get(&(SymbolType::Constant, token.symbol))
{
// Found an atom matching one of our next nodes: First optionally REDUCE and continue
for &reduce in leaf_label {
Expand All @@ -1293,7 +1300,7 @@ impl Grammar {
} else {
// No matching constant, search among variables
if map.is_empty() || e.node_id == self.root {
return Err(StmtParseError::UnparseableStatement(ix));
return Err(StmtParseError::UnparseableStatement(token.span));
}

debug!(
Expand All @@ -1313,39 +1320,96 @@ impl Grammar {
};
}
} else {
return Err(Grammar::too_short(map, nset));
return Err(Grammar::too_short(last_token, map, nset));
}
}
}
}
}
}

/// A formula symbol paired with the location it was read from.
///
/// The grammar parser consumes a stream of these instead of bare symbols, so
/// that a [`StmtParseError`] can point at the exact span of the offending
/// token rather than at a token index.
#[derive(Clone, Copy, Debug)]
pub struct FormulaToken {
/// The atom identifying this token's math symbol, as returned by the
/// nameset's symbol lookup.
pub symbol: Symbol,
/// The span this token has been read from, used for error reporting.
/// When the token comes from a plain string, the span is relative to that
/// string; when it comes from a database statement, it is the span of the
/// corresponding math token in that statement.
pub span: Span,
}

/// An iterator through the whitespace-separated tokens of a string.
///
/// Each token is looked up in the nameset and yielded as a [`FormulaToken`]
/// carrying the byte span it occupies in the input string. Tokens which are
/// not known symbols are yielded as [`StmtParseError::UnknownToken`] with
/// that same span.
struct FormulaTokenIter<'a> {
    /// The full input string; spans are byte offsets into it.
    string: &'a str,
    /// The nameset used to resolve token text into symbols.
    nset: &'a Arc<Nameset>,
    /// Byte offset at which the search for the next token starts.
    last_pos: usize,
    /// Set once the end of the input string has been reached.
    done: bool,
}

impl<'a> FormulaTokenIter<'a> {
    /// Builds a `FormulaTokenIter` from a string.
    ///
    /// Tokens are separated by spaces, tabs or newlines. Positions are
    /// tracked in bytes, so the input does not need to be ASCII: a token
    /// containing non-ASCII characters is simply reported as unknown if it
    /// is not a declared symbol.
    fn from_str(string: &'a str, nset: &'a Arc<Nameset>) -> Self {
        Self {
            string,
            nset,
            last_pos: 0,
            done: false,
        }
    }
}

impl Iterator for FormulaTokenIter<'_> {
    type Item = Result<FormulaToken, StmtParseError>;

    /// Returns the next non-empty token, or `None` once the input is exhausted.
    ///
    /// Leading, trailing and repeated separators do not produce tokens, so
    /// an empty or all-whitespace input yields no item at all.
    fn next(&mut self) -> Option<Self::Item> {
        while !self.done {
            let start = self.last_pos;
            // `str::find` returns a *byte* offset, which is what both `Span`
            // and string slicing expect. Counting characters instead would
            // give wrong offsets (and possibly a slicing panic) as soon as
            // the input contains a multi-byte character.
            let end = match self.string[start..]
                .find(|c: char| c == ' ' || c == '\t' || c == '\n')
            {
                Some(len) => start + len,
                None => {
                    self.done = true;
                    self.string.len()
                }
            };
            // All separators are one byte long, so the next token starts
            // right after the one just found. When `done` is set this value
            // is never read again.
            self.last_pos = end + 1;
            if start == end {
                // Empty token: two adjacent separators, or a separator at the
                // very beginning or end of the input. Skip it.
                continue;
            }
            let span = Span::new(start, end);
            let text = &self.string[start..end];
            return Some(match self.nset.lookup_symbol(text.as_bytes()) {
                Some(lookup) => Ok(FormulaToken {
                    symbol: lookup.atom,
                    span,
                }),
                None => Err(StmtParseError::UnknownToken(span)),
            });
        }
        None
    }
}

impl Grammar {
/// Parses a character string into a formula
/// As a first math token, the string is expected to contain the typecode for the formula.
/// Diagnostics mark the errors with [Span]s based on the position in the input string.
pub fn parse_string(
&self,
formula_string: &str,
nset: &Arc<Nameset>,
) -> Result<Formula, StmtParseError> {
// TODO an iterator taking notes of the start and end of the math tokens would allow to return richer error messages, including actual spans rather than indices.
let mut symbols = formula_string.trim().split(&[' ', '\t', '\n']);
let typecode_name = symbols
let mut symbols = FormulaTokenIter::from_str(formula_string, nset)
.collect::<Result<Vec<_>, _>>()?
.into_iter();
let typecode = symbols
.next()
.ok_or(StmtParseError::ParsedStatementNoTypeCode)?;
let typecode = nset
.lookup_symbol(typecode_name.as_bytes())
.ok_or(StmtParseError::UnknownToken(0))?
.atom;
let expected_typecode = if typecode == self.provable_type {
let expected_typecode = if typecode.symbol == self.provable_type {
self.logic_type
} else {
typecode
typecode.symbol
};
self.parse_formula(
&mut symbols.map(|t| nset.lookup_symbol(t.as_bytes()).unwrap().atom),
&[expected_typecode],
nset,
)
self.parse_formula(&mut symbols, &[expected_typecode], nset)
}

fn parse_statement(
Expand Down Expand Up @@ -1383,10 +1447,14 @@ impl Grammar {
.math_iter()
.skip(1)
.map(|token| {
let span = sref.math_span(token.index());
if let Some(lookup) = names.lookup_symbol(token.slice) {
Ok(lookup.atom)
Ok(FormulaToken {
symbol: lookup.atom,
span,
})
} else {
Err(undefined(token))
Err(StmtParseError::UnknownToken(span))
}
})
.collect();
Expand Down Expand Up @@ -1505,9 +1573,7 @@ impl Grammar {
}
Ok(())
}
}

impl Grammar {
/// Called by [`crate::Database`] to build the grammar from the syntax axioms in the database.
///
/// The provided `sset`, and `nset` shall be the result of previous phases over the database.
Expand Down Expand Up @@ -1599,7 +1665,10 @@ impl StmtParse {
let math_iter = sref.math_iter().flat_map(|token| {
nset.lookup_symbol(token.slice)
.ok_or_else(|| {
(sref.address(), StmtParseError::UnknownToken(token.index()))
(
sref.address(),
StmtParseError::UnknownToken(sref.math_span(token.index())),
)
})
.map(|l| l.atom)
});
Expand Down
Loading

0 comments on commit 25f1f55

Please sign in to comment.