diff --git a/src/diag.rs b/src/diag.rs index a0fffa1..a2c6b9c 100644 --- a/src/diag.rs +++ b/src/diag.rs @@ -170,9 +170,6 @@ pub enum Diagnostic { NotAProvableStatement, OldAltNotDiscouraged, ParenOrderError(Span, Span), - ParsedStatementTooShort(Option), - ParsedStatementNoTypeCode, - ParsedStatementWrongTypeCode(Token), ProofDvViolation, ProofExcessEnd, ProofIncomplete, @@ -196,6 +193,7 @@ pub enum Diagnostic { StepOutOfRange, StepUsedAfterScope(Token), StepUsedBeforeDefinition(Token), + StmtParseError(StmtParseError), SymbolDuplicatesLabel(TokenIndex, StatementAddress), SymbolRedeclared(TokenIndex, TokenAddress), TabUsed(Span), @@ -217,10 +215,8 @@ pub enum Diagnostic { UninterpretedHtml(Span), UnknownLabel(Span), UnknownKeyword(Span), - UnknownToken(TokenIndex), UnknownTypesettingCommand(Span), UnmatchedCloseGroup, - UnparseableStatement(TokenIndex), VariableMissingFloat(TokenIndex), VariableRedeclaredAsConstant(TokenIndex, TokenAddress), WindowsReservedLabel(Span), @@ -793,12 +789,6 @@ impl Diagnostic { stmt, stmt.span(), )]), - ParsedStatementNoTypeCode => ("Empty statement".into(), vec![( - AnnotationType::Error, - "Empty statement".into(), - stmt, - stmt.span(), - )]), OldAltNotDiscouraged if stmt.statement_type() == StatementType::Axiom => { notes = &["Add (New usage is discouraged.) to the comment"]; ("OLD/ALT axiom not discouraged".into(), vec![( @@ -832,21 +822,6 @@ impl Diagnostic { later, )]) } - ParsedStatementTooShort(ref opt_tok) => ("Parsed statement too short".into(), vec![( - AnnotationType::Error, - match opt_tok { - Some(tok) => format!("Statement is too short, expecting for example {expected}", expected = t(tok)).into(), - None => "Statement is too short, and does not correspond to any valid prefix".into(), - }, - stmt, - stmt.span(), - )]), - ParsedStatementWrongTypeCode(ref found) => ("Parsed statement has wrong typecode".into(), vec![( - AnnotationType::Error, - format!("Type code {found} is not among the expected type codes", found = t(found)).into(), - stmt, - stmt.span(), - )]), ProofDvViolation => ("Distinct variable violation".into(), vec![( AnnotationType::Error, "Disjoint variable constraint violated".into(), @@ -1003,6 +978,7 @@ impl Diagnostic { stmt, stmt.span(), )]), + StmtParseError(err) => err.build_info(stmt), SymbolDuplicatesLabel(index, saddr) => ("Symbol duplicates label".into(), vec![( AnnotationType::Warning, "Metamath spec forbids symbols which are the same as labels in the same \ @@ -1171,24 +1147,12 @@ impl Diagnostic { stmt, *span, )]), - UnknownToken(index) => ("Unknown token".into(), vec![( - AnnotationType::Error, - "This token was not declared in any $v or $c statement".into(), - stmt, - stmt.math_span(*index), - )]), UnmatchedCloseGroup => ("Unmatched close group".into(), vec![( AnnotationType::Error, "This $} does not match any open ${".into(), stmt, stmt.span(), )]), - UnparseableStatement(index) => ("Unparseable statement".into(), vec![( - AnnotationType::Error, - "Could not parse this statement".into(), - stmt, - stmt.math_span(*index), - )]), VariableMissingFloat(index) => ("Variable missing float".into(), vec![( AnnotationType::Error, "Variable token used in statement must have an active $f".into(), @@ -1222,6 +1186,100 @@ impl Diagnostic { } } +/// An error during statement parsing. +#[derive(Debug, Clone, Eq, PartialEq)] +#[allow(missing_docs)] +pub enum StmtParseError { + ParsedStatementTooShort(Option), + ParsedStatementNoTypeCode, + ParsedStatementWrongTypeCode(Token), + UnknownToken(TokenIndex), + UnparseableStatement(TokenIndex), +} + +impl StmtParseError { + /// The diagnostic's label + #[must_use] + pub fn label<'a>(&self) -> Cow<'a, str> { + match self { + StmtParseError::ParsedStatementTooShort(_) => "Parsed statement too short", + StmtParseError::ParsedStatementWrongTypeCode(_) => { + "Parsed statement has wrong typecode" + } + StmtParseError::UnknownToken(_) => "Unknown token", + StmtParseError::UnparseableStatement(_) => "Unparseable statement", + StmtParseError::ParsedStatementNoTypeCode => "Empty statement", + } + .into() + } + + /// The diagnostic's severity + #[must_use] + #[allow(clippy::unused_self)] + pub const fn severity(&self) -> AnnotationType { + AnnotationType::Error + } + + fn build_info<'a>(&self, stmt: StatementRef<'a>) -> AnnInfo<'a> { + fn t(v: &Token) -> String { + as_str(v).to_owned() + } + let severity = self.severity(); + let info = match self { + StmtParseError::ParsedStatementTooShort(ref opt_tok) => ( + severity, + match opt_tok { + Some(tok) => format!( + "Statement is too short, expecting for example {expected}", + expected = t(tok) + ) + .into(), + None => { + "Statement is too short, and does not correspond to any valid prefix".into() + } + }, + stmt, + stmt.span(), + ), + StmtParseError::ParsedStatementWrongTypeCode(ref found) => ( + severity, + format!( + "Type code {found} is not among the expected type codes", + found = t(found) + ) + .into(), + stmt, + stmt.span(), + ), + StmtParseError::UnknownToken(index) => ( + severity, + "This token was not declared in any $v or $c statement".into(), + stmt, + stmt.math_span(*index), + ), + StmtParseError::UnparseableStatement(index) => ( + severity, + "Could not parse this statement".into(), + stmt, + stmt.math_span(*index), + ), + StmtParseError::ParsedStatementNoTypeCode => ( + AnnotationType::Error, + "Empty statement".into(), + stmt, + stmt.span(), + ), + }; + (self.label(), vec![info]) + } +} + +impl From for Diagnostic { + fn from(err: StmtParseError) -> Self { + Diagnostic::StmtParseError(err) + } +} + /// An error during bibliography parsing. #[derive(Debug, Clone, Copy)] #[allow(missing_docs)] diff --git a/src/grammar.rs b/src/grammar.rs index c04f1c3..7acb68c 100644 --- a/src/grammar.rs +++ b/src/grammar.rs @@ -3,12 +3,12 @@ // Possibly: Remove branch/leaf and keep only the optional leaf? (then final leaf = no next node id) -use crate::diag::Diagnostic; +use crate::diag::{Diagnostic, StmtParseError}; use crate::formula::{Formula, FormulaBuilder, Label, Symbol, TypeCode}; use crate::nameck::{Atom, NameReader, Nameset}; use crate::segment::Segment; use crate::segment_set::SegmentSet; -use crate::statement::{CommandToken, SegmentId, StatementAddress, SymbolType, TokenPtr, TokenRef}; +use crate::statement::{CommandToken, SegmentId, StatementAddress, SymbolType, TokenRef}; use crate::util::HashMap; use crate::{as_str, Database, StatementRef, StatementType}; use log::{debug, warn}; @@ -401,8 +401,12 @@ impl Default for Grammar { } } -const fn undefined(token: TokenRef<'_>) -> Diagnostic { - Diagnostic::UnknownToken(token.address.token_index) +const fn undefined(token: TokenRef<'_>) -> StmtParseError { + StmtParseError::UnknownToken(token.address.token_index) +} + +fn undefined_cmd(token: &CommandToken, buf: &[u8]) -> Diagnostic { + Diagnostic::UndefinedToken(token.full_span(), token.value(buf).into()) } impl Grammar { @@ -462,8 +466,8 @@ impl Grammar { } } - fn too_short(map: &HashMap<(SymbolType, Atom), NextNode>, nset: &Nameset) -> Diagnostic { - Diagnostic::ParsedStatementTooShort( + fn too_short(map: &HashMap<(SymbolType, Atom), NextNode>, nset: &Nameset) -> StmtParseError { + StmtParseError::ParsedStatementTooShort( map.keys() .find(|k| k.0 == SymbolType::Constant) .map(|(_, expected_symbol)| nset.atom_name(*expected_symbol).into()), @@ -551,7 +555,7 @@ impl Grammar { .lookup_label(sref.label()) .ok_or_else(|| Diagnostic::UnknownLabel(sref.label_span()))? .atom; - let this_typecode = nset.get_atom(tokens.next().ok_or(Diagnostic::UnknownToken(0))?.slice); + let this_typecode = nset.get_atom(tokens.next().ok_or(Diagnostic::EmptyMathString)?.slice); // In case of a non-syntax axiom, skip it. if this_typecode == self.provable_type { @@ -563,7 +567,7 @@ impl Grammar { let token_ptr = sref.math_at(1).slice; let symbol = names .lookup_symbol(token_ptr) - .ok_or(Diagnostic::UnknownToken(1))?; + .ok_or_else(|| Diagnostic::UndefinedToken(sref.math_span(1), token_ptr.into()))?; if symbol.stype == SymbolType::Variable { let from_typecode = match names.lookup_float(token_ptr) { Some(lookup_float) => lookup_float.typecode_atom, @@ -932,8 +936,9 @@ impl Grammar { #[allow(clippy::unnecessary_wraps)] fn handle_common_prefixes( &mut self, - prefix: &[TokenPtr<'_>], - shadows: &[TokenPtr<'_>], + prefix: &[CommandToken], + shadows: &[CommandToken], + buf: &[u8], db: &Database, names: &mut NameReader<'_>, ) -> Result<(), Diagnostic> { @@ -942,15 +947,15 @@ impl Grammar { // First we follow the tree to the common prefix loop { - if prefix[index] != shadows[index] { + if prefix[index].value(buf) != shadows[index].value(buf) { break; } // TODO(tirix): use https://rust-lang.github.io/rfcs/2497-if-let-chains.html once it's out! if let GrammarNode::Branch { map } = self.nodes.get(node_id) { let prefix_symbol = db .name_result() - .lookup_symbol(prefix[index]) - .ok_or(Diagnostic::UnknownToken(index as i32))? + .lookup_symbol(prefix[index].value(buf)) + .ok_or_else(|| undefined_cmd(&prefix[index], buf))? .atom; let next_node = map .get(&(SymbolType::Constant, prefix_symbol)) @@ -966,8 +971,8 @@ impl Grammar { // We note the typecode and next branch of the "shadowed" prefix let shadowed_typecode = names - .lookup_float(shadows[index]) - .ok_or(Diagnostic::UnknownToken(index as i32))? + .lookup_float(shadows[index].value(buf)) + .ok_or_else(|| undefined_cmd(&shadows[index], buf))? .typecode_atom; let (shadowed_next_node, _) = self.next_var_node(node_id, shadowed_typecode) @@ -982,11 +987,11 @@ impl Grammar { let mut missing_reduce = ReduceVec::new(); for token in &prefix[index..] { let lookup_symbol = names - .lookup_symbol(token) - .ok_or(Diagnostic::UnknownToken(index as i32))?; + .lookup_symbol(token.value(buf)) + .ok_or_else(|| undefined_cmd(&*token, buf))?; debug!( "Following prefix {}, at {} / {}", - as_str(token), + as_str(token.value(buf)), node_id, add_from_node_id ); @@ -996,8 +1001,8 @@ impl Grammar { SymbolType::Variable => { increment_offsets(&mut missing_reduce); names - .lookup_float(token) - .ok_or(Diagnostic::UnknownToken(index as i32))? + .lookup_float(token.value(buf)) + .ok_or_else(|| undefined_cmd(token, buf))? .typecode_atom } }; @@ -1024,7 +1029,7 @@ impl Grammar { } } - debug!("Shadowed token: {}", as_str(shadows[index])); + debug!("Shadowed token: {}", as_str(shadows[index].value(buf))); debug!("Missing reduces: {}", missing_reduce.len()); debug!( "Handle shadowed next node {}, typecode {:?}", @@ -1097,23 +1102,11 @@ impl Grammar { // syntax '|-' as 'wff'; self.provable_type = nset .lookup_symbol(ty.value(buf)) - .ok_or(( - address, - Diagnostic::UndefinedToken( - ty.full_span(), - ty.value(buf).into(), - ), - ))? + .ok_or((address, undefined_cmd(ty, buf)))? .atom; self.typecodes.push( nset.lookup_symbol(code.value(buf)) - .ok_or(( - address, - Diagnostic::UndefinedToken( - code.full_span(), - code.value(buf).into(), - ), - ))? + .ok_or((address, undefined_cmd(code, buf)))? .atom, ); } @@ -1121,13 +1114,7 @@ impl Grammar { // syntax 'setvar'; self.typecodes.push( nset.lookup_symbol(ty.value(buf)) - .ok_or(( - address, - Diagnostic::UndefinedToken( - ty.full_span(), - ty.value(buf).into(), - ), - ))? + .ok_or((address, undefined_cmd(ty, buf)))? .atom, ); } @@ -1140,14 +1127,8 @@ impl Grammar { .position(|t| matches!(t, Keyword(k) if k.as_ref(buf) == b"=>")) .ok_or((address, Diagnostic::BadCommand(*k)))?; // '=>' not present in 'garden_path' command let (prefix, shadows) = rest.split_at(split_index); - let prefix = - prefix.iter().map(|tk| tk.value(buf)).collect::>(); - let shadows = shadows[1..] - .iter() - .map(|tk| tk.value(buf)) - .collect::>(); if let Err(diag) = - self.handle_common_prefixes(&prefix, &shadows, db, names) + self.handle_common_prefixes(prefix, &shadows[1..], buf, db, names) { return Err((address, diag)); } @@ -1206,7 +1187,7 @@ impl Grammar { symbol_iter: &mut impl Iterator, expected_typecodes: &[TypeCode], nset: &Nameset, - ) -> Result { + ) -> Result { struct StackElement { node_id: NodeId, expected_typecodes: Box<[TypeCode]>, @@ -1260,7 +1241,7 @@ impl Grammar { // There are still symbols to parse, continue from root let (next_node_id, leaf_label) = self .next_var_node(self.root, typecode) - .ok_or(Diagnostic::UnparseableStatement(ix))?; + .ok_or(StmtParseError::UnparseableStatement(ix))?; for &reduce in leaf_label { Self::do_reduce(&mut formula_builder, reduce, nset); } @@ -1283,7 +1264,7 @@ impl Grammar { debug!(" ++ Wrong type obtained, continue."); let (next_node_id, leaf_label) = self .next_var_node(self.root, typecode) - .ok_or(Diagnostic::UnparseableStatement(ix))?; + .ok_or(StmtParseError::UnparseableStatement(ix))?; for &reduce in leaf_label { Self::do_reduce(&mut formula_builder, reduce, nset); } @@ -1312,7 +1293,7 @@ impl Grammar { } else { // No matching constant, search among variables if map.is_empty() || e.node_id == self.root { - return Err(Diagnostic::UnparseableStatement(ix)); + return Err(StmtParseError::UnparseableStatement(ix)); } debug!( @@ -1339,14 +1320,42 @@ impl Grammar { } } + /// Parses a character string into a formula + /// As a first math token, the string is expected to contain the typecode for the formula. + pub fn parse_string( + &self, + formula_string: &str, + nset: &Arc, + ) -> Result { + // TODO an iterator taking notes of the start and end of the math tokens would allow to return richer error messages, including actual spans rather than indices. + let mut symbols = formula_string.trim().split(&[' ', '\t', '\n']); + let typecode_name = symbols + .next() + .ok_or(StmtParseError::ParsedStatementNoTypeCode)?; + let typecode = nset + .lookup_symbol(typecode_name.as_bytes()) + .ok_or(StmtParseError::UnknownToken(0))? + .atom; + let expected_typecode = if typecode == self.provable_type { + self.logic_type + } else { + typecode + }; + self.parse_formula( + &mut symbols.map(|t| nset.lookup_symbol(t.as_bytes()).unwrap().atom), + &[expected_typecode], + nset, + ) + } + fn parse_statement( &self, sref: &StatementRef<'_>, nset: &Nameset, names: &mut NameReader<'_>, - ) -> Result, Diagnostic> { + ) -> Result, StmtParseError> { if sref.math_len() == 0 { - return Err(Diagnostic::ParsedStatementNoTypeCode); + return Err(StmtParseError::ParsedStatementNoTypeCode); } // Type token. It is safe to unwrap here since parser has checked for EmptyMathString error. @@ -1573,7 +1582,7 @@ impl StmtParse { let mut out = Vec::new(); for sps in self.segments.values() { for (&sa, diag) in &sps.diagnostics { - out.push((sa, diag.clone())); + out.push((sa, diag.clone().into())); } } out @@ -1589,7 +1598,9 @@ impl StmtParse { let sref = sset.statement(sa); let math_iter = sref.math_iter().flat_map(|token| { nset.lookup_symbol(token.slice) - .ok_or_else(|| (sref.address(), Diagnostic::UnknownToken(token.index()))) + .ok_or_else(|| { + (sref.address(), StmtParseError::UnknownToken(token.index())) + }) .map(|l| l.atom) }); let fmla_iter = formula.as_ref(db).iter(); @@ -1630,7 +1641,7 @@ impl StmtParse { #[derive(Debug)] struct StmtParseSegment { _source: Arc, - diagnostics: HashMap, + diagnostics: HashMap, formulas: HashMap, } diff --git a/src/grammar_tests.rs b/src/grammar_tests.rs index 46bfda9..8e3d3eb 100644 --- a/src/grammar_tests.rs +++ b/src/grammar_tests.rs @@ -2,6 +2,7 @@ use crate::as_str; use crate::database::Database; use crate::database::DbOptions; use crate::diag::Diagnostic; +use crate::diag::StmtParseError; use crate::statement::SegmentId; use crate::statement::StatementAddress; @@ -306,12 +307,12 @@ grammar_test!( b"$c |- $. err $a |- unknown $.", 2, 1, - Diagnostic::UnknownToken(1) + Diagnostic::UndefinedToken(crate::Span::new(19, 26), Box::new(*b"unknown")) ); grammar_test!( test_unknown_token_2, b"$c |- ( $. err $a |- ( unknown $.", 2, 1, - Diagnostic::UnknownToken(2) + Diagnostic::StmtParseError(StmtParseError::UnknownToken(2)) );