
Commit

Implement Optional Footnote References Ignore
petervaro authored and drahnr committed Nov 7, 2023
1 parent 5a1d955 commit 351f12e
Showing 9 changed files with 108 additions and 32 deletions.
6 changes: 3 additions & 3 deletions doc-chunks/src/chunk.rs
@@ -9,11 +9,11 @@ use std::convert::TryFrom;
use std::fmt;
use std::path::Path;

-use crate::PlainOverlay;
use crate::{
util::{sub_char_range, sub_chars},
Range, Span,
};
+use crate::{Ignores, PlainOverlay};

/// Definition of the source of a checkable chunk
#[derive(Debug, Clone, Hash, Eq, PartialEq)]
@@ -364,8 +364,8 @@ impl CheckableChunk {

/// Obtain an accessor object containing mapping and string representation,
/// removing the markdown annotations.
-pub fn erase_cmark(&self) -> PlainOverlay {
-PlainOverlay::erase_cmark(self)
+pub fn erase_cmark(&self, ignores: &Ignores) -> PlainOverlay {
+PlainOverlay::erase_cmark(self, ignores)
}

/// Obtain the length in characters.
19 changes: 15 additions & 4 deletions doc-chunks/src/markdown.rs
@@ -136,7 +136,10 @@ impl<'a> PlainOverlay<'a> {
}

/// Ranges are mapped `cmark reduced/plain -> raw`.
-pub fn extract_plain_with_mapping(cmark: &str) -> (String, IndexMap<Range, SourceRange>) {
+pub fn extract_plain_with_mapping(
+cmark: &str,
+ignores: &Ignores,
+) -> (String, IndexMap<Range, SourceRange>) {
let mut plain = String::with_capacity(cmark.len());
let mut mapping = indexmap::IndexMap::with_capacity(128);

@@ -322,7 +325,7 @@
}
}
Event::FootnoteReference(s) => {
-if !s.is_empty() {
+if !ignores.footnote_references && !s.is_empty() {
let char_range = Range {
start: char_range.start + 2,
end: char_range.end - 1,
@@ -368,8 +371,8 @@
/// reference.
// TODO consider returning a `Vec<PlainOverlay<'a>>` to account for list items
// or other non-linear information which might not pass a grammar check as a whole
-pub fn erase_cmark(chunk: &'a CheckableChunk) -> Self {
-let (plain, mapping) = Self::extract_plain_with_mapping(chunk.as_str());
+pub fn erase_cmark(chunk: &'a CheckableChunk, ignores: &Ignores) -> Self {
+let (plain, mapping) = Self::extract_plain_with_mapping(chunk.as_str(), ignores);
Self {
raw: chunk,
plain,
@@ -518,3 +521,11 @@ impl<'a> fmt::Debug for PlainOverlay<'a> {
Ok(())
}
}

+/// Explicitly ignored markdown entities. The `Default` implementation means we
+/// do not ignore anything, which is the backwards compatible configuration.
+#[derive(Clone, Default)]
+pub struct Ignores {
+/// Ignore [footnote references](Event::FootnoteReference).
+pub footnote_references: bool,
+}
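For illustration, a minimal sketch of what the new `Ignores` toggle does to the plain-text reduction, mirroring the `check_footnote_references` test added to `src/tests.rs` further down. The `doc_chunks` import path and the `main` wrapper are assumptions for the sketch, not part of this commit:

    use doc_chunks::{Ignores, PlainOverlay};

    fn main() {
        const SOURCE: &str = "Hello[^xyz].\n\n[^xyz]: World.";

        // Backwards-compatible default: the reference label is kept and ends up
        // glued to the preceding word, which a spellchecker then flags.
        let (plain, _mapping) =
            PlainOverlay::extract_plain_with_mapping(SOURCE, &Ignores::default());
        assert_eq!(plain, "Helloxyz.\n\nWorld.");

        // With the toggle enabled, the reference is dropped from the plain text
        // and only the surrounding words remain.
        let ignores = Ignores { footnote_references: true };
        let (plain, _mapping) =
            PlainOverlay::extract_plain_with_mapping(SOURCE, &ignores);
        assert_eq!(plain, "Hello.\n\nWorld.");
    }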
8 changes: 8 additions & 0 deletions docs/configuration.md
@@ -97,6 +97,14 @@ allow_concatenation = true
# And the counterpart, which accepts words with dashes, when the suggestion has
# recommendations without the dashes. This is less common.
allow_dashed = false
+# Check the expressions in the footnote references. By default this is turned on
+# to remain backwards compatible, but disabling it can be particularly useful
+# when abbreviations are used instead of numbers as footnote references. For
+# instance, by default the fragment `hello[^xyz]` would be spellchecked as
+# `helloxyz`, which is obviously a misspelled word; with this check turned
+# off, the reference is skipped altogether and only the word `hello` is
+# checked.
+check_footnote_references = false

[NlpRules]
# Allows the user to override the default included
2 changes: 1 addition & 1 deletion src/checker/dummy.rs
@@ -39,7 +39,7 @@ impl Checker for DummyChecker {
let chunk = chunks
.first()
.expect("DummyChecker expects at least one chunk");
-let plain = chunk.erase_cmark();
+let plain = chunk.erase_cmark(&Default::default());
let txt = plain.as_str();
for (index, range) in apply_tokenizer(&tokenizer, txt).enumerate() {
log::trace!("****Token[{}]: >{}<", index, sub_chars(txt, range.clone()));
31 changes: 21 additions & 10 deletions src/checker/hunspell.rs
@@ -23,6 +23,8 @@ use std::sync::Arc;

use hunspell_rs::{CheckResult, Hunspell};

+use doc_chunks::Ignores;

use crate::errors::*;

use super::quirks::{
@@ -133,22 +135,28 @@ pub struct HunspellCheckerInner {
allow_concatenated: bool,
allow_dashed: bool,
allow_emojis: bool,
+check_footnote_references: bool,
ignorelist: String,
}

impl HunspellCheckerInner {
fn new(config: &<HunspellChecker as Checker>::Config) -> Result<Self> {
// TODO allow override
-let (transform_regex, allow_concatenated, allow_dashed, allow_emojis) = {
+let (
+transform_regex,
+allow_concatenated,
+allow_dashed,
+allow_emojis,
+check_footnote_references,
+) = {
let quirks = &config.quirks;
-{
-(
-quirks.transform_regex().to_vec(),
-quirks.allow_concatenated(),
-quirks.allow_dashed(),
-quirks.allow_emojis(),
-)
-}
+(
+quirks.transform_regex().to_vec(),
+quirks.allow_concatenated(),
+quirks.allow_dashed(),
+quirks.allow_emojis(),
+quirks.check_footnote_references(),
+)
};
// FIXME rename the config option
let ignorelist = config.tokenization_splitchars.clone();
@@ -257,6 +265,7 @@ impl HunspellCheckerInner {
allow_concatenated,
allow_dashed,
allow_emojis,
+check_footnote_references,
ignorelist,
})
}
@@ -299,7 +308,9 @@ impl Checker for HunspellChecker {
let mut acc = Vec::with_capacity(chunks.len());

for chunk in chunks {
-let plain = chunk.erase_cmark();
+let plain = chunk.erase_cmark(&Ignores {
+footnote_references: !self.0.check_footnote_references,
+});
log::trace!("{plain:?}");
let txt = plain.as_str();
let hunspell = &*self.hunspell.0;
7 changes: 6 additions & 1 deletion src/checker/nlprules.rs
@@ -114,7 +114,12 @@ fn check_chunk<'a>(
tokenizer: &Tokenizer,
rules: &Rules,
) -> Vec<Suggestion<'a>> {
-let plain = chunk.erase_cmark();
+// TODO We should control which parts of the markdown entities need to be
+// ignored; however, the `NlpRulesConfig`, which is the only configuration
+// we receive in the constructor, does not contain the same quirks (or in
+// fact any other similar settings) as the Hunspell one, so we cannot obtain
+// this setting and therefore fall back to the default.
+let plain = chunk.erase_cmark(&Default::default());
log::trace!("{plain:?}");
let txt = plain.as_str();

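Spelled out, the fallback used here relies on the derived `Default` for `Ignores`: nothing is ignored, so footnote references are still folded into the plain text for this checker regardless of the Hunspell quirk setting. A tiny sketch (the import path is an assumption):

    use doc_chunks::Ignores;

    fn main() {
        // `Ignores::default()` ignores nothing, so the NlpRules checker
        // behaves exactly as it did before this commit.
        let fallback = Ignores::default();
        assert!(!fallback.footnote_references);
    }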
16 changes: 15 additions & 1 deletion src/config/hunspell.rs
@@ -30,15 +30,25 @@ pub struct Quirks {
/// Treats sequences of emojis as OK.
#[serde(default = "yes")]
pub allow_emojis: bool,
+/// Check the expressions in the footnote references. By default this is
+/// turned on to remain backwards compatible, but disabling it can be
+/// particularly useful when abbreviations are used instead of numbers as
+/// footnote references. For instance, by default the fragment `hello[^xyz]`
+/// would be spellchecked as `helloxyz`, which is obviously a misspelled
+/// word; with this check turned off, the reference is skipped altogether
+/// and only the word `hello` is checked.
+#[serde(default = "yes")]
+pub check_footnote_references: bool,
}

impl Default for Quirks {
fn default() -> Self {
Self {
-transform_regex: vec![],
+transform_regex: Vec::new(),
allow_concatenation: false,
allow_dashes: false,
allow_emojis: true,
+check_footnote_references: true,
}
}
}
@@ -59,6 +69,10 @@ impl Quirks {
pub(crate) fn transform_regex(&self) -> &[WrappedRegex] {
&self.transform_regex
}

+pub(crate) fn check_footnote_references(&self) -> bool {
+self.check_footnote_references
+}
}

fn default_tokenization_splitchars() -> String {
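For completeness, a sketch of the backwards-compatible defaults set by the `Default` impl above, written as a hypothetical in-crate test (the test name is made up, and it assumes it sits next to the `Quirks` definition so the public fields shown in the diff are in scope):

    #[test]
    fn quirks_defaults_are_backwards_compatible() {
        // References are checked by default; emojis are allowed by default.
        let quirks = Quirks::default();
        assert!(quirks.check_footnote_references);
        assert!(quirks.allow_emojis);
    }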
6 changes: 3 additions & 3 deletions src/reflow/tests.rs
@@ -92,7 +92,7 @@ macro_rules! reflow_content {
let chunks = docs.get(&$content_type).expect("Contains test data. qed");
assert_eq!(dbg!(chunks).len(), 1);
let chunk = &chunks[0];
-let _plain = chunk.erase_cmark();
+let _plain = chunk.erase_cmark(&Default::default());
let suggestions = reflow(&$content_type, chunk, &CFG).expect("Reflow is working. qed");

let patches = suggestions
@@ -134,7 +134,7 @@ macro_rules! reflow_content {
let chunks = docs.get(&$content_type).expect("Contains test data. qed");
assert_eq!(dbg!(chunks).len(), 1);
let chunk = &chunks[0];
-let _plain = chunk.erase_cmark();
+let _plain = chunk.erase_cmark(&Default::default());
let suggestions = reflow(&$content_type, chunk, &CFG).expect("Reflow is working. qed");

assert_eq!(
@@ -159,7 +159,7 @@ macro_rules! reflow_content {
let chunks = docs.get(&$content_type).expect("Contains test data. qed");
assert_eq!(dbg!(chunks).len(), 1);
let chunk = &chunks[0];
-let _plain = chunk.erase_cmark();
+let _plain = chunk.erase_cmark(&Default::default());
println!("reflow content:\n {:?}", $content);
let suggestions = reflow(&$content_type, chunk, &CFG).expect("Reflow is working. qed");
let patches = suggestions
45 changes: 36 additions & 9 deletions src/tests.rs
@@ -2,7 +2,7 @@ use super::*;
use crate::checker::Checker;
use crate::util::{load_span_from, sub_char_range, sub_chars};
use crate::{chyrp_up, fluff_up};
-use doc_chunks::literalset::testhelper::gen_literal_set;
+use doc_chunks::{literalset::testhelper::gen_literal_set, Ignores};

use crate::documentation::{testcase::annotated_literals, SourceRange};
use indexmap::IndexMap;
@@ -31,7 +31,7 @@ fn parse_and_construct() {
// TODO
let chunk = &chunks[0];
assert_eq!(chunk.as_str(), TEST_RAW.to_owned());
-let plain = chunk.erase_cmark();
+let plain = chunk.erase_cmark(&Default::default());
println!("{plain:?}");

assert_eq!(TEST_PLAIN, plain.as_str());
@@ -98,7 +98,7 @@ macro_rules! end2end {
let chunks = docs.get(&origin).expect("Must contain dummy path");
assert_eq!(dbg!(chunks).len(), 1);
let chunk = &chunks[0];
-let _plain = chunk.erase_cmark();
+let _plain = chunk.erase_cmark(&Default::default());
let cfg = $cfg;
dbg!(std::any::type_name::<$checker>());
let checker = <$checker>::new(&cfg).expect("Checker construction works");
@@ -259,7 +259,7 @@
#[test]
fn issue_281() {
let dict_path = temp_dir().join(uuid::Uuid::new_v4().to_string() + ".dic");
-// Any of the two hypthens cause havoc
+// Any of the two hyphens cause havoc
const EXTRA_DICT: &str = r###"2
@@ -326,7 +326,7 @@
assert_eq!(chunks.len(), 1);
assert_eq!(RAW, chunk.as_str());

-let plain = dbg!(chunk.erase_cmark());
+let plain = dbg!(chunk.erase_cmark(&Default::default()));
assert_eq!(dbg!($plain), plain.as_str());

let mut it = suggestions.into_iter();
@@ -600,6 +600,31 @@ Ref4"#;
}
}

+#[test]
+fn check_footnote_references() {
+const SOURCE: &str = "Hello[^xyz].\n\n[^xyz]: World.";
+let origin = ContentOrigin::TestEntityCommonMark;
+
+let documentation = Documentation::load_from_str(origin.clone(), SOURCE, false, false);
+assert_eq!(documentation.len(), 1);
+
+let chunks = documentation.get(&origin).expect("Must contain dummy path");
+assert_eq!(dbg!(chunks).len(), 1);
+
+let chunk = &chunks[0];
+assert_eq!(chunk.as_str(), SOURCE);
+
+let plain = chunk.erase_cmark(&Ignores {
+footnote_references: false,
+});
+assert_eq!(plain.as_str(), "Helloxyz.\n\nWorld.");
+
+let plain = chunk.erase_cmark(&Ignores {
+footnote_references: true,
+});
+assert_eq!(plain.as_str(), "Hello.\n\nWorld.");
+}
+
#[test]
fn find_spans_emoji() {
const TEST: &str = r##"ab **🐡** xy"##;
@@ -1050,7 +1075,7 @@ fn drill_span() {
CommentVariant::CommonMark,
);

-let plain = chunk.erase_cmark();
+let plain = chunk.erase_cmark(&Default::default());
assert_eq!(plain.find_spans(0..2).len(), 1);
assert_eq!(plain.find_spans(3..4).len(), 1);
assert_eq!(plain.find_spans(5..7).len(), 1);
@@ -1100,7 +1125,8 @@
And a line, or a rule."##;
-let (reduced, mapping) = PlainOverlay::extract_plain_with_mapping(MARKDOWN);
+let (reduced, mapping) =
+PlainOverlay::extract_plain_with_mapping(MARKDOWN, &Default::default());

assert_eq!(dbg!(&reduced).as_str(), PLAIN);
assert_eq!(dbg!(&mapping).len(), 22);
@@ -1117,7 +1143,8 @@
const MARKDOWN: &str = r#" Some __underlined__ **bold** text."#;
const PLAIN: &str = r#"Some underlined bold text."#;

-let (reduced, mapping) = PlainOverlay::extract_plain_with_mapping(MARKDOWN);
+let (reduced, mapping) =
+PlainOverlay::extract_plain_with_mapping(MARKDOWN, &Default::default());

assert_eq!(dbg!(&reduced).as_str(), PLAIN);
assert_eq!(dbg!(&mapping).len(), 5);
@@ -1158,7 +1185,7 @@ fn range_test() {
}

fn cmark_reduction_test(input: &'static str, expected: &'static str, expected_mapping_len: usize) {
-let (plain, mapping) = PlainOverlay::extract_plain_with_mapping(input);
+let (plain, mapping) = PlainOverlay::extract_plain_with_mapping(input, &Default::default());
assert_eq!(dbg!(&plain).as_str(), expected);
assert_eq!(dbg!(&mapping).len(), expected_mapping_len);
for (reduced_range, markdown_range) in mapping.into_iter() {
