Skip to content

Commit

Permalink
Merge pull request #13 from G-Research/12-correct-index-non-ascii
Browse files Browse the repository at this point in the history
Correct indexes for non-ASCII strings
  • Loading branch information
itamarst authored Jun 28, 2021
2 parents d476e6a + 5068335 commit 8a33ddf
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 3 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Changelog

## 0.10.0

* Fixed bug where `find_matches_as_indexes()` didn't give correct offsets for
non-ASCII strings
([#12](https://github.com/G-Research/ahocorasick_rs/issues/12)). Thanks to
@necrosovereign for reporting and @BurntSushi for suggesting the fix.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ flake8
black
pyahocorasick
maturin
hypothesis
21 changes: 20 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,30 @@ impl PyAhoCorasick {
overlapping: bool,
) -> PyResult<Vec<(usize, usize, usize)>> {
    self_.check_overlapping(overlapping)?;
    // The underlying aho-corasick crate reports match offsets as UTF-8 byte
    // indexes, but Python callers slice `str` objects by code point.  Build a
    // lookup table from byte offset -> code point offset so the returned
    // (start, end) pair can be used directly as Python string indexes.
    // Entries that do not fall on a char boundary stay usize::MAX; a valid
    // match can never start or end mid-character, so they are never read.
    let mut byte_to_code_point = vec![usize::MAX; haystack.len() + 1];
    let mut max_codepoint = 0;
    for (codepoint_off, (byte_off, _)) in haystack.char_indices().enumerate() {
        byte_to_code_point[byte_off] = codepoint_off;
        max_codepoint = codepoint_off;
    }
    // End index is exclusive (e.g. 0:3 is first 3 characters), so handle
    // the case where the pattern is at the end of the string.
    if !haystack.is_empty() {
        byte_to_code_point[haystack.len()] = max_codepoint + 1;
    }
    let py = self_.py();
    let matches = self_.get_matches(py, haystack, overlapping);
    // Translate each match's byte offsets through the table; the pattern
    // index is passed through unchanged.
    Ok(matches
        .into_iter()
        .map(|m| {
            (
                m.pattern(),
                byte_to_code_point[m.start()],
                byte_to_code_point[m.end()],
            )
        })
        .collect())
}

Expand Down
37 changes: 36 additions & 1 deletion tests/test_ac.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

import pytest

from hypothesis import strategies as st
from hypothesis import given, assume

from ahocorasick_rs import (
AhoCorasick,
MATCHKIND_STANDARD,
Expand All @@ -12,7 +15,7 @@

def test_basic_matching():
"""
find_matches_as_indexes() and find_matches_as_indexes() return matching
find_matches_as_indexes() and find_matches_as_strings() return matching
patterns in the given string.
"""
haystack = "hello, world, hello again"
Expand All @@ -30,6 +33,38 @@ def test_basic_matching():
assert ac.find_matches_as_strings(haystack) == expected


def test_unicode():
    """
    Non-ASCII unicode patterns still give correct results for
    find_matches_as_indexes().
    """
    haystack = "hello, world ☃fishá l🤦l"
    patterns = ["d ☃f", "há", "l🤦l"]
    ac = AhoCorasick(patterns)

    matches = ac.find_matches_as_indexes(haystack)
    # Every pattern should be found, in order of appearance...
    found_patterns = [patterns[idx] for (idx, _, _) in matches]
    assert found_patterns == ["d ☃f", "há", "l🤦l"]
    # ...and the reported offsets must be code-point indexes, i.e. slicing
    # the Python string with them recovers each pattern exactly.
    found_slices = [haystack[start:end] for (_, start, end) in matches]
    assert found_slices == ["d ☃f", "há", "l🤦l"]


@given(st.text(), st.text(min_size=1), st.text())
def test_unicode_extensive(prefix, pattern, suffix):
    """
    Non-ASCII unicode patterns still give correct results for
    find_matches_as_indexes(), with property-testing.
    """
    # Keep the pattern's only occurrence at the seam we construct, so exactly
    # one match is expected.
    assume(pattern not in prefix)
    assume(pattern not in suffix)
    haystack = prefix + pattern + suffix
    ac = AhoCorasick([pattern])

    matches = ac.find_matches_as_indexes(haystack)
    # A single match of pattern 0, whose offsets slice the haystack back to
    # the pattern itself.
    assert [idx for (idx, _, _) in matches] == [0]
    assert [haystack[start:end] for (_, start, end) in matches] == [pattern]
    # The string-based API agrees.
    assert ac.find_matches_as_strings(haystack) == [pattern]


def test_matchkind():
"""
Different matchkinds give different results.
Expand Down

0 comments on commit 8a33ddf

Please sign in to comment.