diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..3e562dd
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,8 @@
+# Changelog
+
+## 0.10.0
+
+* Fixed bug where `find_matches_as_indexes()` didn't give correct offsets for
+  non-ASCII strings
+  ([#12](https://github.com/G-Research/ahocorasick_rs/issues/12)). Thanks to
+  @necrosovereign for reporting and @BurntSushi for suggesting the fix.
diff --git a/Cargo.lock b/Cargo.lock
index b2cfeab..a3b87e7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -11,7 +11,7 @@ dependencies = [

 [[package]]
 name = "ahocorasick_rs"
-version = "0.1.0"
+version = "0.9.1"
 dependencies = [
  "aho-corasick",
  "pyo3",
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 1006155..9291735 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -4,3 +4,4 @@ flake8
 black
 pyahocorasick
 maturin
+hypothesis
diff --git a/src/lib.rs b/src/lib.rs
index f8955f1..254da33 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -70,11 +70,30 @@ impl PyAhoCorasick {
         overlapping: bool,
     ) -> PyResult<Vec<(usize, usize, usize)>> {
         self_.check_overlapping(overlapping)?;
+        // Map UTF-8 byte index to Unicode code point index; the latter is what
+        // Python users expect.
+        let mut byte_to_code_point = vec![usize::MAX; haystack.len() + 1];
+        let mut max_codepoint = 0;
+        for (codepoint_off, (byte_off, _)) in haystack.char_indices().enumerate() {
+            byte_to_code_point[byte_off] = codepoint_off;
+            max_codepoint = codepoint_off;
+        }
+        // End index is exclusive (e.g. 0:3 is first 3 characters), so handle
+        // the case where pattern is at end of string.
+        if haystack.len() > 0 {
+            byte_to_code_point[haystack.len()] = max_codepoint + 1;
+        }
         let py = self_.py();
         let matches = self_.get_matches(py, haystack, overlapping);
         Ok(matches
             .into_iter()
-            .map(|m| (m.pattern(), m.start(), m.end()))
+            .map(|m| {
+                (
+                    m.pattern(),
+                    byte_to_code_point[m.start()],
+                    byte_to_code_point[m.end()],
+                )
+            })
             .collect())
     }

diff --git a/tests/test_ac.py b/tests/test_ac.py
index 6ee3bc9..ceb5e5a 100644
--- a/tests/test_ac.py
+++ b/tests/test_ac.py
@@ -2,6 +2,9 @@

 import pytest

+from hypothesis import strategies as st
+from hypothesis import given, assume
+
 from ahocorasick_rs import (
     AhoCorasick,
     MATCHKIND_STANDARD,
@@ -12,7 +15,7 @@

 def test_basic_matching():
     """
-    find_matches_as_indexes() and find_matches_as_indexes() return matching
+    find_matches_as_indexes() and find_matches_as_strings() return matching
     patterns in the given string.
     """
     haystack = "hello, world, hello again"
@@ -30,6 +33,38 @@ def test_basic_matching():
     assert ac.find_matches_as_strings(haystack) == expected


+def test_unicode():
+    """
+    Non-ASCII unicode patterns still give correct results for
+    find_matches_as_indexes().
+    """
+    haystack = "hello, world ☃fishá l🤦l"
+    patterns = ["d ☃f", "há", "l🤦l"]
+    ac = AhoCorasick(patterns)
+    index_matches = ac.find_matches_as_indexes(haystack)
+    expected = ["d ☃f", "há", "l🤦l"]
+    assert [patterns[i] for (i, _, _) in index_matches] == expected
+    assert [haystack[s:e] for (_, s, e) in index_matches] == expected
+
+
+@given(st.text(), st.text(min_size=1), st.text())
+def test_unicode_extensive(prefix, pattern, suffix):
+    """
+    Non-ASCII unicode patterns still give correct results for
+    find_matches_as_indexes(), with property-testing.
+    """
+    assume(pattern not in prefix)
+    assume(pattern not in suffix)
+    haystack = prefix + pattern + suffix
+    ac = AhoCorasick([pattern])
+
+    index_matches = ac.find_matches_as_indexes(haystack)
+    expected = [pattern]
+    assert [i for (i, _, _) in index_matches] == [0]
+    assert [haystack[s:e] for (_, s, e) in index_matches] == expected
+    assert ac.find_matches_as_strings(haystack) == [pattern]
+
+
 def test_matchkind():
     """
     Different matchkinds give different results.