From fd604ce452d811115d5e509d5c7fc63c3e0b9e9c Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Mon, 28 Jun 2021 13:15:25 -0400 Subject: [PATCH 1/2] Simple test and corresponding bug fix. --- CHANGELOG.md | 8 ++++++++ Cargo.lock | 2 +- src/lib.rs | 19 ++++++++++++++++++- tests/test_ac.py | 16 +++++++++++++++- 4 files changed, 42 insertions(+), 3 deletions(-) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..3e562dd --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,8 @@ +# Changelog + +## 0.10.0 + +* Fixed bug where `find_matches_as_indexes()` didn't give correct offsets for + non-ASCII strings + ([#12](https://github.com/G-Research/ahocorasick_rs/issues/12)). Thanks to + @necrosovereign for reporting and @BurntSushi for suggesting fix. diff --git a/Cargo.lock b/Cargo.lock index b2cfeab..a3b87e7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11,7 +11,7 @@ dependencies = [ [[package]] name = "ahocorasick_rs" -version = "0.1.0" +version = "0.9.1" dependencies = [ "aho-corasick", "pyo3", diff --git a/src/lib.rs b/src/lib.rs index f8955f1..e5e94f0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -70,11 +70,28 @@ impl PyAhoCorasick { overlapping: bool, ) -> PyResult> { self_.check_overlapping(overlapping)?; + // Map UTF-8 byte index to Unicode code point index; the latter is what + // Python users expect. + let mut byte_to_code_point = vec![usize::MAX; haystack.len() + 1]; + for (codepoint_off, (byte_off, _)) in haystack.char_indices().enumerate() { + byte_to_code_point[byte_off] = codepoint_off; + } + // End index is exclusive (e.g. 0:3 is first 3 characters), so handle + // the case where pattern is at end of string. + if haystack.len() > 0 { + byte_to_code_point[haystack.len()] = byte_to_code_point[haystack.len() - 1] + 1; + } let py = self_.py(); let matches = self_.get_matches(py, haystack, overlapping); Ok(matches .into_iter() - .map(|m| (m.pattern(), m.start(), m.end())) + .map(|m| { + ( + m.pattern(), + byte_to_code_point[m.start()], + byte_to_code_point[m.end()], + ) + }) .collect()) } diff --git a/tests/test_ac.py b/tests/test_ac.py index 6ee3bc9..588caba 100644 --- a/tests/test_ac.py +++ b/tests/test_ac.py @@ -12,7 +12,7 @@ def test_basic_matching(): """ - find_matches_as_indexes() and find_matches_as_indexes() return matching + find_matches_as_indexes() and find_matches_as_strings() return matching patterns in the given string. """ haystack = "hello, world, hello again" @@ -30,6 +30,20 @@ def test_basic_matching(): assert ac.find_matches_as_strings(haystack) == expected +def test_unicode(): + """ + Non-ASCII unicode patterns still give correct results for + find_matches_as_indexes(). + """ + haystack = "hello, world ☃fishá l🤦l" + patterns = ["d ☃f", "há", "l🤦l"] + ac = AhoCorasick(patterns) + index_matches = ac.find_matches_as_indexes(haystack) + expected = ["d ☃f", "há", "l🤦l"] + assert [patterns[i] for (i, _, _) in index_matches] == expected + assert [haystack[s:e] for (_, s, e) in index_matches] == expected + + def test_matchkind(): """ Different matchkinds give different results. From 5068335d15d174a3484f61e4b78183e8b4c43521 Mon Sep 17 00:00:00 2001 From: Itamar Turner-Trauring Date: Mon, 28 Jun 2021 13:33:01 -0400 Subject: [PATCH 2/2] Fix mapping logic, using Hypothesis to find edge case bugs. --- requirements-dev.txt | 1 + src/lib.rs | 4 +++- tests/test_ac.py | 21 +++++++++++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index 1006155..9291735 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -4,3 +4,4 @@ flake8 black pyahocorasick maturin +hypothesis diff --git a/src/lib.rs b/src/lib.rs index e5e94f0..254da33 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -73,13 +73,15 @@ impl PyAhoCorasick { // Map UTF-8 byte index to Unicode code point index; the latter is what // Python users expect. let mut byte_to_code_point = vec![usize::MAX; haystack.len() + 1]; + let mut max_codepoint = 0; for (codepoint_off, (byte_off, _)) in haystack.char_indices().enumerate() { byte_to_code_point[byte_off] = codepoint_off; + max_codepoint = codepoint_off; } // End index is exclusive (e.g. 0:3 is first 3 characters), so handle // the case where pattern is at end of string. if haystack.len() > 0 { - byte_to_code_point[haystack.len()] = byte_to_code_point[haystack.len() - 1] + 1; + byte_to_code_point[haystack.len()] = max_codepoint + 1; } let py = self_.py(); let matches = self_.get_matches(py, haystack, overlapping); diff --git a/tests/test_ac.py b/tests/test_ac.py index 588caba..ceb5e5a 100644 --- a/tests/test_ac.py +++ b/tests/test_ac.py @@ -2,6 +2,9 @@ import pytest +from hypothesis import strategies as st +from hypothesis import given, assume + from ahocorasick_rs import ( AhoCorasick, MATCHKIND_STANDARD, @@ -44,6 +47,24 @@ def test_unicode(): assert [haystack[s:e] for (_, s, e) in index_matches] == expected +@given(st.text(), st.text(min_size=1), st.text()) +def test_unicode_extensive(prefix, pattern, suffix): + """ + Non-ASCII unicode patterns still give correct results for + find_matches_as_indexes(), with property-testing. + """ + assume(pattern not in prefix) + assume(pattern not in suffix) + haystack = prefix + pattern + suffix + ac = AhoCorasick([pattern]) + + index_matches = ac.find_matches_as_indexes(haystack) + expected = [pattern] + assert [i for (i, _, _) in index_matches] == [0] + assert [haystack[s:e] for (_, s, e) in index_matches] == expected + assert ac.find_matches_as_strings(haystack) == [pattern] + + def test_matchkind(): """ Different matchkinds give different results.