Skip to content

Commit

Permalink
Merge pull request #13 from G-Research/12-correct-index-non-ascii
Browse files Browse the repository at this point in the history
Correct indexes for non-ASCII strings
  • Loading branch information
itamarst authored Jun 28, 2021
2 parents d476e6a + 5068335 commit 8a33ddf
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 3 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Changelog

## 0.10.0

* Fixed bug where `find_matches_as_indexes()` didn't give correct offsets for
non-ASCII strings
([#12](https://github.com/G-Research/ahocorasick_rs/issues/12)). Thanks to
@necrosovereign for reporting and @BurntSushi for suggesting the fix.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ flake8
black
pyahocorasick
maturin
hypothesis
21 changes: 20 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,30 @@ impl PyAhoCorasick {
overlapping: bool,
) -> PyResult<Vec<(usize, usize, usize)>> {
    self_.check_overlapping(overlapping)?;
    // The underlying aho-corasick crate reports match offsets as UTF-8 byte
    // indexes, but Python callers slice `str` objects by code point.  Build a
    // lookup table from byte offset -> code point offset so the returned
    // (start, end) pair can be used directly as Python string indexes.
    // Entries that do not fall on a char boundary stay usize::MAX; a valid
    // match can never start or end mid-character, so they are never read.
    let mut byte_to_code_point = vec![usize::MAX; haystack.len() + 1];
    let mut max_codepoint = 0;
    for (codepoint_off, (byte_off, _)) in haystack.char_indices().enumerate() {
        byte_to_code_point[byte_off] = codepoint_off;
        max_codepoint = codepoint_off;
    }
    // End index is exclusive (e.g. 0:3 is first 3 characters), so handle
    // the case where the pattern is at the end of the string.
    if !haystack.is_empty() {
        byte_to_code_point[haystack.len()] = max_codepoint + 1;
    }
    let py = self_.py();
    let matches = self_.get_matches(py, haystack, overlapping);
    // Translate each match's byte offsets through the table; the pattern
    // index is passed through unchanged.
    Ok(matches
        .into_iter()
        .map(|m| {
            (
                m.pattern(),
                byte_to_code_point[m.start()],
                byte_to_code_point[m.end()],
            )
        })
        .collect())
}

Expand Down
37 changes: 36 additions & 1 deletion tests/test_ac.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

import pytest

from hypothesis import strategies as st
from hypothesis import given, assume

from ahocorasick_rs import (
AhoCorasick,
MATCHKIND_STANDARD,
Expand All @@ -12,7 +15,7 @@

def test_basic_matching():
"""
find_matches_as_indexes() and find_matches_as_indexes() return matching
find_matches_as_indexes() and find_matches_as_strings() return matching
patterns in the given string.
"""
haystack = "hello, world, hello again"
Expand All @@ -30,6 +33,38 @@ def test_basic_matching():
assert ac.find_matches_as_strings(haystack) == expected


def test_unicode():
    """
    Non-ASCII unicode patterns still give correct results for
    find_matches_as_indexes().
    """
    haystack = "hello, world ☃fishá l🤦l"
    patterns = ["d ☃f", "há", "l🤦l"]
    ac = AhoCorasick(patterns)

    matches = ac.find_matches_as_indexes(haystack)
    # Every pattern should be found, in order of appearance...
    found_patterns = [patterns[idx] for (idx, _, _) in matches]
    assert found_patterns == ["d ☃f", "há", "l🤦l"]
    # ...and the reported offsets must be code-point indexes, i.e. slicing
    # the Python string with them recovers each pattern exactly.
    found_slices = [haystack[start:end] for (_, start, end) in matches]
    assert found_slices == ["d ☃f", "há", "l🤦l"]


@given(st.text(), st.text(min_size=1), st.text())
def test_unicode_extensive(prefix, pattern, suffix):
    """
    Non-ASCII unicode patterns still give correct results for
    find_matches_as_indexes(), with property-testing.
    """
    # Keep the pattern's only occurrence at the seam we construct, so exactly
    # one match is expected.
    assume(pattern not in prefix)
    assume(pattern not in suffix)
    haystack = prefix + pattern + suffix
    ac = AhoCorasick([pattern])

    matches = ac.find_matches_as_indexes(haystack)
    # A single match of pattern 0, whose offsets slice the haystack back to
    # the pattern itself.
    assert [idx for (idx, _, _) in matches] == [0]
    assert [haystack[start:end] for (_, start, end) in matches] == [pattern]
    # The string-based API agrees.
    assert ac.find_matches_as_strings(haystack) == [pattern]


def test_matchkind():
"""
Different matchkinds give different results.
Expand Down

0 comments on commit 8a33ddf

Please sign in to comment.