diff --git a/fuzzywuzzy/StringMatcher.py b/fuzzywuzzy/StringMatcher.py
index d35e075f..dd1312c0 100644
--- a/fuzzywuzzy/StringMatcher.py
+++ b/fuzzywuzzy/StringMatcher.py
@@ -18,6 +18,7 @@ class StringMatcher:
     def _reset_cache(self):
         self._ratio = self._distance = None
         self._opcodes = self._editops = self._matching_blocks = None
+        self._jaro_ratio = self._prefix_length = None
 
     def __init__(self, isjunk=None, seq1='', seq2=''):
         if isjunk:
@@ -78,3 +79,34 @@ def distance(self):
         if not self._distance:
             self._distance = distance(self._str1, self._str2)
         return self._distance
+
+    def jaro_winkler_ratio(self):
+        """Return the Jaro-Winkler similarity of the two sequences.
+
+        The score is computed once and cached.  If a common prefix length
+        has been set with set_prefix_length(), its inverse is passed to
+        jaro_winkler() as the prefix weight; otherwise jaro_winkler() is
+        called without a weight and applies its own default.
+        """
+        # Test against None so a cached score of 0.0 is not recomputed.
+        if self._jaro_ratio is None:
+            if self._prefix_length:
+                prefix_weight = 1.0 / self._prefix_length
+                self._jaro_ratio = jaro_winkler(
+                    self._str1, self._str2, prefix_weight)
+            else:
+                self._jaro_ratio = jaro_winkler(self._str1, self._str2)
+        return self._jaro_ratio
+
+    def set_prefix_length(self, common_prefix_length):
+        """Set the common prefix length used by jaro_winkler_ratio().
+
+        Only a positive int is stored; any other value (None, 0, a
+        negative number, a float, a string, ...) clears the setting.
+        """
+        if isinstance(common_prefix_length, int) and common_prefix_length > 0:
+            self._prefix_length = common_prefix_length
+        else:
+            self._prefix_length = None
+        # The prefix weight changes the score, so a cached one is stale.
+        self._jaro_ratio = None
diff --git a/fuzzywuzzy/fuzz.py b/fuzzywuzzy/fuzz.py
index 27f80c9f..c606f964 100644
--- a/fuzzywuzzy/fuzz.py
+++ b/fuzzywuzzy/fuzz.py
@@ -28,6 +28,27 @@ def ratio(s1, s2):
     return utils.intr(100 * m.ratio())
 
 
+@utils.check_for_none
+@utils.check_for_equivalence
+@utils.check_empty_string
+def jaro_winkler_ratio(s1, s2, common_prefix_length=None):
+    """Return the Jaro-Winkler similarity of s1 and s2, scaled by 100.
+
+    The matcher's score is multiplied by 100 and passed through
+    utils.intr().  common_prefix_length is forwarded to the matcher's
+    set_prefix_length().
+
+    NOTE(review): this needs a matcher that provides set_prefix_length()
+    and jaro_winkler_ratio(); confirm SequenceMatcher is never bound to a
+    class without them in this module.
+    """
+    s1, s2 = utils.make_type_consistent(s1, s2)
+
+    m = SequenceMatcher(None, s1, s2)
+    m.set_prefix_length(common_prefix_length)
+    return utils.intr(100 * m.jaro_winkler_ratio())
+
+
 @utils.check_for_none
 @utils.check_for_equivalence
 @utils.check_empty_string
diff --git a/test_fuzzywuzzy.py b/test_fuzzywuzzy.py
index 58617b68..a78b2922 100644
--- a/test_fuzzywuzzy.py
+++ b/test_fuzzywuzzy.py
@@ -91,6 +91,17 @@ def setUp(self):
         self.s9a = '{a'
         self.s10 = 'a{'
         self.s10a = '{b'
+        # Jaro tests
+        self.s11 = 'initial 10test'
+        self.s11a = 'initial 10 test'
+        self.s12 = 'five test'
+        self.s12a = 'five case'
+        self.s13 = 'seven t'
+        self.s13a = 'seven test'
+        self.s14 = 'Thorkel'
+        self.s14a = 'Thorgier'
+        self.s15 = 'D'
+        self.s15a = 'Dinsdale'
 
         self.cirque_strings = [
             "cirque du soleil - zarkana - las vegas",
@@ -123,6 +134,33 @@ def testCaseInsensitive(self):
     def testPartialRatio(self):
         self.assertEqual(fuzz.partial_ratio(self.s1, self.s3), 100)
 
+    def testJaroWinkler(self):
+        self.assertEqual(fuzz.jaro_winkler_ratio(self.s1, self.s1a), 100)
+        self.assertEqual(fuzz.jaro_winkler_ratio(self.s8, self.s8a), 100)
+        self.assertEqual(fuzz.jaro_winkler_ratio(self.s9, self.s9a), 100)
+        self.assertEqual(fuzz.jaro_winkler_ratio(self.s11, self.s11a), 100)
+        self.assertEqual(fuzz.jaro_winkler_ratio(self.s12, self.s12a, common_prefix_length=5), 100)
+        self.assertEqual(fuzz.jaro_winkler_ratio(self.s12, self.s12a), 90)
+        self.assertEqual(fuzz.jaro_winkler_ratio(self.s13, self.s13a, common_prefix_length=7), 100)
+        self.assertEqual(fuzz.jaro_winkler_ratio(self.s14, self.s14a), 87)
+        self.assertEqual(fuzz.jaro_winkler_ratio(self.s14, self.s14a, common_prefix_length=4), 100)
+        self.assertEqual(fuzz.jaro_winkler_ratio(self.s14, self.s14a, common_prefix_length=0), 87)
+        self.assertEqual(fuzz.jaro_winkler_ratio(self.s14, self.s14a, common_prefix_length=4.2), 87)
+        self.assertEqual(fuzz.jaro_winkler_ratio(self.s15, self.s15a), 74)
+        self.assertEqual(fuzz.jaro_winkler_ratio(self.s15, self.s15a, common_prefix_length=1), 100)
+        self.assertEqual(fuzz.jaro_winkler_ratio(self.s15, self.s15a, common_prefix_length=-1), 74)
+        self.assertEqual(fuzz.jaro_winkler_ratio(self.s15, self.s15a, common_prefix_length="1"), 74)
+        self.assertEqual(fuzz.jaro_winkler_ratio(self.s15, self.s15a, common_prefix_length=1000), 71)
+
+    def testJaroWinklerPrefixLengthInvalidatesCache(self):
+        # Changing the prefix length after a score has been computed must
+        # not return the score cached for the previous weight.
+        from fuzzywuzzy.StringMatcher import StringMatcher
+        m = StringMatcher(None, self.s15, self.s15a)
+        default_score = m.jaro_winkler_ratio()
+        m.set_prefix_length(1)
+        self.assertNotEqual(m.jaro_winkler_ratio(), default_score)
+
     def testTokenSortRatio(self):
         self.assertEqual(fuzz.token_sort_ratio(self.s1, self.s1a), 100)
 