diff --git a/fuzzywuzzy/StringMatcher.py b/fuzzywuzzy/StringMatcher.py
index d35e075f..c24c73f5 100644
--- a/fuzzywuzzy/StringMatcher.py
+++ b/fuzzywuzzy/StringMatcher.py
@@ -64,6 +64,26 @@ def ratio(self):
             self._ratio = ratio(self._str1, self._str2)
         return self._ratio
 
+    def setratio(self):
+        """Return Levenshtein's setratio() for the current string pair."""
+        # Cache keyed on the inputs: a bare hasattr() check would keep
+        # returning the first result even after _str1/_str2 change.
+        key = (self._str1, self._str2)
+        if getattr(self, '_setratio_key', None) != key:
+            self._setratio = setratio(self._str1, self._str2)
+            self._setratio_key = key
+        return self._setratio
+
+    def seqratio(self):
+        """Return Levenshtein's seqratio() for the current string pair."""
+        # Cache keyed on the inputs: a bare hasattr() check would keep
+        # returning the first result even after _str1/_str2 change.
+        key = (self._str1, self._str2)
+        if getattr(self, '_seqratio_key', None) != key:
+            self._seqratio = seqratio(self._str1, self._str2)
+            self._seqratio_key = key
+        return self._seqratio
+
     def quick_ratio(self):
         # This is usually quick enough :o)
         if not self._ratio:
diff --git a/fuzzywuzzy/fuzz.py b/fuzzywuzzy/fuzz.py
index 27f80c9f..5b69e146 100644
--- a/fuzzywuzzy/fuzz.py
+++ b/fuzzywuzzy/fuzz.py
@@ -8,7 +8,8 @@
     from .StringMatcher import StringMatcher as SequenceMatcher
 except ImportError:
     if platform.python_implementation() != "PyPy":
-        warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
+        warnings.warn(
+            'Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
     from difflib import SequenceMatcher
 
 from . import utils
@@ -28,6 +29,36 @@ def ratio(s1, s2):
     return utils.intr(100 * m.ratio())
 
 
+@utils.check_for_none
+@utils.check_for_equivalence
+@utils.check_empty_string
+def setratio(s1, s2):
+    """Return utils.intr(100 * m.setratio()) for a matcher m over s1, s2.
+
+    Only the python-Levenshtein backed matcher provides setratio(); with
+    the difflib fallback this raises AttributeError.
+    """
+    s1, s2 = utils.make_type_consistent(s1, s2)
+
+    m = SequenceMatcher(None, s1, s2)
+    return utils.intr(100 * m.setratio())
+
+
+@utils.check_for_none
+@utils.check_for_equivalence
+@utils.check_empty_string
+def seqratio(s1, s2):
+    """Return utils.intr(100 * m.seqratio()) for a matcher m over s1, s2.
+
+    Only the python-Levenshtein backed matcher provides seqratio(); with
+    the difflib fallback this raises AttributeError.
+    """
+    s1, s2 = utils.make_type_consistent(s1, s2)
+
+    m = SequenceMatcher(None, s1, s2)
+    return utils.intr(100 * m.seqratio())
+
+
 @utils.check_for_none
 @utils.check_for_equivalence
 @utils.check_empty_string
@@ -124,8 +155,10 @@ def _token_set(s1, s2, partial=True, force_ascii=True, full_process=True):
     if not full_process and s1 == s2:
         return 100
 
-    p1 = utils.full_process(s1, force_ascii=force_ascii) if full_process else s1
-    p2 = utils.full_process(s2, force_ascii=force_ascii) if full_process else s2
+    p1 = utils.full_process(
+        s1, force_ascii=force_ascii) if full_process else s1
+    p2 = utils.full_process(
+        s2, force_ascii=force_ascii) if full_process else s2
 
     if not utils.validate_string(p1):
         return 0
@@ -155,7 +188,12 @@ def _token_set(s1, s2, partial=True, force_ascii=True, full_process=True):
     if partial:
         ratio_func = partial_ratio
     else:
-        ratio_func = ratio
+        # difflib's SequenceMatcher (the fallback when python-Levenshtein
+        # is missing) has no setratio(); keep the plain ratio there.
+        # NOTE(review): setratio() is handed two plain strings here, not
+        # token lists - confirm that is the intended comparison.
+        ratio_func = (setratio if hasattr(SequenceMatcher, 'setratio')
+                      else ratio)
 
     pairwise = [
         ratio_func(sorted_sect, combined_1to2),
diff --git a/test_fuzzywuzzy.py b/test_fuzzywuzzy.py
index 58617b68..7bf3e422 100644
--- a/test_fuzzywuzzy.py
+++ b/test_fuzzywuzzy.py
@@ -140,5 +140,6 @@ def testTokenSetRatio(self):
         self.assertEqual(fuzz.token_set_ratio(self.s9, self.s9a, full_process=True), 100)
         self.assertEqual(fuzz.token_set_ratio(self.s9, self.s9a, full_process=False), 100)
         self.assertEqual(fuzz.token_set_ratio(self.s10, self.s10a, full_process=False), 50)
+
     def testPartialTokenSetRatio(self):
         self.assertEqual(fuzz.partial_token_set_ratio(self.s4, self.s7), 100)