Skip to content
This repository has been archived by the owner on Aug 26, 2024. It is now read-only.

_set_token_ratio now keeps tokenization. #300

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion fuzzywuzzy/StringMatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
License available here: https://github.com/miohtama/python-Levenshtein/blob/master/COPYING
"""

from Levenshtein import *
from Levenshtein._levenshtein import *
from warnings import warn


Expand Down Expand Up @@ -64,6 +64,16 @@ def ratio(self):
self._ratio = ratio(self._str1, self._str2)
return self._ratio

def setratio(self):
    """Return the set ratio of the two stored strings, computing it once.

    The value is produced by the module-level ``setratio`` function
    (presumably the one supplied by the Levenshtein star-import at the
    top of this module) applied to ``self._str1`` and ``self._str2``,
    and is memoised on the instance as ``self._setratio``.
    """
    # Guard clause: a previous call already stored the result.
    if hasattr(self, '_setratio'):
        return self._setratio

    # NOTE(review): the cached value is keyed only on attribute existence;
    # confirm that whatever resets the other cached ratios when the
    # sequences change also clears ``_setratio``.
    self._setratio = setratio(self._str1, self._str2)
    return self._setratio

def seqratio(self):
    """Return the sequence ratio of the two stored strings, computing it once.

    The value is produced by the module-level ``seqratio`` function
    (presumably the one supplied by the Levenshtein star-import at the
    top of this module) applied to ``self._str1`` and ``self._str2``,
    and is memoised on the instance as ``self._seqratio``.
    """
    # Guard clause: a previous call already stored the result.
    if hasattr(self, '_seqratio'):
        return self._seqratio

    # NOTE(review): the cached value is keyed only on attribute existence;
    # confirm that whatever resets the other cached ratios when the
    # sequences change also clears ``_seqratio``.
    self._seqratio = seqratio(self._str1, self._str2)
    return self._seqratio

def quick_ratio(self):
# This is usually quick enough :o)
if not self._ratio:
Expand Down
48 changes: 38 additions & 10 deletions fuzzywuzzy/fuzz.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
from .StringMatcher import StringMatcher as SequenceMatcher
except ImportError:
if platform.python_implementation() != "PyPy":
warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
warnings.warn(
'Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
from difflib import SequenceMatcher

from . import utils
Expand All @@ -28,6 +29,26 @@ def ratio(s1, s2):
return utils.intr(100 * m.ratio())


@utils.check_for_none
@utils.check_for_equivalence
@utils.check_empty_string
def setratio(s1, s2):
    """Return the matcher's set ratio for ``s1`` and ``s2``, scaled to 0-100.

    Both inputs are first passed through ``utils.make_type_consistent``;
    the matcher's ``setratio()`` result is multiplied by 100 and handed
    to ``utils.intr``.
    """
    # NOTE(review): when the import fallback at the top of this module
    # selects difflib.SequenceMatcher, the matcher has no ``setratio``
    # method and this call raises AttributeError - confirm that the
    # Levenshtein-backed matcher is required here.
    left, right = utils.make_type_consistent(s1, s2)
    matcher = SequenceMatcher(None, left, right)
    score = matcher.setratio()
    return utils.intr(100 * score)


@utils.check_for_none
@utils.check_for_equivalence
@utils.check_empty_string
def seqratio(s1, s2):
    """Return the matcher's sequence ratio for ``s1`` and ``s2``, scaled to 0-100.

    Both inputs are first passed through ``utils.make_type_consistent``;
    the matcher's ``seqratio()`` result is multiplied by 100 and handed
    to ``utils.intr``.
    """
    # NOTE(review): when the import fallback at the top of this module
    # selects difflib.SequenceMatcher, the matcher has no ``seqratio``
    # method and this call raises AttributeError - confirm that the
    # Levenshtein-backed matcher is required here.
    left, right = utils.make_type_consistent(s1, s2)
    matcher = SequenceMatcher(None, left, right)
    score = matcher.seqratio()
    return utils.intr(100 * score)


@utils.check_for_none
@utils.check_for_equivalence
@utils.check_empty_string
Expand Down Expand Up @@ -124,8 +145,10 @@ def _token_set(s1, s2, partial=True, force_ascii=True, full_process=True):
if not full_process and s1 == s2:
return 100

p1 = utils.full_process(s1, force_ascii=force_ascii) if full_process else s1
p2 = utils.full_process(s2, force_ascii=force_ascii) if full_process else s2
p1 = utils.full_process(
s1, force_ascii=force_ascii) if full_process else s1
p2 = utils.full_process(
s2, force_ascii=force_ascii) if full_process else s2

if not utils.validate_string(p1):
return 0
Expand All @@ -140,23 +163,28 @@ def _token_set(s1, s2, partial=True, force_ascii=True, full_process=True):
diff1to2 = tokens1.difference(tokens2)
diff2to1 = tokens2.difference(tokens1)

sorted_sect = " ".join(sorted(intersection))
sorted_1to2 = " ".join(sorted(diff1to2))
sorted_2to1 = " ".join(sorted(diff2to1))
delimiter = "+++"
sorted_sect = delimiter.join(sorted(intersection))
sorted_1to2 = delimiter.join(sorted(diff1to2))
sorted_2to1 = delimiter.join(sorted(diff2to1))

combined_1to2 = sorted_sect + " " + sorted_1to2
combined_2to1 = sorted_sect + " " + sorted_2to1
combined_1to2 = sorted_sect + delimiter + sorted_1to2
combined_2to1 = sorted_sect + delimiter + sorted_2to1

# strip
sorted_sect = sorted_sect.strip()
combined_1to2 = combined_1to2.strip()
combined_2to1 = combined_2to1.strip()

# replace
sorted_sect = sorted_sect.replace(delimiter, " ")
combined_1to2 = combined_1to2.replace(delimiter, " ")
combined_2to1 = combined_2to1.replace(delimiter, " ")

if partial:
ratio_func = partial_ratio
else:
ratio_func = ratio

ratio_func = setratio
pairwise = [
ratio_func(sorted_sect, combined_1to2),
ratio_func(sorted_sect, combined_2to1),
Expand Down
1 change: 1 addition & 0 deletions test_fuzzywuzzy.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ def testTokenSetRatio(self):
self.assertEqual(fuzz.token_set_ratio(self.s9, self.s9a, full_process=True), 100)
self.assertEqual(fuzz.token_set_ratio(self.s9, self.s9a, full_process=False), 100)
self.assertEqual(fuzz.token_set_ratio(self.s10, self.s10a, full_process=False), 50)


def testPartialTokenSetRatio(self):
self.assertEqual(fuzz.partial_token_set_ratio(self.s4, self.s7), 100)
Expand Down