Remove compatibility with legacy versions of Python (#64)
# Description

Remove `src/whoosh/compat.py` and use Python 3 syntax throughout. Where `b()`, `memoryview_()`, and `u()` are applied to variables rather than literals, copy those helpers into the files that use them, to encourage future local refactoring.
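
For context: the deleted `whoosh.compat` module was a set of thin Python 2/3 shims that reduce to no-ops or built-in aliases on Python 3, which is why most call sites in the diff below can be rewritten mechanically. A rough sketch of the helpers this commit touches (bodies are illustrative assumptions, not the verbatim `compat.py`; see the repository history for the exact definitions):

```python
# Approximate Python 3 equivalents of the removed whoosh.compat helpers.

text_type = str         # isinstance(x, text_type)     -> isinstance(x, str)
integer_types = (int,)  # isinstance(x, integer_types) -> isinstance(x, int)


def u(s):
    # Every Python 3 str is already unicode, so u() is the identity.
    return s


def b(s):
    # Produce bytes from a str literal.
    return s.encode("latin-1")


def iteritems(d):
    # Python 3 dict.items() already returns a lazy view.
    return d.items()


def memoryview_(source, offset=None, length=None):
    # Zero-copy view over a buffer, optionally sliced (signature assumed).
    mv = memoryview(source)
    if offset is not None:
        mv = mv[offset:]
    if length is not None:
        mv = mv[:length]
    return mv
```

Because `u()` is the identity on Python 3, dropping a call such as `u("...")` in the diffs below preserves behavior exactly.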

# Checklist:

- [x] I have performed a self-review of my own code
- [ ] I have commented my code in hard-to-understand areas
- [ ] I have made corresponding changes to the documentation
ZeroCool940711 authored Feb 10, 2024 · 2 parents 14ab92f + daaaea4 · commit 6c32996
Showing 108 changed files with 3,482 additions and 3,916 deletions.
2 changes: 1 addition & 1 deletion benchmark/enron.py
@@ -2,6 +2,7 @@
 import tarfile
 from email import message_from_string
 from marshal import dump, load
+from urllib.request import urlretrieve
 from zlib import compress, decompress
 
 try:
@@ -10,7 +11,6 @@
     pass
 
 from whoosh import analysis, fields
-from whoosh.compat import next, urlretrieve
 from whoosh.support.bench import Bench, Spec
 from whoosh.util import now
 
5 changes: 2 additions & 3 deletions scripts/make_checkpoint.py
@@ -10,7 +10,6 @@
 from datetime import datetime, timezone
 
 from whoosh import fields, index
-from whoosh.compat import u
 
 if len(sys.argv) < 2:
     print("USAGE: make_checkpoint.py <dir>")
@@ -28,7 +27,7 @@
     ngrams=fields.NGRAMWORDS,
 )
 
-words = u(
+words = (
     "alfa bravo charlie delta echo foxtrot golf hotel india"
     "juliet kilo lima mike november oskar papa quebec romeo"
     "sierra tango"
@@ -44,7 +43,7 @@
 with ix.writer() as w:
     for num in range(100):
         frac += 0.15
-        path = u(f"{segnum}/{num}")
+        path = f"{segnum}/{num}"
         title = " ".join(random.choice(words) for _ in range(100))
         dt = datetime(
             year=2000 + counter,
3 changes: 1 addition & 2 deletions scripts/read_checkpoint.py
@@ -6,15 +6,14 @@
 import sys
 
 from whoosh import index, query
-from whoosh.compat import u
 
 if len(sys.argv) < 2:
     print("USAGE: read_checkpoint.py <dir>")
     sys.exit(1)
 indexdir = sys.argv[1]
 print("Reading checkpoint index in", indexdir)
 
-words = u(
+words = (
     "alfa bravo charlie delta echo foxtrot golf hotel india"
     "juliet kilo lima mike november oskar papa quebec romeo"
     "sierra tango"
8 changes: 2 additions & 6 deletions src/whoosh/analysis/acore.py
@@ -25,8 +25,6 @@
 # those of the authors and should not be interpreted as representing official
 # policies, either expressed or implied, of Matt Chaput.
 
-from whoosh.compat import iteritems
-
 # Exceptions
 
 
@@ -126,9 +124,7 @@ def __init__(
         self.__dict__.update(kwargs)
 
     def __repr__(self):
-        parms = ", ".join(
-            f"{name}={value!r}" for name, value in iteritems(self.__dict__)
-        )
+        parms = ", ".join(f"{name}={value!r}" for name, value in self.__dict__.items())
         return f"{self.__class__.__name__}({parms})"
 
     def copy(self):
@@ -153,7 +149,7 @@ def __repr__(self):
         attrs = ""
         if self.__dict__:
             attrs = ", ".join(
-                f"{key}={value!r}" for key, value in iteritems(self.__dict__)
+                f"{key}={value!r}" for key, value in self.__dict__.items()
             )
         return self.__class__.__name__ + f"({attrs})"
 
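
The `iteritems` rewrite above is purely mechanical: on Python 3, `dict.items()` already returns a lazy view object, so the shim added nothing. A standalone illustration (not code from this commit):

```python
d = {"boost": 1.0, "mode": "index"}

# Before (via the compat shim): ", ".join(f"{k}={v!r}" for k, v in iteritems(d))
# After (plain Python 3):
parms = ", ".join(f"{k}={v!r}" for k, v in d.items())
print(parms)  # boost=1.0, mode='index'
```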
1 change: 0 additions & 1 deletion src/whoosh/analysis/filters.py
@@ -28,7 +28,6 @@
 from itertools import chain
 
 from whoosh.analysis.acore import Composable
-from whoosh.compat import next
 from whoosh.util.text import rcompile
 
 # Default list of stop words (words so common it's usually wasteful to index
21 changes: 10 additions & 11 deletions src/whoosh/analysis/intraword.py
@@ -29,7 +29,6 @@
 from collections import deque
 
 from whoosh.analysis.filters import Filter
-from whoosh.compat import text_type, u
 
 
 class CompoundWordFilter(Filter):
@@ -279,7 +278,7 @@ class IntraWordFilter(Filter):
     is_morph = True
 
     __inittypes__ = {
-        "delims": text_type,
+        "delims": str,
         "splitwords": bool,
         "splitnums": bool,
         "mergewords": bool,
@@ -288,7 +287,7 @@
 
     def __init__(
         self,
-        delims=u("-_'\"()!@#$%^&*[]{}<>\\|;:,./?`~=+"),
+        delims="-_'\"()!@#$%^&*[]{}<>\\|;:,./?`~=+",
         splitwords=True,
         splitnums=True,
         mergewords=False,
@@ -311,22 +310,22 @@ def __init__(
         self.delims = re.escape(delims)
 
         # Expression for text between delimiter characters
-        self.between = re.compile(u("[^%s]+") % (self.delims,), re.UNICODE)
+        self.between = re.compile(f"[^{self.delims}]+", re.UNICODE)
         # Expression for removing "'s" from the end of sub-words
-        dispat = u("(?<=[%s%s])'[Ss](?=$|[%s])") % (lowercase, uppercase, self.delims)
+        dispat = f"(?<=[{lowercase}{uppercase}])'[Ss](?=$|[{self.delims}])"
         self.possessive = re.compile(dispat, re.UNICODE)
 
         # Expression for finding case and letter-number transitions
-        lower2upper = u("[%s][%s]") % (lowercase, uppercase)
-        letter2digit = u("[%s%s][%s]") % (lowercase, uppercase, digits)
-        digit2letter = u("[%s][%s%s]") % (digits, lowercase, uppercase)
+        lower2upper = f"[{lowercase}][{uppercase}]"
+        letter2digit = f"[{lowercase}{uppercase}][{digits}]"
+        digit2letter = f"[{digits}][{lowercase}{uppercase}]"
         if splitwords and splitnums:
-            splitpat = u("(%s|%s|%s)") % (lower2upper, letter2digit, digit2letter)
+            splitpat = f"({lower2upper}|{letter2digit}|{digit2letter})"
             self.boundary = re.compile(splitpat, re.UNICODE)
         elif splitwords:
-            self.boundary = re.compile(text_type(lower2upper), re.UNICODE)
+            self.boundary = re.compile(str(lower2upper), re.UNICODE)
         elif splitnums:
-            numpat = u("(%s|%s)") % (letter2digit, digit2letter)
+            numpat = f"({letter2digit}|{digit2letter})"
             self.boundary = re.compile(numpat, re.UNICODE)
 
         self.splitting = splitwords or splitnums
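
The pattern rewrites above swap `u(...) %`-style formatting for f-strings; the resulting regular expressions are character-for-character identical. A standalone check (using ASCII classes for brevity; the real filter interpolates full Unicode character classes):

```python
import re
from string import ascii_lowercase as lowercase, ascii_uppercase as uppercase

# Old style, minus the u() shim:
old = re.compile("[%s][%s]" % (lowercase, uppercase), re.UNICODE)
# New style, as in the diff:
new = re.compile(f"[{lowercase}][{uppercase}]", re.UNICODE)

assert old.pattern == new.pattern  # same pattern string, same behavior
```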
3 changes: 1 addition & 2 deletions src/whoosh/analysis/morph.py
@@ -26,7 +26,6 @@
 # policies, either expressed or implied, of Matt Chaput.
 
 from whoosh.analysis.filters import Filter
-from whoosh.compat import integer_types
 from whoosh.lang.dmetaphone import double_metaphone
 from whoosh.lang.porter import stem
 from whoosh.util.cache import lfu_cache, unbound_cache
@@ -120,7 +119,7 @@ def clear(self):
         else:
             stemfn = self.stemfn
 
-        if isinstance(self.cachesize, integer_types) and self.cachesize != 0:
+        if isinstance(self.cachesize, int) and self.cachesize != 0:
             if self.cachesize < 0:
                 self._stem = unbound_cache(stemfn)
             elif self.cachesize > 1:
3 changes: 1 addition & 2 deletions src/whoosh/analysis/ngrams.py
@@ -28,7 +28,6 @@
 from whoosh.analysis.acore import Token
 from whoosh.analysis.filters import Filter, LowercaseFilter
 from whoosh.analysis.tokenizers import RegexTokenizer, Tokenizer
-from whoosh.compat import text_type
 
 # Tokenizer
 
@@ -79,7 +78,7 @@ def __call__(
         mode="",
         **kwargs,
     ):
-        assert isinstance(value, text_type), f"{value!r} is not unicode"
+        assert isinstance(value, str), f"{value!r} is not unicode"
 
         inlen = len(value)
         t = Token(positions, chars, removestops=removestops, mode=mode)
15 changes: 7 additions & 8 deletions src/whoosh/analysis/tokenizers.py
@@ -26,7 +26,6 @@
 # policies, either expressed or implied, of Matt Chaput.
 
 from whoosh.analysis.acore import Composable, Token
-from whoosh.compat import text_type, u
 from whoosh.util.text import rcompile
 
 default_pattern = rcompile(r"[\w\*]+(\.?[\w\*]+)*")
@@ -63,7 +62,7 @@ def __call__(
         mode="",
         **kwargs,
     ):
-        assert isinstance(value, text_type), f"{value!r} is not unicode"
+        assert isinstance(value, str), f"{value!r} is not unicode"
         t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
         t.text = value
         t.boost = 1.0
@@ -82,7 +81,7 @@ class RegexTokenizer(Tokenizer):
     Uses a regular expression to extract tokens from text.
 
     >>> rex = RegexTokenizer()
-    >>> [token.text for token in rex(u("hi there 3.141 big-time under_score"))]
+    >>> [token.text for token in rex("hi there 3.141 big-time under_score")]
     ["hi", "there", "3.141", "big", "time", "under_score"]
     """
@@ -131,7 +130,7 @@ def __call__(
         :param tokenize: if True, the text should be tokenized.
         """
 
-        assert isinstance(value, text_type), f"{repr(value)} is not unicode"
+        assert isinstance(value, str), f"{repr(value)} is not unicode"
 
         t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
         if not tokenize:
@@ -264,7 +263,7 @@ def __call__(
         :param tokenize: if True, the text should be tokenized.
        """
 
-        assert isinstance(value, text_type), f"{value!r} is not unicode"
+        assert isinstance(value, str), f"{value!r} is not unicode"
 
         t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
         if not tokenize:
@@ -277,7 +276,7 @@
             t.endchar = start_char + len(value)
             yield t
         else:
-            text = u("")
+            text = ""
             charmap = self.charmap
             pos = start_pos
             startchar = currentchar = start_char
@@ -299,7 +298,7 @@
                         t.endchar = currentchar
                         yield t
                         startchar = currentchar + 1
-                        text = u("")
+                        text = ""
 
                     currentchar += 1
 
@@ -352,7 +351,7 @@ def __init__(self, expression="[^/]+"):
         self.expr = rcompile(expression)
 
     def __call__(self, value, positions=False, start_pos=0, **kwargs):
-        assert isinstance(value, text_type), f"{value!r} is not unicode"
+        assert isinstance(value, str), f"{value!r} is not unicode"
         token = Token(positions, **kwargs)
         pos = start_pos
         for match in self.expr.finditer(value):
[… diff truncated: the remaining 99 changed files are not shown …]
