Remove compatibility with legacy versions of Python (#64)
# Description

Remove `src/whoosh/compat.py` and use Python 3 syntax throughout. Where `b()`, `memoryview_()`, and `u()` are applied to variables rather than literals, copy those helpers into the files that use them, to encourage future local refactoring.
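
For context: the deleted `whoosh.compat` module was a set of thin Python 2/3 shims that reduce to no-ops or built-in aliases on Python 3, which is why most call sites in the diff below can be rewritten mechanically. A rough sketch of the helpers this commit touches (bodies are illustrative assumptions, not the verbatim `compat.py`; see the repository history for the exact definitions):

```python
# Approximate Python 3 equivalents of the removed whoosh.compat helpers.

text_type = str         # isinstance(x, text_type)     -> isinstance(x, str)
integer_types = (int,)  # isinstance(x, integer_types) -> isinstance(x, int)


def u(s):
    # Every Python 3 str is already unicode, so u() is the identity.
    return s


def b(s):
    # Produce bytes from a str literal.
    return s.encode("latin-1")


def iteritems(d):
    # Python 3 dict.items() already returns a lazy view.
    return d.items()


def memoryview_(source, offset=None, length=None):
    # Zero-copy view over a buffer, optionally sliced (signature assumed).
    mv = memoryview(source)
    if offset is not None:
        mv = mv[offset:]
    if length is not None:
        mv = mv[:length]
    return mv
```

Because `u()` is the identity on Python 3, dropping a call such as `u("...")` in the diffs below preserves behavior exactly.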

# Checklist:

- [x] I have performed a self-review of my own code
- [ ] I have commented my code in hard-to-understand areas
- [ ] I have made corresponding changes to the documentation
ZeroCool940711 authored Feb 10, 2024 · 2 parents 14ab92f + daaaea4 · commit 6c32996
Showing 108 changed files with 3,482 additions and 3,916 deletions.
2 changes: 1 addition & 1 deletion benchmark/enron.py
@@ -2,6 +2,7 @@
 import tarfile
 from email import message_from_string
 from marshal import dump, load
+from urllib.request import urlretrieve
 from zlib import compress, decompress
 
 try:
@@ -10,7 +11,6 @@
     pass
 
 from whoosh import analysis, fields
-from whoosh.compat import next, urlretrieve
 from whoosh.support.bench import Bench, Spec
 from whoosh.util import now
 
5 changes: 2 additions & 3 deletions scripts/make_checkpoint.py
@@ -10,7 +10,6 @@
 from datetime import datetime, timezone
 
 from whoosh import fields, index
-from whoosh.compat import u
 
 if len(sys.argv) < 2:
     print("USAGE: make_checkpoint.py <dir>")
@@ -28,7 +27,7 @@
     ngrams=fields.NGRAMWORDS,
 )
 
-words = u(
+words = (
     "alfa bravo charlie delta echo foxtrot golf hotel india"
     "juliet kilo lima mike november oskar papa quebec romeo"
     "sierra tango"
@@ -44,7 +43,7 @@
 with ix.writer() as w:
     for num in range(100):
         frac += 0.15
-        path = u(f"{segnum}/{num}")
+        path = f"{segnum}/{num}"
         title = " ".join(random.choice(words) for _ in range(100))
         dt = datetime(
             year=2000 + counter,
3 changes: 1 addition & 2 deletions scripts/read_checkpoint.py
@@ -6,15 +6,14 @@
 import sys
 
 from whoosh import index, query
-from whoosh.compat import u
 
 if len(sys.argv) < 2:
     print("USAGE: read_checkpoint.py <dir>")
     sys.exit(1)
 indexdir = sys.argv[1]
 print("Reading checkpoint index in", indexdir)
 
-words = u(
+words = (
     "alfa bravo charlie delta echo foxtrot golf hotel india"
     "juliet kilo lima mike november oskar papa quebec romeo"
     "sierra tango"
8 changes: 2 additions & 6 deletions src/whoosh/analysis/acore.py
@@ -25,8 +25,6 @@
 # those of the authors and should not be interpreted as representing official
 # policies, either expressed or implied, of Matt Chaput.
 
-from whoosh.compat import iteritems
-
 # Exceptions
 
 
@@ -126,9 +124,7 @@ def __init__(
         self.__dict__.update(kwargs)
 
     def __repr__(self):
-        parms = ", ".join(
-            f"{name}={value!r}" for name, value in iteritems(self.__dict__)
-        )
+        parms = ", ".join(f"{name}={value!r}" for name, value in self.__dict__.items())
         return f"{self.__class__.__name__}({parms})"
 
     def copy(self):
@@ -153,7 +149,7 @@ def __repr__(self):
         attrs = ""
         if self.__dict__:
             attrs = ", ".join(
-                f"{key}={value!r}" for key, value in iteritems(self.__dict__)
+                f"{key}={value!r}" for key, value in self.__dict__.items()
             )
         return self.__class__.__name__ + f"({attrs})"
 
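
The `iteritems` rewrite above is purely mechanical: on Python 3, `dict.items()` already returns a lazy view object, so the shim added nothing. A standalone illustration (not code from this commit):

```python
d = {"boost": 1.0, "mode": "index"}

# Before (via the compat shim): ", ".join(f"{k}={v!r}" for k, v in iteritems(d))
# After (plain Python 3):
parms = ", ".join(f"{k}={v!r}" for k, v in d.items())
print(parms)  # boost=1.0, mode='index'
```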
1 change: 0 additions & 1 deletion src/whoosh/analysis/filters.py
@@ -28,7 +28,6 @@
 from itertools import chain
 
 from whoosh.analysis.acore import Composable
-from whoosh.compat import next
 from whoosh.util.text import rcompile
 
 # Default list of stop words (words so common it's usually wasteful to index
21 changes: 10 additions & 11 deletions src/whoosh/analysis/intraword.py
@@ -29,7 +29,6 @@
 from collections import deque
 
 from whoosh.analysis.filters import Filter
-from whoosh.compat import text_type, u
 
 
 class CompoundWordFilter(Filter):
@@ -279,7 +278,7 @@ class IntraWordFilter(Filter):
     is_morph = True
 
     __inittypes__ = {
-        "delims": text_type,
+        "delims": str,
         "splitwords": bool,
         "splitnums": bool,
         "mergewords": bool,
@@ -288,7 +287,7 @@
 
     def __init__(
         self,
-        delims=u("-_'\"()!@#$%^&*[]{}<>\\|;:,./?`~=+"),
+        delims="-_'\"()!@#$%^&*[]{}<>\\|;:,./?`~=+",
         splitwords=True,
         splitnums=True,
         mergewords=False,
@@ -311,22 +310,22 @@ def __init__(
         self.delims = re.escape(delims)
 
         # Expression for text between delimiter characters
-        self.between = re.compile(u("[^%s]+") % (self.delims,), re.UNICODE)
+        self.between = re.compile(f"[^{self.delims}]+", re.UNICODE)
         # Expression for removing "'s" from the end of sub-words
-        dispat = u("(?<=[%s%s])'[Ss](?=$|[%s])") % (lowercase, uppercase, self.delims)
+        dispat = f"(?<=[{lowercase}{uppercase}])'[Ss](?=$|[{self.delims}])"
         self.possessive = re.compile(dispat, re.UNICODE)
 
         # Expression for finding case and letter-number transitions
-        lower2upper = u("[%s][%s]") % (lowercase, uppercase)
-        letter2digit = u("[%s%s][%s]") % (lowercase, uppercase, digits)
-        digit2letter = u("[%s][%s%s]") % (digits, lowercase, uppercase)
+        lower2upper = f"[{lowercase}][{uppercase}]"
+        letter2digit = f"[{lowercase}{uppercase}][{digits}]"
+        digit2letter = f"[{digits}][{lowercase}{uppercase}]"
         if splitwords and splitnums:
-            splitpat = u("(%s|%s|%s)") % (lower2upper, letter2digit, digit2letter)
+            splitpat = f"({lower2upper}|{letter2digit}|{digit2letter})"
             self.boundary = re.compile(splitpat, re.UNICODE)
         elif splitwords:
-            self.boundary = re.compile(text_type(lower2upper), re.UNICODE)
+            self.boundary = re.compile(str(lower2upper), re.UNICODE)
         elif splitnums:
-            numpat = u("(%s|%s)") % (letter2digit, digit2letter)
+            numpat = f"({letter2digit}|{digit2letter})"
             self.boundary = re.compile(numpat, re.UNICODE)
 
         self.splitting = splitwords or splitnums
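
The pattern rewrites above swap `u(...) %`-style formatting for f-strings; the resulting regular expressions are character-for-character identical. A standalone check (using ASCII classes for brevity; the real filter interpolates full Unicode character classes):

```python
import re
from string import ascii_lowercase as lowercase, ascii_uppercase as uppercase

# Old style, minus the u() shim:
old = re.compile("[%s][%s]" % (lowercase, uppercase), re.UNICODE)
# New style, as in the diff:
new = re.compile(f"[{lowercase}][{uppercase}]", re.UNICODE)

assert old.pattern == new.pattern  # same pattern string, same behavior
```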
3 changes: 1 addition & 2 deletions src/whoosh/analysis/morph.py
@@ -26,7 +26,6 @@
 # policies, either expressed or implied, of Matt Chaput.
 
 from whoosh.analysis.filters import Filter
-from whoosh.compat import integer_types
 from whoosh.lang.dmetaphone import double_metaphone
 from whoosh.lang.porter import stem
 from whoosh.util.cache import lfu_cache, unbound_cache
@@ -120,7 +119,7 @@ def clear(self):
         else:
             stemfn = self.stemfn
 
-        if isinstance(self.cachesize, integer_types) and self.cachesize != 0:
+        if isinstance(self.cachesize, int) and self.cachesize != 0:
             if self.cachesize < 0:
                 self._stem = unbound_cache(stemfn)
             elif self.cachesize > 1:
3 changes: 1 addition & 2 deletions src/whoosh/analysis/ngrams.py
@@ -28,7 +28,6 @@
 from whoosh.analysis.acore import Token
 from whoosh.analysis.filters import Filter, LowercaseFilter
 from whoosh.analysis.tokenizers import RegexTokenizer, Tokenizer
-from whoosh.compat import text_type
 
 # Tokenizer
 
@@ -79,7 +78,7 @@ def __call__(
         mode="",
         **kwargs,
     ):
-        assert isinstance(value, text_type), f"{value!r} is not unicode"
+        assert isinstance(value, str), f"{value!r} is not unicode"
 
         inlen = len(value)
         t = Token(positions, chars, removestops=removestops, mode=mode)
15 changes: 7 additions & 8 deletions src/whoosh/analysis/tokenizers.py
@@ -26,7 +26,6 @@
 # policies, either expressed or implied, of Matt Chaput.
 
 from whoosh.analysis.acore import Composable, Token
-from whoosh.compat import text_type, u
 from whoosh.util.text import rcompile
 
 default_pattern = rcompile(r"[\w\*]+(\.?[\w\*]+)*")
@@ -63,7 +62,7 @@ def __call__(
         mode="",
         **kwargs,
     ):
-        assert isinstance(value, text_type), f"{value!r} is not unicode"
+        assert isinstance(value, str), f"{value!r} is not unicode"
         t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
         t.text = value
         t.boost = 1.0
@@ -82,7 +81,7 @@ class RegexTokenizer(Tokenizer):
     Uses a regular expression to extract tokens from text.
 
     >>> rex = RegexTokenizer()
-    >>> [token.text for token in rex(u("hi there 3.141 big-time under_score"))]
+    >>> [token.text for token in rex("hi there 3.141 big-time under_score")]
     ["hi", "there", "3.141", "big", "time", "under_score"]
     """
@@ -131,7 +130,7 @@ def __call__(
         :param tokenize: if True, the text should be tokenized.
         """
 
-        assert isinstance(value, text_type), f"{repr(value)} is not unicode"
+        assert isinstance(value, str), f"{repr(value)} is not unicode"
 
         t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
         if not tokenize:
@@ -264,7 +263,7 @@ def __call__(
         :param tokenize: if True, the text should be tokenized.
        """
 
-        assert isinstance(value, text_type), f"{value!r} is not unicode"
+        assert isinstance(value, str), f"{value!r} is not unicode"
 
         t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
         if not tokenize:
@@ -277,7 +276,7 @@
             t.endchar = start_char + len(value)
             yield t
         else:
-            text = u("")
+            text = ""
             charmap = self.charmap
             pos = start_pos
             startchar = currentchar = start_char
@@ -299,7 +298,7 @@
                         t.endchar = currentchar
                         yield t
                         startchar = currentchar + 1
-                        text = u("")
+                        text = ""
 
                     currentchar += 1
 
@@ -352,7 +351,7 @@ def __init__(self, expression="[^/]+"):
         self.expr = rcompile(expression)
 
     def __call__(self, value, positions=False, start_pos=0, **kwargs):
-        assert isinstance(value, text_type), f"{value!r} is not unicode"
+        assert isinstance(value, str), f"{value!r} is not unicode"
         token = Token(positions, **kwargs)
         pos = start_pos
         for match in self.expr.finditer(value):
[… diff truncated: the remaining 99 changed files are not shown …]
