Code quality improvements and bug fixes (#46)
This pull request makes several code quality improvements and bug fixes. It removes unused code, improves exception handling, fixes import issues, and enhances readability. It also adds missing arguments and flags unused variables for future cleanup. Generic exceptions are replaced with more specific ones, a linear search is replaced with binary search, and string formatting is converted to f-strings.
ZeroCool940711 authored Feb 4, 2024
2 parents eeb6a98 + 0be14d1 commit d3dd90a
Showing 81 changed files with 691 additions and 1,001 deletions.
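Most of the hunks below follow one pattern: printf-style `%` formatting rewritten as f-strings. A minimal before/after sketch (values hypothetical), showing how each `%` conversion maps onto an f-string equivalent:

```python
archive = "enron.tar.gz"
elapsed = 1.234

# %-style (before) and f-string (after) render identically:
old = "Downloading Enron email archive to %r..." % archive
new = f"Downloading Enron email archive to {archive!r}..."
assert old == new  # %r corresponds to the !r conversion flag

old_t = "Found %d records in %0.06f seconds" % (42, elapsed)
new_t = f"Found {42} records in {elapsed:0.06f} seconds"
assert old_t == new_t  # printf specs map onto the format spec after ':'
```

One subtlety: `%r` strictly corresponds to `{...!r}`; converting it to plain `{...}` (as a few hunks in this diff do) silently drops the `repr()` quoting.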
4 changes: 2 additions & 2 deletions .github/workflows/deploy-github-pages.yml
Original file line number Diff line number Diff line change
@@ -14,8 +14,8 @@ jobs:
name: Deploy to GitHub Pages
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
- uses: actions/checkout@v4.1.1
- uses: actions/setup-python@v5.0.0
with:
python-version: '3.7'

14 changes: 13 additions & 1 deletion README.md
@@ -1,8 +1,20 @@
[![CodeFactor](https://www.codefactor.io/repository/github/sygil-dev/whoosh-reloaded/badge/main)](https://www.codefactor.io/repository/github/sygil-dev/whoosh-reloaded/overview/main)
[![codecov](https://codecov.io/gh/Sygil-Dev/whoosh-reloaded/graph/badge.svg?token=O3Z2DFB8UA)](https://codecov.io/gh/Sygil-Dev/whoosh-reloaded)
[![Documentation Status](https://readthedocs.org/projects/whoosh-reloaded/badge/?version=latest)](https://whoosh-reloaded.readthedocs.io/en/latest/?badge=latest)
[![codecov](https://codecov.io/gh/Sygil-Dev/whoosh-reloaded/branch/master/graph/badge.svg)](https://codecov.io/gh/Sygil-Dev/whoosh-reloaded)
[![PyPI version](https://badge.fury.io/py/Whoosh-Reloaded.svg)](https://badge.fury.io/py/Whoosh-Reloaded) [![Downloads](https://pepy.tech/badge/whoosh-reloaded)](https://pepy.tech/project/whoosh-reloaded) [![License](https://img.shields.io/pypi/l/Whoosh-Reloaded)](https://pypi.org/project/Whoosh-Reloaded/) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/Whoosh-Reloaded)](https://pypi.org/project/Whoosh-Reloaded/) [![PyPI - Wheel](https://img.shields.io/pypi/wheel/Whoosh-Reloaded)](https://pypi.org/project/Whoosh-Reloaded/) [![PyPI - Format](https://img.shields.io/pypi/format/Whoosh-Reloaded)](https://pypi.org/project/Whoosh-Reloaded/) [![PyPI - Status](https://img.shields.io/pypi/status/Whoosh-Reloaded)](https://pypi.org/project/Whoosh-Reloaded/)

[![Lines of Code](https://sonarcloud.io/api/project_badges/measure?project=Sygil-Dev_whoosh-reloaded&metric=ncloc)](https://sonarcloud.io/summary/new_code?id=Sygil-Dev_whoosh-reloaded)
[![Code Smells](https://sonarcloud.io/api/project_badges/measure?project=Sygil-Dev_whoosh-reloaded&metric=code_smells)](https://sonarcloud.io/summary/new_code?id=Sygil-Dev_whoosh-reloaded)
[![Maintainability Rating](https://sonarcloud.io/api/project_badges/measure?project=Sygil-Dev_whoosh-reloaded&metric=sqale_rating)](https://sonarcloud.io/summary/new_code?id=Sygil-Dev_whoosh-reloaded)
[![Security Rating](https://sonarcloud.io/api/project_badges/measure?project=Sygil-Dev_whoosh-reloaded&metric=security_rating)](https://sonarcloud.io/summary/new_code?id=Sygil-Dev_whoosh-reloaded)
[![Bugs](https://sonarcloud.io/api/project_badges/measure?project=Sygil-Dev_whoosh-reloaded&metric=bugs)](https://sonarcloud.io/summary/new_code?id=Sygil-Dev_whoosh-reloaded)
[![Vulnerabilities](https://sonarcloud.io/api/project_badges/measure?project=Sygil-Dev_whoosh-reloaded&metric=vulnerabilities)](https://sonarcloud.io/summary/new_code?id=Sygil-Dev_whoosh-reloaded)
[![Duplicated Lines (%)](https://sonarcloud.io/api/project_badges/measure?project=Sygil-Dev_whoosh-reloaded&metric=duplicated_lines_density)](https://sonarcloud.io/summary/new_code?id=Sygil-Dev_whoosh-reloaded)
[![Reliability Rating](https://sonarcloud.io/api/project_badges/measure?project=Sygil-Dev_whoosh-reloaded&metric=reliability_rating)](https://sonarcloud.io/summary/new_code?id=Sygil-Dev_whoosh-reloaded)
[![Quality Gate Status](https://sonarcloud.io/api/project_badges/measure?project=Sygil-Dev_whoosh-reloaded&metric=alert_status)](https://sonarcloud.io/summary/new_code?id=Sygil-Dev_whoosh-reloaded)
[![Technical Debt](https://sonarcloud.io/api/project_badges/measure?project=Sygil-Dev_whoosh-reloaded&metric=sqale_index)](https://sonarcloud.io/summary/new_code?id=Sygil-Dev_whoosh-reloaded)


--------------------------------------

> **Notice:** This repository (**whoosh-reloaded**) is a fork and continuation of the Whoosh project. The original Whoosh project is no longer maintained.
5 changes: 3 additions & 2 deletions benchmark/dictionary.py
@@ -1,4 +1,5 @@
import os.path, gzip
import gzip
import os.path

from whoosh import analysis, fields
from whoosh.support.bench import Bench, Spec
@@ -28,7 +29,7 @@ def documents(self):

def whoosh_schema(self):
ana = analysis.StemmingAnalyzer()
# ana = analysis.StandardAnalyzer()

schema = fields.Schema(
head=fields.ID(stored=True), body=fields.TEXT(analyzer=ana, stored=True)
)
14 changes: 7 additions & 7 deletions benchmark/enron.py
@@ -45,10 +45,10 @@ class Enron(Spec):
# the messages in an easier-to-digest format

def download_archive(self, archive):
print("Downloading Enron email archive to %r..." % archive)
print(f"Downloading Enron email archive to {archive!r}...")
t = now()
urlretrieve(self.enron_archive_url, archive)
print("Downloaded in ", now() - t, "seconds")
print(f"Downloaded in {now() - t} seconds")

@staticmethod
def get_texts(archive):
@@ -84,10 +84,10 @@ def get_messages(archive, headers=True):
yield d

def cache_messages(self, archive, cache):
print("Caching messages in %s..." % cache)
print(f"Caching messages in {cache}...")

if not os.path.exists(archive):
raise Exception("Archive file %r does not exist" % archive)
raise FileNotFoundError(f"Archive file {archive!r} does not exist")

t = now()
f = open(cache, "wb")
@@ -98,7 +98,7 @@ def cache_messages(self, archive, cache):
if not c % 1000:
print(c)
f.close()
print("Cached messages in ", now() - t, "seconds")
print(f"Cached messages in {now() - t} seconds")

def setup(self):
archive = os.path.abspath(
@@ -118,7 +118,7 @@ def setup(self):

def documents(self):
if not os.path.exists(self.cache_filename):
raise Exception("Message cache does not exist, use --setup")
raise FileNotFoundError("Message cache does not exist, use --setup")

f = open(self.cache_filename, "rb")
try:
@@ -176,7 +176,7 @@ def process_document_whoosh(self, d):
d["filepos"] = self.filepos
if self.options.storebody:
mf = self.main_field
d["_stored_%s" % mf] = compress(d[mf], 9)
d[f"_stored_{mf}"] = compress(d[mf], 9)

def process_result_whoosh(self, d):
mf = self.main_field
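The enron.py hunks replace bare `Exception` with `FileNotFoundError`, which lets callers handle a missing file separately from other failures. A minimal sketch of why that matters (path hypothetical):

```python
import os

def load_cache(path):
    # Specific exception: callers can distinguish "missing" from other I/O errors.
    if not os.path.exists(path):
        raise FileNotFoundError(f"Message cache {path!r} does not exist, use --setup")
    with open(path, "rb") as f:
        return f.read()

try:
    load_cache("/nonexistent/enron.cache")
except FileNotFoundError as e:
    # Handled first: FileNotFoundError is a subclass of OSError.
    print(f"setup required: {e}")
except OSError:
    print("some other I/O problem")
```

Because `FileNotFoundError` subclasses `OSError`, existing handlers that caught `OSError` keep working, while new callers can be more precise.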
12 changes: 6 additions & 6 deletions benchmark/marc21.py
@@ -27,7 +27,7 @@ def read_file(dbfile, tags=None):
if not first5:
return
if len(first5) < 5:
raise Exception
raise ValueError(f"Invalid record length: {first5!r}")
length = int(first5)
chunk = dbfile.read(length - 5)
yield parse_record(first5 + chunk, tags), pos
@@ -63,7 +63,7 @@ def parse_record(data, tags=None):
start = dirstart + i * DIRECTORY_ENTRY_LEN
end = start + DIRECTORY_ENTRY_LEN
tag = data[start : start + 3]
if tags and not tag in tags:
if tags and tag not in tags:
continue

entry = data[start:end]
@@ -135,7 +135,7 @@ def uniform_title(d):


subjectfields = (
"600 610 611 630 648 650 651 653 654 655 656 657 658 662 " "690 691 696 697 698 699"
"600 610 611 630 648 650 651 653 654 655 656 657 658 662 690 691 696 697 698 699"
).split()


@@ -191,7 +191,7 @@ def make_index(basedir, ixdir, procs=4, limitmb=128, multisegment=True, glob="*.
mfields.update("100 110 111".split()) # Author
mfields.add("245") # Title

print("Indexing with %d processor(s) and %d MB per processor" % (procs, limitmb))
print(f"Indexing with {procs} processor(s) and {limitmb} MB per processor")
c = 0
t = now()
ix = index.create_in(ixdir, schema)
@@ -241,7 +241,7 @@ def search(qstring, ixdir, basedir, limit=None, optimize=True, scores=True):
r = s.search(q, limit=limit, optimize=optimize)
for hit in r:
print_record(hit.rank, basedir, hit["file"], hit["pos"])
print("Found %d records in %0.06f seconds" % (len(r), r.runtime))
print(f"Found {len(r)} records in {r.runtime:0.06f} seconds")
else:
t = now()
for i, docnum in enumerate(s.docs_for_query(q)):
@@ -302,7 +302,7 @@ def search(qstring, ixdir, basedir, limit=None, optimize=True, scores=True):
"-M",
"--merge-segments",
dest="multisegment",
help="If indexing with multiproc, merge the segments after" " indexing",
help="If indexing with multiproc, merge the segments after indexing",
action="store_false",
default=True,
)
2 changes: 1 addition & 1 deletion scripts/make_checkpoint.py
@@ -43,7 +43,7 @@
with ix.writer() as w:
for num in range(100):
frac += 0.15
path = u("%s/%s" % (segnum, num))
path = u(f"{segnum}/{num}")
title = " ".join(random.choice(words) for _ in range(100))
dt = datetime(year=2000 + counter, month=(counter % 12) + 1, day=15)

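The PR description also mentions swapping a linear search for binary search; that change is not visible in the hunks shown here, but on a sorted sequence the usual stdlib pattern uses `bisect` (example data hypothetical):

```python
from bisect import bisect_left

def contains(sorted_terms, term):
    # O(log n) membership test on a sorted list, vs O(n) for `term in list`.
    i = bisect_left(sorted_terms, term)
    return i < len(sorted_terms) and sorted_terms[i] == term

terms = ["alpha", "bravo", "charlie", "delta"]
assert contains(terms, "charlie")
assert not contains(terms, "echo")
```

This only pays off when the sequence is already sorted; an unsorted list would need an O(n log n) sort first, at which point a set is usually the better structure.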
22 changes: 4 additions & 18 deletions setup.cfg
@@ -1,5 +1,5 @@
[wheel]
universal = 1
universal = True

[build_sphinx]
build-dir = docs/build
@@ -12,27 +12,13 @@ upload-dir = docs/build/html
formats = zip,gztar

[aliases]
push = sdist bdist_wheel upload
push = sdist bdist_wheel twine upload
pushdocs = build_sphinx upload_sphinx

[tool:pytest]
; --tb= traceback print mode (long/short/line/native/no)
addopts = -rs --tb=native
addopts = -rs --tb=short

norecursedirs = .hg .tox _build tmp* env* benchmark stress
minversion = 2.0
minversion = 3.0
python_files = test_*.py
pep8ignore =
*.py E121 E122 E123 E124 E125 E126 E127 E128 # continuation line indentation
*.py E401 # imports on separate lines
*.py W391 # blank line at end of file

test_*.py E501 # Ignore long lines in tests

upload.py ALL # 3rd party (and not in the repo): rietveld upload tool
docs/source/conf.py ALL # sphinx stuff, automatically generated, don't check this
src/whoosh/lang/*.py ALL # 3rd party / crashing py.test with non-ascii stuff
src/whoosh/lang/snowball/*.py ALL # 3rd party
src/whoosh/support/relativedelta.py ALL # 3rd party
src/whoosh/support/charset.py ALL # non-ascii py.test crash
src/whoosh/support/unicode.py ALL # non-ascii py.test crash
10 changes: 5 additions & 5 deletions src/whoosh/analysis/acore.py
@@ -123,9 +123,9 @@ def __init__(self, positions=False, chars=False, removestops=True, mode='',
self.__dict__.update(kwargs)

def __repr__(self):
parms = ", ".join("%s=%r" % (name, value)
parms = ", ".join(f"{name}={value!r}"
for name, value in iteritems(self.__dict__))
return "%s(%s)" % (self.__class__.__name__, parms)
return f"{self.__class__.__name__}({parms})"

def copy(self):
# This is faster than using the copy module
@@ -141,16 +141,16 @@ def __or__(self, other):
from whoosh.analysis.analyzers import CompositeAnalyzer

if not isinstance(other, Composable):
raise TypeError("%r is not composable with %r" % (self, other))
raise TypeError(f"{self!r} is not composable with {other!r}")
return CompositeAnalyzer(self, other)

def __repr__(self):
attrs = ""
if self.__dict__:
attrs = ", ".join("%s=%r" % (key, value)
attrs = ", ".join(f"{key}={value!r}"
for key, value
in iteritems(self.__dict__))
return self.__class__.__name__ + "(%s)" % attrs
return self.__class__.__name__ + f"({attrs})"

def has_morph(self):
return self.is_morph
7 changes: 3 additions & 4 deletions src/whoosh/analysis/analyzers.py
@@ -46,7 +46,7 @@ class Analyzer(Composable):
"""Abstract base class for analyzers."""

def __repr__(self):
return "%s()" % self.__class__.__name__
return f"{self.__class__.__name__}()"

def __eq__(self, other):
return (
@@ -59,6 +59,7 @@ def __call__(self, value, **kwargs):
raise NotImplementedError

def clean(self):
# This method is intentionally left empty.
pass


@@ -78,8 +79,7 @@ def __init__(self, *composables):
for item in self.items[1:]:
if isinstance(item, Tokenizer):
raise CompositionError(
"Only one tokenizer allowed at the start"
" of the analyzer: %r" % self.items
f"Only one tokenizer allowed at the start of the analyzer: {self.items}"
)

def __repr__(self):
@@ -239,7 +239,6 @@ def FancyAnalyzer(
expression=r"\s+",
stoplist=STOP_WORDS,
minsize=2,
maxsize=None,
gaps=True,
splitwords=True,
splitnums=True,
8 changes: 4 additions & 4 deletions src/whoosh/analysis/filters.py
@@ -114,7 +114,7 @@ def __eq__(self, other):
)

def __ne__(self, other):
return not self == other
return not self.__eq__(other)  # delegating to != here would recurse into __ne__

def __call__(self, tokens):
raise NotImplementedError
@@ -181,8 +181,8 @@ def __eq__(self, other):
def __call__(self, tokens):
# Only selects on the first token
t = next(tokens)
filter = self.filters.get(t.mode, self.default_filter)
return filter(chain([t], tokens))
selected_filter = self.filters.get(t.mode, self.default_filter)
return selected_filter(chain([t], tokens))


class TeeFilter(Filter):
@@ -212,7 +212,7 @@ class TeeFilter(Filter):

def __init__(self, *filters):
if len(filters) < 2:
raise Exception("TeeFilter requires two or more filters")
raise ValueError("TeeFilter requires two or more filters")
self.filters = filters

def __eq__(self, other):
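One caution on the `__ne__` hunk in filters.py: `__ne__` must delegate to `==` (or `__eq__`), never to `!=`, since `self != other` inside `__ne__` calls `__ne__` again and recurses until the stack overflows. A minimal sketch of the safe pattern (class and field hypothetical):

```python
class Filter:
    def __init__(self, name):
        self.name = name

    def __eq__(self, other):
        # Sketch: returns False for foreign types instead of NotImplemented.
        return (other is not None
                and self.__class__ is other.__class__
                and self.name == other.name)

    def __ne__(self, other):
        # Delegate to __eq__; `self != other` here would recurse infinitely.
        return not self == other

assert Filter("stop") == Filter("stop")
assert Filter("stop") != Filter("stem")
```

In Python 3, omitting `__ne__` entirely is also safe: the default implementation already negates `__eq__`.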
2 changes: 1 addition & 1 deletion src/whoosh/analysis/ngrams.py
@@ -81,7 +81,7 @@ def __call__(
mode="",
**kwargs
):
assert isinstance(value, text_type), "%r is not unicode" % value
assert isinstance(value, text_type), f"{value!r} is not unicode"

inlen = len(value)
t = Token(positions, chars, removestops=removestops, mode=mode)
8 changes: 4 additions & 4 deletions src/whoosh/analysis/tokenizers.py
@@ -64,7 +64,7 @@ def __call__(
mode="",
**kwargs
):
assert isinstance(value, text_type), "%r is not unicode" % value
assert isinstance(value, text_type), f"{value!r} is not unicode"
t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
t.text = value
t.boost = 1.0
@@ -132,7 +132,7 @@ def __call__(
:param tokenize: if True, the text should be tokenized.
"""

assert isinstance(value, text_type), "%s is not unicode" % repr(value)
assert isinstance(value, text_type), f"{value!r} is not unicode"

t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
if not tokenize:
@@ -265,7 +265,7 @@ def __call__(
:param tokenize: if True, the text should be tokenized.
"""

assert isinstance(value, text_type), "%r is not unicode" % value
assert isinstance(value, text_type), f"{value!r} is not unicode"

t = Token(positions, chars, removestops=removestops, mode=mode, **kwargs)
if not tokenize:
@@ -353,7 +353,7 @@ def __init__(self, expression="[^/]+"):
self.expr = rcompile(expression)

def __call__(self, value, positions=False, start_pos=0, **kwargs):
assert isinstance(value, text_type), "%r is not unicode" % value
assert isinstance(value, text_type), f"{value!r} is not unicode"
token = Token(positions, **kwargs)
pos = start_pos
for match in self.expr.finditer(value):
4 changes: 2 additions & 2 deletions src/whoosh/automata/fsa.py
@@ -19,7 +19,7 @@ def __init__(self, name):
self.name = name

def __repr__(self):
return "<%s>" % self.name
return f"<{self.name}>"


EPSILON = Marker("EPSILON")
@@ -636,7 +636,7 @@ def __init__(self, n):
self.final = False

def __repr__(self):
return "<%s, %r>" % (self.n, self.tuple())
return f"<{self.n}, {self.tuple()!r}>"

def __hash__(self):
return hash(self.tuple())
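The fsa.py hunks use the `!r` conversion flag inside f-strings; it is the f-string equivalent of `%r` and of calling `repr()`. A small sketch mirroring the `Marker.__repr__` change:

```python
class Marker:
    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return f"<{self.name}>"

m = Marker("EPSILON")
assert repr(m) == "<EPSILON>"
assert f"{m!r}" == "<EPSILON>"  # !r calls repr()
assert f"{'a'!r}" == "'a'"      # strings gain quotes under !r, unlike plain {}
```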