wfondrie · wfondrie · Mar 7, 2024 · Dec 6, 2023 · Dec 6, 2023 · Dec 6, 2023
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -8,7 +8,7 @@ repos:
   - id: trailing-whitespace
   - id: detect-private-key
 - repo: https://github.com/charliermarsh/ruff-pre-commit
-  rev: v0.1.7
+  rev: v0.3.1
   hooks:
     # Run the linter.
     - id: ruff

diff --git a/depthcharge/__init__.py b/depthcharge/__init__.py
@@ -1,4 +1,5 @@
 """Initialize the depthcharge package."""
+
 # Ignore a bunch of pkg_resources warnings from dependencies:
 import warnings
 

diff --git a/depthcharge/constants.py b/depthcharge/constants.py
@@ -1,4 +1,5 @@
 """Constants."""
+
 HYDROGEN = 1.007825035
 OXYGEN = 15.99491463
 H2O = 2 * HYDROGEN + OXYGEN

diff --git a/depthcharge/data/__init__.py b/depthcharge/data/__init__.py
@@ -1,12 +1,13 @@
 """The Pytorch Datasets."""
+
 from . import preprocessing
+from .analyte_datasets import AnalyteDataset
 from .arrow import (
     spectra_to_df,
     spectra_to_parquet,
     spectra_to_stream,
 )
 from .fields import CustomField
-from .peptide_datasets import PeptideDataset
 from .spectrum_datasets import (
     AnnotatedSpectrumDataset,
     SpectrumDataset,

diff --git a/depthcharge/data/peptide_datasets.py → depthcharge/data/analyte_datasets.py b/depthcharge/data/peptide_datasets.py → depthcharge/data/analyte_datasets.py
@@ -1,13 +1,14 @@
 """Datasets for working with peptide sequences."""
+
 from collections.abc import Iterable
 
 import torch
 from torch.utils.data import DataLoader, TensorDataset
 
-from ..tokenizers import PeptideTokenizer
+from ..tokenizers import Tokenizer
 
 
-class PeptideDataset(TensorDataset):
+class AnalyteDataset(TensorDataset):
     """A dataset for peptide sequences.
 
     Parameters
@@ -18,33 +19,26 @@ class PeptideDataset(TensorDataset):
     sequences : Iterable[str]
         The peptide sequences in a format compatible with
         your tokenizer. ProForma is preferred.
-    charges : torch.Tensor,
-        The charge state for each peptide.
     *args : torch.Tensor, optional
         Additional values to include during data loading.
+
     """
 
     def __init__(
         self,
-        tokenizer: PeptideTokenizer,
+        tokenizer: Tokenizer,
         sequences: Iterable[str],
-        charges: torch.Tensor,
         *args: torch.Tensor,
     ) -> None:
         """Initialize an AnalyteDataset."""
         tokens = tokenizer.tokenize(sequences)
-        super().__init__(tokens, charges, *args)
+        super().__init__(tokens, *args)
 
     @property
     def tokens(self) -> torch.Tensor:
         """The peptide sequence tokens."""
         return self.tensors[0]
 
-    @property
-    def charges(self) -> torch.Tensor:
-        """The peptide charges."""
-        return self.tensors[1]
-
     def loader(self, *args: tuple, **kwargs: dict) -> DataLoader:
         """A PyTorch DataLoader for peptides.
 
@@ -61,5 +55,6 @@ def loader(self, *args: tuple, **kwargs: dict) -> DataLoader:
         -------
         torch.utils.data.DataLoader
             A DataLoader for the peptide.
+
         """
         return DataLoader(self, *args, **kwargs)
diff --git a/depthcharge/data/arrow.py b/depthcharge/data/arrow.py
@@ -1,4 +1,5 @@
 """Store spectrum data as Arrow tables."""
+
 from collections.abc import Callable, Generator, Iterable
 from os import PathLike
 from pathlib import Path
@@ -82,6 +83,7 @@ def spectra_to_stream(
     -------
     Generator of pyarrow.RecordBatch
         Batches of parsed spectra.
+
     """
     parser_args = {
         "ms_level": ms_level,
@@ -195,6 +197,7 @@ def spectra_to_parquet(
     -------
     Path
         The Parquet file that was written.
+
     """
     streamer = spectra_to_stream(
         peak_file=peak_file,
@@ -210,12 +213,15 @@ def spectra_to_parquet(
     if parquet_file is None:
         parquet_file = Path(Path(peak_file).stem).with_suffix(".parquet")
 
-    writer = None
-    for batch in streamer:
-        if writer is None:
-            writer = pq.ParquetWriter(parquet_file, schema=batch.schema)
+    try:
+        writer = None
+        for batch in streamer:
+            if writer is None:
+                writer = pq.ParquetWriter(parquet_file, schema=batch.schema)
 
-        writer.write_batch(batch)
+            writer.write_batch(batch)
+    finally:
+        writer.close()
 
     return parquet_file
 
@@ -287,6 +293,7 @@ def spectra_to_df(
     -------
     Path
         The Parquet file that was written.
+
     """
     streamer = spectra_to_stream(
         peak_file=peak_file,

diff --git a/depthcharge/data/fields.py b/depthcharge/data/fields.py
@@ -1,4 +1,5 @@
 """Custom fields for the Arrow Schema."""
+
 from collections.abc import Callable
 from dataclasses import dataclass
 
@@ -24,6 +25,7 @@ class CustomField:
         each spectrum.
     dtype: pyarrow.DataType
         The expected Arrow data type for the column in the schema.
+
     """
 
     name: str

diff --git a/depthcharge/data/parsers.py b/depthcharge/data/parsers.py
@@ -1,4 +1,5 @@
 """Mass spectrometry data parsers."""
+
 from __future__ import annotations
 
 import logging
@@ -44,6 +45,7 @@ class BaseParser(ABC):
         Enable or disable the progress bar.
     id_type : str, optional
         The Hupo-PSI prefix for the spectrum identifier.
+
     """
 
     def __init__(
@@ -111,6 +113,7 @@ def sniff(self) -> None:
         ------
         IOError
             Raised if the file is not the expected format.
+
         """
 
     @abstractmethod
@@ -130,6 +133,7 @@ def parse_spectrum(self, spectrum: dict) -> MassSpectrum | None:
         -------
         MassSpectrum or None
             The parsed mass spectrum or None if it is skipped.
+
         """
 
     def parse_custom_fields(self, spectrum: dict) -> dict[str, Any]:
@@ -144,6 +148,7 @@ def parse_custom_fields(self, spectrum: dict) -> dict[str, Any]:
         -------
         dict
             The parsed value of each, whatever it may be.
+
         """
         out = {}
         if self.custom_fields is None:
@@ -167,6 +172,7 @@ def iter_batches(self, batch_size: int | None) -> pa.RecordBatch:
         ------
         RecordBatch
             A batch of spectra and their metadata.
+
         """
         batch_size = float("inf") if batch_size is None else batch_size
         pbar_args = {
@@ -229,6 +235,7 @@ def _update_batch(self, entry: dict) -> None:
         ----------
         entry : dict
             The element to add.
+
         """
         if self._batch is None:
             self._batch = {k: [v] for k, v in entry.items()}
@@ -264,6 +271,7 @@ class MzmlParser(BaseParser):
         spectrum from the corresponding Pyteomics parser.
     progress : bool, optional
         Enable or disable the progress bar.
+
     """
 
     def sniff(self) -> None:
@@ -273,6 +281,7 @@ def sniff(self) -> None:
         ------
         IOError
             Raised if the file is not the expected format.
+
         """
         with self.peak_file.open() as mzdat:
             next(mzdat)
@@ -295,6 +304,7 @@ def parse_spectrum(self, spectrum: dict) -> MassSpectrum | None:
         -------
         MassSpectrum or None
             The parsed mass spectrum or None if not at the correct MS level.
+
         """
         ms_level = spectrum["ms level"]
         if self.ms_level is not None and ms_level not in self.ms_level:
@@ -363,6 +373,7 @@ class MzxmlParser(BaseParser):
         spectrum from the corresponding Pyteomics parser.
     progress : bool, optional
         Enable or disable the progress bar.
+
     """
 
     def sniff(self) -> None:
@@ -372,6 +383,7 @@ def sniff(self) -> None:
         ------
         IOError
             Raised if the file is not the expected format.
+
         """
         scent = "http://sashimi.sourceforge.net/schema_revision/mzXML"
         with self.peak_file.open() as mzdat:
@@ -395,6 +407,7 @@ def parse_spectrum(self, spectrum: dict) -> MassSpectrum | None:
         -------
         MassSpectrum or None
             The parsed mass spectrum or None if it is skipped.
+
         """
         ms_level = spectrum["msLevel"]
         if self.ms_level is not None and ms_level not in self.ms_level:
@@ -442,6 +455,7 @@ class MgfParser(BaseParser):
         spectrum from the corresponding Pyteomics parser.
     progress : bool, optional
         Enable or disable the progress bar.
+
     """
 
     def __init__(
@@ -476,6 +490,7 @@ def sniff(self) -> None:
         ------
         IOError
             Raised if the file is not the expected format.
+
         """
         with self.peak_file.open() as mzdat:
             if not next(mzdat).startswith("BEGIN IONS"):
@@ -492,6 +507,7 @@ def parse_spectrum(self, spectrum: dict) -> MassSpectrum:
         ----------
         spectrum : dict
             The dictionary defining the spectrum in MGF format.
+
         """
         self._counter += 1
         if self.ms_level is not None and 1 not in self.ms_level:
@@ -531,6 +547,7 @@ def _parse_scan_id(scan_str: str | int) -> int:
     -------
     int
         The scan ID number.
+
     """
     try:
         return int(scan_str)
@@ -565,6 +582,7 @@ def get_parser(cls, peak_file: PathLike, **kwargs: dict) -> BaseParser:
             The peak file to parse.
         kwargs : dict
             Keyword arguments to pass to the parser.
+
         """
         for parser in cls.parsers:
             try:

diff --git a/depthcharge/data/preprocessing.py b/depthcharge/data/preprocessing.py
@@ -39,6 +39,7 @@ def my_func(spec: MassSpectrum) -> MassSpectrum:
 ```
 
 """
+
 from collections.abc import Callable
 from functools import wraps
 
@@ -79,6 +80,7 @@ def wrapper(
         -------
         Callable
             A valid depthcharge preprocessing function.
+
         """
 
         @wraps(wrapper)
@@ -94,6 +96,7 @@ def preprocess(spec: MassSpectrum) -> MassSpectrum:
             -------
             MassSpectrum
                 The processed mass spectrum.
+
             """
             # Call the spectrum_utils method:
             getattr(spec, func)(*args, **kwargs)