small fixes regarding documentation, import syntax, etc.

Noble-Lab · Aug 12, 2024 · 7cb8e14 · 7cb8e14
1 parent fe8794d
commit 7cb8e14
Show file tree

Hide file tree

Showing 7 changed files with 137 additions and 181 deletions.
diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py
@@ -130,7 +130,7 @@ def sequence(
 ) -> None:
     """De novo sequence peptides from tandem mass spectra.
 
-    PEAK_PATH must be one or more mzMl, mzXML, or MGF files from which
+    PEAK_PATH must be one or more mzML, mzXML, or MGF files from which
     to sequence peptides.
     """
     output = setup_logging(output, verbosity)
@@ -205,7 +205,7 @@ def sequence(
 )
 @click.option(
     "--digestion",
-    help="Digestion: full, partial",
+    help="Full: standard digestion. Semi: Include products of semi-specific cleavage",
     type=click.Choice(
         ["full", "partial"],
         case_sensitive=False,
@@ -214,37 +214,41 @@ def sequence(
 )
 @click.option(
     "--missed_cleavages",
-    help="Number of allowed missed cleavages",
+    help="Number of allowed missed cleavages when digesting protein",
     type=int,
     default=0,
 )
 @click.option(
     "--max_mods",
-    help="Maximum number of modifications per peptide",
+    help="Maximum number of amino acid modifications per peptide",
     type=int,
     default=0,
 )
 @click.option(
-    "--min_length",
-    help="Minimum peptide length",
+    "--min_peptide_length",
+    help="Minimum peptide length to consider",
     type=int,
     default=6,
 )
 @click.option(
-    "--max_length",
-    help="Maximum peptide length",
+    "--max_peptide_length",
+    help="Maximum peptide length to consider",
     type=int,
     default=50,
 )
 @click.option(
     "--precursor_tolerance",
-    help="Precursor tolerance window size (ppm)",
-    type=int,
+    help="Precursor tolerance window size (units: ppm)",
+    type=float,
     default=20,
 )
 @click.option(
     "--isotope_error",
-    help="Isotope error levels to consider (list of ints, e.g: 1,2)",
+    help="Isotope error levels to consider. \
+        Creates multiple mass windows to consider per spectrum \
+        to account for observed mass not matching monoisotopic mass \
+        due to the instrument assigning the 13C isotope \
+        peak as the precursor (list of ints, e.g: 1,2)",
     type=str,
     default="0",
 )
@@ -255,9 +259,9 @@ def db_search(
     digestion: str,
     missed_cleavages: int,
     max_mods: int,
-    min_length: int,
-    max_length: int,
-    precursor_tolerance: int,
+    min_peptide_length: int,
+    max_peptide_length: int,
+    precursor_tolerance: float,
     isotope_error: str,
     model: Optional[str],
     config: Optional[str],
@@ -266,7 +270,8 @@ def db_search(
 ) -> None:
     """Perform a database search on MS/MS data using Casanovo-DB.
 
-    PEAK_PATH must be one MGF file. FASTA_PATH must be one FASTA file.
+    PEAK_PATH must be one or more mzML, mzXML, or MGF files.
+    FASTA_PATH must be one FASTA file.
     """
     output = setup_logging(output, verbosity)
     config, model = setup_model(model, config, output, False)
@@ -284,8 +289,8 @@ def db_search(
             digestion,
             missed_cleavages,
             max_mods,
-            min_length,
-            max_length,
+            min_peptide_length,
+            max_peptide_length,
             precursor_tolerance,
             isotope_error,
             output,

diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py
@@ -1,15 +1,16 @@
 """Unique methods used within db-search mode"""
 
-import os
-import depthcharge.masses
-from pyteomics import fasta, parser
 import bisect
 import logging
-
+import os
 from typing import List, Tuple
 
+import depthcharge.masses
+from pyteomics import fasta, parser
+
 logger = logging.getLogger("casanovo")
 
+
 # CONSTANTS
 HYDROGEN = 1.007825035
 OXYGEN = 15.99491463
@@ -51,8 +52,8 @@ def digest_fasta(
     digestion: str,
     missed_cleavages: int,
     max_mods: int,
-    min_length: int,
-    max_length: int,
+    min_peptide_length: int,
+    max_peptide_length: int,
 ):
     """
     Digests a FASTA file and returns the peptides, their masses, and associated protein.
@@ -70,9 +71,9 @@ def digest_fasta(
         The number of missed cleavages to allow.
     max_mods : int
         The maximum number of modifications to allow per peptide.
-    min_length : int
+    min_peptide_length : int
         The minimum length of peptides to consider.
-    max_length : int
+    max_peptide_length : int
         The maximum length of peptides to consider.
 
     Returns
@@ -81,35 +82,36 @@ def digest_fasta(
         A list of tuples containing the peptide sequence, mass,
         and associated protein. Sorted by neutral mass in ascending order.
     """
-
-    # Verify the eistence of the file:
+    # Verify the existence of the file:
     if not os.path.isfile(fasta_filename):
-        print(f"File {fasta_filename} does not exist.")
+        logger.error("File %s does not exist.", fasta_filename)
         raise FileNotFoundError(f"File {fasta_filename} does not exist.")
 
     fasta_data = fasta.read(fasta_filename)
     peptide_list = []
-    if digestion in ["full", "partial"]:
-        semi = True if digestion == "partial" else False
-        for header, seq in fasta_data:
-            pep_set = parser.cleave(
-                seq,
-                rule=parser.expasy_rules[enzyme],
-                missed_cleavages=missed_cleavages,
-                semi=semi,
-            )
-            protein = header.split()[0]
-            for pep in pep_set:
-                if len(pep) < min_length or len(pep) > max_length:
-                    continue
-                if "X" in pep or "U" in pep:
-                    logger.warn(
-                        "Skipping peptide with ambiguous amino acids: %s", pep
-                    )
-                    continue
-                peptide_list.append((pep, protein))
-    else:
+    if digestion not in ["full", "partial"]:
+        logger.error("Digestion type %s not recognized.", digestion)
         raise ValueError(f"Digestion type {digestion} not recognized.")
+    semi = digestion == "partial"
+    for header, seq in fasta_data:
+        pep_set = parser.cleave(
+            seq,
+            rule=parser.expasy_rules[enzyme],
+            missed_cleavages=missed_cleavages,
+            semi=semi,
+        )
+        protein = header.split()[0]
+        for pep in pep_set:
+            if len(pep) < min_peptide_length or len(pep) > max_peptide_length:
+                continue
+            if any(
+                aa in pep for aa in "BJOUXZ"
+            ):  # Check for incorrect AA letters
+                logger.warn(
+                    "Skipping peptide with ambiguous amino acids: %s", pep
+                )
+                continue
+            peptide_list.append((pep, protein))
 
     # Generate modified peptides
     mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb")
@@ -136,7 +138,7 @@ def get_candidates(
     precursor_mz: float,
     charge: int,
     peptide_list: List[Tuple[str, float, str]],
-    precursor_tolerance: int,
+    precursor_tolerance: float,
     isotope_error: str,
 ):
     """
@@ -156,7 +158,6 @@ def get_candidates(
     isotope_error : str
         The isotope error levels to consider.
     """
-
     candidates = set()
 
     isotope_error = [int(x) for x in isotope_error.split(",")]
@@ -219,7 +220,9 @@ def _to_raw_mass(mz_mass, charge):
 
 
 def get_mass_indices(masses, m_low, m_high):
-    """Grabs mass indices from a list of mass values that fall within a specified range.
+    """Grabs mass indices that fall within a specified range.
+
+    Pulls from masses, a list of mass values.
     Requires that the mass values are sorted in ascending order.
 
     Parameters

diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py
@@ -2,20 +2,20 @@
 
 import functools
 import os
-from typing import List, Optional, Tuple
-from functools import partial
 import logging
+from typing import List, Optional, Tuple
 
+from depthcharge.data import AnnotatedSpectrumIndex
 import lightning.pytorch as pl
 import numpy as np
 import torch
-from depthcharge.data import AnnotatedSpectrumIndex
 
+from ..data import db_utils
 from ..data.datasets import (
     AnnotatedSpectrumDataset,
     SpectrumDataset,
 )
-from ..data import db_utils
+
 
 logger = logging.getLogger("casanovo")
 
@@ -186,7 +186,7 @@ def db_dataloader(self) -> torch.utils.data.DataLoader:
         return torch.utils.data.DataLoader(
             self.test_dataset,
             batch_size=self.eval_batch_size,
-            collate_fn=partial(
+            collate_fn=functools.partial(
                 prepare_psm_batch,
                 digest=self.digest,
                 precursor_tolerance=self.precursor_tolerance,

diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py
@@ -16,7 +16,7 @@
 
 from . import evaluate
 from .. import config
-from ..data import ms_io, db_utils
+from ..data import ms_io
 
 logger = logging.getLogger("casanovo")
 
@@ -991,7 +991,8 @@ def configure_optimizers(
 
 class DbSpec2Pep(Spec2Pep):
     """
-    Subclass of Spec2Pep for the use of Casanovo as an MS/MS database search score function.
+    Subclass of Spec2Pep for the use of Casanovo as an \
+    MS/MS database search score function.
 
     Uses teacher forcing to 'query' Casanovo for its score for each AA
     within a candidate peptide, and takes the geometric average of these scores
@@ -1008,7 +1009,6 @@ class DbSpec2Pep(Spec2Pep):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        self.total_psms = 0
         self.psm_batch_size = 1024
 
     def predict_step(self, batch, *args):
@@ -1029,11 +1029,14 @@ def predict_step(self, batch, *args):
             scores, amino acid-level scores, and associated proteins.
         """
         predictions = []
-        while len(batch[0]) > 0:
-            next_batch = [b[self.psm_batch_size :] for b in batch]
-            batch = [b[: self.psm_batch_size] for b in batch]
+        for start_idx in range(0, len(batch[0]), self.psm_batch_size):
+            current_batch = [
+                b[start_idx : start_idx + self.psm_batch_size] for b in batch
+            ]
             pred, truth = self.decoder(
-                batch[3], batch[1], *self.encoder(batch[0])
+                current_batch[3],
+                current_batch[1],
+                *self.encoder(current_batch[0]),
             )
             pred = self.softmax(pred)
             all_scores, per_aa_scores = _calc_match_score(
@@ -1048,13 +1051,13 @@ def predict_step(self, batch, *args):
                 peptide,
                 protein,
             ) in zip(
-                batch[1][:, 1].cpu().detach().numpy(),
-                batch[1][:, 2].cpu().detach().numpy(),
-                batch[2],
+                current_batch[1][:, 1].cpu().detach().numpy(),
+                current_batch[1][:, 2].cpu().detach().numpy(),
+                current_batch[2],
                 all_scores.cpu().detach().numpy(),
                 per_aa_scores.cpu().detach().numpy(),
-                batch[3],
-                batch[4],
+                current_batch[3],
+                current_batch[4],
             ):
                 predictions.append(
                     (
@@ -1067,8 +1070,6 @@ def predict_step(self, batch, *args):
                         protein,
                     )
                 )
-            batch = next_batch
-        self.total_psms += len(predictions)
         return predictions
 
     def on_predict_batch_end(
@@ -1088,8 +1089,6 @@ def on_predict_batch_end(
             aa_scores,
             protein,
         ) in outputs:
-            if len(peptide) == 0:
-                continue
             self.out_writer.psms.append(
                 (
                     peptide,