db_utils fixes

Noble-Lab · Nov 3, 2024 · 68e67e8 · 68e67e8
1 parent 2b6198b
commit 68e67e8
Show file tree

Hide file tree

Showing 4 changed files with 85 additions and 90 deletions.
diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py
@@ -236,6 +236,7 @@ def db_search(
     config, model = setup_model(
         model, config, output_path, output_root_name, False
     )
+    start_time = time.time()
     with ModelRunner(
         config,
         model,
@@ -246,6 +247,7 @@ def db_search(
         logger.info("Performing database search on:")
         for peak_file in peak_path:
             logger.info("  %s", peak_file)
+
         logger.info("Using the following FASTA file:")
         logger.info("  %s", fasta_path)
 
@@ -254,8 +256,7 @@ def db_search(
             fasta_path,
             str((output_path / output_root).with_suffix(".mztab")),
         )
-
-    logger.info("DONE!")
+        utils.log_run_report(start_time=start_time, end_time=time.time())
 
 
 @main.command(cls=_SharedParams)

diff --git a/casanovo/config.yaml b/casanovo/config.yaml
@@ -23,6 +23,13 @@ max_peptide_len: 100
 predict_batch_size: 1024
 # Number of PSMs for each spectrum.
 top_match: 1
+# The hardware accelerator to use. Must be one of:
+# "cpu", "gpu", "tpu", "ipu", "hpu", "mps", or "auto".
+accelerator: "auto"
+# The devices to use. Can be set to a positive number int, or the value -1 to
+# indicate all available devices should be used. If left empty, the appropriate
+# number will be automatically selected for based on the chosen accelerator.
+devices:
 
 
 ###
@@ -31,13 +38,6 @@ top_match: 1
 
 # Number of beams used in beam search.
 n_beams: 1
-# The hardware accelerator to use. Must be one of:
-# "cpu", "gpu", "tpu", "ipu", "hpu", "mps", or "auto".
-accelerator: "auto"
-# The devices to use. Can be set to a positive number int, or the value -1 to
-# indicate all available devices should be used. If left empty, the appropriate
-# number will be automatically selected for based on the chosen accelerator.
-devices:
 
 
 ###
@@ -46,7 +46,7 @@ devices:
 
 # Enzyme for in silico digestion, used to generate candidate peptides.
 # See pyteomics.parser.expasy_rules for valid enzymes.
-# Can also take a regex expression to specify custom digestion rules.
+# Can also take a regex to specify custom digestion rules.
 enzyme: "trypsin"
 # Digestion type for candidate peptide generation.
 # full: standard digestion. 
@@ -60,9 +60,9 @@ missed_cleavages: 0
 max_mods: 1
 # Select which modifications from the vocabulary can be used in candidate creation.
 # Format: Comma-separated list of "aa:mod_residue", 
-# where aa is a standard amino acid or "nterm" for an N-terminal mod
+# where aa is a standard amino acid (or "nterm" for an N-terminal mod)
 # and mod_residue is a key from the "residues" dictionary.
-# Example: "M:M+15.995,X:+43.006-17.027"
+# Example: "M:M+15.995,nterm:+43.006"
 allowed_fixed_mods: "C:C+57.021"
 allowed_var_mods: "M:M+15.995,N:N+0.984,Q:Q+0.984,nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027"
 

diff --git a/casanovo/data/datasets.py b/casanovo/data/datasets.py
@@ -1,6 +1,6 @@
 """A PyTorch Dataset class for annotated spectra."""
 
-from typing import List, Optional, Tuple
+from typing import Optional, Tuple
 
 import depthcharge
 import numpy as np

diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py
@@ -24,7 +24,8 @@
 
 class ProteinDatabase:
     """
-    Store digested .fasta data and return candidate peptides for a given precursor mass.
+    Store digested .fasta data and return candidate peptides
+    for a given precursor mass.
 
     Parameters
     ----------
@@ -34,7 +35,8 @@ class ProteinDatabase:
         The enzyme to use for digestion.
         See pyteomics.parser.expasy_rules for valid enzymes.
     digestion : str
-        The type of digestion to perform. Either 'full' or 'partial'.
+        The type of digestion to perform.
+        Either 'full', 'partial' or 'non-specific'.
     missed_cleavages : int
         The number of missed cleavages to allow.
     min_peptide_len : int
@@ -46,12 +48,13 @@ class ProteinDatabase:
     precursor_tolerance : float
         The precursor mass tolerance in ppm.
     isotope_error : Tuple[int, int]
-        Isotope range [min, max] to consider when comparing predicted and observed precursor m/z's.
+        Isotope range [min, max] to consider when comparing predicted
+        and observed precursor m/z's.
     allowed_fixed_mods : str
         A comma separated string of fixed modifications to consider.
     allowed_var_mods : str
         A comma separated string of variable modifications to consider.
-    residues : dict
+    residues : dict[str, float]
         A dictionary of amino acid masses.
     """
 
@@ -68,7 +71,7 @@ def __init__(
         isotope_error: Tuple[int, int],
         allowed_fixed_mods: str,
         allowed_var_mods: str,
-        residues: dict,
+        residues: dict[str, float],
     ):
         self.fixed_mods, self.var_mods, self.swap_map = _construct_mods_dict(
             allowed_fixed_mods, allowed_var_mods
@@ -84,7 +87,7 @@ def __init__(
             missed_cleavages,
             min_peptide_len,
             max_peptide_len,
-            set(list(residues.keys()) + ["C"]),
+            set([aa[0] for aa in residues.keys() if aa[0].isalpha()]),
         )
         self.db_peptides, self.prot_map = self._digest_fasta(peptide_generator)
         self.precursor_tolerance = precursor_tolerance
@@ -94,9 +97,10 @@ def get_candidates(
         self,
         precursor_mz: float,
         charge: int,
-    ) -> List[Tuple[str, str]]:
+    ) -> pd.Series:
         """
-        Returns a list of candidate peptides that fall within the specified mass range.
+        Returns a list of candidate peptides that fall within the
+        specified mass range.
 
         Parameters
         ----------
@@ -115,7 +119,7 @@ def get_candidates(
         for e in range(self.isotope_error[0], self.isotope_error[1] + 1):
             iso_shift = ISOTOPE_SPACING * e
             shift_raw_mass = float(
-                _to_raw_mass(precursor_mz, charge) - iso_shift
+                _to_neutral_mass(precursor_mz, charge) - iso_shift
             )
             upper_bound = shift_raw_mass * (
                 1 + (self.precursor_tolerance / 1e6)
@@ -154,9 +158,10 @@ def get_associated_protein(self, peptide: str) -> str:
     def _digest_fasta(
         self,
         peptide_generator: Iterator[Tuple[str, str]],
-    ) -> Tuple[pd.DataFrame, dict]:
+    ) -> Tuple[pd.DataFrame, dict[str, str]]:
         """
-        Digests a FASTA file and returns the peptides, their masses, and associated protein.
+        Digests a FASTA file and returns the peptides, their masses,
+        and associated protein.
 
         Parameters
         ----------
@@ -168,13 +173,9 @@ def _digest_fasta(
         pep_table : pd.DataFrame
             A Pandas DataFrame with peptide and mass columns.
             Sorted by neutral mass in ascending order.
-        prot_map : dict
+        prot_map : dict[str, str]
             A dictionary mapping peptides to associated proteins.
         """
-        peptide_list = []
-        for pep, prot in peptide_generator:
-            peptide_list.append((pep, prot))
-
         # Generate modified peptides
         mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb")
         peptide_isoforms = [
@@ -187,7 +188,7 @@ def _digest_fasta(
                 ),
                 prot,
             )
-            for pep, prot in peptide_list
+            for pep, prot in peptide_generator
         ]
         mod_peptide_list = [
             (mod_pep, mass_calculator.mass(mod_pep), prot)
@@ -203,9 +204,9 @@ def _digest_fasta(
         ]
 
         # Create a dictionary mapping for easy accession of associated proteins
-        prot_map = defaultdict(list)
+        prot_map = defaultdict(set)
         for pep, _, prot in mod_peptide_list:
-            prot_map[pep].append(prot)
+            prot_map[pep].add(prot)
 
         # Create a DataFrame for easy sorting and filtering
         pep_table = pd.DataFrame(
@@ -227,8 +228,8 @@ def _peptide_generator(
     enzyme: str,
     digestion: str,
     missed_cleavages: int,
-    min_peptide_length: int,
-    max_peptide_length: int,
+    min_peptide_len: int,
+    max_peptide_len: int,
     valid_aa: set[str],
 ) -> Iterator[str]:
     """
@@ -242,14 +243,15 @@ def _peptide_generator(
     enzyme : str
         The enzyme to use for digestion.
         See pyteomics.parser.expasy_rules for valid enzymes.
-        Can also be a regex pattern.
+        Can also be a regex.
     digestion : str
-        The type of digestion to perform. Either 'full', 'partial' or 'non-specific'.
+        The type of digestion to perform.
+        Either 'full', 'partial' or 'non-specific'.
     missed_cleavages : int
         The number of missed cleavages to allow.
-    min_peptide_length : int
+    min_peptide_len : int
         The minimum length of peptides to consider.
-    max_peptide_length : int
+    max_peptide_len : int
         The maximum length of peptides to consider.
     valid_aa : set[str]
         A set of valid amino acids.
@@ -261,19 +263,6 @@ def _peptide_generator(
     protein : str
         The associated protein.
     """
-    # Verify the existence of the file:
-    if not os.path.isfile(fasta_filename):
-        logger.error("File %s does not exist.", fasta_filename)
-        raise FileNotFoundError(f"File {fasta_filename} does not exist.")
-    if digestion not in ["full", "partial", "non-specific"]:
-        logger.error("Digestion type %s not recognized.", digestion)
-        raise ValueError(f"Digestion type {digestion} not recognized.")
-    if enzyme not in parser.expasy_rules:
-        logger.info(
-            "Enzyme %s not recognized. Interpreting as cleavage rule.",
-            enzyme,
-        )
-
     # Verify the existence of the file:
     if not os.path.isfile(fasta_filename):
         logger.error("File %s does not exist.", fasta_filename)
@@ -292,12 +281,12 @@ def _peptide_generator(
             # Generate all possible peptides
             for i in range(len(seq)):
                 for j in range(
-                    i + min_peptide_length,
-                    min(i + max_peptide_length + 1, len(seq) + 1),
+                    i + min_peptide_len,
+                    min(i + max_peptide_len + 1, len(seq) + 1),
                 ):
                     pep = seq[i:j]
                     if any(aa not in valid_aa for aa in pep):
-                        logger.warn(
+                        logger.warning(
                             "Skipping peptide with unknown amino acids: %s",
                             pep,
                         )
@@ -314,12 +303,9 @@ def _peptide_generator(
             )
             protein = header.split()[0]
             for pep in pep_set:
-                if (
-                    len(pep) >= min_peptide_length
-                    and len(pep) <= max_peptide_length
-                ):
+                if len(pep) >= min_peptide_len and len(pep) <= max_peptide_len:
                     if any(aa not in valid_aa for aa in pep):
-                        logger.warn(
+                        logger.warning(
                             "Skipping peptide with unknown amino acids: %s",
                             pep,
                         )
@@ -348,7 +334,7 @@ def _to_mz(precursor_mass: float, charge: int) -> float:
 
 
 @njit
-def _to_raw_mass(mz_mass: float, charge: int) -> float:
+def _to_neutral_mass(mz_mass: float, charge: int) -> float:
     """
     Convert precursor m/z value to neutral mass.
 
@@ -367,23 +353,33 @@ def _to_raw_mass(mz_mass: float, charge: int) -> float:
     return charge * (mz_mass - PROTON)
 
 
-def _convert_from_modx(seq: str, swap_map: dict, swap_regex: str) -> str:
-    """Converts peptide sequence from modX format to Casanovo-acceptable modifications.
-
-    Args:
-        seq : str
-            Peptide in modX format
-        swap_map : dict
-            Dictionary that allows for swapping of modX to Casanovo-acceptable modifications.
-        swap_regex : str
-            Regular expression to match modX format.
+def _convert_from_modx(
+    seq: str, swap_map: dict[str, str], swap_regex: str
+) -> str:
+    """
+    Converts peptide sequence from modX format to
+    Casanovo-acceptable modifications.
+
+    Parameters:
+    -----------
+    seq : str
+        Peptide in modX format
+    swap_map : dict[str, str]
+        Dictionary that allows for swapping of modX to Casanovo-acceptable modifications.
+    swap_regex : str
+        Regular expression to match modX format.
+
+    Returns:
+    --------
+    swap_regex : str
+        Peptide in Casanovo-acceptable modifications.
     """
     return swap_regex.sub(lambda x: swap_map[x.group()], seq)
 
 
 def _construct_mods_dict(
     allowed_fixed_mods: str, allowed_var_mods: str
-) -> Tuple[dict, dict, dict]:
+) -> Tuple[dict[str, str], dict[str, str], dict[str, str]]:
     """
     Constructs dictionaries of fixed and variable modifications.
 
@@ -396,30 +392,28 @@ def _construct_mods_dict(
 
     Returns
     -------
-    fixed_mods : dict
+    fixed_mods : dict[str, str]
         A dictionary of fixed modifications.
-    var_mods : dict
+    var_mods : dict[str, str]
         A dictionary of variable modifications.
-    swap_map : dict
-        A dictionary that allows for swapping of modX to Casanovo-acceptable modifications.
+    swap_map : dict[str, str]
+        A dictionary that allows for swapping of modX to
+        Casanovo-acceptable modifications.
     """
     swap_map = {}
     fixed_mods = {}
-    for idx, mod in enumerate(allowed_fixed_mods.split(",")):
-        aa, mod_aa = mod.split(":")
-        mod_id = string.ascii_lowercase[idx]
-        fixed_mods[mod_id] = [aa]
-        swap_map[f"{mod_id}{aa}"] = f"{mod_aa}"
-
     var_mods = {}
-    for idx, mod in enumerate(allowed_var_mods.split(",")):
-        aa, mod_aa = mod.split(":")
-        mod_id = string.ascii_lowercase[idx]
-        if aa == "nterm":
-            var_mods[f"{mod_id}-"] = True
-            swap_map[f"{mod_id}-"] = f"{mod_aa}"
-        else:
-            var_mods[mod_id] = [aa]
-            swap_map[f"{mod_id}{aa}"] = f"{mod_aa}"
+    for mod_map, allowed_mods in zip(
+        [fixed_mods, var_mods], [allowed_fixed_mods, allowed_var_mods]
+    ):
+        for idx, mod in enumerate(allowed_mods.split(",")):
+            aa, mod_aa = mod.split(":")
+            mod_id = string.ascii_lowercase[idx]
+            if aa == "nterm":
+                mod_map[f"{mod_id}-"] = True
+                swap_map[f"{mod_id}-"] = f"{mod_aa}"
+            else:
+                mod_map[mod_id] = [aa]
+                swap_map[f"{mod_id}{aa}"] = f"{mod_aa}"
 
     return fixed_mods, var_mods, swap_map