Noble-Lab · VarunAnanth2003 · Sep 13, 2024 · Jul 2, 2024 · Jul 3, 2024 · Jul 3, 2024
diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py
@@ -42,7 +42,6 @@
 from . import utils
 from .denovo import ModelRunner
 from .config import Config
-from .data.annotate_db import annotate_mgf
 
 logger = logging.getLogger("casanovo")
 click.rich_click.USE_MARKDOWN = True
@@ -131,7 +130,7 @@ def sequence(
 ) -> None:
     """De novo sequence peptides from tandem mass spectra.
 
-    PEAK_PATH must be one or more mzMl, mzXML, or MGF files from which
+    PEAK_PATH must be one or more mzML, mzXML, or MGF files from which
     to sequence peptides.
     """
     output = setup_logging(output, verbosity)
@@ -146,91 +145,46 @@ def sequence(
     logger.info("DONE!")
 
 
-@main.command()
+@main.command(cls=_SharedParams)
 @click.argument(
     "peak_path",
     required=True,
-    nargs=1,
+    nargs=-1,
     type=click.Path(exists=True, dir_okay=False),
 )
 @click.argument(
-    "tide_path",
+    "fasta_path",
     required=True,
     nargs=1,
-    type=click.Path(exists=True, dir_okay=True),
-)
-@click.option(
-    "-o",
-    "--output",
-    help="The output annotated MGF file.",
-    type=click.Path(dir_okay=False),
-)
-@click.option(
-    "-v",
-    "--verbosity",
-    help="""
-    Set the verbosity of console logging messages. Log files are
-    always set to 'debug'.
-    """,
-    type=click.Choice(
-        ["debug", "info", "warning", "error"],
-        case_sensitive=False,
-    ),
-    default="info",
-)
-def annotate(
-    peak_path: str,
-    tide_path: str,
-    output: Optional[str],
-    verbosity: str,
-) -> None:
-    """Annotate a given .mgf with candidates as selected by a Tide search for Casanovo-DB.
-
-    PEAK_PATH must be one MGF file from which to annotate spectra.
-
-    TIDE_PATH must be one directory containing the Tide search results of the <PEAK_PATH> .mgf.
-    This directory must contain tide-search.decoy.txt and tide-search.target.txt
-    """
-    if output is None:
-        output = setup_logging(output, verbosity)
-        logger.info(
-            "Output file not specified. \
-            Annotated MGF will be saved in the same directory \
-            as the input MGF."
-        )
-        output = peak_path.replace(".mgf", "_annotated.mgf")
-    else:
-        output = setup_logging(output, verbosity)
-
-    annotate_mgf(peak_path, tide_path, output)
-
-    logger.info("DONE!")
-
-
-@main.command(cls=_SharedParams)
-@click.argument(
-    "peak_path",
-    required=True,
-    nargs=-1,
     type=click.Path(exists=True, dir_okay=False),
 )
 def db_search(
     peak_path: Tuple[str],
+    fasta_path: str,
     model: Optional[str],
     config: Optional[str],
     output: Optional[str],
     verbosity: str,
 ) -> None:
-    """Perform a search using Casanovo-DB.
+    """Perform a database search on MS/MS data using Casanovo-DB.
 
-    PEAK_PATH must be one MGF file that has ANNOTATED spectra,
-    as output by annotate mode.
+    PEAK_PATH must be one or more mzML, mzXML, or MGF files.
+    FASTA_PATH must be one FASTA file.
     """
     output = setup_logging(output, verbosity)
     config, model = setup_model(model, config, output, False)
     with ModelRunner(config, model) as runner:
-        logger.info("DB-searching peptides from: %s", peak_path)
-        runner.db_search(peak_path, output)
+        logger.info("Performing database search on:")
+        for peak_file in peak_path:
+            logger.info("  %s", peak_file)
+        logger.info("Using the following FASTA file:")
+        logger.info("  %s", fasta_path)
+
+        runner.db_search(
+            peak_path,
+            fasta_path,
+            output,
+        )
 
     logger.info("DONE!")
 

diff --git a/casanovo/config.yaml b/casanovo/config.yaml
@@ -5,18 +5,28 @@
 
 ###
 # The following parameters can be modified when running inference or when
-# fine-tuning an existing Casanovo model.
+# fine-tuning an existing Casanovo model. They are also used when running
+# Casanovo in database search (DB-search) mode.
 ###
 
 # Max absolute difference allowed with respect to observed precursor m/z.
-# Predictions outside the tolerance range are assigned a negative peptide score.
+# denovo: Predictions outside the tolerance range are assigned a negative peptide score.
+# db-search: Select candidate peptides within the specified precursor m/z tolerance.
 precursor_mass_tol: 50  # ppm
 # Isotopes to consider when comparing predicted and observed precursor m/z's.
 isotope_error_range: [0, 1]
-# The minimum length of predicted peptides.
+# The minimum length of considered peptides.
 min_peptide_len: 6
+# The maximum length of considered peptides.
+max_length: 100
 # Number of spectra in one inference batch.
 predict_batch_size: 1024
+
+
+###
+# The following parameters are unique to Casanovo's inference/fine-tuning mode.
+###
+
 # Number of beams used in beam search.
 n_beams: 1
 # Number of PSMs for each spectrum.
@@ -29,6 +39,32 @@ accelerator: "auto"
 # number will be automatically selected for based on the chosen accelerator.
 devices:
 
+
+###
+# The following parameters are unique to Casanovo's database search mode.
+###
+
+# Enzyme for in silico digestion, used to generate candidate peptides.
+# See pyteomics.parser.expasy_rules for valid enzymes.
+enzyme: "trypsin"
+# Digestion type for candidate peptide generation.
+# full: standard digestion. semi: include products of semi-specific cleavage.
+# Can also take a regular expression to specify custom digestion rules.
+digestion: "full"
+# Number of allowed missed cleavages when digesting protein.
+missed_cleavages: 0
+# Maximum number of amino acid modifications per peptide.
+# If None, all possible isoforms are generated as candidates.
+max_mods: 1
+# Select which modifications from the vocabulary can be used in candidate creation.
+# Format: comma-separated list of "aa:mod_residue",
+# where aa is a standard amino acid (or "X" for an N-terminal mod)
+# and mod_residue is a key from the "residues" dictionary.
+# Example: "M:M+15.995,X:+43.006-17.027"
+allowed_fixed_mods: "C:C+57.021"
+allowed_var_mods: "M:M+15.995,N:N+0.984,Q:Q+0.984,X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027"
+
+
 ###
 # The following parameters should only be modified if you are training a new
 # Casanovo model from scratch.
@@ -78,8 +114,6 @@ dropout: 0.0
 # Number of dimensions to use for encoding peak intensity.
 # Projected up to `dim_model` by default and summed with the peak m/z encoding.
 dim_intensity:
-# Max decoded peptide length.
-max_length: 100
 # The number of iterations for the linear warm-up of the learning rate.
 warmup_iters: 100_000
 # The number of iterations for the cosine half period of the learning rate.

diff --git a/casanovo/data/annotate_db.py b/casanovo/data/annotate_db.py
diff --git a/casanovo/data/datasets.py b/casanovo/data/datasets.py
@@ -1,6 +1,6 @@
 """A PyTorch Dataset class for annotated spectra."""
 
-from typing import Optional, Tuple
+from typing import List, Optional, Tuple
 
 import depthcharge
 import numpy as np
@@ -212,8 +212,6 @@ class AnnotatedSpectrumDataset(SpectrumDataset):
     random_state : Optional[int]
         The NumPy random state. ``None`` leaves mass spectra in the order they
         were parsed.
-    track_spectrum_id : Optional[bool]
-        Whether to keep track of the identifier of the MS/MS spectra.
     """
 
     def __init__(
@@ -225,7 +223,6 @@ def __init__(
         min_intensity: float = 0.01,
         remove_precursor_tol: float = 2.0,
         random_state: Optional[int] = None,
-        track_spectrum_id: Optional[bool] = False,
     ):
         super().__init__(
             annotated_spectrum_index,
@@ -236,7 +233,6 @@ def __init__(
             remove_precursor_tol=remove_precursor_tol,
             random_state=random_state,
         )
-        self.track_spectrum_id = track_spectrum_id
 
     def __getitem__(self, idx: int) -> Tuple[torch.Tensor, float, int, str]:
         """
@@ -268,12 +264,4 @@ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, float, int, str]:
         spectrum = self._process_peaks(
             mz_array, int_array, precursor_mz, precursor_charge
         )
-        if self.track_spectrum_id:
-            return (
-                spectrum,
-                precursor_mz,
-                precursor_charge,
-                peptide,
-                self.get_spectrum_id(idx),
-            )
         return spectrum, precursor_mz, precursor_charge, peptide