add proteindatabase

Noble-Lab · Aug 20, 2024 · b2f08ac · b2f08ac
1 parent 7cb8e14
commit b2f08ac
Show file tree

Hide file tree

Showing 9 changed files with 404 additions and 466 deletions.
diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py
@@ -158,111 +158,9 @@ def sequence(
     nargs=1,
     type=click.Path(exists=True, dir_okay=False),
 )
-@click.option(
-    "--enzyme",
-    help="Enzyme for in silico digestion, \
-    See pyteomics.parser.expasy_rules for valid enzymes",
-    type=click.Choice(
-        [
-            "arg-c",
-            "asp-n",
-            "bnps-skatole",
-            "caspase 1",
-            "caspase 2",
-            "caspase 3",
-            "caspase 4",
-            "caspase 5",
-            "caspase 6",
-            "caspase 7",
-            "caspase 8",
-            "caspase 9",
-            "caspase 10",
-            "chymotrypsin high specificity",
-            "chymotrypsin low specificity",
-            "clostripain",
-            "cnbr",
-            "enterokinase",
-            "factor xa",
-            "formic acid",
-            "glutamyl endopeptidase",
-            "granzyme b",
-            "hydroxylamine",
-            "iodosobenzoic acid",
-            "lysc",
-            "ntcb",
-            "pepsin ph1.3",
-            "pepsin ph2.0",
-            "proline endopeptidase",
-            "proteinase k",
-            "staphylococcal peptidase i",
-            "thermolysin",
-            "thrombin",
-            "trypsin",
-            "trypsin_exception",
-        ]
-    ),
-    default="trypsin",
-)
-@click.option(
-    "--digestion",
-    help="Full: standard digestion. Semi: Include products of semi-specific cleavage",
-    type=click.Choice(
-        ["full", "partial"],
-        case_sensitive=False,
-    ),
-    default="full",
-)
-@click.option(
-    "--missed_cleavages",
-    help="Number of allowed missed cleavages when digesting protein",
-    type=int,
-    default=0,
-)
-@click.option(
-    "--max_mods",
-    help="Maximum number of amino acid modifications per peptide",
-    type=int,
-    default=0,
-)
-@click.option(
-    "--min_peptide_length",
-    help="Minimum peptide length to consider",
-    type=int,
-    default=6,
-)
-@click.option(
-    "--max_peptide_length",
-    help="Maximum peptide length to consider",
-    type=int,
-    default=50,
-)
-@click.option(
-    "--precursor_tolerance",
-    help="Precursor tolerance window size (units: ppm)",
-    type=float,
-    default=20,
-)
-@click.option(
-    "--isotope_error",
-    help="Isotope error levels to consider. \
-        Creates multiple mass windows to consider per spectrum \
-        to account for observed mass not matching monoisotopic mass \
-        due to the instrument assigning the 13C isotope \
-        peak as the precursor (list of ints, e.g: 1,2)",
-    type=str,
-    default="0",
-)
 def db_search(
     peak_path: Tuple[str],
     fasta_path: str,
-    enzyme: str,
-    digestion: str,
-    missed_cleavages: int,
-    max_mods: int,
-    min_peptide_length: int,
-    max_peptide_length: int,
-    precursor_tolerance: float,
-    isotope_error: str,
     model: Optional[str],
     config: Optional[str],
     output: Optional[str],
@@ -285,14 +183,6 @@ def db_search(
         runner.db_search(
             peak_path,
             fasta_path,
-            enzyme,
-            digestion,
-            missed_cleavages,
-            max_mods,
-            min_peptide_length,
-            max_peptide_length,
-            precursor_tolerance,
-            isotope_error,
             output,
         )
 

diff --git a/casanovo/config.yaml b/casanovo/config.yaml
@@ -5,18 +5,26 @@
 
 ###
 # The following parameters can be modified when running inference or when
-# fine-tuning an existing Casanovo model.
+# fine-tuning an existing Casanovo model. They also affect database search
+# parameters when running Casanovo in DB-search mode.
 ###
 
 # Max absolute difference allowed with respect to observed precursor m/z.
-# Predictions outside the tolerance range are assigned a negative peptide score.
+# denovo: Predictions outside the tolerance range are assigned a negative peptide score.
+# db-search: Used to create mass windows for candidate generation.
 precursor_mass_tol: 50  # ppm
 # Isotopes to consider when comparing predicted and observed precursor m/z's.
 isotope_error_range: [0, 1]
-# The minimum length of predicted peptides.
+# The minimum length of predicted/scored peptides.
 min_peptide_len: 6
-# Number of spectra in one inference batch.
+# Number of spectra or PSMs in one inference batch.
 predict_batch_size: 1024
+
+
+###
+# The following parameters are unique to Casanovo's inference/fine-tuning mode.
+###
+
 # Number of beams used in beam search.
 n_beams: 1
 # Number of PSMs for each spectrum.
@@ -29,6 +37,26 @@ accelerator: "auto"
 # number will be automatically selected based on the chosen accelerator.
 devices:
 
+
+###
+# The following parameters are unique to Casanovo's database search mode.
+###
+
+# Enzyme for in silico digestion, used to generate candidate peptides.
+# See pyteomics.parser.expasy_rules for valid enzymes
+enzyme: "trypsin"
+# Digestion type for candidate peptide generation.
+# Full: standard digestion. Semi: Include products of semi-specific cleavage
+digestion: "full"
+# Number of allowed missed cleavages when digesting protein
+missed_cleavages: 0
+# Maximum number of amino acid modifications per peptide.
+# None generates all possible isoforms as candidates.
+max_mods: 
+# Maximum peptide length to consider
+max_peptide_len: 50
+
+
 ###
 # The following parameters should only be modified if you are training a new
 # Casanovo model from scratch.

diff --git a/casanovo/data/datasets.py b/casanovo/data/datasets.py
@@ -1,6 +1,6 @@
 """A PyTorch Dataset class for annotated spectra."""
 
-from typing import Optional, Tuple
+from typing import List, Optional, Tuple
 
 import depthcharge
 import numpy as np