Skip to content

Commit

Permalink
add proteindatabase
Browse files Browse the repository at this point in the history
  • Loading branch information
VarunAnanth2003 committed Aug 20, 2024
1 parent 7cb8e14 commit b2f08ac
Show file tree
Hide file tree
Showing 9 changed files with 404 additions and 466 deletions.
110 changes: 0 additions & 110 deletions casanovo/casanovo.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,111 +158,9 @@ def sequence(
nargs=1,
type=click.Path(exists=True, dir_okay=False),
)
@click.option(
"--enzyme",
help="Enzyme for in silico digestion, \
See pyteomics.parser.expasy_rules for valid enzymes",
type=click.Choice(
[
"arg-c",
"asp-n",
"bnps-skatole",
"caspase 1",
"caspase 2",
"caspase 3",
"caspase 4",
"caspase 5",
"caspase 6",
"caspase 7",
"caspase 8",
"caspase 9",
"caspase 10",
"chymotrypsin high specificity",
"chymotrypsin low specificity",
"clostripain",
"cnbr",
"enterokinase",
"factor xa",
"formic acid",
"glutamyl endopeptidase",
"granzyme b",
"hydroxylamine",
"iodosobenzoic acid",
"lysc",
"ntcb",
"pepsin ph1.3",
"pepsin ph2.0",
"proline endopeptidase",
"proteinase k",
"staphylococcal peptidase i",
"thermolysin",
"thrombin",
"trypsin",
"trypsin_exception",
]
),
default="trypsin",
)
@click.option(
"--digestion",
help="Full: standard digestion. Semi: Include products of semi-specific cleavage",
type=click.Choice(
["full", "partial"],
case_sensitive=False,
),
default="full",
)
@click.option(
"--missed_cleavages",
help="Number of allowed missed cleavages when digesting protein",
type=int,
default=0,
)
@click.option(
"--max_mods",
help="Maximum number of amino acid modifications per peptide",
type=int,
default=0,
)
@click.option(
"--min_peptide_length",
help="Minimum peptide length to consider",
type=int,
default=6,
)
@click.option(
"--max_peptide_length",
help="Maximum peptide length to consider",
type=int,
default=50,
)
@click.option(
"--precursor_tolerance",
help="Precursor tolerance window size (units: ppm)",
type=float,
default=20,
)
@click.option(
"--isotope_error",
help="Isotope error levels to consider. \
Creates multiple mass windows to consider per spectrum \
to account for observed mass not matching monoisotopic mass \
due to the instrument assigning the 13C isotope \
peak as the precursor (list of ints, e.g: 1,2)",
type=str,
default="0",
)
def db_search(
peak_path: Tuple[str],
fasta_path: str,
enzyme: str,
digestion: str,
missed_cleavages: int,
max_mods: int,
min_peptide_length: int,
max_peptide_length: int,
precursor_tolerance: float,
isotope_error: str,
model: Optional[str],
config: Optional[str],
output: Optional[str],
Expand All @@ -285,14 +183,6 @@ def db_search(
runner.db_search(
peak_path,
fasta_path,
enzyme,
digestion,
missed_cleavages,
max_mods,
min_peptide_length,
max_peptide_length,
precursor_tolerance,
isotope_error,
output,
)

Expand Down
36 changes: 32 additions & 4 deletions casanovo/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,26 @@

###
# The following parameters can be modified when running inference or when
# fine-tuning an existing Casanovo model.
# fine-tuning an existing Casanovo model. They also affect database search
# parameters when running Casanovo in DB-search mode.
###

# Max absolute difference allowed with respect to observed precursor m/z.
# Predictions outside the tolerance range are assigned a negative peptide score.
# denovo: Predictions outside the tolerance range are assigned a negative peptide score.
# db-search: Used to create mass windows for candidate generation.
precursor_mass_tol: 50 # ppm
# Isotopes to consider when comparing predicted and observed precursor m/z's.
isotope_error_range: [0, 1]
# The minimum length of predicted peptides.
# The minimum length of predicted/scored peptides.
min_peptide_len: 6
# Number of spectra in one inference batch.
# Number of spectra or PSMs in one inference batch.
predict_batch_size: 1024


###
# The following parameters are unique to Casanovo's inference/finetuning mode.
###

# Number of beams used in beam search.
n_beams: 1
# Number of PSMs for each spectrum.
Expand All @@ -29,6 +37,26 @@ accelerator: "auto"
# number will be automatically selected based on the chosen accelerator.
devices:


###
# The following parameters are unique to Casanovo's database search mode.
###

# Enzyme for in silico digestion, used to generate candidate peptides.
# See pyteomics.parser.expasy_rules for valid enzymes
enzyme: "trypsin"
# Digestion type for candidate peptide generation.
# Full: standard digestion. Partial: Include products of semi-specific cleavage
digestion: "full"
# Number of allowed missed cleavages when digesting protein
missed_cleavages: 0
# Maximum number of amino acid modifications per peptide.
# Leave empty (None) to generate all possible isoforms as candidates.
max_mods:
# Maximum peptide length to consider
max_peptide_len: 50


###
# The following parameters should only be modified if you are training a new
# Casanovo model from scratch.
Expand Down
2 changes: 1 addition & 1 deletion casanovo/data/datasets.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""A PyTorch Dataset class for annotated spectra."""

from typing import Optional, Tuple
from typing import List, Optional, Tuple

import depthcharge
import numpy as np
Expand Down
Loading

0 comments on commit b2f08ac

Please sign in to comment.