Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Full implementation of Casanovo-DB #352

Merged
merged 23 commits into from
Sep 13, 2024
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
f25ace8
rough implementation
VarunAnanth2003 Jul 2, 2024
f7dfbc8
tested implementation of db search
VarunAnanth2003 Jul 3, 2024
e2ce317
fix for issue with 0 candidates
VarunAnanth2003 Jul 3, 2024
5ef27e0
minor fixes added
VarunAnanth2003 Jul 3, 2024
5f0675f
reordered and renamed variables for consistency
VarunAnanth2003 Jul 3, 2024
b4fd8ff
casanovo-db full working version with code simplification
VarunAnanth2003 Jul 4, 2024
35ba7d4
Generate new screengrabs with rich-codex
github-actions[bot] Jul 4, 2024
f8a1a89
fix batching issues
VarunAnanth2003 Jul 8, 2024
fe8794d
Merge branch 'db_search_full' of https://github.com/Noble-Lab/casanov…
VarunAnanth2003 Jul 8, 2024
7cb8e14
small fixes regarding documentation, import syntax, etc.
VarunAnanth2003 Aug 12, 2024
b2f08ac
add proteindatabase
VarunAnanth2003 Aug 20, 2024
3d0b0b9
Generate new screengrabs with rich-codex
github-actions[bot] Aug 20, 2024
812226e
finish proteindatabase
VarunAnanth2003 Aug 21, 2024
df68c1d
Merge branch 'db_search_full' of https://github.com/Noble-Lab/casanov…
VarunAnanth2003 Aug 21, 2024
cfd39e8
all comments addressed
VarunAnanth2003 Aug 23, 2024
106c4ec
new comments addressed
VarunAnanth2003 Aug 28, 2024
0dfdb2c
final adjustments added
VarunAnanth2003 Sep 3, 2024
4a5b238
minor changes regarding formatting and small efficiency boosts
VarunAnanth2003 Sep 3, 2024
4352bbd
changes before reformatting config
VarunAnanth2003 Sep 3, 2024
ddff67f
replace all occurrences of "max_length" with "max_peptide_len"
VarunAnanth2003 Sep 3, 2024
a3548d0
added nonspecific digestion
VarunAnanth2003 Sep 3, 2024
e8d4682
minor comments
VarunAnanth2003 Sep 13, 2024
68b6926
full branch comments addressed
VarunAnanth2003 Sep 13, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 19 additions & 65 deletions casanovo/casanovo.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@
from . import utils
from .denovo import ModelRunner
from .config import Config
from .data.annotate_db import annotate_mgf

logger = logging.getLogger("casanovo")
click.rich_click.USE_MARKDOWN = True
Expand Down Expand Up @@ -131,7 +130,7 @@ def sequence(
) -> None:
"""De novo sequence peptides from tandem mass spectra.

PEAK_PATH must be one or more mzMl, mzXML, or MGF files from which
PEAK_PATH must be one or more mzML, mzXML, or MGF files from which
to sequence peptides.
"""
output = setup_logging(output, verbosity)
Expand All @@ -146,91 +145,46 @@ def sequence(
logger.info("DONE!")


@main.command()
@main.command(cls=_SharedParams)
VarunAnanth2003 marked this conversation as resolved.
Show resolved Hide resolved
@click.argument(
"peak_path",
required=True,
nargs=1,
nargs=-1,
type=click.Path(exists=True, dir_okay=False),
)
@click.argument(
"tide_path",
"fasta_path",
required=True,
nargs=1,
type=click.Path(exists=True, dir_okay=True),
)
@click.option(
"-o",
"--output",
help="The output annotated MGF file.",
type=click.Path(dir_okay=False),
)
@click.option(
"-v",
"--verbosity",
help="""
Set the verbosity of console logging messages. Log files are
always set to 'debug'.
""",
type=click.Choice(
["debug", "info", "warning", "error"],
case_sensitive=False,
),
default="info",
)
def annotate(
peak_path: str,
tide_path: str,
output: Optional[str],
verbosity: str,
) -> None:
"""Annotate a given .mgf with candidates as selected by a Tide search for Casanovo-DB.

PEAK_PATH must be one MGF file from which to annotate spectra.

TIDE_PATH must be one directory containing the Tide search results of the <PEAK_PATH> .mgf.
This directory must contain tide-search.decoy.txt and tide-search.target.txt
"""
if output is None:
output = setup_logging(output, verbosity)
logger.info(
"Output file not specified. \
Annotated MGF will be saved in the same directory \
as the input MGF."
)
output = peak_path.replace(".mgf", "_annotated.mgf")
else:
output = setup_logging(output, verbosity)

annotate_mgf(peak_path, tide_path, output)

logger.info("DONE!")


@main.command(cls=_SharedParams)
@click.argument(
"peak_path",
required=True,
nargs=-1,
type=click.Path(exists=True, dir_okay=False),
)
def db_search(
peak_path: Tuple[str],
fasta_path: str,
model: Optional[str],
config: Optional[str],
output: Optional[str],
verbosity: str,
) -> None:
"""Perform a search using Casanovo-DB.
"""Perform a database search on MS/MS data using Casanovo-DB.

PEAK_PATH must be one MGF file that has ANNOTATED spectra,
as output by annotate mode.
PEAK_PATH must be one or more mzML, mzXML, or MGF files.
FASTA_PATH must be one FASTA file.
"""
output = setup_logging(output, verbosity)
config, model = setup_model(model, config, output, False)
with ModelRunner(config, model) as runner:
logger.info("DB-searching peptides from: %s", peak_path)
runner.db_search(peak_path, output)
logger.info("Performing database search on:")
for peak_file in peak_path:
logger.info(" %s", peak_file)
logger.info("Using the following FASTA file:")
logger.info(" %s", fasta_path)

runner.db_search(
peak_path,
fasta_path,
output,
)

logger.info("DONE!")

Expand Down
44 changes: 39 additions & 5 deletions casanovo/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,28 @@

###
# The following parameters can be modified when running inference or when
# fine-tuning an existing Casanovo model.
# fine-tuning an existing Casanovo model. They also affect database search
# parameters when running Casanovo in DB-search mode.
###

# Max absolute difference allowed with respect to observed precursor m/z.
# Predictions outside the tolerance range are assigned a negative peptide score.
# denovo: Predictions outside the tolerance range are assigned a negative peptide score.
# db-search: Select candidate peptides within the specified precursor m/z tolerance.
precursor_mass_tol: 50 # ppm
# Isotopes to consider when comparing predicted and observed precursor m/z's.
isotope_error_range: [0, 1]
# The minimum length of predicted peptides.
# The minimum length of considered peptides.
min_peptide_len: 6
# The maximum length of considered peptides.
max_length: 100
VarunAnanth2003 marked this conversation as resolved.
Show resolved Hide resolved
# Number of spectra in one inference batch.
predict_batch_size: 1024


###
# The following parameters are unique to Casanovo's inference/finetuning mode.
###

# Number of beams used in beam search.
n_beams: 1
# Number of PSMs for each spectrum.
Expand All @@ -29,6 +39,32 @@ accelerator: "auto"
# number will be automatically selected based on the chosen accelerator.
devices:


###
# The following parameters are unique to Casanovo's database search mode.
###

# Enzyme for in silico digestion, used to generate candidate peptides.
# See pyteomics.parser.expasy_rules for valid enzymes.
enzyme: "trypsin"
# Digestion type for candidate peptide generation.
# full: standard digestion. semi: Include products of semi-specific cleavage.
# Can also take a regular expression to specify custom digestion rules.
VarunAnanth2003 marked this conversation as resolved.
Show resolved Hide resolved
digestion: "full"
# Number of allowed missed cleavages when digesting protein.
missed_cleavages: 0
# Maximum number of amino acid modifications per peptide.
VarunAnanth2003 marked this conversation as resolved.
Show resolved Hide resolved
# None generates all possible isoforms as candidates.
max_mods: 1
# Select which modifications from the vocabulary can be used in candidate creation.
# Format: Comma-separated list of "aa:mod_residue",
# where aa is a standard amino acid or "X" for an N-terminal mod
VarunAnanth2003 marked this conversation as resolved.
Show resolved Hide resolved
# and mod_residue is a key from the "residues" dictionary.
# Example: "M:M+15.995,X:+43.006-17.027"
allowed_fixed_mods: "C:C+57.021"
allowed_var_mods: "M:M+15.995,N:N+0.984,Q:Q+0.984,X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027"


###
# The following parameters should only be modified if you are training a new
# Casanovo model from scratch.
Expand Down Expand Up @@ -78,8 +114,6 @@ dropout: 0.0
# Number of dimensions to use for encoding peak intensity.
# Projected up to `dim_model` by default and summed with the peak m/z encoding.
dim_intensity:
# Max decoded peptide length.
max_length: 100
# The number of iterations for the linear warm-up of the learning rate.
warmup_iters: 100_000
# The number of iterations for the cosine half period of the learning rate.
Expand Down
138 changes: 0 additions & 138 deletions casanovo/data/annotate_db.py

This file was deleted.

14 changes: 1 addition & 13 deletions casanovo/data/datasets.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""A PyTorch Dataset class for annotated spectra."""

from typing import Optional, Tuple
from typing import List, Optional, Tuple

import depthcharge
import numpy as np
Expand Down Expand Up @@ -212,8 +212,6 @@ class AnnotatedSpectrumDataset(SpectrumDataset):
random_state : Optional[int]
The NumPy random state. ``None`` leaves mass spectra in the order they
were parsed.
track_spectrum_id : Optional[bool]
Whether to keep track of the identifier of the MS/MS spectra.
"""

def __init__(
Expand All @@ -225,7 +223,6 @@ def __init__(
min_intensity: float = 0.01,
remove_precursor_tol: float = 2.0,
random_state: Optional[int] = None,
track_spectrum_id: Optional[bool] = False,
):
super().__init__(
annotated_spectrum_index,
Expand All @@ -236,7 +233,6 @@ def __init__(
remove_precursor_tol=remove_precursor_tol,
random_state=random_state,
)
self.track_spectrum_id = track_spectrum_id

def __getitem__(self, idx: int) -> Tuple[torch.Tensor, float, int, str]:
"""
Expand Down Expand Up @@ -268,12 +264,4 @@ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, float, int, str]:
spectrum = self._process_peaks(
mz_array, int_array, precursor_mz, precursor_charge
)
if self.track_spectrum_id:
return (
spectrum,
precursor_mz,
precursor_charge,
peptide,
self.get_spectrum_id(idx),
)
return spectrum, precursor_mz, precursor_charge, peptide
Loading
Loading