Skip to content

Commit

Permalink
small fixes regarding documentation, import syntax, etc.
Browse files Browse the repository at this point in the history
  • Loading branch information
VarunAnanth2003 committed Aug 12, 2024
1 parent fe8794d commit 7cb8e14
Show file tree
Hide file tree
Showing 7 changed files with 137 additions and 181 deletions.
39 changes: 22 additions & 17 deletions casanovo/casanovo.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def sequence(
) -> None:
"""De novo sequence peptides from tandem mass spectra.
PEAK_PATH must be one or more mzMl, mzXML, or MGF files from which
PEAK_PATH must be one or more mzML, mzXML, or MGF files from which
to sequence peptides.
"""
output = setup_logging(output, verbosity)
Expand Down Expand Up @@ -205,7 +205,7 @@ def sequence(
)
@click.option(
"--digestion",
help="Digestion: full, partial",
help="Full: standard digestion. Semi: Include products of semi-specific cleavage",
type=click.Choice(
["full", "partial"],
case_sensitive=False,
Expand All @@ -214,37 +214,41 @@ def sequence(
)
@click.option(
"--missed_cleavages",
help="Number of allowed missed cleavages",
help="Number of allowed missed cleavages when digesting protein",
type=int,
default=0,
)
@click.option(
"--max_mods",
help="Maximum number of modifications per peptide",
help="Maximum number of amino acid modifications per peptide",
type=int,
default=0,
)
@click.option(
"--min_length",
help="Minimum peptide length",
"--min_peptide_length",
help="Minimum peptide length to consider",
type=int,
default=6,
)
@click.option(
"--max_length",
help="Maximum peptide length",
"--max_peptide_length",
help="Maximum peptide length to consider",
type=int,
default=50,
)
@click.option(
"--precursor_tolerance",
help="Precursor tolerance window size (ppm)",
type=int,
help="Precursor tolerance window size (units: ppm)",
type=float,
default=20,
)
@click.option(
"--isotope_error",
help="Isotope error levels to consider (list of ints, e.g: 1,2)",
help="Isotope error levels to consider. \
Creates multiple mass windows to consider per spectrum \
to account for observed mass not matching monoisotopic mass \
due to the instrument assigning the 13C isotope \
peak as the precursor (list of ints, e.g: 1,2)",
type=str,
default="0",
)
Expand All @@ -255,9 +259,9 @@ def db_search(
digestion: str,
missed_cleavages: int,
max_mods: int,
min_length: int,
max_length: int,
precursor_tolerance: int,
min_peptide_length: int,
max_peptide_length: int,
precursor_tolerance: float,
isotope_error: str,
model: Optional[str],
config: Optional[str],
Expand All @@ -266,7 +270,8 @@ def db_search(
) -> None:
"""Perform a database search on MS/MS data using Casanovo-DB.
PEAK_PATH must be one MGF file. FASTA_PATH must be one FASTA file.
PEAK_PATH must be one or more mzML, mzXML, or MGF files.
FASTA_PATH must be one FASTA file.
"""
output = setup_logging(output, verbosity)
config, model = setup_model(model, config, output, False)
Expand All @@ -284,8 +289,8 @@ def db_search(
digestion,
missed_cleavages,
max_mods,
min_length,
max_length,
min_peptide_length,
max_peptide_length,
precursor_tolerance,
isotope_error,
output,
Expand Down
71 changes: 37 additions & 34 deletions casanovo/data/db_utils.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
"""Unique methods used within db-search mode"""

import os
import depthcharge.masses
from pyteomics import fasta, parser
import bisect
import logging

import os
from typing import List, Tuple

import depthcharge.masses
from pyteomics import fasta, parser

logger = logging.getLogger("casanovo")


# CONSTANTS
HYDROGEN = 1.007825035
OXYGEN = 15.99491463
Expand Down Expand Up @@ -51,8 +52,8 @@ def digest_fasta(
digestion: str,
missed_cleavages: int,
max_mods: int,
min_length: int,
max_length: int,
min_peptide_length: int,
max_peptide_length: int,
):
"""
Digests a FASTA file and returns the peptides, their masses, and associated protein.
Expand All @@ -70,9 +71,9 @@ def digest_fasta(
The number of missed cleavages to allow.
max_mods : int
The maximum number of modifications to allow per peptide.
min_length : int
min_peptide_length : int
The minimum length of peptides to consider.
max_length : int
max_peptide_length : int
The maximum length of peptides to consider.
Returns
Expand All @@ -81,35 +82,36 @@ def digest_fasta(
A list of tuples containing the peptide sequence, mass,
and associated protein. Sorted by neutral mass in ascending order.
"""

# Verify the eistence of the file:
# Verify the existence of the file:
if not os.path.isfile(fasta_filename):
print(f"File {fasta_filename} does not exist.")
logger.error("File %s does not exist.", fasta_filename)
raise FileNotFoundError(f"File {fasta_filename} does not exist.")

fasta_data = fasta.read(fasta_filename)
peptide_list = []
if digestion in ["full", "partial"]:
semi = True if digestion == "partial" else False
for header, seq in fasta_data:
pep_set = parser.cleave(
seq,
rule=parser.expasy_rules[enzyme],
missed_cleavages=missed_cleavages,
semi=semi,
)
protein = header.split()[0]
for pep in pep_set:
if len(pep) < min_length or len(pep) > max_length:
continue
if "X" in pep or "U" in pep:
logger.warn(
"Skipping peptide with ambiguous amino acids: %s", pep
)
continue
peptide_list.append((pep, protein))
else:
if digestion not in ["full", "partial"]:
logger.error("Digestion type %s not recognized.", digestion)
raise ValueError(f"Digestion type {digestion} not recognized.")
semi = digestion == "partial"
for header, seq in fasta_data:
pep_set = parser.cleave(
seq,
rule=parser.expasy_rules[enzyme],
missed_cleavages=missed_cleavages,
semi=semi,
)
protein = header.split()[0]
for pep in pep_set:
if len(pep) < min_peptide_length or len(pep) > max_peptide_length:
continue
if any(
aa in pep for aa in "BJOUXZ"
): # Check for incorrect AA letters
logger.warn(
"Skipping peptide with ambiguous amino acids: %s", pep
)
continue
peptide_list.append((pep, protein))

# Generate modified peptides
mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb")
Expand All @@ -136,7 +138,7 @@ def get_candidates(
precursor_mz: float,
charge: int,
peptide_list: List[Tuple[str, float, str]],
precursor_tolerance: int,
precursor_tolerance: float,
isotope_error: str,
):
"""
Expand All @@ -156,7 +158,6 @@ def get_candidates(
isotope_error : str
The isotope error levels to consider.
"""

candidates = set()

isotope_error = [int(x) for x in isotope_error.split(",")]
Expand Down Expand Up @@ -219,7 +220,9 @@ def _to_raw_mass(mz_mass, charge):


def get_mass_indices(masses, m_low, m_high):
"""Grabs mass indices from a list of mass values that fall within a specified range.
"""Grabs mass indices that fall within a specified range.
Pulls from masses, a list of mass values.
Requires that the mass values are sorted in ascending order.
Parameters
Expand Down
10 changes: 5 additions & 5 deletions casanovo/denovo/dataloaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,20 @@

import functools
import os
from typing import List, Optional, Tuple
from functools import partial
import logging
from typing import List, Optional, Tuple

from depthcharge.data import AnnotatedSpectrumIndex
import lightning.pytorch as pl
import numpy as np
import torch
from depthcharge.data import AnnotatedSpectrumIndex

from ..data import db_utils
from ..data.datasets import (
AnnotatedSpectrumDataset,
SpectrumDataset,
)
from ..data import db_utils


logger = logging.getLogger("casanovo")

Expand Down Expand Up @@ -186,7 +186,7 @@ def db_dataloader(self) -> torch.utils.data.DataLoader:
return torch.utils.data.DataLoader(
self.test_dataset,
batch_size=self.eval_batch_size,
collate_fn=partial(
collate_fn=functools.partial(
prepare_psm_batch,
digest=self.digest,
precursor_tolerance=self.precursor_tolerance,
Expand Down
31 changes: 15 additions & 16 deletions casanovo/denovo/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

from . import evaluate
from .. import config
from ..data import ms_io, db_utils
from ..data import ms_io

logger = logging.getLogger("casanovo")

Expand Down Expand Up @@ -991,7 +991,8 @@ def configure_optimizers(

class DbSpec2Pep(Spec2Pep):
"""
Subclass of Spec2Pep for the use of Casanovo as an MS/MS database search score function.
Subclass of Spec2Pep for the use of Casanovo as an \
MS/MS database search score function.
Uses teacher forcing to 'query' Casanovo for its score for each AA
within a candidate peptide, and takes the geometric average of these scores
Expand All @@ -1008,7 +1009,6 @@ class DbSpec2Pep(Spec2Pep):

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.total_psms = 0
self.psm_batch_size = 1024

def predict_step(self, batch, *args):
Expand All @@ -1029,11 +1029,14 @@ def predict_step(self, batch, *args):
scores, amino acid-level scores, and associated proteins.
"""
predictions = []
while len(batch[0]) > 0:
next_batch = [b[self.psm_batch_size :] for b in batch]
batch = [b[: self.psm_batch_size] for b in batch]
for start_idx in range(0, len(batch[0]), self.psm_batch_size):
current_batch = [
b[start_idx : start_idx + self.psm_batch_size] for b in batch
]
pred, truth = self.decoder(
batch[3], batch[1], *self.encoder(batch[0])
current_batch[3],
current_batch[1],
*self.encoder(current_batch[0]),
)
pred = self.softmax(pred)
all_scores, per_aa_scores = _calc_match_score(
Expand All @@ -1048,13 +1051,13 @@ def predict_step(self, batch, *args):
peptide,
protein,
) in zip(
batch[1][:, 1].cpu().detach().numpy(),
batch[1][:, 2].cpu().detach().numpy(),
batch[2],
current_batch[1][:, 1].cpu().detach().numpy(),
current_batch[1][:, 2].cpu().detach().numpy(),
current_batch[2],
all_scores.cpu().detach().numpy(),
per_aa_scores.cpu().detach().numpy(),
batch[3],
batch[4],
current_batch[3],
current_batch[4],
):
predictions.append(
(
Expand All @@ -1067,8 +1070,6 @@ def predict_step(self, batch, *args):
protein,
)
)
batch = next_batch
self.total_psms += len(predictions)
return predictions

def on_predict_batch_end(
Expand All @@ -1088,8 +1089,6 @@ def on_predict_batch_end(
aa_scores,
protein,
) in outputs:
if len(peptide) == 0:
continue
self.out_writer.psms.append(
(
peptide,
Expand Down
Loading

0 comments on commit 7cb8e14

Please sign in to comment.