Skip to content

Commit

Permalink
Merge branch 'dev' into dev_db_search
Browse files Browse the repository at this point in the history
  • Loading branch information
VarunAnanth2003 committed Oct 2, 2024
2 parents 310c3fd + 396f838 commit 1651fd5
Show file tree
Hide file tree
Showing 8 changed files with 290 additions and 67 deletions.
45 changes: 2 additions & 43 deletions casanovo/data/ms_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,58 +2,17 @@

import collections
import csv
import dataclasses
import operator
import os
import re
from pathlib import Path
from typing import List, Tuple, Iterable
from typing import List

import natsort

from .. import __version__
from ..config import Config


@dataclasses.dataclass
class PepSpecMatch:
    """
    Peptide Spectrum Match (PSM) dataclass.

    Parameters
    ----------
    sequence : str
        The amino acid sequence of the peptide.
    spectrum_id : Tuple[str, str]
        A tuple containing the spectrum identifier in the form
        (spectrum file name, spectrum file idx).
    peptide_score : float
        Score of the match between the full peptide sequence and the
        spectrum.
    charge : int
        The precursor charge state of the peptide ion observed in the
        spectrum.
    calc_mz : float
        The calculated mass-to-charge ratio (m/z) of the peptide based on
        its sequence and charge state.
    exp_mz : float
        The observed (experimental) precursor mass-to-charge ratio (m/z)
        of the peptide as detected in the spectrum.
    aa_scores : Iterable[float]
        Scores for the individual amino acids in the peptide sequence,
        where len(aa_scores) == len(sequence).
    protein : str
        For DB-search mode, the protein from which the peptide in the PSM
        was derived. Default value is "null".
    """

    # Field order defines the positional order of the generated __init__.
    sequence: str
    spectrum_id: Tuple[str, str]
    peptide_score: float
    charge: int
    calc_mz: float
    exp_mz: float
    aa_scores: Iterable[float]
    # Only field with a default, so it has to stay last.
    protein: str = "null"
from .psm import PepSpecMatch


class MztabWriter:
Expand Down
41 changes: 41 additions & 0 deletions casanovo/data/psm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""Peptide spectrum match dataclass"""

import dataclasses
from typing import Tuple, Iterable


@dataclasses.dataclass
class PepSpecMatch:
    """
    Peptide Spectrum Match (PSM) dataclass.

    Parameters
    ----------
    sequence : str
        The amino acid sequence of the peptide.
    spectrum_id : Tuple[str, str]
        A tuple containing the spectrum identifier in the form
        (spectrum file name, spectrum file idx).
    peptide_score : float
        Score of the match between the full peptide sequence and the
        spectrum.
    charge : int
        The precursor charge state of the peptide ion observed in the
        spectrum.
    calc_mz : float
        The calculated mass-to-charge ratio (m/z) of the peptide based on
        its sequence and charge state.
    exp_mz : float
        The observed (experimental) precursor mass-to-charge ratio (m/z)
        of the peptide as detected in the spectrum.
    aa_scores : Iterable[float]
        Scores for the individual amino acids in the peptide sequence,
        where len(aa_scores) == len(sequence).
    """

    # Field order defines the positional order of the generated __init__.
    sequence: str
    spectrum_id: Tuple[str, str]
    peptide_score: float
    charge: int
    calc_mz: float
    exp_mz: float
    aa_scores: Iterable[float]
20 changes: 14 additions & 6 deletions casanovo/denovo/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,8 @@ def aa_match_prefix_suffix(


def aa_match(
peptide1: List[str],
peptide2: List[str],
peptide1: List[str] | None,
peptide2: List[str] | None,
aa_dict: Dict[str, float],
cum_mass_threshold: float = 0.5,
ind_mass_threshold: float = 0.1,
Expand All @@ -139,9 +139,9 @@ def aa_match(
Parameters
----------
peptide1 : List[str]
peptide1 : List[str] | None
The first tokenized peptide sequence to be compared.
peptide2 : List[str]
peptide2 : List[str] | None
The second tokenized peptide sequence to be compared.
aa_dict : Dict[str, float]
Mapping of amino acid tokens to their mass values.
Expand All @@ -161,7 +161,12 @@ def aa_match(
pep_match : bool
Boolean flag to indicate whether the two peptide sequences fully match.
"""
if mode == "best":
if peptide1 is None and peptide2 is None:
return np.empty(0, dtype=bool), False
elif peptide1 is None or peptide2 is None:
peptide = peptide1 if peptide2 is None else peptide2
return np.zeros(len(peptide), dtype=bool), False
elif mode == "best":
return aa_match_prefix_suffix(
peptide1, peptide2, aa_dict, cum_mass_threshold, ind_mass_threshold
)
Expand Down Expand Up @@ -225,9 +230,12 @@ def aa_match_batch(
# Split peptides into individual AAs if necessary.
if isinstance(peptide1, str):
peptide1 = re.split(r"(?<=.)(?=[A-Z])", peptide1)

if isinstance(peptide2, str):
peptide2 = re.split(r"(?<=.)(?=[A-Z])", peptide2)
n_aa1, n_aa2 = n_aa1 + len(peptide1), n_aa2 + len(peptide2)

n_aa1 += 0 if peptide1 is None else len(peptide1)
n_aa2 += 0 if peptide2 is None else len(peptide2)
aa_matches_batch.append(
aa_match(
peptide1,
Expand Down
4 changes: 2 additions & 2 deletions casanovo/denovo/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

from . import evaluate
from .. import config
from ..data import ms_io
from ..data import ms_io, psm

logger = logging.getLogger("casanovo")

Expand Down Expand Up @@ -914,7 +914,7 @@ def on_predict_batch_end(
if len(peptide) == 0:
continue
self.out_writer.psms.append(
ms_io.PepSpecMatch(
psm.PepSpecMatch(
sequence=peptide,
spectrum_id=tuple(spectrum_i),
peptide_score=peptide_score,
Expand Down
40 changes: 28 additions & 12 deletions casanovo/denovo/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,20 +220,38 @@ def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None:
Index containing the annotated spectra used to generate model
predictions
"""
model_output = [psm.sequence for psm in self.writer.psms]
spectrum_annotations = [
test_index[i][4] for i in range(test_index.n_spectra)
]
aa_precision, _, pep_precision = aa_match_metrics(
seq_pred = []
seq_true = []
pred_idx = 0

with test_index as t_ind:
for true_idx in range(t_ind.n_spectra):
seq_true.append(t_ind[true_idx][4])
if pred_idx < len(self.writer.psms) and self.writer.psms[
pred_idx
].spectrum_id == t_ind.get_spectrum_id(true_idx):
seq_pred.append(self.writer.psms[pred_idx].sequence)
pred_idx += 1
else:
seq_pred.append(None)

aa_precision, aa_recall, pep_precision = aa_match_metrics(
*aa_match_batch(
spectrum_annotations,
model_output,
seq_true,
seq_pred,
depthcharge.masses.PeptideMass().masses,
)
)

if self.config["top_match"] > 1:
logger.warning(
"The behavior for calculating evaluation metrics is undefined when "
"the 'top_match' configuration option is set to a value greater than 1."
)

logger.info("Peptide Precision: %.2f%%", 100 * pep_precision)
logger.info("Amino Acid Precision: %.2f%%", 100 * aa_precision)
logger.info("Amino Acid Recall: %.2f%%", 100 * aa_recall)

def predict(
self,
Expand Down Expand Up @@ -312,10 +330,10 @@ def initialize_trainer(self, train: bool) -> None:
strategy=self._get_strategy(),
val_check_interval=self.config.val_check_interval,
check_val_every_n_epoch=None,
log_every_n_steps=self.config.get("log_every_n_steps"),
log_every_n_steps=self.config.log_every_n_steps,
)

if self.config.get("log_metrics"):
if self.config.log_metrics:
if not self.output_dir:
logger.warning(
"Output directory not set in model runner. "
Expand All @@ -336,9 +354,7 @@ def initialize_trainer(self, train: bool) -> None:
version=csv_log_dir,
name=None,
),
"log_every_n_steps": self.config.get(
"log_every_n_steps"
),
"log_every_n_steps": self.config.log_every_n_steps,
}
)

Expand Down
2 changes: 1 addition & 1 deletion casanovo/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import psutil
import torch

from .data.ms_io import PepSpecMatch
from .data.psm import PepSpecMatch


SCORE_BINS = [0.0, 0.5, 0.9, 0.95, 0.99]
Expand Down
Loading

0 comments on commit 1651fd5

Please sign in to comment.