Skip to content

Commit

Permalink
Eval metrics and circular import bug fix. (#380)
Browse files Browse the repository at this point in the history
* eval metrics bug fix

* better eval metrics bug fix

* eval metrics bug fix

* better eval metrics bug fix

* eval stats unit test, circular import fix

* log metrics unit test

* removed unused import

* log metrics refactor, additional log metrics test case

* aa_match_batch handles None, additional skipped spectra test cases

* Log optimizer and training metrics to CSV file (#376)

* csv logger

* optimizer metrics logger

* metrics logging unit tests

* config item retrieval, additional requested changes

* Generate new screengrabs with rich-codex

* changelog update

* Generate new screengrabs with rich-codex

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>

* aa_match_batch and aa_match handle None

* top_match eval metrics warning

* removed unused import

* log metrics refactor, additional log metrics test case

* aa_match_batch handles None, additional skipped spectra test cases

* aa_match_batch and aa_match handle None

* top_match eval metrics warning

* eval metrics bug fix

* better eval metrics bug fix

* eval stats unit test, circular import fix

* log metrics unit test

* removed unused import

* log metrics refactor, additional log metrics test case

* aa_match_batch handles None, additional skipped spectra test cases

* aa_match_batch and aa_match handle None

* top_match eval metrics warning

* removed unused import

* log metrics refactor, additional log metrics test case

* metrics file logging bug fix

* aa_match test cases, minor aa_match refactor

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
  • Loading branch information
Lilferrit and github-actions[bot] authored Sep 25, 2024
1 parent 34c456d commit 396f838
Show file tree
Hide file tree
Showing 8 changed files with 290 additions and 63 deletions.
41 changes: 2 additions & 39 deletions casanovo/data/ms_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,54 +2,17 @@

import collections
import csv
import dataclasses
import operator
import os
import re
from pathlib import Path
from typing import List, Tuple, Iterable
from typing import List

import natsort

from .. import __version__
from ..config import Config


@dataclasses.dataclass
class PepSpecMatch:
"""
Peptide Spectrum Match (PSM) dataclass
Parameters
----------
sequence : str
The amino acid sequence of the peptide.
spectrum_id : Tuple[str, str]
A tuple containing the spectrum identifier in the form
(spectrum file name, spectrum file idx)
peptide_score : float
Score of the match between the full peptide sequence and the
spectrum.
charge : int
The precursor charge state of the peptide ion observed in the spectrum.
calc_mz : float
The calculated mass-to-charge ratio (m/z) of the peptide based on its
sequence and charge state.
exp_mz : float
The observed (experimental) precursor mass-to-charge ratio (m/z) of the
peptide as detected in the spectrum.
aa_scores : Iterable[float]
A list of scores for individual amino acids in the peptide
sequence, where len(aa_scores) == len(sequence)
"""

sequence: str
spectrum_id: Tuple[str, str]
peptide_score: float
charge: int
calc_mz: float
exp_mz: float
aa_scores: Iterable[float]
from .psm import PepSpecMatch


class MztabWriter:
Expand Down
41 changes: 41 additions & 0 deletions casanovo/data/psm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""Peptide spectrum match dataclass"""

import dataclasses
from typing import Tuple, Iterable


@dataclasses.dataclass
class PepSpecMatch:
    """
    Peptide Spectrum Match (PSM) dataclass.

    Plain record type holding one predicted peptide together with the
    identifier of the spectrum it was matched to and the associated
    scores and precursor information.

    Parameters
    ----------
    sequence : str
        The amino acid sequence of the peptide.
    spectrum_id : Tuple[str, str]
        A tuple containing the spectrum identifier in the form
        (spectrum file name, spectrum file idx)
    peptide_score : float
        Score of the match between the full peptide sequence and the
        spectrum.
    charge : int
        The precursor charge state of the peptide ion observed in the
        spectrum.
    calc_mz : float
        The calculated mass-to-charge ratio (m/z) of the peptide based
        on its sequence and charge state.
    exp_mz : float
        The observed (experimental) precursor mass-to-charge ratio (m/z)
        of the peptide as detected in the spectrum.
    aa_scores : Iterable[float]
        An iterable of scores for individual amino acids in the peptide
        sequence, where len(aa_scores) == len(sequence)
    """

    sequence: str
    spectrum_id: Tuple[str, str]
    peptide_score: float
    charge: int
    calc_mz: float
    exp_mz: float
    aa_scores: Iterable[float]
20 changes: 14 additions & 6 deletions casanovo/denovo/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,8 +127,8 @@ def aa_match_prefix_suffix(


def aa_match(
peptide1: List[str],
peptide2: List[str],
peptide1: List[str] | None,
peptide2: List[str] | None,
aa_dict: Dict[str, float],
cum_mass_threshold: float = 0.5,
ind_mass_threshold: float = 0.1,
Expand All @@ -139,9 +139,9 @@ def aa_match(
Parameters
----------
peptide1 : List[str]
peptide1 : List[str] | None
The first tokenized peptide sequence to be compared.
peptide2 : List[str]
peptide2 : List[str] | None
The second tokenized peptide sequence to be compared.
aa_dict : Dict[str, float]
Mapping of amino acid tokens to their mass values.
Expand All @@ -161,7 +161,12 @@ def aa_match(
pep_match : bool
Boolean flag to indicate whether the two peptide sequences fully match.
"""
if mode == "best":
if peptide1 is None and peptide2 is None:
return np.empty(0, dtype=bool), False
elif peptide1 is None or peptide2 is None:
peptide = peptide1 if peptide2 is None else peptide2
return np.zeros(len(peptide), dtype=bool), False
elif mode == "best":
return aa_match_prefix_suffix(
peptide1, peptide2, aa_dict, cum_mass_threshold, ind_mass_threshold
)
Expand Down Expand Up @@ -225,9 +230,12 @@ def aa_match_batch(
# Split peptides into individual AAs if necessary.
if isinstance(peptide1, str):
peptide1 = re.split(r"(?<=.)(?=[A-Z])", peptide1)

if isinstance(peptide2, str):
peptide2 = re.split(r"(?<=.)(?=[A-Z])", peptide2)
n_aa1, n_aa2 = n_aa1 + len(peptide1), n_aa2 + len(peptide2)

n_aa1 += 0 if peptide1 is None else len(peptide1)
n_aa2 += 0 if peptide2 is None else len(peptide2)
aa_matches_batch.append(
aa_match(
peptide1,
Expand Down
4 changes: 2 additions & 2 deletions casanovo/denovo/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

from . import evaluate
from .. import config
from ..data import ms_io
from ..data import ms_io, psm

logger = logging.getLogger("casanovo")

Expand Down Expand Up @@ -914,7 +914,7 @@ def on_predict_batch_end(
if len(peptide) == 0:
continue
self.out_writer.psms.append(
ms_io.PepSpecMatch(
psm.PepSpecMatch(
sequence=peptide,
spectrum_id=tuple(spectrum_i),
peptide_score=peptide_score,
Expand Down
40 changes: 28 additions & 12 deletions casanovo/denovo/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,20 +167,38 @@ def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None:
Index containing the annotated spectra used to generate model
predictions
"""
model_output = [psm.sequence for psm in self.writer.psms]
spectrum_annotations = [
test_index[i][4] for i in range(test_index.n_spectra)
]
aa_precision, _, pep_precision = aa_match_metrics(
seq_pred = []
seq_true = []
pred_idx = 0

with test_index as t_ind:
for true_idx in range(t_ind.n_spectra):
seq_true.append(t_ind[true_idx][4])
if pred_idx < len(self.writer.psms) and self.writer.psms[
pred_idx
].spectrum_id == t_ind.get_spectrum_id(true_idx):
seq_pred.append(self.writer.psms[pred_idx].sequence)
pred_idx += 1
else:
seq_pred.append(None)

aa_precision, aa_recall, pep_precision = aa_match_metrics(
*aa_match_batch(
spectrum_annotations,
model_output,
seq_true,
seq_pred,
depthcharge.masses.PeptideMass().masses,
)
)

if self.config["top_match"] > 1:
logger.warning(
"The behavior for calculating evaluation metrics is undefined when "
"the 'top_match' configuration option is set to a value greater than 1."
)

logger.info("Peptide Precision: %.2f%%", 100 * pep_precision)
logger.info("Amino Acid Precision: %.2f%%", 100 * aa_precision)
logger.info("Amino Acid Recall: %.2f%%", 100 * aa_recall)

def predict(
self,
Expand Down Expand Up @@ -259,10 +277,10 @@ def initialize_trainer(self, train: bool) -> None:
strategy=self._get_strategy(),
val_check_interval=self.config.val_check_interval,
check_val_every_n_epoch=None,
log_every_n_steps=self.config.get("log_every_n_steps"),
log_every_n_steps=self.config.log_every_n_steps,
)

if self.config.get("log_metrics"):
if self.config.log_metrics:
if not self.output_dir:
logger.warning(
"Output directory not set in model runner. "
Expand All @@ -283,9 +301,7 @@ def initialize_trainer(self, train: bool) -> None:
version=csv_log_dir,
name=None,
),
"log_every_n_steps": self.config.get(
"log_every_n_steps"
),
"log_every_n_steps": self.config.log_every_n_steps,
}
)

Expand Down
2 changes: 1 addition & 1 deletion casanovo/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import psutil
import torch

from .data.ms_io import PepSpecMatch
from .data.psm import PepSpecMatch


SCORE_BINS = [0.0, 0.5, 0.9, 0.95, 0.99]
Expand Down
Loading

0 comments on commit 396f838

Please sign in to comment.