Eval metrics and circular import bug fix. (#380)

* eval metrics bug fix * better eval metrics bug fix * eval metrics bug fix * better eval metrics bug fix * eval stats unit test, circular import fix * log metrics unit test * removed unused import * log metrics refactor, additional log metrics test case * aa_match_batch handles None, additional skipped spectra test cases * Log optimizer and training metrics to CSV file (#376) * csv logger * optimizer metrics logger * metrics logging unit tests * config item retrieval, additional requested changes * Generate new screengrabs with rich-codex * changelog update * Generate new screengrabs with rich-codex --------- Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com> * aa_match_batch and aa_match handle None * top_match eval metrics warning * removed unused import * log metrics refactor, additional log metrics test case * aa_match_batch handles None, additional skipped spectra test cases * aa_match_batch and aa_match handle None * top_match eval metrics warning * eval metrics bug fix * better eval metrics bug fix * eval stats unit test, circular import fix * log metrics unit test * removed unused import * log metrics refactor, additional log metrics test case * aa_match_batch handles None, additional skipped spectra test cases * aa_match_batch and aa_match handle None * top_match eval metrics warning * removed unused import * log metrics refactor, additional log metrics test case * metrics file logging bug fix * aa_match test cases, minor aa_match refactor --------- Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Noble-Lab · Sep 25, 2024 · 396f838 · 396f838
1 parent 34c456d
commit 396f838
Show file tree

Hide file tree

Showing 8 changed files with 290 additions and 63 deletions.
diff --git a/casanovo/data/ms_io.py b/casanovo/data/ms_io.py
@@ -2,54 +2,17 @@
 
 import collections
 import csv
-import dataclasses
 import operator
 import os
 import re
 from pathlib import Path
-from typing import List, Tuple, Iterable
+from typing import List
 
 import natsort
 
 from .. import __version__
 from ..config import Config
-
-
-@dataclasses.dataclass
-class PepSpecMatch:
-    """
-    Peptide Spectrum Match (PSM) dataclass
-
-    Parameters
-    ----------
-    sequence : str
-        The amino acid sequence of the peptide.
-    spectrum_id : Tuple[str, str]
-        A tuple containing the spectrum identifier in the form
-        (spectrum file name, spectrum file idx)
-    peptide_score : float
-        Score of the match between the full peptide sequence and the
-        spectrum.
-    charge : int
-        The precursor charge state of the peptide ion observed in the spectrum.
-    calc_mz : float
-        The calculated mass-to-charge ratio (m/z) of the peptide based on its
-        sequence and charge state.
-    exp_mz : float
-        The observed (experimental) precursor mass-to-charge ratio (m/z) of the
-        peptide as detected in the spectrum.
-    aa_scores : Iterable[float]
-        A list of scores for individual amino acids in the peptide
-        sequence, where len(aa_scores) == len(sequence)
-    """
-
-    sequence: str
-    spectrum_id: Tuple[str, str]
-    peptide_score: float
-    charge: int
-    calc_mz: float
-    exp_mz: float
-    aa_scores: Iterable[float]
+from .psm import PepSpecMatch
 
 
 class MztabWriter:

diff --git a/casanovo/data/psm.py b/casanovo/data/psm.py
@@ -0,0 +1,41 @@
+"""Peptide spectrum match dataclass"""
+
+import dataclasses
+from typing import Tuple, Iterable
+
+
+@dataclasses.dataclass
+class PepSpecMatch:
+    """
+    Peptide Spectrum Match (PSM) dataclass
+
+    Parameters
+    ----------
+    sequence : str
+        The amino acid sequence of the peptide.
+    spectrum_id : Tuple[str, str]
+        A tuple containing the spectrum identifier in the form
+        (spectrum file name, spectrum file idx)
+    peptide_score : float
+        Score of the match between the full peptide sequence and the
+        spectrum.
+    charge : int
+        The precursor charge state of the peptide ion observed in the spectrum.
+    calc_mz : float
+        The calculated mass-to-charge ratio (m/z) of the peptide based on its
+        sequence and charge state.
+    exp_mz : float
+        The observed (experimental) precursor mass-to-charge ratio (m/z) of the
+        peptide as detected in the spectrum.
+    aa_scores : Iterable[float]
+        A list of scores for individual amino acids in the peptide
+        sequence, where len(aa_scores) == len(sequence)
+    """
+
+    sequence: str
+    spectrum_id: Tuple[str, str]
+    peptide_score: float
+    charge: int
+    calc_mz: float
+    exp_mz: float
+    aa_scores: Iterable[float]
diff --git a/casanovo/denovo/evaluate.py b/casanovo/denovo/evaluate.py
@@ -127,8 +127,8 @@ def aa_match_prefix_suffix(
 
 
 def aa_match(
-    peptide1: List[str],
-    peptide2: List[str],
+    peptide1: List[str] | None,
+    peptide2: List[str] | None,
     aa_dict: Dict[str, float],
     cum_mass_threshold: float = 0.5,
     ind_mass_threshold: float = 0.1,
@@ -139,9 +139,9 @@ def aa_match(
 
     Parameters
     ----------
-    peptide1 : List[str]
+    peptide1 : List[str] | None
         The first tokenized peptide sequence to be compared.
-    peptide2 : List[str]
+    peptide2 : List[str] | None
         The second tokenized peptide sequence to be compared.
     aa_dict : Dict[str, float]
         Mapping of amino acid tokens to their mass values.
@@ -161,7 +161,12 @@ def aa_match(
     pep_match : bool
         Boolean flag to indicate whether the two peptide sequences fully match.
     """
-    if mode == "best":
+    if peptide1 is None and peptide2 is None:
+        return np.empty(0, dtype=bool), False
+    elif peptide1 is None or peptide2 is None:
+        peptide = peptide1 if peptide2 is None else peptide2
+        return np.zeros(len(peptide), dtype=bool), False
+    elif mode == "best":
         return aa_match_prefix_suffix(
             peptide1, peptide2, aa_dict, cum_mass_threshold, ind_mass_threshold
         )
@@ -225,9 +230,12 @@ def aa_match_batch(
         # Split peptides into individual AAs if necessary.
         if isinstance(peptide1, str):
             peptide1 = re.split(r"(?<=.)(?=[A-Z])", peptide1)
+
         if isinstance(peptide2, str):
             peptide2 = re.split(r"(?<=.)(?=[A-Z])", peptide2)
-        n_aa1, n_aa2 = n_aa1 + len(peptide1), n_aa2 + len(peptide2)
+
+        n_aa1 += 0 if peptide1 is None else len(peptide1)
+        n_aa2 += 0 if peptide2 is None else len(peptide2)
         aa_matches_batch.append(
             aa_match(
                 peptide1,

diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py
@@ -16,7 +16,7 @@
 
 from . import evaluate
 from .. import config
-from ..data import ms_io
+from ..data import ms_io, psm
 
 logger = logging.getLogger("casanovo")
 
@@ -914,7 +914,7 @@ def on_predict_batch_end(
             if len(peptide) == 0:
                 continue
             self.out_writer.psms.append(
-                ms_io.PepSpecMatch(
+                psm.PepSpecMatch(
                     sequence=peptide,
                     spectrum_id=tuple(spectrum_i),
                     peptide_score=peptide_score,

diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py
@@ -167,20 +167,38 @@ def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None:
             Index containing the annotated spectra used to generate model
             predictions
         """
-        model_output = [psm.sequence for psm in self.writer.psms]
-        spectrum_annotations = [
-            test_index[i][4] for i in range(test_index.n_spectra)
-        ]
-        aa_precision, _, pep_precision = aa_match_metrics(
+        seq_pred = []
+        seq_true = []
+        pred_idx = 0
+
+        with test_index as t_ind:
+            for true_idx in range(t_ind.n_spectra):
+                seq_true.append(t_ind[true_idx][4])
+                if pred_idx < len(self.writer.psms) and self.writer.psms[
+                    pred_idx
+                ].spectrum_id == t_ind.get_spectrum_id(true_idx):
+                    seq_pred.append(self.writer.psms[pred_idx].sequence)
+                    pred_idx += 1
+                else:
+                    seq_pred.append(None)
+
+        aa_precision, aa_recall, pep_precision = aa_match_metrics(
             *aa_match_batch(
-                spectrum_annotations,
-                model_output,
+                seq_true,
+                seq_pred,
                 depthcharge.masses.PeptideMass().masses,
             )
         )
 
+        if self.config["top_match"] > 1:
+            logger.warning(
+                "The behavior for calculating evaluation metrics is undefined when "
+                "the 'top_match' configuration option is set to a value greater than 1."
+            )
+
         logger.info("Peptide Precision: %.2f%%", 100 * pep_precision)
         logger.info("Amino Acid Precision: %.2f%%", 100 * aa_precision)
+        logger.info("Amino Acid Recall: %.2f%%", 100 * aa_recall)
 
     def predict(
         self,
@@ -259,10 +277,10 @@ def initialize_trainer(self, train: bool) -> None:
                 strategy=self._get_strategy(),
                 val_check_interval=self.config.val_check_interval,
                 check_val_every_n_epoch=None,
-                log_every_n_steps=self.config.get("log_every_n_steps"),
+                log_every_n_steps=self.config.log_every_n_steps,
             )
 
-            if self.config.get("log_metrics"):
+            if self.config.log_metrics:
                 if not self.output_dir:
                     logger.warning(
                         "Output directory not set in model runner. "
@@ -283,9 +301,7 @@ def initialize_trainer(self, train: bool) -> None:
                                 version=csv_log_dir,
                                 name=None,
                             ),
-                            "log_every_n_steps": self.config.get(
-                                "log_every_n_steps"
-                            ),
+                            "log_every_n_steps": self.config.log_every_n_steps,
                         }
                     )
 

diff --git a/casanovo/utils.py b/casanovo/utils.py
@@ -15,7 +15,7 @@
 import psutil
 import torch
 
-from .data.ms_io import PepSpecMatch
+from .data.psm import PepSpecMatch
 
 
 SCORE_BINS = [0.0, 0.5, 0.9, 0.95, 0.99]