From 617fcb8f25cc4721b6636e433be722aa2f092e18 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Thu, 12 Sep 2024 11:16:22 -0700 Subject: [PATCH 01/16] eval metrics bug fix --- casanovo/denovo/model_runner.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 0f39fe46..f622f9de 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -132,10 +132,17 @@ def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None: Index containing the annotated spectra used to generate model predictions """ + with test_index as t_ind: + all_spectrum_annotations = { + "".join(t_ind.get_spectrum_id(i)): t_ind[i][4] + for i in range(t_ind.n_spectra) + } model_output = [psm.sequence for psm in self.writer.psms] spectrum_annotations = [ - test_index[i][4] for i in range(test_index.n_spectra) + all_spectrum_annotations["".join(psm.spectrum_id)] + for psm in self.writer.psms ] + aa_precision, _, pep_precision = aa_match_metrics( *aa_match_batch( spectrum_annotations, @@ -143,7 +150,6 @@ def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None: depthcharge.masses.PeptideMass().masses, ) ) - logger.info("Peptide Precision: %.2f%%", 100 * pep_precision) logger.info("Amino Acid Precision: %.2f%%", 100 * aa_precision) From a52ba83649a638eb7269b5d14cf6d8f84100d469 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Thu, 12 Sep 2024 14:38:06 -0700 Subject: [PATCH 02/16] better eval metrics bug fix --- casanovo/denovo/model_runner.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index f622f9de..24a2bf81 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -132,16 +132,27 @@ def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None: Index containing the annotated spectra used to generate model predictions """ + model_output = [] + spectrum_annotations = [] + psms = iter(self.writer.psms) + curr_psm = next(psms, None) + with test_index as t_ind: - all_spectrum_annotations = { - "".join(t_ind.get_spectrum_id(i)): t_ind[i][4] - for i in range(t_ind.n_spectra) - } - model_output = [psm.sequence for psm in self.writer.psms] - spectrum_annotations = [ - all_spectrum_annotations["".join(psm.spectrum_id)] - for psm in self.writer.psms - ] + for i in range(t_ind.n_spectra): + if curr_psm is None: + break + + if curr_psm.spectrum_id != t_ind.get_spectrum_id(i): + continue + + model_output.append(curr_psm.sequence) + spectrum_annotations.append(t_ind[i][4]) + curr_psm = next(psms, None) + + if curr_psm is not None: + logger.warning( + "Some spectra were not matched to annotations during evaluation." + ) aa_precision, _, pep_precision = aa_match_metrics( *aa_match_batch( From 81f45157f7ce178424480919602eb2dfa48eaa4a Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Thu, 12 Sep 2024 11:16:22 -0700 Subject: [PATCH 03/16] eval metrics bug fix --- casanovo/denovo/model_runner.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 9e94199e..3c5e0a7e 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -163,10 +163,17 @@ def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None: Index containing the annotated spectra used to generate model predictions """ + with test_index as t_ind: + all_spectrum_annotations = { + "".join(t_ind.get_spectrum_id(i)): t_ind[i][4] + for i in range(t_ind.n_spectra) + } model_output = [psm.sequence for psm in self.writer.psms] spectrum_annotations = [ - test_index[i][4] for i in range(test_index.n_spectra) + all_spectrum_annotations["".join(psm.spectrum_id)] + for psm in self.writer.psms ] + aa_precision, _, pep_precision = aa_match_metrics( *aa_match_batch( spectrum_annotations, @@ -174,7 +181,6 @@ def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None: depthcharge.masses.PeptideMass().masses, ) ) - logger.info("Peptide Precision: %.2f%%", 100 * pep_precision) logger.info("Amino Acid Precision: %.2f%%", 100 * aa_precision) From e30b674850761a8fa7935c06b57d8fff001115e1 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Thu, 12 Sep 2024 14:38:06 -0700 Subject: [PATCH 04/16] better eval metrics bug fix --- casanovo/denovo/model_runner.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 3c5e0a7e..5975ffdc 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -163,16 +163,27 @@ def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None: Index containing the annotated spectra used to generate model predictions """ + model_output = [] + spectrum_annotations = [] + psms = iter(self.writer.psms) + curr_psm = next(psms, None) + with test_index as t_ind: - all_spectrum_annotations = { - "".join(t_ind.get_spectrum_id(i)): t_ind[i][4] - for i in range(t_ind.n_spectra) - } - model_output = [psm.sequence for psm in self.writer.psms] - spectrum_annotations = [ - all_spectrum_annotations["".join(psm.spectrum_id)] - for psm in self.writer.psms - ] + for i in range(t_ind.n_spectra): + if curr_psm is None: + break + + if curr_psm.spectrum_id != t_ind.get_spectrum_id(i): + continue + + model_output.append(curr_psm.sequence) + spectrum_annotations.append(t_ind[i][4]) + curr_psm = next(psms, None) + + if curr_psm is not None: + logger.warning( + "Some spectra were not matched to annotations during evaluation." + ) aa_precision, _, pep_precision = aa_match_metrics( *aa_match_batch( From 7b6bab38865690b81af9a2083f32a9b71a89861e Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Mon, 16 Sep 2024 11:58:08 -0700 Subject: [PATCH 05/16] eval stats unit test, circular import fix --- casanovo/data/ms_io.py | 41 +----------- casanovo/data/pep_spec_match.py | 41 ++++++++++++ casanovo/denovo/model.py | 4 +- casanovo/utils.py | 2 +- tests/unit_tests/test_runner.py | 110 ++++++++++++++++++++++++++++++++ 5 files changed, 156 insertions(+), 42 deletions(-) create mode 100644 casanovo/data/pep_spec_match.py diff --git a/casanovo/data/ms_io.py b/casanovo/data/ms_io.py index d1e937f9..389c94e5 100644 --- a/casanovo/data/ms_io.py +++ b/casanovo/data/ms_io.py @@ -2,54 +2,17 @@ import collections import csv -import dataclasses import operator import os import re from pathlib import Path -from typing import List, Tuple, Iterable +from typing import List import natsort from .. import __version__ from ..config import Config - - -@dataclasses.dataclass -class PepSpecMatch: - """ - Peptide Spectrum Match (PSM) dataclass - - Parameters - ---------- - sequence : str - The amino acid sequence of the peptide. - spectrum_id : Tuple[str, str] - A tuple containing the spectrum identifier in the form - (spectrum file name, spectrum file idx) - peptide_score : float - Score of the match between the full peptide sequence and the - spectrum. - charge : int - The precursor charge state of the peptide ion observed in the spectrum. - calc_mz : float - The calculated mass-to-charge ratio (m/z) of the peptide based on its - sequence and charge state. - exp_mz : float - The observed (experimental) precursor mass-to-charge ratio (m/z) of the - peptide as detected in the spectrum. - aa_scores : Iterable[float] - A list of scores for individual amino acids in the peptide - sequence, where len(aa_scores) == len(sequence) - """ - - sequence: str - spectrum_id: Tuple[str, str] - peptide_score: float - charge: int - calc_mz: float - exp_mz: float - aa_scores: Iterable[float] +from .pep_spec_match import PepSpecMatch class MztabWriter: diff --git a/casanovo/data/pep_spec_match.py b/casanovo/data/pep_spec_match.py new file mode 100644 index 00000000..0dc3c48b --- /dev/null +++ b/casanovo/data/pep_spec_match.py @@ -0,0 +1,41 @@ +"""Peptide spectrum match dataclass""" + +import dataclasses +from typing import Tuple, Iterable + + +@dataclasses.dataclass +class PepSpecMatch: + """ + Peptide Spectrum Match (PSM) dataclass + + Parameters + ---------- + sequence : str + The amino acid sequence of the peptide. + spectrum_id : Tuple[str, str] + A tuple containing the spectrum identifier in the form + (spectrum file name, spectrum file idx) + peptide_score : float + Score of the match between the full peptide sequence and the + spectrum. + charge : int + The precursor charge state of the peptide ion observed in the spectrum. + calc_mz : float + The calculated mass-to-charge ratio (m/z) of the peptide based on its + sequence and charge state. + exp_mz : float + The observed (experimental) precursor mass-to-charge ratio (m/z) of the + peptide as detected in the spectrum. + aa_scores : Iterable[float] + A list of scores for individual amino acids in the peptide + sequence, where len(aa_scores) == len(sequence) + """ + + sequence: str + spectrum_id: Tuple[str, str] + peptide_score: float + charge: int + calc_mz: float + exp_mz: float + aa_scores: Iterable[float] diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 6e984a1d..3a1e3d2c 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -16,7 +16,7 @@ from . import evaluate from .. import config -from ..data import ms_io +from ..data import ms_io, pep_spec_match logger = logging.getLogger("casanovo") @@ -914,7 +914,7 @@ def on_predict_batch_end( if len(peptide) == 0: continue self.out_writer.psms.append( - ms_io.PepSpecMatch( + pep_spec_match.PepSpecMatch( sequence=peptide, spectrum_id=tuple(spectrum_i), peptide_score=peptide_score, diff --git a/casanovo/utils.py b/casanovo/utils.py index 1161b5eb..1646c5cc 100644 --- a/casanovo/utils.py +++ b/casanovo/utils.py @@ -15,7 +15,7 @@ import psutil import torch -from .data.ms_io import PepSpecMatch +from .data.pep_spec_match import PepSpecMatch SCORE_BINS = [0.0, 0.5, 0.9, 0.95, 0.99] diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py index 8cea9021..ce9ac7f3 100644 --- a/tests/unit_tests/test_runner.py +++ b/tests/unit_tests/test_runner.py @@ -1,5 +1,6 @@ """Unit tests specifically for the model_runner module.""" +import unittest.mock from pathlib import Path import pytest @@ -7,6 +8,7 @@ from casanovo.config import Config from casanovo.denovo.model_runner import ModelRunner +from casanovo.data.ms_io import PepSpecMatch, MztabWriter def test_initialize_model(tmp_path, mgf_small): @@ -282,3 +284,111 @@ def test_evaluate( ) result_file.unlink() + + +def test_log_metrics_with(monkeypatch, tiny_config): + """Test the log_metrics function using monkeypatch context manager""" + + # Mocking AnnotatedSpectrumIndex context manager + def get_mock_index(psm_list): + mock_test_index = unittest.mock.MagicMock() + mock_test_index.__enter__.return_value = mock_test_index + mock_test_index.__exit__.return_value = False + mock_test_index.n_spectra = len(psm_list) + mock_test_index.get_spectrum_id = lambda idx: psm_list[idx].spectrum_id + + mock_spectra = [ + (None, None, None, None, curr_psm.sequence) + for curr_psm in psm_list + ] + mock_test_index.__getitem__ = lambda idx, _: mock_spectra[idx] + + return mock_test_index + + def get_mock_psm(sequence, spectrum_id): + return PepSpecMatch( + sequence=sequence, + spectrum_id=spectrum_id, + peptide_score=None, + charge=None, + exp_mz=None, + aa_scores=None, + calc_mz=None, + ) + + with monkeypatch.context() as ctx: + mock_logger = unittest.mock.MagicMock() + ctx.setattr("casanovo.denovo.model_runner.logger", mock_logger) + + with ModelRunner(Config(tiny_config)) as runner: + # Test 100% peptide precision + psms = [ + get_mock_psm("PEP", ("foo", "index=1")), + get_mock_psm("PET", ("foo", "index=2")), + ] + runner.writer = unittest.mock.Mock(spec=MztabWriter) + runner.writer.psms = psms + mock_index = get_mock_index(psms.copy()) + + runner.log_metrics(mock_index) + mock_logger.info.assert_any_call( + "Peptide Precision: %.2f%%", 100.0 + ) + + # Test 50% peptide precision (one wrong) + infer_psms = [ + get_mock_psm("PEP", ("foo", "index=1")), + get_mock_psm("PET", ("foo", "index=2")), + ] + + act_psms = [ + get_mock_psm("PEP", ("foo", "index=1")), + get_mock_psm("PEP", ("foo", "index=2")), + ] + + runner.writer.psms = infer_psms + mock_index = get_mock_index(act_psms) + mock_logger.info.assert_any_call("Peptide Precision: %.2f%%", 50.0) + + # Test skipped spectra + act_psms = [ + get_mock_psm("PEP", ("foo", "index=1")), + get_mock_psm("PET", ("foo", "index=2")), + get_mock_psm("PEI", ("foo", "index=3")), + get_mock_psm("PEG", ("foo", "index=4")), + get_mock_psm("PEA", ("foo", "index=5")), + ] + + infer_psms = [ + get_mock_psm("PEP", ("foo", "index=1")), + get_mock_psm("PET", ("foo", "index=3")), + get_mock_psm("PEG", ("foo", "index=4")), + get_mock_psm("PET", ("foo", "index=5")), + ] + + runner.writer.psms = infer_psms + mock_index = get_mock_index(act_psms) + mock_logger.info.assert_any_call("Peptide Precision: %.2f%%", 75.0) + + # Test un-inferred spectra + act_psms = [ + get_mock_psm("PEP", ("foo", "index=1")), + get_mock_psm("PET", ("foo", "index=2")), + get_mock_psm("PEI", ("foo", "index=3")), + get_mock_psm("PEG", ("foo", "index=4")), + ] + + infer_psms = [ + get_mock_psm("PE", ("foo", "index=1")), + get_mock_psm("PE", ("foo", "index=2")), + get_mock_psm("PE", ("foo", "index=3")), + get_mock_psm("PE", ("foo", "index=4")), + get_mock_psm("PE", ("foo", "index=5")), + ] + + runner.writer.psms = infer_psms + mock_index = get_mock_index(act_psms) + mock_logger.info.assert_any_call("Peptide Precision: %.2f%%", 0.0) + mock_logger.warning.assert_called_once_with( + "Some spectra were not matched to annotations during evaluation." + ) From ddbc93a31fe5ac71bc9fa2599755532ce7eb5379 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Tue, 17 Sep 2024 13:19:50 -0700 Subject: [PATCH 06/16] log metrics unit test --- casanovo/denovo/model_runner.py | 10 +++--- tests/unit_tests/test_runner.py | 63 ++++++++++++++++++++++----------- 2 files changed, 48 insertions(+), 25 deletions(-) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 5975ffdc..6fb8cb95 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -173,12 +173,12 @@ def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None: if curr_psm is None: break - if curr_psm.spectrum_id != t_ind.get_spectrum_id(i): - continue - - model_output.append(curr_psm.sequence) spectrum_annotations.append(t_ind[i][4]) - curr_psm = next(psms, None) + if curr_psm.spectrum_id == t_ind.get_spectrum_id(i): + model_output.append(curr_psm.sequence) + curr_psm = next(psms, None) + else: + model_output.append("") if curr_psm is not None: logger.warning( diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py index ce9ac7f3..dfc9c3a6 100644 --- a/tests/unit_tests/test_runner.py +++ b/tests/unit_tests/test_runner.py @@ -286,10 +286,9 @@ def test_evaluate( result_file.unlink() -def test_log_metrics_with(monkeypatch, tiny_config): - """Test the log_metrics function using monkeypatch context manager""" +def test_log_metrics(monkeypatch, tiny_config): + TEST_EPSILON = 10**-5 - # Mocking AnnotatedSpectrumIndex context manager def get_mock_index(psm_list): mock_test_index = unittest.mock.MagicMock() mock_test_index.__enter__.return_value = mock_test_index @@ -301,8 +300,7 @@ def get_mock_index(psm_list): (None, None, None, None, curr_psm.sequence) for curr_psm in psm_list ] - mock_test_index.__getitem__ = lambda idx, _: mock_spectra[idx] - + mock_test_index.__getitem__.side_effect = lambda idx: mock_spectra[idx] return mock_test_index def get_mock_psm(sequence, spectrum_id): @@ -321,19 +319,27 @@ def get_mock_psm(sequence, spectrum_id): ctx.setattr("casanovo.denovo.model_runner.logger", mock_logger) with ModelRunner(Config(tiny_config)) as runner: + runner.writer = unittest.mock.MagicMock() + # Test 100% peptide precision - psms = [ + infer_psms = [ get_mock_psm("PEP", ("foo", "index=1")), get_mock_psm("PET", ("foo", "index=2")), ] - runner.writer = unittest.mock.Mock(spec=MztabWriter) - runner.writer.psms = psms - mock_index = get_mock_index(psms.copy()) + act_psms = [ + get_mock_psm("PEP", ("foo", "index=1")), + get_mock_psm("PET", ("foo", "index=2")), + ] + + runner.writer.psms = infer_psms + mock_index = get_mock_index(act_psms) runner.log_metrics(mock_index) - mock_logger.info.assert_any_call( - "Peptide Precision: %.2f%%", 100.0 - ) + + pep_precision = mock_logger.info.call_args_list[-2][0][1] + aa_precision = mock_logger.info.call_args_list[-1][0][1] + assert abs(pep_precision - 100) < TEST_EPSILON + assert abs(aa_precision - 100) < TEST_EPSILON # Test 50% peptide precision (one wrong) infer_psms = [ @@ -348,7 +354,12 @@ def get_mock_psm(sequence, spectrum_id): runner.writer.psms = infer_psms mock_index = get_mock_index(act_psms) - mock_logger.info.assert_any_call("Peptide Precision: %.2f%%", 50.0) + runner.log_metrics(mock_index) + + pep_precision = mock_logger.info.call_args_list[-2][0][1] + aa_precision = mock_logger.info.call_args_list[-1][0][1] + assert abs(pep_precision - 100 * (1 / 2)) < TEST_EPSILON + assert abs(aa_precision - 100 * (5 / 6)) < TEST_EPSILON # Test skipped spectra act_psms = [ @@ -361,14 +372,19 @@ def get_mock_psm(sequence, spectrum_id): infer_psms = [ get_mock_psm("PEP", ("foo", "index=1")), - get_mock_psm("PET", ("foo", "index=3")), - get_mock_psm("PEG", ("foo", "index=4")), - get_mock_psm("PET", ("foo", "index=5")), + get_mock_psm("PET", ("foo", "index=2")), + get_mock_psm("PEI", ("foo", "index=3")), + get_mock_psm("PEA", ("foo", "index=5")), ] runner.writer.psms = infer_psms mock_index = get_mock_index(act_psms) - mock_logger.info.assert_any_call("Peptide Precision: %.2f%%", 75.0) + runner.log_metrics(mock_index) + + pep_precision = mock_logger.info.call_args_list[-2][0][1] + aa_precision = mock_logger.info.call_args_list[-1][0][1] + assert abs(pep_precision - 100 * (4 / 5)) < TEST_EPSILON + assert abs(aa_precision - 100 * (12 / 13)) < TEST_EPSILON # Test un-inferred spectra act_psms = [ @@ -388,7 +404,14 @@ def get_mock_psm(sequence, spectrum_id): runner.writer.psms = infer_psms mock_index = get_mock_index(act_psms) - mock_logger.info.assert_any_call("Peptide Precision: %.2f%%", 0.0) - mock_logger.warning.assert_called_once_with( - "Some spectra were not matched to annotations during evaluation." + runner.log_metrics(mock_index) + + pep_precision = mock_logger.info.call_args_list[-2][0][1] + aa_precision = mock_logger.info.call_args_list[-1][0][1] + last_warning_msg = mock_logger.warning.call_args_list[-1][0][0] + assert abs(pep_precision) < TEST_EPSILON + assert abs(aa_precision - 100) < TEST_EPSILON + assert ( + last_warning_msg + == "Some spectra were not matched to annotations during evaluation." ) From 9d4109ee9029f61a475971f8bcd5ada85422ce3c Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Tue, 17 Sep 2024 15:13:58 -0700 Subject: [PATCH 07/16] removed unused import --- tests/unit_tests/test_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py index dfc9c3a6..b57e7475 100644 --- a/tests/unit_tests/test_runner.py +++ b/tests/unit_tests/test_runner.py @@ -8,7 +8,7 @@ from casanovo.config import Config from casanovo.denovo.model_runner import ModelRunner -from casanovo.data.ms_io import PepSpecMatch, MztabWriter +from casanovo.data.ms_io import PepSpecMatch def test_initialize_model(tmp_path, mgf_small): From 86747d97edea9360ca1dcc2ed7dd3fbba51979b6 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Thu, 19 Sep 2024 14:18:32 -0700 Subject: [PATCH 08/16] log metrics refactor, additional log metrics test case --- casanovo/data/ms_io.py | 2 +- casanovo/data/{pep_spec_match.py => psm.py} | 0 casanovo/denovo/model.py | 4 +- casanovo/denovo/model_runner.py | 33 ++++++-------- casanovo/utils.py | 2 +- tests/unit_tests/test_runner.py | 49 ++++++++++++++------- 6 files changed, 50 insertions(+), 40 deletions(-) rename casanovo/data/{pep_spec_match.py => psm.py} (100%) diff --git a/casanovo/data/ms_io.py b/casanovo/data/ms_io.py index 389c94e5..50932399 100644 --- a/casanovo/data/ms_io.py +++ b/casanovo/data/ms_io.py @@ -12,7 +12,7 @@ from .. import __version__ from ..config import Config -from .pep_spec_match import PepSpecMatch +from .psm import PepSpecMatch class MztabWriter: diff --git a/casanovo/data/pep_spec_match.py b/casanovo/data/psm.py similarity index 100% rename from casanovo/data/pep_spec_match.py rename to casanovo/data/psm.py diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 3a1e3d2c..ce8621d8 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -16,7 +16,7 @@ from . import evaluate from .. import config -from ..data import ms_io, pep_spec_match +from ..data import ms_io, psm logger = logging.getLogger("casanovo") @@ -914,7 +914,7 @@ def on_predict_batch_end( if len(peptide) == 0: continue self.out_writer.psms.append( - pep_spec_match.PepSpecMatch( + psm.PepSpecMatch( sequence=peptide, spectrum_id=tuple(spectrum_i), peptide_score=peptide_score, diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 6fb8cb95..03cab931 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -163,32 +163,25 @@ def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None: Index containing the annotated spectra used to generate model predictions """ - model_output = [] - spectrum_annotations = [] - psms = iter(self.writer.psms) - curr_psm = next(psms, None) + seq_pred = [] + seq_true = [] + pred_idx = 0 with test_index as t_ind: - for i in range(t_ind.n_spectra): - if curr_psm is None: - break - - spectrum_annotations.append(t_ind[i][4]) - if curr_psm.spectrum_id == t_ind.get_spectrum_id(i): - model_output.append(curr_psm.sequence) - curr_psm = next(psms, None) + for true_idx in range(t_ind.n_spectra): + seq_true.append(t_ind[true_idx][4]) + if pred_idx < len(self.writer.psms) and self.writer.psms[ + pred_idx + ].spectrum_id == t_ind.get_spectrum_id(true_idx): + seq_pred.append(self.writer.psms[pred_idx].sequence) + pred_idx += 1 else: - model_output.append("") - - if curr_psm is not None: - logger.warning( - "Some spectra were not matched to annotations during evaluation." - ) + seq_pred.append("") aa_precision, _, pep_precision = aa_match_metrics( *aa_match_batch( - spectrum_annotations, - model_output, + seq_true, + seq_pred, depthcharge.masses.PeptideMass().masses, ) ) diff --git a/casanovo/utils.py b/casanovo/utils.py index 1646c5cc..43b1cb7d 100644 --- a/casanovo/utils.py +++ b/casanovo/utils.py @@ -15,7 +15,7 @@ import psutil import torch -from .data.pep_spec_match import PepSpecMatch +from .data.psm import PepSpecMatch SCORE_BINS = [0.0, 0.5, 0.9, 0.95, 0.99] diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py index b57e7475..01966b36 100644 --- a/tests/unit_tests/test_runner.py +++ b/tests/unit_tests/test_runner.py @@ -7,8 +7,8 @@ import torch from casanovo.config import Config +from casanovo.data.psm import PepSpecMatch from casanovo.denovo.model_runner import ModelRunner -from casanovo.data.ms_io import PepSpecMatch def test_initialize_model(tmp_path, mgf_small): @@ -287,8 +287,6 @@ def test_evaluate( def test_log_metrics(monkeypatch, tiny_config): - TEST_EPSILON = 10**-5 - def get_mock_index(psm_list): mock_test_index = unittest.mock.MagicMock() mock_test_index.__enter__.return_value = mock_test_index @@ -338,8 +336,8 @@ def get_mock_psm(sequence, spectrum_id): pep_precision = mock_logger.info.call_args_list[-2][0][1] aa_precision = mock_logger.info.call_args_list[-1][0][1] - assert abs(pep_precision - 100) < TEST_EPSILON - assert abs(aa_precision - 100) < TEST_EPSILON + assert pep_precision == pytest.approx(100) + assert aa_precision == pytest.approx(100) # Test 50% peptide precision (one wrong) infer_psms = [ @@ -358,8 +356,8 @@ def get_mock_psm(sequence, spectrum_id): pep_precision = mock_logger.info.call_args_list[-2][0][1] aa_precision = mock_logger.info.call_args_list[-1][0][1] - assert abs(pep_precision - 100 * (1 / 2)) < TEST_EPSILON - assert abs(aa_precision - 100 * (5 / 6)) < TEST_EPSILON + assert pep_precision == pytest.approx(100 * (1 / 2)) + assert aa_precision == pytest.approx(100 * (5 / 6)) # Test skipped spectra act_psms = [ @@ -383,8 +381,32 @@ def get_mock_psm(sequence, spectrum_id): pep_precision = mock_logger.info.call_args_list[-2][0][1] aa_precision = mock_logger.info.call_args_list[-1][0][1] - assert abs(pep_precision - 100 * (4 / 5)) < TEST_EPSILON - assert abs(aa_precision - 100 * (12 / 13)) < TEST_EPSILON + assert pep_precision == pytest.approx(100 * (4 / 5)) + assert aa_precision == pytest.approx(100 * (12 / 13)) + + act_psms = [ + get_mock_psm("PEP", ("foo", "index=1")), + get_mock_psm("PET", ("foo", "index=2")), + get_mock_psm("PEI", ("foo", "index=3")), + get_mock_psm("PEG", ("foo", "index=4")), + get_mock_psm("PEA", ("foo", "index=5")), + ] + + infer_psms = [ + get_mock_psm("PEP", ("foo", "index=1")), + get_mock_psm("PET", ("foo", "index=2")), + get_mock_psm("PEI", ("foo", "index=3")), + get_mock_psm("PEG", ("foo", "index=4")), + ] + + runner.writer.psms = infer_psms + mock_index = get_mock_index(act_psms) + runner.log_metrics(mock_index) + + pep_precision = mock_logger.info.call_args_list[-2][0][1] + aa_precision = mock_logger.info.call_args_list[-1][0][1] + assert pep_precision == pytest.approx(100 * (4 / 5)) + assert aa_precision == pytest.approx(100 * (12 / 13)) # Test un-inferred spectra act_psms = [ @@ -408,10 +430,5 @@ def get_mock_psm(sequence, spectrum_id): pep_precision = mock_logger.info.call_args_list[-2][0][1] aa_precision = mock_logger.info.call_args_list[-1][0][1] - last_warning_msg = mock_logger.warning.call_args_list[-1][0][0] - assert abs(pep_precision) < TEST_EPSILON - assert abs(aa_precision - 100) < TEST_EPSILON - assert ( - last_warning_msg - == "Some spectra were not matched to annotations during evaluation." - ) + assert pep_precision == pytest.approx(0) + assert aa_precision == pytest.approx(100) From c863b4a853413a0e792c5f41ba76bd032ae2953b Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Fri, 20 Sep 2024 13:53:20 -0700 Subject: [PATCH 09/16] aa_match_batch hanles none, additional skipped spectra test cases --- casanovo/denovo/evaluate.py | 6 ++++++ casanovo/denovo/model_runner.py | 2 +- tests/unit_tests/test_runner.py | 34 ++++++++++++++++++++++++++------- 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/casanovo/denovo/evaluate.py b/casanovo/denovo/evaluate.py index cbf9e74f..30540373 100644 --- a/casanovo/denovo/evaluate.py +++ b/casanovo/denovo/evaluate.py @@ -225,8 +225,14 @@ def aa_match_batch( # Split peptides into individual AAs if necessary. if isinstance(peptide1, str): peptide1 = re.split(r"(?<=.)(?=[A-Z])", peptide1) + elif peptide1 is None: + peptide1 = [] + if isinstance(peptide2, str): peptide2 = re.split(r"(?<=.)(?=[A-Z])", peptide2) + elif peptide2 is None: + peptide2 = [] + n_aa1, n_aa2 = n_aa1 + len(peptide1), n_aa2 + len(peptide2) aa_matches_batch.append( aa_match( diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 03cab931..615522e1 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -176,7 +176,7 @@ def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None: seq_pred.append(self.writer.psms[pred_idx].sequence) pred_idx += 1 else: - seq_pred.append("") + seq_pred.append(None) aa_precision, _, pep_precision = aa_match_metrics( *aa_match_batch( diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py index 01966b36..c5442e2a 100644 --- a/tests/unit_tests/test_runner.py +++ b/tests/unit_tests/test_runner.py @@ -382,21 +382,27 @@ def get_mock_psm(sequence, spectrum_id): pep_precision = mock_logger.info.call_args_list[-2][0][1] aa_precision = mock_logger.info.call_args_list[-1][0][1] assert pep_precision == pytest.approx(100 * (4 / 5)) - assert aa_precision == pytest.approx(100 * (12 / 13)) + assert aa_precision == pytest.approx(100) - act_psms = [ + infer_psms = [ get_mock_psm("PEP", ("foo", "index=1")), get_mock_psm("PET", ("foo", "index=2")), get_mock_psm("PEI", ("foo", "index=3")), get_mock_psm("PEG", ("foo", "index=4")), - get_mock_psm("PEA", ("foo", "index=5")), ] + runner.writer.psms = infer_psms + mock_index = get_mock_index(act_psms) + runner.log_metrics(mock_index) + + pep_precision = mock_logger.info.call_args_list[-2][0][1] + aa_precision = mock_logger.info.call_args_list[-1][0][1] + assert pep_precision == pytest.approx(100 * (4 / 5)) + assert aa_precision == pytest.approx(100) + infer_psms = [ get_mock_psm("PEP", ("foo", "index=1")), - get_mock_psm("PET", ("foo", "index=2")), get_mock_psm("PEI", ("foo", "index=3")), - get_mock_psm("PEG", ("foo", "index=4")), ] runner.writer.psms = infer_psms @@ -405,8 +411,22 @@ def get_mock_psm(sequence, spectrum_id): pep_precision = mock_logger.info.call_args_list[-2][0][1] aa_precision = mock_logger.info.call_args_list[-1][0][1] - assert pep_precision == pytest.approx(100 * (4 / 5)) - assert aa_precision == pytest.approx(100 * (12 / 13)) + assert pep_precision == pytest.approx(100 * (2 / 5)) + assert aa_precision == pytest.approx(100) + + infer_psms = [ + get_mock_psm("PEP", ("foo", "index=1")), + get_mock_psm("PEA", ("foo", "index=5")), + ] + + runner.writer.psms = infer_psms + mock_index = get_mock_index(act_psms) + runner.log_metrics(mock_index) + + pep_precision = mock_logger.info.call_args_list[-2][0][1] + aa_precision = mock_logger.info.call_args_list[-1][0][1] + assert pep_precision == pytest.approx(100 * (2 / 5)) + assert aa_precision == pytest.approx(100) # Test un-inferred spectra act_psms = [ From 8f21edbff7ea1360f37345fe1ee1832c0401366e Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Mon, 23 Sep 2024 13:58:22 -0700 Subject: [PATCH 10/16] aa_match_batch and aa_match handle None --- casanovo/denovo/evaluate.py | 22 +++++++++-------- casanovo/denovo/model_runner.py | 3 ++- tests/unit_tests/test_runner.py | 42 ++++++++++++++++++++++----------- 3 files changed, 42 insertions(+), 25 deletions(-) diff --git a/casanovo/denovo/evaluate.py b/casanovo/denovo/evaluate.py index 30540373..ab56d8cf 100644 --- a/casanovo/denovo/evaluate.py +++ b/casanovo/denovo/evaluate.py @@ -127,8 +127,8 @@ def aa_match_prefix_suffix( def aa_match( - peptide1: List[str], - peptide2: List[str], + peptide1: List[str] | None, + peptide2: List[str] | None, aa_dict: Dict[str, float], cum_mass_threshold: float = 0.5, ind_mass_threshold: float = 0.1, @@ -139,9 +139,9 @@ def aa_match( Parameters ---------- - peptide1 : List[str] + peptide1 : List[str] | None, The first tokenized peptide sequence to be compared. - peptide2 : List[str] + peptide2 : List[str] | None The second tokenized peptide sequence to be compared. aa_dict : Dict[str, float] Mapping of amino acid tokens to their mass values. @@ -161,7 +161,12 @@ def aa_match( pep_match : bool Boolean flag to indicate whether the two peptide sequences fully match. """ - if mode == "best": + if peptide1 is None and peptide2 is None: + return np.array([], dtype=bool), False + elif (peptide1 is None) != (peptide2 is None): + peptide = peptide1 if peptide2 is None else peptide2 + return np.array([False] * len(peptide)), False + elif mode == "best": return aa_match_prefix_suffix( peptide1, peptide2, aa_dict, cum_mass_threshold, ind_mass_threshold ) @@ -225,15 +230,12 @@ def aa_match_batch( # Split peptides into individual AAs if necessary. if isinstance(peptide1, str): peptide1 = re.split(r"(?<=.)(?=[A-Z])", peptide1) - elif peptide1 is None: - peptide1 = [] if isinstance(peptide2, str): peptide2 = re.split(r"(?<=.)(?=[A-Z])", peptide2) - elif peptide2 is None: - peptide2 = [] - n_aa1, n_aa2 = n_aa1 + len(peptide1), n_aa2 + len(peptide2) + n_aa1 += 0 if peptide1 is None else len(peptide1) + n_aa2 += 0 if peptide2 is None else len(peptide2) aa_matches_batch.append( aa_match( peptide1, diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 615522e1..d8d5185a 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -178,7 +178,7 @@ def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None: else: seq_pred.append(None) - aa_precision, _, pep_precision = aa_match_metrics( + aa_precision, aa_recall, pep_precision = aa_match_metrics( *aa_match_batch( seq_true, seq_pred, @@ -187,6 +187,7 @@ def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None: ) logger.info("Peptide Precision: %.2f%%", 100 * pep_precision) logger.info("Amino Acid Precision: %.2f%%", 100 * aa_precision) + logger.info("Amino Acid Recall: %.2f%%", 100 * aa_recall) def predict( self, diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py index c5442e2a..9f7bd957 100644 --- a/tests/unit_tests/test_runner.py +++ b/tests/unit_tests/test_runner.py @@ -334,10 +334,12 @@ def get_mock_psm(sequence, spectrum_id): mock_index = get_mock_index(act_psms) runner.log_metrics(mock_index) - pep_precision = mock_logger.info.call_args_list[-2][0][1] - aa_precision = mock_logger.info.call_args_list[-1][0][1] + pep_precision = mock_logger.info.call_args_list[-3][0][1] + aa_precision = mock_logger.info.call_args_list[-2][0][1] + aa_recall = mock_logger.info.call_args_list[-1][0][1] assert pep_precision == pytest.approx(100) assert aa_precision == pytest.approx(100) + assert aa_recall == pytest.approx(100) # Test 50% peptide precision (one wrong) infer_psms = [ @@ -354,10 +356,12 @@ def get_mock_psm(sequence, spectrum_id): mock_index = get_mock_index(act_psms) runner.log_metrics(mock_index) - pep_precision = mock_logger.info.call_args_list[-2][0][1] - aa_precision = mock_logger.info.call_args_list[-1][0][1] + pep_precision = mock_logger.info.call_args_list[-3][0][1] + aa_precision = mock_logger.info.call_args_list[-2][0][1] + aa_recall = mock_logger.info.call_args_list[-1][0][1] assert pep_precision == pytest.approx(100 * (1 / 2)) assert aa_precision == pytest.approx(100 * (5 / 6)) + assert aa_recall == pytest.approx(100 * (5 / 6)) # Test skipped spectra act_psms = [ @@ -379,10 +383,12 @@ def get_mock_psm(sequence, spectrum_id): mock_index = get_mock_index(act_psms) runner.log_metrics(mock_index) - pep_precision = mock_logger.info.call_args_list[-2][0][1] - aa_precision = mock_logger.info.call_args_list[-1][0][1] + pep_precision = mock_logger.info.call_args_list[-3][0][1] + aa_precision = mock_logger.info.call_args_list[-2][0][1] + aa_recall = mock_logger.info.call_args_list[-1][0][1] assert pep_precision == pytest.approx(100 * (4 / 5)) assert aa_precision == pytest.approx(100) + assert aa_recall == pytest.approx(100 * (4 / 5)) infer_psms = [ get_mock_psm("PEP", ("foo", "index=1")), @@ -395,10 +401,12 @@ def get_mock_psm(sequence, spectrum_id): mock_index = get_mock_index(act_psms) runner.log_metrics(mock_index) - pep_precision = mock_logger.info.call_args_list[-2][0][1] - aa_precision = mock_logger.info.call_args_list[-1][0][1] + pep_precision = mock_logger.info.call_args_list[-3][0][1] + aa_precision = mock_logger.info.call_args_list[-2][0][1] + aa_recall = mock_logger.info.call_args_list[-1][0][1] assert pep_precision == pytest.approx(100 * (4 / 5)) assert aa_precision == pytest.approx(100) + assert aa_recall == pytest.approx(100 * (4 / 5)) infer_psms = [ get_mock_psm("PEP", ("foo", "index=1")), @@ -409,10 +417,12 @@ def get_mock_psm(sequence, spectrum_id): mock_index = get_mock_index(act_psms) runner.log_metrics(mock_index) - pep_precision = mock_logger.info.call_args_list[-2][0][1] - aa_precision = mock_logger.info.call_args_list[-1][0][1] + pep_precision = mock_logger.info.call_args_list[-3][0][1] + aa_precision = mock_logger.info.call_args_list[-2][0][1] + aa_recall = mock_logger.info.call_args_list[-1][0][1] assert pep_precision == pytest.approx(100 * (2 / 5)) assert aa_precision == pytest.approx(100) + assert aa_recall == pytest.approx(100 * (2 / 5)) infer_psms = [ get_mock_psm("PEP", ("foo", "index=1")), @@ -423,10 +433,12 @@ def get_mock_psm(sequence, spectrum_id): mock_index = get_mock_index(act_psms) runner.log_metrics(mock_index) - pep_precision = mock_logger.info.call_args_list[-2][0][1] - aa_precision = mock_logger.info.call_args_list[-1][0][1] + pep_precision = mock_logger.info.call_args_list[-3][0][1] + aa_precision = mock_logger.info.call_args_list[-2][0][1] + aa_recall = mock_logger.info.call_args_list[-1][0][1] assert pep_precision == pytest.approx(100 * (2 / 5)) assert aa_precision == pytest.approx(100) + assert aa_recall == pytest.approx(100 * (2 / 5)) # Test un-inferred spectra act_psms = [ @@ -448,7 +460,9 @@ def get_mock_psm(sequence, spectrum_id): mock_index = get_mock_index(act_psms) runner.log_metrics(mock_index) - pep_precision = mock_logger.info.call_args_list[-2][0][1] - aa_precision = mock_logger.info.call_args_list[-1][0][1] + pep_precision = mock_logger.info.call_args_list[-3][0][1] + aa_precision = mock_logger.info.call_args_list[-2][0][1] + aa_recall = mock_logger.info.call_args_list[-1][0][1] assert pep_precision == pytest.approx(0) assert aa_precision == pytest.approx(100) + assert aa_recall == pytest.approx(100 * (2 / 3)) From 217eeb8b8a3963b74b39f80cfd0e23bc451eb373 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Mon, 23 Sep 2024 14:08:41 -0700 Subject: [PATCH 11/16] top_match eval metrics warning --- casanovo/denovo/model_runner.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index d8d5185a..ae906648 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -185,6 +185,13 @@ def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None: depthcharge.masses.PeptideMass().masses, ) ) + + if self.config["top_match"] > 1: + logger.warning( + "The behavior for calculating evaluation metrics is undefined when " + "the 'top_match' configuration option is set to a value greater than 1." + ) + logger.info("Peptide Precision: %.2f%%", 100 * pep_precision) logger.info("Amino Acid Precision: %.2f%%", 100 * aa_precision) logger.info("Amino Acid Recall: %.2f%%", 100 * aa_recall) From 3b2758232ba5a2ba2002ab7ec93c888740647a57 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Tue, 17 Sep 2024 15:13:58 -0700 Subject: [PATCH 12/16] removed unused import --- tests/unit_tests/test_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py index dfc9c3a6..b57e7475 100644 --- a/tests/unit_tests/test_runner.py +++ b/tests/unit_tests/test_runner.py @@ -8,7 +8,7 @@ from casanovo.config import Config from casanovo.denovo.model_runner import ModelRunner -from casanovo.data.ms_io import PepSpecMatch, MztabWriter +from casanovo.data.ms_io import PepSpecMatch def test_initialize_model(tmp_path, mgf_small): From 4e89028eb5548cb23854c69d81da8937738a5f5f Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Thu, 19 Sep 2024 14:18:32 -0700 Subject: [PATCH 13/16] log metrics refactor, additional log metrics test case --- casanovo/data/ms_io.py | 2 +- casanovo/data/{pep_spec_match.py => psm.py} | 0 casanovo/denovo/model.py | 4 +- casanovo/denovo/model_runner.py | 33 ++++++-------- casanovo/utils.py | 2 +- tests/unit_tests/test_runner.py | 49 ++++++++++++++------- 6 files changed, 50 insertions(+), 40 deletions(-) rename casanovo/data/{pep_spec_match.py => psm.py} (100%) diff --git a/casanovo/data/ms_io.py b/casanovo/data/ms_io.py index 389c94e5..50932399 100644 --- a/casanovo/data/ms_io.py +++ b/casanovo/data/ms_io.py @@ -12,7 +12,7 @@ from .. import __version__ from ..config import Config -from .pep_spec_match import PepSpecMatch +from .psm import PepSpecMatch class MztabWriter: diff --git a/casanovo/data/pep_spec_match.py b/casanovo/data/psm.py similarity index 100% rename from casanovo/data/pep_spec_match.py rename to casanovo/data/psm.py diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 3a1e3d2c..ce8621d8 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -16,7 +16,7 @@ from . import evaluate from .. import config -from ..data import ms_io, pep_spec_match +from ..data import ms_io, psm logger = logging.getLogger("casanovo") @@ -914,7 +914,7 @@ def on_predict_batch_end( if len(peptide) == 0: continue self.out_writer.psms.append( - pep_spec_match.PepSpecMatch( + psm.PepSpecMatch( sequence=peptide, spectrum_id=tuple(spectrum_i), peptide_score=peptide_score, diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 6fb8cb95..03cab931 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -163,32 +163,25 @@ def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None: Index containing the annotated spectra used to generate model predictions """ - model_output = [] - spectrum_annotations = [] - psms = iter(self.writer.psms) - curr_psm = next(psms, None) + seq_pred = [] + seq_true = [] + pred_idx = 0 with test_index as t_ind: - for i in range(t_ind.n_spectra): - if curr_psm is None: - break - - spectrum_annotations.append(t_ind[i][4]) - if curr_psm.spectrum_id == t_ind.get_spectrum_id(i): - model_output.append(curr_psm.sequence) - curr_psm = next(psms, None) + for true_idx in range(t_ind.n_spectra): + seq_true.append(t_ind[true_idx][4]) + if pred_idx < len(self.writer.psms) and self.writer.psms[ + pred_idx + ].spectrum_id == t_ind.get_spectrum_id(true_idx): + seq_pred.append(self.writer.psms[pred_idx].sequence) + pred_idx += 1 else: - model_output.append("") - - if curr_psm is not None: - logger.warning( - "Some spectra were not matched to annotations during evaluation." - ) + seq_pred.append("") aa_precision, _, pep_precision = aa_match_metrics( *aa_match_batch( - spectrum_annotations, - model_output, + seq_true, + seq_pred, depthcharge.masses.PeptideMass().masses, ) ) diff --git a/casanovo/utils.py b/casanovo/utils.py index 1646c5cc..43b1cb7d 100644 --- a/casanovo/utils.py +++ b/casanovo/utils.py @@ -15,7 +15,7 @@ import psutil import torch -from .data.pep_spec_match import PepSpecMatch +from .data.psm import PepSpecMatch SCORE_BINS = [0.0, 0.5, 0.9, 0.95, 0.99] diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py index b57e7475..01966b36 100644 --- a/tests/unit_tests/test_runner.py +++ b/tests/unit_tests/test_runner.py @@ -7,8 +7,8 @@ import torch from casanovo.config import Config +from casanovo.data.psm import PepSpecMatch from casanovo.denovo.model_runner import ModelRunner -from casanovo.data.ms_io import PepSpecMatch def test_initialize_model(tmp_path, mgf_small): @@ -287,8 +287,6 @@ def test_evaluate( def test_log_metrics(monkeypatch, tiny_config): - TEST_EPSILON = 10**-5 - def get_mock_index(psm_list): mock_test_index = unittest.mock.MagicMock() mock_test_index.__enter__.return_value = mock_test_index @@ -338,8 +336,8 @@ def get_mock_psm(sequence, spectrum_id): pep_precision = mock_logger.info.call_args_list[-2][0][1] aa_precision = mock_logger.info.call_args_list[-1][0][1] - assert abs(pep_precision - 100) < TEST_EPSILON - assert abs(aa_precision - 100) < TEST_EPSILON + assert pep_precision == pytest.approx(100) + assert aa_precision == pytest.approx(100) # Test 50% peptide precision (one wrong) infer_psms = [ @@ -358,8 +356,8 @@ def get_mock_psm(sequence, spectrum_id): pep_precision = mock_logger.info.call_args_list[-2][0][1] aa_precision = mock_logger.info.call_args_list[-1][0][1] - assert abs(pep_precision - 100 * (1 / 2)) < TEST_EPSILON - assert abs(aa_precision - 100 * (5 / 6)) < TEST_EPSILON + assert pep_precision == pytest.approx(100 * (1 / 2)) + assert aa_precision == pytest.approx(100 * (5 / 6)) # Test skipped spectra act_psms = [ @@ -383,8 +381,32 @@ def get_mock_psm(sequence, spectrum_id): pep_precision = mock_logger.info.call_args_list[-2][0][1] aa_precision = mock_logger.info.call_args_list[-1][0][1] - assert abs(pep_precision - 100 * (4 / 5)) < TEST_EPSILON - assert abs(aa_precision - 100 * (12 / 13)) < TEST_EPSILON + assert pep_precision == pytest.approx(100 * (4 / 5)) + assert aa_precision == pytest.approx(100 * (12 / 13)) + + act_psms = [ + get_mock_psm("PEP", ("foo", "index=1")), + get_mock_psm("PET", ("foo", "index=2")), + get_mock_psm("PEI", ("foo", "index=3")), + get_mock_psm("PEG", ("foo", "index=4")), + get_mock_psm("PEA", ("foo", "index=5")), + ] + + infer_psms = [ + get_mock_psm("PEP", ("foo", "index=1")), + get_mock_psm("PET", ("foo", "index=2")), + get_mock_psm("PEI", ("foo", "index=3")), + get_mock_psm("PEG", ("foo", "index=4")), + ] + + runner.writer.psms = infer_psms + mock_index = get_mock_index(act_psms) + runner.log_metrics(mock_index) + + pep_precision = mock_logger.info.call_args_list[-2][0][1] + aa_precision = mock_logger.info.call_args_list[-1][0][1] + assert pep_precision == pytest.approx(100 * (4 / 5)) + assert aa_precision == pytest.approx(100 * (12 / 13)) # Test un-inferred spectra act_psms = [ @@ -408,10 +430,5 @@ def get_mock_psm(sequence, spectrum_id): pep_precision = mock_logger.info.call_args_list[-2][0][1] aa_precision = mock_logger.info.call_args_list[-1][0][1] - last_warning_msg = mock_logger.warning.call_args_list[-1][0][0] - assert abs(pep_precision) < TEST_EPSILON - assert abs(aa_precision - 100) < TEST_EPSILON - assert ( - last_warning_msg - == "Some spectra were not matched to annotations during evaluation." - ) + assert pep_precision == pytest.approx(0) + assert aa_precision == pytest.approx(100) From 64a681f6c8a0959d3cbd4c3c8dd36fd0d27f7ab9 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Fri, 20 Sep 2024 13:53:20 -0700 Subject: [PATCH 14/16] aa_match_batch hanles none, additional skipped spectra test cases --- casanovo/denovo/evaluate.py | 6 ++++++ casanovo/denovo/model_runner.py | 2 +- tests/unit_tests/test_runner.py | 34 ++++++++++++++++++++++++++------- 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/casanovo/denovo/evaluate.py b/casanovo/denovo/evaluate.py index cbf9e74f..30540373 100644 --- a/casanovo/denovo/evaluate.py +++ b/casanovo/denovo/evaluate.py @@ -225,8 +225,14 @@ def aa_match_batch( # Split peptides into individual AAs if necessary. if isinstance(peptide1, str): peptide1 = re.split(r"(?<=.)(?=[A-Z])", peptide1) + elif peptide1 is None: + peptide1 = [] + if isinstance(peptide2, str): peptide2 = re.split(r"(?<=.)(?=[A-Z])", peptide2) + elif peptide2 is None: + peptide2 = [] + n_aa1, n_aa2 = n_aa1 + len(peptide1), n_aa2 + len(peptide2) aa_matches_batch.append( aa_match( diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 03cab931..615522e1 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -176,7 +176,7 @@ def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None: seq_pred.append(self.writer.psms[pred_idx].sequence) pred_idx += 1 else: - seq_pred.append("") + seq_pred.append(None) aa_precision, _, pep_precision = aa_match_metrics( *aa_match_batch( diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py index 01966b36..c5442e2a 100644 --- a/tests/unit_tests/test_runner.py +++ b/tests/unit_tests/test_runner.py @@ -382,21 +382,27 @@ def get_mock_psm(sequence, spectrum_id): pep_precision = mock_logger.info.call_args_list[-2][0][1] aa_precision = mock_logger.info.call_args_list[-1][0][1] assert pep_precision == pytest.approx(100 * (4 / 5)) - assert aa_precision == pytest.approx(100 * (12 / 13)) + assert aa_precision == pytest.approx(100) - act_psms = [ + infer_psms = [ get_mock_psm("PEP", ("foo", "index=1")), get_mock_psm("PET", ("foo", "index=2")), get_mock_psm("PEI", ("foo", "index=3")), get_mock_psm("PEG", ("foo", "index=4")), - get_mock_psm("PEA", ("foo", "index=5")), ] + runner.writer.psms = infer_psms + mock_index = get_mock_index(act_psms) + runner.log_metrics(mock_index) + + pep_precision = mock_logger.info.call_args_list[-2][0][1] + aa_precision = mock_logger.info.call_args_list[-1][0][1] + assert pep_precision == pytest.approx(100 * (4 / 5)) + assert aa_precision == pytest.approx(100) + infer_psms = [ get_mock_psm("PEP", ("foo", "index=1")), - get_mock_psm("PET", ("foo", "index=2")), get_mock_psm("PEI", ("foo", "index=3")), - get_mock_psm("PEG", ("foo", "index=4")), ] runner.writer.psms = infer_psms @@ -405,8 +411,22 @@ def get_mock_psm(sequence, spectrum_id): pep_precision = mock_logger.info.call_args_list[-2][0][1] aa_precision = mock_logger.info.call_args_list[-1][0][1] - assert pep_precision == pytest.approx(100 * (4 / 5)) - assert aa_precision == pytest.approx(100 * (12 / 13)) + assert pep_precision == pytest.approx(100 * (2 / 5)) + assert aa_precision == pytest.approx(100) + + infer_psms = [ + get_mock_psm("PEP", ("foo", "index=1")), + get_mock_psm("PEA", ("foo", "index=5")), + ] + + runner.writer.psms = infer_psms + mock_index = get_mock_index(act_psms) + runner.log_metrics(mock_index) + + pep_precision = mock_logger.info.call_args_list[-2][0][1] + aa_precision = mock_logger.info.call_args_list[-1][0][1] + assert pep_precision == pytest.approx(100 * (2 / 5)) + assert aa_precision == pytest.approx(100) # Test un-inferred spectra act_psms = [ From a3d57637b59f2eddb4a062c8aa856154091e44bb Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Mon, 23 Sep 2024 13:58:22 -0700 Subject: [PATCH 15/16] aa_match_batch and aa_match handle None --- casanovo/denovo/evaluate.py | 22 +++++++++-------- casanovo/denovo/model_runner.py | 3 ++- tests/unit_tests/test_runner.py | 42 ++++++++++++++++++++++----------- 3 files changed, 42 insertions(+), 25 deletions(-) diff --git a/casanovo/denovo/evaluate.py b/casanovo/denovo/evaluate.py index 30540373..ab56d8cf 100644 --- a/casanovo/denovo/evaluate.py +++ b/casanovo/denovo/evaluate.py @@ -127,8 +127,8 @@ def aa_match_prefix_suffix( def aa_match( - peptide1: List[str], - peptide2: List[str], + peptide1: List[str] | None, + peptide2: List[str] | None, aa_dict: Dict[str, float], cum_mass_threshold: float = 0.5, ind_mass_threshold: float = 0.1, @@ -139,9 +139,9 @@ def aa_match( Parameters ---------- - peptide1 : List[str] + peptide1 : List[str] | None, The first tokenized peptide sequence to be compared. - peptide2 : List[str] + peptide2 : List[str] | None The second tokenized peptide sequence to be compared. aa_dict : Dict[str, float] Mapping of amino acid tokens to their mass values. @@ -161,7 +161,12 @@ def aa_match( pep_match : bool Boolean flag to indicate whether the two peptide sequences fully match. """ - if mode == "best": + if peptide1 is None and peptide2 is None: + return np.array([], dtype=bool), False + elif (peptide1 is None) != (peptide2 is None): + peptide = peptide1 if peptide2 is None else peptide2 + return np.array([False] * len(peptide)), False + elif mode == "best": return aa_match_prefix_suffix( peptide1, peptide2, aa_dict, cum_mass_threshold, ind_mass_threshold ) @@ -225,15 +230,12 @@ def aa_match_batch( # Split peptides into individual AAs if necessary. if isinstance(peptide1, str): peptide1 = re.split(r"(?<=.)(?=[A-Z])", peptide1) - elif peptide1 is None: - peptide1 = [] if isinstance(peptide2, str): peptide2 = re.split(r"(?<=.)(?=[A-Z])", peptide2) - elif peptide2 is None: - peptide2 = [] - n_aa1, n_aa2 = n_aa1 + len(peptide1), n_aa2 + len(peptide2) + n_aa1 += 0 if peptide1 is None else len(peptide1) + n_aa2 += 0 if peptide2 is None else len(peptide2) aa_matches_batch.append( aa_match( peptide1, diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 615522e1..d8d5185a 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -178,7 +178,7 @@ def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None: else: seq_pred.append(None) - aa_precision, _, pep_precision = aa_match_metrics( + aa_precision, aa_recall, pep_precision = aa_match_metrics( *aa_match_batch( seq_true, seq_pred, @@ -187,6 +187,7 @@ def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None: ) logger.info("Peptide Precision: %.2f%%", 100 * pep_precision) logger.info("Amino Acid Precision: %.2f%%", 100 * aa_precision) + logger.info("Amino Acid Recall: %.2f%%", 100 * aa_recall) def predict( self, diff --git a/tests/unit_tests/test_runner.py b/tests/unit_tests/test_runner.py index c5442e2a..9f7bd957 100644 --- a/tests/unit_tests/test_runner.py +++ b/tests/unit_tests/test_runner.py @@ -334,10 +334,12 @@ def get_mock_psm(sequence, spectrum_id): mock_index = get_mock_index(act_psms) runner.log_metrics(mock_index) - pep_precision = mock_logger.info.call_args_list[-2][0][1] - aa_precision = mock_logger.info.call_args_list[-1][0][1] + pep_precision = mock_logger.info.call_args_list[-3][0][1] + aa_precision = mock_logger.info.call_args_list[-2][0][1] + aa_recall = mock_logger.info.call_args_list[-1][0][1] assert pep_precision == pytest.approx(100) assert aa_precision == pytest.approx(100) + assert aa_recall == pytest.approx(100) # Test 50% peptide precision (one wrong) infer_psms = [ @@ -354,10 +356,12 @@ def get_mock_psm(sequence, spectrum_id): mock_index = get_mock_index(act_psms) runner.log_metrics(mock_index) - pep_precision = mock_logger.info.call_args_list[-2][0][1] - aa_precision = mock_logger.info.call_args_list[-1][0][1] + pep_precision = mock_logger.info.call_args_list[-3][0][1] + aa_precision = mock_logger.info.call_args_list[-2][0][1] + aa_recall = mock_logger.info.call_args_list[-1][0][1] assert pep_precision == pytest.approx(100 * (1 / 2)) assert aa_precision == pytest.approx(100 * (5 / 6)) + assert aa_recall == pytest.approx(100 * (5 / 6)) # Test skipped spectra act_psms = [ @@ -379,10 +383,12 @@ def get_mock_psm(sequence, spectrum_id): mock_index = get_mock_index(act_psms) runner.log_metrics(mock_index) - pep_precision = mock_logger.info.call_args_list[-2][0][1] - aa_precision = mock_logger.info.call_args_list[-1][0][1] + pep_precision = mock_logger.info.call_args_list[-3][0][1] + aa_precision = mock_logger.info.call_args_list[-2][0][1] + aa_recall = mock_logger.info.call_args_list[-1][0][1] assert pep_precision == pytest.approx(100 * (4 / 5)) assert aa_precision == pytest.approx(100) + assert aa_recall == pytest.approx(100 * (4 / 5)) infer_psms = [ get_mock_psm("PEP", ("foo", "index=1")), @@ -395,10 +401,12 @@ def get_mock_psm(sequence, spectrum_id): mock_index = get_mock_index(act_psms) runner.log_metrics(mock_index) - pep_precision = mock_logger.info.call_args_list[-2][0][1] - aa_precision = mock_logger.info.call_args_list[-1][0][1] + pep_precision = mock_logger.info.call_args_list[-3][0][1] + aa_precision = mock_logger.info.call_args_list[-2][0][1] + aa_recall = mock_logger.info.call_args_list[-1][0][1] assert pep_precision == pytest.approx(100 * (4 / 5)) assert aa_precision == pytest.approx(100) + assert aa_recall == pytest.approx(100 * (4 / 5)) infer_psms = [ get_mock_psm("PEP", ("foo", "index=1")), @@ -409,10 +417,12 @@ def get_mock_psm(sequence, spectrum_id): mock_index = get_mock_index(act_psms) runner.log_metrics(mock_index) - pep_precision = mock_logger.info.call_args_list[-2][0][1] - aa_precision = mock_logger.info.call_args_list[-1][0][1] + pep_precision = mock_logger.info.call_args_list[-3][0][1] + aa_precision = mock_logger.info.call_args_list[-2][0][1] + aa_recall = mock_logger.info.call_args_list[-1][0][1] assert pep_precision == pytest.approx(100 * (2 / 5)) assert aa_precision == pytest.approx(100) + assert aa_recall == pytest.approx(100 * (2 / 5)) infer_psms = [ get_mock_psm("PEP", ("foo", "index=1")), @@ -423,10 +433,12 @@ def get_mock_psm(sequence, spectrum_id): mock_index = get_mock_index(act_psms) runner.log_metrics(mock_index) - pep_precision = mock_logger.info.call_args_list[-2][0][1] - aa_precision = mock_logger.info.call_args_list[-1][0][1] + pep_precision = mock_logger.info.call_args_list[-3][0][1] + aa_precision = mock_logger.info.call_args_list[-2][0][1] + aa_recall = mock_logger.info.call_args_list[-1][0][1] assert pep_precision == pytest.approx(100 * (2 / 5)) assert aa_precision == pytest.approx(100) + assert aa_recall == pytest.approx(100 * (2 / 5)) # Test un-inferred spectra act_psms = [ @@ -448,7 +460,9 @@ def get_mock_psm(sequence, spectrum_id): mock_index = get_mock_index(act_psms) runner.log_metrics(mock_index) - pep_precision = mock_logger.info.call_args_list[-2][0][1] - aa_precision = mock_logger.info.call_args_list[-1][0][1] + pep_precision = mock_logger.info.call_args_list[-3][0][1] + aa_precision = mock_logger.info.call_args_list[-2][0][1] + aa_recall = mock_logger.info.call_args_list[-1][0][1] assert pep_precision == pytest.approx(0) assert aa_precision == pytest.approx(100) + assert aa_recall == pytest.approx(100 * (2 / 3)) From 8be20abeb707c23619e9c396db5b33ceea01b678 Mon Sep 17 00:00:00 2001 From: Lilferrit Date: Mon, 23 Sep 2024 14:08:41 -0700 Subject: [PATCH 16/16] top_match eval metrics warning --- casanovo/denovo/model_runner.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index d8d5185a..ae906648 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -185,6 +185,13 @@ def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None: depthcharge.masses.PeptideMass().masses, ) ) + + if self.config["top_match"] > 1: + logger.warning( + "The behavior for calculating evaluation metrics is undefined when " + "the 'top_match' configuration option is set to a value greater than 1." + ) + logger.info("Peptide Precision: %.2f%%", 100 * pep_precision) logger.info("Amino Acid Precision: %.2f%%", 100 * aa_precision) logger.info("Amino Acid Recall: %.2f%%", 100 * aa_recall)