diff --git a/casanovo/data/ms_io.py b/casanovo/data/ms_io.py index da8f603c..d1e937f9 100644 --- a/casanovo/data/ms_io.py +++ b/casanovo/data/ms_io.py @@ -2,11 +2,12 @@ import collections import csv +import dataclasses import operator import os import re from pathlib import Path -from typing import List +from typing import List, Tuple, Iterable import natsort @@ -14,6 +15,43 @@ from ..config import Config +@dataclasses.dataclass +class PepSpecMatch: + """ + Peptide Spectrum Match (PSM) dataclass + + Parameters + ---------- + sequence : str + The amino acid sequence of the peptide. + spectrum_id : Tuple[str, str] + A tuple containing the spectrum identifier in the form + (spectrum file name, spectrum file idx) + peptide_score : float + Score of the match between the full peptide sequence and the + spectrum. + charge : int + The precursor charge state of the peptide ion observed in the spectrum. + calc_mz : float + The calculated mass-to-charge ratio (m/z) of the peptide based on its + sequence and charge state. + exp_mz : float + The observed (experimental) precursor mass-to-charge ratio (m/z) of the + peptide as detected in the spectrum. + aa_scores : Iterable[float] + A list of scores for individual amino acids in the peptide + sequence, where len(aa_scores) == len(sequence) + """ + + sequence: str + spectrum_id: Tuple[str, str] + peptide_score: float + charge: int + calc_mz: float + exp_mz: float + aa_scores: Iterable[float] + + class MztabWriter: """ Export spectrum identifications to an mzTab file. @@ -42,7 +80,7 @@ def __init__(self, filename: str): ), ] self._run_map = {} - self.psms = [] + self.psms: List[PepSpecMatch] = [] def set_metadata(self, config: Config, **kwargs) -> None: """ @@ -178,34 +216,39 @@ def save(self) -> None: ] ) for i, psm in enumerate( - natsort.natsorted(self.psms, key=operator.itemgetter(1)), 1 + natsort.natsorted( + self.psms, key=operator.attrgetter("spectrum_id") + ), + 1, ): - filename, idx = os.path.abspath(psm[1][0]), psm[1][1] + filename = os.path.abspath(psm.spectrum_id[0]) + idx = psm.spectrum_id[1] writer.writerow( [ "PSM", - psm[0], # sequence + psm.sequence, # sequence i, # PSM_ID "null", # accession "null", # unique "null", # database "null", # database_version f"[MS, MS:1003281, Casanovo, {__version__}]", - psm[2], # search_engine_score[1] + psm.peptide_score, # search_engine_score[1] # FIXME: Modifications should be specified as # controlled vocabulary terms. "null", # modifications # FIXME: Can we get the retention time from the data # loader? "null", # retention_time - int(psm[3]), # charge - psm[4], # exp_mass_to_charge - psm[5], # calc_mass_to_charge + psm.charge, # charge + psm.exp_mz, # exp_mass_to_charge + psm.calc_mz, # calc_mass_to_charge f"ms_run[{self._run_map[filename]}]:{idx}", "null", # pre "null", # post "null", # start "null", # end - psm[6], # opt_ms_run[1]_aa_scores + # opt_ms_run[1]_aa_scores + ",".join(list(map("{:.5f}".format, psm.aa_scores))), ] ) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 77df6df5..6e984a1d 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -914,15 +914,15 @@ def on_predict_batch_end( if len(peptide) == 0: continue self.out_writer.psms.append( - ( - peptide, - tuple(spectrum_i), - peptide_score, - charge, - precursor_mz, - self.peptide_mass_calculator.mass(peptide, charge), - ",".join(list(map("{:.5f}".format, aa_scores))), - ), + ms_io.PepSpecMatch( + sequence=peptide, + spectrum_id=tuple(spectrum_i), + peptide_score=peptide_score, + charge=int(charge), + calc_mz=precursor_mz, + exp_mz=self.peptide_mass_calculator.mass(peptide, charge), + aa_scores=aa_scores, + ) ) def _log_history(self) -> None: diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index b88c5542..0f39fe46 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -132,7 +132,7 @@ def log_metrics(self, test_index: AnnotatedSpectrumIndex) -> None: Index containing the annotated spectra used to generate model predictions """ - model_output = [psm[0] for psm in self.writer.psms] + model_output = [psm.sequence for psm in self.writer.psms] spectrum_annotations = [ test_index[i][4] for i in range(test_index.n_spectra) ] diff --git a/casanovo/utils.py b/casanovo/utils.py index fde6cd05..0d2698ce 100644 --- a/casanovo/utils.py +++ b/casanovo/utils.py @@ -1,13 +1,11 @@ """Small utility functions""" -import heapq import logging import os import platform import re import socket import sys -import time from datetime import datetime from typing import Tuple, Dict, List, Optional @@ -16,6 +14,8 @@ import psutil import torch +from .data.ms_io import PepSpecMatch + SCORE_BINS = [0.0, 0.5, 0.9, 0.95, 0.99] @@ -195,7 +195,7 @@ def log_run_report( def log_sequencing_report( - predictions: Tuple[str, Tuple[str, str], float, float, float, float, str], + predictions: List[PepSpecMatch], start_time: Optional[int] = None, end_time: Optional[int] = None, score_bins: List[float] = SCORE_BINS, @@ -219,8 +219,8 @@ def log_sequencing_report( run_report = get_report_dict( pd.DataFrame( { - "sequence": [psm[0] for psm in predictions], - "score": [psm[2] for psm in predictions], + "sequence": [psm.sequence for psm in predictions], + "score": [psm.peptide_score for psm in predictions], } ), score_bins=score_bins,