diff --git a/casanovo/data/datasets.py b/casanovo/data/datasets.py
deleted file mode 100644
index 3917a2c8..00000000
--- a/casanovo/data/datasets.py
+++ /dev/null
@@ -1,269 +0,0 @@
-"""A PyTorch Dataset class for annotated spectra."""
-
-from typing import Optional, Tuple
-
-import depthcharge
-import numpy as np
-import spectrum_utils.spectrum as sus
-import torch
-from torch.utils.data import Dataset
-
-
-class SpectrumDataset(Dataset):
-    """
-    Parse and retrieve collections of MS/MS spectra.
-
-    Parameters
-    ----------
-    spectrum_index : depthcharge.data.SpectrumIndex
-        The MS/MS spectra to use as a dataset.
-    n_peaks : Optional[int]
-        The number of top-n most intense peaks to keep in each spectrum. `None`
-        retains all peaks.
-    min_mz : float
-        The minimum m/z to include. The default is 140 m/z, in order to exclude
-        TMT and iTRAQ reporter ions.
-    max_mz : float
-        The maximum m/z to include.
-    min_intensity : float
-        Remove peaks whose intensity is below `min_intensity` percentage of the
-        base peak intensity.
-    remove_precursor_tol : float
-        Remove peaks within the given mass tolerance in Dalton around the
-        precursor mass.
-    random_state : Optional[int]
-        The NumPy random state. ``None`` leaves mass spectra in the order they
-        were parsed.
-    """
-
-    def __init__(
-        self,
-        spectrum_index: depthcharge.data.SpectrumIndex,
-        n_peaks: int = 150,
-        min_mz: float = 140.0,
-        max_mz: float = 2500.0,
-        min_intensity: float = 0.01,
-        remove_precursor_tol: float = 2.0,
-        random_state: Optional[int] = None,
-    ):
-        """Initialize a SpectrumDataset"""
-        super().__init__()
-        self.n_peaks = n_peaks
-        self.min_mz = min_mz
-        self.max_mz = max_mz
-        self.min_intensity = min_intensity
-        self.remove_precursor_tol = remove_precursor_tol
-        self.rng = np.random.default_rng(random_state)
-        self._index = spectrum_index
-
-    def __len__(self) -> int:
-        """The number of spectra."""
-        return self.n_spectra
-
-    def __getitem__(
-        self, idx
-    ) -> Tuple[torch.Tensor, float, int, Tuple[str, str]]:
-        """
-        Return the MS/MS spectrum with the given index.
-
-        Parameters
-        ----------
-        idx : int
-            The index of the spectrum to return.
-
-        Returns
-        -------
-        spectrum : torch.Tensor of shape (n_peaks, 2)
-            A tensor of the spectrum with the m/z and intensity peak values.
-        precursor_mz : float
-            The precursor m/z.
-        precursor_charge : int
-            The precursor charge.
-        spectrum_id: Tuple[str, str]
-            The unique spectrum identifier, formed by its original peak file and
-            identifier (index or scan number) therein.
-        """
-        mz_array, int_array, precursor_mz, precursor_charge = self.index[idx][
-            :4
-        ]
-        spectrum = self._process_peaks(
-            mz_array, int_array, precursor_mz, precursor_charge
-        )
-        return (
-            spectrum,
-            precursor_mz,
-            precursor_charge,
-            self.get_spectrum_id(idx),
-        )
-
-    def get_spectrum_id(self, idx: int) -> Tuple[str, str]:
-        """
-        Return the identifier of the MS/MS spectrum with the given index.
-
-        Parameters
-        ----------
-        idx : int
-            The index of the MS/MS spectrum within the SpectrumIndex.
-
-        Returns
-        -------
-        ms_data_file : str
-            The peak file from which the MS/MS spectrum was originally parsed.
-        identifier : str
-            The MS/MS spectrum identifier, per PSI recommendations.
-        """
-        with self.index:
-            return self.index.get_spectrum_id(idx)
-
-    def _process_peaks(
-        self,
-        mz_array: np.ndarray,
-        int_array: np.ndarray,
-        precursor_mz: float,
-        precursor_charge: int,
-    ) -> torch.Tensor:
-        """
-        Preprocess the spectrum by removing noise peaks and scaling the peak
-        intensities.
-
-        Parameters
-        ----------
-        mz_array : numpy.ndarray of shape (n_peaks,)
-            The spectrum peak m/z values.
-        int_array : numpy.ndarray of shape (n_peaks,)
-            The spectrum peak intensity values.
-        precursor_mz : float
-            The precursor m/z.
-        precursor_charge : int
-            The precursor charge.
-
-        Returns
-        -------
-        torch.Tensor of shape (n_peaks, 2)
-            A tensor of the spectrum with the m/z and intensity peak values.
-        """
-        spectrum = sus.MsmsSpectrum(
-            "",
-            precursor_mz,
-            precursor_charge,
-            mz_array.astype(np.float64),
-            int_array.astype(np.float32),
-        )
-        try:
-            spectrum.set_mz_range(self.min_mz, self.max_mz)
-            if len(spectrum.mz) == 0:
-                raise ValueError
-            spectrum.remove_precursor_peak(self.remove_precursor_tol, "Da")
-            if len(spectrum.mz) == 0:
-                raise ValueError
-            spectrum.filter_intensity(self.min_intensity, self.n_peaks)
-            if len(spectrum.mz) == 0:
-                raise ValueError
-            spectrum.scale_intensity("root", 1)
-            intensities = spectrum.intensity / np.linalg.norm(
-                spectrum.intensity
-            )
-            return torch.tensor(np.array([spectrum.mz, intensities])).T.float()
-        except ValueError:
-            # Replace invalid spectra by a dummy spectrum.
-            return torch.tensor([[0, 1]]).float()
-
-    @property
-    def n_spectra(self) -> int:
-        """The total number of spectra."""
-        return self.index.n_spectra
-
-    @property
-    def index(self) -> depthcharge.data.SpectrumIndex:
-        """The underlying SpectrumIndex."""
-        return self._index
-
-    @property
-    def rng(self):
-        """The NumPy random number generator."""
-        return self._rng
-
-    @rng.setter
-    def rng(self, seed):
-        """Set the NumPy random number generator."""
-        self._rng = np.random.default_rng(seed)
-
-
-class AnnotatedSpectrumDataset(SpectrumDataset):
-    """
-    Parse and retrieve collections of annotated MS/MS spectra.
-
-    Parameters
-    ----------
-    annotated_spectrum_index : depthcharge.data.SpectrumIndex
-        The MS/MS spectra to use as a dataset.
-    n_peaks : Optional[int]
-        The number of top-n most intense peaks to keep in each spectrum. `None`
-        retains all peaks.
-    min_mz : float
-        The minimum m/z to include. The default is 140 m/z, in order to exclude
-        TMT and iTRAQ reporter ions.
-    max_mz : float
-        The maximum m/z to include.
-    min_intensity : float
-        Remove peaks whose intensity is below `min_intensity` percentage of the
-        base peak intensity.
-    remove_precursor_tol : float
-        Remove peaks within the given mass tolerance in Dalton around the
-        precursor mass.
-    random_state : Optional[int]
-        The NumPy random state. ``None`` leaves mass spectra in the order they
-        were parsed.
-    """
-
-    def __init__(
-        self,
-        annotated_spectrum_index: depthcharge.data.SpectrumIndex,
-        n_peaks: int = 150,
-        min_mz: float = 140.0,
-        max_mz: float = 2500.0,
-        min_intensity: float = 0.01,
-        remove_precursor_tol: float = 2.0,
-        random_state: Optional[int] = None,
-    ):
-        super().__init__(
-            annotated_spectrum_index,
-            n_peaks=n_peaks,
-            min_mz=min_mz,
-            max_mz=max_mz,
-            min_intensity=min_intensity,
-            remove_precursor_tol=remove_precursor_tol,
-            random_state=random_state,
-        )
-
-    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, float, int, str]:
-        """
-        Return the annotated MS/MS spectrum with the given index.
-
-        Parameters
-        ----------
-        idx : int
-            The index of the spectrum to return.
-
-        Returns
-        -------
-        spectrum : torch.Tensor of shape (n_peaks, 2)
-            A tensor of the spectrum with the m/z and intensity peak values.
-        precursor_mz : float
-            The precursor m/z.
-        precursor_charge : int
-            The precursor charge.
-        annotation : str
-            The peptide annotation of the spectrum.
-        """
-        (
-            mz_array,
-            int_array,
-            precursor_mz,
-            precursor_charge,
-            peptide,
-        ) = self.index[idx]
-        spectrum = self._process_peaks(
-            mz_array, int_array, precursor_mz, precursor_charge
-        )
-        return spectrum, precursor_mz, precursor_charge, peptide
diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py
index d3670930..7d7b1ae9 100644
--- a/casanovo/data/db_utils.py
+++ b/casanovo/data/db_utils.py
@@ -7,7 +7,6 @@
 import string
 from typing import Dict, Iterator, Pattern, Set, Tuple
 
-import depthcharge.masses
 import numpy as np
 import pandas as pd
 import pyteomics.fasta
@@ -53,8 +52,8 @@ class ProteinDatabase:
         A comma-separated string of fixed modifications to consider.
     allowed_var_mods : str
         A comma-separated string of variable modifications to consider.
-    residues : Dict[str, float]
-        A dictionary of amino acid masses.
+    tokenizer : depthcharge.tokenizers.PeptideTokenizer
+        Used to access residues.
     """
 
     def __init__(
@@ -95,13 +94,14 @@ def __init__(
             digestion,
             missed_cleavages,
         )
-        self.db_peptides = self._digest_fasta(peptide_generator)
+        self.db_peptides = self._digest_fasta(peptide_generator, residues)
        self.precursor_tolerance = precursor_tolerance
         self.isotope_error = isotope_error
 
     def _digest_fasta(
         self,
         peptide_generator: Iterator[Tuple[str, str]],
+        residues: Dict[str, float],
     ) -> pd.DataFrame:
         """
         Digests a FASTA file and returns the peptides, their masses,
@@ -148,10 +148,7 @@ def _digest_fasta(
             .reset_index()
         )
         # Calculate the mass of each peptide.
-        mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb")
-        peptides["calc_mass"] = (
-            peptides["peptide"].apply(mass_calculator.mass).round(5)
-        )
+        peptides["calc_mass"] = peptides["peptide"].map(residues).round(5)
         # Sort by peptide mass and index by peptide sequence.
         peptides.sort_values(
             by=["calc_mass", "peptide"], ascending=True, inplace=True
diff --git a/casanovo/data/ms_io.py b/casanovo/data/ms_io.py
index 7b954d71..959c5bf7 100644
--- a/casanovo/data/ms_io.py
+++ b/casanovo/data/ms_io.py
@@ -7,7 +7,6 @@
 import re
 from pathlib import Path
 from typing import List
-import pprint
 
 import natsort
 
 from .. import __version__
diff --git a/casanovo/data/pep_spec_match.py b/casanovo/data/pep_spec_match.py
deleted file mode 100644
index 0dc3c48b..00000000
--- a/casanovo/data/pep_spec_match.py
+++ /dev/null
@@ -1,41 +0,0 @@
-"""Peptide spectrum match dataclass"""
-
-import dataclasses
-from typing import Tuple, Iterable
-
-
-@dataclasses.dataclass
-class PepSpecMatch:
-    """
-    Peptide Spectrum Match (PSM) dataclass
-
-    Parameters
-    ----------
-    sequence : str
-        The amino acid sequence of the peptide.
-    spectrum_id : Tuple[str, str]
-        A tuple containing the spectrum identifier in the form
-        (spectrum file name, spectrum file idx)
-    peptide_score : float
-        Score of the match between the full peptide sequence and the
-        spectrum.
-    charge : int
-        The precursor charge state of the peptide ion observed in the spectrum.
-    calc_mz : float
-        The calculated mass-to-charge ratio (m/z) of the peptide based on its
-        sequence and charge state.
-    exp_mz : float
-        The observed (experimental) precursor mass-to-charge ratio (m/z) of the
-        peptide as detected in the spectrum.
-    aa_scores : Iterable[float]
-        A list of scores for individual amino acids in the peptide
-        sequence, where len(aa_scores) == len(sequence)
-    """
-
-    sequence: str
-    spectrum_id: Tuple[str, str]
-    peptide_score: float
-    charge: int
-    calc_mz: float
-    exp_mz: float
-    aa_scores: Iterable[float]
diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py
index 95084206..74d3b7e3 100644
--- a/casanovo/denovo/dataloaders.py
+++ b/casanovo/denovo/dataloaders.py
@@ -256,10 +256,12 @@ def setup(self, stage: str = None, annotated: bool = True) -> None:
     def _make_loader(
         self,
         dataset: torch.utils.data.Dataset,
-        shuffle: Optional[bool] = None,
+        batch_size: int,
+        shuffle: bool = False,
     ) -> torch.utils.data.DataLoader:
         """
         Create a PyTorch DataLoader.
+
         Parameters
         ----------
         dataset : torch.utils.data.Dataset
@@ -278,37 +280,33 @@ def _make_loader(
         """
         return DataLoader(
             dataset,
-            shuffle=shuffle,
-            num_workers=0,  # self.n_workers,
-            # precision=torch.float32,
+            batch_size=batch_size,
             pin_memory=True,
+            num_workers=self.n_workers,
+            shuffle=shuffle,
         )
 
     def train_dataloader(self) -> torch.utils.data.DataLoader:
         """Get the training DataLoader."""
-        return self._make_loader(self.train_dataset, self.shuffle)
+        return self._make_loader(
+            self.train_dataset, self.train_batch_size, shuffle=self.shuffle
+        )
 
     def val_dataloader(self) -> torch.utils.data.DataLoader:
         """Get the validation DataLoader."""
-        return self._make_loader(self.valid_dataset)
+        return self._make_loader(self.valid_dataset, self.eval_batch_size)
 
     def test_dataloader(self) -> torch.utils.data.DataLoader:
         """Get the test DataLoader."""
-        return self._make_loader(self.test_dataset)
+        return self._make_loader(self.test_dataset, self.eval_batch_size)
 
     def predict_dataloader(self) -> torch.utils.data.DataLoader:
         """Get the predict DataLoader."""
-        return self._make_loader(self.test_dataset)
+        return self._make_loader(self.test_dataset, self.eval_batch_size)
 
     def db_dataloader(self) -> torch.utils.data.DataLoader:
         """Get a special dataloader for DB search."""
-        return self._make_loader(
-            self.test_dataset,
-            self.eval_batch_size,
-            collate_fn=functools.partial(
-                prepare_psm_batch, protein_database=self.protein_database
-            ),
-        )
+        return self._make_loader(self.test_dataset, self.eval_batch_size)
 
 
 def scale_to_unit_norm(spectrum):
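`_make_loader` now takes the batch size explicitly, so train and eval loaders can differ, and worker processes and shuffling are configured per loader rather than hard-coded. A small self-contained sketch of the resulting behavior, using a toy stand-in for the spectrum dataset:

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    # Ten toy "spectra" of five (m/z, intensity) peaks each.
    dataset = TensorDataset(torch.randn(10, 5, 2))

    # Mirrors the refactored _make_loader call signature.
    loader = DataLoader(
        dataset, batch_size=4, pin_memory=True, num_workers=0, shuffle=False
    )
    for (spectra,) in loader:
        print(spectra.shape)  # torch.Size([4, 5, 2]) for the full batches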
diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py
index a63a5263..19ea7244 100644
--- a/casanovo/denovo/model.py
+++ b/casanovo/denovo/model.py
@@ -2,21 +2,21 @@
 
 import collections
 import heapq
+import itertools
 import logging
 import warnings
-from typing import Dict, Iterable, List, Optional, Tuple, Union
+from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
 
 import einops
 import torch
 import numpy as np
 import lightning.pytorch as pl
-from torch.utils.tensorboard import SummaryWriter
 from depthcharge.tokenizers import PeptideTokenizer
 
 from . import evaluate
 from .. import config
-from ..data import ms_io, pep_spec_match
+from ..data import ms_io, psm
 from ..denovo.transformers import SpectrumEncoder, PeptideDecoder
 
 logger = logging.getLogger("casanovo")
@@ -76,9 +76,6 @@ class Spec2Pep(pl.LightningModule):
         Number of PSMs to return for each spectrum.
     n_log : int
         The number of epochs to wait between logging messages.
-    tb_summarywriter : Optional[Path]
-        Folder path to record performance metrics during training. If
-        ``None``, don't use a ``SummaryWriter``.
     train_label_smoothing : float
         Smoothing factor when calculating the training loss.
     warmup_iters : int
@@ -105,7 +102,6 @@ def __init__(
         dim_feedforward: int = 1024,
         n_layers: int = 9,
         dropout: float = 0.0,
-        dim_intensity: Optional[int] = None,
         max_peptide_len: int = 100,
         residues: Union[Dict[str, float], str] = "canonical",
         max_charge: int = 5,
@@ -121,7 +117,6 @@ def __init__(
         out_writer: Optional[ms_io.MztabWriter] = None,
         calculate_precision: bool = False,
         tokenizer: Optional[PeptideTokenizer] = None,
-        tb_summarywriter: Optional[SummaryWriter] = None,  # TODO
         **kwargs: Dict,
     ):
         super().__init__()
@@ -241,22 +236,21 @@ def beam_search_decode(
             the m/z-intensity pair for each peak. These should be zero-padded,
             such that all the spectra in the batch are the same length.
         precursors : torch.Tensor of size (n_spectra, 3)
-            The measured precursor mass (axis 0), precursor charge
-            (axis 1), and precursor m/z (axis 2) of each MS/MS spectrum.
+            The measured precursor mass (axis 0), precursor charge (axis 1),
+            and precursor m/z (axis 2) of each MS/MS spectrum.
 
         Returns
         -------
         pred_peptides : List[List[Tuple[float, np.ndarray, str]]]
-            For each spectrum, a list with the top peptide
-            prediction(s). A peptide predictions consists of a tuple
-            with the peptide score, the amino acid scores, and the
-            predicted peptide sequence.
+            For each spectrum, a list with the top peptide prediction(s). A
+            peptide prediction consists of a tuple with the peptide score,
+            the amino acid scores, and the predicted peptide sequence.
         """
         memories, mem_masks = self.encoder(mzs, ints)
 
         # Sizes.
         batch = mzs.shape[0]  # B
-        length = self.max_length + 1  # L
+        length = self.max_peptide_len + 1  # L
         vocab = self.vocab_size  # V
         beam = self.n_beams  # S
@@ -293,16 +287,15 @@ def beam_search_decode(
 
         # The main decoding loop.
         for step in range(0, self.max_peptide_len):
-            # Terminate beams exceeding the precursor m/z tolerance and
-            # track all finished beams (either terminated or stop token
-            # predicted).
+            # Terminate beams exceeding the precursor m/z tolerance and track
+            # all finished beams (either terminated or stop token predicted).
             (
                 finished_beams,
                 beam_fits_precursor,
                 discarded_beams,
             ) = self._finish_beams(tokens, precursors, step)
-            # Cache peptide predictions from the finished beams (but not
-            # the discarded beams).
+            # Cache peptide predictions from the finished beams (but not the
+            # discarded beams).
             self._cache_finished_beams(
                 tokens,
                 scores,
@@ -313,8 +306,7 @@ def beam_search_decode(
             )
 
             # Stop decoding when all current beams have been finished.
-            # Continue with beams that have not been finished and not
-            # discarded.
+            # Continue with beams that have not been finished and not discarded.
             finished_beams |= discarded_beams
             if finished_beams.all():
                 break
@@ -325,8 +317,8 @@ def beam_search_decode(
                 memory=memories[~finished_beams, :, :],
                 memory_key_padding_mask=mem_masks[~finished_beams, :],
             )
-            # Find the top-k beams with the highest scores and continue
-            # decoding those.
+            # Find the top-k beams with the highest scores and continue
+            # decoding those.
             tokens, scores = self._get_topk_beams(
                 tokens, scores, finished_beams, batch, step + 1
             )
@@ -343,33 +335,33 @@ def _finish_beams(
         step: int,
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
         """
-        Track all beams that have been finished, either by predicting
-        the stop token or because they were terminated due to exceeding
-        the precursor m/z tolerance.
+        Track all beams that have been finished, either by predicting the stop
+        token or because they were terminated due to exceeding the precursor
+        m/z tolerance.
 
         Parameters
         ----------
-        tokens : torch.Tensor of shape (n_spectra * n_beams, max_length)
+        tokens : torch.Tensor of shape (n_spectra * n_beams, max_peptide_len)
             Predicted amino acid tokens for all beams and all spectra.
         scores : torch.Tensor of shape
-            (n_spectra * n_beams, max_length, n_amino_acids)
-            Scores for the predicted amino acid tokens for all beams and
-            all spectra.
+            (n_spectra * n_beams, max_peptide_len, n_amino_acids)
+            Scores for the predicted amino acid tokens for all beams and all
+            spectra.
         step : int
             Index of the current decoding step.
 
         Returns
         -------
         finished_beams : torch.Tensor of shape (n_spectra * n_beams)
-            Boolean tensor indicating whether the current beams have
-            been finished.
+            Boolean tensor indicating whether the current beams have been
+            finished.
         beam_fits_precursor: torch.Tensor of shape (n_spectra * n_beams)
-            Boolean tensor indicating if current beams are within
-            precursor m/z tolerance.
+            Boolean tensor indicating if current beams are within precursor
+            m/z tolerance.
         discarded_beams : torch.Tensor of shape (n_spectra * n_beams)
-            Boolean tensor indicating whether the current beams should
-            be discarded (e.g. because they were predicted to end but
-            violate the minimum peptide length).
+            Boolean tensor indicating whether the current beams should be
+            discarded (e.g. because they were predicted to end but violate the
+            minimum peptide length).
         """
         # Check for tokens with a negative mass (i.e. neutral loss).
         aa_neg_mass_idx = [None]
@@ -390,8 +382,7 @@ def _finish_beams(
         beam_fits_precursor = torch.zeros(
             tokens.shape[0], dtype=torch.bool
         ).to(self.encoder.device)
-        # Beams with a stop token predicted in the current step can be
-        # finished.
+        # Beams with a stop token predicted in the current step can be finished.
         finished_beams = torch.zeros(tokens.shape[0], dtype=torch.bool).to(
             self.encoder.device
         )
@@ -404,9 +395,8 @@ def _finish_beams(
         )
         discarded_beams[tokens[:, step] == 0] = True
-        # Discard beams with invalid modification combinations (i.e.
-        # N-terminal modifications occur multiple times or in internal
-        # positions).
+        # Discard beams with invalid modification combinations (i.e. N-terminal
+        # modifications occur multiple times or in internal positions).
         if step > 1:  # Only relevant for longer predictions.
             dim0 = torch.arange(tokens.shape[0])
             final_pos = torch.full((ends_stop_token.shape[0],), step)
@@ -423,8 +413,8 @@ def _finish_beams(
             ).any(dim=1)
             discarded_beams[multiple_mods | internal_mods] = True
 
-        # Check which beams should be terminated or discarded based on
-        # the predicted peptide.
+        # Check which beams should be terminated or discarded based on the
+        # predicted peptide.
         for i in range(len(finished_beams)):
             # Skip already discarded beams.
             if discarded_beams[i]:
@@ -442,15 +432,15 @@ def _finish_beams(
             ):
                 pred_tokens = pred_tokens[:-1]
                 peptide_len -= 1
-            # Discard beams that were predicted to end but don't fit the
-            # minimum peptide length.
+            # Discard beams that were predicted to end but don't fit the
+            # minimum peptide length.
             if finished_beams[i] and peptide_len < self.min_peptide_len:
                 discarded_beams[i] = True
                 continue
-            # Terminate the beam if it has not been finished by the
-            # model but the peptide mass exceeds the precursor m/z to an
-            # extent that it cannot be corrected anymore by a
-            # subsequently predicted AA with negative mass.
+            # Terminate the beam if it has not been finished by the model but
+            # the peptide mass exceeds the precursor m/z to an extent that it
+            # cannot be corrected anymore by a subsequently predicted AA with
+            # negative mass.
             precursor_charge = precursors[i, 1]
             precursor_mz = precursors[i, 2]
             matches_precursor_mz = exceeds_precursor_mz = False
@@ -487,18 +477,16 @@ def _finish_beams(
                         self.isotope_error_range[1] + 1,
                     )
                 ]
-                # Terminate the beam if the calculated m/z for the
-                # predicted peptide (without potential additional
-                # AAs with negative mass) is within the precursor
-                # m/z tolerance.
+                # Terminate the beam if the calculated m/z for the predicted
+                # peptide (without potential additional AAs with negative
+                # mass) is within the precursor m/z tolerance.
                 matches_precursor_mz = aa is None and any(
                     abs(d) < self.precursor_mass_tol for d in delta_mass_ppm
                 )
-                # Terminate the beam if the calculated m/z exceeds
-                # the precursor m/z + tolerance and hasn't been
-                # corrected by a subsequently predicted AA with
-                # negative mass.
+                # Terminate the beam if the calculated m/z exceeds the
+                # precursor m/z + tolerance and hasn't been corrected by a
+                # subsequently predicted AA with negative mass.
                 if matches_precursor_mz:
                     exceeds_precursor_mz = False
                 else:
@@ -513,8 +501,8 @@ def _finish_beams(
             except KeyError:
                 matches_precursor_mz = exceeds_precursor_mz = False
             # Finish beams that fit or exceed the precursor m/z.
-            # Don't finish beams that don't include a stop token if they
-            # don't exceed the precursor m/z tolerance yet.
+            # Don't finish beams that don't include a stop token if they don't
+            # exceed the precursor m/z tolerance yet.
             if finished_beams[i]:
                 beam_fits_precursor[i] = matches_precursor_mz
             elif exceeds_precursor_mz:
@@ -538,17 +526,17 @@ def _cache_finished_beams(
 
         Parameters
         ----------
-        tokens : torch.Tensor of shape (n_spectra * n_beams, max_length)
+        tokens : torch.Tensor of shape (n_spectra * n_beams, max_peptide_len)
             Predicted amino acid tokens for all beams and all spectra.
         scores : torch.Tensor of shape
-            (n_spectra * n_beams, max_length, n_amino_acids)
-            Scores for the predicted amino acid tokens for all beams and
-            all spectra.
+            (n_spectra * n_beams, max_peptide_len, n_amino_acids)
+            Scores for the predicted amino acid tokens for all beams and all
+            spectra.
         step : int
             Index of the current decoding step.
         beams_to_cache : torch.Tensor of shape (n_spectra * n_beams)
-            Boolean tensor indicating whether the current beams are
-            ready for caching.
+            Boolean tensor indicating whether the current beams are ready for
+            caching.
         beam_fits_precursor: torch.Tensor of shape (n_spectra * n_beams)
             Boolean tensor indicating whether the beams are within the
             precursor m/z tolerance.
         pred_cache : Dict[
             int, List[Tuple[float, float, np.ndarray, torch.Tensor]]
         ]
             Priority queue with finished beams for each spectrum, ordered by
-            peptide score. For each finished beam, a tuple with the
-            (negated) peptide score, a random tie-breaking float, the
-            amino acid-level scores, and the predicted tokens is stored.
+            peptide score. For each finished beam, a tuple with the (negated)
+            peptide score, a random tie-breaking float, the amino acid-level
+            scores, and the predicted tokens is stored.
         """
         for i in range(len(beams_to_cache)):
             if not beams_to_cache[i]:
@@ -580,8 +568,8 @@ def _cache_finished_beams(
                 continue
             smx = self.softmax(scores[i : i + 1, : step + 1, :])
             aa_scores = smx[0, range(len(pred_tokens)), pred_tokens].tolist()
-            # Add an explicit score 0 for the missing stop token in case
-            # this was not predicted (i.e. early stopping).
+            # Add an explicit score 0 for the missing stop token in case this
+            # was not predicted (i.e. early stopping).
             if not has_stop_token:
                 aa_scores.append(0)
             aa_scores = np.asarray(aa_scores)
@@ -591,8 +579,8 @@ def _cache_finished_beams(
             )
             # Omit the stop token from the amino acid-level scores.
             aa_scores = aa_scores[:-1]
-            # Add the prediction to the cache (minimum priority queue,
-            # maximum the number of beams elements).
+            # Add the prediction to the cache (minimum priority queue, maximum
+            # the number of beams elements).
             if len(pred_cache[spec_idx]) < self.n_beams:
                 heapadd = heapq.heappush
             else:
@@ -616,22 +604,22 @@ def _get_topk_beams(
         step: int,
     ) -> Tuple[torch.tensor, torch.tensor]:
         """
-        Find the top-k beams with the highest scores and continue
-        decoding those.
+        Find the top-k beams with the highest scores and continue decoding
+        those.
 
         Stop decoding for beams that have been finished.
 
         Parameters
         ----------
-        tokens : torch.Tensor of shape (n_spectra * n_beams, max_length)
+        tokens : torch.Tensor of shape (n_spectra * n_beams, max_peptide_len)
             Predicted amino acid tokens for all beams and all spectra.
         scores : torch.Tensor of shape
-            (n_spectra * n_beams, max_length, n_amino_acids)
-            Scores for the predicted amino acid tokens for all beams and
-            all spectra.
+            (n_spectra * n_beams, max_peptide_len, n_amino_acids)
+            Scores for the predicted amino acid tokens for all beams and all
+            spectra.
         finished_beams : torch.Tensor of shape (n_spectra * n_beams)
-            Boolean tensor indicating whether the current beams are
-            ready for caching.
+            Boolean tensor indicating whether the current beams are ready for
+            caching.
         batch: int
             Number of spectra in the batch.
         step : int
             Index of the next decoding step.
 
         Returns
         -------
-        tokens : torch.Tensor of shape (n_spectra * n_beams, max_length)
+        tokens : torch.Tensor of shape (n_spectra * n_beams, max_peptide_len)
             Predicted amino acid tokens for all beams and all spectra.
         scores : torch.Tensor of shape
-            (n_spectra * n_beams, max_length, n_amino_acids)
-            Scores for the predicted amino acid tokens for all beams and
-            all spectra.
+            (n_spectra * n_beams, max_peptide_len, n_amino_acids)
+            Scores for the predicted amino acid tokens for all beams and all
+            spectra.
         """
         beam = self.n_beams  # S
         vocab = self.vocab_size  # V
@@ -679,7 +667,7 @@ def _get_topk_beams(
         ).float()
         # Mask out the index '0', i.e. padding token, by default.
         # FIXME: Set this to a very small, yet non-zero value, to only
-        # get padding after stop token.
+        # get padding after stop token.
         active_mask[:, :beam] = 1e-8
 
         # Figure out the top K decodings.
@@ -743,6 +731,23 @@ def _get_top_peptide(
         else:
             yield []
 
+    def _squeeze_batch(self, batch: Dict[str, Any]) -> None:
+        """
+        Squeeze the first dimension of each tensor in the batch.
+
+        Parameters
+        ----------
+        batch : Dict[str, Any]
+            A dictionary where each key corresponds to a component of the
+            batch, and the values are tensors or other data structures.
+ """ + for k in batch.keys(): + try: + batch[k] = batch[k].squeeze(0) + except: + continue + def _process_batch(self, batch): """Prepare batch returned from AnnotatedSpectrumDataset of the latest depthcharge version @@ -764,13 +769,7 @@ def _process_batch(self, batch): sequences (during training). """ - # Squeeze torch tensors in first dimension - for k in batch.keys(): - try: - batch[k] = batch[k].squeeze(0) - except: - continue - + self._unsqueeze_batch(batch) precursor_mzs = batch["precursor_mz"] precursor_charges = batch["precursor_charge"] precursor_masses = (precursor_mzs - 1.007276) * precursor_charges @@ -933,11 +932,9 @@ def predict_step( predictions: List[ms_io.PepSpecMatch] Predicted PSMs for the given batch of spectra. """ - _, _, precursors, _ = self._process_batch(batch) prec_charges = precursors[:, 1].cpu().detach().numpy() prec_mzs = precursors[:, 2].cpu().detach().numpy() - predictions = [] for ( precursor_charge, @@ -1035,30 +1032,15 @@ def on_predict_batch_end( ) self.out_writer.psms.append( -<<<<<<< HEAD - ( - peptide, - scan, - peptide_score, - charge, - precursor_mz, - calc_mass, - ",".join(list(map("{:.5f}".format, aa_scores))), - file_name, - true_seq, - title, - ), -======= - pep_spec_match.PepSpecMatch( + psm.PepSpecMatch( sequence=peptide, spectrum_id=(file_name, scan), peptide_score=peptide_score, charge=int(charge), - calc_mz=precursor_mz, - exp_mz=calc_mass.item(), + calc_mz=calc_mass.item(), + exp_mz=precursor_mz, aa_scores=aa_scores, ) ->>>>>>> 5719cdc (circular import bug) ) def on_train_start(self): @@ -1159,14 +1141,20 @@ def predict_step( predictions: List[ms_io.PepSpecMatch] Predicted PSMs for the given batch of spectra. """ + pred, truth = self._forward_step(batch) predictions_all = collections.defaultdict(list) - for start_i in range(0, len(batch[0]), self.psm_batch_size): + # self._unsqueeze_batch(batch) + for start_i in range(0, len(batch), self.psm_batch_size): + psm_batch = { + label: data[start_i : start_i + self.psm_batch_size] + for label, data in batch.items() + } + + """" psm_batch = [ b[start_i : start_i + self.psm_batch_size] for b in batch ] - pred, truth = self._forward_step( - psm_batch[0], psm_batch[1], psm_batch[3] - ) + """ pred = self.softmax(pred) batch_peptide_scores, batch_aa_scores = _calc_match_score( pred, truth, self.decoder.reverse @@ -1188,7 +1176,7 @@ def predict_step( ): spectrum_i = tuple(spectrum_i) predictions_all[spectrum_i].append( - ms_io.PepSpecMatch( + psm.PepSpecMatch( sequence=peptide, spectrum_id=spectrum_i, peptide_score=peptide_score, diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 9366d33f..c8cdddb8 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -16,8 +16,7 @@ import torch.utils.data from lightning.pytorch.strategies import DDPStrategy -from lightning.pytorch.callbacks import ModelCheckpoint -from lightning.pytorch.loggers import TensorBoardLogger +from lightning.pytorch.callbacks import ModelCheckpoint, LearningRateMonitor from torch.utils.data import DataLoader from depthcharge.tokenizers import PeptideTokenizer @@ -114,11 +113,6 @@ def __init__( ), ] - if config.tb_summarywriter is not None: - self.callbacks.append( - LearningRateMonitor(logging_interval="step", log_momentum=True) - ) - def __enter__(self): """Enter the context manager""" self.tmp_dir = tempfile.TemporaryDirectory() @@ -155,6 +149,7 @@ def db_search( config_filename=self.config.file, ) self.initialize_trainer(train=True) + self.initialize_tokenizer() 
diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py
index 9366d33f..c8cdddb8 100644
--- a/casanovo/denovo/model_runner.py
+++ b/casanovo/denovo/model_runner.py
@@ -16,8 +16,7 @@
 import torch.utils.data
 from lightning.pytorch.strategies import DDPStrategy
-from lightning.pytorch.callbacks import ModelCheckpoint
-from lightning.pytorch.loggers import TensorBoardLogger
+from lightning.pytorch.callbacks import ModelCheckpoint, LearningRateMonitor
 from torch.utils.data import DataLoader
 from depthcharge.tokenizers import PeptideTokenizer
@@ -114,11 +113,6 @@ def __init__(
             ),
         ]
 
-        if config.tb_summarywriter is not None:
-            self.callbacks.append(
-                LearningRateMonitor(logging_interval="step", log_momentum=True)
-            )
-
     def __enter__(self):
         """Enter the context manager"""
         self.tmp_dir = tempfile.TemporaryDirectory()
@@ -155,6 +149,7 @@ def db_search(
             config_filename=self.config.file,
         )
         self.initialize_trainer(train=True)
+        self.initialize_tokenizer()
         self.initialize_model(train=False, db_search=True)
         self.model.out_writer = self.writer
         self.model.psm_batch_size = self.config.predict_batch_size
@@ -172,10 +167,9 @@ def db_search(
             self.config.allowed_var_mods,
             self.config.residues,
         )
-        test_index = self._get_index(peak_path, False, "db search")
-        self.writer.set_ms_run(test_index.ms_files)
-
-        self.initialize_data_module(test_index=test_index)
+        test_paths = self._get_input_paths(peak_path, False, "test")
+        self.writer.set_ms_run(test_paths)
+        self.initialize_data_module(test_paths=test_paths)
         self.loaders.protein_database = self.model.protein_database
         self.loaders.setup(stage="test", annotated=False)
         self.trainer.predict(self.model, self.loaders.db_dataloader())
@@ -215,13 +209,17 @@ def log_metrics(self, test_dataloader: DataLoader) -> None:
         """Log peptide precision and amino acid precision
 
         Calculate and log peptide precision and amino acid precision
-        based off of model predictions and spectrum annotations.
+        based on model predictions and spectrum annotations.
 
         Parameters
         ----------
-        test_index : AnnotatedSpectrumIndex
-            Index containing the annotated spectra used to generate model
-            predictions
+        test_dataloader : DataLoader
+            DataLoader yielding the annotated spectra used to generate model
+            predictions.
         """
+        seq_pred = []
+        seq_true = []
+        pred_idx = 0
+
         for batch in test_dataloader:
             for peak_file, scan_id, curr_seq_true in zip(
@@ -251,16 +249,13 @@ def log_metrics(self, test_dataloader: DataLoader) -> None:
 
         if self.config["top_match"] > 1:
             logger.warning(
                 "The behavior for calculating evaluation metrics is undefined "
                 "when the 'top_match' configuration option is set to a value "
                 "greater than 1."
             )
 
         logger.info("Peptide Precision: %.2f%%", 100 * pep_precision)
         logger.info("Amino Acid Precision: %.2f%%", 100 * aa_precision)
-        """
-        # TODO: Fix log_metrics, wait for eval bug fix to be merged in
-        return
+        logger.info("Amino Acid Recall: %.2f%%", 100 * aa_recall)
 
     def predict(
         self,
@@ -426,15 +421,6 @@ def initialize_model(self, train: bool, db_search: bool = False) -> None:
         db_search : bool
             Determines whether to use the DB search model subclass.
         """
-        tb_summarywriter = None
-        if self.config.tb_summarywriter:
-            if self.output_dir is None:
-                logger.warning(
-                    "Can not create tensorboard because the output directory "
-                    "is not set in the model runner."
-                )
-            else:
-                tb_summarywriter = self.output_dir / "tensorboard"
         try:
             tokenizer = self.tokenizer
         except AttributeError:
@@ -446,8 +432,6 @@ def initialize_model(self, train: bool, db_search: bool = False) -> None:
             dim_feedforward=self.config.dim_feedforward,
             n_layers=self.config.n_layers,
             dropout=self.config.dropout,
-            dim_intensity=self.config.dim_intensity,
-            max_length=self.config.max_length,
             max_charge=self.config.max_charge,
             precursor_mass_tol=self.config.precursor_mass_tol,
             isotope_error_range=self.config.isotope_error_range,
@@ -455,7 +439,6 @@ def initialize_model(self, train: bool, db_search: bool = False) -> None:
             n_beams=self.config.n_beams,
             top_match=self.config.top_match,
             n_log=self.config.n_log,
-            tb_summarywriter=tb_summarywriter,
             train_label_smoothing=self.config.train_label_smoothing,
             warmup_iters=self.config.warmup_iters,
             cosine_schedule_period_iters=self.config.cosine_schedule_period_iters,
@@ -476,7 +459,6 @@ def initialize_model(self, train: bool, db_search: bool = False) -> None:
             min_peptide_len=self.config.min_peptide_len,
             top_match=self.config.top_match,
             n_log=self.config.n_log,
-            tb_summarywriter=tb_summarywriter,
             train_label_smoothing=self.config.train_label_smoothing,
             warmup_iters=self.config.warmup_iters,
             cosine_schedule_period_iters=self.config.cosine_schedule_period_iters,
diff --git a/tests/conftest.py b/tests/conftest.py
index 84051d85..699302fc 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -81,9 +81,13 @@ def _create_mgf(
     rng = np.random.default_rng(random_state)
     entries = [
         _create_mgf_entry(
-            p, rng.choice([2, 3]), mod_aa_mass=mod_aa_mass, annotate=annotate
+            p,
+            i,
+            rng.choice([2, 3]),
+            mod_aa_mass=mod_aa_mass,
+            annotate=annotate,
         )
-        for p in peptides
+        for i, p in enumerate(peptides)
     ]
     with mgf_file.open("w+") as mgf_ref:
         mgf_ref.write("\n".join(entries))
@@ -91,7 +95,9 @@ def _create_mgf(
     return mgf_file
 
 
-def _create_mgf_entry(peptide, charge=2, mod_aa_mass=None, annotate=True):
+def _create_mgf_entry(
+    peptide, title, charge=2, mod_aa_mass=None, annotate=True
+):
     """
     Create a MassIVE-KB style MGF entry for a single PSM.
@@ -249,7 +255,42 @@ def _create_mzml(peptides, mzml_file, random_state=42):
 
 
 @pytest.fixture
-def tiny_config(tmp_path):
+def residues_dict():
+    return {
+        "G": 57.021464,
+        "A": 71.037114,
+        "S": 87.032028,
+        "P": 97.052764,
+        "V": 99.068414,
+        "T": 101.047670,
+        "C[Carbamidomethyl]": 160.030649,  # 103.009185 + 57.021464
+        "L": 113.084064,
+        "I": 113.084064,
+        "N": 114.042927,
+        "D": 115.026943,
+        "Q": 128.058578,
+        "K": 128.094963,
+        "E": 129.042593,
+        "M": 131.040485,
+        "H": 137.058912,
+        "F": 147.068414,
+        "R": 156.101111,
+        "Y": 163.063329,
+        "W": 186.079313,
+        # Amino acid modifications.
+        "M[Oxidation]": 147.035400,  # Met oxidation: 131.040485 + 15.994915
+        "N[Deamidated]": 115.026943,  # Asn deamidation: 114.042927 + 0.984016
+        "Q[Deamidated]": 129.042594,  # Gln deamidation: 128.058578 + 0.984016
+        # N-terminal modifications.
+ "[Acetyl]-": 42.010565, # Acetylation + "[Carbamyl]-": 43.005814, # Carbamylation "+43.006" + "[Ammonia-loss]-": -17.026549, # NH3 loss + "[+25.980265]-": 25.980265, # Carbamylation and NH3 loss + } + + +@pytest.fixture +def tiny_config(tmp_path, residues_dict): """A config file for a tiny model.""" cfg = { "n_head": 2, @@ -302,37 +343,7 @@ def tiny_config(tmp_path): "replace_isoleucine_with_leucine": True, "reverse_peptides": False, "mskb_tokenizer": True, - "residues": { - "G": 57.021464, - "A": 71.037114, - "S": 87.032028, - "P": 97.052764, - "V": 99.068414, - "T": 101.047670, - "C[Carbamidomethyl]": 160.030649, # 103.009185 + 57.021464 - "L": 113.084064, - "I": 113.084064, - "N": 114.042927, - "D": 115.026943, - "Q": 128.058578, - "K": 128.094963, - "E": 129.042593, - "M": 131.040485, - "H": 137.058912, - "F": 147.068414, - "R": 156.101111, - "Y": 163.063329, - "W": 186.079313, - # Amino acid modifications. - "M[Oxidation]": 147.035400, # Met oxidation: 131.040485 + 15.994915 - "N[Deamidated]": 115.026943, # Asn deamidation: 114.042927 + 0.984016 - "Q[Deamidated]": 129.042594, # Gln deamidation: 128.058578 + 0.984016 - # N-terminal modifications. - "[Acetyl]-": 42.010565, # Acetylation - "[Carbamyl]-": 43.005814, # Carbamylation "+43.006" - "[Ammonia-loss]-": -17.026549, # NH3 loss - "[+25.980265]-": 25.980265, # Carbamylation and NH3 loss - }, + "residues": residues_dict, "allowed_fixed_mods": "C:C+57.021", "allowed_var_mods": ( "M:M+15.995,N:N+0.984,Q:Q+0.984," @@ -345,8 +356,3 @@ def tiny_config(tmp_path): yaml.dump(cfg, out_file) return cfg_file - - -@pytest.fixture -def residues_dict(): - return depthcharge.masses.PeptideMass("massivekb").masses diff --git a/tests/test_integration.py b/tests/test_integration.py index a0ab75eb..50efce51 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -11,75 +11,14 @@ TEST_DIR = Path(__file__).resolve().parent -def test_db_search( - mgf_medium, tiny_fasta_file, tiny_config, tmp_path, monkeypatch -): - # Run a command: - monkeypatch.setattr(casanovo, "__version__", "4.1.0") - run = functools.partial( - CliRunner().invoke, casanovo.main, catch_exceptions=False - ) - - output_rootname = "db" - output_filename = (tmp_path / output_rootname).with_suffix(".mztab") - - search_args = [ - "db-search", - "--config", - tiny_config, - "--output_dir", - str(tmp_path), - "--output_root", - output_rootname, - str(mgf_medium), - str(tiny_fasta_file), - ] - - result = run(search_args) - - assert result.exit_code == 0 - assert output_filename.exists() - - mztab = pyteomics.mztab.MzTab(str(output_filename)) - - psms = mztab.spectrum_match_table - assert list(psms.sequence) == [ - "ATSIPAR", - "VTLSC+57.021R", - "LLIYGASTR", - "EIVMTQSPPTLSLSPGER", - "MEAPAQLLFLLLLWLPDTTR", - "ASQSVSSSYLTWYQQKPGQAPR", - "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", - ] - - # Validate mztab output - validate_args = [ - "java", - "-jar", - f"{TEST_DIR}/jmzTabValidator.jar", - "--check", - f"inFile={output_filename}", - ] - - validate_result = subprocess.run( - validate_args, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - text=True, - ) - - assert validate_result.returncode == 0 - assert not any( - [ - line.startswith("[Error-") - for line in validate_result.stdout.splitlines() - ] - ) - - def test_train_and_run( - mgf_small, mzml_small, tiny_config, tmp_path, monkeypatch + mgf_small, + mzml_small, + tiny_config, + tmp_path, + monkeypatch, + mgf_medium, + tiny_fasta_file, ): # We can use this to explicitly test different versions. 
monkeypatch.setattr(casanovo, "__version__", "3.0.1") @@ -164,7 +103,6 @@ def test_train_and_run( "--output_root", output_rootname, str(mgf_small), - str(mzml_small), "--evaluate", ] @@ -212,6 +150,66 @@ def test_train_and_run( assert output_filename.is_file() + monkeypatch.setattr(casanovo, "__version__", "4.1.0") + output_rootname = "db" + output_filename = (tmp_path / output_rootname).with_suffix(".mztab") + + search_args = [ + "db-search", + "--model", + str(model_file), + "--config", + tiny_config, + "--output_dir", + str(tmp_path), + "--output_root", + output_rootname, + str(mgf_small), + str(tiny_fasta_file), + ] + + result = run(search_args) + + assert result.exit_code == 0 + assert output_filename.exists() + + mztab = pyteomics.mztab.MzTab(str(output_filename)) + + psms = mztab.spectrum_match_table + assert list(psms.sequence) == [ + "ATSIPAR", + "VTLSC+57.021R", + "LLIYGASTR", + "EIVMTQSPPTLSLSPGER", + "MEAPAQLLFLLLLWLPDTTR", + "ASQSVSSSYLTWYQQKPGQAPR", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + ] + + # Validate mztab output + validate_args = [ + "java", + "-jar", + f"{TEST_DIR}/jmzTabValidator.jar", + "--check", + f"inFile={output_filename}", + ] + + validate_result = subprocess.run( + validate_args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + ) + + assert validate_result.returncode == 0 + assert not any( + [ + line.startswith("[Error-") + for line in validate_result.stdout.splitlines() + ] + ) + def test_auxilliary_cli(tmp_path, monkeypatch): """Test the secondary CLI commands""" diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 3e276f01..2c6a5091 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -28,7 +28,6 @@ from casanovo import casanovo from casanovo import utils from casanovo.data import db_utils, ms_io -from casanovo.data.datasets import SpectrumDataset, AnnotatedSpectrumDataset from casanovo.denovo.evaluate import aa_match, aa_match_batch, aa_match_metrics from casanovo.denovo.model import Spec2Pep, _aa_pep_score, _calc_match_score from casanovo.data import ms_io @@ -567,7 +566,6 @@ def test_calc_match_score(): def test_digest_fasta_cleave(tiny_fasta_file, residues_dict): - # No missed cleavages expected_normal = [ "ATSIPAR", @@ -1086,7 +1084,6 @@ def test_get_candidates(tiny_fasta_file, residues_dict): def test_get_candidates_isotope_error(tiny_fasta_file, residues_dict): - # Tide isotope error windows for 496.2, 2+: # 0: [980.481617, 1000.289326] # 1: [979.491114, 999.278813] @@ -1234,7 +1231,7 @@ def test_beam_search_decode(tiny_config): # Sizes. batch = 1 # B - length = model.max_length + 1 # L + length = model.max_peptide_len + 1 # L vocab = len(model.tokenizer) + 1 # V beam = model.n_beams # S step = 3 @@ -1367,7 +1364,7 @@ def test_beam_search_decode(tiny_config): mzs = ints = torch.zeros(1, 5) precursors = torch.tensor([[469.25364, 2.0, 235.63410]]) assert len(list(model.beam_search_decode(mzs, ints, precursors))[0]) == 0 - model.max_length = 100 + model.max_peptide_len = 100 # Re-initialize scores and tokens to further test caching functionality. scores = torch.full( @@ -1554,7 +1551,7 @@ def test_beam_search_decode(tiny_config): # Sizes and other variables. 
batch = 2 # B beam = model.n_beams # S - length = model.max_length + 1 # L + length = model.max_peptide_len + 1 # L vocab = len(model.tokenizer) + 1 # V step = 4 @@ -1596,7 +1593,7 @@ def test_beam_search_decode(tiny_config): ) batch = 2 # B beam = model.n_beams # S - length = model.max_length + 1 # L + length = model.max_peptide_len + 1 # L vocab = len(model.tokenizer) + 1 # V step = 4
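For reference, the mzTab assertions folded into `test_train_and_run` can be reproduced standalone with pyteomics, which the test already uses; the file path below is a placeholder:

    import pyteomics.mztab

    mztab = pyteomics.mztab.MzTab("db.mztab")  # placeholder output path
    psms = mztab.spectrum_match_table  # PSM section as a pandas DataFrame
    print(psms.sequence.tolist())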