From 63ec7d30028b2f453dcc8effb10f997bcb371baf Mon Sep 17 00:00:00 2001 From: melihyilmaz Date: Fri, 10 Nov 2023 13:34:41 -0800 Subject: [PATCH] Skip spectra with few peaks --- casanovo/config.py | 1 + casanovo/config.yaml | 2 ++ casanovo/data/datasets.py | 34 ++++++++++++++++++++++----------- casanovo/denovo/dataloaders.py | 8 ++++++++ casanovo/denovo/model_runner.py | 1 + 5 files changed, 35 insertions(+), 11 deletions(-) diff --git a/casanovo/config.py b/casanovo/config.py index c07073d6..58dc681a 100644 --- a/casanovo/config.py +++ b/casanovo/config.py @@ -34,6 +34,7 @@ class Config: _default_config = Path(__file__).parent / "config.yaml" _config_types = dict( random_seed=int, + min_n_peaks=int, n_peaks=int, min_mz=float, max_mz=float, diff --git a/casanovo/config.yaml b/casanovo/config.yaml index 729e827d..23d9e3e5 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -52,6 +52,8 @@ model_save_folder_path: "" val_check_interval: 50_000 # SPECTRUM PROCESSING OPTIONS +# Min number of peaks allowed in a spectrum +min_n_peaks: 20 # Number of the most intense peaks to retain, any remaining peaks are discarded n_peaks: 150 # Min peak m/z allowed, peaks with smaller m/z are discarded diff --git a/casanovo/data/datasets.py b/casanovo/data/datasets.py index 23b3d8e3..3ff51194 100644 --- a/casanovo/data/datasets.py +++ b/casanovo/data/datasets.py @@ -16,6 +16,9 @@ class SpectrumDataset(Dataset): ---------- spectrum_index : depthcharge.data.SpectrumIndex The MS/MS spectra to use as a dataset. + min_n_peaks : int + Minimum number of peaks allowed in each spectrum. Spectra with fewer + peaks are discarded. n_peaks : Optional[int] The number of top-n most intense peaks to keep in each spectrum. `None` retains all peaks. @@ -38,6 +41,7 @@ class SpectrumDataset(Dataset): def __init__( self, spectrum_index: depthcharge.data.SpectrumIndex, + min_n_peaks: int = 20, n_peaks: int = 150, min_mz: float = 140.0, max_mz: float = 2500.0, @@ -47,6 +51,7 @@ def __init__( ): """Initialize a SpectrumDataset""" super().__init__() + self.min_n_peaks = min_n_peaks self.n_peaks = n_peaks self.min_mz = min_mz self.max_mz = max_mz @@ -86,12 +91,13 @@ def __getitem__( spectrum = self._process_peaks( mz_array, int_array, precursor_mz, precursor_charge ) - return ( - spectrum, - precursor_mz, - precursor_charge, - self.get_spectrum_id(idx), - ) + if spectrum is not None: + return ( + spectrum, + precursor_mz, + precursor_charge, + self.get_spectrum_id(idx), + ) def get_spectrum_id(self, idx: int) -> Tuple[str, str]: """ @@ -148,13 +154,13 @@ def _process_peaks( ) try: spectrum.set_mz_range(self.min_mz, self.max_mz) - if len(spectrum.mz) == 0: + if len(spectrum.mz) < self.min_n_peaks: raise ValueError spectrum.remove_precursor_peak(self.remove_precursor_tol, "Da") - if len(spectrum.mz) == 0: + if len(spectrum.mz) < self.min_n_peaks: raise ValueError spectrum.filter_intensity(self.min_intensity, self.n_peaks) - if len(spectrum.mz) == 0: + if len(spectrum.mz) < self.min_n_peaks: raise ValueError spectrum.scale_intensity("root", 1) intensities = spectrum.intensity / np.linalg.norm( @@ -163,7 +169,7 @@ def _process_peaks( return torch.tensor(np.array([spectrum.mz, intensities])).T.float() except ValueError: # Replace invalid spectra by a dummy spectrum. - return torch.tensor([[0, 1]]).float() + return None # torch.tensor([[3, 3]]).float() @property def n_spectra(self) -> int: @@ -194,6 +200,9 @@ class AnnotatedSpectrumDataset(SpectrumDataset): ---------- annotated_spectrum_index : depthcharge.data.SpectrumIndex The MS/MS spectra to use as a dataset. + min_n_peaks : int + Minimum number of peaks allowed in each spectrum. Spectra with fewer + peaks are discarded. n_peaks : Optional[int] The number of top-n most intense peaks to keep in each spectrum. `None` retains all peaks. @@ -216,6 +225,7 @@ class AnnotatedSpectrumDataset(SpectrumDataset): def __init__( self, annotated_spectrum_index: depthcharge.data.SpectrumIndex, + min_n_peaks: int = 20, n_peaks: int = 150, min_mz: float = 140.0, max_mz: float = 2500.0, @@ -225,6 +235,7 @@ def __init__( ): super().__init__( annotated_spectrum_index, + min_n_peaks=min_n_peaks, n_peaks=n_peaks, min_mz=min_mz, max_mz=max_mz, @@ -263,4 +274,5 @@ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, float, int, str]: spectrum = self._process_peaks( mz_array, int_array, precursor_mz, precursor_charge ) - return spectrum, precursor_mz, precursor_charge, peptide + if spectrum is not None: + return spectrum, precursor_mz, precursor_charge, peptide diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 998fa66a..e2a0ac85 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -27,6 +27,9 @@ class DeNovoDataModule(pl.LightningDataModule): The batch size to use for training. eval_batch_size : int The batch size to use for inference. + min_n_peaks : int + Minimum number of peaks allowed in each spectrum. Spectra with fewer + peaks are discarded. n_peaks : Optional[int] The number of top-n most intense peaks to keep in each spectrum. `None` retains all peaks. @@ -56,6 +59,7 @@ def __init__( test_index: Optional[AnnotatedSpectrumIndex] = None, train_batch_size: int = 128, eval_batch_size: int = 1028, + min_n_peaks: int = 20, n_peaks: Optional[int] = 150, min_mz: float = 50.0, max_mz: float = 2500.0, @@ -70,6 +74,7 @@ def __init__( self.test_index = test_index self.train_batch_size = train_batch_size self.eval_batch_size = eval_batch_size + self.min_n_peaks = min_n_peaks self.n_peaks = n_peaks self.min_mz = min_mz self.max_mz = max_mz @@ -97,6 +102,7 @@ def setup(self, stage: str = None, annotated: bool = True) -> None: if stage in (None, "fit", "validate"): make_dataset = functools.partial( AnnotatedSpectrumDataset, + min_n_peaks=self.min_n_peaks, n_peaks=self.n_peaks, min_mz=self.min_mz, max_mz=self.max_mz, @@ -113,6 +119,7 @@ def setup(self, stage: str = None, annotated: bool = True) -> None: if stage in (None, "test"): make_dataset = functools.partial( AnnotatedSpectrumDataset if annotated else SpectrumDataset, + min_n_peaks=self.min_n_peaks, n_peaks=self.n_peaks, min_mz=self.min_mz, max_mz=self.max_mz, @@ -201,6 +208,7 @@ def prepare_batch( The spectrum identifiers (during de novo sequencing) or peptide sequences (during training). """ + batch = [spectrum for spectrum in batch if spectrum is not None] spectra, precursor_mzs, precursor_charges, spectrum_ids = list(zip(*batch)) spectra = torch.nn.utils.rnn.pad_sequence(spectra, batch_first=True) precursor_mzs = torch.tensor(precursor_mzs) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 852860d3..8b7ba278 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -292,6 +292,7 @@ def initialize_data_module( train_index=train_index, valid_index=valid_index, test_index=test_index, + min_n_peaks=self.config.min_n_peaks, min_mz=self.config.min_mz, max_mz=self.config.max_mz, min_intensity=self.config.min_intensity,