From f25ace84af22ea0eefb0a96a705cfc7957912d13 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Tue, 2 Jul 2024 16:22:02 -0700 Subject: [PATCH 01/21] rough implementation --- casanovo/casanovo.py | 90 +++++++++++++++++- casanovo/data/datasets.py | 14 +-- casanovo/data/db_utils.py | 156 ++++++++++++++++++++++++++++++++ casanovo/data/ms_io.py | 4 +- casanovo/denovo/dataloaders.py | 57 +----------- casanovo/denovo/model.py | 62 ++++++------- casanovo/denovo/model_runner.py | 39 ++++++-- 7 files changed, 307 insertions(+), 115 deletions(-) create mode 100644 casanovo/data/db_utils.py diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 7db5faa8..df3cc79f 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -214,8 +214,74 @@ def annotate( nargs=-1, type=click.Path(exists=True, dir_okay=False), ) +@click.argument( + "fasta_path", + required=True, + nargs=1, + type=click.Path(exists=True, dir_okay=False), +) +@click.option( + "--enzyme", + help="Enzyme for in silico digestion, see pyteomics.parser.expasy_rules", + type=str, + default="trypsin", +) +@click.option( + "--digestion", + help="Digestion: full, partial", + type=click.Choice( + ["full", "partial"], + case_sensitive=False, + ), + default="full", +) +@click.option( + "--missed_cleavages", + help="Number of allowed missed cleavages", + type=int, + default=0, +) +@click.option( + "--max_mods", + help="Maximum number of modifications per peptide", + type=int, + default=0, +) +@click.option( + "--min_length", + help="Minimum peptide length", + type=int, + default=6, +) +@click.option( + "--max_length", + help="Maximum peptide length", + type=int, + default=50, +) +@click.option( + "--precursor_tolerance", + help="Precursor tolerance window size (ppm)", + type=int, + default=20, +) +@click.option( + "--isotope_error", + help="Isotope error levels to consider (list of ints, e.g: 1,2)", + type=str, + default="0", +) def db_search( peak_path: Tuple[str], + fasta_path: str, + enzyme: str, + digestion: str, + missed_cleavages: int, + max_mods: int, + min_length: int, + max_length: int, + precursor_tolerance: int, + isotope_error: str, model: Optional[str], config: Optional[str], output: Optional[str], @@ -223,14 +289,30 @@ def db_search( ) -> None: """Perform a search using Casanovo-DB. - PEAK_PATH must be one MGF file that has ANNOTATED spectra, - as output by annotate mode. + PEAK_PATH must be one MGF file. FASTA_PATH must be one FASTA file. """ output = setup_logging(output, verbosity) config, model = setup_model(model, config, output, False) with ModelRunner(config, model) as runner: - logger.info("DB-searching peptides from: %s", peak_path) - runner.db_search(peak_path, output) + logger.info("Performing database search on:") + for peak_file in peak_path: + logger.info(" %s", peak_file) + logger.info("Using the following FASTA file:") + logger.info(" %s", fasta_path) + + runner.db_search( + peak_path, + fasta_path, + enzyme, + digestion, + missed_cleavages, + max_mods, + min_length, + max_length, + precursor_tolerance, + isotope_error, + output, + ) logger.info("DONE!") diff --git a/casanovo/data/datasets.py b/casanovo/data/datasets.py index aff6af85..59f56b68 100644 --- a/casanovo/data/datasets.py +++ b/casanovo/data/datasets.py @@ -134,6 +134,8 @@ def _process_peaks( The precursor m/z. precursor_charge : int The precursor charge. + track_spectrum_id : Optional[bool] + Whether to keep track of the identifier of the MS/MS spectra. 
Returns ------- @@ -212,8 +214,6 @@ class AnnotatedSpectrumDataset(SpectrumDataset): random_state : Optional[int] The NumPy random state. ``None`` leaves mass spectra in the order they were parsed. - track_spectrum_id : Optional[bool] - Whether to keep track of the identifier of the MS/MS spectra. """ def __init__( @@ -225,7 +225,6 @@ def __init__( min_intensity: float = 0.01, remove_precursor_tol: float = 2.0, random_state: Optional[int] = None, - track_spectrum_id: Optional[bool] = False, ): super().__init__( annotated_spectrum_index, @@ -236,7 +235,6 @@ def __init__( remove_precursor_tol=remove_precursor_tol, random_state=random_state, ) - self.track_spectrum_id = track_spectrum_id def __getitem__(self, idx: int) -> Tuple[torch.Tensor, float, int, str]: """ @@ -268,12 +266,4 @@ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, float, int, str]: spectrum = self._process_peaks( mz_array, int_array, precursor_mz, precursor_charge ) - if self.track_spectrum_id: - return ( - spectrum, - precursor_mz, - precursor_charge, - peptide, - self.get_spectrum_id(idx), - ) return spectrum, precursor_mz, precursor_charge, peptide diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py new file mode 100644 index 00000000..c961e35e --- /dev/null +++ b/casanovo/data/db_utils.py @@ -0,0 +1,156 @@ +"""Unique methods used within db-search mode""" + +import os +import depthcharge.masses +from pyteomics import fasta, parser +import bisect + +HYDROGEN = 1.007825035 +OXYGEN = 15.99491463 +H2O = 2 * HYDROGEN + OXYGEN +PROTON = 1.00727646677 +ISOTOPE_SPACING = 1.003355 # - 0.00288 + +var_mods = { + "d": ["N", "Q"], + "ox": ["M"], + "ace-": True, + "carb-": True, + "nh3x-": True, + "carbnh3x-": True, +} +fixed_mods = {"carbm": ["C"]} + + +def convert_from_modx(seq): + """Converts peptide sequence from modX format to Casanovo-acceptable modifications. + + Args: + seq (str): Peptide in modX format + """ + seq = seq.replace("carbmC", "C+57.021") # Fixed modification + seq = seq.replace("oxM", "M+15.995") + seq = seq.replace("dN", "N+0.984") + seq = seq.replace("dQ", "Q+0.984") + seq = seq.replace("ace-", "+42.011") + seq = seq.replace("carbnh3x-", "+43.006-17.027") + seq = seq.replace("carb-", "+43.006") + seq = seq.replace("nh3x-", "-17.027") + return seq + + +def digest_fasta( + fasta_filename, + enzyme, + digestion, + missed_cleavages, + max_mods, + min_length, + max_length, +): + """TODO: Add docstring""" + + # Verify the eistence of the file: + if not os.path.isfile(fasta_filename): + print(f"File {fasta_filename} does not exist.") + raise FileNotFoundError(f"File {fasta_filename} does not exist.") + + fasta_data = fasta.read(fasta_filename) + peptide_list = [] + if digestion in ["full", "partial"]: + semi = True if digestion == "partial" else False + for header, seq in fasta_data: + pep_set = parser.cleave( + seq, + rule=parser.expasy_rules[enzyme], + missed_cleavages=missed_cleavages, + semi=semi, + ) + protein = header.split()[0] + peptide_list.extend([(pep, protein) for pep in pep_set]) + else: + raise ValueError(f"Digestion type {digestion} not recognized.") + + # Generate modified peptides + mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb") + mass_calculator.masses.update({"X": 0.0}) # TODO: REMOVE? 
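For readers less familiar with pyteomics, the cleavage step above reduces to a single parser.cleave call. A minimal standalone sketch, using a prefix of the tiny test protein from the test suite and the same options that digest_fasta forwards from the CLI:

from pyteomics import parser

protein = "MEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGERVTLSCR"

# Fully tryptic digestion with one allowed missed cleavage; semi=True would
# correspond to the 'partial' digestion option.
peptides = parser.cleave(
    protein,
    rule=parser.expasy_rules["trypsin"],
    missed_cleavages=1,
    semi=False,
)
print(sorted(peptides))

parser.cleave returns a set of peptide strings, which is why the loop above re-attaches the protein accession to each peptide before the list is later sorted by mass. The isoform expansion further down follows the same pattern: parser.isoforms emits modX strings such as "oxM" and "carbmC", and convert_from_modx rewrites them into Casanovo's mass-delta notation ("M+15.995", "C+57.021").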
+ mod_peptide_list = [] + for pep, prot in peptide_list: + if len(pep) < min_length or len(pep) > max_length: + continue + peptide_isoforms = parser.isoforms( + pep, + variable_mods=var_mods, + fixed_mods=fixed_mods, + max_mods=max_mods, + ) + peptide_isoforms = list(map(convert_from_modx, peptide_isoforms)) + mod_peptide_list.extend( + (mod_pep, mass_calculator.mass(mod_pep), prot) + for mod_pep in peptide_isoforms + ) + + # Sort the peptides by mass and return. + mod_peptide_list.sort(key=lambda x: x[1]) + return mod_peptide_list + + +def get_candidates( + precursor_mass, charge, peptide_list, precursor_tolerance, isotope_error +): + """TODO: ADD DOCSTRING""" + + candidates = set() + + isotope_error = [int(x) for x in isotope_error.split(",")] + for e in isotope_error: + iso_shift = ISOTOPE_SPACING * e + upper_bound = (_to_raw_mass(precursor_mass, charge) - iso_shift) * ( + 1 + (precursor_tolerance / 1e6) + ) + lower_bound = (_to_raw_mass(precursor_mass, charge) - iso_shift) * ( + 1 - (precursor_tolerance / 1e6) + ) + + start, end = get_mass_indices( + [x[1] for x in peptide_list], lower_bound, upper_bound + ) + + candidates.update(peptide_list[start:end]) + + candidates = list(candidates) + candidates.sort(key=lambda x: x[1]) + return candidates + + +def _to_mz(precursor_mass, charge): + """TODO: ADD DOCSTRING""" + return (precursor_mass + (charge * PROTON)) / charge + + +def _to_raw_mass(mz_mass, charge): + """TODO: ADD DOCSTRING""" + return charge * (mz_mass - PROTON) + + +def get_mass_indices(masses, m_low, m_high): + """Grabs mass indices from a list of mass values that fall within a specified range. + Requires that the mass values are sorted in ascending order. + + Parameters + ---------- + masses : List[int] + List of mass values + m_low : int + Lower bound of mass range (inclusive) + m_high : int + Upper bound of mass range (inclusive) + + Return + ------ + indices : Tuple[int, int] + Indices of mass values that fall within the specified range + """ + start = bisect.bisect_left(masses, m_low) + end = bisect.bisect_right(masses, m_high) + return start, end diff --git a/casanovo/data/ms_io.py b/casanovo/data/ms_io.py index c4cfc7cb..d47b9b04 100644 --- a/casanovo/data/ms_io.py +++ b/casanovo/data/ms_io.py @@ -251,7 +251,6 @@ def save_db_variant(self) -> None: "start", "end", "opt_ms_run[1]_aa_scores", - "opt_cv_MS:1002217_decoy_peptide", ] ) for i, psm in enumerate(self.psms): @@ -259,7 +258,7 @@ def save_db_variant(self) -> None: [ "PSM", psm[0], # sequence - f"{psm[5]}:{i}", # spectra_ref + f"{psm[5]}:{i}", # PSM_ID (spectrum # :candidate #) "null", # accession "null", # unique "null", # database @@ -284,6 +283,5 @@ def save_db_variant(self) -> None: ) ) ), # opt_ms_run[1]_aa_scores - bool(psm[7]), # opt_cv_MS:1002217_decoy_peptide ] ) diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index aff860a1..ba02936c 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -127,13 +127,12 @@ def setup(self, stage: str = None, annotated: bool = True) -> None: self.test_dataset = make_dataset(self.test_index) if stage == "db": make_dataset = functools.partial( - AnnotatedSpectrumDataset, + SpectrumDataset, n_peaks=self.n_peaks, min_mz=self.min_mz, max_mz=self.max_mz, min_intensity=self.min_intensity, remove_precursor_tol=self.remove_precursor_tol, - track_spectrum_id=True, ) if self.test_index is not None: self.test_dataset = make_dataset(self.test_index) @@ -143,7 +142,6 @@ def _make_loader( dataset: torch.utils.data.Dataset, 
batch_size: int, shuffle: bool = False, - db_mode: bool = False, ) -> torch.utils.data.DataLoader: """ Create a PyTorch DataLoader. @@ -167,7 +165,7 @@ def _make_loader( return torch.utils.data.DataLoader( dataset, batch_size=batch_size, - collate_fn=prepare_batch if not db_mode else prepare_db_batch, + collate_fn=prepare_batch, pin_memory=True, num_workers=self.n_workers, shuffle=shuffle, @@ -191,12 +189,6 @@ def predict_dataloader(self) -> torch.utils.data.DataLoader: """Get the predict DataLoader.""" return self._make_loader(self.test_dataset, self.eval_batch_size) - def db_dataloader(self) -> torch.utils.data.DataLoader: - """Get the predict DataLoader.""" - return self._make_loader( - self.test_dataset, self.eval_batch_size, db_mode=True - ) - def prepare_batch( batch: List[Tuple[torch.Tensor, float, int, str]] @@ -235,48 +227,3 @@ def prepare_batch( [precursor_masses, precursor_charges, precursor_mzs] ).T.float() return spectra, precursors, np.asarray(spectrum_ids) - - -def prepare_db_batch( - batch: List[Tuple[torch.Tensor, float, int, str, Tuple[str, str]]] -) -> Tuple[torch.Tensor, torch.Tensor, np.ndarray, Tuple[str, str]]: - """ - Collate MS/MS spectra into a batch meant for Casanovo-DB. - - Parameters - ---------- - batch : List[Tuple[torch.Tensor, Tuple[float, int, float], str, Tuple[str, str]]] - A batch of data from an AnnotatedSpectrumDataset, consisting of for each - spectrum (i) a tensor with the m/z and intensity peak values, - (ii) the precursor information [mass, charge, m/z], (iii) the - peptide sequence, the precursor m/z, (iv) spectrum identifiers - (file and scan). - - Returns - ------- - spectra : torch.Tensor of shape (batch_size, n_peaks, 2) - The padded mass spectra tensor with the m/z and intensity peak values - for each spectrum. - precursors : torch.Tensor of shape (batch_size, 3) - A tensor with the precursor neutral mass, precursor charge, and - precursor m/z. - spectrum_peps : np.ndarray - Peptide sequences - spectrum_ids : Tuple[str, str] - Peak file and spectrum identifier - """ - ( - spectra, - precursor_mzs, - precursor_charges, - spectrum_peps, - spectrum_ids, - ) = list(zip(*batch)) - spectra = torch.nn.utils.rnn.pad_sequence(spectra, batch_first=True) - precursor_mzs = torch.tensor(precursor_mzs) - precursor_charges = torch.tensor(precursor_charges) - precursor_masses = (precursor_mzs - 1.007276) * precursor_charges - precursors = torch.vstack( - [precursor_masses, precursor_charges, precursor_mzs] - ).T.float() - return spectra, precursors, np.asarray(spectrum_peps), spectrum_ids diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 71f4a6fa..be7dba9a 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -17,7 +17,7 @@ from . import evaluate from .. import config -from ..data import ms_io +from ..data import ms_io, db_utils logger = logging.getLogger("casanovo") @@ -1009,19 +1009,18 @@ def predict_step(self, batch, *args): ---------- batch : Tuple[torch.Tensor, torch.Tensor, torch.Tensor] A batch of (i) MS/MS spectra, (ii) precursor information, (iii) - spectrum identifiers as torch Tensors, (iv) scan numbers. 
+ spectrum identifiers as torch Tensors Returns ------- - predictions: List[Tuple[int, bool, str, float, np.ndarray, np.ndarray]] + predictions: List[Tuple[int, str, float, np.ndarray, np.ndarray]] Model predictions for the given batch of spectra containing spectrum - scan number, decoy flag, peptide sequence, Casanovo-DB score, + scan number, peptide sequence, Casanovo-DB score, amino acid-level confidence scores, and precursor information. """ batch_res = [] for ( indexes, - is_decoy, peptides, precursors, encoded_ms, @@ -1034,7 +1033,6 @@ def predict_step(self, batch, *args): batch_res.append( ( indexes, - is_decoy, peptides, score_result.cpu().detach().numpy(), per_aa_score.cpu().detach().numpy(), @@ -1043,27 +1041,25 @@ def predict_step(self, batch, *args): ) return batch_res - def smart_batch_gen(self, batch): + def smart_batch_gen(self, spectrum_batch): + """TODO: ADD DOCSTRING""" all_psm = [] - batch_size = len(batch[0]) - enc = self.encoder(batch[0]) - precursors = batch[1] - indexes = batch[3] + batch_size = len(spectrum_batch[0]) + enc = self.encoder(spectrum_batch[0]) enc = list(zip(*enc)) + precursors = spectrum_batch[1] + indexes = spectrum_batch[2] for idx in range(batch_size): - spec_peptides = batch[2][idx].split(",") - # Check for decoy prefixes and create a bit-vector indicating targets (1) or decoys (0) - decoy_prefix = "decoy_" # Decoy prefix - id_decoys = np.array( - [ - (0, p.removeprefix(decoy_prefix)) - if p.startswith(decoy_prefix) - else (1, p) - for p in spec_peptides - ] + spec_peptides = db_utils.get_candidates( + precursors[idx][2], + precursors[idx][1], + self.digest, + self.precursor_tolerance, + self.isotope_error, ) - decoy_mask = np.array(id_decoys[:, 0], dtype=bool) - spec_peptides = list(id_decoys[:, 1]) + spec_peptides = [ + a[0] for a in spec_peptides + ] # TODO: USE MASS AND PROTEIN INFORMATION spec_precursors = [precursors[idx]] * len(spec_peptides) spec_enc = [enc[idx]] * len(spec_peptides) spec_idx = [indexes[idx]] * len(spec_peptides) @@ -1074,24 +1070,22 @@ def smart_batch_gen(self, batch): spec_precursors, spec_peptides, spec_idx, - decoy_mask, ) ) ) # Continually grab num_pairs items from all_psm until list is exhausted while len(all_psm) > 0: - batch = all_psm[:batch_size] + psm_batch = all_psm[:batch_size] all_psm = all_psm[batch_size:] - batch = list(zip(*batch)) + psm_batch = list(zip(*psm_batch)) encoded_ms = ( - torch.stack([a[0] for a in batch[0]]), - torch.stack([a[1] for a in batch[0]]), + torch.stack([a[0] for a in psm_batch[0]]), + torch.stack([a[1] for a in psm_batch[0]]), ) - prec_data = torch.stack(batch[1]) - pep_str = list(batch[2]) - indexes = [a[1] for a in batch[3]] - is_decoy = batch[4] - yield (indexes, is_decoy, pep_str, prec_data, encoded_ms) + prec_data = torch.stack(psm_batch[1]) + pep_str = list(psm_batch[2]) + indexes = [a[1] for a in psm_batch[3]] + yield (indexes, pep_str, prec_data, encoded_ms) def on_predict_batch_end( self, @@ -1102,7 +1096,6 @@ def on_predict_batch_end( return for ( indexes, - t_or_d, peptides, score_result, per_aa_score, @@ -1123,7 +1116,6 @@ def on_predict_batch_end( calc_mz, indexes, per_aa_score, - t_or_d, ): self.out_writer.psms.append(row) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index e150ab2d..73dfdff2 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -18,7 +18,7 @@ from lightning.pytorch.callbacks import ModelCheckpoint from ..config import Config -from ..data import ms_io +from ..data import ms_io, db_utils 
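The regrouping that smart_batch_gen performs is easier to see without the tensor bookkeeping: every spectrum is paired with each of its candidate peptides, and the flat list of pairs is re-cut into fixed-size batches so that each decoder pass scores the same number of PSMs regardless of how many candidates any one spectrum drew. A toy sketch of that idea (the spectrum and peptide names are placeholders):

from itertools import islice

spectra = ["spec0", "spec1"]
candidates = {"spec0": ["LESLIEK", "ATSIPAR"], "spec1": ["PEPTIDER"]}

# Pair every spectrum with each of its candidate peptides ...
all_psms = [(spec, pep) for spec in spectra for pep in candidates[spec]]

# ... then re-chunk the flat PSM list into equally sized scoring batches.
def chunks(items, size):
    it = iter(items)
    while batch := list(islice(it, size)):
        yield batch

for psm_batch in chunks(all_psms, size=2):
    print(psm_batch)

Each yielded batch corresponds to one forward pass through the decoder in predict_step, however many candidates any individual spectrum happened to have.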
from ..denovo.dataloaders import DeNovoDataModule from ..denovo.model import Spec2Pep, DbSpec2Pep @@ -79,13 +79,29 @@ def __exit__(self, exc_type, exc_value, traceback): if self.writer is not None: self.writer.save() - def db_search(self, peak_path: Iterable[str], output: str) -> None: + def db_search( + self, + peak_path: Iterable[str], + fasta_path: str, + enzyme: str, + digestion: str, + missed_cleavages: int, + max_mods: int, + min_length: int, + max_length: int, + precursor_tolerance: float, + isotope_error: float, + output: str, + ) -> None: """Perform database search with Casanovo. Parameters ---------- - peak_path : iterable of str - The path to the annotated .mgf data files for database search. + peak_path : Iterable[str] + The path to the .mgf data file for database search. + fasta_path : str + The path to the FASTA file for database search. + # TODO: ADD ALL DOCUMENTATION output : str Where should the output be saved? @@ -105,12 +121,23 @@ def db_search(self, peak_path: Iterable[str], output: str) -> None: self.initialize_trainer(train=True) self.initialize_model(train=False, db_search=True) self.model.out_writer = self.writer + self.model.digest = db_utils.digest_fasta( + fasta_path, + enzyme, + digestion, + missed_cleavages, + max_mods, + min_length, + max_length, + ) + self.model.precursor_tolerance = precursor_tolerance + self.model.isotope_error = isotope_error - test_index = self._get_index(peak_path, True, "db search") + test_index = self._get_index(peak_path, False, "db search") self.writer.set_ms_run(test_index.ms_files) self.initialize_data_module(test_index=test_index) self.loaders.setup(stage="db") - self.trainer.predict(self.model, self.loaders.db_dataloader()) + self.trainer.predict(self.model, self.loaders.predict_dataloader()) def train( self, From f7dfbc8356d8993c219dbfaeccf59753f555fa07 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Tue, 2 Jul 2024 19:54:56 -0700 Subject: [PATCH 02/21] tested implementation of db search --- casanovo/casanovo.py | 107 ++++---- casanovo/data/annotate_db.py | 138 ---------- casanovo/data/db_utils.py | 109 ++++++-- casanovo/data/ms_io.py | 2 +- casanovo/denovo/model.py | 32 ++- casanovo/denovo/model_runner.py | 19 +- tests/conftest.py | 51 +++- tests/test_integration.py | 99 +------- tests/unit_tests/test_unit.py | 430 +++++++++++++++++++++++++++++++- 9 files changed, 666 insertions(+), 321 deletions(-) delete mode 100644 casanovo/data/annotate_db.py diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index df3cc79f..8ae9a81b 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -42,7 +42,6 @@ from . import utils from .denovo import ModelRunner from .config import Config -from .data.annotate_db import annotate_mgf logger = logging.getLogger("casanovo") click.rich_click.USE_MARKDOWN = True @@ -146,67 +145,6 @@ def sequence( logger.info("DONE!") -@main.command() -@click.argument( - "peak_path", - required=True, - nargs=1, - type=click.Path(exists=True, dir_okay=False), -) -@click.argument( - "tide_path", - required=True, - nargs=1, - type=click.Path(exists=True, dir_okay=True), -) -@click.option( - "-o", - "--output", - help="The output annotated MGF file.", - type=click.Path(dir_okay=False), -) -@click.option( - "-v", - "--verbosity", - help=""" - Set the verbosity of console logging messages. Log files are - always set to 'debug'. 
- """, - type=click.Choice( - ["debug", "info", "warning", "error"], - case_sensitive=False, - ), - default="info", -) -def annotate( - peak_path: str, - tide_path: str, - output: Optional[str], - verbosity: str, -) -> None: - """Annotate a given .mgf with candidates as selected by a Tide search for Casanovo-DB. - - PEAK_PATH must be one MGF file from which to annotate spectra. - - TIDE_PATH must be one directory containing the Tide search results of the .mgf. - This directory must contain tide-search.decoy.txt and tide-search.target.txt - """ - if output is None: - output = setup_logging(output, verbosity) - logger.info( - "Output file not specified. \ - Annotated MGF will be saved in the same directory \ - as the input MGF." - ) - output = peak_path.replace(".mgf", "_annotated.mgf") - else: - output = setup_logging(output, verbosity) - - annotate_mgf(peak_path, tide_path, output) - - logger.info("DONE!") - - @main.command(cls=_SharedParams) @click.argument( "peak_path", @@ -222,8 +160,47 @@ def annotate( ) @click.option( "--enzyme", - help="Enzyme for in silico digestion, see pyteomics.parser.expasy_rules", - type=str, + help="Enzyme for in silico digestion, \ + See pyteomics.parser.expasy_rules for valid enzymes", + type=click.Choice( + [ + "arg-c", + "asp-n", + "bnps-skatole", + "caspase 1", + "caspase 2", + "caspase 3", + "caspase 4", + "caspase 5", + "caspase 6", + "caspase 7", + "caspase 8", + "caspase 9", + "caspase 10", + "chymotrypsin high specificity", + "chymotrypsin low specificity", + "clostripain", + "cnbr", + "enterokinase", + "factor xa", + "formic acid", + "glutamyl endopeptidase", + "granzyme b", + "hydroxylamine", + "iodosobenzoic acid", + "lysc", + "ntcb", + "pepsin ph1.3", + "pepsin ph2.0", + "proline endopeptidase", + "proteinase k", + "staphylococcal peptidase i", + "thermolysin", + "thrombin", + "trypsin", + "trypsin_exception", + ] + ), default="trypsin", ) @click.option( @@ -287,7 +264,7 @@ def db_search( output: Optional[str], verbosity: str, ) -> None: - """Perform a search using Casanovo-DB. + """Perform a database search on MS/MS data using Casanovo-DB. PEAK_PATH must be one MGF file. FASTA_PATH must be one FASTA file. """ diff --git a/casanovo/data/annotate_db.py b/casanovo/data/annotate_db.py deleted file mode 100644 index dd2e6c64..00000000 --- a/casanovo/data/annotate_db.py +++ /dev/null @@ -1,138 +0,0 @@ -"""Methods used to annotate an .mgf so that it can be used by Casanovo-DB""" - -from pathlib import Path -from typing import Optional, Tuple -import os -import re -import logging - -import pandas as pd -import pyteomics.mgf as mgf - - -def _normalize_mods(seq: str) -> str: - """ - Turns tide-style modifications into the format used by Casanovo-DB. - - Parameters - ---------- - seq : str - The peptide sequence with tide-style modifications. - - Returns - ------- - str - The peptide sequence with Casanovo-DB-style modifications. 
- """ - logger = logging.getLogger("casanovo") - seq = seq.replace("C", "C+57.021") - seq = re.sub(r"M\[15\.[0-9]*\]", r"M+15.995", seq) - seq = re.sub(r"N\[0\.9[0-9]*\]", r"N+0.984", seq) - seq = re.sub(r"Q\[0\.9[0-9]*\]", r"Q+0.984", seq) - seq = re.sub(r"(.*)\[42\.[0-9]*\]", r"+42.011\1", seq) - seq = re.sub(r"(.*)\[43\.[0-9]*\]", r"+43.006\1", seq) - seq = re.sub(r"(.*)\[\-17\.[0-9]*\]", r"-17.027\1", seq) - seq = re.sub(r"(.*)\[25\.[0-9]*\]", r"+43.006-17.027\1", seq) - return seq - - -def annotate_mgf(peak_path: str, tide_path: str, output: Optional[str]): - """ - Accepts a directory containing the results of a successful tide search, - and an .mgf file containing MS/MS spectra. - The .mgf file is then annotated in the SEQ field with - all of the candidate peptides for each spectrum, as well as their target/decoy status. - This annotated .mgf can be given directly to Casanovo-DB to perfrom a database search. - - Parameters - ---------- - tide_dir_path : str - Path to the directory containing the results of a successful tide search. - mgf_file : str - Path to the .mgf file containing MS/MS spectra. - output_file : str - Path to where the annotated .mgf will be written. - - """ - logger = logging.getLogger("casanovo") - # Get paths to tide search text files - tdf_path = os.path.join(tide_path, "tide-search.target.txt") - ddf_path = os.path.join(tide_path, "tide-search.decoy.txt") - try: - target_df = pd.read_csv( - tdf_path, sep="\t", usecols=["scan", "sequence", "target/decoy"] - ) - decoy_df = pd.read_csv( - ddf_path, sep="\t", usecols=["scan", "sequence", "target/decoy"] - ) - except FileNotFoundError as e: - logger.error( - "Could not find tide search results in the specified directory. " - "Please ensure that the directory contains the following files: " - "tide-search.target.txt and tide-search.decoy.txt" - ) - raise e - - logger.info("Successfully read tide search results from %s.", tide_path) - - df = pd.concat([target_df, decoy_df]) - scan_groups = df.groupby("scan")[["sequence", "target/decoy"]] - - scan_map = {} - - for scan, item in scan_groups: - td_group = item.groupby("target/decoy")["sequence"].apply(list) - if "target" in td_group.index: - target_candidate_list = list( - map( - _normalize_mods, - td_group["target"], - ) - ) - else: - target_candidate_list = [] - logger.warn(f"No target peptides found for scan {scan}.") - if "decoy" in td_group.index: - decoy_candidate_list = list( - map( - _normalize_mods, - td_group["decoy"], - ) - ) - decoy_candidate_list = list( - map(lambda x: "decoy_" + str(x), decoy_candidate_list) - ) - else: - decoy_candidate_list = [] - logger.warn(f"No decoy peptides found for scan {scan}.") - - pep_list = target_candidate_list + decoy_candidate_list - if len(pep_list) == 0: - logger.warn(f"No peptides found for scan {scan}.") - else: - scan_map[scan] = target_candidate_list + decoy_candidate_list - - all_spec = [] - for idx, spec_dict in enumerate(mgf.read(peak_path)): - try: - scan = int(spec_dict["params"]["scans"]) - except KeyError as e: - logger.error( - "Could not find the scan number in the .mgf file." - "Please ensure that the .mgf file contains the scan number in the 'SCANS' field." 
- ) - raise e - try: - spec_dict["params"]["seq"] = ",".join(list(scan_map[scan])) - all_spec.append(spec_dict) - except KeyError as e: - # No need to do anything if the scan is not found in the scan map - pass - try: - output = str(output) - mgf.write(all_spec, output, file_mode="w") - logger.info("Annotated .mgf file written to %s.", output) - except Exception as e: - logger.error( - "Write to %s failed. Check if the file path is correct.", output - ) diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index c961e35e..341a6162 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -5,11 +5,14 @@ from pyteomics import fasta, parser import bisect +from typing import List, Tuple + +# CONSTANTS HYDROGEN = 1.007825035 OXYGEN = 15.99491463 H2O = 2 * HYDROGEN + OXYGEN PROTON = 1.00727646677 -ISOTOPE_SPACING = 1.003355 # - 0.00288 +ISOTOPE_SPACING = 1.003355 var_mods = { "d": ["N", "Q"], @@ -22,7 +25,7 @@ fixed_mods = {"carbm": ["C"]} -def convert_from_modx(seq): +def convert_from_modx(seq: str): """Converts peptide sequence from modX format to Casanovo-acceptable modifications. Args: @@ -40,15 +43,41 @@ def convert_from_modx(seq): def digest_fasta( - fasta_filename, - enzyme, - digestion, - missed_cleavages, - max_mods, - min_length, - max_length, + fasta_filename: str, + enzyme: str, + digestion: str, + missed_cleavages: int, + max_mods: int, + min_length: int, + max_length: int, ): - """TODO: Add docstring""" + """ + Digests a FASTA file and returns the peptides, their masses, and associated protein. + + Parameters + ---------- + fasta_filename : str + Path to the FASTA file. + enzyme : str + The enzyme to use for digestion. + See pyteomics.parser.expasy_rules for valid enzymes. + digestion : str + The type of digestion to perform. Either 'full' or 'partial'. + missed_cleavages : int + The number of missed cleavages to allow. + max_mods : int + The maximum number of modifications to allow per peptide. + min_length : int + The minimum length of peptides to consider. + max_length : int + The maximum length of peptides to consider. + + Returns + ------- + mod_peptide_list : List[Tuple[str, float, str]] + A list of tuples containing the peptide sequence, mass, + and associated protein. Sorted by neutral mass in ascending order. + """ # Verify the eistence of the file: if not os.path.isfile(fasta_filename): @@ -96,19 +125,39 @@ def digest_fasta( def get_candidates( - precursor_mass, charge, peptide_list, precursor_tolerance, isotope_error + precursor_mz: float, + charge: int, + peptide_list: List[Tuple[str, float, str]], + precursor_tolerance: int, + isotope_error: str, ): - """TODO: ADD DOCSTRING""" + """ + Returns a list of candidate peptides that fall within the specified mass range. + + Parameters + ---------- + precursor_mz : float + The precursor mass-to-charge ratio. + charge : int + The precursor charge. + peptide_list : List[Tuple[str, float, str]] + A list of tuples containing the peptide sequence, mass, and associated protein. + Must be sorted by mass in ascending order. Uses neutral masses. + precursor_tolerance : float + The precursor mass tolerance in parts-per-million. + isotope_error : str + The isotope error levels to consider. 
+ """ candidates = set() isotope_error = [int(x) for x in isotope_error.split(",")] for e in isotope_error: iso_shift = ISOTOPE_SPACING * e - upper_bound = (_to_raw_mass(precursor_mass, charge) - iso_shift) * ( + upper_bound = (_to_raw_mass(precursor_mz, charge) - iso_shift) * ( 1 + (precursor_tolerance / 1e6) ) - lower_bound = (_to_raw_mass(precursor_mass, charge) - iso_shift) * ( + lower_bound = (_to_raw_mass(precursor_mz, charge) - iso_shift) * ( 1 - (precursor_tolerance / 1e6) ) @@ -124,12 +173,40 @@ def get_candidates( def _to_mz(precursor_mass, charge): - """TODO: ADD DOCSTRING""" + """ + Convert precursor neutral mass to m/z value. + + Parameters + ---------- + precursor_mass : float + The precursor neutral mass. + charge : int + The precursor charge. + + Returns + ------- + mz : float + The calculated precursor mass-to-charge ratio. + """ return (precursor_mass + (charge * PROTON)) / charge def _to_raw_mass(mz_mass, charge): - """TODO: ADD DOCSTRING""" + """ + Convert precursor m/z value to neutral mass. + + Parameters + ---------- + mz_mass : float + The precursor mass-to-charge ratio. + charge : int + The precursor charge. + + Returns + ------- + mass : float + The calculated precursor neutral mass. + """ return charge * (mz_mass - PROTON) diff --git a/casanovo/data/ms_io.py b/casanovo/data/ms_io.py index d47b9b04..a701b627 100644 --- a/casanovo/data/ms_io.py +++ b/casanovo/data/ms_io.py @@ -221,7 +221,7 @@ def save_db_variant(self) -> None: Export the Casanovo-DB search results to the mzTab file. Outputs PSMs in the order they were scored - (i.e. the order in the annotated .mgf file). + (i.e. the order in the .mgf file). """ with open(self.filename, "w", newline="") as f: writer = csv.writer(f, delimiter="\t", lineterminator=os.linesep) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index be7dba9a..4d9bd41b 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -1009,7 +1009,7 @@ def predict_step(self, batch, *args): ---------- batch : Tuple[torch.Tensor, torch.Tensor, torch.Tensor] A batch of (i) MS/MS spectra, (ii) precursor information, (iii) - spectrum identifiers as torch Tensors + spectrum identifiers as torch Tensors. Returns ------- @@ -1042,7 +1042,21 @@ def predict_step(self, batch, *args): return batch_res def smart_batch_gen(self, spectrum_batch): - """TODO: ADD DOCSTRING""" + """ + Transforms a batch of spectra into multiple equally-sized batches of PSMs. + + Parameters + ---------- + spectrum batch : Tuple[torch.Tensor, torch.Tensor, torch.Tensor] + A batch of (i) MS/MS spectra, (ii) precursor information, (iii) + spectrum identifiers as torch Tensors. + + Yields + ------- + psm_batch: Tuple[List[int], List[str], torch.Tensor, Tuple[torch.Tensor, torch.Tensor]] + A batch of PSMs containing the spectrum index, peptide sequence, + precursor information, and encoded MS/MS spectra. 
+ """ all_psm = [] batch_size = len(spectrum_batch[0]) enc = self.encoder(spectrum_batch[0]) @@ -1050,16 +1064,22 @@ def smart_batch_gen(self, spectrum_batch): precursors = spectrum_batch[1] indexes = spectrum_batch[2] for idx in range(batch_size): - spec_peptides = db_utils.get_candidates( + digest_data = db_utils.get_candidates( precursors[idx][2], precursors[idx][1], self.digest, self.precursor_tolerance, self.isotope_error, ) - spec_peptides = [ - a[0] for a in spec_peptides - ] # TODO: USE MASS AND PROTEIN INFORMATION + logger.debug("%s", digest_data) + try: + spec_peptides, pep_masses, pep_protein = list( + zip(*digest_data) + ) + except ValueError: + logger.info( + "No peptides found for precursor %s", precursors[idx] + ) spec_precursors = [precursors[idx]] * len(spec_peptides) spec_enc = [enc[idx]] * len(spec_peptides) spec_idx = [indexes[idx]] * len(spec_peptides) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 73dfdff2..284acbe8 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -90,7 +90,7 @@ def db_search( min_length: int, max_length: int, precursor_tolerance: float, - isotope_error: float, + isotope_error: str, output: str, ) -> None: """Perform database search with Casanovo. @@ -101,7 +101,22 @@ def db_search( The path to the .mgf data file for database search. fasta_path : str The path to the FASTA file for database search. - # TODO: ADD ALL DOCUMENTATION + enzyme : str + The enzyme used for digestion. + digestion : str + The digestion type, full or partial. + missed_cleavages : int + The number of missed cleavages allowed. + max_mods : int + The maximum number of modifications allowed per peptide. + min_length : int + The minimum peptide length. + max_length : int + The maximum peptide length. + precursor_tolerance : float + The precursor mass tolerance in ppm. + isotope_error : str + Isotope error levels to consider, in comma-delineated string form. output : str Where should the output be saved? 
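Pulling the pieces together, the candidate lookup that db_search ultimately drives is a ppm window around the precursor's neutral mass plus two binary searches over the mass-sorted digest. A compact sketch using the same constants as db_utils.py (the peptide masses below are approximate and purely illustrative):

import bisect

PROTON = 1.00727646677
ISOTOPE_SPACING = 1.003355

def neutral_mass(mz, charge):
    return charge * (mz - PROTON)

# Mass-sorted (peptide, neutral mass) pairs, as produced by digest_fasta.
peptides = [("ATSIPAR", 714.40), ("LESLIEK", 830.48), ("PEPTIDER", 955.46)]
masses = [m for _, m in peptides]

mz, charge, tol_ppm, isotope = 416.25, 2, 20, 0
center = neutral_mass(mz, charge) - isotope * ISOTOPE_SPACING
lo, hi = center * (1 - tol_ppm / 1e6), center * (1 + tol_ppm / 1e6)

start = bisect.bisect_left(masses, lo)
end = bisect.bisect_right(masses, hi)
print(peptides[start:end])  # -> [('LESLIEK', 830.48)]

Because the digest is sorted once up front, each spectrum costs only two bisect calls per isotope error level, which is what keeps get_candidates cheap even for large FASTA files.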
diff --git a/tests/conftest.py b/tests/conftest.py index eed4f39a..cac1a873 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,7 +5,7 @@ import psims import pytest import yaml -from pyteomics.mass import calculate_mass +from pyteomics.mass import calculate_mass, fast_mass, std_aa_mass @pytest.fixture @@ -263,6 +263,36 @@ def tiny_config(tmp_path): return cfg_file +@pytest.fixture +def tiny_fasta_file(tmp_path, fasta_raw_data): + fasta_file = tmp_path / "tiny_fasta.fasta" + with fasta_file.open("w+") as fasta_ref: + fasta_ref.write(fasta_raw_data) + + return fasta_file + + +@pytest.fixture +def fasta_raw_data(): + return ">foo\nMEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGERVTLSCRASQSVSSSYLTWYQQKPGQAPRLLIYGASTRATSIPARFSGSGSGTDFTLTISSLQPEDFAVYYCQQDYNLP" + + +@pytest.fixture +def mgf_db_search(tmp_path): + """An MGF file with 2 unannotated spectra and scan numbers.""" + peptides = [ + "ATSIPAR", + "VTLSCR", + "LLIYGASTR", + "EIVMTQSPPTLSLSPGER", + "MEAPAQLLFLLLLWLPDTTR", + "ASQSVSSSYLTWYQQKPGQAPR", + "FSGSGSGTDFTLTISSLQPEDFAVYYCQQDYNLP", + ] + mgf_file = tmp_path / "db_search.mgf" + return _create_unannotated_mgf(peptides, mgf_file, c_mod=True) + + @pytest.fixture def mgf_small_unannotated(tmp_path): """An MGF file with 2 unannotated spectra and scan numbers.""" @@ -271,7 +301,7 @@ def mgf_small_unannotated(tmp_path): return _create_unannotated_mgf(peptides, mgf_file) -def _create_unannotated_mgf(peptides, mgf_file, random_state=999): +def _create_unannotated_mgf(peptides, mgf_file, random_state=999, c_mod=False): """ Create a fake MGF file from one or more peptides. This file will have no SEQ= parameter, but will have a SCANS= parameter. @@ -284,6 +314,9 @@ def _create_unannotated_mgf(peptides, mgf_file, random_state=999): The MGF file to create. random_state : int or numpy.random.Generator, optional The random seed. The charge states are chosen to be 2 or 3 randomly. + c_mod : bool, optional + Whether to use the constant carbamidomethylation + of C in mass calculations. Returns ------- @@ -291,7 +324,7 @@ def _create_unannotated_mgf(peptides, mgf_file, random_state=999): """ rng = np.random.default_rng(random_state) entries = [ - _create_unannotated_mgf_entry(p, idx, rng.choice([2, 3])) + _create_unannotated_mgf_entry(p, idx, rng.choice([2, 3]), c_mod=c_mod) for idx, p in enumerate(peptides) ] with mgf_file.open("w+") as mgf_ref: @@ -300,7 +333,7 @@ def _create_unannotated_mgf(peptides, mgf_file, random_state=999): return mgf_file -def _create_unannotated_mgf_entry(peptide, scan_num, charge): +def _create_unannotated_mgf_entry(peptide, scan_num, charge, c_mod=False): """ Create a MassIVE-KB style MGF entry for a single PSM. Each entry will have no SEQ= parameter, but will have a SCANS= parameter. @@ -313,13 +346,21 @@ def _create_unannotated_mgf_entry(peptide, scan_num, charge): The scan number. charge : int, optional The peptide charge state. + c_mod : bool, optional + Whether to use the constant carbamidomethylation + of C in mass calculations. Returns ------- str The PSM entry in an MGF file format. 
""" - precursor_mz = calculate_mass(peptide, charge=int(charge)) + if not c_mod: + precursor_mz = calculate_mass(peptide, charge=int(charge)) + else: + aa_mass = std_aa_mass + aa_mass.update({"C": 160.030649}) # Carbamidomethylated C mass + precursor_mz = fast_mass(peptide, charge=int(charge), aa_mass=aa_mass) mzs, intensities = _peptide_to_peaks(peptide, charge) frags = "\n".join([f"{m} {i}" for m, i in zip(mzs, intensities)]) diff --git a/tests/test_integration.py b/tests/test_integration.py index 60e3977b..4bd55174 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -7,50 +7,8 @@ from casanovo import casanovo -def test_annotate(mgf_small_unannotated, tide_dir_small, tmp_path): - - # Run a command: - run = functools.partial( - CliRunner().invoke, casanovo.main, catch_exceptions=False - ) - - annotate_args = [ - "annotate", - str(mgf_small_unannotated), - str(tide_dir_small), - "--output", - str(tmp_path / "annotated_mgf.mgf"), - ] - - result = run(annotate_args) - - assert result.exit_code == 0 - assert (tmp_path / "annotated_mgf.mgf").exists() - - # Read in the annotated file - with open(tmp_path / "annotated_mgf.mgf") as f: - annotated_lines = f.readlines() - - # Get each SEQ= line - seq_lines = [line for line in annotated_lines if line.startswith("SEQ=")] - assert len(seq_lines) == 3 - assert ( - seq_lines[0].strip() - == "SEQ=LESLIEK,PEPTIDEK,decoy_KEILSEL,decoy_KEDITEPP" - ) - assert ( - seq_lines[1].strip() - == "SEQ=LESLIEK,PEPTIDEK,decoy_KEILSEL,decoy_KEDITEPP" - ) - assert ( - seq_lines[2].strip() == "SEQ=+42.011LEM+15.995SLIM+15.995EK," - "+43.006PEN+0.984PTIQ+0.984DEK,decoy_-17.027KM+15.995EILSEL," - "decoy_+43.006-17.027KEDITEPP,decoy_KEDIQ+0.984TEPPQ+0.984" - ) - - def test_db_search( - mgf_small_unannotated, tide_dir_small, tiny_config, tmp_path, monkeypatch + mgf_db_search, tiny_fasta_file, tiny_config, tmp_path, monkeypatch ): # Run a command: monkeypatch.setattr(casanovo, "__version__", "4.1.0") @@ -58,30 +16,18 @@ def test_db_search( CliRunner().invoke, casanovo.main, catch_exceptions=False ) - annotate_args = [ - "annotate", - str(mgf_small_unannotated), - str(tide_dir_small), - "--output", - str(tmp_path / "annotated_mgf.mgf"), - ] - - result = run(annotate_args) - - assert result.exit_code == 0 - assert (tmp_path / "annotated_mgf.mgf").exists() - - # Follow up annotate run with db search - output_path = tmp_path / "db_search.mztab" search_args = [ "db-search", - str(tmp_path / "annotated_mgf.mgf"), "--config", tiny_config, "--output", str(output_path), + "--precursor_tolerance", + str(100), + str(mgf_db_search), + str(tiny_fasta_file), ] result = run(search_args) @@ -94,34 +40,13 @@ def test_db_search( psms = mztab.spectrum_match_table assert list(psms.sequence) == [ - "LESLIEK", - "PEPTIDEK", - "KEILSEL", - "KEDITEPP", - "LESLIEK", - "PEPTIDEK", - "KEILSEL", - "KEDITEPP", - "+42.011LEM+15.995SLIM+15.995EK", - "+43.006PEN+0.984PTIQ+0.984DEK", - "-17.027KM+15.995EILSEL", - "+43.006-17.027KEDITEPP", - "KEDIQ+0.984TEPPQ+0.984", - ] - assert list(psms["opt_cv_MS:1002217_decoy_peptide"]) == [ - "True", - "True", - "False", - "False", - "True", - "True", - "False", - "False", - "True", - "True", - "False", - "False", - "False", + "ATSIPAR", + "VTLSC+57.021R", + "LLIYGASTR", + "EIVMTQSPPTLSLSPGER", + "MEAPAQLLFLLLLWLPDTTR", + "ASQSVSSSYLTWYQQKPGQAPR", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", ] diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index ec9085c0..e3707917 100644 --- a/tests/unit_tests/test_unit.py +++ 
b/tests/unit_tests/test_unit.py @@ -10,10 +10,11 @@ import numpy as np import pytest import torch +import re from casanovo import casanovo from casanovo import utils -from casanovo.data import ms_io +from casanovo.data import ms_io, db_utils from casanovo.data.datasets import SpectrumDataset, AnnotatedSpectrumDataset from casanovo.denovo.evaluate import aa_match_batch, aa_match_metrics from casanovo.denovo.model import Spec2Pep, _aa_pep_score, _calc_match_score @@ -219,6 +220,433 @@ def test_calc_match_score(): assert np.sum(masked_per_aa_scores.numpy()[3]) == 3 +def test_digest_fasta_cleave(fasta_raw_data): + + with open("temp_fasta", "w") as file: + file.write(fasta_raw_data) + + # No missed cleavages + expected_normal = [ + "ATSIPAR", + "VTLSC+57.021R", + "LLIYGASTR", + "EIVMTQSPPTLSLSPGER", + "MEAPAQLLFLLLLWLPDTTR", + "ASQSVSSSYLTWYQQKPGQAPR", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + ] + + # 1 missed cleavage + expected_1missedcleavage = [ + "ATSIPAR", + "VTLSC+57.021R", + "LLIYGASTR", + "LLIYGASTRATSIPAR", + "EIVMTQSPPTLSLSPGER", + "MEAPAQLLFLLLLWLPDTTR", + "ASQSVSSSYLTWYQQKPGQAPR", + "EIVMTQSPPTLSLSPGERVTLSC+57.021R", + "VTLSC+57.021RASQSVSSSYLTWYQQKPGQAPR", + "ASQSVSSSYLTWYQQKPGQAPRLLIYGASTR", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + "MEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGER", + "ATSIPARFSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + ] + + # 3 missed cleavages + expected_3missedcleavage = [ + "ATSIPAR", + "VTLSC+57.021R", + "LLIYGASTR", + "LLIYGASTRATSIPAR", + "EIVMTQSPPTLSLSPGER", + "MEAPAQLLFLLLLWLPDTTR", + "ASQSVSSSYLTWYQQKPGQAPR", + "EIVMTQSPPTLSLSPGERVTLSC+57.021R", + "VTLSC+57.021RASQSVSSSYLTWYQQKPGQAPR", + "ASQSVSSSYLTWYQQKPGQAPRLLIYGASTR", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + "ASQSVSSSYLTWYQQKPGQAPRLLIYGASTRATSIPAR", + "VTLSC+57.021RASQSVSSSYLTWYQQKPGQAPRLLIYGASTR", + "MEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGER", + "ATSIPARFSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + "VTLSC+57.021RASQSVSSSYLTWYQQKPGQAPRLLIYGASTRATSIPAR", + "MEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGERVTLSC+57.021R", + "EIVMTQSPPTLSLSPGERVTLSC+57.021RASQSVSSSYLTWYQQKPGQAPR", + "LLIYGASTRATSIPARFSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + ] + + peptide_list = db_utils.digest_fasta( + fasta_filename="temp_fasta", + enzyme="trypsin", + digestion="full", + missed_cleavages=0, + max_mods=0, + min_length=6, + max_length=50, + ) + peptide_list = [x[0] for x in peptide_list] + assert peptide_list == expected_normal + + peptide_list = db_utils.digest_fasta( + fasta_filename="temp_fasta", + enzyme="trypsin", + digestion="full", + missed_cleavages=1, + max_mods=0, + min_length=6, + max_length=50, + ) + peptide_list = [x[0] for x in peptide_list] + assert peptide_list == expected_1missedcleavage + + peptide_list = db_utils.digest_fasta( + fasta_filename="temp_fasta", + enzyme="trypsin", + digestion="full", + missed_cleavages=3, + max_mods=0, + min_length=6, + max_length=50, + ) + peptide_list = [x[0] for x in peptide_list] + assert peptide_list == expected_3missedcleavage + + +def test_digest_fasta_mods(fasta_raw_data): + + with open("temp_fasta", "w") as file: + file.write(fasta_raw_data) + + # 1 modification allowed + # fixed: C+57.02146 + # variable: 1M+15.994915,1N+0.984016,1Q+0.984016 + # nterm: 1X+42.010565,1X+43.005814,1X-17.026549,1X+25.980265 + expected_1mod = [ + "-17.027ATSIPAR", + "ATSIPAR", + "-17.027VTLSC+57.021R", + "VTLSC+57.021R", + "+43.006-17.027ATSIPAR", + "+42.011ATSIPAR", + "+43.006ATSIPAR", + "+43.006-17.027VTLSC+57.021R", + "+42.011VTLSC+57.021R", + "+43.006VTLSC+57.021R", + 
"-17.027LLIYGASTR", + "LLIYGASTR", + "+43.006-17.027LLIYGASTR", + "+42.011LLIYGASTR", + "+43.006LLIYGASTR", + "-17.027EIVMTQSPPTLSLSPGER", + "EIVMTQSPPTLSLSPGER", + "EIVMTQ+0.984SPPTLSLSPGER", + "EIVM+15.995TQSPPTLSLSPGER", + "+43.006-17.027EIVMTQSPPTLSLSPGER", + "+42.011EIVMTQSPPTLSLSPGER", + "+43.006EIVMTQSPPTLSLSPGER", + "-17.027MEAPAQLLFLLLLWLPDTTR", + "MEAPAQLLFLLLLWLPDTTR", + "MEAPAQ+0.984LLFLLLLWLPDTTR", + "M+15.995EAPAQLLFLLLLWLPDTTR", + "+43.006-17.027MEAPAQLLFLLLLWLPDTTR", + "+42.011MEAPAQLLFLLLLWLPDTTR", + "+43.006MEAPAQLLFLLLLWLPDTTR", + "-17.027ASQSVSSSYLTWYQQKPGQAPR", + "ASQSVSSSYLTWYQQKPGQAPR", + "ASQ+0.984SVSSSYLTWYQQKPGQAPR", + "ASQSVSSSYLTWYQ+0.984QKPGQAPR", + "ASQSVSSSYLTWYQQ+0.984KPGQAPR", + "ASQSVSSSYLTWYQQKPGQ+0.984APR", + "+43.006-17.027ASQSVSSSYLTWYQQKPGQAPR", + "+42.011ASQSVSSSYLTWYQQKPGQAPR", + "+43.006ASQSVSSSYLTWYQQKPGQAPR", + "-17.027FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + "FSGSGSGTDFTLTISSLQ+0.984PEDFAVYYC+57.021QQDYNLP", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021Q+0.984QDYNLP", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQ+0.984DYNLP", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYN+0.984LP", + "+43.006-17.027FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + "+42.011FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + "+43.006FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + ] + + peptide_list = db_utils.digest_fasta( + fasta_filename="temp_fasta", + enzyme="trypsin", + digestion="full", + missed_cleavages=0, + max_mods=1, + min_length=6, + max_length=50, + ) + peptide_list = [x[0] for x in peptide_list] + peptide_list = [ + x + for x in peptide_list + if not re.search( + r"(\+42\.011|\+43\.006|\-17\.027|\+43\.006\-17\.027)+[A-Z]\+", x + ) + ] + assert peptide_list == expected_1mod + + +def test_length_restrictions(fasta_raw_data): + + with open("temp_fasta", "w") as file: + file.write(fasta_raw_data) + + # length between 20 and 50 + expected_long = [ + "MEAPAQLLFLLLLWLPDTTR", + "ASQSVSSSYLTWYQQKPGQAPR", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + ] + + # length between 6 and 8 + expected_short = ["ATSIPAR", "VTLSC+57.021R"] + + peptide_list = db_utils.digest_fasta( + fasta_filename="temp_fasta", + enzyme="trypsin", + digestion="full", + missed_cleavages=0, + max_mods=0, + min_length=20, + max_length=50, + ) + peptide_list = [x[0] for x in peptide_list] + assert peptide_list == expected_long + + peptide_list = db_utils.digest_fasta( + fasta_filename="temp_fasta", + enzyme="trypsin", + digestion="full", + missed_cleavages=0, + max_mods=0, + min_length=6, + max_length=8, + ) + peptide_list = [x[0] for x in peptide_list] + assert peptide_list == expected_short + + +def test_digest_fasta_enzyme(fasta_raw_data): + + with open("temp_fasta", "w") as file: + file.write(fasta_raw_data) + + # arg-c enzyme + expected_argc = [ + "ATSIPAR", + "VTLSC+57.021R", + "LLIYGASTR", + "EIVMTQSPPTLSLSPGER", + "MEAPAQLLFLLLLWLPDTTR", + "ASQSVSSSYLTWYQQKPGQAPR", + "FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", + ] + + # asp-n enzyme + expected_aspn = ["DFAVYYC+57.021QQ", "DFTLTISSLQPE", "MEAPAQLLFLLLLWLP"] + + peptide_list = db_utils.digest_fasta( + fasta_filename="temp_fasta", + enzyme="arg-c", + digestion="full", + missed_cleavages=0, + max_mods=0, + min_length=6, + max_length=50, + ) + peptide_list = [x[0] for x in peptide_list] + assert peptide_list == expected_argc + + peptide_list = db_utils.digest_fasta( + fasta_filename="temp_fasta", + enzyme="asp-n", + digestion="full", + missed_cleavages=0, + max_mods=0, + min_length=6, + max_length=50, + ) + 
peptide_list = [x[0] for x in peptide_list] + assert peptide_list == expected_aspn + + +def test_get_candidates(fasta_raw_data): + + with open("temp_fasta", "w") as file: + file.write(fasta_raw_data) + + # precursor_window is 10000 + expected_smallwindow = ["LLIYGASTR"] + + # precursor window is 150000 + expected_midwindow = ["LLIYGASTR"] + + # precursor window is 600000 + expected_widewindow = ["ATSIPAR", "VTLSC+57.021R", "LLIYGASTR"] + + peptide_list = db_utils.digest_fasta( + fasta_filename="temp_fasta", + enzyme="trypsin", + digestion="full", + missed_cleavages=1, + max_mods=0, + min_length=6, + max_length=50, + ) + + candidates = db_utils.get_candidates( + precursor_mz=496.2, + charge=2, + peptide_list=peptide_list, + precursor_tolerance=10000, + isotope_error="0", + ) + candidates = [x[0] for x in candidates] + assert expected_smallwindow == candidates + + peptide_list = db_utils.digest_fasta( + fasta_filename="temp_fasta", + enzyme="trypsin", + digestion="full", + missed_cleavages=1, + max_mods=0, + min_length=6, + max_length=50, + ) + + candidates = db_utils.get_candidates( + precursor_mz=496.2, + charge=2, + peptide_list=peptide_list, + precursor_tolerance=150000, + isotope_error="0", + ) + candidates = [x[0] for x in candidates] + assert expected_midwindow == candidates + + peptide_list = db_utils.digest_fasta( + fasta_filename="temp_fasta", + enzyme="trypsin", + digestion="full", + missed_cleavages=1, + max_mods=0, + min_length=6, + max_length=50, + ) + + candidates = db_utils.get_candidates( + precursor_mz=496.2, + charge=2, + peptide_list=peptide_list, + precursor_tolerance=600000, + isotope_error="0", + ) + candidates = [x[0] for x in candidates] + assert expected_widewindow == candidates + + +def test_get_candidates_isotope_error(): + + # Tide isotope error windows for 496.2, 2+: + # 0: [980.481617, 1000.289326] + # 1: [979.491114, 999.278813] + # 2: [978.500611, 998.268300] + # 3: [977.510108, 997.257787] + + peptide_list = [ + ("A", 1001), + ("B", 1000), + ("C", 999), + ("D", 998), + ("E", 997), + ("F", 996), + ("G", 995), + ("H", 994), + ("I", 993), + ("J", 992), + ("K", 991), + ("L", 990), + ("M", 989), + ("N", 988), + ("O", 987), + ("P", 986), + ("Q", 985), + ("R", 984), + ("S", 983), + ("T", 982), + ("U", 981), + ("V", 980), + ("W", 979), + ("X", 978), + ("Y", 977), + ("Z", 976), + ] + + peptide_list.sort(key=lambda x: x[1]) + + expected_isotope0 = list("UTSRQPONMLKJIHGFEDCB") + expected_isotope1 = list("VUTSRQPONMLKJIHGFEDC") + expected_isotope2 = list("WVUTSRQPONMLKJIHGFED") + expected_isotope3 = list("XWVUTSRQPONMLKJIHGFE") + expected_isotope0123 = list("XWVUTSRQPONMLKJIHGFEDCB") + + candidates = db_utils.get_candidates( + precursor_mz=496.2, + charge=2, + peptide_list=peptide_list, + precursor_tolerance=10000, + isotope_error="0", + ) + candidates = [x[0] for x in candidates] + assert expected_isotope0 == candidates + + candidates = db_utils.get_candidates( + precursor_mz=496.2, + charge=2, + peptide_list=peptide_list, + precursor_tolerance=10000, + isotope_error="1", + ) + candidates = [x[0] for x in candidates] + assert expected_isotope1 == candidates + + candidates = db_utils.get_candidates( + precursor_mz=496.2, + charge=2, + peptide_list=peptide_list, + precursor_tolerance=10000, + isotope_error="2", + ) + candidates = [x[0] for x in candidates] + assert expected_isotope2 == candidates + + candidates = db_utils.get_candidates( + precursor_mz=496.2, + charge=2, + peptide_list=peptide_list, + precursor_tolerance=10000, + isotope_error="3", + ) + candidates = 
[x[0] for x in candidates] + assert expected_isotope3 == candidates + + candidates = db_utils.get_candidates( + precursor_mz=496.2, + charge=2, + peptide_list=peptide_list, + precursor_tolerance=10000, + isotope_error="0,1,2,3", + ) + candidates = [x[0] for x in candidates] + assert expected_isotope0123 == candidates + + def test_beam_search_decode(): """ Test beam search decoding and its sub-functions. From e2ce3172c89a5c4fc74256689fa3cdf6b01d1faf Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Tue, 2 Jul 2024 20:20:25 -0700 Subject: [PATCH 03/21] fix for issue with 0 candidates --- casanovo/denovo/model.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 4d9bd41b..02a324d3 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -1071,15 +1071,13 @@ def smart_batch_gen(self, spectrum_batch): self.precursor_tolerance, self.isotope_error, ) - logger.debug("%s", digest_data) try: spec_peptides, pep_masses, pep_protein = list( zip(*digest_data) ) except ValueError: - logger.info( - "No peptides found for precursor %s", precursors[idx] - ) + logger.info("No peptides found for spectrum %s", indexes[idx]) + continue spec_precursors = [precursors[idx]] * len(spec_peptides) spec_enc = [enc[idx]] * len(spec_peptides) spec_idx = [indexes[idx]] * len(spec_peptides) From 5ef27e0c7dfffd219e5b248205a7ced0187ce4bb Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Wed, 3 Jul 2024 11:33:36 -0700 Subject: [PATCH 04/21] minor fixes added --- casanovo/data/datasets.py | 2 - casanovo/denovo/dataloaders.py | 13 --- casanovo/denovo/model.py | 31 +++--- casanovo/denovo/model_runner.py | 2 +- tests/conftest.py | 164 +++++++++----------------------- 5 files changed, 67 insertions(+), 145 deletions(-) diff --git a/casanovo/data/datasets.py b/casanovo/data/datasets.py index 59f56b68..6244e88f 100644 --- a/casanovo/data/datasets.py +++ b/casanovo/data/datasets.py @@ -134,8 +134,6 @@ def _process_peaks( The precursor m/z. precursor_charge : int The precursor charge. - track_spectrum_id : Optional[bool] - Whether to keep track of the identifier of the MS/MS spectra. Returns ------- diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index ba02936c..97bfb2fc 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -125,17 +125,6 @@ def setup(self, stage: str = None, annotated: bool = True) -> None: ) if self.test_index is not None: self.test_dataset = make_dataset(self.test_index) - if stage == "db": - make_dataset = functools.partial( - SpectrumDataset, - n_peaks=self.n_peaks, - min_mz=self.min_mz, - max_mz=self.max_mz, - min_intensity=self.min_intensity, - remove_precursor_tol=self.remove_precursor_tol, - ) - if self.test_index is not None: - self.test_dataset = make_dataset(self.test_index) def _make_loader( self, @@ -154,8 +143,6 @@ def _make_loader( The batch size to use. shuffle : bool Option to shuffle the batches. - db_mode : bool - Option to use the DataLoader for Casanovo-DB. Returns ------- diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 02a324d3..312e7f92 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -12,7 +12,6 @@ import numpy as np import lightning.pytorch as pl from torch.utils.tensorboard import SummaryWriter -from pyteomics import mass from depthcharge.components import ModelMixin, PeptideDecoder, SpectrumEncoder from . 
import evaluate @@ -992,10 +991,19 @@ def configure_optimizers( class DbSpec2Pep(Spec2Pep): """ - Inherits Spec2Pep + Subclass of Spec2Pep for the use of Casanovo as an MS/MS database search score function. - Hijacks teacher-forcing implemented in Spec2Pep and - uses it to predict scores between a spectra and associated peptide. + Uses teacher forcing to 'query' Casanovo for its score for each AA + within a candidate peptide, and takes the geometric average of these scores + and reports this as the score for the spectrum-peptide pair. Note that the + geometric mean of the AA scores is actually calculated by a + summation and average of the log of the scores, to preserve numerical + stability. This does not affect PSM ranking. + + Also note that although teacher-forcing is used within this method, + there is *no training* involved. This is a prediction-only method. + + Output is provided in .mztab format. """ def __init__(self, *args, **kwargs): @@ -1119,7 +1127,6 @@ def on_predict_batch_end( per_aa_score, precursors, ) in outputs: - prec_mass = precursors[:, 0] prec_charge = precursors[:, 1] prec_mz = precursors[:, 2] calc_mz = [ @@ -1140,9 +1147,9 @@ def on_predict_batch_end( def _calc_match_score( batch_all_aa_scores: torch.Tensor, - truth_aa_indicies: torch.Tensor, + truth_aa_indices: torch.Tensor, decoder_reverse: bool = False, -) -> List[float]: +) -> Tuple[torch.Tensor, torch.Tensor]: """ Calculate the score between the input spectra and associated peptide. @@ -1158,7 +1165,7 @@ def _calc_match_score( Amino acid scores for all amino acids in the vocabulary for every prediction made to generate the associated peptide (for an entire batch) - truth_aa_indicies : torch.Tensor + truth_aa_indices : torch.Tensor Indicies of the score for each actual amino acid in the peptide (for an entire batch) decoder_reverse : bool @@ -1166,7 +1173,7 @@ def _calc_match_score( Returns ------- - score : list[float], list[list[float]] + (all_scores, per_aa_scores) : Tuple[torch.Tensor, torch.Tensor] The score between the input spectra and associated peptide (for an entire batch) a list of lists of per amino acid scores @@ -1175,7 +1182,7 @@ def _calc_match_score( # Remove trailing tokens from predictions based on decoder reversal if decoder_reverse: batch_all_aa_scores = batch_all_aa_scores[:, 1:] - elif not decoder_reverse: + else: batch_all_aa_scores = batch_all_aa_scores[:, :-1] # Vectorized scoring using efficient indexing. 
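As an aside to the scoring below: the per-amino-acid probabilities gathered here are combined into the PSM score as the mean of their logarithms, which ranks identically to the geometric mean described in the DbSpec2Pep docstring above but avoids underflow for long peptides. A quick numerical check of that equivalence (the probabilities are arbitrary):

import numpy as np

aa_probs = np.array([0.9, 0.8, 0.95, 0.7])
log_mean = np.log(aa_probs).mean()
geo_mean = aa_probs.prod() ** (1 / len(aa_probs))
assert np.isclose(np.exp(log_mean), geo_mean)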
@@ -1186,10 +1193,10 @@ def _calc_match_score( ) cols = torch.arange(0, batch_all_aa_scores.shape[1]).expand_as(rows) - per_aa_scores = batch_all_aa_scores[rows, cols, truth_aa_indicies] + per_aa_scores = batch_all_aa_scores[rows, cols, truth_aa_indices] per_aa_scores[per_aa_scores == 0] += 1e-10 - score_mask = truth_aa_indicies != 0 + score_mask = truth_aa_indices != 0 per_aa_scores[~score_mask] = 0 log_per_aa_scores = torch.log(per_aa_scores) all_scores = torch.where( diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 284acbe8..865df71b 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -151,7 +151,7 @@ def db_search( test_index = self._get_index(peak_path, False, "db search") self.writer.set_ms_run(test_index.ms_files) self.initialize_data_module(test_index=test_index) - self.loaders.setup(stage="db") + self.loaders.setup(stage="test", annotated=False) self.trainer.predict(self.model, self.loaders.predict_dataloader()) def train( diff --git a/tests/conftest.py b/tests/conftest.py index cac1a873..b2244308 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -16,7 +16,37 @@ def mgf_small(tmp_path): return _create_mgf(peptides, mgf_file) -def _create_mgf(peptides, mgf_file, random_state=42): +@pytest.fixture +def tiny_fasta_file(tmp_path, fasta_raw_data): + fasta_file = tmp_path / "tiny_fasta.fasta" + with fasta_file.open("w+") as fasta_ref: + fasta_ref.write(fasta_raw_data) + + return fasta_file + + +@pytest.fixture +def fasta_raw_data(): + return ">foo\nMEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGERVTLSCRASQSVSSSYLTWYQQKPGQAPRLLIYGASTRATSIPARFSGSGSGTDFTLTISSLQPEDFAVYYCQQDYNLP" + + +@pytest.fixture +def mgf_db_search(tmp_path): + """An MGF file with 7 spectra and scan numbers, C+57.021 mass modification considered""" + peptides = [ + "ATSIPAR", + "VTLSCR", + "LLIYGASTR", + "EIVMTQSPPTLSLSPGER", + "MEAPAQLLFLLLLWLPDTTR", + "ASQSVSSSYLTWYQQKPGQAPR", + "FSGSGSGTDFTLTISSLQPEDFAVYYCQQDYNLP", + ] + mgf_file = tmp_path / "db_search.mgf" + return _create_mgf(peptides, mgf_file, c_mod=True) + + +def _create_mgf(peptides, mgf_file, random_state=42, c_mod=False): """ Create a fake MGF file from one or more peptides. @@ -28,20 +58,25 @@ def _create_mgf(peptides, mgf_file, random_state=42): The MGF file to create. random_state : int or numpy.random.Generator, optional The random seed. The charge states are chosen to be 2 or 3 randomly. + c_mod : bool, optional + Whether to use the constant carbamidomethylation + of C in mass calculations. Returns ------- mgf_file : Path """ rng = np.random.default_rng(random_state) - entries = [_create_mgf_entry(p, rng.choice([2, 3])) for p in peptides] + entries = [ + _create_mgf_entry(p, rng.choice([2, 3]), c_mod) for p in peptides + ] with mgf_file.open("w+") as mgf_ref: mgf_ref.write("\n".join(entries)) return mgf_file -def _create_mgf_entry(peptide, charge=2): +def _create_mgf_entry(peptide, charge=2, c_mod=False): """ Create a MassIVE-KB style MGF entry for a single PSM. @@ -51,13 +86,21 @@ def _create_mgf_entry(peptide, charge=2): A peptide sequence. charge : int, optional The peptide charge state. + c_mod : bool, optional + Whether to use the constant carbamidomethylation + of C in mass calculations. Returns ------- str The PSM entry in an MGF file format. 
""" - precursor_mz = calculate_mass(peptide, charge=int(charge)) + if not c_mod: + precursor_mz = calculate_mass(peptide, charge=int(charge)) + else: + aa_mass = std_aa_mass + aa_mass.update({"C": 160.030649}) # Carbamidomethylated C mass + precursor_mz = fast_mass(peptide, charge=int(charge), aa_mass=aa_mass) mzs, intensities = _peptide_to_peaks(peptide, charge) frags = "\n".join([f"{m} {i}" for m, i in zip(mzs, intensities)]) @@ -263,119 +306,6 @@ def tiny_config(tmp_path): return cfg_file -@pytest.fixture -def tiny_fasta_file(tmp_path, fasta_raw_data): - fasta_file = tmp_path / "tiny_fasta.fasta" - with fasta_file.open("w+") as fasta_ref: - fasta_ref.write(fasta_raw_data) - - return fasta_file - - -@pytest.fixture -def fasta_raw_data(): - return ">foo\nMEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGERVTLSCRASQSVSSSYLTWYQQKPGQAPRLLIYGASTRATSIPARFSGSGSGTDFTLTISSLQPEDFAVYYCQQDYNLP" - - -@pytest.fixture -def mgf_db_search(tmp_path): - """An MGF file with 2 unannotated spectra and scan numbers.""" - peptides = [ - "ATSIPAR", - "VTLSCR", - "LLIYGASTR", - "EIVMTQSPPTLSLSPGER", - "MEAPAQLLFLLLLWLPDTTR", - "ASQSVSSSYLTWYQQKPGQAPR", - "FSGSGSGTDFTLTISSLQPEDFAVYYCQQDYNLP", - ] - mgf_file = tmp_path / "db_search.mgf" - return _create_unannotated_mgf(peptides, mgf_file, c_mod=True) - - -@pytest.fixture -def mgf_small_unannotated(tmp_path): - """An MGF file with 2 unannotated spectra and scan numbers.""" - peptides = ["LESLIEK", "PEPTIDEK", "LESTIEK"] - mgf_file = tmp_path / "small_unannotated.mgf" - return _create_unannotated_mgf(peptides, mgf_file) - - -def _create_unannotated_mgf(peptides, mgf_file, random_state=999, c_mod=False): - """ - Create a fake MGF file from one or more peptides. - This file will have no SEQ= parameter, but will have a SCANS= parameter. - - Parameters - ---------- - peptides : str or list of str - The peptides for which to create spectra. - mgf_file : Path - The MGF file to create. - random_state : int or numpy.random.Generator, optional - The random seed. The charge states are chosen to be 2 or 3 randomly. - c_mod : bool, optional - Whether to use the constant carbamidomethylation - of C in mass calculations. - - Returns - ------- - mgf_file : Path - """ - rng = np.random.default_rng(random_state) - entries = [ - _create_unannotated_mgf_entry(p, idx, rng.choice([2, 3]), c_mod=c_mod) - for idx, p in enumerate(peptides) - ] - with mgf_file.open("w+") as mgf_ref: - mgf_ref.write("\n".join(entries)) - - return mgf_file - - -def _create_unannotated_mgf_entry(peptide, scan_num, charge, c_mod=False): - """ - Create a MassIVE-KB style MGF entry for a single PSM. - Each entry will have no SEQ= parameter, but will have a SCANS= parameter. - - Parameters - ---------- - peptide : str - A peptide sequence. - scan_num : int - The scan number. - charge : int, optional - The peptide charge state. - c_mod : bool, optional - Whether to use the constant carbamidomethylation - of C in mass calculations. - - Returns - ------- - str - The PSM entry in an MGF file format. 
- """ - if not c_mod: - precursor_mz = calculate_mass(peptide, charge=int(charge)) - else: - aa_mass = std_aa_mass - aa_mass.update({"C": 160.030649}) # Carbamidomethylated C mass - precursor_mz = fast_mass(peptide, charge=int(charge), aa_mass=aa_mass) - mzs, intensities = _peptide_to_peaks(peptide, charge) - frags = "\n".join([f"{m} {i}" for m, i in zip(mzs, intensities)]) - - mgf = [ - "BEGIN IONS", - f"TITLE=title::{scan_num}", - f"PEPMASS={precursor_mz}", - f"CHARGE={charge}+", - f"SCANS={scan_num}", - f"{frags}", - "END IONS", - ] - return "\n".join(mgf) - - @pytest.fixture def tide_dir_small(tmp_path): """A directory with a very small TIDE search result.""" From 5f0675f032579e2976718c619969bdfd47cc68c5 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Wed, 3 Jul 2024 14:20:56 -0700 Subject: [PATCH 05/21] reordered and renamed variables for consistency --- casanovo/denovo/model.py | 45 ++++++++++++++++++--------------- casanovo/denovo/model_runner.py | 10 ++++---- 2 files changed, 29 insertions(+), 26 deletions(-) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 312e7f92..8bb0dbee 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -1021,30 +1021,34 @@ def predict_step(self, batch, *args): Returns ------- - predictions: List[Tuple[int, str, float, np.ndarray, np.ndarray]] + predictions: List[Tuple[int, int, float, str, np.ndarray, np.ndarray]] Model predictions for the given batch of spectra containing spectrum - scan number, peptide sequence, Casanovo-DB score, - amino acid-level confidence scores, and precursor information. + ids, precursor charge and m/z, candidate peptide sequences, peptide + scores, and amino acid-level scores. """ batch_res = [] for ( - indexes, + spectrum_i, peptides, precursors, encoded_ms, ) in self.smart_batch_gen(batch): pred, truth = self.decoder(peptides, precursors, *encoded_ms) pred = self.softmax(pred) - score_result, per_aa_score = _calc_match_score( + peptide_scores, aa_scores = _calc_match_score( pred, truth, self.decoder.reverse ) + precursor_info = precursors.cpu().detach().numpy() + precursor_charge = precursor_info[:, 1] + precursor_mz = precursor_info[:, 2] batch_res.append( ( - indexes, + spectrum_i, + precursor_charge, + precursor_mz, peptides, - score_result.cpu().detach().numpy(), - per_aa_score.cpu().detach().numpy(), - precursors.cpu().detach().numpy(), + peptide_scores.cpu().detach().numpy(), + aa_scores.cpu().detach().numpy(), ) ) return batch_res @@ -1121,26 +1125,25 @@ def on_predict_batch_end( if self.out_writer is None: return for ( - indexes, + spectrum_i, + precursor_charge, + precursor_mz, peptides, - score_result, - per_aa_score, - precursors, + peptide_scores, + aa_scores, ) in outputs: - prec_charge = precursors[:, 1] - prec_mz = precursors[:, 2] calc_mz = [ self.peptide_mass_calculator.mass(peptide, charge) - for peptide, charge in zip(peptides, prec_charge) + for peptide, charge in zip(peptides, precursor_charge) ] for row in zip( peptides, - score_result, - prec_charge, - prec_mz, + peptide_scores, + precursor_charge, + precursor_mz, calc_mz, - indexes, - per_aa_score, + spectrum_i, + aa_scores, ): self.out_writer.psms.append(row) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 865df71b..1457df38 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -333,12 +333,12 @@ def initialize_model( if self.model_filename is None: # Train a model from scratch if no model file is provided. 
+ if db_search: + logger.error("DB search mode requires a model file") + raise ValueError( + "A model file must be provided for DB search mode" + ) if train: - if db_search: - logger.error("Db search mode requires a model file.") - raise ValueError( - "A model file must be provided for DB search mode" - ) self.model = Spec2Pep(**model_params) return # Else we're not training, so a model file must be provided. From b4fd8ff05eaebcf62351627c8ceee2fee3bc23a1 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Thu, 4 Jul 2024 14:39:36 -0700 Subject: [PATCH 06/21] casanovo-db full working version with code simplification --- casanovo/data/db_utils.py | 16 +++- casanovo/data/ms_io.py | 80 +--------------- casanovo/denovo/dataloaders.py | 111 ++++++++++++++++++++++ casanovo/denovo/model.py | 159 +++++++++++--------------------- casanovo/denovo/model_runner.py | 32 ++++--- 5 files changed, 198 insertions(+), 200 deletions(-) diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index 341a6162..921c75bd 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -4,9 +4,12 @@ import depthcharge.masses from pyteomics import fasta, parser import bisect +import logging from typing import List, Tuple +logger = logging.getLogger("casanovo") + # CONSTANTS HYDROGEN = 1.007825035 OXYGEN = 15.99491463 @@ -96,17 +99,22 @@ def digest_fasta( semi=semi, ) protein = header.split()[0] - peptide_list.extend([(pep, protein) for pep in pep_set]) + for pep in pep_set: + if len(pep) < min_length or len(pep) > max_length: + continue + if "X" in pep or "U" in pep: + logger.warn( + "Skipping peptide with ambiguous amino acids: %s", pep + ) + continue + peptide_list.append((pep, protein)) else: raise ValueError(f"Digestion type {digestion} not recognized.") # Generate modified peptides mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb") - mass_calculator.masses.update({"X": 0.0}) # TODO: REMOVE? mod_peptide_list = [] for pep, prot in peptide_list: - if len(pep) < min_length or len(pep) > max_length: - continue peptide_isoforms = parser.isoforms( pep, variable_mods=var_mods, diff --git a/casanovo/data/ms_io.py b/casanovo/data/ms_io.py index a701b627..b27f083b 100644 --- a/casanovo/data/ms_io.py +++ b/casanovo/data/ms_io.py @@ -22,13 +22,10 @@ class MztabWriter: ---------- filename : str The name of the mzTab file. - is_db_variant : bool - Whether the mzTab file is for a Casanovo-DB search. """ - def __init__(self, filename: str, is_db_variant: bool = False): + def __init__(self, filename: str): self.filename = filename - self.is_db_variant = is_db_variant self.metadata = [ ("mzTab-version", "1.0.0"), ("mzTab-mode", "Summary"), @@ -150,9 +147,6 @@ def save(self) -> None: """ Export the spectrum identifications to the mzTab file. """ - if self.is_db_variant: - self.save_db_variant() - return with open(self.filename, "w", newline="") as f: writer = csv.writer(f, delimiter="\t", lineterminator=os.linesep) # Write metadata. @@ -192,7 +186,7 @@ def save(self) -> None: "PSM", psm[0], # sequence i, # PSM_ID - "null", # accession + "null" if len(psm) < 8 else psm[7], # accession "null", # unique "null", # database "null", # database_version @@ -215,73 +209,3 @@ def save(self) -> None: psm[6], # opt_ms_run[1]_aa_scores ] ) - - def save_db_variant(self) -> None: - """ - Export the Casanovo-DB search results to the mzTab file. - - Outputs PSMs in the order they were scored - (i.e. the order in the .mgf file). 
- """ - with open(self.filename, "w", newline="") as f: - writer = csv.writer(f, delimiter="\t", lineterminator=os.linesep) - # Write metadata. - for row in self.metadata: - writer.writerow(["MTD", *row]) - # Write PSMs. - writer.writerow( - [ - "PSH", - "sequence", - "PSM_ID", - "accession", - "unique", - "database", - "database_version", - "search_engine", - "search_engine_score[1]", - "modifications", - "retention_time", - "charge", - "exp_mass_to_charge", - "calc_mass_to_charge", - "spectra_ref", - "pre", - "post", - "start", - "end", - "opt_ms_run[1]_aa_scores", - ] - ) - for i, psm in enumerate(self.psms): - writer.writerow( - [ - "PSM", - psm[0], # sequence - f"{psm[5]}:{i}", # PSM_ID (spectrum # :candidate #) - "null", # accession - "null", # unique - "null", # database - "null", # database_version - "null", # search_engine - psm[1], # search_engine_score[1] - "null", # modifications - "null", # retention_time - int(psm[2]), # charge - psm[3], # exp_mass_to_charge - psm[4], # calc_mass_to_charge - psm[5], # spectra_ref - "null", # pre - "null", # post - "null", # start - "null", # end - ",".join( - list( - map( - "{:.5f}".format, - psm[6][psm[6] != 0], - ) - ) - ), # opt_ms_run[1]_aa_scores - ] - ) diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 97bfb2fc..80a4f7dc 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -3,6 +3,8 @@ import functools import os from typing import List, Optional, Tuple +from functools import partial +import logging import lightning.pytorch as pl import numpy as np @@ -13,6 +15,9 @@ AnnotatedSpectrumDataset, SpectrumDataset, ) +from ..data import db_utils + +logger = logging.getLogger("casanovo") class DeNovoDataModule(pl.LightningDataModule): @@ -176,6 +181,22 @@ def predict_dataloader(self) -> torch.utils.data.DataLoader: """Get the predict DataLoader.""" return self._make_loader(self.test_dataset, self.eval_batch_size) + def db_dataloader(self) -> torch.utils.data.DataLoader: + """Get a special dataloader for DB search""" + return torch.utils.data.DataLoader( + self.test_dataset, + batch_size=self.eval_batch_size, + collate_fn=partial( + prepare_psm_batch, + digest=self.digest, + precursor_tolerance=self.precursor_tolerance, + isotope_error=self.isotope_error, + ), + pin_memory=True, + num_workers=self.n_workers, + shuffle=False, + ) + def prepare_batch( batch: List[Tuple[torch.Tensor, float, int, str]] @@ -214,3 +235,93 @@ def prepare_batch( [precursor_masses, precursor_charges, precursor_mzs] ).T.float() return spectra, precursors, np.asarray(spectrum_ids) + + +def prepare_psm_batch( + batch: List[Tuple[torch.Tensor, float, int, str]], + digest: List[Tuple[str, float, str]], + precursor_tolerance: float, + isotope_error: str, +): + """ + Collate MS/MS spectra into a batch for DB search. + + The MS/MS spectra will be padded so that they fit nicely as a tensor. + However, the padded elements are ignored during the subsequent steps. + + Parameters + ---------- + batch : List[Tuple[torch.Tensor, float, int, str]] + A batch of data from an AnnotatedSpectrumDataset, consisting of for each + spectrum (i) a tensor with the m/z and intensity peak values, (ii), the + precursor m/z, (iii) the precursor charge, (iv) the spectrum identifier. + digest : List[Tuple[str, float, str]] + A list of tuples containing the peptide sequence, mass, and associated protein + from digesting a .fasta file. Sorted by mass in ascending order. Uses neutral masses. 
+ precursor_tolerance : float + The precursor mass tolerance in parts-per-million. + isotope_error : str + The isotope error levels to consider. + + Returns + ------- + all_spectra : torch.Tensor of shape (batch_size, n_peaks, 2) + The padded mass spectra tensor with the m/z and intensity peak values + for each spectrum. + all_precursors : torch.Tensor of shape (batch_size, 3) + A tensor with the precursor neutral mass, precursor charge, and + precursor m/z. + all_spectrum_ids : np.ndarray + The spectrum identifiers. + all_peptides : List[str] + The candidate peptides for each spectrum. + all_proteins : List[str] + The associated proteins for each candidate peptide. + """ + spectra, precursor_mzs, precursor_charges, spectrum_ids = list(zip(*batch)) + spectra = torch.nn.utils.rnn.pad_sequence(spectra, batch_first=True) + + precursor_mzs = torch.tensor(precursor_mzs) + precursor_charges = torch.tensor(precursor_charges) + precursor_masses = (precursor_mzs - 1.007276) * precursor_charges + precursors = torch.vstack( + [precursor_masses, precursor_charges, precursor_mzs] + ).T.float() + + all_spectra = [] + all_precursors = [] + all_spectrum_ids = [] + all_peptides = [] + all_proteins = [] + for idx in range(len(batch)): + digest_data = db_utils.get_candidates( + precursor_mzs[idx], + precursor_charges[idx], + digest, + precursor_tolerance, + isotope_error, + ) + try: + spec_peptides, _, pep_protein = list(zip(*digest_data)) + all_spectra.append( + spectra[idx].unsqueeze(0).repeat(len(spec_peptides), 1, 1) + ) + all_precursors.append( + precursors[idx].unsqueeze(0).repeat(len(spec_peptides), 1) + ) + all_spectrum_ids.extend([spectrum_ids[idx]] * len(spec_peptides)) + all_peptides.extend(spec_peptides) + all_proteins.extend(pep_protein) + except ValueError: + logger.warning( + "No candidates found for spectrum %s", spectrum_ids[idx] + ) + continue + + return ( + torch.cat(all_spectra, dim=0), + torch.cat(all_precursors, dim=0), + all_spectrum_ids, + all_peptides, + all_proteins, + ) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 8bb0dbee..2256946c 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -1008,6 +1008,7 @@ class DbSpec2Pep(Spec2Pep): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) + self.total_psms = 0 def predict_step(self, batch, *args): """ @@ -1015,137 +1016,85 @@ def predict_step(self, batch, *args): Parameters ---------- - batch : Tuple[torch.Tensor, torch.Tensor, torch.Tensor] + batch : Tuple[torch.Tensor, torch.Tensor, np.array, List[str], List[str]] A batch of (i) MS/MS spectra, (ii) precursor information, (iii) - spectrum identifiers as torch Tensors. + spectrum identifiers, (iv) candidate peptides, (v) associated proteins. Returns ------- - predictions: List[Tuple[int, int, float, str, np.ndarray, np.ndarray]] + predictions: List[Tuple[int, int, float, str, np.ndarray, np.ndarray, str]] Model predictions for the given batch of spectra containing spectrum ids, precursor charge and m/z, candidate peptide sequences, peptide - scores, and amino acid-level scores. + scores, amino acid-level scores, and associated proteins. 
""" - batch_res = [] + predictions = [] + pred, truth = self.decoder(batch[3], batch[1], *self.encoder(batch[0])) + pred = self.softmax(pred) + all_scores, per_aa_scores = _calc_match_score( + pred, truth, self.decoder.reverse + ) for ( + precursor_charge, + precursor_mz, spectrum_i, - peptides, - precursors, - encoded_ms, - ) in self.smart_batch_gen(batch): - pred, truth = self.decoder(peptides, precursors, *encoded_ms) - pred = self.softmax(pred) - peptide_scores, aa_scores = _calc_match_score( - pred, truth, self.decoder.reverse - ) - precursor_info = precursors.cpu().detach().numpy() - precursor_charge = precursor_info[:, 1] - precursor_mz = precursor_info[:, 2] - batch_res.append( + peptide_score, + aa_scores, + peptide, + protein, + ) in zip( + batch[1][:, 1].cpu().detach().numpy(), + batch[1][:, 2].cpu().detach().numpy(), + batch[2], + all_scores.cpu().detach().numpy(), + per_aa_scores.cpu().detach().numpy(), + batch[3], + batch[4], + ): + predictions.append( ( spectrum_i, precursor_charge, precursor_mz, - peptides, - peptide_scores.cpu().detach().numpy(), - aa_scores.cpu().detach().numpy(), - ) - ) - return batch_res - - def smart_batch_gen(self, spectrum_batch): - """ - Transforms a batch of spectra into multiple equally-sized batches of PSMs. - - Parameters - ---------- - spectrum batch : Tuple[torch.Tensor, torch.Tensor, torch.Tensor] - A batch of (i) MS/MS spectra, (ii) precursor information, (iii) - spectrum identifiers as torch Tensors. - - Yields - ------- - psm_batch: Tuple[List[int], List[str], torch.Tensor, Tuple[torch.Tensor, torch.Tensor]] - A batch of PSMs containing the spectrum index, peptide sequence, - precursor information, and encoded MS/MS spectra. - """ - all_psm = [] - batch_size = len(spectrum_batch[0]) - enc = self.encoder(spectrum_batch[0]) - enc = list(zip(*enc)) - precursors = spectrum_batch[1] - indexes = spectrum_batch[2] - for idx in range(batch_size): - digest_data = db_utils.get_candidates( - precursors[idx][2], - precursors[idx][1], - self.digest, - self.precursor_tolerance, - self.isotope_error, - ) - try: - spec_peptides, pep_masses, pep_protein = list( - zip(*digest_data) - ) - except ValueError: - logger.info("No peptides found for spectrum %s", indexes[idx]) - continue - spec_precursors = [precursors[idx]] * len(spec_peptides) - spec_enc = [enc[idx]] * len(spec_peptides) - spec_idx = [indexes[idx]] * len(spec_peptides) - all_psm.extend( - list( - zip( - spec_enc, - spec_precursors, - spec_peptides, - spec_idx, - ) + peptide, + peptide_score, + aa_scores, + protein, ) ) - # Continually grab num_pairs items from all_psm until list is exhausted - while len(all_psm) > 0: - psm_batch = all_psm[:batch_size] - all_psm = all_psm[batch_size:] - psm_batch = list(zip(*psm_batch)) - encoded_ms = ( - torch.stack([a[0] for a in psm_batch[0]]), - torch.stack([a[1] for a in psm_batch[0]]), - ) - prec_data = torch.stack(psm_batch[1]) - pep_str = list(psm_batch[2]) - indexes = [a[1] for a in psm_batch[3]] - yield (indexes, pep_str, prec_data, encoded_ms) + self.total_psms += len(predictions) + return predictions def on_predict_batch_end( self, outputs: List[Tuple[np.ndarray, List[str], torch.Tensor]], *args, ) -> None: - if self.out_writer is None: - return + """ + Write the database search results to the output file. 
+ """ for ( spectrum_i, - precursor_charge, + charge, precursor_mz, - peptides, - peptide_scores, + peptide, + peptide_score, aa_scores, + protein, ) in outputs: - calc_mz = [ - self.peptide_mass_calculator.mass(peptide, charge) - for peptide, charge in zip(peptides, precursor_charge) - ] - for row in zip( - peptides, - peptide_scores, - precursor_charge, - precursor_mz, - calc_mz, - spectrum_i, - aa_scores, - ): - self.out_writer.psms.append(row) + if len(peptide) == 0: + continue + self.out_writer.psms.append( + ( + peptide, + tuple(spectrum_i), + peptide_score, + charge, + precursor_mz, + self.peptide_mass_calculator.mass(peptide, charge), + ",".join(list(map("{:.5f}".format, aa_scores))), + protein, + ), + ) def _calc_match_score( diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 1457df38..3286f4b8 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -10,6 +10,8 @@ from pathlib import Path from typing import Iterable, List, Optional, Union +import time + import lightning.pytorch as pl import numpy as np import torch @@ -124,19 +126,21 @@ def db_search( ------- self """ - self.writer = ms_io.MztabWriter( - Path(output).with_suffix(".mztab"), is_db_variant=True - ) + self.writer = ms_io.MztabWriter(Path(output).with_suffix(".mztab")) self.writer.set_metadata( self.config, model=str(self.model_filename), config_filename=self.config.file, ) - self.initialize_trainer(train=True) self.initialize_model(train=False, db_search=True) self.model.out_writer = self.writer - self.model.digest = db_utils.digest_fasta( + test_index = self._get_index(peak_path, False, "db search") + self.writer.set_ms_run(test_index.ms_files) + + self.initialize_data_module(test_index=test_index) + self.loaders.setup(stage="test", annotated=False) + self.loaders.digest = db_utils.digest_fasta( fasta_path, enzyme, digestion, @@ -145,14 +149,16 @@ def db_search( min_length, max_length, ) - self.model.precursor_tolerance = precursor_tolerance - self.model.isotope_error = isotope_error - - test_index = self._get_index(peak_path, False, "db search") - self.writer.set_ms_run(test_index.ms_files) - self.initialize_data_module(test_index=test_index) - self.loaders.setup(stage="test", annotated=False) - self.trainer.predict(self.model, self.loaders.predict_dataloader()) + self.loaders.precursor_tolerance = precursor_tolerance + self.loaders.isotope_error = isotope_error + + t1 = time.time() + self.trainer.predict(self.model, self.loaders.db_dataloader()) + t2 = time.time() + logger.info("Database search took %.3f seconds", t2 - t1) + logger.info("Scored %s PSMs", self.model.total_psms) + logger.info("%.3f PSMs per second", self.model.total_psms / (t2 - t1)) + logger.info("%s seconds per PSM", (t2 - t1) / self.model.total_psms) def train( self, From 35ba7d497cbc0c044ca5e13fd8e6e09162f77590 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 4 Jul 2024 21:50:44 +0000 Subject: [PATCH 07/21] Generate new screengrabs with rich-codex --- docs/images/configure-help.svg | 154 +++++++++++++++------- docs/images/evaluate-help.svg | 182 ++++++++++++++------------ docs/images/help.svg | 224 ++++++++++++++------------------ docs/images/sequence-help.svg | 182 ++++++++++++++------------ docs/images/train-help.svg | 228 ++++++++++++++------------------- 5 files changed, 493 insertions(+), 477 deletions(-) diff --git a/docs/images/configure-help.svg b/docs/images/configure-help.svg index 0822927a..4092bce3 100644 --- a/docs/images/configure-help.svg +++ 
b/docs/images/configure-help.svg
[Auto-generated rich-codex screengrab SVGs: the diffs for docs/images/configure-help.svg, evaluate-help.svg, help.svg, sequence-help.svg, and train-help.svg replace the previous renderings of each command's CLI help text. Every regenerated image instead captures a Python traceback raised while importing depthcharge (via tensorboard), ending in: AttributeError: `np.string_` was removed in the NumPy 2.0 release. Use `np.bytes_` instead.]

From f8a1a8964f929b793cd58844072d76656b4ac0f1 Mon Sep 17 00:00:00 2001
From: VarunAnanth2003
Date: Mon, 8 Jul 2024 12:14:52 -0700
Subject: [PATCH 08/21] fix batching issues

---
 casanovo/denovo/model.py        | 71 ++++++++++++++++++---------------
 casanovo/denovo/model_runner.py |  1 +
 2 files changed, 40 insertions(+), 32 deletions(-)

diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py
index 2256946c..3a069dcd 100644
--- a/casanovo/denovo/model.py
+++ b/casanovo/denovo/model.py
@@ -1009,6 +1009,7 @@ class DbSpec2Pep(Spec2Pep):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.total_psms = 0
+        self.psm_batch_size = 1024
 
     def predict_step(self, batch, *args):
         """
@@ -1028,39 +1029,45 @@ def predict_step(self, batch, *args):
         scores, amino acid-level scores, and associated proteins.
""" predictions = [] - pred, truth = self.decoder(batch[3], batch[1], *self.encoder(batch[0])) - pred = self.softmax(pred) - all_scores, per_aa_scores = _calc_match_score( - pred, truth, self.decoder.reverse - ) - for ( - precursor_charge, - precursor_mz, - spectrum_i, - peptide_score, - aa_scores, - peptide, - protein, - ) in zip( - batch[1][:, 1].cpu().detach().numpy(), - batch[1][:, 2].cpu().detach().numpy(), - batch[2], - all_scores.cpu().detach().numpy(), - per_aa_scores.cpu().detach().numpy(), - batch[3], - batch[4], - ): - predictions.append( - ( - spectrum_i, - precursor_charge, - precursor_mz, - peptide, - peptide_score, - aa_scores, - protein, - ) + while len(batch[0]) > 0: + next_batch = [b[self.psm_batch_size :] for b in batch] + batch = [b[: self.psm_batch_size] for b in batch] + pred, truth = self.decoder( + batch[3], batch[1], *self.encoder(batch[0]) ) + pred = self.softmax(pred) + all_scores, per_aa_scores = _calc_match_score( + pred, truth, self.decoder.reverse + ) + for ( + precursor_charge, + precursor_mz, + spectrum_i, + peptide_score, + aa_scores, + peptide, + protein, + ) in zip( + batch[1][:, 1].cpu().detach().numpy(), + batch[1][:, 2].cpu().detach().numpy(), + batch[2], + all_scores.cpu().detach().numpy(), + per_aa_scores.cpu().detach().numpy(), + batch[3], + batch[4], + ): + predictions.append( + ( + spectrum_i, + precursor_charge, + precursor_mz, + peptide, + peptide_score, + aa_scores, + protein, + ) + ) + batch = next_batch self.total_psms += len(predictions) return predictions diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 3286f4b8..a6b59ed9 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -135,6 +135,7 @@ def db_search( self.initialize_trainer(train=True) self.initialize_model(train=False, db_search=True) self.model.out_writer = self.writer + self.model.psm_batch_size = self.config.predict_batch_size test_index = self._get_index(peak_path, False, "db search") self.writer.set_ms_run(test_index.ms_files) From 7cb8e141ccab5b865a3af00711d290cd6cab788d Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Mon, 12 Aug 2024 14:50:18 -0700 Subject: [PATCH 09/21] small fixes regarding documentation, import syntax, etc. --- casanovo/casanovo.py | 39 ++++++---- casanovo/data/db_utils.py | 71 +++++++++-------- casanovo/denovo/dataloaders.py | 10 +-- casanovo/denovo/model.py | 31 ++++---- casanovo/denovo/model_runner.py | 24 ++---- tests/conftest.py | 11 +-- tests/unit_tests/test_unit.py | 132 +++++++++++--------------------- 7 files changed, 137 insertions(+), 181 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 8ae9a81b..4b9b4e38 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -130,7 +130,7 @@ def sequence( ) -> None: """De novo sequence peptides from tandem mass spectra. - PEAK_PATH must be one or more mzMl, mzXML, or MGF files from which + PEAK_PATH must be one or more mzML, mzXML, or MGF files from which to sequence peptides. """ output = setup_logging(output, verbosity) @@ -205,7 +205,7 @@ def sequence( ) @click.option( "--digestion", - help="Digestion: full, partial", + help="Full: standard digestion. 
Semi: Include products of semi-specific cleavage", type=click.Choice( ["full", "partial"], case_sensitive=False, @@ -214,37 +214,41 @@ def sequence( ) @click.option( "--missed_cleavages", - help="Number of allowed missed cleavages", + help="Number of allowed missed cleavages when digesting protein", type=int, default=0, ) @click.option( "--max_mods", - help="Maximum number of modifications per peptide", + help="Maximum number of amino acid modifications per peptide", type=int, default=0, ) @click.option( - "--min_length", - help="Minimum peptide length", + "--min_peptide_length", + help="Minimum peptide length to consider", type=int, default=6, ) @click.option( - "--max_length", - help="Maximum peptide length", + "--max_peptide_length", + help="Maximum peptide length to consider", type=int, default=50, ) @click.option( "--precursor_tolerance", - help="Precursor tolerance window size (ppm)", - type=int, + help="Precursor tolerance window size (units: ppm)", + type=float, default=20, ) @click.option( "--isotope_error", - help="Isotope error levels to consider (list of ints, e.g: 1,2)", + help="Isotope error levels to consider. \ + Creates multiple mass windows to consider per spectrum \ + to account for observed mass not matching monoisotopic mass \ + due to the instrument assigning the 13C isotope \ + peak as the precursor (list of ints, e.g: 1,2)", type=str, default="0", ) @@ -255,9 +259,9 @@ def db_search( digestion: str, missed_cleavages: int, max_mods: int, - min_length: int, - max_length: int, - precursor_tolerance: int, + min_peptide_length: int, + max_peptide_length: int, + precursor_tolerance: float, isotope_error: str, model: Optional[str], config: Optional[str], @@ -266,7 +270,8 @@ def db_search( ) -> None: """Perform a database search on MS/MS data using Casanovo-DB. - PEAK_PATH must be one MGF file. FASTA_PATH must be one FASTA file. + PEAK_PATH must be one or more mzML, mzXML, or MGF files. + FASTA_PATH must be one FASTA file. """ output = setup_logging(output, verbosity) config, model = setup_model(model, config, output, False) @@ -284,8 +289,8 @@ def db_search( digestion, missed_cleavages, max_mods, - min_length, - max_length, + min_peptide_length, + max_peptide_length, precursor_tolerance, isotope_error, output, diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index 921c75bd..1af09a47 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -1,15 +1,16 @@ """Unique methods used within db-search mode""" -import os -import depthcharge.masses -from pyteomics import fasta, parser import bisect import logging - +import os from typing import List, Tuple +import depthcharge.masses +from pyteomics import fasta, parser + logger = logging.getLogger("casanovo") + # CONSTANTS HYDROGEN = 1.007825035 OXYGEN = 15.99491463 @@ -51,8 +52,8 @@ def digest_fasta( digestion: str, missed_cleavages: int, max_mods: int, - min_length: int, - max_length: int, + min_peptide_length: int, + max_peptide_length: int, ): """ Digests a FASTA file and returns the peptides, their masses, and associated protein. @@ -70,9 +71,9 @@ def digest_fasta( The number of missed cleavages to allow. max_mods : int The maximum number of modifications to allow per peptide. - min_length : int + min_peptide_length : int The minimum length of peptides to consider. - max_length : int + max_peptide_length : int The maximum length of peptides to consider. Returns @@ -81,35 +82,36 @@ def digest_fasta( A list of tuples containing the peptide sequence, mass, and associated protein. 
Sorted by neutral mass in ascending order. """ - - # Verify the eistence of the file: + # Verify the existence of the file: if not os.path.isfile(fasta_filename): - print(f"File {fasta_filename} does not exist.") + logger.error("File %s does not exist.", fasta_filename) raise FileNotFoundError(f"File {fasta_filename} does not exist.") fasta_data = fasta.read(fasta_filename) peptide_list = [] - if digestion in ["full", "partial"]: - semi = True if digestion == "partial" else False - for header, seq in fasta_data: - pep_set = parser.cleave( - seq, - rule=parser.expasy_rules[enzyme], - missed_cleavages=missed_cleavages, - semi=semi, - ) - protein = header.split()[0] - for pep in pep_set: - if len(pep) < min_length or len(pep) > max_length: - continue - if "X" in pep or "U" in pep: - logger.warn( - "Skipping peptide with ambiguous amino acids: %s", pep - ) - continue - peptide_list.append((pep, protein)) - else: + if digestion not in ["full", "partial"]: + logger.error("Digestion type %s not recognized.", digestion) raise ValueError(f"Digestion type {digestion} not recognized.") + semi = digestion == "partial" + for header, seq in fasta_data: + pep_set = parser.cleave( + seq, + rule=parser.expasy_rules[enzyme], + missed_cleavages=missed_cleavages, + semi=semi, + ) + protein = header.split()[0] + for pep in pep_set: + if len(pep) < min_peptide_length or len(pep) > max_peptide_length: + continue + if any( + aa in pep for aa in "BJOUXZ" + ): # Check for incorrect AA letters + logger.warn( + "Skipping peptide with ambiguous amino acids: %s", pep + ) + continue + peptide_list.append((pep, protein)) # Generate modified peptides mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb") @@ -136,7 +138,7 @@ def get_candidates( precursor_mz: float, charge: int, peptide_list: List[Tuple[str, float, str]], - precursor_tolerance: int, + precursor_tolerance: float, isotope_error: str, ): """ @@ -156,7 +158,6 @@ def get_candidates( isotope_error : str The isotope error levels to consider. """ - candidates = set() isotope_error = [int(x) for x in isotope_error.split(",")] @@ -219,7 +220,9 @@ def _to_raw_mass(mz_mass, charge): def get_mass_indices(masses, m_low, m_high): - """Grabs mass indices from a list of mass values that fall within a specified range. + """Grabs mass indices that fall within a specified range. + + Pulls from masses, a list of mass values. Requires that the mass values are sorted in ascending order. 
Parameters diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 80a4f7dc..14a0ff99 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -2,20 +2,20 @@ import functools import os -from typing import List, Optional, Tuple -from functools import partial import logging +from typing import List, Optional, Tuple +from depthcharge.data import AnnotatedSpectrumIndex import lightning.pytorch as pl import numpy as np import torch -from depthcharge.data import AnnotatedSpectrumIndex +from ..data import db_utils from ..data.datasets import ( AnnotatedSpectrumDataset, SpectrumDataset, ) -from ..data import db_utils + logger = logging.getLogger("casanovo") @@ -186,7 +186,7 @@ def db_dataloader(self) -> torch.utils.data.DataLoader: return torch.utils.data.DataLoader( self.test_dataset, batch_size=self.eval_batch_size, - collate_fn=partial( + collate_fn=functools.partial( prepare_psm_batch, digest=self.digest, precursor_tolerance=self.precursor_tolerance, diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 3a069dcd..79848682 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -16,7 +16,7 @@ from . import evaluate from .. import config -from ..data import ms_io, db_utils +from ..data import ms_io logger = logging.getLogger("casanovo") @@ -991,7 +991,8 @@ def configure_optimizers( class DbSpec2Pep(Spec2Pep): """ - Subclass of Spec2Pep for the use of Casanovo as an MS/MS database search score function. + Subclass of Spec2Pep for the use of Casanovo as an \ + MS/MS database search score function. Uses teacher forcing to 'query' Casanovo for its score for each AA within a candidate peptide, and takes the geometric average of these scores @@ -1008,7 +1009,6 @@ class DbSpec2Pep(Spec2Pep): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.total_psms = 0 self.psm_batch_size = 1024 def predict_step(self, batch, *args): @@ -1029,11 +1029,14 @@ def predict_step(self, batch, *args): scores, amino acid-level scores, and associated proteins. 
""" predictions = [] - while len(batch[0]) > 0: - next_batch = [b[self.psm_batch_size :] for b in batch] - batch = [b[: self.psm_batch_size] for b in batch] + for start_idx in range(0, len(batch[0]), self.psm_batch_size): + current_batch = [ + b[start_idx : start_idx + self.psm_batch_size] for b in batch + ] pred, truth = self.decoder( - batch[3], batch[1], *self.encoder(batch[0]) + current_batch[3], + current_batch[1], + *self.encoder(current_batch[0]), ) pred = self.softmax(pred) all_scores, per_aa_scores = _calc_match_score( @@ -1048,13 +1051,13 @@ def predict_step(self, batch, *args): peptide, protein, ) in zip( - batch[1][:, 1].cpu().detach().numpy(), - batch[1][:, 2].cpu().detach().numpy(), - batch[2], + current_batch[1][:, 1].cpu().detach().numpy(), + current_batch[1][:, 2].cpu().detach().numpy(), + current_batch[2], all_scores.cpu().detach().numpy(), per_aa_scores.cpu().detach().numpy(), - batch[3], - batch[4], + current_batch[3], + current_batch[4], ): predictions.append( ( @@ -1067,8 +1070,6 @@ def predict_step(self, batch, *args): protein, ) ) - batch = next_batch - self.total_psms += len(predictions) return predictions def on_predict_batch_end( @@ -1088,8 +1089,6 @@ def on_predict_batch_end( aa_scores, protein, ) in outputs: - if len(peptide) == 0: - continue self.out_writer.psms.append( ( peptide, diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index a6b59ed9..c2b71098 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -10,8 +10,6 @@ from pathlib import Path from typing import Iterable, List, Optional, Union -import time - import lightning.pytorch as pl import numpy as np import torch @@ -20,7 +18,7 @@ from lightning.pytorch.callbacks import ModelCheckpoint from ..config import Config -from ..data import ms_io, db_utils +from ..data import db_utils, ms_io from ..denovo.dataloaders import DeNovoDataModule from ..denovo.model import Spec2Pep, DbSpec2Pep @@ -89,8 +87,8 @@ def db_search( digestion: str, missed_cleavages: int, max_mods: int, - min_length: int, - max_length: int, + min_peptide_length: int, + max_peptide_length: int, precursor_tolerance: float, isotope_error: str, output: str, @@ -100,7 +98,7 @@ def db_search( Parameters ---------- peak_path : Iterable[str] - The path to the .mgf data file for database search. + The paths to the .mgf data files for database search. fasta_path : str The path to the FASTA file for database search. enzyme : str @@ -111,9 +109,9 @@ def db_search( The number of missed cleavages allowed. max_mods : int The maximum number of modifications allowed per peptide. - min_length : int + min_peptide_length : int The minimum peptide length. - max_length : int + max_peptide_length : int The maximum peptide length. precursor_tolerance : float The precursor mass tolerance in ppm. 
@@ -147,19 +145,13 @@ def db_search( digestion, missed_cleavages, max_mods, - min_length, - max_length, + min_peptide_length, + max_peptide_length, ) self.loaders.precursor_tolerance = precursor_tolerance self.loaders.isotope_error = isotope_error - t1 = time.time() self.trainer.predict(self.model, self.loaders.db_dataloader()) - t2 = time.time() - logger.info("Database search took %.3f seconds", t2 - t1) - logger.info("Scored %s PSMs", self.model.total_psms) - logger.info("%.3f PSMs per second", self.model.total_psms / (t2 - t1)) - logger.info("%s seconds per PSM", (t2 - t1) / self.model.total_psms) def train( self, diff --git a/tests/conftest.py b/tests/conftest.py index b2244308..60afcd83 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -17,19 +17,16 @@ def mgf_small(tmp_path): @pytest.fixture -def tiny_fasta_file(tmp_path, fasta_raw_data): +def tiny_fasta_file(tmp_path): fasta_file = tmp_path / "tiny_fasta.fasta" with fasta_file.open("w+") as fasta_ref: - fasta_ref.write(fasta_raw_data) + fasta_ref.write( + ">foo\nMEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGERVTLSCRASQSVSSSYLTWYQQKPGQAPRLLIYGASTRATSIPARFSGSGSGTDFTLTISSLQPEDFAVYYCQQDYNLP" + ) return fasta_file -@pytest.fixture -def fasta_raw_data(): - return ">foo\nMEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGERVTLSCRASQSVSSSYLTWYQQKPGQAPRLLIYGASTRATSIPARFSGSGSGTDFTLTISSLQPEDFAVYYCQQDYNLP" - - @pytest.fixture def mgf_db_search(tmp_path): """An MGF file with 7 spectra and scan numbers, C+57.021 mass modification considered""" diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index e3707917..419cf3ef 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -2,6 +2,7 @@ import heapq import os import platform +import re import shutil import tempfile @@ -10,11 +11,10 @@ import numpy as np import pytest import torch -import re from casanovo import casanovo from casanovo import utils -from casanovo.data import ms_io, db_utils +from casanovo.data import db_utils, ms_io from casanovo.data.datasets import SpectrumDataset, AnnotatedSpectrumDataset from casanovo.denovo.evaluate import aa_match_batch, aa_match_metrics from casanovo.denovo.model import Spec2Pep, _aa_pep_score, _calc_match_score @@ -220,10 +220,7 @@ def test_calc_match_score(): assert np.sum(masked_per_aa_scores.numpy()[3]) == 3 -def test_digest_fasta_cleave(fasta_raw_data): - - with open("temp_fasta", "w") as file: - file.write(fasta_raw_data) +def test_digest_fasta_cleave(tiny_fasta_file): # No missed cleavages expected_normal = [ @@ -275,49 +272,24 @@ def test_digest_fasta_cleave(fasta_raw_data): "EIVMTQSPPTLSLSPGERVTLSC+57.021RASQSVSSSYLTWYQQKPGQAPR", "LLIYGASTRATSIPARFSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", ] + for missed_cleavages, expected in zip( + (0, 1, 3), + (expected_normal, expected_1missedcleavage, expected_3missedcleavage), + ): + peptide_list = db_utils.digest_fasta( + fasta_filename=str(tiny_fasta_file), + enzyme="trypsin", + digestion="full", + missed_cleavages=missed_cleavages, + max_mods=0, + min_peptide_length=6, + max_peptide_length=50, + ) + peptide_list = [x[0] for x in peptide_list] + assert peptide_list == expected - peptide_list = db_utils.digest_fasta( - fasta_filename="temp_fasta", - enzyme="trypsin", - digestion="full", - missed_cleavages=0, - max_mods=0, - min_length=6, - max_length=50, - ) - peptide_list = [x[0] for x in peptide_list] - assert peptide_list == expected_normal - - peptide_list = db_utils.digest_fasta( - fasta_filename="temp_fasta", - enzyme="trypsin", - digestion="full", - missed_cleavages=1, - 
max_mods=0, - min_length=6, - max_length=50, - ) - peptide_list = [x[0] for x in peptide_list] - assert peptide_list == expected_1missedcleavage - - peptide_list = db_utils.digest_fasta( - fasta_filename="temp_fasta", - enzyme="trypsin", - digestion="full", - missed_cleavages=3, - max_mods=0, - min_length=6, - max_length=50, - ) - peptide_list = [x[0] for x in peptide_list] - assert peptide_list == expected_3missedcleavage - - -def test_digest_fasta_mods(fasta_raw_data): - - with open("temp_fasta", "w") as file: - file.write(fasta_raw_data) +def test_digest_fasta_mods(tiny_fasta_file): # 1 modification allowed # fixed: C+57.02146 # variable: 1M+15.994915,1N+0.984016,1Q+0.984016 @@ -373,13 +345,13 @@ def test_digest_fasta_mods(fasta_raw_data): ] peptide_list = db_utils.digest_fasta( - fasta_filename="temp_fasta", + fasta_filename=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=0, max_mods=1, - min_length=6, - max_length=50, + min_peptide_length=6, + max_peptide_length=50, ) peptide_list = [x[0] for x in peptide_list] peptide_list = [ @@ -392,11 +364,7 @@ def test_digest_fasta_mods(fasta_raw_data): assert peptide_list == expected_1mod -def test_length_restrictions(fasta_raw_data): - - with open("temp_fasta", "w") as file: - file.write(fasta_raw_data) - +def test_length_restrictions(tiny_fasta_file): # length between 20 and 50 expected_long = [ "MEAPAQLLFLLLLWLPDTTR", @@ -408,35 +376,31 @@ def test_length_restrictions(fasta_raw_data): expected_short = ["ATSIPAR", "VTLSC+57.021R"] peptide_list = db_utils.digest_fasta( - fasta_filename="temp_fasta", + fasta_filename=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=0, max_mods=0, - min_length=20, - max_length=50, + min_peptide_length=20, + max_peptide_length=50, ) peptide_list = [x[0] for x in peptide_list] assert peptide_list == expected_long peptide_list = db_utils.digest_fasta( - fasta_filename="temp_fasta", + fasta_filename=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=0, max_mods=0, - min_length=6, - max_length=8, + min_peptide_length=6, + max_peptide_length=8, ) peptide_list = [x[0] for x in peptide_list] assert peptide_list == expected_short -def test_digest_fasta_enzyme(fasta_raw_data): - - with open("temp_fasta", "w") as file: - file.write(fasta_raw_data) - +def test_digest_fasta_enzyme(tiny_fasta_file): # arg-c enzyme expected_argc = [ "ATSIPAR", @@ -452,35 +416,31 @@ def test_digest_fasta_enzyme(fasta_raw_data): expected_aspn = ["DFAVYYC+57.021QQ", "DFTLTISSLQPE", "MEAPAQLLFLLLLWLP"] peptide_list = db_utils.digest_fasta( - fasta_filename="temp_fasta", + fasta_filename=str(tiny_fasta_file), enzyme="arg-c", digestion="full", missed_cleavages=0, max_mods=0, - min_length=6, - max_length=50, + min_peptide_length=6, + max_peptide_length=50, ) peptide_list = [x[0] for x in peptide_list] assert peptide_list == expected_argc peptide_list = db_utils.digest_fasta( - fasta_filename="temp_fasta", + fasta_filename=str(tiny_fasta_file), enzyme="asp-n", digestion="full", missed_cleavages=0, max_mods=0, - min_length=6, - max_length=50, + min_peptide_length=6, + max_peptide_length=50, ) peptide_list = [x[0] for x in peptide_list] assert peptide_list == expected_aspn -def test_get_candidates(fasta_raw_data): - - with open("temp_fasta", "w") as file: - file.write(fasta_raw_data) - +def test_get_candidates(tiny_fasta_file): # precursor_window is 10000 expected_smallwindow = ["LLIYGASTR"] @@ -491,13 +451,13 @@ def test_get_candidates(fasta_raw_data): expected_widewindow = 
["ATSIPAR", "VTLSC+57.021R", "LLIYGASTR"] peptide_list = db_utils.digest_fasta( - fasta_filename="temp_fasta", + fasta_filename=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=1, max_mods=0, - min_length=6, - max_length=50, + min_peptide_length=6, + max_peptide_length=50, ) candidates = db_utils.get_candidates( @@ -511,13 +471,13 @@ def test_get_candidates(fasta_raw_data): assert expected_smallwindow == candidates peptide_list = db_utils.digest_fasta( - fasta_filename="temp_fasta", + fasta_filename=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=1, max_mods=0, - min_length=6, - max_length=50, + min_peptide_length=6, + max_peptide_length=50, ) candidates = db_utils.get_candidates( @@ -531,13 +491,13 @@ def test_get_candidates(fasta_raw_data): assert expected_midwindow == candidates peptide_list = db_utils.digest_fasta( - fasta_filename="temp_fasta", + fasta_filename=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=1, max_mods=0, - min_length=6, - max_length=50, + min_peptide_length=6, + max_peptide_length=50, ) candidates = db_utils.get_candidates( From b2f08ac307f50c4dabc458745cd79b3ec2058f35 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Mon, 19 Aug 2024 19:09:26 -0700 Subject: [PATCH 10/21] add proteindatabase --- casanovo/casanovo.py | 110 -------- casanovo/config.yaml | 36 ++- casanovo/data/datasets.py | 2 +- casanovo/data/db_utils.py | 442 +++++++++++++++++--------------- casanovo/denovo/dataloaders.py | 28 +- casanovo/denovo/model_runner.py | 45 +--- tests/conftest.py | 5 + tests/test_integration.py | 2 - tests/unit_tests/test_unit.py | 200 +++++++++------ 9 files changed, 404 insertions(+), 466 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 4b9b4e38..b153512d 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -158,111 +158,9 @@ def sequence( nargs=1, type=click.Path(exists=True, dir_okay=False), ) -@click.option( - "--enzyme", - help="Enzyme for in silico digestion, \ - See pyteomics.parser.expasy_rules for valid enzymes", - type=click.Choice( - [ - "arg-c", - "asp-n", - "bnps-skatole", - "caspase 1", - "caspase 2", - "caspase 3", - "caspase 4", - "caspase 5", - "caspase 6", - "caspase 7", - "caspase 8", - "caspase 9", - "caspase 10", - "chymotrypsin high specificity", - "chymotrypsin low specificity", - "clostripain", - "cnbr", - "enterokinase", - "factor xa", - "formic acid", - "glutamyl endopeptidase", - "granzyme b", - "hydroxylamine", - "iodosobenzoic acid", - "lysc", - "ntcb", - "pepsin ph1.3", - "pepsin ph2.0", - "proline endopeptidase", - "proteinase k", - "staphylococcal peptidase i", - "thermolysin", - "thrombin", - "trypsin", - "trypsin_exception", - ] - ), - default="trypsin", -) -@click.option( - "--digestion", - help="Full: standard digestion. 
Semi: Include products of semi-specific cleavage", - type=click.Choice( - ["full", "partial"], - case_sensitive=False, - ), - default="full", -) -@click.option( - "--missed_cleavages", - help="Number of allowed missed cleavages when digesting protein", - type=int, - default=0, -) -@click.option( - "--max_mods", - help="Maximum number of amino acid modifications per peptide", - type=int, - default=0, -) -@click.option( - "--min_peptide_length", - help="Minimum peptide length to consider", - type=int, - default=6, -) -@click.option( - "--max_peptide_length", - help="Maximum peptide length to consider", - type=int, - default=50, -) -@click.option( - "--precursor_tolerance", - help="Precursor tolerance window size (units: ppm)", - type=float, - default=20, -) -@click.option( - "--isotope_error", - help="Isotope error levels to consider. \ - Creates multiple mass windows to consider per spectrum \ - to account for observed mass not matching monoisotopic mass \ - due to the instrument assigning the 13C isotope \ - peak as the precursor (list of ints, e.g: 1,2)", - type=str, - default="0", -) def db_search( peak_path: Tuple[str], fasta_path: str, - enzyme: str, - digestion: str, - missed_cleavages: int, - max_mods: int, - min_peptide_length: int, - max_peptide_length: int, - precursor_tolerance: float, - isotope_error: str, model: Optional[str], config: Optional[str], output: Optional[str], @@ -285,14 +183,6 @@ def db_search( runner.db_search( peak_path, fasta_path, - enzyme, - digestion, - missed_cleavages, - max_mods, - min_peptide_length, - max_peptide_length, - precursor_tolerance, - isotope_error, output, ) diff --git a/casanovo/config.yaml b/casanovo/config.yaml index c7186ff7..860cfabb 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -5,18 +5,26 @@ ### # The following parameters can be modified when running inference or when -# fine-tuning an existing Casanovo model. +# fine-tuning an existing Casanovo model. They also affect database search +# parameters when running Casanovo in DB-search mode. ### # Max absolute difference allowed with respect to observed precursor m/z. -# Predictions outside the tolerance range are assigned a negative peptide score. +# denovo: Predictions outside the tolerance range are assigned a negative peptide score. +# db-search: Used to create mas windows for candidate generation. precursor_mass_tol: 50 # ppm # Isotopes to consider when comparing predicted and observed precursor m/z's. isotope_error_range: [0, 1] -# The minimum length of predicted peptides. +# The minimum length of predicted/scored peptides. min_peptide_len: 6 -# Number of spectra in one inference batch. +# Number of spectra or psms in one inference batch. predict_batch_size: 1024 + + +### +# The following parameters are unique to Casanovo's inference/finetuning mode. +### + # Number of beams used in beam search. n_beams: 1 # Number of PSMs for each spectrum. @@ -29,6 +37,26 @@ accelerator: "auto" # number will be automatically selected for based on the chosen accelerator. devices: + +### +# The following parameters are unique to Casanovo's database search mode. +### + +# Enzyme for in silico digestion, used to generate candidate peptides. +# See pyteomics.parser.expasy_rules for valid enzymes +enzyme: "trypsin" +# Digestion type for candidate peptide generation. +# Full: standard digestion. 
Semi: Include products of semi-specific cleavage +digestion: "full" +# Number of allowed missed cleavages when digesting protein +missed_cleavages: 0 +# Maximum number of amino acid modifications per peptide. +# None generates all possible isoforms as candidates. +max_mods: +# Maximum peptide length to consider +max_peptide_len: 50 + + ### # The following parameters should only be modified if you are training a new # Casanovo model from scratch. diff --git a/casanovo/data/datasets.py b/casanovo/data/datasets.py index 6244e88f..3f05811f 100644 --- a/casanovo/data/datasets.py +++ b/casanovo/data/datasets.py @@ -1,6 +1,6 @@ """A PyTorch Dataset class for annotated spectra.""" -from typing import Optional, Tuple +from typing import List, Optional, Tuple import depthcharge import numpy as np diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index 1af09a47..a7b5e850 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -6,15 +6,12 @@ from typing import List, Tuple import depthcharge.masses +from numba import jit from pyteomics import fasta, parser logger = logging.getLogger("casanovo") - # CONSTANTS -HYDROGEN = 1.007825035 -OXYGEN = 15.99491463 -H2O = 2 * HYDROGEN + OXYGEN PROTON = 1.00727646677 ISOTOPE_SPACING = 1.003355 @@ -29,216 +26,243 @@ fixed_mods = {"carbm": ["C"]} -def convert_from_modx(seq: str): - """Converts peptide sequence from modX format to Casanovo-acceptable modifications. - - Args: - seq (str): Peptide in modX format - """ - seq = seq.replace("carbmC", "C+57.021") # Fixed modification - seq = seq.replace("oxM", "M+15.995") - seq = seq.replace("dN", "N+0.984") - seq = seq.replace("dQ", "Q+0.984") - seq = seq.replace("ace-", "+42.011") - seq = seq.replace("carbnh3x-", "+43.006-17.027") - seq = seq.replace("carb-", "+43.006") - seq = seq.replace("nh3x-", "-17.027") - return seq - - -def digest_fasta( - fasta_filename: str, - enzyme: str, - digestion: str, - missed_cleavages: int, - max_mods: int, - min_peptide_length: int, - max_peptide_length: int, -): - """ - Digests a FASTA file and returns the peptides, their masses, and associated protein. - - Parameters - ---------- - fasta_filename : str - Path to the FASTA file. - enzyme : str - The enzyme to use for digestion. - See pyteomics.parser.expasy_rules for valid enzymes. - digestion : str - The type of digestion to perform. Either 'full' or 'partial'. - missed_cleavages : int - The number of missed cleavages to allow. - max_mods : int - The maximum number of modifications to allow per peptide. - min_peptide_length : int - The minimum length of peptides to consider. - max_peptide_length : int - The maximum length of peptides to consider. - - Returns - ------- - mod_peptide_list : List[Tuple[str, float, str]] - A list of tuples containing the peptide sequence, mass, - and associated protein. Sorted by neutral mass in ascending order. 
- """ - # Verify the existence of the file: - if not os.path.isfile(fasta_filename): - logger.error("File %s does not exist.", fasta_filename) - raise FileNotFoundError(f"File {fasta_filename} does not exist.") - - fasta_data = fasta.read(fasta_filename) - peptide_list = [] - if digestion not in ["full", "partial"]: - logger.error("Digestion type %s not recognized.", digestion) - raise ValueError(f"Digestion type {digestion} not recognized.") - semi = digestion == "partial" - for header, seq in fasta_data: - pep_set = parser.cleave( - seq, - rule=parser.expasy_rules[enzyme], - missed_cleavages=missed_cleavages, - semi=semi, - ) - protein = header.split()[0] - for pep in pep_set: - if len(pep) < min_peptide_length or len(pep) > max_peptide_length: - continue - if any( - aa in pep for aa in "BJOUXZ" - ): # Check for incorrect AA letters - logger.warn( - "Skipping peptide with ambiguous amino acids: %s", pep - ) - continue - peptide_list.append((pep, protein)) - - # Generate modified peptides - mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb") - mod_peptide_list = [] - for pep, prot in peptide_list: - peptide_isoforms = parser.isoforms( - pep, - variable_mods=var_mods, - fixed_mods=fixed_mods, - max_mods=max_mods, - ) - peptide_isoforms = list(map(convert_from_modx, peptide_isoforms)) - mod_peptide_list.extend( - (mod_pep, mass_calculator.mass(mod_pep), prot) - for mod_pep in peptide_isoforms - ) - - # Sort the peptides by mass and return. - mod_peptide_list.sort(key=lambda x: x[1]) - return mod_peptide_list - - -def get_candidates( - precursor_mz: float, - charge: int, - peptide_list: List[Tuple[str, float, str]], - precursor_tolerance: float, - isotope_error: str, -): +class ProteinDatabase: """ - Returns a list of candidate peptides that fall within the specified mass range. + TODO Parameters ---------- - precursor_mz : float - The precursor mass-to-charge ratio. - charge : int - The precursor charge. - peptide_list : List[Tuple[str, float, str]] - A list of tuples containing the peptide sequence, mass, and associated protein. - Must be sorted by mass in ascending order. Uses neutral masses. - precursor_tolerance : float - The precursor mass tolerance in parts-per-million. - isotope_error : str - The isotope error levels to consider. + TODO """ - candidates = set() - isotope_error = [int(x) for x in isotope_error.split(",")] - for e in isotope_error: - iso_shift = ISOTOPE_SPACING * e - upper_bound = (_to_raw_mass(precursor_mz, charge) - iso_shift) * ( - 1 + (precursor_tolerance / 1e6) - ) - lower_bound = (_to_raw_mass(precursor_mz, charge) - iso_shift) * ( - 1 - (precursor_tolerance / 1e6) + def __init__( + self, + fasta_path: str, + enzyme: str, + digestion: str, + missed_cleavages: int, + min_peptide_len: int, + max_peptide_len: int, + max_mods: int, + precursor_tolerance: float, + isotope_error: List[int], + ): + self.digest = self._digest_fasta( + fasta_path, + enzyme, + digestion, + missed_cleavages, + max_mods, + min_peptide_len, + max_peptide_len, ) - - start, end = get_mass_indices( - [x[1] for x in peptide_list], lower_bound, upper_bound + self.precursor_tolerance = precursor_tolerance + self.isotope_error = isotope_error + + def get_candidates( + self, + precursor_mz: float, + charge: int, + ): + """ + Returns a list of candidate peptides that fall within the specified mass range. + + Parameters + ---------- + precursor_mz : float + The precursor mass-to-charge ratio. + charge : int + The precursor charge. 
+ """ + candidates = set() + + for e in self.isotope_error: + iso_shift = ISOTOPE_SPACING * e + upper_bound = ( + ProteinDatabase._to_raw_mass(precursor_mz, charge) - iso_shift + ) * (1 + (self.precursor_tolerance / 1e6)) + lower_bound = ( + ProteinDatabase._to_raw_mass(precursor_mz, charge) - iso_shift + ) * (1 - (self.precursor_tolerance / 1e6)) + + start, end = ProteinDatabase._get_mass_indices( + [x[1] for x in self.digest], lower_bound, upper_bound + ) + + candidates.update(self.digest[start:end]) + + candidates = list(candidates) + candidates.sort(key=lambda x: x[1]) + return candidates + + def _digest_fasta( + self, + fasta_filename: str, + enzyme: str, + digestion: str, + missed_cleavages: int, + max_mods: int, + min_peptide_length: int, + max_peptide_length: int, + ): + """ + Digests a FASTA file and returns the peptides, their masses, and associated protein. + + Parameters + ---------- + fasta_filename : str + Path to the FASTA file. + enzyme : str + The enzyme to use for digestion. + See pyteomics.parser.expasy_rules for valid enzymes. + digestion : str + The type of digestion to perform. Either 'full' or 'partial'. + missed_cleavages : int + The number of missed cleavages to allow. + max_mods : int + The maximum number of modifications to allow per peptide. + min_peptide_length : int + The minimum length of peptides to consider. + max_peptide_length : int + The maximum length of peptides to consider. + + Returns + ------- + mod_peptide_list : List[Tuple[str, float, str]] + A list of tuples containing the peptide sequence, mass, + and associated protein. Sorted by neutral mass in ascending order. + """ + # Verify the existence of the file: + if not os.path.isfile(fasta_filename): + logger.error("File %s does not exist.", fasta_filename) + raise FileNotFoundError(f"File {fasta_filename} does not exist.") + + fasta_data = fasta.read(fasta_filename) + peptide_list = [] + if digestion not in ["full", "partial"]: + logger.error("Digestion type %s not recognized.", digestion) + raise ValueError(f"Digestion type {digestion} not recognized.") + semi = digestion == "partial" + for header, seq in fasta_data: + pep_set = parser.cleave( + seq, + rule=parser.expasy_rules[enzyme], + missed_cleavages=missed_cleavages, + semi=semi, + ) + protein = header.split()[0] + for pep in pep_set: + if ( + len(pep) < min_peptide_length + or len(pep) > max_peptide_length + ): + continue + if any( + aa in pep for aa in "BJOUXZ" + ): # Check for incorrect AA letters + logger.warn( + "Skipping peptide with ambiguous amino acids: %s", pep + ) + continue + peptide_list.append((pep, protein)) + + # Generate modified peptides + mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb") + mod_peptide_list = [] + for pep, prot in peptide_list: + peptide_isoforms = parser.isoforms( + pep, + variable_mods=var_mods, + fixed_mods=fixed_mods, + max_mods=max_mods, + ) + peptide_isoforms = list( + map(ProteinDatabase._convert_from_modx, peptide_isoforms) + ) + mod_peptide_list.extend( + (mod_pep, mass_calculator.mass(mod_pep), prot) + for mod_pep in peptide_isoforms + ) + + # Sort the peptides by mass and return. + mod_peptide_list.sort(key=lambda x: x[1]) + logger.info( + "Digestion complete. %d peptides generated.", len(mod_peptide_list) ) - - candidates.update(peptide_list[start:end]) - - candidates = list(candidates) - candidates.sort(key=lambda x: x[1]) - return candidates - - -def _to_mz(precursor_mass, charge): - """ - Convert precursor neutral mass to m/z value. 
- - Parameters - ---------- - precursor_mass : float - The precursor neutral mass. - charge : int - The precursor charge. - - Returns - ------- - mz : float - The calculated precursor mass-to-charge ratio. - """ - return (precursor_mass + (charge * PROTON)) / charge - - -def _to_raw_mass(mz_mass, charge): - """ - Convert precursor m/z value to neutral mass. - - Parameters - ---------- - mz_mass : float - The precursor mass-to-charge ratio. - charge : int - The precursor charge. - - Returns - ------- - mass : float - The calculated precursor neutral mass. - """ - return charge * (mz_mass - PROTON) - - -def get_mass_indices(masses, m_low, m_high): - """Grabs mass indices that fall within a specified range. - - Pulls from masses, a list of mass values. - Requires that the mass values are sorted in ascending order. - - Parameters - ---------- - masses : List[int] - List of mass values - m_low : int - Lower bound of mass range (inclusive) - m_high : int - Upper bound of mass range (inclusive) - - Return - ------ - indices : Tuple[int, int] - Indices of mass values that fall within the specified range - """ - start = bisect.bisect_left(masses, m_low) - end = bisect.bisect_right(masses, m_high) - return start, end + return mod_peptide_list + + def _to_mz(precursor_mass, charge): + """ + Convert precursor neutral mass to m/z value. + + Parameters + ---------- + precursor_mass : float + The precursor neutral mass. + charge : int + The precursor charge. + + Returns + ------- + mz : float + The calculated precursor mass-to-charge ratio. + """ + return (precursor_mass + (charge * PROTON)) / charge + + def _to_raw_mass(mz_mass, charge): + """ + Convert precursor m/z value to neutral mass. + + Parameters + ---------- + mz_mass : float + The precursor mass-to-charge ratio. + charge : int + The precursor charge. + + Returns + ------- + mass : float + The calculated precursor neutral mass. + """ + return charge * (mz_mass - PROTON) + + def _get_mass_indices(masses, m_low, m_high): + """Grabs mass indices that fall within a specified range. + + Pulls from masses, a list of mass values. + Requires that the mass values are sorted in ascending order. + + Parameters + ---------- + masses : List[int] + List of mass values + m_low : int + Lower bound of mass range (inclusive) + m_high : int + Upper bound of mass range (inclusive) + + Return + ------ + indices : Tuple[int, int] + Indices of mass values that fall within the specified range + """ + start = bisect.bisect_left(masses, m_low) + end = bisect.bisect_right(masses, m_high) + return start, end + + def _convert_from_modx(seq: str): + """Converts peptide sequence from modX format to Casanovo-acceptable modifications. 
+ + Args: + seq (str): Peptide in modX format + """ + seq = seq.replace("carbmC", "C+57.021") # Fixed modification + seq = seq.replace("oxM", "M+15.995") + seq = seq.replace("dN", "N+0.984") + seq = seq.replace("dQ", "Q+0.984") + seq = seq.replace("ace-", "+42.011") + seq = seq.replace("carbnh3x-", "+43.006-17.027") + seq = seq.replace("carb-", "+43.006") + seq = seq.replace("nh3x-", "-17.027") + return seq diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 14a0ff99..4d5524f4 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -89,6 +89,7 @@ def __init__( self.train_dataset = None self.valid_dataset = None self.test_dataset = None + self.pdb = None def setup(self, stage: str = None, annotated: bool = True) -> None: """ @@ -96,7 +97,7 @@ def setup(self, stage: str = None, annotated: bool = True) -> None: Parameters ---------- - stage : str {"fit", "validate", "test", "db"} + stage : str {"fit", "validate", "test"} The stage indicating which Datasets to prepare. All are prepared by default. annotated: bool @@ -186,12 +187,7 @@ def db_dataloader(self) -> torch.utils.data.DataLoader: return torch.utils.data.DataLoader( self.test_dataset, batch_size=self.eval_batch_size, - collate_fn=functools.partial( - prepare_psm_batch, - digest=self.digest, - precursor_tolerance=self.precursor_tolerance, - isotope_error=self.isotope_error, - ), + collate_fn=functools.partial(prepare_psm_batch, pdb=self.pdb), pin_memory=True, num_workers=self.n_workers, shuffle=False, @@ -239,9 +235,7 @@ def prepare_batch( def prepare_psm_batch( batch: List[Tuple[torch.Tensor, float, int, str]], - digest: List[Tuple[str, float, str]], - precursor_tolerance: float, - isotope_error: str, + pdb: db_utils.ProteinDatabase, ): """ Collate MS/MS spectra into a batch for DB search. @@ -255,13 +249,8 @@ def prepare_psm_batch( A batch of data from an AnnotatedSpectrumDataset, consisting of for each spectrum (i) a tensor with the m/z and intensity peak values, (ii), the precursor m/z, (iii) the precursor charge, (iv) the spectrum identifier. - digest : List[Tuple[str, float, str]] - A list of tuples containing the peptide sequence, mass, and associated protein - from digesting a .fasta file. Sorted by mass in ascending order. Uses neutral masses. - precursor_tolerance : float - The precursor mass tolerance in parts-per-million. - isotope_error : str - The isotope error levels to consider. + pdb : db_utils.ProteinDatabase + The protein database to use for candidate peptide retrieval. Returns ------- @@ -294,12 +283,9 @@ def prepare_psm_batch( all_peptides = [] all_proteins = [] for idx in range(len(batch)): - digest_data = db_utils.get_candidates( + digest_data = pdb.get_candidates( precursor_mzs[idx], precursor_charges[idx], - digest, - precursor_tolerance, - isotope_error, ) try: spec_peptides, _, pep_protein = list(zip(*digest_data)) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index c2b71098..b90f06b0 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -83,14 +83,6 @@ def db_search( self, peak_path: Iterable[str], fasta_path: str, - enzyme: str, - digestion: str, - missed_cleavages: int, - max_mods: int, - min_peptide_length: int, - max_peptide_length: int, - precursor_tolerance: float, - isotope_error: str, output: str, ) -> None: """Perform database search with Casanovo. @@ -101,22 +93,6 @@ def db_search( The paths to the .mgf data files for database search. 
fasta_path : str The path to the FASTA file for database search. - enzyme : str - The enzyme used for digestion. - digestion : str - The digestion type, full or partial. - missed_cleavages : int - The number of missed cleavages allowed. - max_mods : int - The maximum number of modifications allowed per peptide. - min_peptide_length : int - The minimum peptide length. - max_peptide_length : int - The maximum peptide length. - precursor_tolerance : float - The precursor mass tolerance in ppm. - isotope_error : str - Isotope error levels to consider, in comma-delineated string form. output : str Where should the output be saved? @@ -138,19 +114,18 @@ def db_search( self.writer.set_ms_run(test_index.ms_files) self.initialize_data_module(test_index=test_index) - self.loaders.setup(stage="test", annotated=False) - self.loaders.digest = db_utils.digest_fasta( + self.loaders.pdb = db_utils.ProteinDatabase( fasta_path, - enzyme, - digestion, - missed_cleavages, - max_mods, - min_peptide_length, - max_peptide_length, + self.config.enzyme, + self.config.digestion, + self.config.missed_cleavages, + self.config.min_peptide_len, + self.config.max_peptide_len, + self.config.max_mods, + self.config.precursor_mass_tol, + self.config.isotope_error_range, ) - self.loaders.precursor_tolerance = precursor_tolerance - self.loaders.isotope_error = isotope_error - + self.loaders.setup(stage="test", annotated=False) self.trainer.predict(self.model, self.loaders.db_dataloader()) def train( diff --git a/tests/conftest.py b/tests/conftest.py index 60afcd83..f20d7879 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -242,6 +242,11 @@ def tiny_config(tmp_path): "precursor_mass_tol": 5, "isotope_error_range": [0, 1], "min_peptide_len": 6, + "max_peptide_len": 50, + "enzyme": "trypsin", + "digestion": "full", + "missed_cleavages": 0, + "max_mods": None, "predict_batch_size": 1024, "n_beams": 1, "top_match": 1, diff --git a/tests/test_integration.py b/tests/test_integration.py index 4bd55174..61f735c3 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -24,8 +24,6 @@ def test_db_search( tiny_config, "--output", str(output_path), - "--precursor_tolerance", - str(100), str(mgf_db_search), str(tiny_fasta_file), ] diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 419cf3ef..7a37e771 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -276,15 +276,18 @@ def test_digest_fasta_cleave(tiny_fasta_file): (0, 1, 3), (expected_normal, expected_1missedcleavage, expected_3missedcleavage), ): - peptide_list = db_utils.digest_fasta( - fasta_filename=str(tiny_fasta_file), + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=missed_cleavages, + min_peptide_len=6, + max_peptide_len=50, max_mods=0, - min_peptide_length=6, - max_peptide_length=50, + precursor_tolerance=20, + isotope_error=[0], ) + peptide_list = pdb.digest peptide_list = [x[0] for x in peptide_list] assert peptide_list == expected @@ -343,16 +346,18 @@ def test_digest_fasta_mods(tiny_fasta_file): "+42.011FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", "+43.006FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", ] - - peptide_list = db_utils.digest_fasta( - fasta_filename=str(tiny_fasta_file), + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=0, + min_peptide_len=6, + max_peptide_len=50, max_mods=1, - min_peptide_length=6, - max_peptide_length=50, + 
precursor_tolerance=20, + isotope_error=[0], ) + peptide_list = pdb.digest peptide_list = [x[0] for x in peptide_list] peptide_list = [ x @@ -375,27 +380,33 @@ def test_length_restrictions(tiny_fasta_file): # length between 6 and 8 expected_short = ["ATSIPAR", "VTLSC+57.021R"] - peptide_list = db_utils.digest_fasta( - fasta_filename=str(tiny_fasta_file), + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=0, + min_peptide_len=20, + max_peptide_len=50, max_mods=0, - min_peptide_length=20, - max_peptide_length=50, + precursor_tolerance=20, + isotope_error=[0], ) + peptide_list = pdb.digest peptide_list = [x[0] for x in peptide_list] assert peptide_list == expected_long - peptide_list = db_utils.digest_fasta( - fasta_filename=str(tiny_fasta_file), + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=0, + min_peptide_len=6, + max_peptide_len=8, max_mods=0, - min_peptide_length=6, - max_peptide_length=8, + precursor_tolerance=20, + isotope_error=[0], ) + peptide_list = pdb.digest peptide_list = [x[0] for x in peptide_list] assert peptide_list == expected_short @@ -415,27 +426,33 @@ def test_digest_fasta_enzyme(tiny_fasta_file): # asp-n enzyme expected_aspn = ["DFAVYYC+57.021QQ", "DFTLTISSLQPE", "MEAPAQLLFLLLLWLP"] - peptide_list = db_utils.digest_fasta( - fasta_filename=str(tiny_fasta_file), + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), enzyme="arg-c", digestion="full", missed_cleavages=0, + min_peptide_len=6, + max_peptide_len=50, max_mods=0, - min_peptide_length=6, - max_peptide_length=50, + precursor_tolerance=20, + isotope_error=[0], ) + peptide_list = pdb.digest peptide_list = [x[0] for x in peptide_list] assert peptide_list == expected_argc - peptide_list = db_utils.digest_fasta( - fasta_filename=str(tiny_fasta_file), + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), enzyme="asp-n", digestion="full", missed_cleavages=0, + min_peptide_len=6, + max_peptide_len=50, max_mods=0, - min_peptide_length=6, - max_peptide_length=50, + precursor_tolerance=20, + isotope_error=[0], ) + peptide_list = pdb.digest peptide_list = [x[0] for x in peptide_list] assert peptide_list == expected_aspn @@ -450,68 +467,53 @@ def test_get_candidates(tiny_fasta_file): # precursor window is 600000 expected_widewindow = ["ATSIPAR", "VTLSC+57.021R", "LLIYGASTR"] - peptide_list = db_utils.digest_fasta( - fasta_filename=str(tiny_fasta_file), + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=1, + min_peptide_len=6, + max_peptide_len=50, max_mods=0, - min_peptide_length=6, - max_peptide_length=50, - ) - - candidates = db_utils.get_candidates( - precursor_mz=496.2, - charge=2, - peptide_list=peptide_list, precursor_tolerance=10000, - isotope_error="0", + isotope_error=[0], ) + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) candidates = [x[0] for x in candidates] assert expected_smallwindow == candidates - peptide_list = db_utils.digest_fasta( - fasta_filename=str(tiny_fasta_file), + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=1, + min_peptide_len=6, + max_peptide_len=50, max_mods=0, - min_peptide_length=6, - max_peptide_length=50, - ) - - candidates = db_utils.get_candidates( - precursor_mz=496.2, - charge=2, - peptide_list=peptide_list, precursor_tolerance=150000, - isotope_error="0", + 
isotope_error=[0], ) + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) candidates = [x[0] for x in candidates] assert expected_midwindow == candidates - peptide_list = db_utils.digest_fasta( - fasta_filename=str(tiny_fasta_file), + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=1, + min_peptide_len=6, + max_peptide_len=50, max_mods=0, - min_peptide_length=6, - max_peptide_length=50, - ) - - candidates = db_utils.get_candidates( - precursor_mz=496.2, - charge=2, - peptide_list=peptide_list, precursor_tolerance=600000, - isotope_error="0", + isotope_error=[0], ) + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) candidates = [x[0] for x in candidates] assert expected_widewindow == candidates -def test_get_candidates_isotope_error(): +def test_get_candidates_isotope_error(tiny_fasta_file): # Tide isotope error windows for 496.2, 2+: # 0: [980.481617, 1000.289326] @@ -556,53 +558,83 @@ def test_get_candidates_isotope_error(): expected_isotope3 = list("XWVUTSRQPONMLKJIHGFE") expected_isotope0123 = list("XWVUTSRQPONMLKJIHGFEDCB") - candidates = db_utils.get_candidates( - precursor_mz=496.2, - charge=2, - peptide_list=peptide_list, + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + digestion="full", + missed_cleavages=0, + min_peptide_len=0, + max_peptide_len=0, + max_mods=0, precursor_tolerance=10000, - isotope_error="0", + isotope_error=[0], ) + pdb.digest = peptide_list + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) candidates = [x[0] for x in candidates] assert expected_isotope0 == candidates - candidates = db_utils.get_candidates( - precursor_mz=496.2, - charge=2, - peptide_list=peptide_list, + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + digestion="full", + missed_cleavages=0, + min_peptide_len=0, + max_peptide_len=0, + max_mods=0, precursor_tolerance=10000, - isotope_error="1", + isotope_error=[1], ) + pdb.digest = peptide_list + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) candidates = [x[0] for x in candidates] assert expected_isotope1 == candidates - candidates = db_utils.get_candidates( - precursor_mz=496.2, - charge=2, - peptide_list=peptide_list, + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + digestion="full", + missed_cleavages=0, + min_peptide_len=0, + max_peptide_len=0, + max_mods=0, precursor_tolerance=10000, - isotope_error="2", + isotope_error=[2], ) + pdb.digest = peptide_list + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) candidates = [x[0] for x in candidates] assert expected_isotope2 == candidates - candidates = db_utils.get_candidates( - precursor_mz=496.2, - charge=2, - peptide_list=peptide_list, + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + digestion="full", + missed_cleavages=0, + min_peptide_len=0, + max_peptide_len=0, + max_mods=0, precursor_tolerance=10000, - isotope_error="3", + isotope_error=[3], ) + pdb.digest = peptide_list + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) candidates = [x[0] for x in candidates] assert expected_isotope3 == candidates - candidates = db_utils.get_candidates( - precursor_mz=496.2, - charge=2, - peptide_list=peptide_list, + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + digestion="full", + missed_cleavages=0, + min_peptide_len=0, + max_peptide_len=0, + 
max_mods=0, precursor_tolerance=10000, - isotope_error="0,1,2,3", + isotope_error=[0, 1, 2, 3], ) + pdb.digest = peptide_list + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) candidates = [x[0] for x in candidates] assert expected_isotope0123 == candidates From 3d0b0b9b6f3c4efedd7034aab4ecc62de2a9a4ca Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 20 Aug 2024 02:12:46 +0000 Subject: [PATCH 11/21] Generate new screengrabs with rich-codex --- docs/images/configure-help.svg | 160 +++++++--------------- docs/images/evaluate-help.svg | 191 +++++++++++++------------- docs/images/help.svg | 223 ++++++++++++++++++------------- docs/images/sequence-help.svg | 191 +++++++++++++------------- docs/images/train-help.svg | 237 ++++++++++++++++++++------------- 5 files changed, 509 insertions(+), 493 deletions(-) diff --git a/docs/images/configure-help.svg b/docs/images/configure-help.svg index 4092bce3..b1fcce10 100644 --- a/docs/images/configure-help.svg +++ b/docs/images/configure-help.svg @@ -1,4 +1,4 @@ - + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + - + - + - - $ casanovo configure --help -Traceback (most recent call last): -  File "/opt/hostedtoolcache/Python/3.10.14/x64/bin/casanovo", line 5, in <module> -    from casanovo.casanovo import main -  File "/opt/hostedtoolcache/Python/3.10.14/x64/lib/python3.10/site-packages/casanovo/casanovo.py", line 32, in <module> -    import depthcharge -  File "/opt/hostedtoolcache/Python/3.10.14/x64/lib/python3.10/site-packages/depthcharge/__init__.py", line 3, in <module> -    from . import components -  File "/opt/hostedtoolcache/Python/3.10.14/x64/lib/python3.10/site-packages/depthcharge/components/__init__.py", line 2, in <module> -    from .transformers import SpectrumEncoder, PeptideDecoder -  File "/opt/hostedtoolcache/Python/3.10.14/x64/lib/python3.10/site-packages/depthcharge/components/transformers.py", line 8, in <module> -    from .. 
[SVG screengrab diffs: the regenerated docs/images/configure-help.svg, evaluate-help.svg,
help.svg, sequence-help.svg, and train-help.svg replace a previously captured NumPy 2.0
"np.string_ was removed" AttributeError traceback with the current CLI help output; the
top-level help screen now also lists the new command "db-search  Perform a database
search on MS/MS data using Casanovo-DB."]
+╰──────────────────────────────────────────────────────────────────────────────╯ + diff --git a/docs/images/train-help.svg b/docs/images/train-help.svg index a71b8915..58251215 100644 --- a/docs/images/train-help.svg +++ b/docs/images/train-help.svg @@ -1,4 +1,4 @@ - + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + - + - - $ casanovo train --help -Traceback (most recent call last): -  File "/opt/hostedtoolcache/Python/3.10.14/x64/bin/casanovo", line 5, in <module> -    from casanovo.casanovo import main -  File "/opt/hostedtoolcache/Python/3.10.14/x64/lib/python3.10/site-packages/casanovo/casanovo.py", line 32, in <module> -    import depthcharge -  File "/opt/hostedtoolcache/Python/3.10.14/x64/lib/python3.10/site-packages/depthcharge/__init__.py", line 3, in <module> -    from . import components -  File "/opt/hostedtoolcache/Python/3.10.14/x64/lib/python3.10/site-packages/depthcharge/components/__init__.py", line 2, in <module> -    from .transformers import SpectrumEncoder, PeptideDecoder -  File "/opt/hostedtoolcache/Python/3.10.14/x64/lib/python3.10/site-packages/depthcharge/components/transformers.py", line 8, in <module> -    from .. import utils -  File "/opt/hostedtoolcache/Python/3.10.14/x64/lib/python3.10/site-packages/depthcharge/utils.py", line 5, in <module> -    from tensorboard.backend.event_processing.event_accumulator import ( -  File "/opt/hostedtoolcache/Python/3.10.14/x64/lib/python3.10/site-packages/tensorboard/backend/event_processing/event_accumulator.py", line 24, in <module> -    from tensorboard.backend.event_processing import event_file_loader -  File "/opt/hostedtoolcache/Python/3.10.14/x64/lib/python3.10/site-packages/tensorboard/backend/event_processing/event_file_loader.py", line 21, in <module> -    from tensorboard import dataclass_compat -  File "/opt/hostedtoolcache/Python/3.10.14/x64/lib/python3.10/site-packages/tensorboard/dataclass_compat.py", line 33, in <module> -    from tensorboard.plugins.hparams import metadata as hparams_metadata -  File "/opt/hostedtoolcache/Python/3.10.14/x64/lib/python3.10/site-packages/tensorboard/plugins/hparams/metadata.py", line 32, in <module> -    NULL_TENSOR = tensor_util.make_tensor_proto( -  File "/opt/hostedtoolcache/Python/3.10.14/x64/lib/python3.10/site-packages/tensorboard/util/tensor_util.py", line 405, in make_tensor_proto -    numpy_dtype = dtypes.as_dtype(nparray.dtype) -  File "/opt/hostedtoolcache/Python/3.10.14/x64/lib/python3.10/site-packages/tensorboard/compat/tensorflow_stub/dtypes.py", line 677, in as_dtype -    if type_value.type == np.string_ or type_value.type == np.unicode_: -  File "/opt/hostedtoolcache/Python/3.10.14/x64/lib/python3.10/site-packages/numpy/__init__.py", line 397, in __getattr__ -    raise AttributeError( -AttributeError: `np.string_` was removed in the NumPy 2.0 release. Use `np.bytes_` instead.. Did you mean: 'strings'? + + $ casanovo train --help + +Usage:casanovo train [OPTIONSTRAIN_PEAK_PATH...                              + + Train a Casanovo model on your own data.                                        + TRAIN_PEAK_PATH must be one or more annoated MGF files, such as those provided  + by MassIVE-KB, from which to train a new Casnovo model.                         
+ +╭─ Arguments ──────────────────────────────────────────────────────────────────╮ +*  TRAIN_PEAK_PATH    FILE[required] +╰──────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ────────────────────────────────────────────────────────────────────╮ +*--validation_peak_pa…-pFILE                    An annotated MGF file   +                                                       for validation, like    +                                                       from MassIVE-KB. Use    +                                                       this option multiple    +                                                       times to specify        +                                                       multiple files.         +[required]             +--model-mFILE                    The model weights       +                                                       (.ckpt file). If not    +                                                       provided, Casanovo      +                                                       will try to download    +                                                       the latest release.     +--output-oFILE                    The mzTab file to       +                                                       which results will be   +                                                       written.                +--config-cFILE                    The YAML configuration  +                                                       file overriding the     +                                                       default options.        +--verbosity-v[debug|info|warning|er  Set the verbosity of    +ror]  console logging         +                                                       messages. Log files     +                                                       are always set to       +                                                       'debug'.                +--help-h  Show this message and   +                                                       exit.                   +╰──────────────────────────────────────────────────────────────────────────────╯ + From 812226e396f667f2d9e628e1aabd76546f8c18a1 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Tue, 20 Aug 2024 20:21:29 -0700 Subject: [PATCH 12/21] finish proteindatabase --- casanovo/data/db_utils.py | 101 +++++++++++++++++---------------- casanovo/denovo/dataloaders.py | 6 +- tests/unit_tests/test_unit.py | 100 +++++++++++++++----------------- 3 files changed, 101 insertions(+), 106 deletions(-) diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index a7b5e850..d249e0c7 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -1,12 +1,12 @@ """Unique methods used within db-search mode""" -import bisect import logging import os -from typing import List, Tuple +from typing import List import depthcharge.masses from numba import jit +import pandas as pd from pyteomics import fasta, parser logger = logging.getLogger("casanovo") @@ -28,11 +28,29 @@ class ProteinDatabase: """ - TODO + Store digested .fasta data and return candidate peptides for a given precursor mass. Parameters ---------- - TODO + fasta_path : str + Path to the FASTA file. + enzyme : str + The enzyme to use for digestion. + See pyteomics.parser.expasy_rules for valid enzymes. + digestion : str + The type of digestion to perform. Either 'full' or 'partial'. + missed_cleavages : int + The number of missed cleavages to allow. + min_peptide_len : int + The minimum length of peptides to consider. 
+ max_peptide_len : int + The maximum length of peptides to consider. + max_mods : int + The maximum number of modifications to allow per peptide. + precursor_tolerance : float + The precursor mass tolerance in ppm. + isotope_error : List[int] + Isotopes to consider when comparing predicted and observed precursor m/z's. """ def __init__( @@ -73,27 +91,34 @@ def get_candidates( The precursor mass-to-charge ratio. charge : int The precursor charge. + + Returns + ------- + candidates : List[Tuple[str, str]] + A list of candidate peptides and associated + protein. """ - candidates = set() + candidates = [] for e in self.isotope_error: iso_shift = ISOTOPE_SPACING * e - upper_bound = ( + upper_bound = float( ProteinDatabase._to_raw_mass(precursor_mz, charge) - iso_shift ) * (1 + (self.precursor_tolerance / 1e6)) - lower_bound = ( + lower_bound = float( ProteinDatabase._to_raw_mass(precursor_mz, charge) - iso_shift ) * (1 - (self.precursor_tolerance / 1e6)) - start, end = ProteinDatabase._get_mass_indices( - [x[1] for x in self.digest], lower_bound, upper_bound - ) + window = self.digest[ + (self.digest["calc_mass"] >= lower_bound) + & (self.digest["calc_mass"] <= upper_bound) + ] + candidates.append(window[["peptide", "calc_mass", "protein"]]) - candidates.update(self.digest[start:end]) - - candidates = list(candidates) - candidates.sort(key=lambda x: x[1]) - return candidates + candidates = pd.concat(candidates) + candidates.drop_duplicates(inplace=True) + candidates.sort_values(by=["calc_mass", "peptide"], inplace=True) + return list(candidates["peptide"]), list(candidates["protein"]) def _digest_fasta( self, @@ -128,9 +153,9 @@ def _digest_fasta( Returns ------- - mod_peptide_list : List[Tuple[str, float, str]] - A list of tuples containing the peptide sequence, mass, - and associated protein. Sorted by neutral mass in ascending order. + mod_peptide_list : pd.DataFrame + A Pandas DataFrame with peptide, mass, + and protein columns. Sorted by neutral mass in ascending order. """ # Verify the existence of the file: if not os.path.isfile(fasta_filename): @@ -180,17 +205,20 @@ def _digest_fasta( map(ProteinDatabase._convert_from_modx, peptide_isoforms) ) mod_peptide_list.extend( - (mod_pep, mass_calculator.mass(mod_pep), prot) + [mod_pep, mass_calculator.mass(mod_pep), prot] for mod_pep in peptide_isoforms ) - # Sort the peptides by mass and return. - mod_peptide_list.sort(key=lambda x: x[1]) - logger.info( - "Digestion complete. %d peptides generated.", len(mod_peptide_list) + # Create a DataFrame for easy sorting and filtering + pdb_df = pd.DataFrame( + mod_peptide_list, columns=["peptide", "calc_mass", "protein"] ) - return mod_peptide_list + pdb_df.sort_values(by=["calc_mass", "peptide"], inplace=True) + + logger.info("Digestion complete. %d peptides generated.", len(pdb_df)) + return pdb_df + @jit def _to_mz(precursor_mass, charge): """ Convert precursor neutral mass to m/z value. @@ -209,6 +237,7 @@ def _to_mz(precursor_mass, charge): """ return (precursor_mass + (charge * PROTON)) / charge + @jit def _to_raw_mass(mz_mass, charge): """ Convert precursor m/z value to neutral mass. @@ -227,30 +256,6 @@ def _to_raw_mass(mz_mass, charge): """ return charge * (mz_mass - PROTON) - def _get_mass_indices(masses, m_low, m_high): - """Grabs mass indices that fall within a specified range. - - Pulls from masses, a list of mass values. - Requires that the mass values are sorted in ascending order. 
- - Parameters - ---------- - masses : List[int] - List of mass values - m_low : int - Lower bound of mass range (inclusive) - m_high : int - Upper bound of mass range (inclusive) - - Return - ------ - indices : Tuple[int, int] - Indices of mass values that fall within the specified range - """ - start = bisect.bisect_left(masses, m_low) - end = bisect.bisect_right(masses, m_high) - return start, end - def _convert_from_modx(seq: str): """Converts peptide sequence from modX format to Casanovo-acceptable modifications. diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 4d5524f4..2d9e200b 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -284,11 +284,11 @@ def prepare_psm_batch( all_proteins = [] for idx in range(len(batch)): digest_data = pdb.get_candidates( - precursor_mzs[idx], - precursor_charges[idx], + float(precursor_mzs[idx]), + float(precursor_charges[idx]), ) try: - spec_peptides, _, pep_protein = list(zip(*digest_data)) + spec_peptides, pep_protein = digest_data all_spectra.append( spectra[idx].unsqueeze(0).repeat(len(spec_peptides), 1, 1) ) diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 7a37e771..2473a168 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -9,6 +9,7 @@ import einops import github import numpy as np +import pandas as pd import pytest import torch @@ -287,8 +288,7 @@ def test_digest_fasta_cleave(tiny_fasta_file): precursor_tolerance=20, isotope_error=[0], ) - peptide_list = pdb.digest - peptide_list = [x[0] for x in peptide_list] + peptide_list = list(pdb.digest["peptide"]) assert peptide_list == expected @@ -357,8 +357,7 @@ def test_digest_fasta_mods(tiny_fasta_file): precursor_tolerance=20, isotope_error=[0], ) - peptide_list = pdb.digest - peptide_list = [x[0] for x in peptide_list] + peptide_list = list(pdb.digest["peptide"]) peptide_list = [ x for x in peptide_list @@ -391,8 +390,7 @@ def test_length_restrictions(tiny_fasta_file): precursor_tolerance=20, isotope_error=[0], ) - peptide_list = pdb.digest - peptide_list = [x[0] for x in peptide_list] + peptide_list = list(pdb.digest["peptide"]) assert peptide_list == expected_long pdb = db_utils.ProteinDatabase( @@ -406,8 +404,7 @@ def test_length_restrictions(tiny_fasta_file): precursor_tolerance=20, isotope_error=[0], ) - peptide_list = pdb.digest - peptide_list = [x[0] for x in peptide_list] + peptide_list = list(pdb.digest["peptide"]) assert peptide_list == expected_short @@ -437,8 +434,7 @@ def test_digest_fasta_enzyme(tiny_fasta_file): precursor_tolerance=20, isotope_error=[0], ) - peptide_list = pdb.digest - peptide_list = [x[0] for x in peptide_list] + peptide_list = list(pdb.digest["peptide"]) assert peptide_list == expected_argc pdb = db_utils.ProteinDatabase( @@ -452,8 +448,7 @@ def test_digest_fasta_enzyme(tiny_fasta_file): precursor_tolerance=20, isotope_error=[0], ) - peptide_list = pdb.digest - peptide_list = [x[0] for x in peptide_list] + peptide_list = list(pdb.digest["peptide"]) assert peptide_list == expected_aspn @@ -478,8 +473,7 @@ def test_get_candidates(tiny_fasta_file): precursor_tolerance=10000, isotope_error=[0], ) - candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) - candidates = [x[0] for x in candidates] + candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_smallwindow == candidates pdb = db_utils.ProteinDatabase( @@ -493,8 +487,7 @@ def test_get_candidates(tiny_fasta_file): precursor_tolerance=150000, 
isotope_error=[0], ) - candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) - candidates = [x[0] for x in candidates] + candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_midwindow == candidates pdb = db_utils.ProteinDatabase( @@ -508,8 +501,7 @@ def test_get_candidates(tiny_fasta_file): precursor_tolerance=600000, isotope_error=[0], ) - candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) - candidates = [x[0] for x in candidates] + candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_widewindow == candidates @@ -522,35 +514,38 @@ def test_get_candidates_isotope_error(tiny_fasta_file): # 3: [977.510108, 997.257787] peptide_list = [ - ("A", 1001), - ("B", 1000), - ("C", 999), - ("D", 998), - ("E", 997), - ("F", 996), - ("G", 995), - ("H", 994), - ("I", 993), - ("J", 992), - ("K", 991), - ("L", 990), - ("M", 989), - ("N", 988), - ("O", 987), - ("P", 986), - ("Q", 985), - ("R", 984), - ("S", 983), - ("T", 982), - ("U", 981), - ("V", 980), - ("W", 979), - ("X", 978), - ("Y", 977), - ("Z", 976), + ("A", 1001, "foo"), + ("B", 1000, "foo"), + ("C", 999, "foo"), + ("D", 998, "foo"), + ("E", 997, "foo"), + ("F", 996, "foo"), + ("G", 995, "foo"), + ("H", 994, "foo"), + ("I", 993, "foo"), + ("J", 992, "foo"), + ("K", 991, "foo"), + ("L", 990, "foo"), + ("M", 989, "foo"), + ("N", 988, "foo"), + ("O", 987, "foo"), + ("P", 986, "foo"), + ("Q", 985, "foo"), + ("R", 984, "foo"), + ("S", 983, "foo"), + ("T", 982, "foo"), + ("U", 981, "foo"), + ("V", 980, "foo"), + ("W", 979, "foo"), + ("X", 978, "foo"), + ("Y", 977, "foo"), + ("Z", 976, "foo"), ] - peptide_list.sort(key=lambda x: x[1]) + peptide_list = pd.DataFrame( + peptide_list, columns=["peptide", "calc_mass", "protein"] + ) + peptide_list.sort_values("calc_mass", inplace=True) expected_isotope0 = list("UTSRQPONMLKJIHGFEDCB") expected_isotope1 = list("VUTSRQPONMLKJIHGFEDC") @@ -570,8 +565,7 @@ def test_get_candidates_isotope_error(tiny_fasta_file): isotope_error=[0], ) pdb.digest = peptide_list - candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) - candidates = [x[0] for x in candidates] + candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_isotope0 == candidates pdb = db_utils.ProteinDatabase( @@ -586,8 +580,7 @@ def test_get_candidates_isotope_error(tiny_fasta_file): isotope_error=[1], ) pdb.digest = peptide_list - candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) - candidates = [x[0] for x in candidates] + candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_isotope1 == candidates pdb = db_utils.ProteinDatabase( @@ -602,8 +595,7 @@ def test_get_candidates_isotope_error(tiny_fasta_file): isotope_error=[2], ) pdb.digest = peptide_list - candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) - candidates = [x[0] for x in candidates] + candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_isotope2 == candidates pdb = db_utils.ProteinDatabase( @@ -618,8 +610,7 @@ def test_get_candidates_isotope_error(tiny_fasta_file): isotope_error=[3], ) pdb.digest = peptide_list - candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) - candidates = [x[0] for x in candidates] + candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_isotope3 == candidates pdb = db_utils.ProteinDatabase( @@ -634,8 +625,7 @@ def test_get_candidates_isotope_error(tiny_fasta_file): isotope_error=[0, 1, 2, 3], ) pdb.digest = peptide_list - candidates = 
pdb.get_candidates(precursor_mz=496.2, charge=2) - candidates = [x[0] for x in candidates] + candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_isotope0123 == candidates From cfd39e80b4898077f92cacc6491a5c891c5a9454 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Fri, 23 Aug 2024 14:12:50 -0700 Subject: [PATCH 13/21] all comments addressed --- casanovo/config.yaml | 7 +++- casanovo/data/db_utils.py | 68 +++++++++++++++++++++++++++------ casanovo/denovo/model_runner.py | 1 + tests/conftest.py | 4 ++ tests/unit_tests/test_unit.py | 56 +++++++++++++++++++++++++++ 5 files changed, 123 insertions(+), 13 deletions(-) diff --git a/casanovo/config.yaml b/casanovo/config.yaml index 860cfabb..87795db8 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -46,7 +46,7 @@ devices: # See pyteomics.parser.expasy_rules for valid enzymes enzyme: "trypsin" # Digestion type for candidate peptide generation. -# Full: standard digestion. Semi: Include products of semi-specific cleavage +# full: standard digestion. semi: Include products of semi-specific cleavage digestion: "full" # Number of allowed missed cleavages when digesting protein missed_cleavages: 0 @@ -55,6 +55,11 @@ missed_cleavages: 0 max_mods: # Maximum peptide length to consider max_peptide_len: 50 +# Toggle allowed modifications on/off +# Permanent fixed mod (don't include): C+57.021 +# Allowed variable mods: M+15.995, N+0.984, Q+0.984, +# Allowed N-terminal mods: +42.011, +43.006, -17.027, +43.006-17.027 +allowed_mods: "M+15.995,N+0.984,Q+0.984,+42.011,+43.006,-17.027,+43.006-17.027" ### diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index d249e0c7..2bdf3828 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -15,16 +15,6 @@ PROTON = 1.00727646677 ISOTOPE_SPACING = 1.003355 -var_mods = { - "d": ["N", "Q"], - "ox": ["M"], - "ace-": True, - "carb-": True, - "nh3x-": True, - "carbnh3x-": True, -} -fixed_mods = {"carbm": ["C"]} - class ProteinDatabase: """ @@ -51,6 +41,8 @@ class ProteinDatabase: The precursor mass tolerance in ppm. isotope_error : List[int] Isotopes to consider when comparing predicted and observed precursor m/z's. + allowed_mods : List[str] + A list of allowed modifications to consider. """ def __init__( @@ -64,7 +56,11 @@ def __init__( max_mods: int, precursor_tolerance: float, isotope_error: List[int], + allowed_mods: List[str], ): + self.fixed_mods, self.var_mods = self._construct_mods_dict( + allowed_mods + ) self.digest = self._digest_fasta( fasta_path, enzyme, @@ -197,8 +193,8 @@ def _digest_fasta( for pep, prot in peptide_list: peptide_isoforms = parser.isoforms( pep, - variable_mods=var_mods, - fixed_mods=fixed_mods, + variable_mods=self.var_mods, + fixed_mods=self.fixed_mods, max_mods=max_mods, ) peptide_isoforms = list( @@ -218,6 +214,54 @@ def _digest_fasta( logger.info("Digestion complete. %d peptides generated.", len(pdb_df)) return pdb_df + def _construct_mods_dict(self, allowed_mods): + """ + Constructs dictionaries of fixed and variable modifications. + + Parameters + ---------- + allowed_mods : str + A comma-separated list of allowed modifications. + + Returns + ------- + fixed_mods : dict + A dictionary of fixed modifications. + var_mods : dict + A dictionary of variable modifications. 
+ """ + fixed_mods = {"carbm": ["C"]} + var_mods = {} + + if allowed_mods is "" or None: + return fixed_mods, var_mods + for mod in allowed_mods.split(","): + if mod == "M+15.995": + if "ox" not in var_mods: + var_mods["ox"] = [] + var_mods["ox"].append("M") + elif mod == "N+0.984": + if "d" not in var_mods: + var_mods["d"] = [] + var_mods["d"].append("N") + elif mod == "Q+0.984": + if "d" not in var_mods: + var_mods["d"] = [] + var_mods["d"].append("Q") + elif mod == "+42.011": + var_mods["ace-"] = True + elif mod == "+43.006": + var_mods["carb-"] = True + elif mod == "-17.027": + var_mods["nh3x-"] = True + elif mod == "+43.006-17.027": + var_mods["carbnh3x-"] = True + else: + logger.error("Modification %s not recognized.", mod) + raise ValueError(f"Modification {mod} not recognized.") + + return fixed_mods, var_mods + @jit def _to_mz(precursor_mass, charge): """ diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index b90f06b0..789c960b 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -124,6 +124,7 @@ def db_search( self.config.max_mods, self.config.precursor_mass_tol, self.config.isotope_error_range, + self.config.allowed_mods, ) self.loaders.setup(stage="test", annotated=False) self.trainer.predict(self.model, self.loaders.db_dataloader()) diff --git a/tests/conftest.py b/tests/conftest.py index f20d7879..452316c8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -299,6 +299,10 @@ def tiny_config(tmp_path): "-17.027": -17.026549, "+43.006-17.027": 25.980265, }, + "allowed_mods": ( + "M+15.995,N+0.984,Q+0.984," + "+42.011,+43.006,-17.027,+43.006-17.027" + ), } cfg_file = tmp_path / "config.yml" diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 2473a168..a31e2024 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -287,6 +287,10 @@ def test_digest_fasta_cleave(tiny_fasta_file): max_mods=0, precursor_tolerance=20, isotope_error=[0], + allowed_mods=( + "M+15.995,N+0.984,Q+0.984," + "+42.011,+43.006,-17.027,+43.006-17.027" + ), ) peptide_list = list(pdb.digest["peptide"]) assert peptide_list == expected @@ -356,6 +360,10 @@ def test_digest_fasta_mods(tiny_fasta_file): max_mods=1, precursor_tolerance=20, isotope_error=[0], + allowed_mods=( + "M+15.995,N+0.984,Q+0.984," + "+42.011,+43.006,-17.027,+43.006-17.027" + ), ) peptide_list = list(pdb.digest["peptide"]) peptide_list = [ @@ -389,6 +397,10 @@ def test_length_restrictions(tiny_fasta_file): max_mods=0, precursor_tolerance=20, isotope_error=[0], + allowed_mods=( + "M+15.995,N+0.984,Q+0.984," + "+42.011,+43.006,-17.027,+43.006-17.027" + ), ) peptide_list = list(pdb.digest["peptide"]) assert peptide_list == expected_long @@ -403,6 +415,10 @@ def test_length_restrictions(tiny_fasta_file): max_mods=0, precursor_tolerance=20, isotope_error=[0], + allowed_mods=( + "M+15.995,N+0.984,Q+0.984," + "+42.011,+43.006,-17.027,+43.006-17.027" + ), ) peptide_list = list(pdb.digest["peptide"]) assert peptide_list == expected_short @@ -433,6 +449,10 @@ def test_digest_fasta_enzyme(tiny_fasta_file): max_mods=0, precursor_tolerance=20, isotope_error=[0], + allowed_mods=( + "M+15.995,N+0.984,Q+0.984," + "+42.011,+43.006,-17.027,+43.006-17.027" + ), ) peptide_list = list(pdb.digest["peptide"]) assert peptide_list == expected_argc @@ -447,6 +467,10 @@ def test_digest_fasta_enzyme(tiny_fasta_file): max_mods=0, precursor_tolerance=20, isotope_error=[0], + allowed_mods=( + "M+15.995,N+0.984,Q+0.984," + 
"+42.011,+43.006,-17.027,+43.006-17.027" + ), ) peptide_list = list(pdb.digest["peptide"]) assert peptide_list == expected_aspn @@ -472,6 +496,10 @@ def test_get_candidates(tiny_fasta_file): max_mods=0, precursor_tolerance=10000, isotope_error=[0], + allowed_mods=( + "M+15.995,N+0.984,Q+0.984," + "+42.011,+43.006,-17.027,+43.006-17.027" + ), ) candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_smallwindow == candidates @@ -486,6 +514,10 @@ def test_get_candidates(tiny_fasta_file): max_mods=0, precursor_tolerance=150000, isotope_error=[0], + allowed_mods=( + "M+15.995,N+0.984,Q+0.984," + "+42.011,+43.006,-17.027,+43.006-17.027" + ), ) candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_midwindow == candidates @@ -500,6 +532,10 @@ def test_get_candidates(tiny_fasta_file): max_mods=0, precursor_tolerance=600000, isotope_error=[0], + allowed_mods=( + "M+15.995,N+0.984,Q+0.984," + "+42.011,+43.006,-17.027,+43.006-17.027" + ), ) candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_widewindow == candidates @@ -563,6 +599,10 @@ def test_get_candidates_isotope_error(tiny_fasta_file): max_mods=0, precursor_tolerance=10000, isotope_error=[0], + allowed_mods=( + "M+15.995,N+0.984,Q+0.984," + "+42.011,+43.006,-17.027,+43.006-17.027" + ), ) pdb.digest = peptide_list candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) @@ -578,6 +618,10 @@ def test_get_candidates_isotope_error(tiny_fasta_file): max_mods=0, precursor_tolerance=10000, isotope_error=[1], + allowed_mods=( + "M+15.995,N+0.984,Q+0.984," + "+42.011,+43.006,-17.027,+43.006-17.027" + ), ) pdb.digest = peptide_list candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) @@ -593,6 +637,10 @@ def test_get_candidates_isotope_error(tiny_fasta_file): max_mods=0, precursor_tolerance=10000, isotope_error=[2], + allowed_mods=( + "M+15.995,N+0.984,Q+0.984," + "+42.011,+43.006,-17.027,+43.006-17.027" + ), ) pdb.digest = peptide_list candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) @@ -608,6 +656,10 @@ def test_get_candidates_isotope_error(tiny_fasta_file): max_mods=0, precursor_tolerance=10000, isotope_error=[3], + allowed_mods=( + "M+15.995,N+0.984,Q+0.984," + "+42.011,+43.006,-17.027,+43.006-17.027" + ), ) pdb.digest = peptide_list candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) @@ -623,6 +675,10 @@ def test_get_candidates_isotope_error(tiny_fasta_file): max_mods=0, precursor_tolerance=10000, isotope_error=[0, 1, 2, 3], + allowed_mods=( + "M+15.995,N+0.984,Q+0.984," + "+42.011,+43.006,-17.027,+43.006-17.027" + ), ) pdb.digest = peptide_list candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) From 106c4ecc524c202a7624d6fa025afc82adac1a0c Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Wed, 28 Aug 2024 16:41:24 -0700 Subject: [PATCH 14/21] new comments addressed --- casanovo/config.yaml | 22 +-- casanovo/data/db_utils.py | 276 +++++++++++++++++--------------- casanovo/denovo/dataloaders.py | 22 +-- casanovo/denovo/model.py | 2 +- casanovo/denovo/model_runner.py | 2 +- tests/conftest.py | 25 ++- tests/test_integration.py | 4 +- tests/unit_tests/test_unit.py | 74 ++++----- 8 files changed, 209 insertions(+), 218 deletions(-) diff --git a/casanovo/config.yaml b/casanovo/config.yaml index 87795db8..6c9063f5 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -11,13 +11,13 @@ # Max absolute difference allowed with respect to observed precursor m/z. 
# denovo: Predictions outside the tolerance range are assigned a negative peptide score. -# db-search: Used to create mas windows for candidate generation. +# db-search: Select candidate peptides within the specified precursor m/z tolerance. precursor_mass_tol: 50 # ppm # Isotopes to consider when comparing predicted and observed precursor m/z's. isotope_error_range: [0, 1] -# The minimum length of predicted/scored peptides. +# The minimum length of considered peptides. min_peptide_len: 6 -# Number of spectra or psms in one inference batch. +# Number of spectra in one inference batch. predict_batch_size: 1024 @@ -43,21 +43,21 @@ devices: ### # Enzyme for in silico digestion, used to generate candidate peptides. -# See pyteomics.parser.expasy_rules for valid enzymes +# See pyteomics.parser.expasy_rules for valid enzymes. enzyme: "trypsin" # Digestion type for candidate peptide generation. -# full: standard digestion. semi: Include products of semi-specific cleavage +# full: standard digestion. semi: Include products of semi-specific cleavage. digestion: "full" -# Number of allowed missed cleavages when digesting protein +# Number of allowed missed cleavages when digesting protein. missed_cleavages: 0 -# Maximum number of amino acid modifications per peptide. +# Maximum number of amino acid modifications per peptide, # None generates all possible isoforms as candidates. -max_mods: -# Maximum peptide length to consider +max_mods: 0 +# Maximum peptide length to consider. max_peptide_len: 50 -# Toggle allowed modifications on/off +# Select which modifications from the vocabulary can be used in candidate creation. # Permanent fixed mod (don't include): C+57.021 -# Allowed variable mods: M+15.995, N+0.984, Q+0.984, +# Allowed variable mods: M+15.995, N+0.984, Q+0.984 # Allowed N-terminal mods: +42.011, +43.006, -17.027, +43.006-17.027 allowed_mods: "M+15.995,N+0.984,Q+0.984,+42.011,+43.006,-17.027,+43.006-17.027" diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index 2bdf3828..c1d5e91e 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -2,11 +2,11 @@ import logging import os -from typing import List +from typing import List, Tuple import depthcharge.masses -from numba import jit import pandas as pd +from numba import njit from pyteomics import fasta, parser logger = logging.getLogger("casanovo") @@ -39,10 +39,10 @@ class ProteinDatabase: The maximum number of modifications to allow per peptide. precursor_tolerance : float The precursor mass tolerance in ppm. - isotope_error : List[int] - Isotopes to consider when comparing predicted and observed precursor m/z's. - allowed_mods : List[str] - A list of allowed modifications to consider. + isotope_error : Tuple[int, int] + Isotope range [min, max] to consider when comparing predicted and observed precursor m/z's. + allowed_mods : str + A comma separated string of allowed modifications to consider. 
""" def __init__( @@ -55,13 +55,11 @@ def __init__( max_peptide_len: int, max_mods: int, precursor_tolerance: float, - isotope_error: List[int], - allowed_mods: List[str], + isotope_error: Tuple[int, int], + allowed_mods: str, ): - self.fixed_mods, self.var_mods = self._construct_mods_dict( - allowed_mods - ) - self.digest = self._digest_fasta( + self.fixed_mods, self.var_mods = _construct_mods_dict(allowed_mods) + self.db_peptides = self._digest_fasta( fasta_path, enzyme, digestion, @@ -77,7 +75,7 @@ def get_candidates( self, precursor_mz: float, charge: int, - ): + ) -> List[Tuple[str, str]]: """ Returns a list of candidate peptides that fall within the specified mass range. @@ -96,18 +94,18 @@ def get_candidates( """ candidates = [] - for e in self.isotope_error: + for e in range(self.isotope_error[0], self.isotope_error[1] + 1): iso_shift = ISOTOPE_SPACING * e upper_bound = float( - ProteinDatabase._to_raw_mass(precursor_mz, charge) - iso_shift + _to_raw_mass(precursor_mz, charge) - iso_shift ) * (1 + (self.precursor_tolerance / 1e6)) lower_bound = float( - ProteinDatabase._to_raw_mass(precursor_mz, charge) - iso_shift + _to_raw_mass(precursor_mz, charge) - iso_shift ) * (1 - (self.precursor_tolerance / 1e6)) - window = self.digest[ - (self.digest["calc_mass"] >= lower_bound) - & (self.digest["calc_mass"] <= upper_bound) + window = self.db_peptides[ + (self.db_peptides["calc_mass"] >= lower_bound) + & (self.db_peptides["calc_mass"] <= upper_bound) ] candidates.append(window[["peptide", "calc_mass", "protein"]]) @@ -125,7 +123,7 @@ def _digest_fasta( max_mods: int, min_peptide_length: int, max_peptide_length: int, - ): + ) -> pd.DataFrame: """ Digests a FASTA file and returns the peptides, their masses, and associated protein. @@ -158,13 +156,18 @@ def _digest_fasta( logger.error("File %s does not exist.", fasta_filename) raise FileNotFoundError(f"File {fasta_filename} does not exist.") - fasta_data = fasta.read(fasta_filename) peptide_list = [] if digestion not in ["full", "partial"]: logger.error("Digestion type %s not recognized.", digestion) raise ValueError(f"Digestion type {digestion} not recognized.") + if enzyme not in parser.expasy_rules: + logger.error( + "Enzyme %s not recognized. 
Must be in pyteomics.parser.expasy_rules", + enzyme, + ) + raise ValueError(f"Enzyme {enzyme} not recognized.") semi = digestion == "partial" - for header, seq in fasta_data: + for header, seq in fasta.read(fasta_filename): pep_set = parser.cleave( seq, rule=parser.expasy_rules[enzyme], @@ -182,136 +185,143 @@ def _digest_fasta( aa in pep for aa in "BJOUXZ" ): # Check for incorrect AA letters logger.warn( - "Skipping peptide with ambiguous amino acids: %s", pep + "Skipping peptide with unknown amino acids: %s", pep ) continue peptide_list.append((pep, protein)) # Generate modified peptides mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb") - mod_peptide_list = [] - for pep, prot in peptide_list: - peptide_isoforms = parser.isoforms( - pep, - variable_mods=self.var_mods, - fixed_mods=self.fixed_mods, - max_mods=max_mods, - ) - peptide_isoforms = list( - map(ProteinDatabase._convert_from_modx, peptide_isoforms) - ) - mod_peptide_list.extend( - [mod_pep, mass_calculator.mass(mod_pep), prot] - for mod_pep in peptide_isoforms + peptide_isoforms = [ + ( + parser.isoforms( + pep, + variable_mods=self.var_mods, + fixed_mods=self.fixed_mods, + max_mods=max_mods, + ), + prot, ) - + for pep, prot in peptide_list + ] + mod_peptide_list = [ + (mod_pep, mass_calculator.mass(mod_pep), prot) + for isos, prot in peptide_isoforms + for mod_pep in map(_convert_from_modx, isos) + ] # Create a DataFrame for easy sorting and filtering - pdb_df = pd.DataFrame( + pep_table = pd.DataFrame( mod_peptide_list, columns=["peptide", "calc_mass", "protein"] ) - pdb_df.sort_values(by=["calc_mass", "peptide"], inplace=True) - - logger.info("Digestion complete. %d peptides generated.", len(pdb_df)) - return pdb_df + pep_table.sort_values(by=["calc_mass", "peptide"], inplace=True) - def _construct_mods_dict(self, allowed_mods): - """ - Constructs dictionaries of fixed and variable modifications. + logger.info( + "Digestion complete. %d peptides generated.", len(pep_table) + ) + return pep_table - Parameters - ---------- - allowed_mods : str - A comma-separated list of allowed modifications. - Returns - ------- - fixed_mods : dict - A dictionary of fixed modifications. - var_mods : dict - A dictionary of variable modifications. - """ - fixed_mods = {"carbm": ["C"]} - var_mods = {} - - if allowed_mods is "" or None: - return fixed_mods, var_mods - for mod in allowed_mods.split(","): - if mod == "M+15.995": - if "ox" not in var_mods: - var_mods["ox"] = [] - var_mods["ox"].append("M") - elif mod == "N+0.984": - if "d" not in var_mods: - var_mods["d"] = [] - var_mods["d"].append("N") - elif mod == "Q+0.984": - if "d" not in var_mods: - var_mods["d"] = [] - var_mods["d"].append("Q") - elif mod == "+42.011": - var_mods["ace-"] = True - elif mod == "+43.006": - var_mods["carb-"] = True - elif mod == "-17.027": - var_mods["nh3x-"] = True - elif mod == "+43.006-17.027": - var_mods["carbnh3x-"] = True - else: - logger.error("Modification %s not recognized.", mod) - raise ValueError(f"Modification {mod} not recognized.") +@njit +def _to_mz(precursor_mass, charge): + """ + Convert precursor neutral mass to m/z value. - return fixed_mods, var_mods + Parameters + ---------- + precursor_mass : float + The precursor neutral mass. + charge : int + The precursor charge. + + Returns + ------- + mz : float + The calculated precursor mass-to-charge ratio. + """ + return (precursor_mass + (charge * PROTON)) / charge - @jit - def _to_mz(precursor_mass, charge): - """ - Convert precursor neutral mass to m/z value. 
- Parameters - ---------- - precursor_mass : float - The precursor neutral mass. - charge : int - The precursor charge. +@njit +def _to_raw_mass(mz_mass, charge): + """ + Convert precursor m/z value to neutral mass. - Returns - ------- - mz : float - The calculated precursor mass-to-charge ratio. - """ - return (precursor_mass + (charge * PROTON)) / charge + Parameters + ---------- + mz_mass : float + The precursor mass-to-charge ratio. + charge : int + The precursor charge. + + Returns + ------- + mass : float + The calculated precursor neutral mass. + """ + return charge * (mz_mass - PROTON) - @jit - def _to_raw_mass(mz_mass, charge): - """ - Convert precursor m/z value to neutral mass. - Parameters - ---------- - mz_mass : float - The precursor mass-to-charge ratio. - charge : int - The precursor charge. +def _convert_from_modx(seq: str): + """Converts peptide sequence from modX format to Casanovo-acceptable modifications. - Returns - ------- - mass : float - The calculated precursor neutral mass. - """ - return charge * (mz_mass - PROTON) + Args: + seq (str): Peptide in modX format + """ + seq = seq.replace("carbmC", "C+57.021") # Fixed modification + seq = seq.replace("oxM", "M+15.995") + seq = seq.replace("dN", "N+0.984") + seq = seq.replace("dQ", "Q+0.984") + seq = seq.replace("ace-", "+42.011") + seq = seq.replace("carbnh3x-", "+43.006-17.027") + seq = seq.replace("carb-", "+43.006") + seq = seq.replace("nh3x-", "-17.027") + return seq + + +def _construct_mods_dict(allowed_mods): + """ + Constructs dictionaries of fixed and variable modifications. - def _convert_from_modx(seq: str): - """Converts peptide sequence from modX format to Casanovo-acceptable modifications. + Parameters + ---------- + allowed_mods : str + A comma-separated list of allowed modifications. + + Returns + ------- + fixed_mods : dict + A dictionary of fixed modifications. + var_mods : dict + A dictionary of variable modifications. 
+ """ + fixed_mods = {"carbm": ["C"]} + var_mods = {} - Args: - seq (str): Peptide in modX format - """ - seq = seq.replace("carbmC", "C+57.021") # Fixed modification - seq = seq.replace("oxM", "M+15.995") - seq = seq.replace("dN", "N+0.984") - seq = seq.replace("dQ", "Q+0.984") - seq = seq.replace("ace-", "+42.011") - seq = seq.replace("carbnh3x-", "+43.006-17.027") - seq = seq.replace("carb-", "+43.006") - seq = seq.replace("nh3x-", "-17.027") - return seq + if not allowed_mods: + return fixed_mods, var_mods + for mod in allowed_mods.split(","): + if mod == "M+15.995": + if "ox" not in var_mods: + var_mods["ox"] = [] + var_mods["ox"].append("M") + elif mod == "N+0.984": + if "d" not in var_mods: + var_mods["d"] = [] + var_mods["d"].append("N") + elif mod == "Q+0.984": + if "d" not in var_mods: + var_mods["d"] = [] + var_mods["d"].append("Q") + elif mod == "+42.011": + var_mods["ace-"] = True + elif mod == "+43.006": + var_mods["carb-"] = True + elif mod == "-17.027": + var_mods["nh3x-"] = True + elif mod == "+43.006-17.027": + var_mods["carbnh3x-"] = True + else: + logger.error("Modification %s not recognized.", mod) + raise ValueError(f"Modification {mod} not recognized.") + + return fixed_mods, var_mods diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 2d9e200b..a6ab8ddc 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -1,14 +1,14 @@ """Data loaders for the de novo sequencing task.""" import functools -import os import logging +import os from typing import List, Optional, Tuple -from depthcharge.data import AnnotatedSpectrumIndex import lightning.pytorch as pl import numpy as np import torch +from depthcharge.data import AnnotatedSpectrumIndex from ..data import db_utils from ..data.datasets import ( @@ -89,7 +89,7 @@ def __init__( self.train_dataset = None self.valid_dataset = None self.test_dataset = None - self.pdb = None + self.protein_database = None def setup(self, stage: str = None, annotated: bool = True) -> None: """ @@ -187,7 +187,9 @@ def db_dataloader(self) -> torch.utils.data.DataLoader: return torch.utils.data.DataLoader( self.test_dataset, batch_size=self.eval_batch_size, - collate_fn=functools.partial(prepare_psm_batch, pdb=self.pdb), + collate_fn=functools.partial( + prepare_psm_batch, protein_database=self.protein_database + ), pin_memory=True, num_workers=self.n_workers, shuffle=False, @@ -235,8 +237,8 @@ def prepare_batch( def prepare_psm_batch( batch: List[Tuple[torch.Tensor, float, int, str]], - pdb: db_utils.ProteinDatabase, -): + protein_database: db_utils.ProteinDatabase, +) -> Tuple[torch.Tensor, torch.Tensor, np.ndarray, List[str], List[str]]: """ Collate MS/MS spectra into a batch for DB search. @@ -249,7 +251,7 @@ def prepare_psm_batch( A batch of data from an AnnotatedSpectrumDataset, consisting of for each spectrum (i) a tensor with the m/z and intensity peak values, (ii), the precursor m/z, (iii) the precursor charge, (iv) the spectrum identifier. - pdb : db_utils.ProteinDatabase + protein_database : db_utils.ProteinDatabase The protein database to use for candidate peptide retrieval. 
Returns @@ -283,9 +285,9 @@ def prepare_psm_batch( all_peptides = [] all_proteins = [] for idx in range(len(batch)): - digest_data = pdb.get_candidates( - float(precursor_mzs[idx]), - float(precursor_charges[idx]), + digest_data = protein_database.get_candidates( + precursor_mzs[idx].type(torch.float64).item(), + precursor_charges[idx].type(torch.int64).item(), ) try: spec_peptides, pep_protein = digest_data diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 79848682..b38a27c0 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -991,7 +991,7 @@ def configure_optimizers( class DbSpec2Pep(Spec2Pep): """ - Subclass of Spec2Pep for the use of Casanovo as an \ + Subclass of Spec2Pep for the use of Casanovo as an MS/MS database search score function. Uses teacher forcing to 'query' Casanovo for its score for each AA diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 789c960b..6928560d 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -114,7 +114,7 @@ def db_search( self.writer.set_ms_run(test_index.ms_files) self.initialize_data_module(test_index=test_index) - self.loaders.pdb = db_utils.ProteinDatabase( + self.loaders.protein_database = db_utils.ProteinDatabase( fasta_path, self.config.enzyme, self.config.digestion, diff --git a/tests/conftest.py b/tests/conftest.py index 452316c8..90e522fe 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -28,7 +28,7 @@ def tiny_fasta_file(tmp_path): @pytest.fixture -def mgf_db_search(tmp_path): +def mgf_medium(tmp_path): """An MGF file with 7 spectra and scan numbers, C+57.021 mass modification considered""" peptides = [ "ATSIPAR", @@ -40,10 +40,10 @@ def mgf_db_search(tmp_path): "FSGSGSGTDFTLTISSLQPEDFAVYYCQQDYNLP", ] mgf_file = tmp_path / "db_search.mgf" - return _create_mgf(peptides, mgf_file, c_mod=True) + return _create_mgf(peptides, mgf_file, mod_aa_mass={"C": 160.030649}) -def _create_mgf(peptides, mgf_file, random_state=42, c_mod=False): +def _create_mgf(peptides, mgf_file, random_state=42, mod_aa_mass=None): """ Create a fake MGF file from one or more peptides. @@ -55,9 +55,9 @@ def _create_mgf(peptides, mgf_file, random_state=42, c_mod=False): The MGF file to create. random_state : int or numpy.random.Generator, optional The random seed. The charge states are chosen to be 2 or 3 randomly. - c_mod : bool, optional - Whether to use the constant carbamidomethylation - of C in mass calculations. + mod_aa_mass : dict, optional + A dictionary that specifies the modified masses of amino acids. + e.g. {"C": 160.030649} for carbamidomethylated C. Returns ------- @@ -65,7 +65,7 @@ def _create_mgf(peptides, mgf_file, random_state=42, c_mod=False): """ rng = np.random.default_rng(random_state) entries = [ - _create_mgf_entry(p, rng.choice([2, 3]), c_mod) for p in peptides + _create_mgf_entry(p, rng.choice([2, 3]), mod_aa_mass) for p in peptides ] with mgf_file.open("w+") as mgf_ref: mgf_ref.write("\n".join(entries)) @@ -73,7 +73,7 @@ def _create_mgf(peptides, mgf_file, random_state=42, c_mod=False): return mgf_file -def _create_mgf_entry(peptide, charge=2, c_mod=False): +def _create_mgf_entry(peptide, charge=2, mod_aa_mass=None): """ Create a MassIVE-KB style MGF entry for a single PSM. @@ -83,20 +83,19 @@ def _create_mgf_entry(peptide, charge=2, c_mod=False): A peptide sequence. charge : int, optional The peptide charge state. - c_mod : bool, optional - Whether to use the constant carbamidomethylation - of C in mass calculations. 
+ mod_aa_mass : dict, optional + A dictionary that specifies the modified masses of amino acids. Returns ------- str The PSM entry in an MGF file format. """ - if not c_mod: + if mod_aa_mass is None: precursor_mz = calculate_mass(peptide, charge=int(charge)) else: aa_mass = std_aa_mass - aa_mass.update({"C": 160.030649}) # Carbamidomethylated C mass + aa_mass.update(mod_aa_mass) precursor_mz = fast_mass(peptide, charge=int(charge), aa_mass=aa_mass) mzs, intensities = _peptide_to_peaks(peptide, charge) frags = "\n".join([f"{m} {i}" for m, i in zip(mzs, intensities)]) diff --git a/tests/test_integration.py b/tests/test_integration.py index 61f735c3..4275d792 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -8,7 +8,7 @@ def test_db_search( - mgf_db_search, tiny_fasta_file, tiny_config, tmp_path, monkeypatch + mgf_medium, tiny_fasta_file, tiny_config, tmp_path, monkeypatch ): # Run a command: monkeypatch.setattr(casanovo, "__version__", "4.1.0") @@ -24,7 +24,7 @@ def test_db_search( tiny_config, "--output", str(output_path), - str(mgf_db_search), + str(mgf_medium), str(tiny_fasta_file), ] diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index a31e2024..51d9a3c9 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -286,13 +286,13 @@ def test_digest_fasta_cleave(tiny_fasta_file): max_peptide_len=50, max_mods=0, precursor_tolerance=20, - isotope_error=[0], + isotope_error=[0, 0], allowed_mods=( "M+15.995,N+0.984,Q+0.984," "+42.011,+43.006,-17.027,+43.006-17.027" ), ) - peptide_list = list(pdb.digest["peptide"]) + peptide_list = list(pdb.db_peptides["peptide"]) assert peptide_list == expected @@ -359,13 +359,13 @@ def test_digest_fasta_mods(tiny_fasta_file): max_peptide_len=50, max_mods=1, precursor_tolerance=20, - isotope_error=[0], + isotope_error=[0, 0], allowed_mods=( "M+15.995,N+0.984,Q+0.984," "+42.011,+43.006,-17.027,+43.006-17.027" ), ) - peptide_list = list(pdb.digest["peptide"]) + peptide_list = list(pdb.db_peptides["peptide"]) peptide_list = [ x for x in peptide_list @@ -396,13 +396,13 @@ def test_length_restrictions(tiny_fasta_file): max_peptide_len=50, max_mods=0, precursor_tolerance=20, - isotope_error=[0], + isotope_error=[0, 0], allowed_mods=( "M+15.995,N+0.984,Q+0.984," "+42.011,+43.006,-17.027,+43.006-17.027" ), ) - peptide_list = list(pdb.digest["peptide"]) + peptide_list = list(pdb.db_peptides["peptide"]) assert peptide_list == expected_long pdb = db_utils.ProteinDatabase( @@ -414,13 +414,13 @@ def test_length_restrictions(tiny_fasta_file): max_peptide_len=8, max_mods=0, precursor_tolerance=20, - isotope_error=[0], + isotope_error=[0, 0], allowed_mods=( "M+15.995,N+0.984,Q+0.984," "+42.011,+43.006,-17.027,+43.006-17.027" ), ) - peptide_list = list(pdb.digest["peptide"]) + peptide_list = list(pdb.db_peptides["peptide"]) assert peptide_list == expected_short @@ -448,13 +448,13 @@ def test_digest_fasta_enzyme(tiny_fasta_file): max_peptide_len=50, max_mods=0, precursor_tolerance=20, - isotope_error=[0], + isotope_error=[0, 0], allowed_mods=( "M+15.995,N+0.984,Q+0.984," "+42.011,+43.006,-17.027,+43.006-17.027" ), ) - peptide_list = list(pdb.digest["peptide"]) + peptide_list = list(pdb.db_peptides["peptide"]) assert peptide_list == expected_argc pdb = db_utils.ProteinDatabase( @@ -466,13 +466,13 @@ def test_digest_fasta_enzyme(tiny_fasta_file): max_peptide_len=50, max_mods=0, precursor_tolerance=20, - isotope_error=[0], + isotope_error=[0, 0], allowed_mods=( "M+15.995,N+0.984,Q+0.984," 
"+42.011,+43.006,-17.027,+43.006-17.027" ), ) - peptide_list = list(pdb.digest["peptide"]) + peptide_list = list(pdb.db_peptides["peptide"]) assert peptide_list == expected_aspn @@ -495,7 +495,7 @@ def test_get_candidates(tiny_fasta_file): max_peptide_len=50, max_mods=0, precursor_tolerance=10000, - isotope_error=[0], + isotope_error=[0, 0], allowed_mods=( "M+15.995,N+0.984,Q+0.984," "+42.011,+43.006,-17.027,+43.006-17.027" @@ -513,7 +513,7 @@ def test_get_candidates(tiny_fasta_file): max_peptide_len=50, max_mods=0, precursor_tolerance=150000, - isotope_error=[0], + isotope_error=[0, 0], allowed_mods=( "M+15.995,N+0.984,Q+0.984," "+42.011,+43.006,-17.027,+43.006-17.027" @@ -531,7 +531,7 @@ def test_get_candidates(tiny_fasta_file): max_peptide_len=50, max_mods=0, precursor_tolerance=600000, - isotope_error=[0], + isotope_error=[0, 0], allowed_mods=( "M+15.995,N+0.984,Q+0.984," "+42.011,+43.006,-17.027,+43.006-17.027" @@ -584,9 +584,8 @@ def test_get_candidates_isotope_error(tiny_fasta_file): peptide_list.sort_values("calc_mass", inplace=True) expected_isotope0 = list("UTSRQPONMLKJIHGFEDCB") - expected_isotope1 = list("VUTSRQPONMLKJIHGFEDC") - expected_isotope2 = list("WVUTSRQPONMLKJIHGFED") - expected_isotope3 = list("XWVUTSRQPONMLKJIHGFE") + expected_isotope01 = list("VUTSRQPONMLKJIHGFEDCB") + expected_isotope012 = list("WVUTSRQPONMLKJIHGFEDCB") expected_isotope0123 = list("XWVUTSRQPONMLKJIHGFEDCB") pdb = db_utils.ProteinDatabase( @@ -598,13 +597,13 @@ def test_get_candidates_isotope_error(tiny_fasta_file): max_peptide_len=0, max_mods=0, precursor_tolerance=10000, - isotope_error=[0], + isotope_error=[0, 0], allowed_mods=( "M+15.995,N+0.984,Q+0.984," "+42.011,+43.006,-17.027,+43.006-17.027" ), ) - pdb.digest = peptide_list + pdb.db_peptides = peptide_list candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_isotope0 == candidates @@ -617,15 +616,15 @@ def test_get_candidates_isotope_error(tiny_fasta_file): max_peptide_len=0, max_mods=0, precursor_tolerance=10000, - isotope_error=[1], + isotope_error=[0, 1], allowed_mods=( "M+15.995,N+0.984,Q+0.984," "+42.011,+43.006,-17.027,+43.006-17.027" ), ) - pdb.digest = peptide_list + pdb.db_peptides = peptide_list candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) - assert expected_isotope1 == candidates + assert expected_isotope01 == candidates pdb = db_utils.ProteinDatabase( fasta_path=str(tiny_fasta_file), @@ -636,15 +635,15 @@ def test_get_candidates_isotope_error(tiny_fasta_file): max_peptide_len=0, max_mods=0, precursor_tolerance=10000, - isotope_error=[2], + isotope_error=[0, 2], allowed_mods=( "M+15.995,N+0.984,Q+0.984," "+42.011,+43.006,-17.027,+43.006-17.027" ), ) - pdb.digest = peptide_list + pdb.db_peptides = peptide_list candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) - assert expected_isotope2 == candidates + assert expected_isotope012 == candidates pdb = db_utils.ProteinDatabase( fasta_path=str(tiny_fasta_file), @@ -655,32 +654,13 @@ def test_get_candidates_isotope_error(tiny_fasta_file): max_peptide_len=0, max_mods=0, precursor_tolerance=10000, - isotope_error=[3], + isotope_error=[0, 3], allowed_mods=( "M+15.995,N+0.984,Q+0.984," "+42.011,+43.006,-17.027,+43.006-17.027" ), ) - pdb.digest = peptide_list - candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) - assert expected_isotope3 == candidates - - pdb = db_utils.ProteinDatabase( - fasta_path=str(tiny_fasta_file), - enzyme="trypsin", - digestion="full", - missed_cleavages=0, - min_peptide_len=0, - 
max_peptide_len=0, - max_mods=0, - precursor_tolerance=10000, - isotope_error=[0, 1, 2, 3], - allowed_mods=( - "M+15.995,N+0.984,Q+0.984," - "+42.011,+43.006,-17.027,+43.006-17.027" - ), - ) - pdb.digest = peptide_list + pdb.db_peptides = peptide_list candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_isotope0123 == candidates From 0dfdb2cb89514a0189e20cf19c231363567a7c72 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Mon, 2 Sep 2024 17:48:31 -0700 Subject: [PATCH 15/21] final adjustments added --- casanovo/config.yaml | 19 ++-- casanovo/data/db_utils.py | 158 +++++++++++++++++++------------- casanovo/denovo/dataloaders.py | 7 +- casanovo/denovo/model.py | 4 +- casanovo/denovo/model_runner.py | 17 ++-- tests/conftest.py | 43 ++++++++- tests/unit_tests/test_unit.py | 151 +++++++++++++++++++----------- 7 files changed, 254 insertions(+), 145 deletions(-) diff --git a/casanovo/config.yaml b/casanovo/config.yaml index 6c9063f5..af2f79d1 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -17,6 +17,8 @@ precursor_mass_tol: 50 # ppm isotope_error_range: [0, 1] # The minimum length of considered peptides. min_peptide_len: 6 +# The maximum length of considered peptides. +max_length: 100 # Number of spectra in one inference batch. predict_batch_size: 1024 @@ -47,19 +49,20 @@ devices: enzyme: "trypsin" # Digestion type for candidate peptide generation. # full: standard digestion. semi: Include products of semi-specific cleavage. +# Can also take a regex expression to specify custom digestion rules. digestion: "full" # Number of allowed missed cleavages when digesting protein. missed_cleavages: 0 # Maximum number of amino acid modifications per peptide, # None generates all possible isoforms as candidates. -max_mods: 0 -# Maximum peptide length to consider. -max_peptide_len: 50 +max_mods: 1 # Select which modifications from the vocabulary can be used in candidate creation. -# Permanent fixed mod (don't include): C+57.021 -# Allowed variable mods: M+15.995, N+0.984, Q+0.984 -# Allowed N-terminal mods: +42.011, +43.006, -17.027, +43.006-17.027 -allowed_mods: "M+15.995,N+0.984,Q+0.984,+42.011,+43.006,-17.027,+43.006-17.027" +# Format: Comma-separated list of "aa:mod_residue", +# where aa is a standard amino acid or "X" for an N-terminal mod +# and mod_residue is a key from the "residues" dictionary. +# Example: "M:M+15.995,X:+43.006-17.027" +allowed_fixed_mods: "C:C+57.021" +allowed_var_mods: "M:M+15.995,N:N+0.984,Q:Q+0.984,X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" ### @@ -111,8 +114,6 @@ dropout: 0.0 # Number of dimensions to use for encoding peak intensity. # Projected up to `dim_model` by default and summed with the peak m/z encoding. dim_intensity: -# Max decoded peptide length. -max_length: 100 # The number of iterations for the linear warm-up of the learning rate. warmup_iters: 100_000 # The number of iterations for the cosine half period of the learning rate. 
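For readers unfamiliar with pyteomics, the digestion-related settings above (enzyme, digestion, missed_cleavages) feed directly into the pyteomics calls used by the db_utils.py changes that follow. A minimal, illustrative sketch of that behaviour (not part of the patch; it assumes a pyteomics version whose parser.cleave accepts the "semi" keyword and uses a made-up example sequence):

    from pyteomics import parser

    protein = "MEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGER"  # example sequence only

    # digestion: "full" -- standard tryptic peptides
    full = parser.cleave(
        protein,
        rule=parser.expasy_rules["trypsin"],
        missed_cleavages=0,
    )

    # digestion: "partial" -- also keep semi-specific cleavage products
    semi = parser.cleave(
        protein,
        rule=parser.expasy_rules["trypsin"],
        missed_cleavages=0,
        semi=True,
    )

    # With the standard trypsin rule this prints
    # ['EIVMTQSPPTLSLSPGER', 'MEAPAQLLFLLLLWLPDTTR'].
    print(sorted(full))
    # The "partial" setting can only add peptides on top of the fully
    # specific ones.
    print(len(full), len(semi))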
diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index c1d5e91e..c9201538 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -1,13 +1,17 @@ """Unique methods used within db-search mode""" +import functools import logging import os +import re +import string from typing import List, Tuple import depthcharge.masses import pandas as pd +import pyteomics.fasta as fasta +import pyteomics.parser as parser from numba import njit -from pyteomics import fasta, parser logger = logging.getLogger("casanovo") @@ -41,8 +45,12 @@ class ProteinDatabase: The precursor mass tolerance in ppm. isotope_error : Tuple[int, int] Isotope range [min, max] to consider when comparing predicted and observed precursor m/z's. - allowed_mods : str - A comma separated string of allowed modifications to consider. + allowed_fixed_mods : str + A comma separated string of fixed modifications to consider. + allowed_var_mods : str + A comma separated string of variable modifications to consider. + residues : dict + A dictionary of amino acid masses. """ def __init__( @@ -56,9 +64,14 @@ def __init__( max_mods: int, precursor_tolerance: float, isotope_error: Tuple[int, int], - allowed_mods: str, + allowed_fixed_mods: str, + allowed_var_mods: str, + residues: dict, ): - self.fixed_mods, self.var_mods = _construct_mods_dict(allowed_mods) + self.residues = residues + self.fixed_mods, self.var_mods, self.swap_map = _construct_mods_dict( + allowed_fixed_mods, allowed_var_mods + ) self.db_peptides = self._digest_fasta( fasta_path, enzyme, @@ -88,20 +101,22 @@ def get_candidates( Returns ------- - candidates : List[Tuple[str, str]] - A list of candidate peptides and associated - protein. + candidates : pd.Series + A series of candidate peptides. """ candidates = [] for e in range(self.isotope_error[0], self.isotope_error[1] + 1): iso_shift = ISOTOPE_SPACING * e - upper_bound = float( - _to_raw_mass(precursor_mz, charge) - iso_shift - ) * (1 + (self.precursor_tolerance / 1e6)) - lower_bound = float( + shift_raw_mass = float( _to_raw_mass(precursor_mz, charge) - iso_shift - ) * (1 - (self.precursor_tolerance / 1e6)) + ) + upper_bound = shift_raw_mass * ( + 1 + (self.precursor_tolerance / 1e6) + ) + lower_bound = shift_raw_mass * ( + 1 - (self.precursor_tolerance / 1e6) + ) window = self.db_peptides[ (self.db_peptides["calc_mass"] >= lower_bound) @@ -112,7 +127,25 @@ def get_candidates( candidates = pd.concat(candidates) candidates.drop_duplicates(inplace=True) candidates.sort_values(by=["calc_mass", "peptide"], inplace=True) - return list(candidates["peptide"]), list(candidates["protein"]) + return candidates["peptide"], candidates["protein"] + + def get_associated_protein(self, peptide: str) -> str: + """ + Returns the associated protein for a given peptide. + + Parameters + ---------- + peptide : str + The peptide sequence. + + Returns + ------- + protein : str + The associated protein. + """ + return self.db_peptides[self.db_peptides["peptide"] == peptide][ + "protein" + ].values[0] def _digest_fasta( self, @@ -161,16 +194,18 @@ def _digest_fasta( logger.error("Digestion type %s not recognized.", digestion) raise ValueError(f"Digestion type {digestion} not recognized.") if enzyme not in parser.expasy_rules: - logger.error( - "Enzyme %s not recognized. Must be in pyteomics.parser.expasy_rules", + logger.info( + "Enzyme %s not recognized. 
Interpreting as cleavage rule.", enzyme, ) - raise ValueError(f"Enzyme {enzyme} not recognized.") semi = digestion == "partial" + valid_aa = set( + [re.sub(r"[^A-Z]+", "", res) for res in self.residues.keys()] + ) for header, seq in fasta.read(fasta_filename): pep_set = parser.cleave( seq, - rule=parser.expasy_rules[enzyme], + rule=enzyme, missed_cleavages=missed_cleavages, semi=semi, ) @@ -181,9 +216,8 @@ def _digest_fasta( or len(pep) > max_peptide_length ): continue - if any( - aa in pep for aa in "BJOUXZ" - ): # Check for incorrect AA letters + + if any(aa not in valid_aa for aa in pep): logger.warn( "Skipping peptide with unknown amino acids: %s", pep ) @@ -207,7 +241,10 @@ def _digest_fasta( mod_peptide_list = [ (mod_pep, mass_calculator.mass(mod_pep), prot) for isos, prot in peptide_isoforms - for mod_pep in map(_convert_from_modx, isos) + for mod_pep in map( + functools.partial(_convert_from_modx, swap_map=self.swap_map), + isos, + ) ] # Create a DataFrame for easy sorting and filtering pep_table = pd.DataFrame( @@ -261,31 +298,29 @@ def _to_raw_mass(mz_mass, charge): return charge * (mz_mass - PROTON) -def _convert_from_modx(seq: str): +def _convert_from_modx(seq: str, swap_map: dict) -> str: """Converts peptide sequence from modX format to Casanovo-acceptable modifications. Args: - seq (str): Peptide in modX format + seq : str + Peptide in modX format + swap_map : dict + Dictionary that allows for swapping of modX to Casanovo-acceptable modifications. """ - seq = seq.replace("carbmC", "C+57.021") # Fixed modification - seq = seq.replace("oxM", "M+15.995") - seq = seq.replace("dN", "N+0.984") - seq = seq.replace("dQ", "Q+0.984") - seq = seq.replace("ace-", "+42.011") - seq = seq.replace("carbnh3x-", "+43.006-17.027") - seq = seq.replace("carb-", "+43.006") - seq = seq.replace("nh3x-", "-17.027") - return seq - - -def _construct_mods_dict(allowed_mods): + regex = re.compile("(%s)" % "|".join(map(re.escape, swap_map.keys()))) + return regex.sub(lambda x: swap_map[x.group()], seq) + + +def _construct_mods_dict(allowed_fixed_mods, allowed_var_mods): """ Constructs dictionaries of fixed and variable modifications. Parameters ---------- - allowed_mods : str - A comma-separated list of allowed modifications. + allowed_fixed_mods : str + A comma separated string of fixed modifications to consider. + allowed_var_mods : str + A comma separated string of variable modifications to consider. Returns ------- @@ -293,35 +328,26 @@ def _construct_mods_dict(allowed_mods): A dictionary of fixed modifications. var_mods : dict A dictionary of variable modifications. + swap_map : dict + A dictionary that allows for swapping of modX to Casanovo-acceptable modifications. 
""" - fixed_mods = {"carbm": ["C"]} - var_mods = {} + swap_map = {} + fixed_mods = {} + for idx, mod in enumerate(allowed_fixed_mods.split(",")): + aa, mod_aa = mod.split(":") + mod_id = string.ascii_lowercase[idx] + fixed_mods[mod_id] = [aa] + swap_map[f"{mod_id}{aa}"] = f"{mod_aa}" - if not allowed_mods: - return fixed_mods, var_mods - for mod in allowed_mods.split(","): - if mod == "M+15.995": - if "ox" not in var_mods: - var_mods["ox"] = [] - var_mods["ox"].append("M") - elif mod == "N+0.984": - if "d" not in var_mods: - var_mods["d"] = [] - var_mods["d"].append("N") - elif mod == "Q+0.984": - if "d" not in var_mods: - var_mods["d"] = [] - var_mods["d"].append("Q") - elif mod == "+42.011": - var_mods["ace-"] = True - elif mod == "+43.006": - var_mods["carb-"] = True - elif mod == "-17.027": - var_mods["nh3x-"] = True - elif mod == "+43.006-17.027": - var_mods["carbnh3x-"] = True + var_mods = {} + for idx, mod in enumerate(allowed_var_mods.split(",")): + aa, mod_aa = mod.split(":") + mod_id = string.ascii_lowercase[idx] + if aa == "X": + var_mods[f"{mod_id}-"] = True + swap_map[f"{mod_id}-"] = f"{mod_aa}" else: - logger.error("Modification %s not recognized.", mod) - raise ValueError(f"Modification {mod} not recognized.") + var_mods[mod_id] = [aa] + swap_map[f"{mod_id}{aa}"] = f"{mod_aa}" - return fixed_mods, var_mods + return fixed_mods, var_mods, swap_map diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index a6ab8ddc..6e8c93b3 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -267,7 +267,7 @@ def prepare_psm_batch( all_peptides : List[str] The candidate peptides for each spectrum. all_proteins : List[str] - The associated proteins for each candidate peptide. + The proteins associated with each candidate peptide. """ spectra, precursor_mzs, precursor_charges, spectrum_ids = list(zip(*batch)) spectra = torch.nn.utils.rnn.pad_sequence(spectra, batch_first=True) @@ -285,12 +285,11 @@ def prepare_psm_batch( all_peptides = [] all_proteins = [] for idx in range(len(batch)): - digest_data = protein_database.get_candidates( + spec_peptides, spec_proteins = protein_database.get_candidates( precursor_mzs[idx].type(torch.float64).item(), precursor_charges[idx].type(torch.int64).item(), ) try: - spec_peptides, pep_protein = digest_data all_spectra.append( spectra[idx].unsqueeze(0).repeat(len(spec_peptides), 1, 1) ) @@ -299,7 +298,7 @@ def prepare_psm_batch( ) all_spectrum_ids.extend([spectrum_ids[idx]] * len(spec_peptides)) all_peptides.extend(spec_peptides) - all_proteins.extend(pep_protein) + all_proteins.extend(spec_proteins) except ValueError: logger.warning( "No candidates found for spectrum %s", spectrum_ids[idx] diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index b38a27c0..dc7e5f7b 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -1017,9 +1017,9 @@ def predict_step(self, batch, *args): Parameters ---------- - batch : Tuple[torch.Tensor, torch.Tensor, np.array, List[str], List[str]] + batch : Tuple[torch.Tensor, torch.Tensor, np.array, List[str]] A batch of (i) MS/MS spectra, (ii) precursor information, (iii) - spectrum identifiers, (iv) candidate peptides, (v) associated proteins. + spectrum identifiers, (iv) candidate peptides, (v) associated protein. 
Returns ------- diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 6928560d..395320e5 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -110,22 +110,25 @@ def db_search( self.initialize_model(train=False, db_search=True) self.model.out_writer = self.writer self.model.psm_batch_size = self.config.predict_batch_size - test_index = self._get_index(peak_path, False, "db search") - self.writer.set_ms_run(test_index.ms_files) - - self.initialize_data_module(test_index=test_index) - self.loaders.protein_database = db_utils.ProteinDatabase( + self.model.protein_database = db_utils.ProteinDatabase( fasta_path, self.config.enzyme, self.config.digestion, self.config.missed_cleavages, self.config.min_peptide_len, - self.config.max_peptide_len, + self.config.max_length, self.config.max_mods, self.config.precursor_mass_tol, self.config.isotope_error_range, - self.config.allowed_mods, + self.config.allowed_fixed_mods, + self.config.allowed_var_mods, + self.config.residues, ) + test_index = self._get_index(peak_path, False, "db search") + self.writer.set_ms_run(test_index.ms_files) + + self.initialize_data_module(test_index=test_index) + self.loaders.protein_database = self.model.protein_database self.loaders.setup(stage="test", annotated=False) self.trainer.predict(self.model, self.loaders.db_dataloader()) diff --git a/tests/conftest.py b/tests/conftest.py index 90e522fe..3b94896a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -241,7 +241,7 @@ def tiny_config(tmp_path): "precursor_mass_tol": 5, "isotope_error_range": [0, 1], "min_peptide_len": 6, - "max_peptide_len": 50, + "max_length": 100, "enzyme": "trypsin", "digestion": "full", "missed_cleavages": 0, @@ -263,7 +263,6 @@ def tiny_config(tmp_path): "dim_model": 512, "dropout": 0.0, "dim_intensity": None, - "max_length": 100, "learning_rate": 5e-4, "weight_decay": 1e-5, "train_batch_size": 32, @@ -298,9 +297,10 @@ def tiny_config(tmp_path): "-17.027": -17.026549, "+43.006-17.027": 25.980265, }, - "allowed_mods": ( - "M+15.995,N+0.984,Q+0.984," - "+42.011,+43.006,-17.027,+43.006-17.027" + "allowed_fixed_mods": "C:C+57.021", + "allowed_var_mods": ( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" ), } @@ -311,6 +311,39 @@ def tiny_config(tmp_path): return cfg_file +@pytest.fixture +def residues_dict(): + return { + "G": 57.021464, + "A": 71.037114, + "S": 87.032028, + "P": 97.052764, + "V": 99.068414, + "T": 101.047670, + "C+57.021": 160.030649, + "L": 113.084064, + "I": 113.084064, + "N": 114.042927, + "D": 115.026943, + "Q": 128.058578, + "K": 128.094963, + "E": 129.042593, + "M": 131.040485, + "H": 137.058912, + "F": 147.068414, + "R": 156.101111, + "Y": 163.063329, + "W": 186.079313, + "M+15.995": 147.035400, + "N+0.984": 115.026943, + "Q+0.984": 129.042594, + "+42.011": 42.010565, + "+43.006": 43.005814, + "-17.027": -17.026549, + "+43.006-17.027": 25.980265, + } + + @pytest.fixture def tide_dir_small(tmp_path): """A directory with a very small TIDE search result.""" diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 51d9a3c9..c06ec788 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -221,7 +221,7 @@ def test_calc_match_score(): assert np.sum(masked_per_aa_scores.numpy()[3]) == 3 -def test_digest_fasta_cleave(tiny_fasta_file): +def test_digest_fasta_cleave(tiny_fasta_file, residues_dict): # No missed cleavages expected_normal = [ @@ -287,16 +287,18 @@ def 
test_digest_fasta_cleave(tiny_fasta_file): max_mods=0, precursor_tolerance=20, isotope_error=[0, 0], - allowed_mods=( - "M+15.995,N+0.984,Q+0.984," - "+42.011,+43.006,-17.027,+43.006-17.027" + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" ), + residues=residues_dict, ) peptide_list = list(pdb.db_peptides["peptide"]) assert peptide_list == expected -def test_digest_fasta_mods(tiny_fasta_file): +def test_digest_fasta_mods(tiny_fasta_file, residues_dict): # 1 modification allowed # fixed: C+57.02146 # variable: 1M+15.994915,1N+0.984016,1Q+0.984016 @@ -360,10 +362,12 @@ def test_digest_fasta_mods(tiny_fasta_file): max_mods=1, precursor_tolerance=20, isotope_error=[0, 0], - allowed_mods=( - "M+15.995,N+0.984,Q+0.984," - "+42.011,+43.006,-17.027,+43.006-17.027" + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" ), + residues=residues_dict, ) peptide_list = list(pdb.db_peptides["peptide"]) peptide_list = [ @@ -376,7 +380,7 @@ def test_digest_fasta_mods(tiny_fasta_file): assert peptide_list == expected_1mod -def test_length_restrictions(tiny_fasta_file): +def test_length_restrictions(tiny_fasta_file, residues_dict): # length between 20 and 50 expected_long = [ "MEAPAQLLFLLLLWLPDTTR", @@ -397,10 +401,12 @@ def test_length_restrictions(tiny_fasta_file): max_mods=0, precursor_tolerance=20, isotope_error=[0, 0], - allowed_mods=( - "M+15.995,N+0.984,Q+0.984," - "+42.011,+43.006,-17.027,+43.006-17.027" + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" ), + residues=residues_dict, ) peptide_list = list(pdb.db_peptides["peptide"]) assert peptide_list == expected_long @@ -415,16 +421,18 @@ def test_length_restrictions(tiny_fasta_file): max_mods=0, precursor_tolerance=20, isotope_error=[0, 0], - allowed_mods=( - "M+15.995,N+0.984,Q+0.984," - "+42.011,+43.006,-17.027,+43.006-17.027" + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" ), + residues=residues_dict, ) peptide_list = list(pdb.db_peptides["peptide"]) assert peptide_list == expected_short -def test_digest_fasta_enzyme(tiny_fasta_file): +def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): # arg-c enzyme expected_argc = [ "ATSIPAR", @@ -449,10 +457,12 @@ def test_digest_fasta_enzyme(tiny_fasta_file): max_mods=0, precursor_tolerance=20, isotope_error=[0, 0], - allowed_mods=( - "M+15.995,N+0.984,Q+0.984," - "+42.011,+43.006,-17.027,+43.006-17.027" + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" ), + residues=residues_dict, ) peptide_list = list(pdb.db_peptides["peptide"]) assert peptide_list == expected_argc @@ -467,16 +477,39 @@ def test_digest_fasta_enzyme(tiny_fasta_file): max_mods=0, precursor_tolerance=20, isotope_error=[0, 0], - allowed_mods=( - "M+15.995,N+0.984,Q+0.984," - "+42.011,+43.006,-17.027,+43.006-17.027" + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" ), + residues=residues_dict, ) peptide_list = list(pdb.db_peptides["peptide"]) assert peptide_list == expected_aspn + # Tesr regex rule instead of named enzyme + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), 
+ enzyme="R", + digestion="full", + missed_cleavages=0, + min_peptide_len=6, + max_peptide_len=50, + max_mods=0, + precursor_tolerance=20, + isotope_error=[0, 0], + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + ), + residues=residues_dict, + ) + peptide_list = list(pdb.db_peptides["peptide"]) + assert peptide_list == expected_argc + -def test_get_candidates(tiny_fasta_file): +def test_get_candidates(tiny_fasta_file, residues_dict): # precursor_window is 10000 expected_smallwindow = ["LLIYGASTR"] @@ -496,13 +529,15 @@ def test_get_candidates(tiny_fasta_file): max_mods=0, precursor_tolerance=10000, isotope_error=[0, 0], - allowed_mods=( - "M+15.995,N+0.984,Q+0.984," - "+42.011,+43.006,-17.027,+43.006-17.027" + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" ), + residues=residues_dict, ) candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) - assert expected_smallwindow == candidates + assert expected_smallwindow == list(candidates) pdb = db_utils.ProteinDatabase( fasta_path=str(tiny_fasta_file), @@ -514,13 +549,15 @@ def test_get_candidates(tiny_fasta_file): max_mods=0, precursor_tolerance=150000, isotope_error=[0, 0], - allowed_mods=( - "M+15.995,N+0.984,Q+0.984," - "+42.011,+43.006,-17.027,+43.006-17.027" + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" ), + residues=residues_dict, ) candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) - assert expected_midwindow == candidates + assert expected_midwindow == list(candidates) pdb = db_utils.ProteinDatabase( fasta_path=str(tiny_fasta_file), @@ -532,16 +569,18 @@ def test_get_candidates(tiny_fasta_file): max_mods=0, precursor_tolerance=600000, isotope_error=[0, 0], - allowed_mods=( - "M+15.995,N+0.984,Q+0.984," - "+42.011,+43.006,-17.027,+43.006-17.027" + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" ), + residues=residues_dict, ) candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) - assert expected_widewindow == candidates + assert expected_widewindow == list(candidates) -def test_get_candidates_isotope_error(tiny_fasta_file): +def test_get_candidates_isotope_error(tiny_fasta_file, residues_dict): # Tide isotope error windows for 496.2, 2+: # 0: [980.481617, 1000.289326] @@ -598,14 +637,16 @@ def test_get_candidates_isotope_error(tiny_fasta_file): max_mods=0, precursor_tolerance=10000, isotope_error=[0, 0], - allowed_mods=( - "M+15.995,N+0.984,Q+0.984," - "+42.011,+43.006,-17.027,+43.006-17.027" + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" ), + residues=residues_dict, ) pdb.db_peptides = peptide_list candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) - assert expected_isotope0 == candidates + assert expected_isotope0 == list(candidates) pdb = db_utils.ProteinDatabase( fasta_path=str(tiny_fasta_file), @@ -617,14 +658,16 @@ def test_get_candidates_isotope_error(tiny_fasta_file): max_mods=0, precursor_tolerance=10000, isotope_error=[0, 1], - allowed_mods=( - "M+15.995,N+0.984,Q+0.984," - "+42.011,+43.006,-17.027,+43.006-17.027" + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + 
"X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" ), + residues=residues_dict, ) pdb.db_peptides = peptide_list candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) - assert expected_isotope01 == candidates + assert expected_isotope01 == list(candidates) pdb = db_utils.ProteinDatabase( fasta_path=str(tiny_fasta_file), @@ -636,14 +679,16 @@ def test_get_candidates_isotope_error(tiny_fasta_file): max_mods=0, precursor_tolerance=10000, isotope_error=[0, 2], - allowed_mods=( - "M+15.995,N+0.984,Q+0.984," - "+42.011,+43.006,-17.027,+43.006-17.027" + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" ), + residues=residues_dict, ) pdb.db_peptides = peptide_list candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) - assert expected_isotope012 == candidates + assert expected_isotope012 == list(candidates) pdb = db_utils.ProteinDatabase( fasta_path=str(tiny_fasta_file), @@ -655,14 +700,16 @@ def test_get_candidates_isotope_error(tiny_fasta_file): max_mods=0, precursor_tolerance=10000, isotope_error=[0, 3], - allowed_mods=( - "M+15.995,N+0.984,Q+0.984," - "+42.011,+43.006,-17.027,+43.006-17.027" + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" ), + residues=residues_dict, ) pdb.db_peptides = peptide_list candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) - assert expected_isotope0123 == candidates + assert expected_isotope0123 == list(candidates) def test_beam_search_decode(): From 4a5b238133aaa1db27f584f52d9328b2f90c35f4 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Tue, 3 Sep 2024 10:29:23 -0700 Subject: [PATCH 16/21] minor changes regarding formatting and small efficiency boosts --- casanovo/config.yaml | 8 +++--- casanovo/data/db_utils.py | 52 ++++++++++++++++++++-------------- casanovo/denovo/dataloaders.py | 13 ++++----- casanovo/denovo/model.py | 2 +- 4 files changed, 42 insertions(+), 33 deletions(-) diff --git a/casanovo/config.yaml b/casanovo/config.yaml index af2f79d1..17cba6a4 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -46,23 +46,23 @@ devices: # Enzyme for in silico digestion, used to generate candidate peptides. # See pyteomics.parser.expasy_rules for valid enzymes. +# Can also take a regex expression to specify custom digestion rules. enzyme: "trypsin" # Digestion type for candidate peptide generation. # full: standard digestion. semi: Include products of semi-specific cleavage. -# Can also take a regex expression to specify custom digestion rules. digestion: "full" # Number of allowed missed cleavages when digesting protein. missed_cleavages: 0 -# Maximum number of amino acid modifications per peptide, +# Maximum number of variable amino acid modifications per peptide, # None generates all possible isoforms as candidates. max_mods: 1 # Select which modifications from the vocabulary can be used in candidate creation. # Format: Comma-separated list of "aa:mod_residue", -# where aa is a standard amino acid or "X" for an N-terminal mod +# where aa is a standard amino acid or "nterm" for an N-terminal mod # and mod_residue is a key from the "residues" dictionary. 
# Example: "M:M+15.995,X:+43.006-17.027" allowed_fixed_mods: "C:C+57.021" -allowed_var_mods: "M:M+15.995,N:N+0.984,Q:Q+0.984,X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" +allowed_var_mods: "M:M+15.995,N:N+0.984,Q:Q+0.984,nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ### diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index c9201538..86c2112d 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -5,6 +5,7 @@ import os import re import string +from collections import defaultdict from typing import List, Tuple import depthcharge.masses @@ -13,6 +14,7 @@ import pyteomics.parser as parser from numba import njit + logger = logging.getLogger("casanovo") # CONSTANTS @@ -72,6 +74,9 @@ def __init__( self.fixed_mods, self.var_mods, self.swap_map = _construct_mods_dict( allowed_fixed_mods, allowed_var_mods ) + self.swap_regex = re.compile( + "(%s)" % "|".join(map(re.escape, self.swap_map.keys())) + ) self.db_peptides = self._digest_fasta( fasta_path, enzyme, @@ -167,6 +172,7 @@ def _digest_fasta( enzyme : str The enzyme to use for digestion. See pyteomics.parser.expasy_rules for valid enzymes. + Can also be a regex pattern. digestion : str The type of digestion to perform. Either 'full' or 'partial'. missed_cleavages : int @@ -199,9 +205,7 @@ def _digest_fasta( enzyme, ) semi = digestion == "partial" - valid_aa = set( - [re.sub(r"[^A-Z]+", "", res) for res in self.residues.keys()] - ) + valid_aa = set(list(self.residues.keys()) + ["C"]) for header, seq in fasta.read(fasta_filename): pep_set = parser.cleave( seq, @@ -212,17 +216,16 @@ def _digest_fasta( protein = header.split()[0] for pep in pep_set: if ( - len(pep) < min_peptide_length - or len(pep) > max_peptide_length + len(pep) >= min_peptide_length + or len(pep) <= max_peptide_length ): - continue - - if any(aa not in valid_aa for aa in pep): - logger.warn( - "Skipping peptide with unknown amino acids: %s", pep - ) - continue - peptide_list.append((pep, protein)) + if any(aa not in valid_aa for aa in pep): + logger.warn( + "Skipping peptide with unknown amino acids: %s", + pep, + ) + else: + peptide_list.append((pep, protein)) # Generate modified peptides mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb") @@ -242,7 +245,11 @@ def _digest_fasta( (mod_pep, mass_calculator.mass(mod_pep), prot) for isos, prot in peptide_isoforms for mod_pep in map( - functools.partial(_convert_from_modx, swap_map=self.swap_map), + functools.partial( + _convert_from_modx, + swap_map=self.swap_map, + swap_regex=self.swap_regex, + ), isos, ) ] @@ -259,7 +266,7 @@ def _digest_fasta( @njit -def _to_mz(precursor_mass, charge): +def _to_mz(precursor_mass: float, charge: int) -> float: """ Convert precursor neutral mass to m/z value. @@ -279,7 +286,7 @@ def _to_mz(precursor_mass, charge): @njit -def _to_raw_mass(mz_mass, charge): +def _to_raw_mass(mz_mass: float, charge: int) -> float: """ Convert precursor m/z value to neutral mass. @@ -298,7 +305,7 @@ def _to_raw_mass(mz_mass, charge): return charge * (mz_mass - PROTON) -def _convert_from_modx(seq: str, swap_map: dict) -> str: +def _convert_from_modx(seq: str, swap_map: dict, swap_regex: str) -> str: """Converts peptide sequence from modX format to Casanovo-acceptable modifications. Args: @@ -306,12 +313,15 @@ def _convert_from_modx(seq: str, swap_map: dict) -> str: Peptide in modX format swap_map : dict Dictionary that allows for swapping of modX to Casanovo-acceptable modifications. + swap_regex : str + Regular expression to match modX format. 
""" - regex = re.compile("(%s)" % "|".join(map(re.escape, swap_map.keys()))) - return regex.sub(lambda x: swap_map[x.group()], seq) + return swap_regex.sub(lambda x: swap_map[x.group()], seq) -def _construct_mods_dict(allowed_fixed_mods, allowed_var_mods): +def _construct_mods_dict( + allowed_fixed_mods: str, allowed_var_mods: str +) -> Tuple[dict, dict, dict]: """ Constructs dictionaries of fixed and variable modifications. @@ -343,7 +353,7 @@ def _construct_mods_dict(allowed_fixed_mods, allowed_var_mods): for idx, mod in enumerate(allowed_var_mods.split(",")): aa, mod_aa = mod.split(":") mod_id = string.ascii_lowercase[idx] - if aa == "X": + if aa == "nterm": var_mods[f"{mod_id}-"] = True swap_map[f"{mod_id}-"] = f"{mod_aa}" else: diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 6e8c93b3..4793e2f3 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -272,11 +272,11 @@ def prepare_psm_batch( spectra, precursor_mzs, precursor_charges, spectrum_ids = list(zip(*batch)) spectra = torch.nn.utils.rnn.pad_sequence(spectra, batch_first=True) - precursor_mzs = torch.tensor(precursor_mzs) - precursor_charges = torch.tensor(precursor_charges) - precursor_masses = (precursor_mzs - 1.007276) * precursor_charges + precursor_mzs_t = torch.tensor(precursor_mzs) + precursor_charges_t = torch.tensor(precursor_charges) + precursor_masses_t = (precursor_mzs_t - 1.007276) * precursor_charges_t precursors = torch.vstack( - [precursor_masses, precursor_charges, precursor_mzs] + [precursor_masses_t, precursor_charges_t, precursor_mzs_t] ).T.float() all_spectra = [] @@ -286,8 +286,8 @@ def prepare_psm_batch( all_proteins = [] for idx in range(len(batch)): spec_peptides, spec_proteins = protein_database.get_candidates( - precursor_mzs[idx].type(torch.float64).item(), - precursor_charges[idx].type(torch.int64).item(), + precursor_mzs[idx], + precursor_charges[idx], ) try: all_spectra.append( @@ -303,7 +303,6 @@ def prepare_psm_batch( logger.warning( "No candidates found for spectrum %s", spectrum_ids[idx] ) - continue return ( torch.cat(all_spectra, dim=0), diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index dc7e5f7b..31757d81 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -1009,7 +1009,7 @@ class DbSpec2Pep(Spec2Pep): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.psm_batch_size = 1024 + self.psm_batch_size = None def predict_step(self, batch, *args): """ From 4352bbdfb41aeeb61675c9a290f7bc83eae2f717 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Tue, 3 Sep 2024 11:24:18 -0700 Subject: [PATCH 17/21] changes before reformatting config --- casanovo/data/db_utils.py | 21 +++++++++++++-------- tests/conftest.py | 2 +- tests/unit_tests/test_unit.py | 28 ++++++++++++++-------------- 3 files changed, 28 insertions(+), 23 deletions(-) diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index 86c2112d..26f7152c 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -77,7 +77,7 @@ def __init__( self.swap_regex = re.compile( "(%s)" % "|".join(map(re.escape, self.swap_map.keys())) ) - self.db_peptides = self._digest_fasta( + self.db_peptides, self.prot_map = self._digest_fasta( fasta_path, enzyme, digestion, @@ -146,11 +146,9 @@ def get_associated_protein(self, peptide: str) -> str: Returns ------- protein : str - The associated protein. + The associated protein(s). 
""" - return self.db_peptides[self.db_peptides["peptide"] == peptide][ - "protein" - ].values[0] + return ",".join(self.prot_map[peptide]) def _digest_fasta( self, @@ -186,9 +184,11 @@ def _digest_fasta( Returns ------- - mod_peptide_list : pd.DataFrame + pep_table : pd.DataFrame A Pandas DataFrame with peptide, mass, and protein columns. Sorted by neutral mass in ascending order. + prot_map : dict + A dictionary mapping peptides to associated proteins. """ # Verify the existence of the file: if not os.path.isfile(fasta_filename): @@ -217,7 +217,7 @@ def _digest_fasta( for pep in pep_set: if ( len(pep) >= min_peptide_length - or len(pep) <= max_peptide_length + and len(pep) <= max_peptide_length ): if any(aa not in valid_aa for aa in pep): logger.warn( @@ -259,10 +259,15 @@ def _digest_fasta( ) pep_table.sort_values(by=["calc_mass", "peptide"], inplace=True) + # Create a dictionary mapping for easy accession of associated proteins + prot_map = defaultdict(list) + for pep, _, prot in mod_peptide_list: + prot_map[pep].append(prot) + logger.info( "Digestion complete. %d peptides generated.", len(pep_table) ) - return pep_table + return pep_table, prot_map @njit diff --git a/tests/conftest.py b/tests/conftest.py index 3b94896a..bf02a3ab 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -300,7 +300,7 @@ def tiny_config(tmp_path): "allowed_fixed_mods": "C:C+57.021", "allowed_var_mods": ( "M:M+15.995,N:N+0.984,Q:Q+0.984," - "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), } diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index c06ec788..d03d6f7f 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -290,7 +290,7 @@ def test_digest_fasta_cleave(tiny_fasta_file, residues_dict): allowed_fixed_mods="C:C+57.021", allowed_var_mods=( "M:M+15.995,N:N+0.984,Q:Q+0.984," - "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), residues=residues_dict, ) @@ -365,7 +365,7 @@ def test_digest_fasta_mods(tiny_fasta_file, residues_dict): allowed_fixed_mods="C:C+57.021", allowed_var_mods=( "M:M+15.995,N:N+0.984,Q:Q+0.984," - "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), residues=residues_dict, ) @@ -404,7 +404,7 @@ def test_length_restrictions(tiny_fasta_file, residues_dict): allowed_fixed_mods="C:C+57.021", allowed_var_mods=( "M:M+15.995,N:N+0.984,Q:Q+0.984," - "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), residues=residues_dict, ) @@ -424,7 +424,7 @@ def test_length_restrictions(tiny_fasta_file, residues_dict): allowed_fixed_mods="C:C+57.021", allowed_var_mods=( "M:M+15.995,N:N+0.984,Q:Q+0.984," - "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), residues=residues_dict, ) @@ -460,7 +460,7 @@ def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): allowed_fixed_mods="C:C+57.021", allowed_var_mods=( "M:M+15.995,N:N+0.984,Q:Q+0.984," - "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), residues=residues_dict, ) @@ -480,7 +480,7 @@ def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): allowed_fixed_mods="C:C+57.021", allowed_var_mods=( "M:M+15.995,N:N+0.984,Q:Q+0.984," - "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + 
"nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), residues=residues_dict, ) @@ -501,7 +501,7 @@ def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): allowed_fixed_mods="C:C+57.021", allowed_var_mods=( "M:M+15.995,N:N+0.984,Q:Q+0.984," - "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), residues=residues_dict, ) @@ -532,7 +532,7 @@ def test_get_candidates(tiny_fasta_file, residues_dict): allowed_fixed_mods="C:C+57.021", allowed_var_mods=( "M:M+15.995,N:N+0.984,Q:Q+0.984," - "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), residues=residues_dict, ) @@ -552,7 +552,7 @@ def test_get_candidates(tiny_fasta_file, residues_dict): allowed_fixed_mods="C:C+57.021", allowed_var_mods=( "M:M+15.995,N:N+0.984,Q:Q+0.984," - "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), residues=residues_dict, ) @@ -572,7 +572,7 @@ def test_get_candidates(tiny_fasta_file, residues_dict): allowed_fixed_mods="C:C+57.021", allowed_var_mods=( "M:M+15.995,N:N+0.984,Q:Q+0.984," - "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), residues=residues_dict, ) @@ -640,7 +640,7 @@ def test_get_candidates_isotope_error(tiny_fasta_file, residues_dict): allowed_fixed_mods="C:C+57.021", allowed_var_mods=( "M:M+15.995,N:N+0.984,Q:Q+0.984," - "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), residues=residues_dict, ) @@ -661,7 +661,7 @@ def test_get_candidates_isotope_error(tiny_fasta_file, residues_dict): allowed_fixed_mods="C:C+57.021", allowed_var_mods=( "M:M+15.995,N:N+0.984,Q:Q+0.984," - "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), residues=residues_dict, ) @@ -682,7 +682,7 @@ def test_get_candidates_isotope_error(tiny_fasta_file, residues_dict): allowed_fixed_mods="C:C+57.021", allowed_var_mods=( "M:M+15.995,N:N+0.984,Q:Q+0.984," - "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), residues=residues_dict, ) @@ -703,7 +703,7 @@ def test_get_candidates_isotope_error(tiny_fasta_file, residues_dict): allowed_fixed_mods="C:C+57.021", allowed_var_mods=( "M:M+15.995,N:N+0.984,Q:Q+0.984," - "X:+42.011,X:+43.006,X:-17.027,X:+43.006-17.027" + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" ), residues=residues_dict, ) From ddff67fb03b06d3b27f73ff58dfdd478cd8a826b Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Tue, 3 Sep 2024 12:00:28 -0700 Subject: [PATCH 18/21] replace all occurences of "max_length" with "max_peptide_len" --- casanovo/config.py | 2 +- casanovo/config.yaml | 2 +- casanovo/denovo/model.py | 26 +++++++++++++------------- casanovo/denovo/model_runner.py | 6 +++--- tests/conftest.py | 2 +- tests/unit_tests/test_unit.py | 10 +++++----- 6 files changed, 24 insertions(+), 24 deletions(-) diff --git a/casanovo/config.py b/casanovo/config.py index 792da35a..8577d087 100644 --- a/casanovo/config.py +++ b/casanovo/config.py @@ -59,7 +59,7 @@ class Config: n_layers=int, dropout=float, dim_intensity=int, - max_length=int, + max_peptide_len=int, residues=dict, n_log=int, tb_summarywriter=str, diff --git a/casanovo/config.yaml b/casanovo/config.yaml index 17cba6a4..e8732b20 100644 --- a/casanovo/config.yaml +++ 
b/casanovo/config.yaml @@ -18,7 +18,7 @@ isotope_error_range: [0, 1] # The minimum length of considered peptides. min_peptide_len: 6 # The maximum length of considered peptides. -max_length: 100 +max_peptide_len: 100 # Number of spectra in one inference batch. predict_batch_size: 1024 diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 31757d81..6fe34bfa 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -46,7 +46,7 @@ class Spec2Pep(pl.LightningModule, ModelMixin): (``dim_model - dim_intensity``) are reserved for encoding the m/z value. If ``None``, the intensity will be projected up to ``dim_model`` using a linear layer, then summed with the m/z encoding for each peak. - max_length : int + max_peptide_len : int The maximum peptide length to decode. residues : Union[Dict[str, float], str] The amino acid dictionary and their masses. By default ("canonical) this @@ -99,7 +99,7 @@ def __init__( n_layers: int = 9, dropout: float = 0.0, dim_intensity: Optional[int] = None, - max_length: int = 100, + max_peptide_len: int = 100, residues: Union[Dict[str, float], str] = "canonical", max_charge: int = 5, precursor_mass_tol: float = 50, @@ -158,7 +158,7 @@ def __init__( self.opt_kwargs = kwargs # Data properties. - self.max_length = max_length + self.max_peptide_len = max_peptide_len self.residues = residues self.precursor_mass_tol = precursor_mass_tol self.isotope_error_range = isotope_error_range @@ -241,7 +241,7 @@ def beam_search_decode( # Sizes. batch = spectra.shape[0] # B - length = self.max_length + 1 # L + length = self.max_peptide_len + 1 # L vocab = self.decoder.vocab_size + 1 # V beam = self.n_beams # S @@ -269,7 +269,7 @@ def beam_search_decode( scores = einops.rearrange(scores, "B L V S -> (B S) L V") # The main decoding loop. - for step in range(0, self.max_length): + for step in range(0, self.max_peptide_len): # Terminate beams exceeding the precursor m/z tolerance and track # all finished beams (either terminated or stop token predicted). ( @@ -323,10 +323,10 @@ def _finish_beams( Parameters ---------- - tokens : torch.Tensor of shape (n_spectra * n_beams, max_length) + tokens : torch.Tensor of shape (n_spectra * n_beams, max_peptide_len) Predicted amino acid tokens for all beams and all spectra. scores : torch.Tensor of shape - (n_spectra * n_beams, max_length, n_amino_acids) + (n_spectra * n_beams, max_peptide_len, n_amino_acids) Scores for the predicted amino acid tokens for all beams and all spectra. step : int @@ -491,10 +491,10 @@ def _cache_finished_beams( Parameters ---------- - tokens : torch.Tensor of shape (n_spectra * n_beams, max_length) + tokens : torch.Tensor of shape (n_spectra * n_beams, max_peptide_len) Predicted amino acid tokens for all beams and all spectra. scores : torch.Tensor of shape - (n_spectra * n_beams, max_length, n_amino_acids) + (n_spectra * n_beams, max_peptide_len, n_amino_acids) Scores for the predicted amino acid tokens for all beams and all spectra. step : int @@ -576,10 +576,10 @@ def _get_topk_beams( Parameters ---------- - tokens : torch.Tensor of shape (n_spectra * n_beams, max_length) + tokens : torch.Tensor of shape (n_spectra * n_beams, max_peptide_len) Predicted amino acid tokens for all beams and all spectra. scores : torch.Tensor of shape - (n_spectra * n_beams, max_length, n_amino_acids) + (n_spectra * n_beams, max_peptide_len, n_amino_acids) Scores for the predicted amino acid tokens for all beams and all spectra. 
finished_beams : torch.Tensor of shape (n_spectra * n_beams) @@ -592,10 +592,10 @@ def _get_topk_beams( Returns ------- - tokens : torch.Tensor of shape (n_spectra * n_beams, max_length) + tokens : torch.Tensor of shape (n_spectra * n_beams, max_peptide_len) Predicted amino acid tokens for all beams and all spectra. scores : torch.Tensor of shape - (n_spectra * n_beams, max_length, n_amino_acids) + (n_spectra * n_beams, max_peptide_len, n_amino_acids) Scores for the predicted amino acid tokens for all beams and all spectra. """ diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index 395320e5..efb380cb 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -116,7 +116,7 @@ def db_search( self.config.digestion, self.config.missed_cleavages, self.config.min_peptide_len, - self.config.max_length, + self.config.max_peptide_len, self.config.max_mods, self.config.precursor_mass_tol, self.config.isotope_error_range, @@ -271,7 +271,7 @@ def initialize_model( n_layers=self.config.n_layers, dropout=self.config.dropout, dim_intensity=self.config.dim_intensity, - max_length=self.config.max_length, + max_peptide_len=self.config.max_peptide_len, residues=self.config.residues, max_charge=self.config.max_charge, precursor_mass_tol=self.config.precursor_mass_tol, @@ -292,7 +292,7 @@ def initialize_model( # Reconfigurable non-architecture related parameters for a loaded model. loaded_model_params = dict( - max_length=self.config.max_length, + max_peptide_len=self.config.max_peptide_len, precursor_mass_tol=self.config.precursor_mass_tol, isotope_error_range=self.config.isotope_error_range, n_beams=self.config.n_beams, diff --git a/tests/conftest.py b/tests/conftest.py index bf02a3ab..95ef2d02 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -241,7 +241,7 @@ def tiny_config(tmp_path): "precursor_mass_tol": 5, "isotope_error_range": [0, 1], "min_peptide_len": 6, - "max_length": 100, + "max_peptide_len": 100, "enzyme": "trypsin", "digestion": "full", "missed_cleavages": 0, diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index d03d6f7f..63d492f8 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -722,7 +722,7 @@ def test_beam_search_decode(): # Sizes. batch = 1 # B - length = model.max_length + 1 # L + length = model.max_peptide_len + 1 # L vocab = model.decoder.vocab_size + 1 # V beam = model.n_beams # S step = 3 @@ -839,12 +839,12 @@ def test_beam_search_decode(): assert torch.equal(new_scores[:, step, :], expected_scores) # Test output if decoding loop isn't stopped with termination of all beams. - model.max_length = 0 + model.max_peptide_len = 0 # 1 spectrum with 5 peaks (2 values: m/z and intensity). spectra = torch.zeros(1, 5, 2) precursors = torch.tensor([[469.25364, 2.0, 235.63410]]) assert len(list(model.beam_search_decode(spectra, precursors))[0]) == 0 - model.max_length = 100 + model.max_peptide_len = 100 # Re-initialize scores and tokens to further test caching functionality. 
scores = torch.full( @@ -1004,7 +1004,7 @@ def test_beam_search_decode(): batch = 2 # B beam = model.n_beams # S model.decoder.reverse = True - length = model.max_length + 1 # L + length = model.max_peptide_len + 1 # L vocab = model.decoder.vocab_size + 1 # V step = 4 @@ -1045,7 +1045,7 @@ def test_beam_search_decode(): batch = 2 # B beam = model.n_beams # S model.decoder.reverse = True - length = model.max_length + 1 # L + length = model.max_peptide_len + 1 # L vocab = model.decoder.vocab_size + 1 # V step = 4 From a3548d00124c1242350a62fdbcb2f719484254fe Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Tue, 3 Sep 2024 13:37:46 -0700 Subject: [PATCH 19/21] added nonspecific digestion --- casanovo/config.py | 1 + casanovo/config.yaml | 4 +- casanovo/data/db_utils.py | 67 +++++++----- tests/unit_tests/test_unit.py | 185 ++++++++++++++++++++++++++++++++-- 4 files changed, 225 insertions(+), 32 deletions(-) diff --git a/casanovo/config.py b/casanovo/config.py index 8577d087..dc2a3d2c 100644 --- a/casanovo/config.py +++ b/casanovo/config.py @@ -18,6 +18,7 @@ _config_deprecated = dict( every_n_train_steps="val_check_interval", max_iters="cosine_schedule_period_iters", + max_length="max_peptide_len", ) diff --git a/casanovo/config.yaml b/casanovo/config.yaml index e8732b20..df6fa8bb 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -49,7 +49,9 @@ devices: # Can also take a regex expression to specify custom digestion rules. enzyme: "trypsin" # Digestion type for candidate peptide generation. -# full: standard digestion. semi: Include products of semi-specific cleavage. +# full: standard digestion. +# semi: Include products of semi-specific cleavage. +# non-specific: Include products of non-specific cleavage. digestion: "full" # Number of allowed missed cleavages when digesting protein. missed_cleavages: 0 diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index 26f7152c..f9c669ed 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -172,7 +172,7 @@ def _digest_fasta( See pyteomics.parser.expasy_rules for valid enzymes. Can also be a regex pattern. digestion : str - The type of digestion to perform. Either 'full' or 'partial'. + The type of digestion to perform. Either 'full', 'partial' or 'non-specific'. missed_cleavages : int The number of missed cleavages to allow. max_mods : int @@ -196,7 +196,7 @@ def _digest_fasta( raise FileNotFoundError(f"File {fasta_filename} does not exist.") peptide_list = [] - if digestion not in ["full", "partial"]: + if digestion not in ["full", "partial", "non-specific"]: logger.error("Digestion type %s not recognized.", digestion) raise ValueError(f"Digestion type {digestion} not recognized.") if enzyme not in parser.expasy_rules: @@ -204,28 +204,49 @@ def _digest_fasta( "Enzyme %s not recognized. 
Interpreting as cleavage rule.", enzyme, ) - semi = digestion == "partial" valid_aa = set(list(self.residues.keys()) + ["C"]) - for header, seq in fasta.read(fasta_filename): - pep_set = parser.cleave( - seq, - rule=enzyme, - missed_cleavages=missed_cleavages, - semi=semi, - ) - protein = header.split()[0] - for pep in pep_set: - if ( - len(pep) >= min_peptide_length - and len(pep) <= max_peptide_length - ): - if any(aa not in valid_aa for aa in pep): - logger.warn( - "Skipping peptide with unknown amino acids: %s", - pep, - ) - else: - peptide_list.append((pep, protein)) + if digestion == "non-specific": + for header, seq in fasta.read(fasta_filename): + pep_set = [] + # Generate all possible peptides + for i in range(len(seq)): + for j in range(i + 1, len(seq) + 1): + pep_set.append(seq[i:j]) + protein = header.split()[0] + for pep in pep_set: + if ( + len(pep) >= min_peptide_length + and len(pep) <= max_peptide_length + ): + if any(aa not in valid_aa for aa in pep): + logger.warn( + "Skipping peptide with unknown amino acids: %s", + pep, + ) + else: + peptide_list.append((pep, protein)) + else: + semi = digestion == "partial" + for header, seq in fasta.read(fasta_filename): + pep_set = parser.cleave( + seq, + rule=enzyme, + missed_cleavages=missed_cleavages, + semi=semi, + ) + protein = header.split()[0] + for pep in pep_set: + if ( + len(pep) >= min_peptide_length + and len(pep) <= max_peptide_length + ): + if any(aa not in valid_aa for aa in pep): + logger.warn( + "Skipping peptide with unknown amino acids: %s", + pep, + ) + else: + peptide_list.append((pep, protein)) # Generate modified peptides mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb") diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 63d492f8..594552af 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -327,12 +327,16 @@ def test_digest_fasta_mods(tiny_fasta_file, residues_dict): "+42.011EIVMTQSPPTLSLSPGER", "+43.006EIVMTQSPPTLSLSPGER", "-17.027MEAPAQLLFLLLLWLPDTTR", + "-17.027M+15.995EAPAQLLFLLLLWLPDTTR", # "MEAPAQLLFLLLLWLPDTTR", "MEAPAQ+0.984LLFLLLLWLPDTTR", "M+15.995EAPAQLLFLLLLWLPDTTR", "+43.006-17.027MEAPAQLLFLLLLWLPDTTR", + "+43.006-17.027M+15.995EAPAQLLFLLLLWLPDTTR", # "+42.011MEAPAQLLFLLLLWLPDTTR", "+43.006MEAPAQLLFLLLLWLPDTTR", + "+42.011M+15.995EAPAQLLFLLLLWLPDTTR", # + "+43.006M+15.995EAPAQLLFLLLLWLPDTTR", # "-17.027ASQSVSSSYLTWYQQKPGQAPR", "ASQSVSSSYLTWYQQKPGQAPR", "ASQ+0.984SVSSSYLTWYQQKPGQAPR", @@ -370,13 +374,6 @@ def test_digest_fasta_mods(tiny_fasta_file, residues_dict): residues=residues_dict, ) peptide_list = list(pdb.db_peptides["peptide"]) - peptide_list = [ - x - for x in peptide_list - if not re.search( - r"(\+42\.011|\+43\.006|\-17\.027|\+43\.006\-17\.027)+[A-Z]\+", x - ) - ] assert peptide_list == expected_1mod @@ -447,6 +444,136 @@ def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): # asp-n enzyme expected_aspn = ["DFAVYYC+57.021QQ", "DFTLTISSLQPE", "MEAPAQLLFLLLLWLP"] + expected_semispecific = [ + "FSGSGS", + "ATSIPA", + "ASQSVS", + "PGQAPR", + "TSIPAR", + "MEAPAQ", + "LLIYGA", + "YGASTR", + "LSPGER", + "LPDTTR", + "EIVMTQ", + "VTLSC+57.021R", + "QDYNLP", + ] + + expected_nonspecific = [ + "SGSGSG", + "GSGSGT", + "SGSGTD", + "FSGSGS", + "ATSIPA", + "GASTRA", + "LSLSPG", + "ASQSVS", + "GSGTDF", + "SLSPGE", + "QSVSSS", + "SQSVSS", + "KPGQAP", + "SPPTLS", + "ASTRAT", + "RFSGSG", + "IYGAST", + "APAQLL", + "PTLSLS", + "TLSLSP", + "TLTISS", + "STRATS", + "LIYGAS", + "ARFSGS", + "PGQAPR", + "SGTDFT", + "PPTLSL", 
+ "EAPAQL", + "QKPGQA", + "SVSSSY", + "TQSPPT", + "LTISSL", + "PARFSG", + "GQAPRL", + "QSPPTL", + "SPGERV", + "ISSLQP", + "RATSIP", + "TSIPAR", + "MEAPAQ", + "RASQSV", + "TISSLQ", + "TRATSI", + "LLIYGA", + "GTDFTL", + "YGASTR", + "VSSSYL", + "SSSYLT", + "LSPGER", + "PGERVT", + "MTQSPP", + "SSLQPE", + "VMTQSP", + "GERVTL", + "PEDFAV", + "IVMTQS", + "FTLTIS", + "APRLLI", + "QQKPGQ", + "SLQPED", + "PAQLLF", + "IPARFS", + "SIPARF", + "LSC+57.021RAS", + "TDFTLT", + "QAPRLL", + "LPDTTR", + "ERVTLS", + "AQLLFL", + "QPEDFA", + "TLSC+57.021RA", + "C+57.021RASQS", + "SC+57.021RASQ", + "DFTLTI", + "PDTTRE", + "TTREIV", + "EIVMTQ", + "YQQKPG", + "LFLLLL", + "LLFLLL", + "WLPDTT", + "DTTREI", + "RLLIYG", + "RVTLSC+57.021", + "VTLSC+57.021R", + "EDFAVY", + "LWLPDT", + "QLLFLL", + "LQPEDF", + "REIVMT", + "TREIVM", + "QDYNLP", + "LLLWLP", + "SSYLTW", + "LLWLPD", + "LLLLWL", + "PRLLIY", + "DFAVYY", + "QQDYNL", + "AVYYC+57.021Q", + "FLLLLW", + "FAVYYC+57.021", + "C+57.021QQDYN", + "SYLTWY", + "LTWYQQ", + "WYQQKP", + "TWYQQK", + "VYYC+57.021QQ", + "YLTWYQ", + "YC+57.021QQDY", + "YYC+57.021QQD", + ] + pdb = db_utils.ProteinDatabase( fasta_path=str(tiny_fasta_file), enzyme="arg-c", @@ -487,7 +614,7 @@ def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): peptide_list = list(pdb.db_peptides["peptide"]) assert peptide_list == expected_aspn - # Tesr regex rule instead of named enzyme + # Test regex rule instead of named enzyme pdb = db_utils.ProteinDatabase( fasta_path=str(tiny_fasta_file), enzyme="R", @@ -508,6 +635,48 @@ def test_digest_fasta_enzyme(tiny_fasta_file, residues_dict): peptide_list = list(pdb.db_peptides["peptide"]) assert peptide_list == expected_argc + # Test semispecific digest + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + digestion="partial", + missed_cleavages=0, + min_peptide_len=6, + max_peptide_len=6, + max_mods=0, + precursor_tolerance=10000, + isotope_error=[0, 0], + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" + ), + residues=residues_dict, + ) + peptide_list = list(pdb.db_peptides["peptide"]) + assert peptide_list == expected_semispecific + + # Test nonspecific digest + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + digestion="non-specific", + missed_cleavages=0, + min_peptide_len=6, + max_peptide_len=6, + max_mods=0, + precursor_tolerance=10000, + isotope_error=[0, 0], + allowed_fixed_mods="C:C+57.021", + allowed_var_mods=( + "M:M+15.995,N:N+0.984,Q:Q+0.984," + "nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" + ), + residues=residues_dict, + ) + peptide_list = list(pdb.db_peptides["peptide"]) + assert peptide_list == expected_nonspecific + def test_get_candidates(tiny_fasta_file, residues_dict): # precursor_window is 10000 From e8d4682241b9b4d10384e9dfd92fd04258103e3e Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Fri, 13 Sep 2024 12:06:31 -0700 Subject: [PATCH 20/21] minor comments --- casanovo/data/db_utils.py | 35 +++++++++++++++++----------------- casanovo/denovo/dataloaders.py | 7 +------ casanovo/denovo/model.py | 6 ++---- 3 files changed, 20 insertions(+), 28 deletions(-) diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index f9c669ed..19b312e2 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -127,12 +127,12 @@ def get_candidates( (self.db_peptides["calc_mass"] >= lower_bound) & (self.db_peptides["calc_mass"] <= 
upper_bound) ] - candidates.append(window[["peptide", "calc_mass", "protein"]]) + candidates.append(window[["peptide", "calc_mass"]]) candidates = pd.concat(candidates) candidates.drop_duplicates(inplace=True) candidates.sort_values(by=["calc_mass", "peptide"], inplace=True) - return candidates["peptide"], candidates["protein"] + return candidates["peptide"] def get_associated_protein(self, peptide: str) -> str: """ @@ -159,7 +159,7 @@ def _digest_fasta( max_mods: int, min_peptide_length: int, max_peptide_length: int, - ) -> pd.DataFrame: + ) -> Tuple[pd.DataFrame, dict]: """ Digests a FASTA file and returns the peptides, their masses, and associated protein. @@ -185,8 +185,8 @@ def _digest_fasta( Returns ------- pep_table : pd.DataFrame - A Pandas DataFrame with peptide, mass, - and protein columns. Sorted by neutral mass in ascending order. + A Pandas DataFrame with peptide and mass columns. + Sorted by neutral mass in ascending order. prot_map : dict A dictionary mapping peptides to associated proteins. """ @@ -207,17 +207,14 @@ def _digest_fasta( valid_aa = set(list(self.residues.keys()) + ["C"]) if digestion == "non-specific": for header, seq in fasta.read(fasta_filename): - pep_set = [] + protein = header.split()[0] # Generate all possible peptides for i in range(len(seq)): - for j in range(i + 1, len(seq) + 1): - pep_set.append(seq[i:j]) - protein = header.split()[0] - for pep in pep_set: - if ( - len(pep) >= min_peptide_length - and len(pep) <= max_peptide_length + for j in range( + i + min_peptide_length, + min(i + max_peptide_length + 1, len(seq) + 1), ): + pep = seq[i:j] if any(aa not in valid_aa for aa in pep): logger.warn( "Skipping peptide with unknown amino acids: %s", @@ -274,17 +271,19 @@ def _digest_fasta( isos, ) ] - # Create a DataFrame for easy sorting and filtering - pep_table = pd.DataFrame( - mod_peptide_list, columns=["peptide", "calc_mass", "protein"] - ) - pep_table.sort_values(by=["calc_mass", "peptide"], inplace=True) # Create a dictionary mapping for easy accession of associated proteins prot_map = defaultdict(list) for pep, _, prot in mod_peptide_list: prot_map[pep].append(prot) + # Create a DataFrame for easy sorting and filtering + pep_table = pd.DataFrame( + [(pep, mass) for pep, mass, _ in mod_peptide_list], + columns=["peptide", "calc_mass"], + ) + pep_table.sort_values(by=["calc_mass", "peptide"], inplace=True) + logger.info( "Digestion complete. %d peptides generated.", len(pep_table) ) diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 4793e2f3..2646329d 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -266,8 +266,6 @@ def prepare_psm_batch( The spectrum identifiers. all_peptides : List[str] The candidate peptides for each spectrum. - all_proteins : List[str] - The proteins associated with each candidate peptide. 
""" spectra, precursor_mzs, precursor_charges, spectrum_ids = list(zip(*batch)) spectra = torch.nn.utils.rnn.pad_sequence(spectra, batch_first=True) @@ -283,9 +281,8 @@ def prepare_psm_batch( all_precursors = [] all_spectrum_ids = [] all_peptides = [] - all_proteins = [] for idx in range(len(batch)): - spec_peptides, spec_proteins = protein_database.get_candidates( + spec_peptides = protein_database.get_candidates( precursor_mzs[idx], precursor_charges[idx], ) @@ -298,7 +295,6 @@ def prepare_psm_batch( ) all_spectrum_ids.extend([spectrum_ids[idx]] * len(spec_peptides)) all_peptides.extend(spec_peptides) - all_proteins.extend(spec_proteins) except ValueError: logger.warning( "No candidates found for spectrum %s", spectrum_ids[idx] @@ -309,5 +305,4 @@ def prepare_psm_batch( torch.cat(all_precursors, dim=0), all_spectrum_ids, all_peptides, - all_proteins, ) diff --git a/casanovo/denovo/model.py b/casanovo/denovo/model.py index 6fe34bfa..ca5557fc 100644 --- a/casanovo/denovo/model.py +++ b/casanovo/denovo/model.py @@ -1019,7 +1019,7 @@ def predict_step(self, batch, *args): ---------- batch : Tuple[torch.Tensor, torch.Tensor, np.array, List[str]] A batch of (i) MS/MS spectra, (ii) precursor information, (iii) - spectrum identifiers, (iv) candidate peptides, (v) associated protein. + spectrum identifiers, (iv) candidate peptides Returns ------- @@ -1049,7 +1049,6 @@ def predict_step(self, batch, *args): peptide_score, aa_scores, peptide, - protein, ) in zip( current_batch[1][:, 1].cpu().detach().numpy(), current_batch[1][:, 2].cpu().detach().numpy(), @@ -1057,7 +1056,6 @@ def predict_step(self, batch, *args): all_scores.cpu().detach().numpy(), per_aa_scores.cpu().detach().numpy(), current_batch[3], - current_batch[4], ): predictions.append( ( @@ -1067,7 +1065,7 @@ def predict_step(self, batch, *args): peptide, peptide_score, aa_scores, - protein, + self.protein_database.get_associated_protein(peptide), ) ) return predictions From 68b6926032814dcc4a6b650e1736c8ff92edf7cb Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Fri, 13 Sep 2024 13:41:39 -0700 Subject: [PATCH 21/21] full branch comments addressed --- casanovo/data/db_utils.py | 197 ++++++++++++++++++++-------------- tests/unit_tests/test_unit.py | 14 +-- 2 files changed, 123 insertions(+), 88 deletions(-) diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index 19b312e2..34671eb1 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -6,7 +6,7 @@ import re import string from collections import defaultdict -from typing import List, Tuple +from typing import List, Tuple, Iterator import depthcharge.masses import pandas as pd @@ -70,22 +70,23 @@ def __init__( allowed_var_mods: str, residues: dict, ): - self.residues = residues self.fixed_mods, self.var_mods, self.swap_map = _construct_mods_dict( allowed_fixed_mods, allowed_var_mods ) + self.max_mods = max_mods self.swap_regex = re.compile( "(%s)" % "|".join(map(re.escape, self.swap_map.keys())) ) - self.db_peptides, self.prot_map = self._digest_fasta( + peptide_generator = _peptide_generator( fasta_path, enzyme, digestion, missed_cleavages, - max_mods, min_peptide_len, max_peptide_len, + set(list(residues.keys()) + ["C"]), ) + self.db_peptides, self.prot_map = self._digest_fasta(peptide_generator) self.precursor_tolerance = precursor_tolerance self.isotope_error = isotope_error @@ -152,35 +153,15 @@ def get_associated_protein(self, peptide: str) -> str: def _digest_fasta( self, - fasta_filename: str, - enzyme: str, - digestion: str, - missed_cleavages: 
int,
-        max_mods: int,
-        min_peptide_length: int,
-        max_peptide_length: int,
+        peptide_generator: Iterator[Tuple[str, str]],
     ) -> Tuple[pd.DataFrame, dict]:
         """
         Digests a FASTA file and returns the peptides, their masses, and associated protein.
 
         Parameters
         ----------
-        fasta_filename : str
-            Path to the FASTA file.
-        enzyme : str
-            The enzyme to use for digestion.
-            See pyteomics.parser.expasy_rules for valid enzymes.
-            Can also be a regex pattern.
-        digestion : str
-            The type of digestion to perform. Either 'full', 'partial' or 'non-specific'.
-        missed_cleavages : int
-            The number of missed cleavages to allow.
-        max_mods : int
-            The maximum number of modifications to allow per peptide.
-        min_peptide_length : int
-            The minimum length of peptides to consider.
-        max_peptide_length : int
-            The maximum length of peptides to consider.
+        peptide_generator : Iterator[Tuple[str, str]]
+            An iterator that yields peptides and associated proteins.
 
         Returns
         -------
@@ -190,60 +171,9 @@
         prot_map : dict
             A dictionary mapping peptides to associated proteins.
         """
-        # Verify the existence of the file:
-        if not os.path.isfile(fasta_filename):
-            logger.error("File %s does not exist.", fasta_filename)
-            raise FileNotFoundError(f"File {fasta_filename} does not exist.")
         peptide_list = []
-        if digestion not in ["full", "partial", "non-specific"]:
-            logger.error("Digestion type %s not recognized.", digestion)
-            raise ValueError(f"Digestion type {digestion} not recognized.")
-        if enzyme not in parser.expasy_rules:
-            logger.info(
-                "Enzyme %s not recognized. Interpreting as cleavage rule.",
-                enzyme,
-            )
-        valid_aa = set(list(self.residues.keys()) + ["C"])
-        if digestion == "non-specific":
-            for header, seq in fasta.read(fasta_filename):
-                protein = header.split()[0]
-                # Generate all possible peptides
-                for i in range(len(seq)):
-                    for j in range(
-                        i + min_peptide_length,
-                        min(i + max_peptide_length + 1, len(seq) + 1),
-                    ):
-                        pep = seq[i:j]
-                        if any(aa not in valid_aa for aa in pep):
-                            logger.warn(
-                                "Skipping peptide with unknown amino acids: %s",
-                                pep,
-                            )
-                        else:
-                            peptide_list.append((pep, protein))
-        else:
-            semi = digestion == "partial"
-            for header, seq in fasta.read(fasta_filename):
-                pep_set = parser.cleave(
-                    seq,
-                    rule=enzyme,
-                    missed_cleavages=missed_cleavages,
-                    semi=semi,
-                )
-                protein = header.split()[0]
-                for pep in pep_set:
-                    if (
-                        len(pep) >= min_peptide_length
-                        and len(pep) <= max_peptide_length
-                    ):
-                        if any(aa not in valid_aa for aa in pep):
-                            logger.warn(
-                                "Skipping peptide with unknown amino acids: %s",
-                                pep,
-                            )
-                        else:
-                            peptide_list.append((pep, protein))
+        for pep, prot in peptide_generator:
+            peptide_list.append((pep, prot))
 
         # Generate modified peptides
         mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb")
@@ -253,7 +183,7 @@
                 pep,
                 variable_mods=self.var_mods,
                 fixed_mods=self.fixed_mods,
-                max_mods=max_mods,
+                max_mods=self.max_mods,
             ),
             prot,
         )
@@ -290,6 +220,98 @@
         return pep_table, prot_map
 
 
+def _peptide_generator(
+    fasta_filename: str,
+    enzyme: str,
+    digestion: str,
+    missed_cleavages: int,
+    min_peptide_length: int,
+    max_peptide_length: int,
+    valid_aa: set[str],
+) -> Iterator[Tuple[str, str]]:
+    """
+    Create a generator that yields peptides from a FASTA file
+    depending on the type of digestion specified.
+
+    Parameters
+    ----------
+    fasta_filename : str
+        Path to the FASTA file.
+    enzyme : str
+        The enzyme to use for digestion.
+        See pyteomics.parser.expasy_rules for valid enzymes.
+        Can also be a regex pattern.
+    digestion : str
+        The type of digestion to perform. Either 'full', 'partial' or 'non-specific'.
+    missed_cleavages : int
+        The number of missed cleavages to allow.
+    min_peptide_length : int
+        The minimum length of peptides to consider.
+    max_peptide_length : int
+        The maximum length of peptides to consider.
+    valid_aa : set[str]
+        A set of valid amino acids.
+
+    Yields
+    ------
+    pep : str
+        A peptide sequence, unmodified.
+    protein : str
+        The associated protein.
+    """
+    # Verify the existence of the file:
+    if not os.path.isfile(fasta_filename):
+        logger.error("File %s does not exist.", fasta_filename)
+        raise FileNotFoundError(f"File {fasta_filename} does not exist.")
+    if digestion not in ["full", "partial", "non-specific"]:
+        logger.error("Digestion type %s not recognized.", digestion)
+        raise ValueError(f"Digestion type {digestion} not recognized.")
+    if enzyme not in parser.expasy_rules:
+        logger.info(
+            "Enzyme %s not recognized. Interpreting as cleavage rule.",
+            enzyme,
+        )
+    if digestion == "non-specific":
+        for header, seq in fasta.read(fasta_filename):
+            protein = header.split()[0]
+            # Generate all possible peptides
+            for i in range(len(seq)):
+                for j in range(
+                    i + min_peptide_length,
+                    min(i + max_peptide_length + 1, len(seq) + 1),
+                ):
+                    pep = seq[i:j]
+                    if any(aa not in valid_aa for aa in pep):
+                        logger.warning(
+                            "Skipping peptide with unknown amino acids: %s",
+                            pep,
+                        )
+                    else:
+                        yield pep, protein
+    else:
+        semi = digestion == "partial"
+        for header, seq in fasta.read(fasta_filename):
+            pep_set = parser.cleave(
+                seq,
+                rule=enzyme,
+                missed_cleavages=missed_cleavages,
+                semi=semi,
+            )
+            protein = header.split()[0]
+            for pep in pep_set:
+                if (
+                    len(pep) >= min_peptide_length
+                    and len(pep) <= max_peptide_length
+                ):
+                    if any(aa not in valid_aa for aa in pep):
+                        logger.warning(
+                            "Skipping peptide with unknown amino acids: %s",
+                            pep,
+                        )
+                    else:
+                        yield pep, protein
+
+
 @njit
 def _to_mz(precursor_mass: float, charge: int) -> float:
     """
diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py
index 594552af..a0b0935d 100644
--- a/tests/unit_tests/test_unit.py
+++ b/tests/unit_tests/test_unit.py
@@ -705,7 +705,7 @@ def test_get_candidates(tiny_fasta_file, residues_dict):
         ),
         residues=residues_dict,
     )
-    candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2)
+    candidates = pdb.get_candidates(precursor_mz=496.2, charge=2)
     assert expected_smallwindow == list(candidates)
 
     pdb = db_utils.ProteinDatabase(
@@ -725,7 +725,7 @@ def test_get_candidates(tiny_fasta_file, residues_dict):
         ),
         residues=residues_dict,
     )
-    candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2)
+    candidates = pdb.get_candidates(precursor_mz=496.2, charge=2)
     assert expected_midwindow == list(candidates)
 
     pdb = db_utils.ProteinDatabase(
@@ -745,7 +745,7 @@ def test_get_candidates(tiny_fasta_file, residues_dict):
         ),
         residues=residues_dict,
    )
-    candidates, _ = 
pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_widewindow == list(candidates) @@ -814,7 +814,7 @@ def test_get_candidates_isotope_error(tiny_fasta_file, residues_dict): residues=residues_dict, ) pdb.db_peptides = peptide_list - candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_isotope0 == list(candidates) pdb = db_utils.ProteinDatabase( @@ -835,7 +835,7 @@ def test_get_candidates_isotope_error(tiny_fasta_file, residues_dict): residues=residues_dict, ) pdb.db_peptides = peptide_list - candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_isotope01 == list(candidates) pdb = db_utils.ProteinDatabase( @@ -856,7 +856,7 @@ def test_get_candidates_isotope_error(tiny_fasta_file, residues_dict): residues=residues_dict, ) pdb.db_peptides = peptide_list - candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_isotope012 == list(candidates) pdb = db_utils.ProteinDatabase( @@ -877,7 +877,7 @@ def test_get_candidates_isotope_error(tiny_fasta_file, residues_dict): residues=residues_dict, ) pdb.db_peptides = peptide_list - candidates, _ = pdb.get_candidates(precursor_mz=496.2, charge=2) + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) assert expected_isotope0123 == list(candidates)
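
Note on the arithmetic these candidate-selection tests exercise: `ProteinDatabase.get_candidates` keeps the database peptides whose calculated neutral mass falls within a ppm tolerance of the observed precursor, once per allowed isotope error. The snippet below is a minimal, self-contained sketch of that window calculation, reusing the `PROTON` and `ISOTOPE_SPACING` constants defined in `db_utils.py`. The function name `candidate_mass_window`, the standalone layout, and the sign convention for the isotope shift are illustrative assumptions rather than Casanovo's actual API, and the real implementation's bounds handling may differ.

from typing import List, Tuple

# Constants mirroring those at the top of casanovo/data/db_utils.py.
PROTON = 1.00727646677
ISOTOPE_SPACING = 1.003355


def candidate_mass_window(
    precursor_mz: float,
    charge: int,
    tolerance_ppm: float,
    isotope_errors: List[int],
) -> List[Tuple[float, float]]:
    """Return one (lower, upper) neutral-mass bound per isotope error."""
    # Neutral (uncharged, monoisotopic) mass implied by the observed m/z.
    neutral_mass = (precursor_mz - PROTON) * charge
    windows = []
    for iso in isotope_errors:
        # Assume the instrument selected an isotope peak `iso` positions up,
        # so shift the observed mass down accordingly (the sign convention
        # in the actual implementation may differ).
        shifted = neutral_mass - iso * ISOTOPE_SPACING
        half_width = shifted * tolerance_ppm / 1e6
        windows.append((shifted - half_width, shifted + half_width))
    return windows


if __name__ == "__main__":
    # The same query the unit tests use: m/z 496.2 at charge 2, 10,000 ppm.
    for lo, hi in candidate_mass_window(496.2, 2, 10000, [0, 1, 2]):
        print(f"{lo:.4f} to {hi:.4f}")

Run with the tests' query (m/z 496.2 at charge 2 and a very wide 10,000 ppm tolerance), each additional isotope error contributes one more mass window, which is consistent with the expected candidate lists above only growing as the isotope-error range widens.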