From b2f08ac307f50c4dabc458745cd79b3ec2058f35 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Mon, 19 Aug 2024 19:09:26 -0700 Subject: [PATCH] add proteindatabase --- casanovo/casanovo.py | 110 -------- casanovo/config.yaml | 36 ++- casanovo/data/datasets.py | 2 +- casanovo/data/db_utils.py | 442 +++++++++++++++++--------------- casanovo/denovo/dataloaders.py | 28 +- casanovo/denovo/model_runner.py | 45 +--- tests/conftest.py | 5 + tests/test_integration.py | 2 - tests/unit_tests/test_unit.py | 200 +++++++++------ 9 files changed, 404 insertions(+), 466 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 4b9b4e38..b153512d 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -158,111 +158,9 @@ def sequence( nargs=1, type=click.Path(exists=True, dir_okay=False), ) -@click.option( - "--enzyme", - help="Enzyme for in silico digestion, \ - See pyteomics.parser.expasy_rules for valid enzymes", - type=click.Choice( - [ - "arg-c", - "asp-n", - "bnps-skatole", - "caspase 1", - "caspase 2", - "caspase 3", - "caspase 4", - "caspase 5", - "caspase 6", - "caspase 7", - "caspase 8", - "caspase 9", - "caspase 10", - "chymotrypsin high specificity", - "chymotrypsin low specificity", - "clostripain", - "cnbr", - "enterokinase", - "factor xa", - "formic acid", - "glutamyl endopeptidase", - "granzyme b", - "hydroxylamine", - "iodosobenzoic acid", - "lysc", - "ntcb", - "pepsin ph1.3", - "pepsin ph2.0", - "proline endopeptidase", - "proteinase k", - "staphylococcal peptidase i", - "thermolysin", - "thrombin", - "trypsin", - "trypsin_exception", - ] - ), - default="trypsin", -) -@click.option( - "--digestion", - help="Full: standard digestion. Semi: Include products of semi-specific cleavage", - type=click.Choice( - ["full", "partial"], - case_sensitive=False, - ), - default="full", -) -@click.option( - "--missed_cleavages", - help="Number of allowed missed cleavages when digesting protein", - type=int, - default=0, -) -@click.option( - "--max_mods", - help="Maximum number of amino acid modifications per peptide", - type=int, - default=0, -) -@click.option( - "--min_peptide_length", - help="Minimum peptide length to consider", - type=int, - default=6, -) -@click.option( - "--max_peptide_length", - help="Maximum peptide length to consider", - type=int, - default=50, -) -@click.option( - "--precursor_tolerance", - help="Precursor tolerance window size (units: ppm)", - type=float, - default=20, -) -@click.option( - "--isotope_error", - help="Isotope error levels to consider. \ - Creates multiple mass windows to consider per spectrum \ - to account for observed mass not matching monoisotopic mass \ - due to the instrument assigning the 13C isotope \ - peak as the precursor (list of ints, e.g: 1,2)", - type=str, - default="0", -) def db_search( peak_path: Tuple[str], fasta_path: str, - enzyme: str, - digestion: str, - missed_cleavages: int, - max_mods: int, - min_peptide_length: int, - max_peptide_length: int, - precursor_tolerance: float, - isotope_error: str, model: Optional[str], config: Optional[str], output: Optional[str], @@ -285,14 +183,6 @@ def db_search( runner.db_search( peak_path, fasta_path, - enzyme, - digestion, - missed_cleavages, - max_mods, - min_peptide_length, - max_peptide_length, - precursor_tolerance, - isotope_error, output, ) diff --git a/casanovo/config.yaml b/casanovo/config.yaml index c7186ff7..860cfabb 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -5,18 +5,26 @@ ### # The following parameters can be modified when running inference or when -# fine-tuning an existing Casanovo model. +# fine-tuning an existing Casanovo model. They also affect database search +# parameters when running Casanovo in DB-search mode. ### # Max absolute difference allowed with respect to observed precursor m/z. -# Predictions outside the tolerance range are assigned a negative peptide score. +# denovo: Predictions outside the tolerance range are assigned a negative peptide score. +# db-search: Used to create mas windows for candidate generation. precursor_mass_tol: 50 # ppm # Isotopes to consider when comparing predicted and observed precursor m/z's. isotope_error_range: [0, 1] -# The minimum length of predicted peptides. +# The minimum length of predicted/scored peptides. min_peptide_len: 6 -# Number of spectra in one inference batch. +# Number of spectra or psms in one inference batch. predict_batch_size: 1024 + + +### +# The following parameters are unique to Casanovo's inference/finetuning mode. +### + # Number of beams used in beam search. n_beams: 1 # Number of PSMs for each spectrum. @@ -29,6 +37,26 @@ accelerator: "auto" # number will be automatically selected for based on the chosen accelerator. devices: + +### +# The following parameters are unique to Casanovo's database search mode. +### + +# Enzyme for in silico digestion, used to generate candidate peptides. +# See pyteomics.parser.expasy_rules for valid enzymes +enzyme: "trypsin" +# Digestion type for candidate peptide generation. +# Full: standard digestion. Semi: Include products of semi-specific cleavage +digestion: "full" +# Number of allowed missed cleavages when digesting protein +missed_cleavages: 0 +# Maximum number of amino acid modifications per peptide. +# None generates all possible isoforms as candidates. +max_mods: +# Maximum peptide length to consider +max_peptide_len: 50 + + ### # The following parameters should only be modified if you are training a new # Casanovo model from scratch. diff --git a/casanovo/data/datasets.py b/casanovo/data/datasets.py index 6244e88f..3f05811f 100644 --- a/casanovo/data/datasets.py +++ b/casanovo/data/datasets.py @@ -1,6 +1,6 @@ """A PyTorch Dataset class for annotated spectra.""" -from typing import Optional, Tuple +from typing import List, Optional, Tuple import depthcharge import numpy as np diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index 1af09a47..a7b5e850 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -6,15 +6,12 @@ from typing import List, Tuple import depthcharge.masses +from numba import jit from pyteomics import fasta, parser logger = logging.getLogger("casanovo") - # CONSTANTS -HYDROGEN = 1.007825035 -OXYGEN = 15.99491463 -H2O = 2 * HYDROGEN + OXYGEN PROTON = 1.00727646677 ISOTOPE_SPACING = 1.003355 @@ -29,216 +26,243 @@ fixed_mods = {"carbm": ["C"]} -def convert_from_modx(seq: str): - """Converts peptide sequence from modX format to Casanovo-acceptable modifications. - - Args: - seq (str): Peptide in modX format - """ - seq = seq.replace("carbmC", "C+57.021") # Fixed modification - seq = seq.replace("oxM", "M+15.995") - seq = seq.replace("dN", "N+0.984") - seq = seq.replace("dQ", "Q+0.984") - seq = seq.replace("ace-", "+42.011") - seq = seq.replace("carbnh3x-", "+43.006-17.027") - seq = seq.replace("carb-", "+43.006") - seq = seq.replace("nh3x-", "-17.027") - return seq - - -def digest_fasta( - fasta_filename: str, - enzyme: str, - digestion: str, - missed_cleavages: int, - max_mods: int, - min_peptide_length: int, - max_peptide_length: int, -): - """ - Digests a FASTA file and returns the peptides, their masses, and associated protein. - - Parameters - ---------- - fasta_filename : str - Path to the FASTA file. - enzyme : str - The enzyme to use for digestion. - See pyteomics.parser.expasy_rules for valid enzymes. - digestion : str - The type of digestion to perform. Either 'full' or 'partial'. - missed_cleavages : int - The number of missed cleavages to allow. - max_mods : int - The maximum number of modifications to allow per peptide. - min_peptide_length : int - The minimum length of peptides to consider. - max_peptide_length : int - The maximum length of peptides to consider. - - Returns - ------- - mod_peptide_list : List[Tuple[str, float, str]] - A list of tuples containing the peptide sequence, mass, - and associated protein. Sorted by neutral mass in ascending order. - """ - # Verify the existence of the file: - if not os.path.isfile(fasta_filename): - logger.error("File %s does not exist.", fasta_filename) - raise FileNotFoundError(f"File {fasta_filename} does not exist.") - - fasta_data = fasta.read(fasta_filename) - peptide_list = [] - if digestion not in ["full", "partial"]: - logger.error("Digestion type %s not recognized.", digestion) - raise ValueError(f"Digestion type {digestion} not recognized.") - semi = digestion == "partial" - for header, seq in fasta_data: - pep_set = parser.cleave( - seq, - rule=parser.expasy_rules[enzyme], - missed_cleavages=missed_cleavages, - semi=semi, - ) - protein = header.split()[0] - for pep in pep_set: - if len(pep) < min_peptide_length or len(pep) > max_peptide_length: - continue - if any( - aa in pep for aa in "BJOUXZ" - ): # Check for incorrect AA letters - logger.warn( - "Skipping peptide with ambiguous amino acids: %s", pep - ) - continue - peptide_list.append((pep, protein)) - - # Generate modified peptides - mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb") - mod_peptide_list = [] - for pep, prot in peptide_list: - peptide_isoforms = parser.isoforms( - pep, - variable_mods=var_mods, - fixed_mods=fixed_mods, - max_mods=max_mods, - ) - peptide_isoforms = list(map(convert_from_modx, peptide_isoforms)) - mod_peptide_list.extend( - (mod_pep, mass_calculator.mass(mod_pep), prot) - for mod_pep in peptide_isoforms - ) - - # Sort the peptides by mass and return. - mod_peptide_list.sort(key=lambda x: x[1]) - return mod_peptide_list - - -def get_candidates( - precursor_mz: float, - charge: int, - peptide_list: List[Tuple[str, float, str]], - precursor_tolerance: float, - isotope_error: str, -): +class ProteinDatabase: """ - Returns a list of candidate peptides that fall within the specified mass range. + TODO Parameters ---------- - precursor_mz : float - The precursor mass-to-charge ratio. - charge : int - The precursor charge. - peptide_list : List[Tuple[str, float, str]] - A list of tuples containing the peptide sequence, mass, and associated protein. - Must be sorted by mass in ascending order. Uses neutral masses. - precursor_tolerance : float - The precursor mass tolerance in parts-per-million. - isotope_error : str - The isotope error levels to consider. + TODO """ - candidates = set() - isotope_error = [int(x) for x in isotope_error.split(",")] - for e in isotope_error: - iso_shift = ISOTOPE_SPACING * e - upper_bound = (_to_raw_mass(precursor_mz, charge) - iso_shift) * ( - 1 + (precursor_tolerance / 1e6) - ) - lower_bound = (_to_raw_mass(precursor_mz, charge) - iso_shift) * ( - 1 - (precursor_tolerance / 1e6) + def __init__( + self, + fasta_path: str, + enzyme: str, + digestion: str, + missed_cleavages: int, + min_peptide_len: int, + max_peptide_len: int, + max_mods: int, + precursor_tolerance: float, + isotope_error: List[int], + ): + self.digest = self._digest_fasta( + fasta_path, + enzyme, + digestion, + missed_cleavages, + max_mods, + min_peptide_len, + max_peptide_len, ) - - start, end = get_mass_indices( - [x[1] for x in peptide_list], lower_bound, upper_bound + self.precursor_tolerance = precursor_tolerance + self.isotope_error = isotope_error + + def get_candidates( + self, + precursor_mz: float, + charge: int, + ): + """ + Returns a list of candidate peptides that fall within the specified mass range. + + Parameters + ---------- + precursor_mz : float + The precursor mass-to-charge ratio. + charge : int + The precursor charge. + """ + candidates = set() + + for e in self.isotope_error: + iso_shift = ISOTOPE_SPACING * e + upper_bound = ( + ProteinDatabase._to_raw_mass(precursor_mz, charge) - iso_shift + ) * (1 + (self.precursor_tolerance / 1e6)) + lower_bound = ( + ProteinDatabase._to_raw_mass(precursor_mz, charge) - iso_shift + ) * (1 - (self.precursor_tolerance / 1e6)) + + start, end = ProteinDatabase._get_mass_indices( + [x[1] for x in self.digest], lower_bound, upper_bound + ) + + candidates.update(self.digest[start:end]) + + candidates = list(candidates) + candidates.sort(key=lambda x: x[1]) + return candidates + + def _digest_fasta( + self, + fasta_filename: str, + enzyme: str, + digestion: str, + missed_cleavages: int, + max_mods: int, + min_peptide_length: int, + max_peptide_length: int, + ): + """ + Digests a FASTA file and returns the peptides, their masses, and associated protein. + + Parameters + ---------- + fasta_filename : str + Path to the FASTA file. + enzyme : str + The enzyme to use for digestion. + See pyteomics.parser.expasy_rules for valid enzymes. + digestion : str + The type of digestion to perform. Either 'full' or 'partial'. + missed_cleavages : int + The number of missed cleavages to allow. + max_mods : int + The maximum number of modifications to allow per peptide. + min_peptide_length : int + The minimum length of peptides to consider. + max_peptide_length : int + The maximum length of peptides to consider. + + Returns + ------- + mod_peptide_list : List[Tuple[str, float, str]] + A list of tuples containing the peptide sequence, mass, + and associated protein. Sorted by neutral mass in ascending order. + """ + # Verify the existence of the file: + if not os.path.isfile(fasta_filename): + logger.error("File %s does not exist.", fasta_filename) + raise FileNotFoundError(f"File {fasta_filename} does not exist.") + + fasta_data = fasta.read(fasta_filename) + peptide_list = [] + if digestion not in ["full", "partial"]: + logger.error("Digestion type %s not recognized.", digestion) + raise ValueError(f"Digestion type {digestion} not recognized.") + semi = digestion == "partial" + for header, seq in fasta_data: + pep_set = parser.cleave( + seq, + rule=parser.expasy_rules[enzyme], + missed_cleavages=missed_cleavages, + semi=semi, + ) + protein = header.split()[0] + for pep in pep_set: + if ( + len(pep) < min_peptide_length + or len(pep) > max_peptide_length + ): + continue + if any( + aa in pep for aa in "BJOUXZ" + ): # Check for incorrect AA letters + logger.warn( + "Skipping peptide with ambiguous amino acids: %s", pep + ) + continue + peptide_list.append((pep, protein)) + + # Generate modified peptides + mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb") + mod_peptide_list = [] + for pep, prot in peptide_list: + peptide_isoforms = parser.isoforms( + pep, + variable_mods=var_mods, + fixed_mods=fixed_mods, + max_mods=max_mods, + ) + peptide_isoforms = list( + map(ProteinDatabase._convert_from_modx, peptide_isoforms) + ) + mod_peptide_list.extend( + (mod_pep, mass_calculator.mass(mod_pep), prot) + for mod_pep in peptide_isoforms + ) + + # Sort the peptides by mass and return. + mod_peptide_list.sort(key=lambda x: x[1]) + logger.info( + "Digestion complete. %d peptides generated.", len(mod_peptide_list) ) - - candidates.update(peptide_list[start:end]) - - candidates = list(candidates) - candidates.sort(key=lambda x: x[1]) - return candidates - - -def _to_mz(precursor_mass, charge): - """ - Convert precursor neutral mass to m/z value. - - Parameters - ---------- - precursor_mass : float - The precursor neutral mass. - charge : int - The precursor charge. - - Returns - ------- - mz : float - The calculated precursor mass-to-charge ratio. - """ - return (precursor_mass + (charge * PROTON)) / charge - - -def _to_raw_mass(mz_mass, charge): - """ - Convert precursor m/z value to neutral mass. - - Parameters - ---------- - mz_mass : float - The precursor mass-to-charge ratio. - charge : int - The precursor charge. - - Returns - ------- - mass : float - The calculated precursor neutral mass. - """ - return charge * (mz_mass - PROTON) - - -def get_mass_indices(masses, m_low, m_high): - """Grabs mass indices that fall within a specified range. - - Pulls from masses, a list of mass values. - Requires that the mass values are sorted in ascending order. - - Parameters - ---------- - masses : List[int] - List of mass values - m_low : int - Lower bound of mass range (inclusive) - m_high : int - Upper bound of mass range (inclusive) - - Return - ------ - indices : Tuple[int, int] - Indices of mass values that fall within the specified range - """ - start = bisect.bisect_left(masses, m_low) - end = bisect.bisect_right(masses, m_high) - return start, end + return mod_peptide_list + + def _to_mz(precursor_mass, charge): + """ + Convert precursor neutral mass to m/z value. + + Parameters + ---------- + precursor_mass : float + The precursor neutral mass. + charge : int + The precursor charge. + + Returns + ------- + mz : float + The calculated precursor mass-to-charge ratio. + """ + return (precursor_mass + (charge * PROTON)) / charge + + def _to_raw_mass(mz_mass, charge): + """ + Convert precursor m/z value to neutral mass. + + Parameters + ---------- + mz_mass : float + The precursor mass-to-charge ratio. + charge : int + The precursor charge. + + Returns + ------- + mass : float + The calculated precursor neutral mass. + """ + return charge * (mz_mass - PROTON) + + def _get_mass_indices(masses, m_low, m_high): + """Grabs mass indices that fall within a specified range. + + Pulls from masses, a list of mass values. + Requires that the mass values are sorted in ascending order. + + Parameters + ---------- + masses : List[int] + List of mass values + m_low : int + Lower bound of mass range (inclusive) + m_high : int + Upper bound of mass range (inclusive) + + Return + ------ + indices : Tuple[int, int] + Indices of mass values that fall within the specified range + """ + start = bisect.bisect_left(masses, m_low) + end = bisect.bisect_right(masses, m_high) + return start, end + + def _convert_from_modx(seq: str): + """Converts peptide sequence from modX format to Casanovo-acceptable modifications. + + Args: + seq (str): Peptide in modX format + """ + seq = seq.replace("carbmC", "C+57.021") # Fixed modification + seq = seq.replace("oxM", "M+15.995") + seq = seq.replace("dN", "N+0.984") + seq = seq.replace("dQ", "Q+0.984") + seq = seq.replace("ace-", "+42.011") + seq = seq.replace("carbnh3x-", "+43.006-17.027") + seq = seq.replace("carb-", "+43.006") + seq = seq.replace("nh3x-", "-17.027") + return seq diff --git a/casanovo/denovo/dataloaders.py b/casanovo/denovo/dataloaders.py index 14a0ff99..4d5524f4 100644 --- a/casanovo/denovo/dataloaders.py +++ b/casanovo/denovo/dataloaders.py @@ -89,6 +89,7 @@ def __init__( self.train_dataset = None self.valid_dataset = None self.test_dataset = None + self.pdb = None def setup(self, stage: str = None, annotated: bool = True) -> None: """ @@ -96,7 +97,7 @@ def setup(self, stage: str = None, annotated: bool = True) -> None: Parameters ---------- - stage : str {"fit", "validate", "test", "db"} + stage : str {"fit", "validate", "test"} The stage indicating which Datasets to prepare. All are prepared by default. annotated: bool @@ -186,12 +187,7 @@ def db_dataloader(self) -> torch.utils.data.DataLoader: return torch.utils.data.DataLoader( self.test_dataset, batch_size=self.eval_batch_size, - collate_fn=functools.partial( - prepare_psm_batch, - digest=self.digest, - precursor_tolerance=self.precursor_tolerance, - isotope_error=self.isotope_error, - ), + collate_fn=functools.partial(prepare_psm_batch, pdb=self.pdb), pin_memory=True, num_workers=self.n_workers, shuffle=False, @@ -239,9 +235,7 @@ def prepare_batch( def prepare_psm_batch( batch: List[Tuple[torch.Tensor, float, int, str]], - digest: List[Tuple[str, float, str]], - precursor_tolerance: float, - isotope_error: str, + pdb: db_utils.ProteinDatabase, ): """ Collate MS/MS spectra into a batch for DB search. @@ -255,13 +249,8 @@ def prepare_psm_batch( A batch of data from an AnnotatedSpectrumDataset, consisting of for each spectrum (i) a tensor with the m/z and intensity peak values, (ii), the precursor m/z, (iii) the precursor charge, (iv) the spectrum identifier. - digest : List[Tuple[str, float, str]] - A list of tuples containing the peptide sequence, mass, and associated protein - from digesting a .fasta file. Sorted by mass in ascending order. Uses neutral masses. - precursor_tolerance : float - The precursor mass tolerance in parts-per-million. - isotope_error : str - The isotope error levels to consider. + pdb : db_utils.ProteinDatabase + The protein database to use for candidate peptide retrieval. Returns ------- @@ -294,12 +283,9 @@ def prepare_psm_batch( all_peptides = [] all_proteins = [] for idx in range(len(batch)): - digest_data = db_utils.get_candidates( + digest_data = pdb.get_candidates( precursor_mzs[idx], precursor_charges[idx], - digest, - precursor_tolerance, - isotope_error, ) try: spec_peptides, _, pep_protein = list(zip(*digest_data)) diff --git a/casanovo/denovo/model_runner.py b/casanovo/denovo/model_runner.py index c2b71098..b90f06b0 100644 --- a/casanovo/denovo/model_runner.py +++ b/casanovo/denovo/model_runner.py @@ -83,14 +83,6 @@ def db_search( self, peak_path: Iterable[str], fasta_path: str, - enzyme: str, - digestion: str, - missed_cleavages: int, - max_mods: int, - min_peptide_length: int, - max_peptide_length: int, - precursor_tolerance: float, - isotope_error: str, output: str, ) -> None: """Perform database search with Casanovo. @@ -101,22 +93,6 @@ def db_search( The paths to the .mgf data files for database search. fasta_path : str The path to the FASTA file for database search. - enzyme : str - The enzyme used for digestion. - digestion : str - The digestion type, full or partial. - missed_cleavages : int - The number of missed cleavages allowed. - max_mods : int - The maximum number of modifications allowed per peptide. - min_peptide_length : int - The minimum peptide length. - max_peptide_length : int - The maximum peptide length. - precursor_tolerance : float - The precursor mass tolerance in ppm. - isotope_error : str - Isotope error levels to consider, in comma-delineated string form. output : str Where should the output be saved? @@ -138,19 +114,18 @@ def db_search( self.writer.set_ms_run(test_index.ms_files) self.initialize_data_module(test_index=test_index) - self.loaders.setup(stage="test", annotated=False) - self.loaders.digest = db_utils.digest_fasta( + self.loaders.pdb = db_utils.ProteinDatabase( fasta_path, - enzyme, - digestion, - missed_cleavages, - max_mods, - min_peptide_length, - max_peptide_length, + self.config.enzyme, + self.config.digestion, + self.config.missed_cleavages, + self.config.min_peptide_len, + self.config.max_peptide_len, + self.config.max_mods, + self.config.precursor_mass_tol, + self.config.isotope_error_range, ) - self.loaders.precursor_tolerance = precursor_tolerance - self.loaders.isotope_error = isotope_error - + self.loaders.setup(stage="test", annotated=False) self.trainer.predict(self.model, self.loaders.db_dataloader()) def train( diff --git a/tests/conftest.py b/tests/conftest.py index 60afcd83..f20d7879 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -242,6 +242,11 @@ def tiny_config(tmp_path): "precursor_mass_tol": 5, "isotope_error_range": [0, 1], "min_peptide_len": 6, + "max_peptide_len": 50, + "enzyme": "trypsin", + "digestion": "full", + "missed_cleavages": 0, + "max_mods": None, "predict_batch_size": 1024, "n_beams": 1, "top_match": 1, diff --git a/tests/test_integration.py b/tests/test_integration.py index 4bd55174..61f735c3 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -24,8 +24,6 @@ def test_db_search( tiny_config, "--output", str(output_path), - "--precursor_tolerance", - str(100), str(mgf_db_search), str(tiny_fasta_file), ] diff --git a/tests/unit_tests/test_unit.py b/tests/unit_tests/test_unit.py index 419cf3ef..7a37e771 100644 --- a/tests/unit_tests/test_unit.py +++ b/tests/unit_tests/test_unit.py @@ -276,15 +276,18 @@ def test_digest_fasta_cleave(tiny_fasta_file): (0, 1, 3), (expected_normal, expected_1missedcleavage, expected_3missedcleavage), ): - peptide_list = db_utils.digest_fasta( - fasta_filename=str(tiny_fasta_file), + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=missed_cleavages, + min_peptide_len=6, + max_peptide_len=50, max_mods=0, - min_peptide_length=6, - max_peptide_length=50, + precursor_tolerance=20, + isotope_error=[0], ) + peptide_list = pdb.digest peptide_list = [x[0] for x in peptide_list] assert peptide_list == expected @@ -343,16 +346,18 @@ def test_digest_fasta_mods(tiny_fasta_file): "+42.011FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", "+43.006FSGSGSGTDFTLTISSLQPEDFAVYYC+57.021QQDYNLP", ] - - peptide_list = db_utils.digest_fasta( - fasta_filename=str(tiny_fasta_file), + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=0, + min_peptide_len=6, + max_peptide_len=50, max_mods=1, - min_peptide_length=6, - max_peptide_length=50, + precursor_tolerance=20, + isotope_error=[0], ) + peptide_list = pdb.digest peptide_list = [x[0] for x in peptide_list] peptide_list = [ x @@ -375,27 +380,33 @@ def test_length_restrictions(tiny_fasta_file): # length between 6 and 8 expected_short = ["ATSIPAR", "VTLSC+57.021R"] - peptide_list = db_utils.digest_fasta( - fasta_filename=str(tiny_fasta_file), + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=0, + min_peptide_len=20, + max_peptide_len=50, max_mods=0, - min_peptide_length=20, - max_peptide_length=50, + precursor_tolerance=20, + isotope_error=[0], ) + peptide_list = pdb.digest peptide_list = [x[0] for x in peptide_list] assert peptide_list == expected_long - peptide_list = db_utils.digest_fasta( - fasta_filename=str(tiny_fasta_file), + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=0, + min_peptide_len=6, + max_peptide_len=8, max_mods=0, - min_peptide_length=6, - max_peptide_length=8, + precursor_tolerance=20, + isotope_error=[0], ) + peptide_list = pdb.digest peptide_list = [x[0] for x in peptide_list] assert peptide_list == expected_short @@ -415,27 +426,33 @@ def test_digest_fasta_enzyme(tiny_fasta_file): # asp-n enzyme expected_aspn = ["DFAVYYC+57.021QQ", "DFTLTISSLQPE", "MEAPAQLLFLLLLWLP"] - peptide_list = db_utils.digest_fasta( - fasta_filename=str(tiny_fasta_file), + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), enzyme="arg-c", digestion="full", missed_cleavages=0, + min_peptide_len=6, + max_peptide_len=50, max_mods=0, - min_peptide_length=6, - max_peptide_length=50, + precursor_tolerance=20, + isotope_error=[0], ) + peptide_list = pdb.digest peptide_list = [x[0] for x in peptide_list] assert peptide_list == expected_argc - peptide_list = db_utils.digest_fasta( - fasta_filename=str(tiny_fasta_file), + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), enzyme="asp-n", digestion="full", missed_cleavages=0, + min_peptide_len=6, + max_peptide_len=50, max_mods=0, - min_peptide_length=6, - max_peptide_length=50, + precursor_tolerance=20, + isotope_error=[0], ) + peptide_list = pdb.digest peptide_list = [x[0] for x in peptide_list] assert peptide_list == expected_aspn @@ -450,68 +467,53 @@ def test_get_candidates(tiny_fasta_file): # precursor window is 600000 expected_widewindow = ["ATSIPAR", "VTLSC+57.021R", "LLIYGASTR"] - peptide_list = db_utils.digest_fasta( - fasta_filename=str(tiny_fasta_file), + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=1, + min_peptide_len=6, + max_peptide_len=50, max_mods=0, - min_peptide_length=6, - max_peptide_length=50, - ) - - candidates = db_utils.get_candidates( - precursor_mz=496.2, - charge=2, - peptide_list=peptide_list, precursor_tolerance=10000, - isotope_error="0", + isotope_error=[0], ) + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) candidates = [x[0] for x in candidates] assert expected_smallwindow == candidates - peptide_list = db_utils.digest_fasta( - fasta_filename=str(tiny_fasta_file), + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=1, + min_peptide_len=6, + max_peptide_len=50, max_mods=0, - min_peptide_length=6, - max_peptide_length=50, - ) - - candidates = db_utils.get_candidates( - precursor_mz=496.2, - charge=2, - peptide_list=peptide_list, precursor_tolerance=150000, - isotope_error="0", + isotope_error=[0], ) + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) candidates = [x[0] for x in candidates] assert expected_midwindow == candidates - peptide_list = db_utils.digest_fasta( - fasta_filename=str(tiny_fasta_file), + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), enzyme="trypsin", digestion="full", missed_cleavages=1, + min_peptide_len=6, + max_peptide_len=50, max_mods=0, - min_peptide_length=6, - max_peptide_length=50, - ) - - candidates = db_utils.get_candidates( - precursor_mz=496.2, - charge=2, - peptide_list=peptide_list, precursor_tolerance=600000, - isotope_error="0", + isotope_error=[0], ) + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) candidates = [x[0] for x in candidates] assert expected_widewindow == candidates -def test_get_candidates_isotope_error(): +def test_get_candidates_isotope_error(tiny_fasta_file): # Tide isotope error windows for 496.2, 2+: # 0: [980.481617, 1000.289326] @@ -556,53 +558,83 @@ def test_get_candidates_isotope_error(): expected_isotope3 = list("XWVUTSRQPONMLKJIHGFE") expected_isotope0123 = list("XWVUTSRQPONMLKJIHGFEDCB") - candidates = db_utils.get_candidates( - precursor_mz=496.2, - charge=2, - peptide_list=peptide_list, + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + digestion="full", + missed_cleavages=0, + min_peptide_len=0, + max_peptide_len=0, + max_mods=0, precursor_tolerance=10000, - isotope_error="0", + isotope_error=[0], ) + pdb.digest = peptide_list + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) candidates = [x[0] for x in candidates] assert expected_isotope0 == candidates - candidates = db_utils.get_candidates( - precursor_mz=496.2, - charge=2, - peptide_list=peptide_list, + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + digestion="full", + missed_cleavages=0, + min_peptide_len=0, + max_peptide_len=0, + max_mods=0, precursor_tolerance=10000, - isotope_error="1", + isotope_error=[1], ) + pdb.digest = peptide_list + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) candidates = [x[0] for x in candidates] assert expected_isotope1 == candidates - candidates = db_utils.get_candidates( - precursor_mz=496.2, - charge=2, - peptide_list=peptide_list, + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + digestion="full", + missed_cleavages=0, + min_peptide_len=0, + max_peptide_len=0, + max_mods=0, precursor_tolerance=10000, - isotope_error="2", + isotope_error=[2], ) + pdb.digest = peptide_list + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) candidates = [x[0] for x in candidates] assert expected_isotope2 == candidates - candidates = db_utils.get_candidates( - precursor_mz=496.2, - charge=2, - peptide_list=peptide_list, + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + digestion="full", + missed_cleavages=0, + min_peptide_len=0, + max_peptide_len=0, + max_mods=0, precursor_tolerance=10000, - isotope_error="3", + isotope_error=[3], ) + pdb.digest = peptide_list + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) candidates = [x[0] for x in candidates] assert expected_isotope3 == candidates - candidates = db_utils.get_candidates( - precursor_mz=496.2, - charge=2, - peptide_list=peptide_list, + pdb = db_utils.ProteinDatabase( + fasta_path=str(tiny_fasta_file), + enzyme="trypsin", + digestion="full", + missed_cleavages=0, + min_peptide_len=0, + max_peptide_len=0, + max_mods=0, precursor_tolerance=10000, - isotope_error="0,1,2,3", + isotope_error=[0, 1, 2, 3], ) + pdb.digest = peptide_list + candidates = pdb.get_candidates(precursor_mz=496.2, charge=2) candidates = [x[0] for x in candidates] assert expected_isotope0123 == candidates