From 68e67e833607aeb8e5856181f1e74e357e993235 Mon Sep 17 00:00:00 2001 From: VarunAnanth2003 Date: Sun, 3 Nov 2024 06:08:00 -0800 Subject: [PATCH] db_utils fixes --- casanovo/casanovo.py | 5 +- casanovo/config.yaml | 20 +++--- casanovo/data/datasets.py | 2 +- casanovo/data/db_utils.py | 148 ++++++++++++++++++-------------------- 4 files changed, 85 insertions(+), 90 deletions(-) diff --git a/casanovo/casanovo.py b/casanovo/casanovo.py index 4feff0cb..5547a807 100644 --- a/casanovo/casanovo.py +++ b/casanovo/casanovo.py @@ -236,6 +236,7 @@ def db_search( config, model = setup_model( model, config, output_path, output_root_name, False ) + start_time = time.time() with ModelRunner( config, model, @@ -246,6 +247,7 @@ def db_search( logger.info("Performing database search on:") for peak_file in peak_path: logger.info(" %s", peak_file) + logger.info("Using the following FASTA file:") logger.info(" %s", fasta_path) @@ -254,8 +256,7 @@ def db_search( fasta_path, str((output_path / output_root).with_suffix(".mztab")), ) - - logger.info("DONE!") + utils.log_run_report(start_time=start_time, end_time=time.time()) @main.command(cls=_SharedParams) diff --git a/casanovo/config.yaml b/casanovo/config.yaml index 6df7d094..014f02ee 100644 --- a/casanovo/config.yaml +++ b/casanovo/config.yaml @@ -23,6 +23,13 @@ max_peptide_len: 100 predict_batch_size: 1024 # Number of PSMs for each spectrum. top_match: 1 +# The hardware accelerator to use. Must be one of: +# "cpu", "gpu", "tpu", "ipu", "hpu", "mps", or "auto". +accelerator: "auto" +# The devices to use. Can be set to a positive number int, or the value -1 to +# indicate all available devices should be used. If left empty, the appropriate +# number will be automatically selected for based on the chosen accelerator. +devices: ### @@ -31,13 +38,6 @@ top_match: 1 # Number of beams used in beam search. n_beams: 1 -# The hardware accelerator to use. Must be one of: -# "cpu", "gpu", "tpu", "ipu", "hpu", "mps", or "auto". -accelerator: "auto" -# The devices to use. Can be set to a positive number int, or the value -1 to -# indicate all available devices should be used. If left empty, the appropriate -# number will be automatically selected for based on the chosen accelerator. -devices: ### @@ -46,7 +46,7 @@ devices: # Enzyme for in silico digestion, used to generate candidate peptides. # See pyteomics.parser.expasy_rules for valid enzymes. -# Can also take a regex expression to specify custom digestion rules. +# Can also take a regex to specify custom digestion rules. enzyme: "trypsin" # Digestion type for candidate peptide generation. # full: standard digestion. @@ -60,9 +60,9 @@ missed_cleavages: 0 max_mods: 1 # Select which modifications from the vocabulary can be used in candidate creation. # Format: Comma-separated list of "aa:mod_residue", -# where aa is a standard amino acid or "nterm" for an N-terminal mod +# where aa is a standard amino acid (or "nterm" for an N-terminal mod) # and mod_residue is a key from the "residues" dictionary. -# Example: "M:M+15.995,X:+43.006-17.027" +# Example: "M:M+15.995,nterm:+43.006" allowed_fixed_mods: "C:C+57.021" allowed_var_mods: "M:M+15.995,N:N+0.984,Q:Q+0.984,nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027" diff --git a/casanovo/data/datasets.py b/casanovo/data/datasets.py index 33d84e49..3917a2c8 100644 --- a/casanovo/data/datasets.py +++ b/casanovo/data/datasets.py @@ -1,6 +1,6 @@ """A PyTorch Dataset class for annotated spectra.""" -from typing import List, Optional, Tuple +from typing import Optional, Tuple import depthcharge import numpy as np diff --git a/casanovo/data/db_utils.py b/casanovo/data/db_utils.py index c68d208c..8d141117 100644 --- a/casanovo/data/db_utils.py +++ b/casanovo/data/db_utils.py @@ -24,7 +24,8 @@ class ProteinDatabase: """ - Store digested .fasta data and return candidate peptides for a given precursor mass. + Store digested .fasta data and return candidate peptides + for a given precursor mass. Parameters ---------- @@ -34,7 +35,8 @@ class ProteinDatabase: The enzyme to use for digestion. See pyteomics.parser.expasy_rules for valid enzymes. digestion : str - The type of digestion to perform. Either 'full' or 'partial'. + The type of digestion to perform. + Either 'full', 'partial' or 'non-specific'. missed_cleavages : int The number of missed cleavages to allow. min_peptide_len : int @@ -46,12 +48,13 @@ class ProteinDatabase: precursor_tolerance : float The precursor mass tolerance in ppm. isotope_error : Tuple[int, int] - Isotope range [min, max] to consider when comparing predicted and observed precursor m/z's. + Isotope range [min, max] to consider when comparing predicted + and observed precursor m/z's. allowed_fixed_mods : str A comma separated string of fixed modifications to consider. allowed_var_mods : str A comma separated string of variable modifications to consider. - residues : dict + residues : dict[str, float] A dictionary of amino acid masses. """ @@ -68,7 +71,7 @@ def __init__( isotope_error: Tuple[int, int], allowed_fixed_mods: str, allowed_var_mods: str, - residues: dict, + residues: dict[str, float], ): self.fixed_mods, self.var_mods, self.swap_map = _construct_mods_dict( allowed_fixed_mods, allowed_var_mods @@ -84,7 +87,7 @@ def __init__( missed_cleavages, min_peptide_len, max_peptide_len, - set(list(residues.keys()) + ["C"]), + set([aa[0] for aa in residues.keys() if aa[0].isalpha()]), ) self.db_peptides, self.prot_map = self._digest_fasta(peptide_generator) self.precursor_tolerance = precursor_tolerance @@ -94,9 +97,10 @@ def get_candidates( self, precursor_mz: float, charge: int, - ) -> List[Tuple[str, str]]: + ) -> pd.Series: """ - Returns a list of candidate peptides that fall within the specified mass range. + Returns a list of candidate peptides that fall within the + specified mass range. Parameters ---------- @@ -115,7 +119,7 @@ def get_candidates( for e in range(self.isotope_error[0], self.isotope_error[1] + 1): iso_shift = ISOTOPE_SPACING * e shift_raw_mass = float( - _to_raw_mass(precursor_mz, charge) - iso_shift + _to_neutral_mass(precursor_mz, charge) - iso_shift ) upper_bound = shift_raw_mass * ( 1 + (self.precursor_tolerance / 1e6) @@ -154,9 +158,10 @@ def get_associated_protein(self, peptide: str) -> str: def _digest_fasta( self, peptide_generator: Iterator[Tuple[str, str]], - ) -> Tuple[pd.DataFrame, dict]: + ) -> Tuple[pd.DataFrame, dict[str, str]]: """ - Digests a FASTA file and returns the peptides, their masses, and associated protein. + Digests a FASTA file and returns the peptides, their masses, + and associated protein. Parameters ---------- @@ -168,13 +173,9 @@ def _digest_fasta( pep_table : pd.DataFrame A Pandas DataFrame with peptide and mass columns. Sorted by neutral mass in ascending order. - prot_map : dict + prot_map : dict[str, str] A dictionary mapping peptides to associated proteins. """ - peptide_list = [] - for pep, prot in peptide_generator: - peptide_list.append((pep, prot)) - # Generate modified peptides mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb") peptide_isoforms = [ @@ -187,7 +188,7 @@ def _digest_fasta( ), prot, ) - for pep, prot in peptide_list + for pep, prot in peptide_generator ] mod_peptide_list = [ (mod_pep, mass_calculator.mass(mod_pep), prot) @@ -203,9 +204,9 @@ def _digest_fasta( ] # Create a dictionary mapping for easy accession of associated proteins - prot_map = defaultdict(list) + prot_map = defaultdict(set) for pep, _, prot in mod_peptide_list: - prot_map[pep].append(prot) + prot_map[pep].add(prot) # Create a DataFrame for easy sorting and filtering pep_table = pd.DataFrame( @@ -227,8 +228,8 @@ def _peptide_generator( enzyme: str, digestion: str, missed_cleavages: int, - min_peptide_length: int, - max_peptide_length: int, + min_peptide_len: int, + max_peptide_len: int, valid_aa: set[str], ) -> Iterator[str]: """ @@ -242,14 +243,15 @@ def _peptide_generator( enzyme : str The enzyme to use for digestion. See pyteomics.parser.expasy_rules for valid enzymes. - Can also be a regex pattern. + Can also be a regex. digestion : str - The type of digestion to perform. Either 'full', 'partial' or 'non-specific'. + The type of digestion to perform. + Either 'full', 'partial' or 'non-specific'. missed_cleavages : int The number of missed cleavages to allow. - min_peptide_length : int + min_peptide_len : int The minimum length of peptides to consider. - max_peptide_length : int + max_peptide_len : int The maximum length of peptides to consider. valid_aa : set[str] A set of valid amino acids. @@ -261,19 +263,6 @@ def _peptide_generator( protein : str The associated protein. """ - # Verify the existence of the file: - if not os.path.isfile(fasta_filename): - logger.error("File %s does not exist.", fasta_filename) - raise FileNotFoundError(f"File {fasta_filename} does not exist.") - if digestion not in ["full", "partial", "non-specific"]: - logger.error("Digestion type %s not recognized.", digestion) - raise ValueError(f"Digestion type {digestion} not recognized.") - if enzyme not in parser.expasy_rules: - logger.info( - "Enzyme %s not recognized. Interpreting as cleavage rule.", - enzyme, - ) - # Verify the existence of the file: if not os.path.isfile(fasta_filename): logger.error("File %s does not exist.", fasta_filename) @@ -292,12 +281,12 @@ def _peptide_generator( # Generate all possible peptides for i in range(len(seq)): for j in range( - i + min_peptide_length, - min(i + max_peptide_length + 1, len(seq) + 1), + i + min_peptide_len, + min(i + max_peptide_len + 1, len(seq) + 1), ): pep = seq[i:j] if any(aa not in valid_aa for aa in pep): - logger.warn( + logger.warning( "Skipping peptide with unknown amino acids: %s", pep, ) @@ -314,12 +303,9 @@ def _peptide_generator( ) protein = header.split()[0] for pep in pep_set: - if ( - len(pep) >= min_peptide_length - and len(pep) <= max_peptide_length - ): + if len(pep) >= min_peptide_len and len(pep) <= max_peptide_len: if any(aa not in valid_aa for aa in pep): - logger.warn( + logger.warning( "Skipping peptide with unknown amino acids: %s", pep, ) @@ -348,7 +334,7 @@ def _to_mz(precursor_mass: float, charge: int) -> float: @njit -def _to_raw_mass(mz_mass: float, charge: int) -> float: +def _to_neutral_mass(mz_mass: float, charge: int) -> float: """ Convert precursor m/z value to neutral mass. @@ -367,23 +353,33 @@ def _to_raw_mass(mz_mass: float, charge: int) -> float: return charge * (mz_mass - PROTON) -def _convert_from_modx(seq: str, swap_map: dict, swap_regex: str) -> str: - """Converts peptide sequence from modX format to Casanovo-acceptable modifications. - - Args: - seq : str - Peptide in modX format - swap_map : dict - Dictionary that allows for swapping of modX to Casanovo-acceptable modifications. - swap_regex : str - Regular expression to match modX format. +def _convert_from_modx( + seq: str, swap_map: dict[str, str], swap_regex: str +) -> str: + """ + Converts peptide sequence from modX format to + Casanovo-acceptable modifications. + + Parameters: + ----------- + seq : str + Peptide in modX format + swap_map : dict[str, str] + Dictionary that allows for swapping of modX to Casanovo-acceptable modifications. + swap_regex : str + Regular expression to match modX format. + + Returns: + -------- + swap_regex : str + Peptide in Casanovo-acceptable modifications. """ return swap_regex.sub(lambda x: swap_map[x.group()], seq) def _construct_mods_dict( allowed_fixed_mods: str, allowed_var_mods: str -) -> Tuple[dict, dict, dict]: +) -> Tuple[dict[str, str], dict[str, str], dict[str, str]]: """ Constructs dictionaries of fixed and variable modifications. @@ -396,30 +392,28 @@ def _construct_mods_dict( Returns ------- - fixed_mods : dict + fixed_mods : dict[str, str] A dictionary of fixed modifications. - var_mods : dict + var_mods : dict[str, str] A dictionary of variable modifications. - swap_map : dict - A dictionary that allows for swapping of modX to Casanovo-acceptable modifications. + swap_map : dict[str, str] + A dictionary that allows for swapping of modX to + Casanovo-acceptable modifications. """ swap_map = {} fixed_mods = {} - for idx, mod in enumerate(allowed_fixed_mods.split(",")): - aa, mod_aa = mod.split(":") - mod_id = string.ascii_lowercase[idx] - fixed_mods[mod_id] = [aa] - swap_map[f"{mod_id}{aa}"] = f"{mod_aa}" - var_mods = {} - for idx, mod in enumerate(allowed_var_mods.split(",")): - aa, mod_aa = mod.split(":") - mod_id = string.ascii_lowercase[idx] - if aa == "nterm": - var_mods[f"{mod_id}-"] = True - swap_map[f"{mod_id}-"] = f"{mod_aa}" - else: - var_mods[mod_id] = [aa] - swap_map[f"{mod_id}{aa}"] = f"{mod_aa}" + for mod_map, allowed_mods in zip( + [fixed_mods, var_mods], [allowed_fixed_mods, allowed_var_mods] + ): + for idx, mod in enumerate(allowed_mods.split(",")): + aa, mod_aa = mod.split(":") + mod_id = string.ascii_lowercase[idx] + if aa == "nterm": + mod_map[f"{mod_id}-"] = True + swap_map[f"{mod_id}-"] = f"{mod_aa}" + else: + mod_map[mod_id] = [aa] + swap_map[f"{mod_id}{aa}"] = f"{mod_aa}" return fixed_mods, var_mods, swap_map