Skip to content

Commit

Permalink
db_utils fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
VarunAnanth2003 committed Nov 3, 2024
1 parent 2b6198b commit 68e67e8
Show file tree
Hide file tree
Showing 4 changed files with 85 additions and 90 deletions.
5 changes: 3 additions & 2 deletions casanovo/casanovo.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,7 @@ def db_search(
config, model = setup_model(
model, config, output_path, output_root_name, False
)
start_time = time.time()
with ModelRunner(
config,
model,
Expand All @@ -246,6 +247,7 @@ def db_search(
logger.info("Performing database search on:")
for peak_file in peak_path:
logger.info(" %s", peak_file)

logger.info("Using the following FASTA file:")
logger.info(" %s", fasta_path)

Expand All @@ -254,8 +256,7 @@ def db_search(
fasta_path,
str((output_path / output_root).with_suffix(".mztab")),
)

logger.info("DONE!")
utils.log_run_report(start_time=start_time, end_time=time.time())


@main.command(cls=_SharedParams)
Expand Down
20 changes: 10 additions & 10 deletions casanovo/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,13 @@ max_peptide_len: 100
predict_batch_size: 1024
# Number of PSMs for each spectrum.
top_match: 1
# The hardware accelerator to use. Must be one of:
# "cpu", "gpu", "tpu", "ipu", "hpu", "mps", or "auto".
accelerator: "auto"
# The devices to use. Can be set to a positive number int, or the value -1 to
# indicate all available devices should be used. If left empty, the appropriate
# number will be automatically selected for based on the chosen accelerator.
devices:


###
Expand All @@ -31,13 +38,6 @@ top_match: 1

# Number of beams used in beam search.
n_beams: 1
# The hardware accelerator to use. Must be one of:
# "cpu", "gpu", "tpu", "ipu", "hpu", "mps", or "auto".
accelerator: "auto"
# The devices to use. Can be set to a positive number int, or the value -1 to
# indicate all available devices should be used. If left empty, the appropriate
# number will be automatically selected for based on the chosen accelerator.
devices:


###
Expand All @@ -46,7 +46,7 @@ devices:

# Enzyme for in silico digestion, used to generate candidate peptides.
# See pyteomics.parser.expasy_rules for valid enzymes.
# Can also take a regex expression to specify custom digestion rules.
# Can also take a regex to specify custom digestion rules.
enzyme: "trypsin"
# Digestion type for candidate peptide generation.
# full: standard digestion.
Expand All @@ -60,9 +60,9 @@ missed_cleavages: 0
max_mods: 1
# Select which modifications from the vocabulary can be used in candidate creation.
# Format: Comma-separated list of "aa:mod_residue",
# where aa is a standard amino acid or "nterm" for an N-terminal mod
# where aa is a standard amino acid (or "nterm" for an N-terminal mod)
# and mod_residue is a key from the "residues" dictionary.
# Example: "M:M+15.995,X:+43.006-17.027"
# Example: "M:M+15.995,nterm:+43.006"
allowed_fixed_mods: "C:C+57.021"
allowed_var_mods: "M:M+15.995,N:N+0.984,Q:Q+0.984,nterm:+42.011,nterm:+43.006,nterm:-17.027,nterm:+43.006-17.027"

Expand Down
2 changes: 1 addition & 1 deletion casanovo/data/datasets.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""A PyTorch Dataset class for annotated spectra."""

from typing import List, Optional, Tuple
from typing import Optional, Tuple

import depthcharge
import numpy as np
Expand Down
148 changes: 71 additions & 77 deletions casanovo/data/db_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@

class ProteinDatabase:
"""
Store digested .fasta data and return candidate peptides for a given precursor mass.
Store digested .fasta data and return candidate peptides
for a given precursor mass.
Parameters
----------
Expand All @@ -34,7 +35,8 @@ class ProteinDatabase:
The enzyme to use for digestion.
See pyteomics.parser.expasy_rules for valid enzymes.
digestion : str
The type of digestion to perform. Either 'full' or 'partial'.
The type of digestion to perform.
Either 'full', 'partial' or 'non-specific'.
missed_cleavages : int
The number of missed cleavages to allow.
min_peptide_len : int
Expand All @@ -46,12 +48,13 @@ class ProteinDatabase:
precursor_tolerance : float
The precursor mass tolerance in ppm.
isotope_error : Tuple[int, int]
Isotope range [min, max] to consider when comparing predicted and observed precursor m/z's.
Isotope range [min, max] to consider when comparing predicted
and observed precursor m/z's.
allowed_fixed_mods : str
A comma separated string of fixed modifications to consider.
allowed_var_mods : str
A comma separated string of variable modifications to consider.
residues : dict
residues : dict[str, float]
A dictionary of amino acid masses.
"""

Expand All @@ -68,7 +71,7 @@ def __init__(
isotope_error: Tuple[int, int],
allowed_fixed_mods: str,
allowed_var_mods: str,
residues: dict,
residues: dict[str, float],
):
self.fixed_mods, self.var_mods, self.swap_map = _construct_mods_dict(
allowed_fixed_mods, allowed_var_mods
Expand All @@ -84,7 +87,7 @@ def __init__(
missed_cleavages,
min_peptide_len,
max_peptide_len,
set(list(residues.keys()) + ["C"]),
set([aa[0] for aa in residues.keys() if aa[0].isalpha()]),
)
self.db_peptides, self.prot_map = self._digest_fasta(peptide_generator)
self.precursor_tolerance = precursor_tolerance
Expand All @@ -94,9 +97,10 @@ def get_candidates(
self,
precursor_mz: float,
charge: int,
) -> List[Tuple[str, str]]:
) -> pd.Series:
"""
Returns a list of candidate peptides that fall within the specified mass range.
Returns a list of candidate peptides that fall within the
specified mass range.
Parameters
----------
Expand All @@ -115,7 +119,7 @@ def get_candidates(
for e in range(self.isotope_error[0], self.isotope_error[1] + 1):
iso_shift = ISOTOPE_SPACING * e
shift_raw_mass = float(
_to_raw_mass(precursor_mz, charge) - iso_shift
_to_neutral_mass(precursor_mz, charge) - iso_shift
)
upper_bound = shift_raw_mass * (
1 + (self.precursor_tolerance / 1e6)
Expand Down Expand Up @@ -154,9 +158,10 @@ def get_associated_protein(self, peptide: str) -> str:
def _digest_fasta(
self,
peptide_generator: Iterator[Tuple[str, str]],
) -> Tuple[pd.DataFrame, dict]:
) -> Tuple[pd.DataFrame, dict[str, str]]:
"""
Digests a FASTA file and returns the peptides, their masses, and associated protein.
Digests a FASTA file and returns the peptides, their masses,
and associated protein.
Parameters
----------
Expand All @@ -168,13 +173,9 @@ def _digest_fasta(
pep_table : pd.DataFrame
A Pandas DataFrame with peptide and mass columns.
Sorted by neutral mass in ascending order.
prot_map : dict
prot_map : dict[str, str]
A dictionary mapping peptides to associated proteins.
"""
peptide_list = []
for pep, prot in peptide_generator:
peptide_list.append((pep, prot))

# Generate modified peptides
mass_calculator = depthcharge.masses.PeptideMass(residues="massivekb")
peptide_isoforms = [
Expand All @@ -187,7 +188,7 @@ def _digest_fasta(
),
prot,
)
for pep, prot in peptide_list
for pep, prot in peptide_generator
]
mod_peptide_list = [
(mod_pep, mass_calculator.mass(mod_pep), prot)
Expand All @@ -203,9 +204,9 @@ def _digest_fasta(
]

# Create a dictionary mapping for easy accession of associated proteins
prot_map = defaultdict(list)
prot_map = defaultdict(set)
for pep, _, prot in mod_peptide_list:
prot_map[pep].append(prot)
prot_map[pep].add(prot)

# Create a DataFrame for easy sorting and filtering
pep_table = pd.DataFrame(
Expand All @@ -227,8 +228,8 @@ def _peptide_generator(
enzyme: str,
digestion: str,
missed_cleavages: int,
min_peptide_length: int,
max_peptide_length: int,
min_peptide_len: int,
max_peptide_len: int,
valid_aa: set[str],
) -> Iterator[str]:
"""
Expand All @@ -242,14 +243,15 @@ def _peptide_generator(
enzyme : str
The enzyme to use for digestion.
See pyteomics.parser.expasy_rules for valid enzymes.
Can also be a regex pattern.
Can also be a regex.
digestion : str
The type of digestion to perform. Either 'full', 'partial' or 'non-specific'.
The type of digestion to perform.
Either 'full', 'partial' or 'non-specific'.
missed_cleavages : int
The number of missed cleavages to allow.
min_peptide_length : int
min_peptide_len : int
The minimum length of peptides to consider.
max_peptide_length : int
max_peptide_len : int
The maximum length of peptides to consider.
valid_aa : set[str]
A set of valid amino acids.
Expand All @@ -261,19 +263,6 @@ def _peptide_generator(
protein : str
The associated protein.
"""
# Verify the existence of the file:
if not os.path.isfile(fasta_filename):
logger.error("File %s does not exist.", fasta_filename)
raise FileNotFoundError(f"File {fasta_filename} does not exist.")
if digestion not in ["full", "partial", "non-specific"]:
logger.error("Digestion type %s not recognized.", digestion)
raise ValueError(f"Digestion type {digestion} not recognized.")
if enzyme not in parser.expasy_rules:
logger.info(
"Enzyme %s not recognized. Interpreting as cleavage rule.",
enzyme,
)

# Verify the existence of the file:
if not os.path.isfile(fasta_filename):
logger.error("File %s does not exist.", fasta_filename)
Expand All @@ -292,12 +281,12 @@ def _peptide_generator(
# Generate all possible peptides
for i in range(len(seq)):
for j in range(
i + min_peptide_length,
min(i + max_peptide_length + 1, len(seq) + 1),
i + min_peptide_len,
min(i + max_peptide_len + 1, len(seq) + 1),
):
pep = seq[i:j]
if any(aa not in valid_aa for aa in pep):
logger.warn(
logger.warning(

Check warning on line 289 in casanovo/data/db_utils.py

View check run for this annotation

Codecov / codecov/patch

casanovo/data/db_utils.py#L289

Added line #L289 was not covered by tests
"Skipping peptide with unknown amino acids: %s",
pep,
)
Expand All @@ -314,12 +303,9 @@ def _peptide_generator(
)
protein = header.split()[0]
for pep in pep_set:
if (
len(pep) >= min_peptide_length
and len(pep) <= max_peptide_length
):
if len(pep) >= min_peptide_len and len(pep) <= max_peptide_len:
if any(aa not in valid_aa for aa in pep):
logger.warn(
logger.warning(

Check warning on line 308 in casanovo/data/db_utils.py

View check run for this annotation

Codecov / codecov/patch

casanovo/data/db_utils.py#L308

Added line #L308 was not covered by tests
"Skipping peptide with unknown amino acids: %s",
pep,
)
Expand Down Expand Up @@ -348,7 +334,7 @@ def _to_mz(precursor_mass: float, charge: int) -> float:


@njit
def _to_raw_mass(mz_mass: float, charge: int) -> float:
def _to_neutral_mass(mz_mass: float, charge: int) -> float:
"""
Convert precursor m/z value to neutral mass.
Expand All @@ -367,23 +353,33 @@ def _to_raw_mass(mz_mass: float, charge: int) -> float:
return charge * (mz_mass - PROTON)

Check warning on line 353 in casanovo/data/db_utils.py

View check run for this annotation

Codecov / codecov/patch

casanovo/data/db_utils.py#L353

Added line #L353 was not covered by tests


def _convert_from_modx(seq: str, swap_map: dict, swap_regex: str) -> str:
"""Converts peptide sequence from modX format to Casanovo-acceptable modifications.
Args:
seq : str
Peptide in modX format
swap_map : dict
Dictionary that allows for swapping of modX to Casanovo-acceptable modifications.
swap_regex : str
Regular expression to match modX format.
def _convert_from_modx(
seq: str, swap_map: dict[str, str], swap_regex: str
) -> str:
"""
Converts peptide sequence from modX format to
Casanovo-acceptable modifications.
Parameters:
-----------
seq : str
Peptide in modX format
swap_map : dict[str, str]
Dictionary that allows for swapping of modX to Casanovo-acceptable modifications.
swap_regex : str
Regular expression to match modX format.
Returns:
--------
swap_regex : str
Peptide in Casanovo-acceptable modifications.
"""
return swap_regex.sub(lambda x: swap_map[x.group()], seq)


def _construct_mods_dict(
allowed_fixed_mods: str, allowed_var_mods: str
) -> Tuple[dict, dict, dict]:
) -> Tuple[dict[str, str], dict[str, str], dict[str, str]]:
"""
Constructs dictionaries of fixed and variable modifications.
Expand All @@ -396,30 +392,28 @@ def _construct_mods_dict(
Returns
-------
fixed_mods : dict
fixed_mods : dict[str, str]
A dictionary of fixed modifications.
var_mods : dict
var_mods : dict[str, str]
A dictionary of variable modifications.
swap_map : dict
A dictionary that allows for swapping of modX to Casanovo-acceptable modifications.
swap_map : dict[str, str]
A dictionary that allows for swapping of modX to
Casanovo-acceptable modifications.
"""
swap_map = {}
fixed_mods = {}
for idx, mod in enumerate(allowed_fixed_mods.split(",")):
aa, mod_aa = mod.split(":")
mod_id = string.ascii_lowercase[idx]
fixed_mods[mod_id] = [aa]
swap_map[f"{mod_id}{aa}"] = f"{mod_aa}"

var_mods = {}
for idx, mod in enumerate(allowed_var_mods.split(",")):
aa, mod_aa = mod.split(":")
mod_id = string.ascii_lowercase[idx]
if aa == "nterm":
var_mods[f"{mod_id}-"] = True
swap_map[f"{mod_id}-"] = f"{mod_aa}"
else:
var_mods[mod_id] = [aa]
swap_map[f"{mod_id}{aa}"] = f"{mod_aa}"
for mod_map, allowed_mods in zip(
[fixed_mods, var_mods], [allowed_fixed_mods, allowed_var_mods]
):
for idx, mod in enumerate(allowed_mods.split(",")):
aa, mod_aa = mod.split(":")
mod_id = string.ascii_lowercase[idx]
if aa == "nterm":
mod_map[f"{mod_id}-"] = True
swap_map[f"{mod_id}-"] = f"{mod_aa}"
else:
mod_map[mod_id] = [aa]
swap_map[f"{mod_id}{aa}"] = f"{mod_aa}"

return fixed_mods, var_mods, swap_map

0 comments on commit 68e67e8

Please sign in to comment.