Skip to content

Commit

Permalink
Speedup of peptide parsing and annotation (#61)
Browse files Browse the repository at this point in the history
* added shortcuts to parsing and annotating

* added simple parser to manifest

* review suggestions

* fixed deprecation warning

* fixed proforma annot

* reverted datetime deprecation (not in py3.10)
  • Loading branch information
jspaezp authored Apr 17, 2024
1 parent b1fc7bf commit 40151e2
Show file tree
Hide file tree
Showing 2 changed files with 129 additions and 58 deletions.
45 changes: 35 additions & 10 deletions spectrum_utils/proforma.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
import pyteomics.mass as pmass


UNMODIFIED_PEPTIDE_REGEX = re.compile(r"^([A-Za-z]+)(/-?[0-9]+)?$")

# Set to None to disable caching.
cache_dir = platformdirs.user_cache_dir("spectrum_utils", False)

Expand Down Expand Up @@ -615,6 +617,25 @@ def _modification_sort_key(mod: Modification):
return -4


@functools.lru_cache(1)
def _build_parser() -> lark.Lark:
"""Build a lark parser for proforma sequences.
This function also caches the parser in-memory, thus loading it only
once per process.
"""
dir_name = os.path.dirname(os.path.realpath(__file__))
with open(os.path.join(dir_name, "proforma.ebnf")) as f_in:
parser = lark.Lark(
f_in.read(),
start="proforma",
parser="earley",
lexer="dynamic_complete",
import_paths=[dir_name],
)
return parser


def parse(proforma: str) -> List[Proteoform]:
"""
Parse a ProForma-encoded string.
Expand Down Expand Up @@ -647,18 +668,22 @@ def parse(proforma: str) -> List[Proteoform]:
ValueError
If no mass was specified for a GNO term or its parent terms.
"""
dir_name = os.path.dirname(os.path.realpath(__file__))
with open(os.path.join(dir_name, "proforma.ebnf")) as f_in:
parser = lark.Lark(
f_in.read(),
start="proforma",
parser="earley",
lexer="dynamic_complete",
import_paths=[dir_name],
)
match_unmod = UNMODIFIED_PEPTIDE_REGEX.match(proforma)
if match_unmod is not None:
# Fast path for unmodified peptides.
charge = match_unmod.group(2)
if charge is not None:
charge = Charge(int(charge[1:]))
return [
Proteoform(sequence=match_unmod.group(1).upper(), charge=charge)
]

parser = _build_parser()
# noinspection PyUnresolvedReferences
try:
return ProFormaTransformer().transform(parser.parse(proforma))
parsed = parser.parse(proforma)
parsed = ProFormaTransformer().transform(parsed)
return parsed
except lark.visitors.VisitError as e:
raise e.orig_exc

Expand Down
142 changes: 94 additions & 48 deletions spectrum_utils/spectrum.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import annotations

import copy
import functools
import urllib.parse
Expand Down Expand Up @@ -622,73 +624,49 @@ def scale_intensity(
)
return self

def annotate_proforma(
def _annotate_proteoforms(
self,
proteoforms: list[proforma.Proteoform],
proforma_str: str,
fragment_tol_mass: float,
fragment_tol_mode: str,
ion_types: str = "by",
max_ion_charge: Optional[int] = None,
neutral_losses: Union[bool, Dict[Optional[str], float]] = False,
) -> "MsmsSpectrum":
) -> MsmsSpectrum:
"""
Assign fragment ion labels to the peaks from a ProForma annotation.
Parameters
----------
proforma_str : str
The ProForma spectrum annotation.
fragment_tol_mass : float
Fragment mass tolerance to match spectrum peaks against theoretical
peaks.
fragment_tol_mode : {'Da', 'ppm'}
Fragment mass tolerance unit. Either 'Da' or 'ppm'.
ion_types : str, optional
The ion types to generate. Can be any combination of 'a', 'b', 'c',
'x', 'y', and 'z' for peptide fragments, 'I' for immonium ions, 'm'
for internal fragment ions, 'p' for the precursor ion, and 'r' for
reporter ions. The default is 'by', which means that b and y
peptide ions will be generated.
max_ion_charge : Optional[int], optional
All fragments up to and including the given charge will be
annotated (by default all fragments with a charge up to the
precursor minus one (minimum charge one) will be annotated).
neutral_losses : Union[bool, Dict[Optional[str], float]], optional
Neutral losses to consider for each peak. If `None` or `False`, no
neutral losses are considered. If specified as a dictionary, keys
should be the molecular formulas of the neutral losses and values
the corresponding mass differences. Note that mass differences
should typically be negative. If `True`, all of the following
neutral losses are considered:
This is meant to be an internal function that uses a pre-parsed proforma string
instead of parsing it internally. This can be useful when the same parsed
sequence is used multiple times (since parsing the sequence is a lot slower
than annotating the peaks)
- Loss of hydrogen (H): -1.007825.
- Loss of ammonia (NH3): -17.026549.
- Loss of water (H2O): -18.010565.
- Loss of carbon monoxide (CO): -27.994915.
- Loss of carbon dioxide (CO2): -43.989829.
- Loss of formamide (HCONH2): -45.021464.
- Loss of formic acid (HCOOH): -46.005479.
- Loss of methanesulfenic acid (CH4OS): -63.998301.
- Loss of sulfur trioxide (SO3): -79.956818.
- Loss of metaphosphoric acid (HPO3): -79.966331.
- Loss of mercaptoacetamide (C2H5NOS): -91.009195.
- Loss of mercaptoacetic acid (C2H4O2S): -91.993211.
- Loss of phosphoric acid (H3PO4): -97.976896.
>>> proforma_sequence = "MYPEPTIDEK/2"
>>> parsed_proforma = proforma.parse(proforma_sequence)
>>> spectrum.annotate_proforma(proforma_sequence, ...)
Returns
-------
MsmsSpectrum
or
>>> spectrum._annotate_proteoforms(parsed_proforma, proforma_sequence, ...)
WARN:
This function does not check that the passed sequence corresponds to the
passed proteoforms.
For additional information on the arguments, see the
`MsmsSpectrum.annotate_proforma` documentation.
"""
if fragment_tol_mode not in ("Da", "ppm"):
raise ValueError(
"Unknown fragment mass tolerance unit specified. Supported "
'values are "Da" or "ppm".'
)
self.proforma = proforma_str
mass_diff = functools.partial(
utils.mass_diff, mode_is_da=fragment_tol_mode == "Da"
)

self.proforma = proforma_str
self._annotation = np.full_like(self.mz, None, object)
# By default, peak charges are assumed to be smaller than the precursor
# charge.
Expand All @@ -703,9 +681,7 @@ def annotate_proforma(
neutral_losses = fa._neutral_loss
if neutral_losses is not None and None not in neutral_losses:
neutral_losses[None] = 0
# Parse the ProForma string and find peaks that match the theoretical
# fragments.
proteoforms = proforma.parse(self.proforma)

analyte_number = 1 if len(proteoforms) > 1 else None
for proteoform in proteoforms:
fragments = fa.get_theoretical_fragments(
Expand Down Expand Up @@ -742,4 +718,74 @@ def annotate_proforma(
self.annotation[peak_i] = pi
if analyte_number is not None:
analyte_number += 1

return self

def annotate_proforma(
self,
proforma_str: str,
fragment_tol_mass: float,
fragment_tol_mode: str,
ion_types: str = "by",
max_ion_charge: Optional[int] = None,
neutral_losses: Union[bool, Dict[Optional[str], float]] = False,
) -> MsmsSpectrum:
"""
Assign fragment ion labels to the peaks from a ProForma annotation.
Parameters
----------
proforma_str : str
The ProForma spectrum annotation.
fragment_tol_mass : float
Fragment mass tolerance to match spectrum peaks against theoretical
peaks.
fragment_tol_mode : {'Da', 'ppm'}
Fragment mass tolerance unit. Either 'Da' or 'ppm'.
ion_types : str, optional
The ion types to generate. Can be any combination of 'a', 'b', 'c',
'x', 'y', and 'z' for peptide fragments, 'I' for immonium ions, 'm'
for internal fragment ions, 'p' for the precursor ion, and 'r' for
reporter ions. The default is 'by', which means that b and y
peptide ions will be generated.
max_ion_charge : Optional[int], optional
All fragments up to and including the given charge will be
annotated (by default all fragments with a charge up to the
precursor minus one (minimum charge one) will be annotated).
neutral_losses : Union[bool, Dict[Optional[str], float]], optional
Neutral losses to consider for each peak. If `None` or `False`, no
neutral losses are considered. If specified as a dictionary, keys
should be the molecular formulas of the neutral losses and values
the corresponding mass differences. Note that mass differences
should typically be negative. If `True`, all of the following
neutral losses are considered:
- Loss of hydrogen (H): -1.007825.
- Loss of ammonia (NH3): -17.026549.
- Loss of water (H2O): -18.010565.
- Loss of carbon monoxide (CO): -27.994915.
- Loss of carbon dioxide (CO2): -43.989829.
- Loss of formamide (HCONH2): -45.021464.
- Loss of formic acid (HCOOH): -46.005479.
- Loss of methanesulfenic acid (CH4OS): -63.998301.
- Loss of sulfur trioxide (SO3): -79.956818.
- Loss of metaphosphoric acid (HPO3): -79.966331.
- Loss of mercaptoacetamide (C2H5NOS): -91.009195.
- Loss of mercaptoacetic acid (C2H4O2S): -91.993211.
- Loss of phosphoric acid (H3PO4): -97.976896.
Returns
-------
MsmsSpectrum
"""
proteoforms = proforma.parse(proforma_str)

return self._annotate_proteoforms(
proforma_str=proforma_str,
proteoforms=proteoforms,
fragment_tol_mass=fragment_tol_mass,
fragment_tol_mode=fragment_tol_mode,
ion_types=ion_types,
max_ion_charge=max_ion_charge,
neutral_losses=neutral_losses,
)

0 comments on commit 40151e2

Please sign in to comment.