Speedup of peptide parsing and annotation (#61)

* added shortcuts to parsing and annotating * added simple parser to manifest * review suggestions * fixed deprecation warning * fixed proforma annot * reverted datetime deprecation (not in py3.10)
bittremieux-lab · Apr 17, 2024 · 40151e2 · 40151e2
1 parent b1fc7bf
commit 40151e2
Show file tree

Hide file tree

Showing 2 changed files with 129 additions and 58 deletions.
diff --git a/spectrum_utils/proforma.py b/spectrum_utils/proforma.py
@@ -34,6 +34,8 @@
     import pyteomics.mass as pmass
 
 
+UNMODIFIED_PEPTIDE_REGEX = re.compile(r"^([A-Za-z]+)(/-?[0-9]+)?$")
+
 # Set to None to disable caching.
 cache_dir = platformdirs.user_cache_dir("spectrum_utils", False)
 
@@ -615,6 +617,25 @@ def _modification_sort_key(mod: Modification):
         return -4
 
 
+@functools.lru_cache(1)
+def _build_parser() -> lark.Lark:
+    """Build a lark parser for proforma sequences.
+
+    This function also caches the parser in-memory, thus loading it only
+    once per process.
+    """
+    dir_name = os.path.dirname(os.path.realpath(__file__))
+    with open(os.path.join(dir_name, "proforma.ebnf")) as f_in:
+        parser = lark.Lark(
+            f_in.read(),
+            start="proforma",
+            parser="earley",
+            lexer="dynamic_complete",
+            import_paths=[dir_name],
+        )
+    return parser
+
+
 def parse(proforma: str) -> List[Proteoform]:
     """
     Parse a ProForma-encoded string.
@@ -647,18 +668,22 @@ def parse(proforma: str) -> List[Proteoform]:
     ValueError
         If no mass was specified for a GNO term or its parent terms.
     """
-    dir_name = os.path.dirname(os.path.realpath(__file__))
-    with open(os.path.join(dir_name, "proforma.ebnf")) as f_in:
-        parser = lark.Lark(
-            f_in.read(),
-            start="proforma",
-            parser="earley",
-            lexer="dynamic_complete",
-            import_paths=[dir_name],
-        )
+    match_unmod = UNMODIFIED_PEPTIDE_REGEX.match(proforma)
+    if match_unmod is not None:
+        # Fast path for unmodified peptides.
+        charge = match_unmod.group(2)
+        if charge is not None:
+            charge = Charge(int(charge[1:]))
+        return [
+            Proteoform(sequence=match_unmod.group(1).upper(), charge=charge)
+        ]
+
+    parser = _build_parser()
     # noinspection PyUnresolvedReferences
     try:
-        return ProFormaTransformer().transform(parser.parse(proforma))
+        parsed = parser.parse(proforma)
+        parsed = ProFormaTransformer().transform(parsed)
+        return parsed
     except lark.visitors.VisitError as e:
         raise e.orig_exc
 

diff --git a/spectrum_utils/spectrum.py b/spectrum_utils/spectrum.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import copy
 import functools
 import urllib.parse
@@ -622,73 +624,49 @@ def scale_intensity(
         )
         return self
 
-    def annotate_proforma(
+    def _annotate_proteoforms(
         self,
+        proteoforms: list[proforma.Proteoform],
         proforma_str: str,
         fragment_tol_mass: float,
         fragment_tol_mode: str,
         ion_types: str = "by",
         max_ion_charge: Optional[int] = None,
         neutral_losses: Union[bool, Dict[Optional[str], float]] = False,
-    ) -> "MsmsSpectrum":
+    ) -> MsmsSpectrum:
         """
         Assign fragment ion labels to the peaks from a ProForma annotation.
 
-        Parameters
-        ----------
-        proforma_str : str
-            The ProForma spectrum annotation.
-        fragment_tol_mass : float
-            Fragment mass tolerance to match spectrum peaks against theoretical
-            peaks.
-        fragment_tol_mode : {'Da', 'ppm'}
-            Fragment mass tolerance unit. Either 'Da' or 'ppm'.
-        ion_types : str, optional
-            The ion types to generate. Can be any combination of 'a', 'b', 'c',
-            'x', 'y', and 'z' for peptide fragments, 'I' for immonium ions, 'm'
-            for internal fragment ions, 'p' for the precursor ion, and 'r' for
-            reporter ions. The default is 'by', which means that b and y
-            peptide ions will be generated.
-        max_ion_charge : Optional[int], optional
-            All fragments up to and including the given charge will be
-            annotated (by default all fragments with a charge up to the
-            precursor minus one (minimum charge one) will be annotated).
-        neutral_losses : Union[bool, Dict[Optional[str], float]], optional
-            Neutral losses to consider for each peak. If `None` or `False`, no
-            neutral losses are considered. If specified as a dictionary, keys
-            should be the molecular formulas of the neutral losses and values
-            the corresponding mass differences. Note that mass differences
-            should typically be negative. If `True`, all of the following
-            neutral losses are considered:
+        This is meant to be an internal function that uses a pre-parsed proforma string
+        instead of parsing it internally. This can be useful when the same parsed
+        sequence is used multiple times (since parsing the sequence is a lot slower
+        than annotating the peaks)
 
-            - Loss of hydrogen (H): -1.007825.
-            - Loss of ammonia (NH3): -17.026549.
-            - Loss of water (H2O): -18.010565.
-            - Loss of carbon monoxide (CO): -27.994915.
-            - Loss of carbon dioxide (CO2): -43.989829.
-            - Loss of formamide (HCONH2): -45.021464.
-            - Loss of formic acid (HCOOH): -46.005479.
-            - Loss of methanesulfenic acid (CH4OS): -63.998301.
-            - Loss of sulfur trioxide (SO3): -79.956818.
-            - Loss of metaphosphoric acid (HPO3): -79.966331.
-            - Loss of mercaptoacetamide (C2H5NOS): -91.009195.
-            - Loss of mercaptoacetic acid (C2H4O2S): -91.993211.
-            - Loss of phosphoric acid (H3PO4): -97.976896.
+        >>> proforma_sequence = "MYPEPTIDEK/2"
+        >>> parsed_proforma = proforma.parse(proforma_sequence)
+        >>> spectrum.annotate_proforma(proforma_sequence, ...)
 
-        Returns
-        -------
-        MsmsSpectrum
+        or
+
+        >>> spectrum._annotate_proteoforms(parsed_proforma, proforma_sequence, ...)
+
+        WARN:
+            This function does not check that the passed sequence corresponds to the
+            passed proteoforms.
+
+        For additional information on the arguments, see the
+        `MsmsSpectrum.annotate_proforma` documentation.
         """
         if fragment_tol_mode not in ("Da", "ppm"):
             raise ValueError(
                 "Unknown fragment mass tolerance unit specified. Supported "
                 'values are "Da" or "ppm".'
             )
+        self.proforma = proforma_str
         mass_diff = functools.partial(
             utils.mass_diff, mode_is_da=fragment_tol_mode == "Da"
         )
 
-        self.proforma = proforma_str
         self._annotation = np.full_like(self.mz, None, object)
         # By default, peak charges are assumed to be smaller than the precursor
         # charge.
@@ -703,9 +681,7 @@ def annotate_proforma(
                 neutral_losses = fa._neutral_loss
         if neutral_losses is not None and None not in neutral_losses:
             neutral_losses[None] = 0
-        # Parse the ProForma string and find peaks that match the theoretical
-        # fragments.
-        proteoforms = proforma.parse(self.proforma)
+
         analyte_number = 1 if len(proteoforms) > 1 else None
         for proteoform in proteoforms:
             fragments = fa.get_theoretical_fragments(
@@ -742,4 +718,74 @@ def annotate_proforma(
                 self.annotation[peak_i] = pi
             if analyte_number is not None:
                 analyte_number += 1
+
         return self
+
+    def annotate_proforma(
+        self,
+        proforma_str: str,
+        fragment_tol_mass: float,
+        fragment_tol_mode: str,
+        ion_types: str = "by",
+        max_ion_charge: Optional[int] = None,
+        neutral_losses: Union[bool, Dict[Optional[str], float]] = False,
+    ) -> MsmsSpectrum:
+        """
+        Assign fragment ion labels to the peaks from a ProForma annotation.
+
+        Parameters
+        ----------
+        proforma_str : str
+            The ProForma spectrum annotation.
+        fragment_tol_mass : float
+            Fragment mass tolerance to match spectrum peaks against theoretical
+            peaks.
+        fragment_tol_mode : {'Da', 'ppm'}
+            Fragment mass tolerance unit. Either 'Da' or 'ppm'.
+        ion_types : str, optional
+            The ion types to generate. Can be any combination of 'a', 'b', 'c',
+            'x', 'y', and 'z' for peptide fragments, 'I' for immonium ions, 'm'
+            for internal fragment ions, 'p' for the precursor ion, and 'r' for
+            reporter ions. The default is 'by', which means that b and y
+            peptide ions will be generated.
+        max_ion_charge : Optional[int], optional
+            All fragments up to and including the given charge will be
+            annotated (by default all fragments with a charge up to the
+            precursor minus one (minimum charge one) will be annotated).
+        neutral_losses : Union[bool, Dict[Optional[str], float]], optional
+            Neutral losses to consider for each peak. If `None` or `False`, no
+            neutral losses are considered. If specified as a dictionary, keys
+            should be the molecular formulas of the neutral losses and values
+            the corresponding mass differences. Note that mass differences
+            should typically be negative. If `True`, all of the following
+            neutral losses are considered:
+
+            - Loss of hydrogen (H): -1.007825.
+            - Loss of ammonia (NH3): -17.026549.
+            - Loss of water (H2O): -18.010565.
+            - Loss of carbon monoxide (CO): -27.994915.
+            - Loss of carbon dioxide (CO2): -43.989829.
+            - Loss of formamide (HCONH2): -45.021464.
+            - Loss of formic acid (HCOOH): -46.005479.
+            - Loss of methanesulfenic acid (CH4OS): -63.998301.
+            - Loss of sulfur trioxide (SO3): -79.956818.
+            - Loss of metaphosphoric acid (HPO3): -79.966331.
+            - Loss of mercaptoacetamide (C2H5NOS): -91.009195.
+            - Loss of mercaptoacetic acid (C2H4O2S): -91.993211.
+            - Loss of phosphoric acid (H3PO4): -97.976896.
+
+        Returns
+        -------
+        MsmsSpectrum
+        """
+        proteoforms = proforma.parse(proforma_str)
+
+        return self._annotate_proteoforms(
+            proforma_str=proforma_str,
+            proteoforms=proteoforms,
+            fragment_tol_mass=fragment_tol_mass,
+            fragment_tol_mode=fragment_tol_mode,
+            ion_types=ion_types,
+            max_ion_charge=max_ion_charge,
+            neutral_losses=neutral_losses,
+        )