Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add EncyclopeDIA backend #72

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
ccba625
checkpoint
mobiusklein Mar 31, 2023
7e3e720
Merge branch 'master' into feature/clusters
mobiusklein Apr 1, 2023
2fb766f
Checkpoint
mobiusklein Apr 18, 2023
469aab1
Merge branch 'master' into feature/clusters
mobiusklein Apr 28, 2023
a256e28
Missing import
mobiusklein May 7, 2023
f3f9f19
Merge branch 'master' into feature/clusters
mobiusklein May 7, 2023
dd6dc2a
Add cluster support, fix crosswired key vs index at the heart of the …
mobiusklein May 12, 2023
f1cb7cb
Fix up slicing
mobiusklein May 12, 2023
778cb32
Fix cluster retrieval in JSON backend
mobiusklein May 12, 2023
a1a965e
Update machinery for testing
mobiusklein May 16, 2023
ecadc0a
Add tests for Spectronaut and DIA-NN libraries
mobiusklein May 16, 2023
a59847a
Fix cluster indexing
mobiusklein May 16, 2023
890d8fa
Make enums internal
mobiusklein May 26, 2023
080facc
Violating MAY rules is still valid
mobiusklein May 26, 2023
a784c27
Rebuild references
mobiusklein May 26, 2023
33c5c0a
Merge branch 'master' of https://github.com/HUPO-PSI/mzSpecLib into f…
mobiusklein May 27, 2023
760df7b
Checkpoint
mobiusklein Jun 18, 2023
539af14
Fix Spectronaut spectrum origin type
mobiusklein Jun 18, 2023
d95af64
Make method public
mobiusklein Jun 18, 2023
152c006
Emit warnings when space delimiters are found
mobiusklein Jun 19, 2023
2588b41
Ensure the object is an iterator
mobiusklein Jun 19, 2023
832ce4b
Handle integers in aggregation
mobiusklein Jun 23, 2023
9d7e64a
checkpoint
mobiusklein Jun 30, 2023
7142e54
Working draft
mobiusklein Jul 1, 2023
70e4fb3
Working draft
mobiusklein Jul 16, 2023
0fbe5a6
Merge branch 'master' of https://github.com/HUPO-PSI/mzSpecLib into f…
mobiusklein Jul 28, 2023
1f32b58
Merge branch 'master' of https://github.com/mobiusklein/SpectralLibra…
mobiusklein Jul 28, 2023
ae93e94
Decoy label
mobiusklein Jul 28, 2023
0c07226
Fix N-terminal modification parsing
mobiusklein Aug 11, 2023
b9e0d5a
Remove breakpoint
mobiusklein Aug 11, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions implementations/python/mzlib/backends/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,9 +577,12 @@ def read(self) -> Iterator[Spectrum]:
with open_stream(self.filename, 'rt') as stream:
i = 0
reader = self._open_reader(stream)
if self._headers:
# Skip the header line if we've already parsed them
_ = next(reader)
buffering_reader = self._batch_rows(reader)
for i, buffer in enumerate(buffering_reader):
yield self._parse(buffer, i)
yield self._parse_from_buffer(buffer, i)


class SpectralLibraryWriterBase(_VocabularyResolverMixin, metaclass=SubclassRegisteringMetaclass):
Expand Down Expand Up @@ -629,7 +632,7 @@ def write_library(self, library: SpectralLibraryBackendBase):
step = max(min(n // 100, 5000), 1)
ident = ''
i = 0
for i, entry in enumerate(library):
for i, entry in enumerate(library.read()):
if i % step == 0 and i:
if isinstance(entry, SpectrumCluster):
tag = "cluster "
Expand Down
10 changes: 4 additions & 6 deletions implementations/python/mzlib/backends/bibliospec.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
from dataclasses import dataclass
from multiprocessing import connection
import re
import os
import sqlite3
import zlib
from dataclasses import dataclass

from typing import Iterator, List, Mapping, Tuple, Iterable, Type

Expand All @@ -25,10 +22,11 @@ class BibliospecBase:
connection: sqlite3.Connection

def _correct_modifications_in_sequence(self, row: Mapping) -> proforma.ProForma:
'''Correct the modifications in Bibliospec's modified peptide sequence.
"""
Correct the modifications in Bibliospec's modified peptide sequence.

Bibliospec only stores modifications as delta masses.
'''
"""
mods = self.connection.execute("SELECT * FROM Modifications WHERE RefSpectraID = ?", (row['id'], )).fetchall()
peptide = proforma.ProForma.parse(row["peptideModSeq"])
for mod in mods:
Expand Down
179 changes: 179 additions & 0 deletions implementations/python/mzlib/backends/encyclopedia.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
import sqlite3
import zlib
from dataclasses import dataclass

from typing import Any, Iterator, List, Mapping, Tuple, Iterable, Type

import numpy as np

from pyteomics import proforma

from mzlib import annotation
from mzlib.analyte import FIRST_ANALYTE_KEY, FIRST_INTERPRETATION_KEY, Analyte, ProteinDescription
from mzlib.spectrum import Spectrum, SPECTRUM_NAME, CHARGE_STATE
from mzlib.attributes import AttributeManager, Attributed, Attribute

from mzlib.backends.base import SpectralLibraryBackendBase, FORMAT_VERSION_TERM, DEFAULT_VERSION

from mzlib.index.base import IndexBase


DECOY_SPECTRUM = "MS:1003192|decoy spectrum"
DECOY_PEPTIDE_SPECTRUM = "MS:1003195|unnatural peptidoform decoy spectrum"


def _decode_peaks(record: sqlite3.Row):
raw_data = zlib.decompress(record['MassArray'])
mass_array = np.frombuffer(raw_data, dtype='>d')
raw_data = zlib.decompress(record['IntensityArray'])
intensity_array = np.frombuffer(raw_data, dtype='>f')
return mass_array, intensity_array


@dataclass
class EncyclopediaIndexRecord:
number: int
precursor_mz: float
precursor_charge: int
peptide: str


class EncyclopediaIndex(IndexBase):
connection: sqlite3.Connection

def __init__(self, connection):
self.connection = connection

def __getitem__(self, i):
if isinstance(i, int):
return self.search(i + 1)
elif isinstance(i, slice):
return [self.search(j + 1) for j in range(i.start or 0, i.stop or len(self), i.step or 1)]
else:
raise TypeError(f"Cannot index {self.__class__.__name__} with {i}")

def _record_from(self, row: Mapping) -> EncyclopediaIndexRecord:
peptide_sequence = row['PeptideModSeq']
return EncyclopediaIndexRecord(row['rowid'], row['PrecursorMz'], row['PrecursorCharge'], peptide_sequence)

def search(self, i):
if isinstance(i, int):
info = self.connection.execute("SELECT rowid, PrecursorMz, PrecursorCharge, PeptideModSeq FROM entries WHERE rowid = ?", (i, )).fetchone()
return self._record_from(info)
elif isinstance(i, str):
raise NotImplementedError()

def __iter__(self):
return map(self._record_from, self.connection.execute("SELECT rowid, PrecursorMz, PrecursorCharge, PeptideModSeq FROM entries ORDER BY rowid").fetchall())

def __len__(self):
return self.connection.execute("SELECT count(rowid) FROM entries;").fetchone()[0]


class EncyclopediaSpectralLibrary(SpectralLibraryBackendBase):
"""Read EncyclopeDIA SQLite3 spectral library files."""

connection: sqlite3.Connection

file_format = "dlib"
format_name = "encyclopedia"

@classmethod
def has_index_preference(cls, filename) -> Type[IndexBase]:
return EncyclopediaIndex

def __init__(self, filename: str, **kwargs):
super().__init__(filename)
self.connection = sqlite3.connect(filename)
self.connection.row_factory = sqlite3.Row
self.index = EncyclopediaIndex(self.connection)
self.read_header()

def read_header(self) -> bool:
attribs = AttributeManager()
attribs.add_attribute(FORMAT_VERSION_TERM, DEFAULT_VERSION)
attribs.add_attribute("MS:1003207|library creation software", "EncyclopeDIA")
self.attributes = attribs
return True

def _populate_analyte(self, analyte: Analyte, row: Mapping[str, Any]):
"""
Fill an analyte with details describing a peptide sequence and inferring
from context its traits based upon the assumptions EncyclopeDIA makes.

EncyclopeDIA only stores modifications as delta masses.
"""
peptide = proforma.ProForma.parse(row['PeptideModSeq'])
analyte.add_attribute("MS:1003169|proforma peptidoform sequence", str(peptide))
analyte.add_attribute("MS:1001117|theoretical mass", peptide.mass)
analyte.add_attribute("MS:1000888|stripped peptide sequence", row['PeptideSeq'])
analyte.add_attribute(CHARGE_STATE, row['PrecursorCharge'])

cursor = self.connection.execute(
"SELECT ProteinAccession, isDecoy FROM peptidetoprotein WHERE PeptideSeq = ?;", (row['PeptideSeq'], ))

had_decoy = False
for protrow in cursor:
accession = protrow['ProteinAccession']
is_decoy = bool(int(protrow['isDecoy']))
had_decoy = had_decoy or is_decoy
analyte.add_attribute_group([
Attribute('MS:1000885|protein accession', accession)
])
return had_decoy

def get_spectrum(self, spectrum_number: int = None, spectrum_name: str = None):
"""
Read a spectrum from the spectrum library.

EncyclopeDIA does not support alternative labeling of spectra with a
plain text name so looking up by `spectrum_name` is not supported.
"""
if spectrum_number is None:
raise ValueError("Only spectrum number queries are supported. spectrum_number must have an integer value")

info = self.connection.execute("SELECT rowid, * FROM entries WHERE rowid = ?;", (spectrum_number, )).fetchone()
spectrum = self._new_spectrum()
spectrum.key = info['rowid']
spectrum.index = info['rowid'] - 1
spectrum.precursor_mz = info['PrecursorMz']
try:
spectrum.add_attribute("MS:1000894|retention time", info['RTInSeconds'] / 60.0)
except KeyError:
pass

try:
spectrum.add_attribute(
"MS:1003203|constituent spectrum file",
f"file://{info['SourceFile']}"
)
except KeyError:
pass


analyte = self._new_analyte(1)
had_decoy = self._populate_analyte(analyte, info)
if had_decoy:
spectrum.add_attribute(DECOY_SPECTRUM, DECOY_PEPTIDE_SPECTRUM)

spectrum.add_analyte(analyte)

interp = self._new_interpretation(1)
interp.add_analyte(analyte)
spectrum.add_interpretation(interp)

mz_array, intensity_array = _decode_peaks(info)
n_peaks = len(mz_array)
spectrum.add_attribute("MS:1003059|number of peaks", n_peaks)

peak_list = []
# EncyclopeDIA does not encode product ion identities
for i, mz in enumerate(mz_array):
row = (mz, intensity_array[i], [], [])
peak_list.append(row)
spectrum.peak_list = peak_list
return spectrum

def read(self) -> Iterator[Spectrum]:
for rec in self.index:
yield self.get_spectrum(rec.number)
16 changes: 13 additions & 3 deletions implementations/python/mzlib/backends/spectronaut.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,12 @@ def _rewrite_modified_peptide_as_proforma(sequence: str) -> str:
last_paren = None
for i, c in enumerate(sequence):
if c == ']':
# Erase any text in parentheses as these indicate the modification
# rule and not the modificatin name. We could look at the modification
# rule to infer N-term and C-term rules, but we don't have enough examples
if last_paren is not None:
k = i - last_paren
for j in range(k + 1):
for _ in range(k + 1):
buffer.pop()
last_paren = None
buffer.append(c)
Expand All @@ -45,7 +48,15 @@ def _rewrite_modified_peptide_as_proforma(sequence: str) -> str:
buffer.append(c)
else:
buffer.append(c)
return ''.join(buffer)
pf_seq = ''.join(buffer)
# A peptide with an N-terminal modification will start with a square brace
# but needs to have a "-" added to be well-formed ProForma
if pf_seq.startswith("["):
i = pf_seq.find(']') + 1
if i == 0:
raise ValueError(f"Malformed peptide sequence {sequence}")
pf_seq = f"{pf_seq[:i]}-{pf_seq[i:]}"
return pf_seq


def _parse_value(value: str) -> Union[float, int, str, bool]:
Expand Down Expand Up @@ -205,7 +216,6 @@ def _generate_peaks(self, batch: List[Dict[str, Any]]) -> List[Tuple[float, floa
def _build_analyte(self, description: Dict[str, Any], analyte: Analyte) -> Analyte:
pf_seq = _rewrite_modified_peptide_as_proforma(description['ModifiedPeptide'])
peptide = proforma.ProForma.parse(pf_seq)

analyte.add_attribute(STRIPPED_PEPTIDE_TERM, description['StrippedPeptide'])
analyte.add_attribute(PROFORMA_PEPTIDE_TERM, pf_seq)
analyte.add_attribute("MS:1001117|theoretical mass", peptide.mass)
Expand Down