Skip to content

Commit

Permalink
Update API and add support for small molecules (#43)
Browse files Browse the repository at this point in the history
* Add small molecule support and update peptide transformers to analyte transformers

* Various fixes and new precursor calculation

* Finished tests and fixed some bugs

* Revert splitting peptide datasets

* Fix bugs and improve test coverage

* Make missing residues an error

* Add molecule tests

* Added customizable start and stop tokens

* Allow add stop and start even if no token exists

* Update test

* Add embed method

* Start making Wout's edits

* Most of Wout's edits done

* Final fixes

* Fix formatting errors

* Bump pre-commit versions

* Ruff format update
  • Loading branch information
wfondrie authored Mar 7, 2024
1 parent bd2861f commit b1f25ce
Show file tree
Hide file tree
Showing 41 changed files with 1,173 additions and 791 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ repos:
- id: trailing-whitespace
- id: detect-private-key
- repo: https://github.com/charliermarsh/ruff-pre-commit
rev: v0.1.7
rev: v0.3.1
hooks:
# Run the linter.
- id: ruff
Expand Down
1 change: 1 addition & 0 deletions depthcharge/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Initialize the depthcharge package."""

# Ignore a bunch of pkg_resources warnings from dependencies:
import warnings

Expand Down
1 change: 1 addition & 0 deletions depthcharge/constants.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Constants."""

HYDROGEN = 1.007825035
OXYGEN = 15.99491463
H2O = 2 * HYDROGEN + OXYGEN
Expand Down
3 changes: 2 additions & 1 deletion depthcharge/data/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
"""The Pytorch Datasets."""

from . import preprocessing
from .analyte_datasets import AnalyteDataset
from .arrow import (
spectra_to_df,
spectra_to_parquet,
spectra_to_stream,
)
from .fields import CustomField
from .peptide_datasets import PeptideDataset
from .spectrum_datasets import (
AnnotatedSpectrumDataset,
SpectrumDataset,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
"""Datasets for working with peptide sequences."""

from collections.abc import Iterable

import torch
from torch.utils.data import DataLoader, TensorDataset

from ..tokenizers import PeptideTokenizer
from ..tokenizers import Tokenizer


class PeptideDataset(TensorDataset):
class AnalyteDataset(TensorDataset):
"""A dataset for peptide sequences.
Parameters
Expand All @@ -18,33 +19,26 @@ class PeptideDataset(TensorDataset):
sequences : Iterable[str]
The peptide sequences in a format compatible with
your tokenizer. ProForma is preferred.
charges : torch.Tensor,
The charge state for each peptide.
*args : torch.Tensor, optional
Additional values to include during data loading.
"""

def __init__(
self,
tokenizer: PeptideTokenizer,
tokenizer: Tokenizer,
sequences: Iterable[str],
charges: torch.Tensor,
*args: torch.Tensor,
) -> None:
"""Initialize a PeptideDataset."""
tokens = tokenizer.tokenize(sequences)
super().__init__(tokens, charges, *args)
super().__init__(tokens, *args)

@property
def tokens(self) -> torch.Tensor:
"""The peptide sequence tokens."""
return self.tensors[0]

@property
def charges(self) -> torch.Tensor:
"""The peptide charges."""
return self.tensors[1]

def loader(self, *args: tuple, **kwargs: dict) -> DataLoader:
"""A PyTorch DataLoader for peptides.
Expand All @@ -61,5 +55,6 @@ def loader(self, *args: tuple, **kwargs: dict) -> DataLoader:
-------
torch.utils.data.DataLoader
A DataLoader for the peptide.
"""
return DataLoader(self, *args, **kwargs)
17 changes: 12 additions & 5 deletions depthcharge/data/arrow.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Store spectrum data as Arrow tables."""

from collections.abc import Callable, Generator, Iterable
from os import PathLike
from pathlib import Path
Expand Down Expand Up @@ -82,6 +83,7 @@ def spectra_to_stream(
-------
Generator of pyarrow.RecordBatch
Batches of parsed spectra.
"""
parser_args = {
"ms_level": ms_level,
Expand Down Expand Up @@ -195,6 +197,7 @@ def spectra_to_parquet(
-------
Path
The Parquet file that was written.
"""
streamer = spectra_to_stream(
peak_file=peak_file,
Expand All @@ -210,12 +213,15 @@ def spectra_to_parquet(
if parquet_file is None:
parquet_file = Path(Path(peak_file).stem).with_suffix(".parquet")

writer = None
for batch in streamer:
if writer is None:
writer = pq.ParquetWriter(parquet_file, schema=batch.schema)
try:
writer = None
for batch in streamer:
if writer is None:
writer = pq.ParquetWriter(parquet_file, schema=batch.schema)

writer.write_batch(batch)
writer.write_batch(batch)
finally:
writer.close()

return parquet_file

Expand Down Expand Up @@ -287,6 +293,7 @@ def spectra_to_df(
-------
Path
The Parquet file that was written.
"""
streamer = spectra_to_stream(
peak_file=peak_file,
Expand Down
2 changes: 2 additions & 0 deletions depthcharge/data/fields.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Custom fields for the Arrow Schema."""

from collections.abc import Callable
from dataclasses import dataclass

Expand All @@ -24,6 +25,7 @@ class CustomField:
each spectrum.
dtype: pyarrow.DataType
The expected Arrow data type for the column in the schema.
"""

name: str
Expand Down
18 changes: 18 additions & 0 deletions depthcharge/data/parsers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Mass spectrometry data parsers."""

from __future__ import annotations

import logging
Expand Down Expand Up @@ -44,6 +45,7 @@ class BaseParser(ABC):
Enable or disable the progress bar.
id_type : str, optional
The Hupo-PSI prefix for the spectrum identifier.
"""

def __init__(
Expand Down Expand Up @@ -111,6 +113,7 @@ def sniff(self) -> None:
------
IOError
Raised if the file is not the expected format.
"""

@abstractmethod
Expand All @@ -130,6 +133,7 @@ def parse_spectrum(self, spectrum: dict) -> MassSpectrum | None:
-------
MassSpectrum or None
The parsed mass spectrum or None if it is skipped.
"""

def parse_custom_fields(self, spectrum: dict) -> dict[str, Any]:
Expand All @@ -144,6 +148,7 @@ def parse_custom_fields(self, spectrum: dict) -> dict[str, Any]:
-------
dict
The parsed value of each, whatever it may be.
"""
out = {}
if self.custom_fields is None:
Expand All @@ -167,6 +172,7 @@ def iter_batches(self, batch_size: int | None) -> pa.RecordBatch:
------
RecordBatch
A batch of spectra and their metadata.
"""
batch_size = float("inf") if batch_size is None else batch_size
pbar_args = {
Expand Down Expand Up @@ -229,6 +235,7 @@ def _update_batch(self, entry: dict) -> None:
----------
entry : dict
        The element to add.
"""
if self._batch is None:
self._batch = {k: [v] for k, v in entry.items()}
Expand Down Expand Up @@ -264,6 +271,7 @@ class MzmlParser(BaseParser):
spectrum from the corresponding Pyteomics parser.
progress : bool, optional
Enable or disable the progress bar.
"""

def sniff(self) -> None:
Expand All @@ -273,6 +281,7 @@ def sniff(self) -> None:
------
IOError
Raised if the file is not the expected format.
"""
with self.peak_file.open() as mzdat:
next(mzdat)
Expand All @@ -295,6 +304,7 @@ def parse_spectrum(self, spectrum: dict) -> MassSpectrum | None:
-------
MassSpectrum or None
The parsed mass spectrum or None if not at the correct MS level.
"""
ms_level = spectrum["ms level"]
if self.ms_level is not None and ms_level not in self.ms_level:
Expand Down Expand Up @@ -363,6 +373,7 @@ class MzxmlParser(BaseParser):
spectrum from the corresponding Pyteomics parser.
progress : bool, optional
Enable or disable the progress bar.
"""

def sniff(self) -> None:
Expand All @@ -372,6 +383,7 @@ def sniff(self) -> None:
------
IOError
Raised if the file is not the expected format.
"""
scent = "http://sashimi.sourceforge.net/schema_revision/mzXML"
with self.peak_file.open() as mzdat:
Expand All @@ -395,6 +407,7 @@ def parse_spectrum(self, spectrum: dict) -> MassSpectrum | None:
-------
MassSpectrum
The parsed mass spectrum.
"""
ms_level = spectrum["msLevel"]
if self.ms_level is not None and ms_level not in self.ms_level:
Expand Down Expand Up @@ -442,6 +455,7 @@ class MgfParser(BaseParser):
spectrum from the corresponding Pyteomics parser.
progress : bool, optional
Enable or disable the progress bar.
"""

def __init__(
Expand Down Expand Up @@ -476,6 +490,7 @@ def sniff(self) -> None:
------
IOError
Raised if the file is not the expected format.
"""
with self.peak_file.open() as mzdat:
if not next(mzdat).startswith("BEGIN IONS"):
Expand All @@ -492,6 +507,7 @@ def parse_spectrum(self, spectrum: dict) -> MassSpectrum:
----------
spectrum : dict
The dictionary defining the spectrum in MGF format.
"""
self._counter += 1
if self.ms_level is not None and 1 not in self.ms_level:
Expand Down Expand Up @@ -531,6 +547,7 @@ def _parse_scan_id(scan_str: str | int) -> int:
-------
int
The scan ID number.
"""
try:
return int(scan_str)
Expand Down Expand Up @@ -565,6 +582,7 @@ def get_parser(cls, peak_file: PathLike, **kwargs: dict) -> BaseParser:
The peak file to parse.
kwargs : dict
Keyword arguments to pass to the parser.
"""
for parser in cls.parsers:
try:
Expand Down
3 changes: 3 additions & 0 deletions depthcharge/data/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def my_func(spec: MassSpectrum) -> MassSpectrum:
```
"""

from collections.abc import Callable
from functools import wraps

Expand Down Expand Up @@ -79,6 +80,7 @@ def wrapper(
-------
Callable
        A valid depthcharge preprocessing function.
"""

@wraps(wrapper)
Expand All @@ -94,6 +96,7 @@ def preprocess(spec: MassSpectrum) -> MassSpectrum:
-------
MassSpectrum
The processed mass spectrum.
"""
# Call the spectrum_utils method:
getattr(spec, func)(*args, **kwargs)
Expand Down
Loading

0 comments on commit b1f25ce

Please sign in to comment.