Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update API and add support for small molecules #43

Merged
merged 18 commits into from
Mar 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ repos:
- id: trailing-whitespace
- id: detect-private-key
- repo: https://github.com/charliermarsh/ruff-pre-commit
rev: v0.1.7
rev: v0.3.1
hooks:
# Run the linter.
- id: ruff
Expand Down
1 change: 1 addition & 0 deletions depthcharge/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Initialize the depthcharge package."""

# Ignore a bunch of pkg_resources warnings from dependencies:
import warnings

Expand Down
1 change: 1 addition & 0 deletions depthcharge/constants.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Constants."""

HYDROGEN = 1.007825035
OXYGEN = 15.99491463
H2O = 2 * HYDROGEN + OXYGEN
Expand Down
3 changes: 2 additions & 1 deletion depthcharge/data/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
"""The Pytorch Datasets."""

from . import preprocessing
from .analyte_datasets import AnalyteDataset
from .arrow import (
spectra_to_df,
spectra_to_parquet,
spectra_to_stream,
)
from .fields import CustomField
from .peptide_datasets import PeptideDataset
from .spectrum_datasets import (
AnnotatedSpectrumDataset,
SpectrumDataset,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
"""Datasets for working with peptide sequences."""

from collections.abc import Iterable

import torch
from torch.utils.data import DataLoader, TensorDataset

from ..tokenizers import PeptideTokenizer
from ..tokenizers import Tokenizer


class PeptideDataset(TensorDataset):
class AnalyteDataset(TensorDataset):
"""A dataset for peptide sequences.

Parameters
Expand All @@ -18,33 +19,26 @@ class PeptideDataset(TensorDataset):
sequences : Iterable[str]
The peptide sequences in a format compatible with
your tokenizer. ProForma is preferred.
charges : torch.Tensor,
The charge state for each peptide.
*args : torch.Tensor, optional
Additional values to include during data loading.

"""

def __init__(
self,
tokenizer: PeptideTokenizer,
tokenizer: Tokenizer,
sequences: Iterable[str],
charges: torch.Tensor,
*args: torch.Tensor,
) -> None:
"""Initialize an AnalyteDataset."""
tokens = tokenizer.tokenize(sequences)
super().__init__(tokens, charges, *args)
super().__init__(tokens, *args)

@property
def tokens(self) -> torch.Tensor:
"""The peptide sequence tokens."""
return self.tensors[0]

@property
def charges(self) -> torch.Tensor:
"""The peptide charges."""
return self.tensors[1]

def loader(self, *args: tuple, **kwargs: dict) -> DataLoader:
"""A PyTorch DataLoader for peptides.

Expand All @@ -61,5 +55,6 @@ def loader(self, *args: tuple, **kwargs: dict) -> DataLoader:
-------
torch.utils.data.DataLoader
A DataLoader for the peptide.

"""
return DataLoader(self, *args, **kwargs)
17 changes: 12 additions & 5 deletions depthcharge/data/arrow.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Store spectrum data as Arrow tables."""

from collections.abc import Callable, Generator, Iterable
from os import PathLike
from pathlib import Path
Expand Down Expand Up @@ -82,6 +83,7 @@ def spectra_to_stream(
-------
Generator of pyarrow.RecordBatch
Batches of parsed spectra.

"""
parser_args = {
"ms_level": ms_level,
Expand Down Expand Up @@ -195,6 +197,7 @@ def spectra_to_parquet(
-------
Path
The Parquet file that was written.

"""
streamer = spectra_to_stream(
peak_file=peak_file,
Expand All @@ -210,12 +213,15 @@ def spectra_to_parquet(
if parquet_file is None:
parquet_file = Path(Path(peak_file).stem).with_suffix(".parquet")

writer = None
for batch in streamer:
if writer is None:
writer = pq.ParquetWriter(parquet_file, schema=batch.schema)
try:
writer = None
for batch in streamer:
if writer is None:
writer = pq.ParquetWriter(parquet_file, schema=batch.schema)

writer.write_batch(batch)
writer.write_batch(batch)
finally:
writer.close()

return parquet_file

Expand Down Expand Up @@ -287,6 +293,7 @@ def spectra_to_df(
-------
Path
The Parquet file that was written.

"""
streamer = spectra_to_stream(
peak_file=peak_file,
Expand Down
2 changes: 2 additions & 0 deletions depthcharge/data/fields.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Custom fields for the Arrow Schema."""

from collections.abc import Callable
from dataclasses import dataclass

Expand All @@ -24,6 +25,7 @@ class CustomField:
each spectrum.
dtype: pyarrow.DataType
The expected Arrow data type for the column in the schema.

"""

name: str
Expand Down
18 changes: 18 additions & 0 deletions depthcharge/data/parsers.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Mass spectrometry data parsers."""

from __future__ import annotations

import logging
Expand Down Expand Up @@ -44,6 +45,7 @@ class BaseParser(ABC):
Enable or disable the progress bar.
id_type : str, optional
The Hupo-PSI prefix for the spectrum identifier.

"""

def __init__(
Expand Down Expand Up @@ -111,6 +113,7 @@ def sniff(self) -> None:
------
IOError
Raised if the file is not the expected format.

"""

@abstractmethod
Expand All @@ -130,6 +133,7 @@ def parse_spectrum(self, spectrum: dict) -> MassSpectrum | None:
-------
MassSpectrum or None
The parsed mass spectrum or None if it is skipped.

"""

def parse_custom_fields(self, spectrum: dict) -> dict[str, Any]:
Expand All @@ -144,6 +148,7 @@ def parse_custom_fields(self, spectrum: dict) -> dict[str, Any]:
-------
dict
The parsed value of each, whatever it may be.

"""
out = {}
if self.custom_fields is None:
Expand All @@ -167,6 +172,7 @@ def iter_batches(self, batch_size: int | None) -> pa.RecordBatch:
------
RecordBatch
A batch of spectra and their metadata.

"""
batch_size = float("inf") if batch_size is None else batch_size
pbar_args = {
Expand Down Expand Up @@ -229,6 +235,7 @@ def _update_batch(self, entry: dict) -> None:
----------
entry : dict
The element to add.

"""
if self._batch is None:
self._batch = {k: [v] for k, v in entry.items()}
Expand Down Expand Up @@ -264,6 +271,7 @@ class MzmlParser(BaseParser):
spectrum from the corresponding Pyteomics parser.
progress : bool, optional
Enable or disable the progress bar.

"""

def sniff(self) -> None:
Expand All @@ -273,6 +281,7 @@ def sniff(self) -> None:
------
IOError
Raised if the file is not the expected format.

"""
with self.peak_file.open() as mzdat:
next(mzdat)
Expand All @@ -295,6 +304,7 @@ def parse_spectrum(self, spectrum: dict) -> MassSpectrum | None:
-------
MassSpectrum or None
The parsed mass spectrum or None if not at the correct MS level.

"""
ms_level = spectrum["ms level"]
if self.ms_level is not None and ms_level not in self.ms_level:
Expand Down Expand Up @@ -363,6 +373,7 @@ class MzxmlParser(BaseParser):
spectrum from the corresponding Pyteomics parser.
progress : bool, optional
Enable or disable the progress bar.

"""

def sniff(self) -> None:
Expand All @@ -372,6 +383,7 @@ def sniff(self) -> None:
------
IOError
Raised if the file is not the expected format.

"""
scent = "http://sashimi.sourceforge.net/schema_revision/mzXML"
with self.peak_file.open() as mzdat:
Expand All @@ -395,6 +407,7 @@ def parse_spectrum(self, spectrum: dict) -> MassSpectrum | None:
-------
MassSpectrum
The parsed mass spectrum.

"""
ms_level = spectrum["msLevel"]
if self.ms_level is not None and ms_level not in self.ms_level:
Expand Down Expand Up @@ -442,6 +455,7 @@ class MgfParser(BaseParser):
spectrum from the corresponding Pyteomics parser.
progress : bool, optional
Enable or disable the progress bar.

"""

def __init__(
Expand Down Expand Up @@ -476,6 +490,7 @@ def sniff(self) -> None:
------
IOError
Raised if the file is not the expected format.

"""
with self.peak_file.open() as mzdat:
if not next(mzdat).startswith("BEGIN IONS"):
Expand All @@ -492,6 +507,7 @@ def parse_spectrum(self, spectrum: dict) -> MassSpectrum:
----------
spectrum : dict
The dictionary defining the spectrum in MGF format.

"""
self._counter += 1
if self.ms_level is not None and 1 not in self.ms_level:
Expand Down Expand Up @@ -531,6 +547,7 @@ def _parse_scan_id(scan_str: str | int) -> int:
-------
int
The scan ID number.

"""
try:
return int(scan_str)
Expand Down Expand Up @@ -565,6 +582,7 @@ def get_parser(cls, peak_file: PathLike, **kwargs: dict) -> BaseParser:
The peak file to parse.
kwargs : dict
Keyword arguments to pass to the parser.

"""
for parser in cls.parsers:
try:
Expand Down
3 changes: 3 additions & 0 deletions depthcharge/data/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def my_func(spec: MassSpectrum) -> MassSpectrum:
```

"""

from collections.abc import Callable
from functools import wraps

Expand Down Expand Up @@ -79,6 +80,7 @@ def wrapper(
-------
Callable
A valid depthcharge preprocessing function.

"""

@wraps(wrapper)
Expand All @@ -94,6 +96,7 @@ def preprocess(spec: MassSpectrum) -> MassSpectrum:
-------
MassSpectrum
The processed mass spectrum.

"""
# Call the spectrum_utils method:
getattr(spec, func)(*args, **kwargs)
Expand Down
Loading
Loading