Update API and add support for small molecules #43

Merged 18 commits on Mar 7, 2024
Changes from 12 commits
2 changes: 1 addition & 1 deletion depthcharge/data/__init__.py
@@ -1,12 +1,12 @@
"""The Pytorch Datasets."""
from . import preprocessing
from .analyte_datasets import AnalyteDataset
from .arrow import (
spectra_to_df,
spectra_to_parquet,
spectra_to_stream,
)
from .fields import CustomField
from .peptide_datasets import PeptideDataset
from .spectrum_datasets import (
AnnotatedSpectrumDataset,
SpectrumDataset,
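For downstream code, the visible effect of this change is the renamed dataset export. A minimal sketch of the updated import (assuming depthcharge is installed at this commit):

# The dataset is now exported under its new name; the old
# `from depthcharge.data import PeptideDataset` no longer resolves here.
from depthcharge.data import AnalyteDataset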
depthcharge/data/analyte_datasets.py
@@ -4,10 +4,10 @@
 import torch
 from torch.utils.data import DataLoader, TensorDataset
 
-from ..tokenizers import PeptideTokenizer
+from ..tokenizers import MoleculeTokenizer, PeptideTokenizer
 
 
-class PeptideDataset(TensorDataset):
+class AnalyteDataset(TensorDataset):
     """A dataset for peptide sequences.
 
     Parameters
@@ -18,33 +18,25 @@ class PeptideDataset(TensorDataset):
     sequences : Iterable[str]
         The peptide sequences in a format compatible with
         your tokenizer. ProForma is preferred.
-    charges : torch.Tensor,
-        The charge state for each peptide.
     *args : torch.Tensor, optional
         Additional values to include during data loading.
     """
 
     def __init__(
         self,
-        tokenizer: PeptideTokenizer,
+        tokenizer: PeptideTokenizer | MoleculeTokenizer,
         sequences: Iterable[str],
-        charges: torch.Tensor,
         *args: torch.Tensor,
     ) -> None:
         """Initialize a PeptideDataset."""
         tokens = tokenizer.tokenize(sequences)
-        super().__init__(tokens, charges, *args)
+        super().__init__(tokens, *args)
 
     @property
     def tokens(self) -> torch.Tensor:
         """The peptide sequence tokens."""
         return self.tensors[0]
 
-    @property
-    def charges(self) -> torch.Tensor:
-        """The peptide charges."""
-        return self.tensors[1]
-
     def loader(self, *args: tuple, **kwargs: dict) -> DataLoader:
         """A PyTorch DataLoader for peptides.
 
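The key API change in this file: AnalyteDataset no longer requires a charges tensor; charge states, or any other per-analyte values, now ride along through *args. A usage sketch as of the commits shown here (the example sequences and charges are illustrative, not from this PR):

import torch

from depthcharge.data import AnalyteDataset
from depthcharge.tokenizers import PeptideTokenizer

# Tokenize ProForma peptide sequences and bundle extra tensors alongside.
tokenizer = PeptideTokenizer()
sequences = ["LESLIEK", "EDITHR"]
charges = torch.tensor([2, 3])  # optional now, passed through *args

dataset = AnalyteDataset(tokenizer, sequences, charges)
tokens, charge = dataset[0]  # TensorDataset-style indexing
loader = dataset.loader(batch_size=2, shuffle=True)  # wraps DataLoader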
40 changes: 40 additions & 0 deletions depthcharge/mixins.py
@@ -0,0 +1,40 @@
"""Helpful Mixins."""
import torch


class ModelMixin:
"""Add helpful properties for depthcharge models."""

@property
def device(self) -> torch.device:
"""The current device for first parameter of the model."""
return next(self.parameters()).device


class TransformerMixin:
"""Properties shard by Transformer models."""
wfondrie marked this conversation as resolved.
Show resolved Hide resolved

@property
def d_model(self) -> int:
"""The latent dimensionality of the model."""
return self._d_model

@property
def nhead(self) -> int:
"""The number of attention heads."""
return self._nhead

@property
def dim_feedforward(self) -> int:
"""The dimensionality of the Transformer feedforward layers."""
return self._dim_feedforward

@property
def n_layers(self) -> int:
"""The number of Transformer layers."""
return self._n_layers

@property
def dropout(self) -> float:
"""The dropout for the transformer layers."""
return self._dropout
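Both mixins are pure attribute plumbing: ModelMixin derives the device from the first registered parameter, and TransformerMixin assumes the inheriting class sets the private attributes (self._d_model and friends) in its __init__. A minimal sketch of a class adopting them (SimpleEncoder is a hypothetical example, not part of this PR):

import torch

from depthcharge.mixins import ModelMixin, TransformerMixin


class SimpleEncoder(torch.nn.Module, ModelMixin, TransformerMixin):
    """A toy module exposing the mixin properties."""

    def __init__(self, d_model: int = 64, nhead: int = 4) -> None:
        super().__init__()
        # TransformerMixin's properties read these private attributes.
        self._d_model = d_model
        self._nhead = nhead
        self._dim_feedforward = 4 * d_model
        self._n_layers = 2
        self._dropout = 0.1
        self.linear = torch.nn.Linear(d_model, d_model)


model = SimpleEncoder()
print(model.d_model, model.nhead)  # 64 4
print(model.device)  # cpu, via next(model.parameters()).device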
3 changes: 2 additions & 1 deletion depthcharge/tokenizers/__init__.py
@@ -1,3 +1,4 @@
"""Deptcharge tokenizers."""
"""Depthcharge tokenizers."""
from .molecules import MoleculeTokenizer
from .peptides import PeptideTokenizer
from .tokenizer import Tokenizer
134 changes: 134 additions & 0 deletions depthcharge/tokenizers/molecules.py
@@ -0,0 +1,134 @@
"""Tokenizers for small molecules."""
from __future__ import annotations

from collections.abc import Iterable

import selfies as sf

from .. import utils
from ..primitives import Molecule
from .tokenizer import Tokenizer


class MoleculeTokenizer(Tokenizer):
"""A tokenizer for small molecules.

SMILES strings representing small molecules are parsed as
wfondrie marked this conversation as resolved.
Show resolved Hide resolved
SELFIES representations and split into tokens.

Parameters
----------
selfies_vocab : Iterable[str]
The SELFIES tokens to be considered.
start_token : str, optional
The start token to use.
stop_token : str, optional
The stop token to use.

Attributes
----------
index : SortedDicte{str, int}
wfondrie marked this conversation as resolved.
Show resolved Hide resolved
The mapping of residues and modifications to integer representations.
reverse_index : list[None | str]
The ordered residues and modifications where the list index is the
integer representation for a token.
start_token : str
The start token
stop_token : str
The stop token.
start_int : int
The integer representation of the start token
stop_int : int
The integer representation of the stop token.
padding_int : int
The integer used to represent padding.
"""

def __init__(
self,
selfies_vocab: Iterable[str] | None = None,
start_token: str | None = None,
stop_token: str | None = "$",
) -> None:
"""Initialize a MoleculeTokenizer."""
if selfies_vocab is None:
selfies_vocab = sf.get_semantic_robust_alphabet()

self.selfies_vocab = selfies_vocab
super().__init__(selfies_vocab, start_token, stop_token)

def split(self, sequence: str) -> list[str]:
"""Split a SMILES or SELFIES string into SELFIES tokens.

Parameters
----------
sequence : str
The SMILES or SELFIES string representing a molecule.

Returns
-------
List[str]
The SELFIES tokens representing the molecule.
"""
try:
return sf.split_selfies(sequence)
except AttributeError:
wfondrie marked this conversation as resolved.
Show resolved Hide resolved
return sf.split_selfies(Molecule(sequence).to_selfies())

Check warning on line 76 in depthcharge/tokenizers/molecules.py

View check run for this annotation

Codecov / codecov/patch

depthcharge/tokenizers/molecules.py#L75-L76

Added lines #L75 - L76 were not covered by tests

@classmethod
def from_smiles(
cls,
smiles: Iterable[str] | str,
start_token: str | None = None,
stop_token: str | None = "$",
) -> MoleculeTokenizer:
"""Learn the vocabulary from SMILES strings.

Parameters
----------
smiles : Iterable[str] | str
Create a vocabulary from all unique tokens in these SMILES strings.
start_token : str, optional
The start token to use.
stop_token : str, optional
The stop token to use.

Returns
-------
MoleculeTokenizer
The tokenizer restricted to the vocabulary present in the
input SMILES strings.
"""
vocab = sf.get_alphabet_from_selfies(
wfondrie marked this conversation as resolved.
Show resolved Hide resolved
Molecule(s).to_selfies() for s in utils.listify(smiles)
)

return cls(vocab, start_token, stop_token)

@classmethod
def from_selfies(
cls,
selfies: Iterable[str] | str,
start_token: str | None = None,
stop_token: str | None = "$",
) -> MoleculeTokenizer:
"""Learn the vocabulary from SELFIES strings.

Parameters
----------
selfies : Iterable[str] | str
Create a vocabulary from all unique tokens in these SELFIES
strings.
start_token : str, optional
The start token to use.
stop_token : str, optional
The stop token to use.

Returns
-------
MoleculeTokenizer
The tokenizer restricted to the vocabulary present in the
input SMILES strings.
"""
vocab = sf.get_alphabet_from_selfies(utils.listify(selfies))
return cls(vocab, start_token, stop_token)
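Putting the new tokenizer to work: the vocabulary may be the default semantically robust SELFIES alphabet or learned from data via the two classmethods. A usage sketch (the SMILES strings are arbitrary examples; token ids depend on the vocabulary):

from depthcharge.tokenizers import MoleculeTokenizer

# Default: sf.get_semantic_robust_alphabet() supplies the vocabulary.
tokenizer = MoleculeTokenizer()

# Or restrict the vocabulary to tokens seen in a corpus. Each SMILES
# string is converted to SELFIES before its tokens are collected.
smiles = ["CCO", "c1ccccc1"]  # ethanol and benzene
tokenizer = MoleculeTokenizer.from_smiles(smiles)

# tokenize() accepts SMILES or SELFIES, routed through split() above.
tokens = tokenizer.tokenize(smiles)  # padded tensor of token ids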