Update API and add support for small molecules #43

Merged 18 commits on Mar 7, 2024
Changes from 12 commits
2 changes: 1 addition & 1 deletion depthcharge/data/__init__.py
@@ -1,12 +1,12 @@
"""The Pytorch Datasets."""
from . import preprocessing
from .analyte_datasets import AnalyteDataset
from .arrow import (
spectra_to_df,
spectra_to_parquet,
spectra_to_stream,
)
from .fields import CustomField
from .peptide_datasets import PeptideDataset
from .spectrum_datasets import (
AnnotatedSpectrumDataset,
SpectrumDataset,
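For downstream code, the visible effect of this change is the renamed dataset export. A minimal sketch of the updated import (assuming depthcharge is installed at this commit):

# The dataset is now exported under its new name; the old
# `from depthcharge.data import PeptideDataset` no longer resolves here.
from depthcharge.data import AnalyteDataset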
depthcharge/data/analyte_datasets.py
@@ -4,10 +4,10 @@
 import torch
 from torch.utils.data import DataLoader, TensorDataset
 
-from ..tokenizers import PeptideTokenizer
+from ..tokenizers import MoleculeTokenizer, PeptideTokenizer
 
 
-class PeptideDataset(TensorDataset):
+class AnalyteDataset(TensorDataset):
     """A dataset for peptide sequences.
 
     Parameters
@@ -18,33 +18,25 @@ class PeptideDataset(TensorDataset):
     sequences : Iterable[str]
         The peptide sequences in a format compatible with
         your tokenizer. ProForma is preferred.
-    charges : torch.Tensor,
-        The charge state for each peptide.
     *args : torch.Tensor, optional
         Additional values to include during data loading.
     """
 
     def __init__(
         self,
-        tokenizer: PeptideTokenizer,
+        tokenizer: PeptideTokenizer | MoleculeTokenizer,
         sequences: Iterable[str],
-        charges: torch.Tensor,
         *args: torch.Tensor,
     ) -> None:
         """Initialize a PeptideDataset."""
         tokens = tokenizer.tokenize(sequences)
-        super().__init__(tokens, charges, *args)
+        super().__init__(tokens, *args)
 
     @property
     def tokens(self) -> torch.Tensor:
         """The peptide sequence tokens."""
         return self.tensors[0]
 
-    @property
-    def charges(self) -> torch.Tensor:
-        """The peptide charges."""
-        return self.tensors[1]
-
     def loader(self, *args: tuple, **kwargs: dict) -> DataLoader:
         """A PyTorch DataLoader for peptides.
 
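The key API change in this file: AnalyteDataset no longer requires a charges tensor; charge states, or any other per-analyte values, now ride along through *args. A usage sketch as of the commits shown here (the example sequences and charges are illustrative, not from this PR):

import torch

from depthcharge.data import AnalyteDataset
from depthcharge.tokenizers import PeptideTokenizer

# Tokenize ProForma peptide sequences and bundle extra tensors alongside.
tokenizer = PeptideTokenizer()
sequences = ["LESLIEK", "EDITHR"]
charges = torch.tensor([2, 3])  # optional now, passed through *args

dataset = AnalyteDataset(tokenizer, sequences, charges)
tokens, charge = dataset[0]  # TensorDataset-style indexing
loader = dataset.loader(batch_size=2, shuffle=True)  # wraps DataLoader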
40 changes: 40 additions & 0 deletions depthcharge/mixins.py
@@ -0,0 +1,40 @@
"""Helpful Mixins."""
import torch


class ModelMixin:
"""Add helpful properties for depthcharge models."""

@property
def device(self) -> torch.device:
"""The current device for first parameter of the model."""
return next(self.parameters()).device


class TransformerMixin:
"""Properties shard by Transformer models."""
wfondrie marked this conversation as resolved.
Show resolved Hide resolved

@property
def d_model(self) -> int:
"""The latent dimensionality of the model."""
return self._d_model

@property
def nhead(self) -> int:
"""The number of attention heads."""
return self._nhead

@property
def dim_feedforward(self) -> int:
"""The dimensionality of the Transformer feedforward layers."""
return self._dim_feedforward

@property
def n_layers(self) -> int:
"""The number of Transformer layers."""
return self._n_layers

@property
def dropout(self) -> float:
"""The dropout for the transformer layers."""
return self._dropout
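Both mixins are pure attribute plumbing: ModelMixin derives the device from the first registered parameter, and TransformerMixin assumes the inheriting class sets the private attributes (self._d_model and friends) in its __init__. A minimal sketch of a class adopting them (SimpleEncoder is a hypothetical example, not part of this PR):

import torch

from depthcharge.mixins import ModelMixin, TransformerMixin


class SimpleEncoder(torch.nn.Module, ModelMixin, TransformerMixin):
    """A toy module exposing the mixin properties."""

    def __init__(self, d_model: int = 64, nhead: int = 4) -> None:
        super().__init__()
        # TransformerMixin's properties read these private attributes.
        self._d_model = d_model
        self._nhead = nhead
        self._dim_feedforward = 4 * d_model
        self._n_layers = 2
        self._dropout = 0.1
        self.linear = torch.nn.Linear(d_model, d_model)


model = SimpleEncoder()
print(model.d_model, model.nhead)  # 64 4
print(model.device)  # cpu, via next(model.parameters()).device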
3 changes: 2 additions & 1 deletion depthcharge/tokenizers/__init__.py
@@ -1,3 +1,4 @@
"""Deptcharge tokenizers."""
"""Depthcharge tokenizers."""
from .molecules import MoleculeTokenizer
from .peptides import PeptideTokenizer
from .tokenizer import Tokenizer
134 changes: 134 additions & 0 deletions depthcharge/tokenizers/molecules.py
@@ -0,0 +1,134 @@
"""Tokenizers for small molecules."""
from __future__ import annotations

from collections.abc import Iterable

import selfies as sf

from .. import utils
from ..primitives import Molecule
from .tokenizer import Tokenizer


class MoleculeTokenizer(Tokenizer):
"""A tokenizer for small molecules.

SMILES strings representing small molecules are parsed as
wfondrie marked this conversation as resolved.
Show resolved Hide resolved
SELFIES representations and split into tokens.

Parameters
----------
selfies_vocab : Iterable[str]
The SELFIES tokens to be considered.
start_token : str, optional
The start token to use.
stop_token : str, optional
The stop token to use.

Attributes
----------
index : SortedDicte{str, int}
wfondrie marked this conversation as resolved.
Show resolved Hide resolved
The mapping of residues and modifications to integer representations.
reverse_index : list[None | str]
The ordered residues and modifications where the list index is the
integer representation for a token.
start_token : str
The start token
stop_token : str
The stop token.
start_int : int
The integer representation of the start token
stop_int : int
The integer representation of the stop token.
padding_int : int
The integer used to represent padding.
"""

def __init__(
self,
selfies_vocab: Iterable[str] | None = None,
start_token: str | None = None,
stop_token: str | None = "$",
) -> None:
"""Initialize a MoleculeTokenizer."""
if selfies_vocab is None:
selfies_vocab = sf.get_semantic_robust_alphabet()

self.selfies_vocab = selfies_vocab
super().__init__(selfies_vocab, start_token, stop_token)

def split(self, sequence: str) -> list[str]:
"""Split a SMILES or SELFIES string into SELFIES tokens.

Parameters
----------
sequence : str
The SMILES or SELFIES string representing a molecule.

Returns
-------
List[str]
The SELFIES tokens representing the molecule.
"""
try:
return sf.split_selfies(sequence)
except AttributeError:
wfondrie marked this conversation as resolved.
Show resolved Hide resolved
return sf.split_selfies(Molecule(sequence).to_selfies())

Check warning on line 76 in depthcharge/tokenizers/molecules.py

View check run for this annotation

Codecov / codecov/patch

depthcharge/tokenizers/molecules.py#L75-L76

Added lines #L75 - L76 were not covered by tests

@classmethod
def from_smiles(
cls,
smiles: Iterable[str] | str,
start_token: str | None = None,
stop_token: str | None = "$",
) -> MoleculeTokenizer:
"""Learn the vocabulary from SMILES strings.

Parameters
----------
smiles : Iterable[str] | str
Create a vocabulary from all unique tokens in these SMILES strings.
start_token : str, optional
The start token to use.
stop_token : str, optional
The stop token to use.

Returns
-------
MoleculeTokenizer
The tokenizer restricted to the vocabulary present in the
input SMILES strings.
"""
vocab = sf.get_alphabet_from_selfies(
wfondrie marked this conversation as resolved.
Show resolved Hide resolved
Molecule(s).to_selfies() for s in utils.listify(smiles)
)

return cls(vocab, start_token, stop_token)

@classmethod
def from_selfies(
cls,
selfies: Iterable[str] | str,
start_token: str | None = None,
stop_token: str | None = "$",
) -> MoleculeTokenizer:
"""Learn the vocabulary from SELFIES strings.

Parameters
----------
selfies : Iterable[str] | str
Create a vocabulary from all unique tokens in these SELFIES
strings.
start_token : str, optional
The start token to use.
stop_token : str, optional
The stop token to use.

Returns
-------
MoleculeTokenizer
The tokenizer restricted to the vocabulary present in the
input SMILES strings.
"""
vocab = sf.get_alphabet_from_selfies(utils.listify(selfies))
return cls(vocab, start_token, stop_token)
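Putting the new tokenizer to work: the vocabulary may be the default semantically robust SELFIES alphabet or learned from data via the two classmethods. A usage sketch (the SMILES strings are arbitrary examples; token ids depend on the vocabulary):

from depthcharge.tokenizers import MoleculeTokenizer

# Default: sf.get_semantic_robust_alphabet() supplies the vocabulary.
tokenizer = MoleculeTokenizer()

# Or restrict the vocabulary to tokens seen in a corpus. Each SMILES
# string is converted to SELFIES before its tokens are collected.
smiles = ["CCO", "c1ccccc1"]  # ethanol and benzene
tokenizer = MoleculeTokenizer.from_smiles(smiles)

# tokenize() accepts SMILES or SELFIES, routed through split() above.
tokens = tokenizer.tokenize(smiles)  # padded tensor of token ids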