Add molecule tests

wfondrie · Dec 9, 2023 · 94c6413 · 94c6413
1 parent 953cc3c
commit 94c6413
Show file tree

Hide file tree

Showing 2 changed files with 79 additions and 3 deletions.
diff --git a/depthcharge/tokenizers/molecules.py b/depthcharge/tokenizers/molecules.py
@@ -5,6 +5,7 @@
 
 import selfies as sf
 
+from .. import utils
 from ..primitives import Molecule
 from .tokenizer import Tokenizer
 
@@ -14,6 +15,11 @@ class MoleculeTokenizer(Tokenizer):
 
     SMILES strings representing small molecules are parsed as
     SELFIES representations and split into tokens.
+
+    Parameters
+    ----------
+    selfies_vocab : Iterable[str], optional
+        The SELFIES tokens to consider. If None, a default vocabulary is used.
     """
 
     def __init__(self, selfies_vocab: Iterable[str] | None = None) -> None:
@@ -25,16 +31,59 @@ def __init__(self, selfies_vocab: Iterable[str] | None = None) -> None:
         super().__init__(selfies_vocab)
 
     def split(self, sequence: str) -> list[str]:
-        """Split a SMILES string into SELFIES tokens.
+        """Split a SMILES or SELFIES string into SELFIES tokens.
 
         Parameters
         ----------
         sequence : str
-            The SMILES string representing a molecule.
+            The SMILES or SELFIES string representing a molecule.
 
         Returns
         -------
         List[str]
             The SELFIES tokens representing the molecule.
         """
-        return sf.split_selfies(Molecule(sequence).to_selfies())
+        try:
+            return list(sf.split_selfies(sequence))  # consume generator here
+        except AttributeError:
+            return list(sf.split_selfies(Molecule(sequence).to_selfies()))
+
+    @classmethod
+    def from_smiles(cls, smiles: Iterable[str] | str) -> MoleculeTokenizer:
+        """Learn the vocabulary from SMILES strings.
+
+        Parameters
+        ----------
+        smiles : Iterable[str] | str
+            The SMILES strings whose unique SELFIES tokens form the vocabulary.
+
+        Returns
+        -------
+        MoleculeTokenizer
+            The tokenizer restricted to the vocabulary present in the
+            input SMILES strings.
+        """
+        vocab = sf.get_alphabet_from_selfies(
+            Molecule(s).to_selfies() for s in utils.listify(smiles)
+        )
+
+        return cls(vocab)
+
+    @classmethod
+    def from_selfies(cls, selfies: Iterable[str] | str) -> MoleculeTokenizer:
+        """Learn the vocabulary from SELFIES strings.
+
+        Parameters
+        ----------
+        selfies : Iterable[str] | str
+            Create a vocabulary from all unique tokens in these SELFIES
+            strings.
+
+        Returns
+        -------
+        MoleculeTokenizer
+            The tokenizer restricted to the vocabulary present in the
+            input SELFIES strings.
+        """
+        vocab = sf.get_alphabet_from_selfies(utils.listify(selfies))
+        return cls(vocab)
diff --git a/tests/unit_tests/test_tokenizers/test_molecules.py b/tests/unit_tests/test_tokenizers/test_molecules.py
@@ -0,0 +1,27 @@
+"""Test the molecule tokenizer."""
+import pytest
+
+from depthcharge.tokenizers import MoleculeTokenizer
+
+
+@pytest.mark.parametrize(
+    ["mode", "vocab", "len_vocab"],
+    [
+        ("basic", None, 69),
+        ("basic", ["x", "y"], 2),
+        ("selfies", ["[C][O][C]", "[F][C][F]", "[O][=O]"], 4),
+        ("selfies", "[C][O]", 2),
+        ("smiles", "CN1C=NC2=C1C(=O)N(C(=O)N2C)C", 8),
+        ("smiles", ["CN", "CC(=O)O"], 5),
+    ],
+)
+def test_init(mode, vocab, len_vocab):
+    """Test initialization."""
+    if mode == "smiles":
+        tokenizer = MoleculeTokenizer.from_smiles(vocab)
+    elif mode == "selfies":
+        tokenizer = MoleculeTokenizer.from_selfies(vocab)
+    else:
+        tokenizer = MoleculeTokenizer(vocab)
+
+    assert len(tokenizer.selfies_vocab) == len_vocab