Skip to content

Commit

Permalink
Add molecule tests
Browse files Browse the repository at this point in the history
  • Loading branch information
wfondrie committed Dec 9, 2023
1 parent 953cc3c commit 94c6413
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 3 deletions.
55 changes: 52 additions & 3 deletions depthcharge/tokenizers/molecules.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import selfies as sf

from .. import utils
from ..primitives import Molecule
from .tokenizer import Tokenizer

Expand All @@ -14,6 +15,11 @@ class MoleculeTokenizer(Tokenizer):
SMILES strings representing small molecules are parsed as
SELFIES representations and split into tokens.
Parameters
----------
selfies_vocab : Iterable[str]
The SELFIES tokens to be considered.
"""

def __init__(self, selfies_vocab: Iterable[str] | None = None) -> None:
Expand All @@ -25,16 +31,59 @@ def __init__(self, selfies_vocab: Iterable[str] | None = None) -> None:
super().__init__(selfies_vocab)

def split(self, sequence: str) -> list[str]:
    """Split a SMILES or SELFIES string into SELFIES tokens.

    The input is first tokenized as SELFIES. If that tokenization does
    not account for every character of the input, the input is treated
    as a SMILES string and converted to SELFIES before tokenizing.

    Parameters
    ----------
    sequence : str
        The SMILES or SELFIES string representing a molecule.

    Returns
    -------
    list[str]
        The SELFIES tokens representing the molecule.
    """
    # ``sf.split_selfies`` is a generator: it does no work, and therefore
    # cannot raise, until it is consumed. It must be materialized inside
    # the ``try`` for the SMILES fallback below to be reachable at all.
    try:
        tokens = list(sf.split_selfies(sequence))
    except (AttributeError, ValueError):
        # AttributeError: ``sequence`` is not string-like.
        # ValueError: selfies reports an unterminated '[' bracket, which
        # happens for SMILES that contain bracket atoms mid-string.
        tokens = None

    # A SELFIES string consists only of bracketed symbols and "."
    # separators, so its tokens join back into the input exactly. When
    # they do not (e.g. "CCO" produces no tokens), the input is SMILES.
    if tokens is not None and "".join(tokens) == sequence:
        return tokens

    return list(sf.split_selfies(Molecule(sequence).to_selfies()))

Check warning on line 49 in depthcharge/tokenizers/molecules.py

View check run for this annotation

Codecov / codecov/patch

depthcharge/tokenizers/molecules.py#L48-L49

Added lines #L48 - L49 were not covered by tests

@classmethod
def from_smiles(cls, smiles: Iterable[str] | str) -> MoleculeTokenizer:
    """Build a tokenizer whose vocabulary is learned from SMILES strings.

    Parameters
    ----------
    smiles : Iterable[str] | str
        The SMILES string(s) to learn from. The input is passed through
        ``utils.listify`` (presumably so a single string is handled as a
        one-element collection -- confirm against ``utils``).

    Returns
    -------
    MoleculeTokenizer
        A tokenizer whose vocabulary is the set of unique SELFIES
        tokens found in the input SMILES strings.
    """
    # Each SMILES string is converted to SELFIES on the fly; the alphabet
    # helper consumes the generator and collects the unique tokens.
    selfies_strings = (
        Molecule(smi).to_selfies() for smi in utils.listify(smiles)
    )
    return cls(sf.get_alphabet_from_selfies(selfies_strings))

@classmethod
def from_selfies(cls, selfies: Iterable[str] | str) -> MoleculeTokenizer:
    """Build a tokenizer whose vocabulary is learned from SELFIES strings.

    Parameters
    ----------
    selfies : Iterable[str] | str
        The SELFIES string(s) to learn from. The input is passed through
        ``utils.listify`` (presumably so a single string is handled as a
        one-element collection -- confirm against ``utils``).

    Returns
    -------
    MoleculeTokenizer
        A tokenizer whose vocabulary is the set of unique tokens found
        in the input SELFIES strings.
    """
    selfies_strings = utils.listify(selfies)
    return cls(sf.get_alphabet_from_selfies(selfies_strings))
27 changes: 27 additions & 0 deletions tests/unit_tests/test_tokenizers/test_molecules.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
"""Test the molecule tokenizer."""
import pytest

from depthcharge.tokenizers import MoleculeTokenizer


@pytest.mark.parametrize(
    ["mode", "vocab", "len_vocab"],
    [
        ("basic", None, 69),
        ("basic", ["x", "y"], 2),
        ("selfies", ["[C][O][C]", "[F][C][F]", "[O][=O]"], 4),
        ("selfies", "[C][O]", 2),
        ("smiles", "CN1C=NC2=C1C(=O)N(C(=O)N2C)C", 8),
        ("smiles", ["CN", "CC(=O)O"], 5),
    ],
)
def test_init(mode, vocab, len_vocab):
    """Check the vocabulary size produced by each construction route.

    ``mode`` selects the constructor: ``"smiles"`` and ``"selfies"`` use
    the matching alternate constructor, and any other value calls the
    class directly with ``vocab``.
    """
    # Map each mode to its factory; unknown modes use the plain constructor.
    factories = {
        "smiles": MoleculeTokenizer.from_smiles,
        "selfies": MoleculeTokenizer.from_selfies,
    }
    build = factories.get(mode, MoleculeTokenizer)
    tokenizer = build(vocab)

    assert len(tokenizer.selfies_vocab) == len_vocab

0 comments on commit 94c6413

Please sign in to comment.