From f4308debe7504cbdaa894c0b3c7bcbccb0de4c2b Mon Sep 17 00:00:00 2001
From: RaulPPealez
Date: Thu, 27 Jun 2024 13:15:18 +0200
Subject: [PATCH 1/8] Add MACEOFF Dataset

---
 torchmdnet/datasets/__init__.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/torchmdnet/datasets/__init__.py b/torchmdnet/datasets/__init__.py
index b57cd95a6..f395e6e6c 100644
--- a/torchmdnet/datasets/__init__.py
+++ b/torchmdnet/datasets/__init__.py
@@ -23,6 +23,7 @@
 from .qm9q import QM9q
 from .spice import SPICE
 from .genentech import GenentechTorsions
+from .maceoff import MACEOFF
 
 __all__ = [
     "Ace",
@@ -47,4 +48,5 @@
     "SPICE",
     "Tripeptides",
     "WaterBox",
+    "MACEOFF",
 ]

From 64e7ee65e231163a9105e21cec5aeedc21f7cd34 Mon Sep 17 00:00:00 2001
From: RaulPPealez
Date: Thu, 27 Jun 2024 16:38:12 +0200
Subject: [PATCH 2/8] Add MACEOFF dataset

---
 torchmdnet/datasets/maceoff.py | 101 +++++++++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)
 create mode 100644 torchmdnet/datasets/maceoff.py

diff --git a/torchmdnet/datasets/maceoff.py b/torchmdnet/datasets/maceoff.py
new file mode 100644
index 000000000..acb7d2a21
--- /dev/null
+++ b/torchmdnet/datasets/maceoff.py
@@ -0,0 +1,101 @@
+# Copyright Universitat Pompeu Fabra 2020-2023 https://www.compscience.org
+# Distributed under the MIT License.
+# (See accompanying file README.md file or copy at http://opensource.org/licenses/MIT)
+
+import hashlib
+import ase
+import h5py
+import numpy as np
+import os
+import torch as pt
+from torchmdnet.datasets.memdataset import MemmappedDataset
+from torch_geometric.data import Data, download_url
+from tqdm import tqdm
+import tarfile
+import tempfile
+import re
+import ase.io
+import logging
+
+
+class MACEOFF(MemmappedDataset):
+    """
+    MACEOFF dataset from MACE-OFF23: Transferable Machine Learning Force Fields for Organic Molecules, Kovacs et al. https://arxiv.org/abs/2312.15211
+    This dataset consists of around 100K conformations, with 95% of them coming from SPICE, augmented with conformations from QMugs, COMP6 and clusters of water carved out of MD simulations of liquid water.
+ """ + + VERSIONS = { + "1.0": { + "url": "https://api.repository.cam.ac.uk/server/api/core/bitstreams/b185b5ab-91cf-489a-9302-63bfac42824a/content", + "file": "train_large_neut_no_bad_clean.tar.gz", + }, + } + + @property + def raw_dir(self): + return os.path.join(super().raw_dir, "maceoff", self.version) + + @property + def raw_file_names(self): + return self.VERSIONS[self.version]["file"] + + @property + def raw_url(self): + return f"{self.VERSIONS[self.version]['url']}" + + def __init__( + self, + root=None, + transform=None, + pre_transform=None, + pre_filter=None, + version="1.0", + max_gradient=None, + ): + arg_hash = f"{version}{max_gradient}" + arg_hash = hashlib.md5(arg_hash.encode()).hexdigest() + self.name = f"{self.__class__.__name__}-{arg_hash}" + self.version = str(version) + assert self.version in self.VERSIONS + self.max_gradient = max_gradient + super().__init__( + root, + transform, + pre_transform, + pre_filter, + properties=("y", "neg_dy"), + ) + + def sample_iter(self, mol_ids=False): + assert len(self.raw_paths) == 1 + logging.info(f"Processing dataset {self.raw_file_names}") + with tempfile.TemporaryDirectory() as tmp_dir: + tar_path = os.path.join(self.raw_dir, self.raw_file_names) + xyz_path = os.path.join( + tmp_dir, re.sub(r"\.tar\.gz$", ".xyz", self.raw_file_names) + ) + logging.info(f"Extracting {tar_path} to {tmp_dir}") + with tarfile.open(tar_path, "r:gz") as tar: + tar.extractall(tmp_dir) + assert os.path.exists(xyz_path) + for mol in tqdm(ase.io.iread(xyz_path), desc="Processing conformations"): + energy = ( + mol.info["energy"] + if "energy" in mol.info + else mol.get_potential_energy() + ) + forces = ( + mol.arrays["forces"] if "forces" in mol.arrays else mol.get_forces() + ) + data = Data( + **dict( + z=pt.tensor(np.array(mol.numbers), dtype=pt.long), + pos=pt.tensor(mol.positions, dtype=pt.float32), + y=pt.tensor(energy, dtype=pt.float64).view(1, 1), + neg_dy=pt.tensor(forces, dtype=pt.float32), + ) + ) + yield data + + def download(self): + download_url(self.raw_url, self.raw_dir, filename=self.raw_file_names) From 2ebe79f1f03d85b8306f4a8cfc1fd12c42f84179 Mon Sep 17 00:00:00 2001 From: RaulPPealez Date: Fri, 28 Jun 2024 09:12:24 +0200 Subject: [PATCH 3/8] Update docstring --- torchmdnet/datasets/maceoff.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/torchmdnet/datasets/maceoff.py b/torchmdnet/datasets/maceoff.py index acb7d2a21..6485dae05 100644 --- a/torchmdnet/datasets/maceoff.py +++ b/torchmdnet/datasets/maceoff.py @@ -20,8 +20,14 @@ class MACEOFF(MemmappedDataset): """ - MACEOFF dataset from MACE-OFF23: Transferable Machine Learning Force Fields for Organic Molecules, Kovacs et.al. https://arxiv.org/abs/2312.15211 - This dataset consists of arounf 100K conformations with 95% of them coming from SPICE and augmented with conformations from QMugs, COMP6 and clusters of water carved out of MD simulations of liquid water. + MACEOFF dataset from MACE-OFF23: Transferable Machine Learning Force Fields for Organic Molecules, Kovacs et.al. https://arxiv.org/abs/2312.15211 + This dataset consists of arounf 100K conformations with 95% of them coming from SPICE and augmented with conformations from QMugs, COMP6 and clusters of water carved out of MD simulations of liquid water. + + From the repository: + The core of the training set is the SPICE dataset. 95% of the data were used for training and validation, and 5% for testing. 
+    The MACE-OFF23 model is trained to reproduce the energies and forces computed at the ωB97M-D3(BJ)/def2-TZVPPD level of quantum mechanics, as implemented in the PSI4 software. We have used a subset of SPICE that contains the ten chemical elements H, C, N, O, F, P, S, Cl, Br, and I, and has a neutral formal charge. We have also removed the ion pairs subset. Overall, we used about 85% of the full SPICE dataset.
+
+    Contains energy and force data in units of eV and eV/Angstrom
+
     """
 
     VERSIONS = {
         "1.0": {
             "url": "https://api.repository.cam.ac.uk/server/api/core/bitstreams/b185b5ab-91cf-489a-9302-63bfac42824a/content",

From 525b16d5ec6c5b9493466b65832ba599fcbe8372 Mon Sep 17 00:00:00 2001
From: RaulPPealez
Date: Fri, 28 Jun 2024 09:25:08 +0200
Subject: [PATCH 4/8] Add check for max_gradient

---
 torchmdnet/datasets/maceoff.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/torchmdnet/datasets/maceoff.py b/torchmdnet/datasets/maceoff.py
index 6485dae05..a4f795e15 100644
--- a/torchmdnet/datasets/maceoff.py
+++ b/torchmdnet/datasets/maceoff.py
@@ -101,6 +101,10 @@ def sample_iter(self, mol_ids=False):
                         neg_dy=pt.tensor(forces, dtype=pt.float32),
                     )
                 )
+                # Skip samples with large forces
+                if self.max_gradient:
+                    if data.neg_dy.norm(dim=1).max() > float(self.max_gradient):
+                        continue
                 yield data
 
     def download(self):

From 3dc2b90307cbc7e22966edb90d760b581d454865 Mon Sep 17 00:00:00 2001
From: RaulPPealez
Date: Fri, 28 Jun 2024 09:26:02 +0200
Subject: [PATCH 5/8] Add MACEOFF example

---
 examples/TensorNet-MACEOFF.yaml | 59 +++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)
 create mode 100644 examples/TensorNet-MACEOFF.yaml

diff --git a/examples/TensorNet-MACEOFF.yaml b/examples/TensorNet-MACEOFF.yaml
new file mode 100644
index 000000000..426434f4c
--- /dev/null
+++ b/examples/TensorNet-MACEOFF.yaml
@@ -0,0 +1,59 @@
+activation: silu
+aggr: add
+atom_filter: -1
+batch_size: 16
+coord_files: null
+cutoff_lower: 0.0
+cutoff_upper: 10.0
+dataset: MACEOFF
+dataset_arg:
+  max_gradient: 50.94
+dataset_root: ~/data
+derivative: true
+early_stopping_patience: 50
+ema_alpha_neg_dy: 1.0
+ema_alpha_y: 1.0
+embed_files: null
+embedding_dimension: 128
+energy_files: null
+equivariance_invariance_group: O(3)
+y_weight: 1.0
+force_files: null
+neg_dy_weight: 10.0
+gradient_clipping: 100.0
+inference_batch_size: 16
+load_model: null
+log_dir: logs/
+lr: 0.0001
+lr_factor: 0.5
+lr_min: 1.0e-08
+lr_patience: 5
+lr_warmup_steps: 500
+max_num_neighbors: 128
+max_z: 128
+model: tensornet
+ngpus: -1
+num_epochs: 500
+num_layers: 2
+num_nodes: 1
+num_rbf: 64
+num_workers: 4
+output_model: Scalar
+precision: 32
+prior_model: null
+rbf_type: expnorm
+redirect: false
+reduce_op: add
+save_interval: 10
+splits: null
+seed: 1
+standardize: false
+test_interval: 10
+test_size: null
+train_size: 0.8
+trainable_rbf: false
+val_size: 0.1
+weight_decay: 0.0
+box_vecs: null
+charge: false
+spin: false

From 603e79506f46e71e4048eee87a31215b224e049 Mon Sep 17 00:00:00 2001
From: RaulPPealez
Date: Fri, 28 Jun 2024 12:09:41 +0200
Subject: [PATCH 6/8] Add error checking

---
 torchmdnet/datasets/maceoff.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/torchmdnet/datasets/maceoff.py b/torchmdnet/datasets/maceoff.py
index a4f795e15..0326a01d3 100644
--- a/torchmdnet/datasets/maceoff.py
+++ b/torchmdnet/datasets/maceoff.py
@@ -101,6 +101,9 @@ def sample_iter(self, mol_ids=False):
                         neg_dy=pt.tensor(forces, dtype=pt.float32),
                     )
                 )
+                assert data.y.shape == (1, 1)
+                assert data.z.shape[0] == data.pos.shape[0]
+                assert data.neg_dy.shape[0] == data.pos.shape[0]
                 # Skip samples with large forces
                 if self.max_gradient:
                     if data.neg_dy.norm(dim=1).max() > float(self.max_gradient):
                         continue

From 33cd98883b78d76dfbf5887cf98047fc77458c01 Mon Sep 17 00:00:00 2001
From: RaulPPealez
Date: Fri, 28 Jun 2024 12:11:48 +0200
Subject: [PATCH 7/8] Add pre_transform and pre_filter

---
 torchmdnet/datasets/maceoff.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/torchmdnet/datasets/maceoff.py b/torchmdnet/datasets/maceoff.py
index 0326a01d3..5725e3dff 100644
--- a/torchmdnet/datasets/maceoff.py
+++ b/torchmdnet/datasets/maceoff.py
@@ -108,6 +108,10 @@ def sample_iter(self, mol_ids=False):
                 if self.max_gradient:
                     if data.neg_dy.norm(dim=1).max() > float(self.max_gradient):
                         continue
+                if self.pre_filter is not None and not self.pre_filter(data):
+                    continue
+                if self.pre_transform is not None:
+                    data = self.pre_transform(data)
                 yield data
 
     def download(self):

From 191f45442855d1c374bb45cf39ee23e1afdc23c0 Mon Sep 17 00:00:00 2001
From: RaulPPealez
Date: Tue, 9 Jul 2024 10:02:28 +0200
Subject: [PATCH 8/8] Use @stefdoerr xyz parser

---
 torchmdnet/datasets/maceoff.py | 104 ++++++++++++++++++++-------------
 1 file changed, 62 insertions(+), 42 deletions(-)

diff --git a/torchmdnet/datasets/maceoff.py b/torchmdnet/datasets/maceoff.py
index 5725e3dff..cb0d7f758 100644
--- a/torchmdnet/datasets/maceoff.py
+++ b/torchmdnet/datasets/maceoff.py
@@ -3,19 +3,54 @@
 # Copyright Universitat Pompeu Fabra 2020-2023 https://www.compscience.org
 # Distributed under the MIT License.
 # (See accompanying file README.md file or copy at http://opensource.org/licenses/MIT)
 
 import hashlib
-import ase
-import h5py
+from ase.data import atomic_numbers
 import numpy as np
 import os
 import torch as pt
 from torchmdnet.datasets.memdataset import MemmappedDataset
 from torch_geometric.data import Data, download_url
-from tqdm import tqdm
 import tarfile
-import tempfile
-import re
-import ase.io
 import logging
+import re
+from tqdm import tqdm
+
+
+def parse_maceoff_tar(tar_file):
+    energy_re = re.compile(r"energy=(\S+)")
+    with tarfile.open(tar_file, "r:gz") as tar:
+        for member in tar.getmembers():
+            f = tar.extractfile(member)
+            if f is None:
+                continue
+            n_atoms = None
+            counter = 0
+            positions = []
+            numbers = []
+            forces = []
+            energy = None
+            for line in f:
+                line = line.decode("utf-8").strip()
+                if n_atoms is None:
+                    n_atoms = int(line)
+                    positions = []
+                    numbers = []
+                    forces = []
+                    energy = None
+                    counter = 1
+                    continue
+                if counter == 1:
+                    props = line
+                    energy = float(energy_re.search(props).group(1))
+                    counter = 2
+                    continue
+                el, x, y, z, fx, fy, fz, _, _, _ = line.split()
+                numbers.append(atomic_numbers[el])
+                positions.append([float(x), float(y), float(z)])
+                forces.append([float(fx), float(fy), float(fz)])
+                counter += 1
+                if counter == n_atoms + 2:
+                    n_atoms = None
+            yield energy, numbers, positions, forces
 
 
 class MACEOFF(MemmappedDataset):
@@ -75,44 +110,29 @@ def __init__(
     def sample_iter(self, mol_ids=False):
         assert len(self.raw_paths) == 1
         logging.info(f"Processing dataset {self.raw_file_names}")
-        with tempfile.TemporaryDirectory() as tmp_dir:
-            tar_path = os.path.join(self.raw_dir, self.raw_file_names)
-            xyz_path = os.path.join(
-                tmp_dir, re.sub(r"\.tar\.gz$", ".xyz", self.raw_file_names)
-            )
-            logging.info(f"Extracting {tar_path} to {tmp_dir}")
-            with tarfile.open(tar_path, "r:gz") as tar:
-                tar.extractall(tmp_dir)
-            assert os.path.exists(xyz_path)
-            for mol in tqdm(ase.io.iread(xyz_path), desc="Processing conformations"):
-                energy = (
-                    mol.info["energy"]
-                    if "energy" in mol.info
-                    else mol.get_potential_energy()
+        for energy, numbers, positions, forces in tqdm(
+            parse_maceoff_tar(self.raw_paths[0]), desc="Processing conformations"
+        ):
+            data = Data(
+                **dict(
+                    z=pt.tensor(np.array(numbers), dtype=pt.long),
+                    pos=pt.tensor(positions, dtype=pt.float32),
+                    y=pt.tensor(energy, dtype=pt.float64).view(1, 1),
+                    neg_dy=pt.tensor(forces, dtype=pt.float32),
                 )
-                forces = (
-                    mol.arrays["forces"] if "forces" in mol.arrays else mol.get_forces()
-                )
-                data = Data(
-                    **dict(
-                        z=pt.tensor(np.array(mol.numbers), dtype=pt.long),
-                        pos=pt.tensor(mol.positions, dtype=pt.float32),
-                        y=pt.tensor(energy, dtype=pt.float64).view(1, 1),
-                        neg_dy=pt.tensor(forces, dtype=pt.float32),
-                    )
-                )
-                assert data.y.shape == (1, 1)
-                assert data.z.shape[0] == data.pos.shape[0]
-                assert data.neg_dy.shape[0] == data.pos.shape[0]
-                # Skip samples with large forces
-                if self.max_gradient:
-                    if data.neg_dy.norm(dim=1).max() > float(self.max_gradient):
-                        continue
-                if self.pre_filter is not None and not self.pre_filter(data):
+            )
+            assert data.y.shape == (1, 1)
+            assert data.z.shape[0] == data.pos.shape[0]
+            assert data.neg_dy.shape[0] == data.pos.shape[0]
+            # Skip samples with large forces
+            if self.max_gradient:
+                if data.neg_dy.norm(dim=1).max() > float(self.max_gradient):
                     continue
-                if self.pre_transform is not None:
-                    data = self.pre_transform(data)
-                yield data
+            if self.pre_filter is not None and not self.pre_filter(data):
+                continue
+            if self.pre_transform is not None:
+                data = self.pre_transform(data)
+            yield data
 
     def download(self):
         download_url(self.raw_url, self.raw_dir, filename=self.raw_file_names)
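
A minimal usage sketch of the dataset class added by this series, based only on the constructor arguments and Data fields visible in the patches above; the storage path is an arbitrary example, and the training command assumes the standard torchmd-train entry point of TorchMD-NET:

    # Sketch: load the MACEOFF dataset added by this patch series and inspect one sample.
    # The first instantiation downloads the raw archive and builds the memory-mapped dataset.
    from torchmdnet.datasets import MACEOFF

    dataset = MACEOFF(root="data/maceoff", version="1.0", max_gradient=50.94)

    sample = dataset[0]
    print(sample.z.shape)       # atomic numbers, shape (n_atoms,)
    print(sample.pos.shape)     # positions, shape (n_atoms, 3)
    print(sample.y)             # energy in eV, shape (1, 1)
    print(sample.neg_dy.shape)  # forces in eV/Angstrom, shape (n_atoms, 3)

Training with the example configuration from PATCH 5 would then follow the usual TorchMD-NET pattern, e.g. torchmd-train --conf examples/TensorNet-MACEOFF.yaml, where max_gradient is passed to the dataset through dataset_arg as shown in the YAML.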