
Enable NEP interface #288

Draft

naik-aakash wants to merge 36 commits into autoatml:main from nep_interface_prep

Commits (36)
b79719d
include GPUMD installation
naik-aakash Dec 8, 2024
12a8ad6
add calorine to dependencies
naik-aakash Dec 8, 2024
1b785b5
add missing action for micromamba install
naik-aakash Dec 8, 2024
2af91eb
create a /.local/bin directory if not exists
naik-aakash Dec 8, 2024
ab67887
try adding no low speed limit (avoid timeouts)
naik-aakash Dec 8, 2024
cdf269c
test new image
naik-aakash Dec 8, 2024
ec47737
try adding system prune
naik-aakash Dec 8, 2024
a2048b7
combine mamba, uv commands
naik-aakash Dec 8, 2024
1b15f24
reorder commands
naik-aakash Dec 8, 2024
c71a573
fix LD_LIBRARY_PATH path
naik-aakash Dec 8, 2024
8bc0fdb
revert to source repo url
naik-aakash Dec 9, 2024
c9fc5c4
Merge branch 'autoatml:main' into nep_interface_prep
naik-aakash Dec 13, 2024
5f94302
revert cuda toolkit and gpumd install (nep cannot be executed in CI d…
naik-aakash Dec 14, 2024
eaf9225
make MLIPFitMaker and training data generation bit more flexible (rem…
naik-aakash Dec 14, 2024
bd4681c
fix accidental typo
naik-aakash Dec 14, 2024
33fbb16
add energy_label arg to preprocess_data function
naik-aakash Dec 14, 2024
0bfa35b
add NEP hypers
naik-aakash Dec 15, 2024
28189c9
add initial draft for nep_fitting and running nep methods
naik-aakash Dec 15, 2024
2724e0c
incorporate nep fitting
naik-aakash Dec 15, 2024
7abc43d
add specie list and type keyword
naik-aakash Dec 15, 2024
1813471
add missing species arg for nep_fitting
naik-aakash Dec 15, 2024
0e8d5ed
pass args to makers
naik-aakash Dec 15, 2024
1c5ec5a
add placeholder test and test_data
naik-aakash Dec 15, 2024
898a927
fix comment
naik-aakash Dec 15, 2024
0484b5f
use write_structures from calorine for format compatibility
naik-aakash Dec 15, 2024
e9365ee
update tests
naik-aakash Dec 15, 2024
a42de60
delete dummy test files
naik-aakash Dec 15, 2024
56c5f31
add missing mean method
naik-aakash Dec 16, 2024
8a7d27d
fix error metrics (last epoch loss of model)
naik-aakash Dec 16, 2024
566bd0c
adapt NEP defaults to match recent publication
naik-aakash Dec 17, 2024
948c502
Merge branch 'autoatml:main' into nep_interface_prep
naik-aakash Dec 18, 2024
aace131
Merge branch 'main' into nep_interface_prep
naik-aakash Dec 19, 2024
5ee62b4
Merge branch 'autoatml:main' into nep_interface_prep
naik-aakash Dec 19, 2024
ca450d6
Merge branch 'autoatml:main' into nep_interface_prep
naik-aakash Jan 4, 2025
b757929
Merge branch 'autoatml:main' into nep_interface_prep
naik-aakash Jan 6, 2025
0fb2e5d
Merge branch 'autoatml:main' into nep_interface_prep
naik-aakash Jan 7, 2025
1 change: 1 addition & 0 deletions .devcontainer/devcontainer.json
@@ -1,6 +1,7 @@
{
"name": "DeveloperEnv",
"image": "ghcr.io/autoatml/autoplex/autoplex-python-3.10:0.1.0",
"initializeCommand": "docker system prune --all --force",
"hostRequirements": {
"cpus": 4,
"memory": "16gb",
20 changes: 9 additions & 11 deletions Dockerfile
@@ -7,6 +7,7 @@ FROM mambaorg/micromamba:1.5.10
# Set environment variables for micromamba
ENV MAMBA_DOCKERFILE_ACTIVATE=1
ENV MAMBA_ROOT_PREFIX=/opt/conda
ENV MAMBA_NO_LOW_SPEED_LIMIT=1

# Switch to root to install all dependencies (using non-root user causes permission issues)
USER root
@@ -31,20 +32,14 @@ RUN apt-get update && apt-get install -y \
git \
&& rm -rf /var/lib/apt/lists/*

# Install Python
# Install Python, cuda toolkit and clean up tarballs
RUN micromamba install -y -n base -c conda-forge \
    python=${PYTHON_VERSION} && \
micromamba clean --all --yes

# Install testing dependencies
RUN python -m pip install --no-cache-dir --upgrade pip \
&& pip install --no-cache-dir uv \
&& uv pip install pre-commit pytest pytest-mock pytest-split pytest-cov types-setuptools

# Install Julia
RUN curl -fsSL https://julialang-s3.julialang.org/bin/linux/x64/1.9/julia-1.9.2-linux-x86_64.tar.gz | tar -xz -C /opt \
&& ln -s /opt/julia-1.9.2/bin/julia /usr/local/bin/julia


# Set up Julia environment (ACEpotentials.jl interface)
RUN julia -e 'using Pkg; Pkg.Registry.add("General"); Pkg.Registry.add(Pkg.Registry.RegistrySpec(url="https://github.com/ACEsuit/ACEregistry")); Pkg.add(Pkg.PackageSpec(;name="ACEpotentials", version="0.6.7")); Pkg.add("DataFrames"); Pkg.add("CSV")'

@@ -74,15 +69,18 @@ RUN curl -fsSL https://download.lammps.org/tars/lammps-29Aug2024_update1.tar.gz
&& make install-python \
&& cmake --build . --target clean

# Add LAMMPS to PATH and Shared LAMMPS library to LD_LIBRARY_PATH
# Add LAMMPS to PATH and Update LD_LIBRARY_PATH
ENV PATH="${PATH}:/root/.local/bin"
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/root/.local/lib"
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/root/.local/lib:/opt/conda/lib"

# Set the working directory
WORKDIR /workspace

# Copy the current directory contents into the container at /workspace
COPY . /workspace

# Install autoplex and clear cache
RUN uv pip install --prerelease=allow .[strict,docs] && uv cache clean && rm -rf /tmp/*
# Install autoplex, testing dependencies and clear cache
RUN python -m pip install --no-cache-dir --upgrade pip \
&& pip install --no-cache-dir uv \
&& uv pip install pre-commit pytest pytest-mock pytest-split pytest-cov types-setuptools \
&& uv pip install --prerelease=allow .[strict,docs] && uv cache clean && rm -rf /tmp/*
2 changes: 2 additions & 0 deletions pyproject.toml
@@ -26,6 +26,7 @@ dependencies = [
"pymatgen>=2024.9.17.1",
"atomate2[strict]>=0.0.18",
"ase==3.23.0",
"calorine>=3.0",
"matgl==1.1.3",
"mace-torch==0.3.9",
"numpy==1.26.4",
@@ -65,6 +66,7 @@ docs = [
strict = [
"pymatgen==2024.10.3", #?
"atomate2[strict]==0.0.18",
"calorine==3.0",
"matgl==1.1.3",
"quippy-ase==0.9.14; python_version < '3.12'",
"torch==2.2.1",
7 changes: 6 additions & 1 deletion src/autoplex/data/common/jobs.py
@@ -706,6 +706,7 @@ def preprocess_data(
distillation: bool = False,
force_max: float = 40,
force_label: str = "REF_forces",
energy_label: str = "REF_energy",
pre_database_dir: str | None = None,
reg_minmax: list[tuple] | None = None,
isolated_atom_energies: dict | None = None,
@@ -737,6 +738,8 @@
Maximum force value to exclude structures.
force_label: str
The label of force values to use for distillation.
energy_label: str
The label of energy values to use for distillation.
pre_database_dir : str
Directory where the previous database was saved.
reg_minmax: list[tuple]
@@ -759,7 +762,9 @@
if test_ratio == 0 or test_ratio is None:
train_structures, test_structures = atoms, atoms
else:
train_structures, test_structures = stratified_dataset_split(atoms, test_ratio)
train_structures, test_structures = stratified_dataset_split(
atoms, test_ratio, energy_label
)

if pre_database_dir and os.path.exists(pre_database_dir):
files_to_copy = ["train.extxyz", "test.extxyz"]
16 changes: 14 additions & 2 deletions src/autoplex/data/common/utils.py
@@ -1490,7 +1492,9 @@ def data_distillation(
return atoms_distilled


def stratified_dataset_split(atoms: Atoms, split_ratio: float) -> tuple[
def stratified_dataset_split(
atoms: Atoms, split_ratio: float, energy_label: str
) -> tuple[
list[Atom | Atoms]
| list[Atom | Atoms | list[Atom | Atoms] | list[Atom | Atoms | list]],
list[Atom | Atoms | list[Atom | Atoms] | list[Atom | Atoms | list]],
@@ -1504,6 +1506,8 @@ def stratified_dataset_split(atoms: Atoms, split_ratio: float) -> tuple[
ASE Atoms object
split_ratio: float
Parameter to divide the training set and the test set.
energy_label: str
The label for the energy property in the atoms.

Returns
-------
@@ -1525,7 +1529,15 @@ def stratified_dataset_split(atoms: Atoms, split_ratio: float) -> tuple[
if len(atoms) != len(atom_bulk):
atoms = atom_bulk

average_energies = np.array([atom.info["REF_energy"] / len(atom) for atom in atoms])
# Fall back to the calculator energy when the energy label is missing from atoms.info
try:
average_energies = np.array(
[atom.info[energy_label] / len(atom) for atom in atoms]
)
except KeyError:
average_energies = np.array(
[atom.get_potential_energy() / len(atom) for atom in atoms]
)
# sort by energy
sorted_indices = np.argsort(average_energies)
atoms = [atoms[i] for i in sorted_indices]
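The new `energy_label` argument and its calculator fallback feed the stratified split: structures are ranked by per-atom energy and the test set is sampled evenly across that ranking. A minimal self-contained sketch of the idea (the `FakeAtoms` stand-in, the `"energy"` fallback key, and the stride-based sampling are illustrative assumptions, not autoplex code — the PR falls back to `atoms.get_potential_energy()` and uses its own sampling):

```python
import numpy as np

class FakeAtoms:
    """Minimal stand-in for ase.Atoms: an info dict plus a length."""

    def __init__(self, n_atoms, info):
        self.n_atoms = n_atoms
        self.info = info

    def __len__(self):
        return self.n_atoms

def stratified_split(structures, split_ratio, energy_label="REF_energy"):
    """Split structures into train/test sets, stratified by per-atom energy."""
    try:
        energies = np.array([a.info[energy_label] / len(a) for a in structures])
    except KeyError:
        # Fallback when the configured label is absent from info.
        energies = np.array([a.info["energy"] / len(a) for a in structures])
    order = np.argsort(energies)  # indices sorted by per-atom energy
    n_test = max(1, int(len(structures) * split_ratio))
    stride = len(structures) // n_test
    test_idx = set(order[::stride][:n_test])  # every stride-th structure in energy order
    train = [structures[i] for i in order if i not in test_idx]
    test = [structures[i] for i in order if i in test_idx]
    return train, test
```

Sampling every stride-th structure in energy order keeps both low- and high-energy configurations in the held-out set, which a random split over a small database does not guarantee.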
57 changes: 43 additions & 14 deletions src/autoplex/fitting/common/flows.py
@@ -5,6 +5,7 @@
import shutil
from dataclasses import dataclass
from pathlib import Path
from typing import Literal

import ase.io
from jobflow import Flow, Maker, job
@@ -91,7 +92,7 @@ class MLIPFitMaker(Maker):
"""

name: str = "MLpotentialFit"
mlip_type: str = "GAP"
mlip_type: Literal["GAP", "J-ACE", "NEP", "NEQUIP", "M3GNET", "MACE"] = "GAP"
hyperpara_opt: bool = False
ref_energy_name: str = "REF_energy"
ref_force_name: str = "REF_forces"
@@ -139,10 +140,10 @@ def make(
fit_kwargs: dict
Additional keyword arguments for MLIP fitting.
"""
if self.mlip_type not in ["GAP", "J-ACE", "NEQUIP", "M3GNET", "MACE"]:
if self.mlip_type not in ["GAP", "J-ACE", "NEP", "NEQUIP", "M3GNET", "MACE"]:
raise ValueError(
"Please correct the MLIP name!"
"The current version ONLY supports the following models: GAP, J-ACE, NEQUIP, M3GNET, and MACE."
"The current version ONLY supports the following models: GAP, J-ACE, NEP, NEQUIP, M3GNET, and MACE."
)

if self.apply_data_preprocessing:
@@ -156,6 +157,9 @@
pre_xyz_files=self.pre_xyz_files,
pre_database_dir=self.pre_database_dir,
force_min=self.force_min,
ref_virial_name=self.ref_virial_name,
ref_force_name=self.ref_force_name,
ref_energy_name=self.ref_energy_name,
atomwise_regularization_parameter=self.atomwise_regularization_parameter,
atom_wise_regularization=self.atom_wise_regularization,
).make(
@@ -226,6 +230,12 @@ class DataPreprocessing(Maker):
Repeat the fit for each data_type available in the (combined) database.
distillation: bool
For using data distillation.
ref_energy_name : str
Reference energy name in xyz file.
ref_force_name : str
Reference force name in xyz file.
ref_virial_name : str
Reference virial name in xyz file.
force_max: float
Maximally allowed force in the data set.
force_min: float
@@ -238,6 +248,10 @@
Regularization value for the atom-wise force components.
atom_wise_regularization: bool
If True, includes atom-wise regularization.
train_data_file: str
Name of the training xyz data file.
test_data_file: str
Name of the test xyz data file.

"""

@@ -246,12 +260,17 @@
regularization: bool = False
separated: bool = False
distillation: bool = False
ref_energy_name: str = "REF_energy"
ref_force_name: str = "REF_forces"
ref_virial_name: str = "REF_virial"
force_max: float = 40.0
force_min: float = 0.01 # unit: eV Å-1
pre_database_dir: str | None = None
pre_xyz_files: list[str] | None = None
atomwise_regularization_parameter: float = 0.1
atom_wise_regularization: bool = True
train_data_file: str = "train.extxyz"
test_data_file: str = "test.extxyz"

@job
def make(
@@ -308,16 +327,25 @@ def make(
f_min=self.force_min,
regularization=self.atomwise_regularization_parameter,
atom_wise_regularization=self.atom_wise_regularization,
ref_force_name=self.ref_force_name,
ref_energy_name=self.ref_energy_name,
ref_virial_name=self.ref_virial_name,
)

write_after_distillation_data_split(
self.distillation, self.force_max, self.split_ratio
distillation=self.distillation,
force_max=self.force_max,
split_ratio=self.split_ratio,
force_label=self.ref_force_name,
energy_label=self.ref_energy_name,
train_name=self.train_data_file,
test_name=self.test_data_file,
)

# Merging database
if self.pre_database_dir and os.path.exists(self.pre_database_dir):
if len(self.pre_xyz_files) == 2:
files_new = ["train.extxyz", "test.extxyz"]
files_new = [self.train_data_file, self.test_data_file]
for file_name, file_new in zip(self.pre_xyz_files, files_new):
with (
open(
Expand All @@ -340,13 +368,13 @@ def make(
logging.info(f"Created/verified folder: {folder_name}")
except Exception as e:
logging.warning(f"Error creating folder {folder_name}: {e}")
train_path = os.path.join(folder_name, "train.extxyz")
test_path = os.path.join(folder_name, "test.extxyz")
atoms = ase.io.read("train.extxyz", index=":")
train_path = os.path.join(folder_name, self.train_data_file)
test_path = os.path.join(folder_name, self.test_data_file)
atoms = ase.io.read(self.train_data_file, index=":")
ase.io.write(train_path, atoms, format="extxyz")
logging.info(f"Written train file without regularization to: {train_path}")
try:
shutil.copy("test.extxyz", test_path)
shutil.copy(self.test_data_file, test_path)
logging.info(f"Copied test file to: {test_path}")
except FileNotFoundError:
logging.warning(f"{self.test_data_file} not found. Skipping copy.")
@@ -356,11 +384,11 @@
atoms,
reg_minmax=[(0.1, 1), (0.001, 0.1), (0.0316, 0.316), (0.0632, 0.632)],
)
ase.io.write("train.extxyz", atoms_with_sigma, format="extxyz")
ase.io.write(self.train_data_file, atoms_with_sigma, format="extxyz")
if self.separated:
base_dir = os.getcwd()
atoms_train = ase.io.read("train.extxyz", index=":")
atoms_test = ase.io.read("test.extxyz", index=":")
atoms_train = ase.io.read(self.train_data_file, index=":")
atoms_test = ase.io.read(self.test_data_file, index=":")
for dt in set(data_types):
data_type = dt.removesuffix("_dir")
if data_type != "iso_atoms":
@@ -375,8 +403,8 @@
)
continue
vasp_ref_path = os.path.join(folder_name, "vasp_ref.extxyz")
train_path = os.path.join(folder_name, "train.extxyz")
test_path = os.path.join(folder_name, "test.extxyz")
train_path = os.path.join(folder_name, self.train_data_file)
test_path = os.path.join(folder_name, self.test_data_file)

for atoms in atoms_train + atoms_test:
if atoms.info["data_type"] == "iso_atoms":
@@ -401,6 +429,7 @@
vasp_ref_name=vasp_ref_path,
train_name=train_path,
test_name=test_path,
force_label=self.ref_force_name,
)
logging.info(f"Data split written: {train_path}, {test_path}")
except Exception as e:
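The `Literal` annotation on `mlip_type` documents the allowed models for static type checkers, but the explicit `ValueError` check is still needed at runtime for plain strings. One way to keep the two from drifting apart (a sketch under assumed names, not the autoplex implementation) is to derive the runtime set from the annotation itself:

```python
from typing import Literal, get_args

MLIPType = Literal["GAP", "J-ACE", "NEP", "NEQUIP", "M3GNET", "MACE"]

def validate_mlip_type(mlip_type: str) -> str:
    # get_args() recovers the allowed values from the Literal, so the
    # runtime check always matches the annotation.
    allowed = get_args(MLIPType)
    if mlip_type not in allowed:
        raise ValueError(
            f"Unsupported MLIP type {mlip_type!r}; expected one of {allowed}"
        )
    return mlip_type
```

With this pattern, adding a new backend such as "NEP" is a one-line change to the `Literal` rather than two parallel edits.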
23 changes: 22 additions & 1 deletion src/autoplex/fitting/common/jobs.py
@@ -10,6 +10,7 @@
jace_fitting,
m3gnet_fitting,
mace_fitting,
nep_fitting,
nequip_fitting,
)

@@ -27,6 +28,7 @@ def machine_learning_fit(
auto_delta: bool = True,
glue_xml: bool = False,
glue_file_path: str = "glue.xml",
gpu_identifier_indices: list[int] | None = None,
mlip_type: str | None = None,
ref_energy_name: str = "REF_energy",
ref_force_name: str = "REF_forces",
@@ -57,9 +59,11 @@
Use the glue.xml core potential instead of fitting 2b terms.
glue_file_path: str
Name of the glue.xml file path.
gpu_identifier_indices: list[int]
List of GPU indices to be used for fitting. Only used for NEP fitting.
mlip_type: str
Choose one specific MLIP type to be fitted:
'GAP' | 'J-ACE' | 'NEQUIP' | 'M3GNET' | 'MACE'
'GAP' | 'J-ACE' | 'NEQUIP' | 'NEP' | 'M3GNET' | 'MACE'
ref_energy_name: str
Reference energy name.
ref_force_name: str
@@ -129,6 +133,23 @@
)
mlip_paths.append(train_test_error["mlip_path"])

elif mlip_type == "NEP":
if gpu_identifier_indices is None:
gpu_identifier_indices = [0]

train_test_error = nep_fitting(
db_dir=database_dir,
path_to_hyperparameters=path_to_hyperparameters,
ref_energy_name=ref_energy_name,
ref_force_name=ref_force_name,
ref_virial_name=ref_virial_name,
species_list=species_list,
gpu_identifier_indices=gpu_identifier_indices,
fit_kwargs=fit_kwargs,
)

mlip_paths.append(train_test_error["mlip_path"])

elif mlip_type == "NEQUIP":
train_test_error = nequip_fitting(
db_dir=database_dir,
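`gpu_identifier_indices` defaults to `None` rather than `[0]` because a mutable default in Python is created once at function definition and shared across every call. The None-sentinel idiom used in the NEP branch can be sketched as follows (the function name is illustrative, not autoplex API):

```python
def resolve_gpu_indices(gpu_identifier_indices=None):
    # A `=[0]` default would be one shared list that every caller could
    # mutate; the None sentinel yields a fresh list on each call.
    if gpu_identifier_indices is None:
        gpu_identifier_indices = [0]
    return gpu_identifier_indices

first = resolve_gpu_indices()
first.append(1)                 # mutate the first call's list
second = resolve_gpu_indices()  # unaffected: still a fresh [0]
```

The same reasoning applies to any list- or dict-valued parameter added to these job signatures.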
21 changes: 21 additions & 0 deletions src/autoplex/fitting/common/mlip-rss-defaults.json
@@ -56,5 +56,26 @@
"restart_latest": true,
"seed": 123,
"device": "cpu"
},
"NEP": {
"version": 4,
"type": [1, "X"],
"type_weight": 1.0,
"model_type": 0,
"prediction": 0,
"cutoff": [6, 5],
"n_max": [4, 4],
"basis_size": [8, 8],
"l_max": [4, 2, 1],
"neuron": 80,
"lambda_1": 0,
"lambda_e": 1.0,
"lambda_f": 1.0,
"lambda_v": 0.1,
"force_delta": 0,
"batch": 1000,
"population": 60,
"generation": 100000,
"zbl": 2
}
}
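GPUMD's `nep` trainer reads hyperparameters like these from a plain-text `nep.in` file with one space-separated `keyword value ...` line per setting. A sketch of serializing a defaults dict in that direction (which keywords a given GPUMD version accepts is an assumption to verify; the `.strip()` guards against stray whitespace sneaking into key names):

```python
def to_nep_in(params: dict) -> str:
    """Render a hyperparameter dict as nep.in-style keyword lines."""
    lines = []
    for key, value in params.items():
        if isinstance(value, (list, tuple)):
            # List values become space-separated, e.g. "cutoff 6 5".
            rendered = " ".join(str(v) for v in value)
        else:
            rendered = str(value)
        lines.append(f"{key.strip()} {rendered}")
    return "\n".join(lines) + "\n"

nep_defaults = {"version": 4, "type": [1, "X"], "cutoff": [6, 5], "neuron": 80}
print(to_nep_in(nep_defaults))
```

In practice calorine's NEP helpers handle this serialization; the sketch only shows why the JSON stores lists for multi-valued keywords such as `cutoff` and `n_max`.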