use rdkit2ase (#344)

* use rdkit2ase * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update rdkit2ase * rename `Smiles2Atoms` * refactor to use rdkit2ase * update packmol * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
zincware · Dec 3, 2024 · c62086f · c62086f
1 parent d849313
commit c62086f
Show file tree

Hide file tree

Showing 9 changed files with 105 additions and 189 deletions.
diff --git a/docs/source/_get_started/quickstart.rst b/docs/source/_get_started/quickstart.rst
@@ -58,17 +58,17 @@ The goal is to place water molecules inside a box at a specified density.
 
     with ips.Project() as project:
         # Generate a single water molecule
-        mol = ips.SmilesToAtoms(smiles="O")
+        mol = ips.Smiles2Atoms(smiles="O")
         # Duplicate water molecules
         packmol = ips.Packmol(
             data=[mol.atoms], count=[10], density=876
             )
 
     project.build()
 
-The *SmilesToAtoms* node takes the `SMILES <https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system>`_ string for water ("O"),
+The *Smiles2Atoms* node takes the `SMILES <https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system>`_ string for water ("O"),
 and generates an an `ASE atoms object <https://wiki.fysik.dtu.dk/ase/ase/atoms.html>`_ from it.
-This atom from the SmilesToAtoms instance can be accessd by calling `mol.atoms`.
+This atom from the Smiles2Atoms instance can be accessd by calling `mol.atoms`.
 This instance attribute is then handed over to the Packmol node in the `data` attribute.
 It is possible to hand over multiple different types of ASE atoms,
 so `mol.atoms` as to be given as a list. The next attribute we need to set is the amount of molecules we want to place in the box.
@@ -80,7 +80,7 @@ used with `Mermaid <https://mermaid.js.org/>`_ to better visualise the workflows
 
 The parameters files, namely `params.yaml` can be viewed and parameters can be changed without rerunning the python script.
 
-By calling `dvc repro` the workflow consisting of the nodes is executed, which results in the generation of firstly a single water molecule by the SmilesToAtoms node and then the placement
+By calling `dvc repro` the workflow consisting of the nodes is executed, which results in the generation of firstly a single water molecule by the Smiles2Atoms node and then the placement
 of multiple water molecules in a box by the Packmol node.
 After running `dvc repro` a new folder `nodes` is created which each node having their own subfolder. These folders
 contain the results from the Nodes.
@@ -90,9 +90,9 @@ we can visualise the H5MD files generated by the nodes.
 
 .. code-block:: bash
 
-    zndraw nodes/SmilesToAtoms/structures.h5
+    zndraw nodes/Smiles2Atoms/structures.h5
 
-will visualise the single water molecule generated by SmilesToAtoms. There will also be a H5MD file in the Packmol folder
+will visualise the single water molecule generated by Smiles2Atoms. There will also be a H5MD file in the Packmol folder
 
 .. figure:: ../images/water_packmol.png
     :alt: ZnDraw Screenshot

diff --git a/docs/source/examples/06_Bootstrapping_Datasets.ipynb b/docs/source/examples/06_Bootstrapping_Datasets.ipynb
@@ -123,7 +123,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Running DVC command: 'stage add --name SmilesToAtoms --force ...'\n",
+      "Running DVC command: 'stage add --name Smiles2Atoms --force ...'\n",
       "Running DVC command: 'stage add --name Packmol --force ...'\n",
       "Running DVC command: 'stage add --name opt_calc --force ...'\n",
       "Running DVC command: 'stage add --name ASEGeoOpt --force ...'\n",
@@ -371,7 +371,7 @@
     "mapping = ips.geometry.BarycenterMapping()\n",
     "\n",
     "with ips.Project() as project:\n",
-    "    water = ips.SmilesToAtoms(smiles=\"O\")\n",
+    "    water = ips.Smiles2Atoms(smiles=\"O\")\n",
     "    packmol = ips.Packmol(\n",
     "        data=[water.atoms], count=[10], density=997\n",
     "    )\n",

diff --git a/ipsuite/__init__.pyi b/ipsuite/__init__.pyi
@@ -73,9 +73,9 @@ from .calculators import (
 from .configuration_generation import (
     MultiPackmol,
     Packmol,
+    Smiles2Atoms,
+    Smiles2Conformers,
     Smiles2Gromacs,
-    SmilesToAtoms,
-    SmilesToConformers,
 )
 
 # Configuration Selection
@@ -127,8 +127,8 @@ __all__ = [
     # Configuration Generation
     "Packmol",
     "MultiPackmol",
-    "SmilesToAtoms",
-    "SmilesToConformers",
+    "Smiles2Atoms",
+    "Smiles2Conformers",
     "Smiles2Gromacs",
     # Data
     "AddData",

diff --git a/ipsuite/configuration_generation/__init__.pyi b/ipsuite/configuration_generation/__init__.pyi
@@ -2,12 +2,12 @@
 
 from .gmx import Smiles2Gromacs
 from .packmol import MultiPackmol, Packmol
-from .smiles_to_atoms import SmilesToAtoms, SmilesToConformers
+from .smiles_to_atoms import Smiles2Atoms, Smiles2Conformers
 
 __all__ = [
-    "SmilesToAtoms",
+    "Smiles2Atoms",
     "Packmol",
-    "SmilesToConformers",
+    "Smiles2Conformers",
     "MultiPackmol",
     "Smiles2Gromacs",
 ]
diff --git a/ipsuite/configuration_generation/packmol.py b/ipsuite/configuration_generation/packmol.py
@@ -1,17 +1,15 @@
 """Use packmole to create a periodic box"""
 
 import logging
-import pathlib
-import subprocess
+import random
 
 import ase
 import ase.units
 import numpy as np
+import rdkit2ase
 import zntrack
-from ase.visualize import view
 
 from ipsuite import base, fields
-from ipsuite.utils.ase_sim import get_box_from_density
 
 log = logging.getLogger(__name__)
 
@@ -31,8 +29,6 @@ class Packmol(base.IPSNode):
         Number of molecules to add for each entry in data.
     tolerance : float
         Tolerance for the distance of atoms in angstrom.
-    box : list[float]
-        Box size in angstrom. Either density or box is required.
     density : float
         Density of the system in kg/m^3. Either density or box is required.
     pbc : bool
@@ -45,73 +41,38 @@ class Packmol(base.IPSNode):
     data_ids: list[int] = zntrack.params(None)
     count: list = zntrack.params()
     tolerance: float = zntrack.params(2.0)
-    box: list = zntrack.params(None)
-    density: float = zntrack.params(None)
-    structures: pathlib.Path = zntrack.outs_path(zntrack.nwd / "packmol")
+    density: float = zntrack.params()
     frames: list[ase.Atoms] = fields.Atoms()
     pbc: bool = zntrack.params(True)
 
     def __post_init__(self):
-        if self.box is None and self.density is None:
-            raise ValueError("Either box or density must be set.")
         if len(self.data) != len(self.count):
             raise ValueError("The number of data and count must be the same.")
-        if self.data_ids is not None and len(self.data) != len(self.data_ids):
-            raise ValueError("The number of data and data_ids must be the same.")
-        if self.box is not None and isinstance(self.box, (int, float)):
-            self.box = [self.box, self.box, self.box]
 
     def run(self):
-        self.structures.mkdir(exist_ok=True, parents=True)
-        for idx, atoms in enumerate(self.data):
-            atoms = atoms[-1] if self.data_ids is None else atoms[self.data_ids[idx]]
-            ase.io.write(self.structures / f"{idx}.xyz", atoms)
-
-        if self.density is not None:
-            self._get_box_from_molar_volume()
-
-        if self.pbc:
-            scaled_box = [x - self.tolerance for x in self.box]
+        data = []
+        if self.data_ids is not None:
+            for idx, frames in zip(self.data_ids, self.data):
+                data.append([frames[idx]])
         else:
-            scaled_box = self.box
-
-        file = f"""
-        tolerance {self.tolerance}
-        filetype xyz
-        output mixture.xyz
-        """
-        for idx, count in enumerate(self.count):
-            file += f"""
-            structure {idx}.xyz
-                number {count}
-                inside box 0 0 0 {" ".join([f"{x:.4f}" for x in scaled_box])}
-            end structure
-            """
-        with pathlib.Path(self.structures / "packmole.inp").open("w") as f:
-            f.write(file)
-
-        subprocess.check_call("packmol < packmole.inp", shell=True, cwd=self.structures)
-
-        atoms = ase.io.read(self.structures / "mixture.xyz")
-        if self.pbc:
-            atoms.cell = self.box
-            atoms.pbc = True
-        self.frames = [atoms]
-
-    def _get_box_from_molar_volume(self):
-        """Get the box size from the molar volume"""
-        self.box = get_box_from_density(self.data, self.count, self.density)
-        log.info(f"estimated box size: {self.box}")
-
-    def view(self) -> view:
-        return view(self.frames, viewer="x3d")
+            data = self.data
+
+        self.frames = [
+            rdkit2ase.pack(
+                data=data,
+                counts=self.count,
+                tolerance=self.tolerance,
+                density=self.density,
+                pbc=self.pbc,
+            )
+        ]
 
 
 class MultiPackmol(Packmol):
     """Create multiple configurations with packmol.
 
     This Node generates multiple configurations with packmol.
-    This is best used in conjunction with SmilesToConformers:
+    This is best used in conjunction with Smiles2Conformers:
 
     Example
     -------
@@ -120,7 +81,7 @@ class MultiPackmol(Packmol):
 
     >>> import ipsuite as ips
     >>> with ips.Project() as project:
-    ...     water = ips.SmilesToConformers(
+    ...     water = ips.Smiles2Conformers(
     ...         smiles='O', numConfs=100
     ...         )
     ...     boxes = ips.MultiPackmol(
@@ -141,50 +102,23 @@ class MultiPackmol(Packmol):
 
     n_configurations: int = zntrack.params()
     seed: int = zntrack.params(42)
-    data_ids = None
 
     def run(self):
         np.random.seed(self.seed)
         self.frames = []
-
-        if self.density is not None:
-            self._get_box_from_molar_volume()
-
-        if self.pbc:
-            scaled_box = [x - self.tolerance for x in self.box]
-        else:
-            scaled_box = self.box
-
-        self.structures.mkdir(exist_ok=True, parents=True)
-        for idx, atoms_list in enumerate(self.data):
-            for jdx, atoms in enumerate(atoms_list):
-                ase.io.write(self.structures / f"{idx}_{jdx}.xyz", atoms)
-
-        for idx in range(self.n_configurations):
-            file = f"""
-            tolerance {self.tolerance}
-            filetype xyz
-            output mixture_{idx}.xyz
-            """
-            for jdx, count in enumerate(self.count):
-                choices = np.random.choice(len(self.data[jdx]), count)
-                for kdx in choices:
-                    file += f"""
-                    structure {jdx}_{kdx}.xyz
-                        number 1
-                        inside box 0 0 0 {" ".join([f"{x:.4f}" for x in scaled_box])}
-                    end structure
-                    """
-            with pathlib.Path(self.structures / f"packmole_{idx}.inp").open("w") as f:
-                f.write(file)
-
-            subprocess.check_call(
-                f"packmol < packmole_{idx}.inp", shell=True, cwd=self.structures
+        for _ in range(self.n_configurations):
+            # shuffle each data entry
+            data = []
+            for frames in self.data:
+                random.shuffle(frames)
+                data.append(frames)
+
+            self.frames.append(
+                rdkit2ase.pack(
+                    data=data,
+                    counts=self.count,
+                    tolerance=self.tolerance,
+                    density=self.density,
+                    pbc=self.pbc,
+                )
             )
-
-            atoms = ase.io.read(self.structures / f"mixture_{idx}.xyz")
-            if self.pbc:
-                atoms.cell = self.box
-                atoms.pbc = True
-
-            self.frames.append(atoms)
diff --git a/ipsuite/configuration_generation/smiles_to_atoms.py b/ipsuite/configuration_generation/smiles_to_atoms.py
@@ -1,50 +1,29 @@
-import pathlib
-
 import ase
+import rdkit2ase
 import zntrack
-from ase.visualize import view
-from rdkit import Chem
-from rdkit.Chem import AllChem, Draw
 
 from ipsuite import base, fields
 
 
-class SmilesToAtoms(base.IPSNode):
+class Smiles2Atoms(base.IPSNode):
     atoms: list[ase.Atoms] = fields.Atoms()
 
     smiles: str = zntrack.params()
     cell: float = zntrack.params(None)
     seed: int = zntrack.params(1234)
-    optimizer: str = zntrack.params("UFF")
-    image: pathlib.Path = zntrack.outs_path(zntrack.nwd / "molecule.png")
 
     def run(self):
-        mol = Chem.MolFromSmiles(self.smiles)
-        Draw.MolToFile(mol, self.image)
-
-        mol = Chem.AddHs(mol)
-        AllChem.EmbedMolecule(mol, randomSeed=self.seed)
-
-        if self.optimizer == "UFF":
-            AllChem.UFFOptimizeMolecule(mol)
-        elif self.optimizer == "MMFF":
-            AllChem.MMFFOptimizeMolecule(mol)
-
-        atoms = ase.Atoms(
-            positions=mol.GetConformer().GetPositions(),
-            numbers=[atom.GetAtomicNum() for atom in mol.GetAtoms()],
+        atoms = rdkit2ase.smiles2atoms(
+            smiles=self.smiles,
+            seed=self.seed,
         )
-        atoms.positions -= atoms.get_center_of_mass()
-        if self.cell is not None:
+        if self.cell:
             atoms.set_cell([self.cell, self.cell, self.cell])
             atoms.center()
         self.atoms = [atoms]
 
-    def view(self) -> view:
-        return view(self.atoms[0], viewer="x3d")
 
-
-class SmilesToConformers(base.IPSNode):
+class Smiles2Conformers(base.IPSNode):
     frames: list[ase.Atoms] = fields.Atoms()
 
     smiles: str = zntrack.params()
@@ -54,24 +33,14 @@ class SmilesToConformers(base.IPSNode):
     cell: float = zntrack.params(100)
 
     def run(self):
-        mol = Chem.MolFromSmiles(self.smiles)
-        mol = Chem.AddHs(mol)
-        AllChem.EmbedMultipleConfs(
-            mol,
+        frames = rdkit2ase.smiles2conformers(
+            smiles=self.smiles,
             numConfs=self.numConfs,
             randomSeed=self.seed,
             maxAttempts=self.maxAttempts,
         )
-        self.frames = []
-        for conf in mol.GetConformers():
-            atoms = ase.Atoms(
-                positions=conf.GetPositions(),
-                numbers=[atom.GetAtomicNum() for atom in mol.GetAtoms()],
-            )
-            atoms.positions -= atoms.get_center_of_mass()
-
-            if self.cell is not None:
+        if self.cell:
+            for atoms in frames:
                 atoms.set_cell([self.cell, self.cell, self.cell])
                 atoms.center()
-
-            self.frames.append(atoms)
+        self.frames = frames