Skip to content
26 changes: 26 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ The `System` or `LabeledSystem` can be constructed from the following file forma
| deepmd | npy | True | False | System | 'deepmd/npy' |
| deepmd | raw | True | True | LabeledSystem | 'deepmd/raw' |
| deepmd | npy | True | True | LabeledSystem | 'deepmd/npy' |
| deepmd | npy | True | True | MultiSystems | 'deepmd/npy/mixed' |
| deepmd | npy | True | False | MultiSystems | 'deepmd/npy/mixed' |
| gaussian| log | False | True | LabeledSystem | 'gaussian/log'|
| gaussian| log | True | True | LabeledSystem | 'gaussian/md' |
| siesta | output | False | True | LabeledSystem | 'siesta/output'|
Expand Down Expand Up @@ -278,6 +280,30 @@ print(syst.get_charge()) # return the total charge of the system

If a valence of 3 is detected on carbon, the formal charge will be assigned to -1, because in most cases (alkynyl anion, isonitrile, cyclopentadienyl anion) the formal charge on a 3-valence carbon is -1; this is also consistent with the 8-electron rule.

## Mixed Type Format
The format `deepmd/npy/mixed` is the mixed type numpy format for DeePMD-kit, and can be loaded or dumped through class `dpdata.MultiSystems`.

Under this format, systems with the same number of atoms but different formulas can be put together
into a larger system, which is especially useful when each individual system contains only a few frames.

This also helps to mix the type information together for model training with a type embedding network.

Here are examples using `deepmd/npy/mixed` format:

- Dump a MultiSystems into a mixed type numpy directory:
```python
import dpdata

dpdata.MultiSystems(*systems).to_deepmd_npy_mixed("mixed_dir")
```

- Load a mixed type data into a MultiSystems:
```python
import dpdata

dpdata.MultiSystems().load_systems_from_file("mixed_dir", fmt="deepmd/npy/mixed")
```

# Plugins

One can follow [a simple example](plugin_example/) to add their own format by creating and installing plugins. It's critical to add the [Format](dpdata/format.py) class to `entry_points['dpdata.plugins']` in [`pyproject.toml`](plugin_example/pyproject.toml):
Expand Down
251 changes: 251 additions & 0 deletions dpdata/deepmd/mixed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@
import glob
import os
import shutil

import numpy as np


def load_type(folder):
    """Read the atom names of a mixed-type system from ``type_map.raw``.

    Parameters
    ----------
    folder : str
        Directory expected to contain ``type_map.raw``.

    Returns
    -------
    dict
        ``{"atom_names": [...]}`` where the list holds the
        whitespace-separated tokens of ``type_map.raw`` in file order.

    Raises
    ------
    AssertionError
        If ``type_map.raw`` is not a regular file inside ``folder``.
    """
    type_map_path = os.path.join(folder, "type_map.raw")
    # Raised explicitly instead of via ``assert`` so the check is not stripped
    # under ``python -O``; the exception type is kept for existing callers.
    if not os.path.isfile(type_map_path):
        raise AssertionError("Mixed type system must have type_map.raw!")
    with open(type_map_path) as fp:
        return {"atom_names": fp.read().split()}


def formula(atom_names, atom_numbs):
    """Build a formula string such as ``C3H5O2``.

    Each name is immediately followed by its count; names and counts are
    paired positionally and pairing stops at the shorter of the two inputs.
    """
    pieces = []
    for name, count in zip(atom_names, atom_numbs):
        pieces.append(f"{name}{count}")
    return "".join(pieces)


def _cond_load_data(fname):
tmp = None
if os.path.isfile(fname):
tmp = np.load(fname)
return tmp


def _load_set(folder, nopbc: bool):
    """Load the per-frame arrays stored in one ``set.*`` directory.

    ``coord.npy`` and ``real_atom_types.npy`` are always loaded. ``box.npy``
    is loaded unless ``nopbc`` is true, in which case a zero array of shape
    ``(coords.shape[0], 3, 3)`` is used instead. ``energy.npy``,
    ``force.npy`` and ``virial.npy`` are optional and come back as ``None``
    when the file is absent.

    Returns
    -------
    tuple
        ``(cells, coords, eners, forces, virs, real_atom_types)``
    """

    def _in_set(name):
        return os.path.join(folder, name)

    coords = np.load(_in_set("coord.npy"))
    cells = (
        np.zeros((coords.shape[0], 3, 3)) if nopbc else np.load(_in_set("box.npy"))
    )
    # Optional label files, loaded in the order energy -> force -> virial.
    eners, forces, virs = (
        _cond_load_data(_in_set(name))
        for name in ("energy.npy", "force.npy", "virial.npy")
    )
    real_atom_types = np.load(_in_set("real_atom_types.npy"))
    return cells, coords, eners, forces, virs, real_atom_types


def to_system_data(folder, type_map=None, labels=True):
    """Load a mixed-type directory and split its frames by atom-type layout.

    Frames whose rows in ``real_atom_types.npy`` are identical are grouped
    together; one data dict is produced per distinct row, in order of first
    appearance.

    Parameters
    ----------
    folder : str
        Directory holding ``type_map.raw``, an optional ``nopbc`` marker file
        and exactly one ``set.*`` sub-directory.
    type_map : optional
        Not read by this function; accepted for interface compatibility.
    labels : bool
        When true, ``energies``, ``forces`` and ``virials`` are added to each
        dict if the corresponding array was found and is non-empty.

    Returns
    -------
    list of dict
        Each dict has ``atom_names``, ``orig``, optionally ``nopbc``, then
        ``atom_numbs``, ``atom_types``, ``cells``, ``coords`` and the
        available label arrays restricted to the frames of that group.

    Raises
    ------
    AssertionError
        If ``folder`` does not contain exactly one ``set.*`` entry (or, via
        ``load_type``, if ``type_map.raw`` is missing).
    """
    data = load_type(folder)
    data["orig"] = np.zeros([3])
    if os.path.isfile(os.path.join(folder, "nopbc")):
        data["nopbc"] = True
    sets = sorted(glob.glob(os.path.join(folder, "set.*")))
    # Explicit raise (not ``assert``) so the check also runs under ``python -O``.
    if len(sets) != 1:
        raise AssertionError("Mixed type must have only one set!")
    cells, coords, eners, forces, virs, real_atom_types = _load_set(
        sets[0], data.get("nopbc", False)
    )
    # The number of frames is derived from the cell array.
    nframes = np.reshape(cells, [-1, 3, 3]).shape[0]
    cells = np.reshape(cells, [nframes, 3, 3])
    coords = np.reshape(coords, [nframes, -1, 3])
    real_atom_types = np.reshape(real_atom_types, [nframes, -1])

    # Label arrays that are present and non-empty, reshaped to per-frame form.
    frame_labels = {}
    if labels:
        for key, values, shape in (
            ("energies", eners, [nframes]),
            ("forces", forces, [nframes, -1, 3]),
            ("virials", virs, [nframes, 3, 3]),
        ):
            if values is not None and values.size > 0:
                frame_labels[key] = np.reshape(values, shape)

    data_list = []
    if real_atom_types.size == 0:
        return data_list

    n_types = len(data["atom_names"])
    # Only the index array shrinks between passes; the frame arrays themselves
    # are sliced once per group instead of being re-copied on every pass.
    remaining = np.arange(nframes)
    while remaining.size > 0:
        reference = real_atom_types[remaining[0]]
        same = (real_atom_types[remaining] == reference).all(-1)
        picked = remaining[same]
        remaining = remaining[~same]

        # Shallow copy: ``atom_names`` and ``orig`` are shared by all dicts.
        temp_data = data.copy()
        temp_data["atom_numbs"] = [
            np.count_nonzero(reference == i) for i in range(n_types)
        ]
        temp_data["atom_types"] = reference
        temp_data["cells"] = cells[picked]
        temp_data["coords"] = coords[picked]
        for key, values in frame_labels.items():
            temp_data[key] = values[picked]
        data_list.append(temp_data)
    return data_list


def dump(folder, data, comp_prec=np.float32, remove_sets=True):
    """Write one system to ``folder`` in the mixed-type numpy layout.

    Parameters
    ----------
    folder : str
        Output directory; created if missing.
    data : dict
        System data. ``atom_types``, ``real_atom_names``, ``cells`` and
        ``coords`` are read unconditionally; ``bonds``, ``formal_charges``,
        ``energies``, ``forces``, ``virials``, ``atom_pref``,
        ``real_atom_types`` and ``nopbc`` are used when present.
    comp_prec : numpy dtype
        Floating-point type the frame arrays are cast to before saving.
    remove_sets : bool
        If true, existing ``set.*`` entries in ``folder`` are deleted first;
        otherwise their presence is an error.

    Raises
    ------
    RuntimeError
        If ``folder`` already contains ``set.*`` entries and ``remove_sets``
        is false.
    """
    os.makedirs(folder, exist_ok=True)
    sets = sorted(glob.glob(os.path.join(folder, "set.*")))
    if sets:
        if not remove_sets:
            raise RuntimeError(
                f"found {sets} in {folder} not a clean deepmd raw dir. "
                "please firstly clean set.* then try compress"
            )
        for stale_set in sets:
            shutil.rmtree(stale_set)

    # If the data has not been converted to mixed type yet, convert it.
    if "real_atom_types" not in data:
        from dpdata import LabeledSystem, System

        system_cls = LabeledSystem if "energies" in data else System
        # NOTE(review): everything below keeps reading ``data``, so this
        # relies on convert_to_mixed_type() updating that same dict in place
        # (adding "real_atom_names"/"real_atom_types") - confirm.
        system_cls(data=data).convert_to_mixed_type()

    # Raw text files.
    np.savetxt(os.path.join(folder, "type.raw"), data["atom_types"], fmt="%d")
    np.savetxt(os.path.join(folder, "type_map.raw"), data["real_atom_names"], fmt="%s")
    # BondOrder System
    if "bonds" in data:
        np.savetxt(
            os.path.join(folder, "bonds.raw"),
            data["bonds"],
            header="begin_atom, end_atom, bond_order",
        )
    if "formal_charges" in data:
        np.savetxt(os.path.join(folder, "formal_charges.raw"), data["formal_charges"])

    # Flatten every frame property to one row per frame and convert precision.
    # All arrays are prepared before the set directory is created.
    nframes = data["cells"].shape[0]
    frame_arrays = {
        "box": np.reshape(data["cells"], [nframes, 9]).astype(comp_prec),
        "coord": np.reshape(data["coords"], [nframes, -1]).astype(comp_prec),
    }
    optional_arrays = (
        # (key in data, file stem, per-frame shape, dtype)
        ("energies", "energy", [nframes], comp_prec),
        ("forces", "force", [nframes, -1], comp_prec),
        ("virials", "virial", [nframes, 9], comp_prec),
        ("real_atom_types", "real_atom_types", [nframes, -1], np.int64),
        ("atom_pref", "atom_pref", [nframes, -1], comp_prec),
    )
    for key, stem, shape, dtype in optional_arrays:
        if key in data:
            frame_arrays[stem] = np.reshape(data[key], shape).astype(dtype)

    # A mixed-type directory holds a single set, always named set.000.
    set_folder = os.path.join(folder, "set.%03d" % 0)
    os.makedirs(set_folder)
    for stem, array in frame_arrays.items():
        np.save(os.path.join(set_folder, stem), array)

    # Reset the nopbc marker so it always reflects the data just written.
    nopbc_marker = os.path.join(folder, "nopbc")
    try:
        os.remove(nopbc_marker)
    except OSError:
        # Best effort: the marker is usually just absent.
        pass
    if data.get("nopbc", False):
        with open(nopbc_marker, "w"):
            pass


def mix_system(*system, type_map, split_num=200, **kwargs):
    """Mix the systems into mixed_type ones

    Systems are copied, converted to mixed type with the shared ``type_map``
    and accumulated per atom count. Whenever a group has collected at least
    ``split_num`` frames, chunks of ``split_num`` frames are cut off and
    stored; whatever is left at the end is stored as a final, shorter system.

    Parameters
    ----------
    *system : System
        The systems to mix
    type_map : list of str
        Maps atom type to name
    split_num : int
        Number of frames in each system; must be positive
    **kwargs
        Ignored.

    Returns
    -------
    mixed_systems: dict
        dict of mixed system with key '{natoms}/sys.xxxxxx', where the
        six-digit index counts the systems emitted for that atom count

    Raises
    ------
    ValueError
        If ``split_num`` is not positive (splitting could never finish).
    """
    if split_num <= 0:
        raise ValueError(f"split_num must be a positive integer, got {split_num}")

    mixed_systems = {}
    pending = {}  # natoms key -> system still collecting frames (or None)
    next_sys_index = {}  # natoms key -> index of the next emitted system
    pending_frames = {}  # natoms key -> frames currently held in ``pending``
    for one_system in system:
        tmp_sys = one_system.copy()
        key = str(tmp_sys.get_natoms())
        tmp_sys.convert_to_mixed_type(type_map=type_map)
        next_sys_index.setdefault(key, 0)
        pending_frames[key] = pending_frames.get(key, 0) + tmp_sys.get_nframes()
        # Start a new accumulator when none exists or the current one is falsy
        # (e.g. ``None`` left behind by an exact split).
        if not pending.get(key):
            pending[key] = tmp_sys
        else:
            pending[key].append(tmp_sys)
        if pending_frames[key] >= split_num:
            while True:
                sys_split, pending[key], rest_num = split_system(
                    pending[key], split_num=split_num
                )
                sys_name = "%s/sys.%.6d" % (key, next_sys_index[key])
                mixed_systems[sys_name] = sys_split
                next_sys_index[key] += 1
                if rest_num < split_num:
                    pending_frames[key] = rest_num
                    break
    # Flush the leftovers that never reached ``split_num`` frames.
    for key, leftover in pending.items():
        if pending_frames[key] > 0:
            sys_name = "%s/sys.%.6d" % (key, next_sys_index[key])
            mixed_systems[sys_name] = leftover
    return mixed_systems


def split_system(sys, split_num=100):
    """Cut the first ``split_num`` frames off ``sys``.

    Returns
    -------
    tuple
        ``(head, tail, n_tail)``. When ``sys`` has more than ``split_num``
        frames, ``head`` is the sub-system of the first ``split_num`` frames,
        ``tail`` the sub-system of the remaining ones and ``n_tail`` their
        count. Otherwise ``sys`` itself is returned with ``None`` and ``0``.
    """
    remaining = sys.get_nframes() - split_num
    if remaining > 0:
        head = sys.sub_system(range(split_num))
        tail = sys.sub_system(range(split_num, sys.get_nframes()))
        return head, tail, remaining
    return sys, None, 0
21 changes: 21 additions & 0 deletions dpdata/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,3 +131,24 @@ def to_multi_systems(self, formulas, directory, **kwargs):
raise NotImplementedError(
"%s doesn't support MultiSystems.to" % (self.__class__.__name__)
)

def mix_system(self, *system, type_map, split_num=200, **kwargs):
    """Mix the systems into mixed_type ones according to the unified given type_map.

    This default implementation only raises; it is a hook for subclasses
    that support mixing to override.

    Parameters
    ----------
    *system : System
        The systems to mix
    type_map : list of str
        Maps atom type to name
    split_num : int
        Number of frames in each system

    Returns
    -------
    mixed_systems: dict
        dict of mixed system with key '{atom_numbs}/sys.xxx'

    Raises
    ------
    NotImplementedError
        Always, naming the class that lacks the implementation.
    """
    # The message names this hook; it previously said "System.from".
    raise NotImplementedError(
        "%s doesn't support mix_system" % (self.__class__.__name__)
    )
Loading