diff --git a/README.md b/README.md index 281920ba4..4af23cc0f 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,8 @@ The `System` or `LabeledSystem` can be constructed from the following file forma | deepmd | npy | True | False | System | 'deepmd/npy' | | deepmd | raw | True | True | LabeledSystem | 'deepmd/raw' | | deepmd | npy | True | True | LabeledSystem | 'deepmd/npy' | +| deepmd | npy | True | True | MultiSystems | 'deepmd/npy/mixed' | +| deepmd | npy | True | False | MultiSystems | 'deepmd/npy/mixed' | | gaussian| log | False | True | LabeledSystem | 'gaussian/log'| | gaussian| log | True | True | LabeledSystem | 'gaussian/md' | | siesta | output | False | True | LabeledSystem | 'siesta/output'| @@ -278,6 +280,30 @@ print(syst.get_charge()) # return the total charge of the system If a valence of 3 is detected on carbon, the formal charge will be assigned to -1. Because for most cases (in alkynyl anion, isonitrile, cyclopentadienyl anion), the formal charge on 3-valence carbon is -1, and this is also consisent with the 8-electron rule. +## Mixed Type Format +The format `deepmd/npy/mixed` is the mixed type numpy format for DeePMD-kit, and can be loaded or dumped through class `dpdata.MultiSystems`. + +Under this format, systems with the same number of atoms but different formula can be put together +for a larger system, especially when the frame numbers in systems are sparse. + +This also helps to mixture the type information together for model training with type embedding network. + +Here are examples using `deepmd/npy/mixed` format: + +- Dump a MultiSystems into a mixed type numpy directory: +```python +import dpdata + +dpdata.MultiSystems(*systems).to_deepmd_npy_mixed("mixed_dir") +``` + +- Load a mixed type data into a MultiSystems: +```python +import dpdata + +dpdata.MultiSystems().load_systems_from_file("mixed_dir", fmt="deepmd/npy/mixed") +``` + # Plugins One can follow [a simple example](plugin_example/) to add their own format by creating and installing plugins. It's critical to add the [Format](dpdata/format.py) class to `entry_points['dpdata.plugins']` in [`pyproject.toml`](plugin_example/pyproject.toml): diff --git a/dpdata/deepmd/mixed.py b/dpdata/deepmd/mixed.py new file mode 100644 index 000000000..0e770dc20 --- /dev/null +++ b/dpdata/deepmd/mixed.py @@ -0,0 +1,251 @@ +import glob +import os +import shutil + +import numpy as np + + +def load_type(folder): + data = {} + data["atom_names"] = [] + # if find type_map.raw, use it + assert os.path.isfile( + os.path.join(folder, "type_map.raw") + ), "Mixed type system must have type_map.raw!" + with open(os.path.join(folder, "type_map.raw")) as fp: + data["atom_names"] = fp.read().split() + + return data + + +def formula(atom_names, atom_numbs): + """ + Return the formula of this system, like C3H5O2 + """ + return "".join( + ["{}{}".format(symbol, numb) for symbol, numb in zip(atom_names, atom_numbs)] + ) + + +def _cond_load_data(fname): + tmp = None + if os.path.isfile(fname): + tmp = np.load(fname) + return tmp + + +def _load_set(folder, nopbc: bool): + coords = np.load(os.path.join(folder, "coord.npy")) + if nopbc: + cells = np.zeros((coords.shape[0], 3, 3)) + else: + cells = np.load(os.path.join(folder, "box.npy")) + eners = _cond_load_data(os.path.join(folder, "energy.npy")) + forces = _cond_load_data(os.path.join(folder, "force.npy")) + virs = _cond_load_data(os.path.join(folder, "virial.npy")) + real_atom_types = np.load(os.path.join(folder, "real_atom_types.npy")) + return cells, coords, eners, forces, virs, real_atom_types + + +def to_system_data(folder, type_map=None, labels=True): + # data is empty + data = load_type(folder) + data["orig"] = np.zeros([3]) + if os.path.isfile(os.path.join(folder, "nopbc")): + data["nopbc"] = True + sets = sorted(glob.glob(os.path.join(folder, "set.*"))) + assert len(sets) == 1, "Mixed type must have only one set!" + cells, coords, eners, forces, virs, real_atom_types = _load_set( + sets[0], data.get("nopbc", False) + ) + nframes = np.reshape(cells, [-1, 3, 3]).shape[0] + cells = np.reshape(cells, [nframes, 3, 3]) + coords = np.reshape(coords, [nframes, -1, 3]) + real_atom_types = np.reshape(real_atom_types, [nframes, -1]) + natom = real_atom_types.shape[1] + if labels: + if eners is not None and eners.size > 0: + eners = np.reshape(eners, [nframes]) + if forces is not None and forces.size > 0: + forces = np.reshape(forces, [nframes, -1, 3]) + if virs is not None and virs.size > 0: + virs = np.reshape(virs, [nframes, 3, 3]) + data_list = [] + while True: + if real_atom_types.size == 0: + break + temp_atom_numbs = [ + np.count_nonzero(real_atom_types[0] == i) + for i in range(len(data["atom_names"])) + ] + # temp_formula = formula(data['atom_names'], temp_atom_numbs) + temp_idx = np.arange(real_atom_types.shape[0])[ + (real_atom_types == real_atom_types[0]).all(-1) + ] + rest_idx = np.arange(real_atom_types.shape[0])[ + (real_atom_types != real_atom_types[0]).any(-1) + ] + temp_data = data.copy() + temp_data["atom_numbs"] = temp_atom_numbs + temp_data["atom_types"] = real_atom_types[0] + real_atom_types = real_atom_types[rest_idx] + temp_data["cells"] = cells[temp_idx] + cells = cells[rest_idx] + temp_data["coords"] = coords[temp_idx] + coords = coords[rest_idx] + if labels: + if eners is not None and eners.size > 0: + temp_data["energies"] = eners[temp_idx] + eners = eners[rest_idx] + if forces is not None and forces.size > 0: + temp_data["forces"] = forces[temp_idx] + forces = forces[rest_idx] + if virs is not None and virs.size > 0: + temp_data["virials"] = virs[temp_idx] + virs = virs[rest_idx] + data_list.append(temp_data) + return data_list + + +def dump(folder, data, comp_prec=np.float32, remove_sets=True): + os.makedirs(folder, exist_ok=True) + sets = sorted(glob.glob(os.path.join(folder, "set.*"))) + if len(sets) > 0: + if remove_sets: + for ii in sets: + shutil.rmtree(ii) + else: + raise RuntimeError( + "found " + + str(sets) + + " in " + + folder + + "not a clean deepmd raw dir. please firstly clean set.* then try compress" + ) + # if not converted to mixed + if "real_atom_types" not in data: + from dpdata import LabeledSystem, System + + if "energies" in data: + temp_sys = LabeledSystem(data=data) + else: + temp_sys = System(data=data) + temp_sys.convert_to_mixed_type() + # dump raw + np.savetxt(os.path.join(folder, "type.raw"), data["atom_types"], fmt="%d") + np.savetxt(os.path.join(folder, "type_map.raw"), data["real_atom_names"], fmt="%s") + # BondOrder System + if "bonds" in data: + np.savetxt( + os.path.join(folder, "bonds.raw"), + data["bonds"], + header="begin_atom, end_atom, bond_order", + ) + if "formal_charges" in data: + np.savetxt(os.path.join(folder, "formal_charges.raw"), data["formal_charges"]) + # reshape frame properties and convert prec + nframes = data["cells"].shape[0] + cells = np.reshape(data["cells"], [nframes, 9]).astype(comp_prec) + coords = np.reshape(data["coords"], [nframes, -1]).astype(comp_prec) + eners = None + forces = None + virials = None + real_atom_types = None + if "energies" in data: + eners = np.reshape(data["energies"], [nframes]).astype(comp_prec) + if "forces" in data: + forces = np.reshape(data["forces"], [nframes, -1]).astype(comp_prec) + if "virials" in data: + virials = np.reshape(data["virials"], [nframes, 9]).astype(comp_prec) + if "atom_pref" in data: + atom_pref = np.reshape(data["atom_pref"], [nframes, -1]).astype(comp_prec) + if "real_atom_types" in data: + real_atom_types = np.reshape(data["real_atom_types"], [nframes, -1]).astype( + np.int64 + ) + # dump frame properties: cell, coord, energy, force and virial + set_folder = os.path.join(folder, "set.%03d" % 0) + os.makedirs(set_folder) + np.save(os.path.join(set_folder, "box"), cells) + np.save(os.path.join(set_folder, "coord"), coords) + if eners is not None: + np.save(os.path.join(set_folder, "energy"), eners) + if forces is not None: + np.save(os.path.join(set_folder, "force"), forces) + if virials is not None: + np.save(os.path.join(set_folder, "virial"), virials) + if real_atom_types is not None: + np.save(os.path.join(set_folder, "real_atom_types"), real_atom_types) + if "atom_pref" in data: + np.save(os.path.join(set_folder, "atom_pref"), atom_pref) + try: + os.remove(os.path.join(folder, "nopbc")) + except OSError: + pass + if data.get("nopbc", False): + with open(os.path.join(folder, "nopbc"), "w") as fw_nopbc: + pass + + +def mix_system(*system, type_map, split_num=200, **kwargs): + """Mix the systems into mixed_type ones + + Parameters + ---------- + *system : System + The systems to mix + type_map : list of str + Maps atom type to name + split_num : int + Number of frames in each system + + Returns + ------- + mixed_systems: dict + dict of mixed system with key '{atom_numbs}/sys.xxx' + """ + mixed_systems = {} + temp_systems = {} + atom_numbs_sys_index = {} # index of sys + atom_numbs_frame_index = {} # index of frames in cur sys + for sys in system: + tmp_sys = sys.copy() + natom = tmp_sys.get_natoms() + tmp_sys.convert_to_mixed_type(type_map=type_map) + if str(natom) not in atom_numbs_sys_index: + atom_numbs_sys_index[str(natom)] = 0 + if str(natom) not in atom_numbs_frame_index: + atom_numbs_frame_index[str(natom)] = 0 + atom_numbs_frame_index[str(natom)] += tmp_sys.get_nframes() + if str(natom) not in temp_systems or not temp_systems[str(natom)]: + temp_systems[str(natom)] = tmp_sys + else: + temp_systems[str(natom)].append(tmp_sys) + if atom_numbs_frame_index[str(natom)] >= split_num: + while True: + sys_split, temp_systems[str(natom)], rest_num = split_system( + temp_systems[str(natom)], split_num=split_num + ) + sys_name = ( + f"{str(natom)}/sys." + "%.6d" % atom_numbs_sys_index[str(natom)] + ) + mixed_systems[sys_name] = sys_split + atom_numbs_sys_index[str(natom)] += 1 + if rest_num < split_num: + atom_numbs_frame_index[str(natom)] = rest_num + break + for natom in temp_systems: + if atom_numbs_frame_index[natom] > 0: + sys_name = f"{natom}/sys." + "%.6d" % atom_numbs_sys_index[natom] + mixed_systems[sys_name] = temp_systems[natom] + return mixed_systems + + +def split_system(sys, split_num=100): + rest = sys.get_nframes() - split_num + if rest <= 0: + return sys, None, 0 + else: + split_sys = sys.sub_system(range(split_num)) + rest_sys = sys.sub_system(range(split_num, sys.get_nframes())) + return split_sys, rest_sys, rest diff --git a/dpdata/format.py b/dpdata/format.py index 0ad991d84..b4fc5a8e5 100644 --- a/dpdata/format.py +++ b/dpdata/format.py @@ -131,3 +131,24 @@ def to_multi_systems(self, formulas, directory, **kwargs): raise NotImplementedError( "%s doesn't support MultiSystems.to" % (self.__class__.__name__) ) + + def mix_system(self, *system, type_map, split_num=200, **kwargs): + """Mix the systems into mixed_type ones according to the unified given type_map. + + Parameters + ---------- + *system : System + The systems to mix + type_map : list of str + Maps atom type to name + split_num : int + Number of frames in each system + + Returns + ------- + mixed_systems: dict + dict of mixed system with key '{atom_numbs}/sys.xxx' + """ + raise NotImplementedError( + "%s doesn't support System.from" % (self.__class__.__name__) + ) diff --git a/dpdata/plugins/deepmd.py b/dpdata/plugins/deepmd.py index df56bc492..dcb9d810c 100644 --- a/dpdata/plugins/deepmd.py +++ b/dpdata/plugins/deepmd.py @@ -1,3 +1,4 @@ +import os from typing import List, Optional, Union import h5py @@ -6,6 +7,7 @@ import dpdata import dpdata.deepmd.comp import dpdata.deepmd.hdf5 +import dpdata.deepmd.mixed import dpdata.deepmd.raw from dpdata.driver import Driver from dpdata.format import Format @@ -69,6 +71,108 @@ def from_labeled_system(self, file_name, type_map=None, **kwargs): MultiMode = Format.MultiModes.Directory +@Format.register("deepmd/npy/mixed") +class DeePMDMixedFormat(Format): + """Mixed type numpy format for DeePMD-kit. + Under this format, systems with the same number of atoms but different formula can be put together + for a larger system, especially when the frame numbers in systems are sparse. + This also helps to mixture the type information together for model training with type embedding network. + + Examples + -------- + Dump a MultiSystems into a mixed type numpy directory: + >>> import dpdata + >>> dpdata.MultiSystems(*systems).to_deepmd_npy_mixed("mixed_dir") + + Load a mixed type data into a MultiSystems: + >>> import dpdata + >>> dpdata.MultiSystems().load_systems_from_file("mixed_dir", fmt="deepmd/npy/mixed") + """ + + def from_system_mix(self, file_name, type_map=None, **kwargs): + return dpdata.deepmd.mixed.to_system_data( + file_name, type_map=type_map, labels=False + ) + + def to_system(self, data, file_name, prec=np.float64, **kwargs): + """ + Dump the system in deepmd mixed type format (numpy binary) to `folder`. + + The frames were already split to different systems, so these frames can be dumped to one single subfolders + named as `folder/set.000`, containing less than `set_size` frames. + + Parameters + ---------- + data : dict + System data + file_name : str + The output folder + prec : {numpy.float32, numpy.float64} + The floating point precision of the compressed data + """ + dpdata.deepmd.mixed.dump(file_name, data, comp_prec=prec) + + def from_labeled_system_mix(self, file_name, type_map=None, **kwargs): + return dpdata.deepmd.mixed.to_system_data( + file_name, type_map=type_map, labels=True + ) + + def mix_system(self, *system, type_map, split_num=200, **kwargs): + """Mix the systems into mixed_type ones according to the unified given type_map. + + Parameters + ---------- + *system : System + The systems to mix + type_map : list of str + Maps atom type to name + split_num : int + Number of frames in each system + + Returns + ------- + mixed_systems: dict + dict of mixed system with key '{atom_numbs}/sys.xxx' + """ + return dpdata.deepmd.mixed.mix_system( + *system, type_map=type_map, split_num=split_num, **kwargs + ) + + def from_multi_systems(self, directory, **kwargs): + """MultiSystems.from + + Parameters + ---------- + directory : str + directory of system + + Returns + ------- + filenames: list[str] + list of filenames + """ + if self.MultiMode == self.MultiModes.Directory: + level_1_dir = [ + os.path.join(directory, name) + for name in os.listdir(directory) + if os.path.isdir(os.path.join(directory, name)) + and os.path.isfile(os.path.join(directory, name, "type_map.raw")) + ] + level_2_dir = [ + os.path.join(directory, name1, name2) + for name1 in os.listdir(directory) + for name2 in os.listdir(os.path.join(directory, name1)) + if os.path.isdir(os.path.join(directory, name1)) + and os.path.isdir(os.path.join(directory, name1, name2)) + and os.path.isfile( + os.path.join(directory, name1, name2, "type_map.raw") + ) + ] + return level_1_dir + level_2_dir + + MultiMode = Format.MultiModes.Directory + + @Format.register("deepmd/hdf5") class DeePMDHDF5Format(Format): """HDF5 format for DeePMD-kit. diff --git a/dpdata/system.py b/dpdata/system.py index 017797d75..887aba15f 100644 --- a/dpdata/system.py +++ b/dpdata/system.py @@ -178,6 +178,10 @@ class System(MSONable): DataType("orig", np.ndarray, (3,)), DataType("cells", np.ndarray, (Axis.NFRAMES, 3, 3)), DataType("coords", np.ndarray, (Axis.NFRAMES, Axis.NATOMS, 3)), + DataType( + "real_atom_types", np.ndarray, (Axis.NFRAMES, Axis.NATOMS), required=False + ), + DataType("real_atom_names", list, (Axis.NTYPES,), required=False), DataType("nopbc", bool, required=False), ) @@ -558,6 +562,33 @@ def append(self, system): self.data["nopbc"] = False return True + def convert_to_mixed_type(self, type_map=None): + """ + Convert the data dict to mixed type format structure, in order to append systems + with different formula but the same number of atoms. Change the 'atom_names' to + one placeholder type 'MIXED_TOKEN' and add 'real_atom_types' to store the real type + vectors according to the given type_map. + + Parameters + ---------- + type_map : list + type_map + """ + if "real_atom_types" in self.data.keys(): + return + if type_map is None: + type_map = self.get_atom_names() + type_index = [type_map.index(i) for i in self.data["atom_names"]] + frames = self.get_nframes() + self.data["real_atom_types"] = np.tile( + np.array([type_index[i] for i in self.data["atom_types"]]), [frames, 1] + ) + self.data["real_atom_names"] = type_map + natoms = self.get_natoms() + self.data["atom_types"] = np.zeros((natoms,), dtype=int) + self.data["atom_numbs"] = [natoms] + self.data["atom_names"] = ["MIXED_TOKEN"] + def sort_atom_names(self, type_map=None): """ Sort atom_names of the system and reorder atom_numbs and atom_types accoarding @@ -1261,21 +1292,46 @@ def __init__(self, *systems, type_map=None): self.append(*systems) def from_fmt_obj(self, fmtobj, directory, labeled=True, **kwargs): - for dd in fmtobj.from_multi_systems(directory, **kwargs): - if labeled: - system = LabeledSystem().from_fmt_obj(fmtobj, dd, **kwargs) - else: - system = System().from_fmt_obj(fmtobj, dd, **kwargs) - system.sort_atom_names() - self.append(system) - return self + if not isinstance(fmtobj, dpdata.plugins.deepmd.DeePMDMixedFormat): + for dd in fmtobj.from_multi_systems(directory, **kwargs): + if labeled: + system = LabeledSystem().from_fmt_obj(fmtobj, dd, **kwargs) + else: + system = System().from_fmt_obj(fmtobj, dd, **kwargs) + system.sort_atom_names() + self.append(system) + return self + else: + system_list = [] + for dd in fmtobj.from_multi_systems(directory, **kwargs): + if labeled: + data_list = fmtobj.from_labeled_system_mix(dd, **kwargs) + for data_item in data_list: + system_list.append(LabeledSystem(data=data_item)) + else: + data_list = fmtobj.from_system_mix(dd, **kwargs) + for data_item in data_list: + system_list.append(System(data=data_item)) + return self.__class__( + *system_list, + type_map=kwargs["type_map"] if "type_map" in kwargs else None, + ) def to_fmt_obj(self, fmtobj, directory, *args, **kwargs): - for fn, ss in zip( - fmtobj.to_multi_systems(self.systems.keys(), directory, **kwargs), - self.systems.values(), - ): - ss.to_fmt_obj(fmtobj, fn, *args, **kwargs) + if not isinstance(fmtobj, dpdata.plugins.deepmd.DeePMDMixedFormat): + for fn, ss in zip( + fmtobj.to_multi_systems(self.systems.keys(), directory, **kwargs), + self.systems.values(), + ): + ss.to_fmt_obj(fmtobj, fn, *args, **kwargs) + else: + mixed_systems = fmtobj.mix_system( + *list(self.systems.values()), type_map=self.atom_names, **kwargs + ) + for fn in mixed_systems: + mixed_systems[fn].to_fmt_obj( + fmtobj, os.path.join(directory, fn), *args, **kwargs + ) return self def to(self, fmt: str, *args, **kwargs) -> "MultiSystems": diff --git a/tests/test_deepmd_mixed.py b/tests/test_deepmd_mixed.py new file mode 100644 index 000000000..9e6ee9dd1 --- /dev/null +++ b/tests/test_deepmd_mixed.py @@ -0,0 +1,110 @@ +import os +import shutil +import unittest +from itertools import permutations + +import numpy as np +from comp_sys import CompLabeledSys, CompSys, IsNoPBC, MultiSystems +from context import dpdata + + +class TestMixedMultiSystems(unittest.TestCase, CompLabeledSys, MultiSystems, IsNoPBC): + def setUp(self): + self.places = 6 + self.e_places = 6 + self.f_places = 6 + self.v_places = 6 + + # C1H4 + system_1 = dpdata.LabeledSystem( + "gaussian/methane.gaussianlog", fmt="gaussian/log" + ) + + # C1H3 + system_2 = dpdata.LabeledSystem( + "gaussian/methane_sub.gaussianlog", fmt="gaussian/log" + ) + + tmp_data = system_1.data.copy() + tmp_data["atom_numbs"] = [1, 1, 1, 2] + tmp_data["atom_names"] = ["C", "H", "A", "B"] + tmp_data["atom_types"] = np.array([0, 1, 2, 3, 3]) + # C1H1A1B2 + system_1_modified_type_1 = dpdata.LabeledSystem(data=tmp_data) + + tmp_data = system_1.data.copy() + tmp_data["atom_numbs"] = [1, 1, 2, 1] + tmp_data["atom_names"] = ["C", "H", "A", "B"] + tmp_data["atom_types"] = np.array([0, 1, 2, 2, 3]) + # C1H1A2B1 + system_1_modified_type_2 = dpdata.LabeledSystem(data=tmp_data) + + tmp_data = system_1.data.copy() + tmp_data["atom_numbs"] = [1, 1, 1, 2] + tmp_data["atom_names"] = ["C", "H", "A", "D"] + tmp_data["atom_types"] = np.array([0, 1, 2, 3, 3]) + # C1H1A1C2 + system_1_modified_type_3 = dpdata.LabeledSystem(data=tmp_data) + + self.ms = dpdata.MultiSystems( + system_1, + system_2, + system_1_modified_type_1, + system_1_modified_type_2, + system_1_modified_type_3, + ) + self.ms.to_deepmd_npy_mixed("tmp.deepmd.mixed") + self.place_holder_ms = dpdata.MultiSystems().load_systems_from_file( + "tmp.deepmd.mixed/5", fmt="deepmd/npy" + ) + self.place_holder_ms += dpdata.MultiSystems().load_systems_from_file( + "tmp.deepmd.mixed/4", fmt="deepmd/npy" + ) + self.systems = dpdata.MultiSystems().load_systems_from_file( + "tmp.deepmd.mixed", fmt="deepmd/npy/mixed" + ) + self.system_1 = self.ms["C1H4A0B0D0"] + self.system_2 = self.systems["C1H4A0B0D0"] + + self.system_names = [ + "C1H4A0B0D0", + "C1H3A0B0D0", + "C1H1A1B2D0", + "C1H1A2B1D0", + "C1H1A1B0D2", + ] + self.system_sizes = { + "C1H4A0B0D0": 1, + "C1H3A0B0D0": 1, + "C1H1A1B2D0": 1, + "C1H1A2B1D0": 1, + "C1H1A1B0D2": 1, + } + self.atom_names = ["C", "H", "A", "B", "D"] + + def tearDown(self): + if os.path.exists("tmp.deepmd.mixed"): + shutil.rmtree("tmp.deepmd.mixed") + + def test_len(self): + self.assertEqual(len(self.ms), 5) + self.assertEqual(len(self.place_holder_ms), 2) + self.assertEqual(len(self.systems), 5) + + def test_get_nframes(self): + self.assertEqual(self.ms.get_nframes(), 5) + self.assertEqual(self.place_holder_ms.get_nframes(), 5) + self.assertEqual(self.systems.get_nframes(), 5) + + def test_str(self): + self.assertEqual(str(self.ms), "MultiSystems (5 systems containing 5 frames)") + self.assertEqual( + str(self.place_holder_ms), "MultiSystems (2 systems containing 5 frames)" + ) + self.assertEqual( + str(self.systems), "MultiSystems (5 systems containing 5 frames)" + ) + + +if __name__ == "__main__": + unittest.main()