From 98a7dec2771bb9986cb914ef2f01c202d746b20e Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Sun, 19 Feb 2023 23:54:21 +0800 Subject: [PATCH 1/8] Add support for 'deepmd/mixed' format with dpdata.MultiSystems --- dpdata/deepmd/mixed.py | 241 +++++++++++++++++++++++++++++++++++++ dpdata/format.py | 21 ++++ dpdata/plugins/deepmd.py | 82 +++++++++++++ dpdata/system.py | 71 +++++++++-- tests/test_deepmd_mixed.py | 46 +++++++ 5 files changed, 448 insertions(+), 13 deletions(-) create mode 100644 dpdata/deepmd/mixed.py create mode 100644 tests/test_deepmd_mixed.py diff --git a/dpdata/deepmd/mixed.py b/dpdata/deepmd/mixed.py new file mode 100644 index 000000000..50be0a645 --- /dev/null +++ b/dpdata/deepmd/mixed.py @@ -0,0 +1,241 @@ +import glob +import os +import shutil + +import numpy as np + + +def load_type(folder): + data = {} + data["atom_names"] = [] + # if find type_map.raw, use it + assert os.path.isfile(os.path.join(folder, "type_map.raw")), "Mixed type system must have type_map.raw!" 
+ with open(os.path.join(folder, "type_map.raw")) as fp: + data["atom_names"] = fp.read().split() + + return data + + +def formula(atom_names, atom_numbs): + """ + Return the formula of this system, like C3H5O2 + """ + return "".join( + [ + "{}{}".format(symbol, numb) + for symbol, numb in zip( + atom_names, atom_numbs + ) + ] + ) + + +def _cond_load_data(fname): + tmp = None + if os.path.isfile(fname): + tmp = np.load(fname) + return tmp + + +def _load_set(folder, nopbc: bool): + coords = np.load(os.path.join(folder, "coord.npy")) + if nopbc: + cells = np.zeros((coords.shape[0], 3, 3)) + else: + cells = np.load(os.path.join(folder, "box.npy")) + eners = _cond_load_data(os.path.join(folder, "energy.npy")) + forces = _cond_load_data(os.path.join(folder, "force.npy")) + virs = _cond_load_data(os.path.join(folder, "virial.npy")) + real_atom_types = np.load(os.path.join(folder, "real_atom_types.npy")) + return cells, coords, eners, forces, virs, real_atom_types + + +def to_system_data(folder, type_map=None, labels=True): + # data is empty + data = load_type(folder) + data["orig"] = np.zeros([3]) + if os.path.isfile(os.path.join(folder, "nopbc")): + data["nopbc"] = True + sets = sorted(glob.glob(os.path.join(folder, "set.*"))) + assert len(sets) == 1, 'Mixed type must have only one set!' 
+ cells, coords, eners, forces, virs, real_atom_types = _load_set(sets[0], data.get("nopbc", False)) + nframes = np.reshape(cells, [-1, 3, 3]).shape[0] + cells = np.reshape(cells, [nframes, 3, 3]) + coords = np.reshape(coords, [nframes, -1, 3]) + real_atom_types = np.reshape(real_atom_types, [nframes, -1]) + natom = real_atom_types.shape[1] + if labels: + if eners is not None and eners.size > 0: + eners = np.reshape(eners, [nframes]) + if forces is not None and forces.size > 0: + forces = np.reshape(forces, [nframes, -1, 3]) + if virs is not None and virs.size > 0: + virs = np.reshape(virs, [nframes, 3, 3]) + data_list = [] + while True: + if real_atom_types.size == 0: + break + temp_atom_numbs = [np.count_nonzero(real_atom_types[0] == i) for i in range(len(data['atom_names']))] + # temp_formula = formula(data['atom_names'], temp_atom_numbs) + temp_idx = np.arange(real_atom_types.shape[0])[(real_atom_types == real_atom_types[0]).all(-1)] + rest_idx = np.arange(real_atom_types.shape[0])[(real_atom_types != real_atom_types[0]).any(-1)] + temp_data = data.copy() + temp_data["atom_numbs"] = temp_atom_numbs + temp_data["atom_types"] = real_atom_types[0] + real_atom_types = real_atom_types[rest_idx] + temp_data["cells"] = cells[temp_idx] + cells = cells[rest_idx] + temp_data["coords"] = coords[temp_idx] + coords = coords[rest_idx] + if labels: + if eners is not None and eners.size > 0: + temp_data["energies"] = eners[temp_idx] + eners = eners[rest_idx] + if forces is not None and forces.size > 0: + temp_data["forces"] = forces[temp_idx] + forces = forces[rest_idx] + if virs is not None and virs.size > 0: + temp_data["virials"] = virs[temp_idx] + virs = virs[rest_idx] + data_list.append(temp_data) + return data_list + + +def dump(folder, data, set_size=5000, comp_prec=np.float32, remove_sets=True): + os.makedirs(folder, exist_ok=True) + sets = sorted(glob.glob(os.path.join(folder, "set.*"))) + if len(sets) > 0: + if remove_sets: + for ii in sets: + shutil.rmtree(ii) + 
else: + raise RuntimeError( + "found " + + str(sets) + + " in " + + folder + + "not a clean deepmd raw dir. please firstly clean set.* then try compress" + ) + # if not converted to mixed + if 'real_atom_types' not in data: + from dpdata import LabeledSystem, System + if 'energies' in data: + temp_sys = LabeledSystem(data=data) + else: + temp_sys = System(data=data) + temp_sys.convert_to_mixed_type() + # dump raw + np.savetxt(os.path.join(folder, "type.raw"), data["atom_types"], fmt="%d") + np.savetxt(os.path.join(folder, "type_map.raw"), data["real_atom_names"], fmt="%s") + # BondOrder System + if "bonds" in data: + np.savetxt( + os.path.join(folder, "bonds.raw"), + data["bonds"], + header="begin_atom, end_atom, bond_order", + ) + if "formal_charges" in data: + np.savetxt(os.path.join(folder, "formal_charges.raw"), data["formal_charges"]) + # reshape frame properties and convert prec + nframes = data["cells"].shape[0] + cells = np.reshape(data["cells"], [nframes, 9]).astype(comp_prec) + coords = np.reshape(data["coords"], [nframes, -1]).astype(comp_prec) + eners = None + forces = None + virials = None + real_atom_types = None + if "energies" in data: + eners = np.reshape(data["energies"], [nframes]).astype(comp_prec) + if "forces" in data: + forces = np.reshape(data["forces"], [nframes, -1]).astype(comp_prec) + if "virials" in data: + virials = np.reshape(data["virials"], [nframes, 9]).astype(comp_prec) + if "atom_pref" in data: + atom_pref = np.reshape(data["atom_pref"], [nframes, -1]).astype(comp_prec) + if "real_atom_types" in data: + real_atom_types = np.reshape(data["real_atom_types"], [nframes, -1]).astype(np.int64) + # dump frame properties: cell, coord, energy, force and virial + assert nframes <= set_size, \ + "Can not put more than {} frames into one mixed_type systems with one set! " \ + "Please split them into different systems." 
+ set_folder = os.path.join(folder, "set.%03d" % 0) + os.makedirs(set_folder) + np.save(os.path.join(set_folder, "box"), cells) + np.save(os.path.join(set_folder, "coord"), coords) + if eners is not None: + np.save(os.path.join(set_folder, "energy"), eners) + if forces is not None: + np.save(os.path.join(set_folder, "force"), forces) + if virials is not None: + np.save(os.path.join(set_folder, "virial"), virials) + if real_atom_types is not None: + np.save(os.path.join(set_folder, "real_atom_types"), real_atom_types) + if "atom_pref" in data: + np.save(os.path.join(set_folder, "atom_pref"), atom_pref) + try: + os.remove(os.path.join(folder, "nopbc")) + except OSError: + pass + if data.get("nopbc", False): + with open(os.path.join(folder, "nopbc"), "w") as fw_nopbc: + pass + + +def mix_system(*system, type_map, split_num=100, **kwargs): + """Mix the systems into mixed_type ones + + Parameters + ---------- + *system : System + The systems to mix + type_map : list of str + Maps atom type to name + split_num : int + Number of frames in each system + + Returns + ------- + mixed_systems: dict + dict of mixed system with key '{atom_numbs}/sys.xxx' + """ + mixed_systems = {} + temp_systems = {} + atom_numbs_sys_index = {} # index of sys + atom_numbs_frame_index = {} # index of frames in cur sys + for sys in system: + tmp_sys = sys.copy() + natom = tmp_sys.get_natoms() + tmp_sys.convert_to_mixed_type(type_map=type_map) + if str(natom) not in atom_numbs_sys_index: + atom_numbs_sys_index[str(natom)] = 0 + if str(natom) not in atom_numbs_frame_index: + atom_numbs_frame_index[str(natom)] = 0 + atom_numbs_frame_index[str(natom)] += tmp_sys.get_nframes() + if str(natom) not in temp_systems or not temp_systems[str(natom)]: + temp_systems[str(natom)] = tmp_sys + else: + temp_systems[str(natom)].append(tmp_sys) + if atom_numbs_frame_index[str(natom)] >= split_num: + while True: + sys_split, temp_systems[str(natom)], rest_num = split_system(temp_systems[str(natom)], 
split_num=split_num) + sys_name = f'{str(natom)}/sys.' + "%.6d" % atom_numbs_sys_index[str(natom)] + mixed_systems[sys_name] = sys_split + atom_numbs_sys_index[str(natom)] += 1 + if rest_num < split_num: + atom_numbs_frame_index[str(natom)] = rest_num + break + for natom in temp_systems: + if atom_numbs_frame_index[natom] > 0: + sys_name = f'{natom}/sys.' + "%.6d" % atom_numbs_sys_index[natom] + mixed_systems[sys_name] = temp_systems[natom] + return mixed_systems + + +def split_system(sys, split_num=100): + rest = sys.get_nframes() - split_num + if rest <= 0: + return sys, None, 0 + else: + split_sys = sys.sub_system(range(split_num)) + rest_sys = sys.sub_system(range(split_num, sys.get_nframes())) + return split_sys, rest_sys, rest diff --git a/dpdata/format.py b/dpdata/format.py index 0ad991d84..d9d45549e 100644 --- a/dpdata/format.py +++ b/dpdata/format.py @@ -131,3 +131,24 @@ def to_multi_systems(self, formulas, directory, **kwargs): raise NotImplementedError( "%s doesn't support MultiSystems.to" % (self.__class__.__name__) ) + + def mix_system(self, *system, type_map, split_num=100, **kwargs): + """Mix the systems into mixed_type ones according to the unified given type_map. 
+ + Parameters + ---------- + *system : System + The systems to mix + type_map : list of str + Maps atom type to name + split_num : int + Number of frames in each system + + Returns + ------- + mixed_systems: dict + dict of mixed system with key '{atom_numbs}/sys.xxx' + """ + raise NotImplementedError( + "%s doesn't support System.from" % (self.__class__.__name__) + ) diff --git a/dpdata/plugins/deepmd.py b/dpdata/plugins/deepmd.py index df56bc492..73920afd9 100644 --- a/dpdata/plugins/deepmd.py +++ b/dpdata/plugins/deepmd.py @@ -7,6 +7,8 @@ import dpdata.deepmd.comp import dpdata.deepmd.hdf5 import dpdata.deepmd.raw +import dpdata.deepmd.mixed +import os from dpdata.driver import Driver from dpdata.format import Format @@ -69,6 +71,86 @@ def from_labeled_system(self, file_name, type_map=None, **kwargs): MultiMode = Format.MultiModes.Directory +@Format.register("deepmd/mixed") +class DeePMDMixedFormat(Format): + def from_system_mix(self, file_name, type_map=None, **kwargs): + return dpdata.deepmd.mixed.to_system_data( + file_name, type_map=type_map, labels=False + ) + + def to_system(self, data, file_name, set_size=200, prec=np.float64, **kwargs): + """ + Dump the system in deepmd mixed type format (numpy binary) to `folder`. + + The frames were already split to different systems, so these frames can be dumped to one single subfolders + named as `folder/set.000`, containing less than `set_size` frames. + + Parameters + ---------- + data : dict + System data + file_name : str + The output folder + set_size : int + The max size of set. 
+ prec : {numpy.float32, numpy.float64} + The floating point precision of the compressed data + """ + dpdata.deepmd.mixed.dump(file_name, data, set_size=set_size, comp_prec=prec) + + def from_labeled_system_mix(self, file_name, type_map=None, **kwargs): + return dpdata.deepmd.mixed.to_system_data( + file_name, type_map=type_map, labels=True + ) + + def mix_system(self, *system, **kwargs): + """Mix the systems into mixed_type ones + + Parameters + ---------- + file_name : str + file name + + Returns + ------- + data: dict + system data + """ + return dpdata.deepmd.mixed.mix_system(*system, **kwargs) + + def from_multi_systems(self, directory, **kwargs): + """MultiSystems.from + + Parameters + ---------- + directory : str + directory of system + + Returns + ------- + filenames: list[str] + list of filenames + """ + if self.MultiMode == self.MultiModes.Directory: + level_1_dir = [ + os.path.join(directory, name) + for name in os.listdir(directory) + if os.path.isdir(os.path.join(directory, name)) and + os.path.isfile(os.path.join(directory, name, 'type_map.raw')) + ] + level_2_dir = [ + os.path.join(directory, name1, name2) + for name1 in os.listdir(directory) + for name2 in os.listdir(os.path.join(directory, name1)) + if os.path.isdir(os.path.join(directory, name1)) and + os.path.isdir(os.path.join(directory, name1, name2)) and + os.path.isfile(os.path.join(directory, name1, name2, 'type_map.raw')) + ] + return level_1_dir + level_2_dir + + MultiMode = Format.MultiModes.Directory + + @Format.register("deepmd/hdf5") class DeePMDHDF5Format(Format): """HDF5 format for DeePMD-kit. 
diff --git a/dpdata/system.py b/dpdata/system.py index 017797d75..c009530a2 100644 --- a/dpdata/system.py +++ b/dpdata/system.py @@ -178,6 +178,8 @@ class System(MSONable): DataType("orig", np.ndarray, (3,)), DataType("cells", np.ndarray, (Axis.NFRAMES, 3, 3)), DataType("coords", np.ndarray, (Axis.NFRAMES, Axis.NATOMS, 3)), + DataType("real_atom_types", np.ndarray, (Axis.NFRAMES, Axis.NATOMS), required=False), + DataType("real_atom_names", list, (Axis.NTYPES,), required=False), DataType("nopbc", bool, required=False), ) @@ -558,6 +560,31 @@ def append(self, system): self.data["nopbc"] = False return True + def convert_to_mixed_type(self, type_map=None): + """ + Convert the data dict to mixed type format structure, in order to append systems + with different formula but the same number of atoms. Change the 'atom_names' to + one placeholder type 'MIXED_TOKEN' and add 'real_atom_types' to store the real type + vectors according to the given type_map. + + Parameters + ---------- + type_map : list + type_map + """ + if 'real_atom_types' in self.data.keys(): + return + if type_map is None: + type_map = self.get_atom_names() + type_index = [type_map.index(i) for i in self.data['atom_names']] + frames = self.get_nframes() + self.data['real_atom_types'] = np.tile(np.array([type_index[i] for i in self.data['atom_types']]), [frames, 1]) + self.data['real_atom_names'] = type_map + natoms = self.get_natoms() + self.data['atom_types'] = np.array([0 for i in range(natoms)]) + self.data['atom_numbs'] = [natoms] + self.data['atom_names'] = ['MIXED_TOKEN'] + def sort_atom_names(self, type_map=None): """ Sort atom_names of the system and reorder atom_numbs and atom_types accoarding @@ -1261,21 +1288,39 @@ def __init__(self, *systems, type_map=None): self.append(*systems) def from_fmt_obj(self, fmtobj, directory, labeled=True, **kwargs): - for dd in fmtobj.from_multi_systems(directory, **kwargs): - if labeled: - system = LabeledSystem().from_fmt_obj(fmtobj, dd, **kwargs) - else: - 
system = System().from_fmt_obj(fmtobj, dd, **kwargs) - system.sort_atom_names() - self.append(system) - return self + if not isinstance(fmtobj, dpdata.plugins.deepmd.DeePMDMixedFormat): + for dd in fmtobj.from_multi_systems(directory, **kwargs): + if labeled: + system = LabeledSystem().from_fmt_obj(fmtobj, dd, **kwargs) + else: + system = System().from_fmt_obj(fmtobj, dd, **kwargs) + system.sort_atom_names() + self.append(system) + return self + else: + system_list = [] + for dd in fmtobj.from_multi_systems(directory, **kwargs): + if labeled: + data_list = fmtobj.from_labeled_system_mix(dd, **kwargs) + for data_item in data_list: + system_list.append(LabeledSystem(data=data_item)) + else: + data_list = fmtobj.from_system_mix(dd, **kwargs) + for data_item in data_list: + system_list.append(System(data=data_item)) + return self.__class__(*system_list, type_map=kwargs['type_map'] if 'type_map' in kwargs else None) def to_fmt_obj(self, fmtobj, directory, *args, **kwargs): - for fn, ss in zip( - fmtobj.to_multi_systems(self.systems.keys(), directory, **kwargs), - self.systems.values(), - ): - ss.to_fmt_obj(fmtobj, fn, *args, **kwargs) + if not isinstance(fmtobj, dpdata.plugins.deepmd.DeePMDMixedFormat): + for fn, ss in zip( + fmtobj.to_multi_systems(self.systems.keys(), directory, **kwargs), + self.systems.values(), + ): + ss.to_fmt_obj(fmtobj, fn, *args, **kwargs) + else: + mixed_systems = fmtobj.mix_system(*list(self.systems.values()), type_map=self.atom_names, **kwargs) + for fn in mixed_systems: + mixed_systems[fn].to_fmt_obj(fmtobj, os.path.join(directory, fn), *args, **kwargs) return self def to(self, fmt: str, *args, **kwargs) -> "MultiSystems": diff --git a/tests/test_deepmd_mixed.py b/tests/test_deepmd_mixed.py new file mode 100644 index 000000000..743b4c4bd --- /dev/null +++ b/tests/test_deepmd_mixed.py @@ -0,0 +1,46 @@ +import os +import shutil +import unittest +from itertools import permutations + +import numpy as np +from comp_sys import CompLabeledSys, 
CompSys, IsNoPBC, MultiSystems +from context import dpdata + + +class TestMixedMultiSystems(unittest.TestCase, CompLabeledSys, IsNoPBC): + def setUp(self): + self.places = 6 + self.e_places = 6 + self.f_places = 6 + self.v_places = 6 + + system_1 = dpdata.LabeledSystem( + "gaussian/methane.gaussianlog", fmt="gaussian/log" + ) + system_2 = dpdata.LabeledSystem( + "gaussian/methane_reordered.gaussianlog", fmt="gaussian/log" + ) + system_3 = dpdata.LabeledSystem( + "gaussian/methane_sub.gaussianlog", fmt="gaussian/log" + ) + system_4 = dpdata.LabeledSystem( + "gaussian/noncoveraged.gaussianlog", fmt="gaussian/log" + ) + + self.systems = dpdata.MultiSystems(system_1, system_2, system_3, system_4) + self.systems.to_deepmd_mixed('tmp.deepmd.mixed') + mixms = dpdata.MultiSystems().load_systems_from_file('tmp.deepmd.mixed', fmt='deepmd/mixed') + self.system_1 = self.systems["C1H3"] + self.system_2 = mixms["C1H3"] + self.places = 6 + self.e_places = 6 + self.f_places = 6 + + def tearDown(self): + if os.path.exists("tmp.deepmd.npy"): + shutil.rmtree("tmp.deepmd.npy") + + +if __name__ == "__main__": + unittest.main() From 68058c84c6c96d4d8ba38aa9336c0f7a413d5127 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 19 Feb 2023 16:01:23 +0000 Subject: [PATCH 2/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dpdata/deepmd/mixed.py | 54 ++++++++++++++++++++++++-------------- dpdata/plugins/deepmd.py | 16 ++++++----- dpdata/system.py | 33 +++++++++++++++-------- tests/test_deepmd_mixed.py | 6 +++-- 4 files changed, 69 insertions(+), 40 deletions(-) diff --git a/dpdata/deepmd/mixed.py b/dpdata/deepmd/mixed.py index 50be0a645..491b32ae5 100644 --- a/dpdata/deepmd/mixed.py +++ b/dpdata/deepmd/mixed.py @@ -9,7 +9,9 @@ def load_type(folder): data = {} data["atom_names"] = [] # if find type_map.raw, use it - assert os.path.isfile(os.path.join(folder, 
"type_map.raw")), "Mixed type system must have type_map.raw!" + assert os.path.isfile( + os.path.join(folder, "type_map.raw") + ), "Mixed type system must have type_map.raw!" with open(os.path.join(folder, "type_map.raw")) as fp: data["atom_names"] = fp.read().split() @@ -21,12 +23,7 @@ def formula(atom_names, atom_numbs): Return the formula of this system, like C3H5O2 """ return "".join( - [ - "{}{}".format(symbol, numb) - for symbol, numb in zip( - atom_names, atom_numbs - ) - ] + ["{}{}".format(symbol, numb) for symbol, numb in zip(atom_names, atom_numbs)] ) @@ -57,8 +54,10 @@ def to_system_data(folder, type_map=None, labels=True): if os.path.isfile(os.path.join(folder, "nopbc")): data["nopbc"] = True sets = sorted(glob.glob(os.path.join(folder, "set.*"))) - assert len(sets) == 1, 'Mixed type must have only one set!' - cells, coords, eners, forces, virs, real_atom_types = _load_set(sets[0], data.get("nopbc", False)) + assert len(sets) == 1, "Mixed type must have only one set!" + cells, coords, eners, forces, virs, real_atom_types = _load_set( + sets[0], data.get("nopbc", False) + ) nframes = np.reshape(cells, [-1, 3, 3]).shape[0] cells = np.reshape(cells, [nframes, 3, 3]) coords = np.reshape(coords, [nframes, -1, 3]) @@ -75,10 +74,17 @@ def to_system_data(folder, type_map=None, labels=True): while True: if real_atom_types.size == 0: break - temp_atom_numbs = [np.count_nonzero(real_atom_types[0] == i) for i in range(len(data['atom_names']))] + temp_atom_numbs = [ + np.count_nonzero(real_atom_types[0] == i) + for i in range(len(data["atom_names"])) + ] # temp_formula = formula(data['atom_names'], temp_atom_numbs) - temp_idx = np.arange(real_atom_types.shape[0])[(real_atom_types == real_atom_types[0]).all(-1)] - rest_idx = np.arange(real_atom_types.shape[0])[(real_atom_types != real_atom_types[0]).any(-1)] + temp_idx = np.arange(real_atom_types.shape[0])[ + (real_atom_types == real_atom_types[0]).all(-1) + ] + rest_idx = np.arange(real_atom_types.shape[0])[ + 
(real_atom_types != real_atom_types[0]).any(-1) + ] temp_data = data.copy() temp_data["atom_numbs"] = temp_atom_numbs temp_data["atom_types"] = real_atom_types[0] @@ -117,9 +123,10 @@ def dump(folder, data, set_size=5000, comp_prec=np.float32, remove_sets=True): + "not a clean deepmd raw dir. please firstly clean set.* then try compress" ) # if not converted to mixed - if 'real_atom_types' not in data: + if "real_atom_types" not in data: from dpdata import LabeledSystem, System - if 'energies' in data: + + if "energies" in data: temp_sys = LabeledSystem(data=data) else: temp_sys = System(data=data) @@ -153,11 +160,14 @@ def dump(folder, data, set_size=5000, comp_prec=np.float32, remove_sets=True): if "atom_pref" in data: atom_pref = np.reshape(data["atom_pref"], [nframes, -1]).astype(comp_prec) if "real_atom_types" in data: - real_atom_types = np.reshape(data["real_atom_types"], [nframes, -1]).astype(np.int64) + real_atom_types = np.reshape(data["real_atom_types"], [nframes, -1]).astype( + np.int64 + ) # dump frame properties: cell, coord, energy, force and virial - assert nframes <= set_size, \ - "Can not put more than {} frames into one mixed_type systems with one set! " \ + assert nframes <= set_size, ( + "Can not put more than {} frames into one mixed_type systems with one set! " "Please split them into different systems." + ) set_folder = os.path.join(folder, "set.%03d" % 0) os.makedirs(set_folder) np.save(os.path.join(set_folder, "box"), cells) @@ -217,8 +227,12 @@ def mix_system(*system, type_map, split_num=100, **kwargs): temp_systems[str(natom)].append(tmp_sys) if atom_numbs_frame_index[str(natom)] >= split_num: while True: - sys_split, temp_systems[str(natom)], rest_num = split_system(temp_systems[str(natom)], split_num=split_num) - sys_name = f'{str(natom)}/sys.' 
+ "%.6d" % atom_numbs_sys_index[str(natom)] + sys_split, temp_systems[str(natom)], rest_num = split_system( + temp_systems[str(natom)], split_num=split_num + ) + sys_name = ( + f"{str(natom)}/sys." + "%.6d" % atom_numbs_sys_index[str(natom)] + ) mixed_systems[sys_name] = sys_split atom_numbs_sys_index[str(natom)] += 1 if rest_num < split_num: @@ -226,7 +240,7 @@ def mix_system(*system, type_map, split_num=100, **kwargs): break for natom in temp_systems: if atom_numbs_frame_index[natom] > 0: - sys_name = f'{natom}/sys.' + "%.6d" % atom_numbs_sys_index[natom] + sys_name = f"{natom}/sys." + "%.6d" % atom_numbs_sys_index[natom] mixed_systems[sys_name] = temp_systems[natom] return mixed_systems diff --git a/dpdata/plugins/deepmd.py b/dpdata/plugins/deepmd.py index 73920afd9..c6e1966f4 100644 --- a/dpdata/plugins/deepmd.py +++ b/dpdata/plugins/deepmd.py @@ -1,3 +1,4 @@ +import os from typing import List, Optional, Union import h5py @@ -6,9 +7,8 @@ import dpdata import dpdata.deepmd.comp import dpdata.deepmd.hdf5 -import dpdata.deepmd.raw import dpdata.deepmd.mixed -import os +import dpdata.deepmd.raw from dpdata.driver import Driver from dpdata.format import Format @@ -135,16 +135,18 @@ def from_multi_systems(self, directory, **kwargs): level_1_dir = [ os.path.join(directory, name) for name in os.listdir(directory) - if os.path.isdir(os.path.join(directory, name)) and - os.path.isfile(os.path.join(directory, name, 'type_map.raw')) + if os.path.isdir(os.path.join(directory, name)) + and os.path.isfile(os.path.join(directory, name, "type_map.raw")) ] level_2_dir = [ os.path.join(directory, name1, name2) for name1 in os.listdir(directory) for name2 in os.listdir(os.path.join(directory, name1)) - if os.path.isdir(os.path.join(directory, name1)) and - os.path.isdir(os.path.join(directory, name1, name2)) and - os.path.isfile(os.path.join(directory, name1, name2, 'type_map.raw')) + if os.path.isdir(os.path.join(directory, name1)) + and os.path.isdir(os.path.join(directory, 
name1, name2)) + and os.path.isfile( + os.path.join(directory, name1, name2, "type_map.raw") + ) ] return level_1_dir + level_2_dir diff --git a/dpdata/system.py b/dpdata/system.py index c009530a2..b59b44a14 100644 --- a/dpdata/system.py +++ b/dpdata/system.py @@ -178,7 +178,9 @@ class System(MSONable): DataType("orig", np.ndarray, (3,)), DataType("cells", np.ndarray, (Axis.NFRAMES, 3, 3)), DataType("coords", np.ndarray, (Axis.NFRAMES, Axis.NATOMS, 3)), - DataType("real_atom_types", np.ndarray, (Axis.NFRAMES, Axis.NATOMS), required=False), + DataType( + "real_atom_types", np.ndarray, (Axis.NFRAMES, Axis.NATOMS), required=False + ), DataType("real_atom_names", list, (Axis.NTYPES,), required=False), DataType("nopbc", bool, required=False), ) @@ -572,18 +574,20 @@ def convert_to_mixed_type(self, type_map=None): type_map : list type_map """ - if 'real_atom_types' in self.data.keys(): + if "real_atom_types" in self.data.keys(): return if type_map is None: type_map = self.get_atom_names() - type_index = [type_map.index(i) for i in self.data['atom_names']] + type_index = [type_map.index(i) for i in self.data["atom_names"]] frames = self.get_nframes() - self.data['real_atom_types'] = np.tile(np.array([type_index[i] for i in self.data['atom_types']]), [frames, 1]) - self.data['real_atom_names'] = type_map + self.data["real_atom_types"] = np.tile( + np.array([type_index[i] for i in self.data["atom_types"]]), [frames, 1] + ) + self.data["real_atom_names"] = type_map natoms = self.get_natoms() - self.data['atom_types'] = np.array([0 for i in range(natoms)]) - self.data['atom_numbs'] = [natoms] - self.data['atom_names'] = ['MIXED_TOKEN'] + self.data["atom_types"] = np.array([0 for i in range(natoms)]) + self.data["atom_numbs"] = [natoms] + self.data["atom_names"] = ["MIXED_TOKEN"] def sort_atom_names(self, type_map=None): """ @@ -1308,7 +1312,10 @@ def from_fmt_obj(self, fmtobj, directory, labeled=True, **kwargs): data_list = fmtobj.from_system_mix(dd, **kwargs) for data_item 
in data_list: system_list.append(System(data=data_item)) - return self.__class__(*system_list, type_map=kwargs['type_map'] if 'type_map' in kwargs else None) + return self.__class__( + *system_list, + type_map=kwargs["type_map"] if "type_map" in kwargs else None, + ) def to_fmt_obj(self, fmtobj, directory, *args, **kwargs): if not isinstance(fmtobj, dpdata.plugins.deepmd.DeePMDMixedFormat): @@ -1318,9 +1325,13 @@ def to_fmt_obj(self, fmtobj, directory, *args, **kwargs): ): ss.to_fmt_obj(fmtobj, fn, *args, **kwargs) else: - mixed_systems = fmtobj.mix_system(*list(self.systems.values()), type_map=self.atom_names, **kwargs) + mixed_systems = fmtobj.mix_system( + *list(self.systems.values()), type_map=self.atom_names, **kwargs + ) for fn in mixed_systems: - mixed_systems[fn].to_fmt_obj(fmtobj, os.path.join(directory, fn), *args, **kwargs) + mixed_systems[fn].to_fmt_obj( + fmtobj, os.path.join(directory, fn), *args, **kwargs + ) return self def to(self, fmt: str, *args, **kwargs) -> "MultiSystems": diff --git a/tests/test_deepmd_mixed.py b/tests/test_deepmd_mixed.py index 743b4c4bd..fac858d46 100644 --- a/tests/test_deepmd_mixed.py +++ b/tests/test_deepmd_mixed.py @@ -29,8 +29,10 @@ def setUp(self): ) self.systems = dpdata.MultiSystems(system_1, system_2, system_3, system_4) - self.systems.to_deepmd_mixed('tmp.deepmd.mixed') - mixms = dpdata.MultiSystems().load_systems_from_file('tmp.deepmd.mixed', fmt='deepmd/mixed') + self.systems.to_deepmd_mixed("tmp.deepmd.mixed") + mixms = dpdata.MultiSystems().load_systems_from_file( + "tmp.deepmd.mixed", fmt="deepmd/mixed" + ) self.system_1 = self.systems["C1H3"] self.system_2 = mixms["C1H3"] self.places = 6 From 6737b732e18b67c6aecbcf4a56bf188caf1253a1 Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Tue, 21 Feb 2023 15:25:29 +0800 Subject: [PATCH 3/8] Update the uts; change format name to 'deepmd/npy/mixed'; remove 'set_size' in dump func. 
--- dpdata/deepmd/mixed.py | 8 +--- dpdata/format.py | 2 +- dpdata/plugins/deepmd.py | 26 +++++------ dpdata/system.py | 2 +- tests/test_deepmd_mixed.py | 88 ++++++++++++++++++++++++++++++-------- 5 files changed, 88 insertions(+), 38 deletions(-) diff --git a/dpdata/deepmd/mixed.py b/dpdata/deepmd/mixed.py index 491b32ae5..0e770dc20 100644 --- a/dpdata/deepmd/mixed.py +++ b/dpdata/deepmd/mixed.py @@ -107,7 +107,7 @@ def to_system_data(folder, type_map=None, labels=True): return data_list -def dump(folder, data, set_size=5000, comp_prec=np.float32, remove_sets=True): +def dump(folder, data, comp_prec=np.float32, remove_sets=True): os.makedirs(folder, exist_ok=True) sets = sorted(glob.glob(os.path.join(folder, "set.*"))) if len(sets) > 0: @@ -164,10 +164,6 @@ def dump(folder, data, set_size=5000, comp_prec=np.float32, remove_sets=True): np.int64 ) # dump frame properties: cell, coord, energy, force and virial - assert nframes <= set_size, ( - "Can not put more than {} frames into one mixed_type systems with one set! " - "Please split them into different systems." - ) set_folder = os.path.join(folder, "set.%03d" % 0) os.makedirs(set_folder) np.save(os.path.join(set_folder, "box"), cells) @@ -191,7 +187,7 @@ def dump(folder, data, set_size=5000, comp_prec=np.float32, remove_sets=True): pass -def mix_system(*system, type_map, split_num=100, **kwargs): +def mix_system(*system, type_map, split_num=200, **kwargs): """Mix the systems into mixed_type ones Parameters diff --git a/dpdata/format.py b/dpdata/format.py index d9d45549e..b4fc5a8e5 100644 --- a/dpdata/format.py +++ b/dpdata/format.py @@ -132,7 +132,7 @@ def to_multi_systems(self, formulas, directory, **kwargs): "%s doesn't support MultiSystems.to" % (self.__class__.__name__) ) - def mix_system(self, *system, type_map, split_num=100, **kwargs): + def mix_system(self, *system, type_map, split_num=200, **kwargs): """Mix the systems into mixed_type ones according to the unified given type_map. 
Parameters diff --git a/dpdata/plugins/deepmd.py b/dpdata/plugins/deepmd.py index c6e1966f4..c8d80cbb0 100644 --- a/dpdata/plugins/deepmd.py +++ b/dpdata/plugins/deepmd.py @@ -71,14 +71,14 @@ def from_labeled_system(self, file_name, type_map=None, **kwargs): MultiMode = Format.MultiModes.Directory -@Format.register("deepmd/mixed") +@Format.register("deepmd/npy/mixed") class DeePMDMixedFormat(Format): def from_system_mix(self, file_name, type_map=None, **kwargs): return dpdata.deepmd.mixed.to_system_data( file_name, type_map=type_map, labels=False ) - def to_system(self, data, file_name, set_size=200, prec=np.float64, **kwargs): + def to_system(self, data, file_name, prec=np.float64, **kwargs): """ Dump the system in deepmd mixed type format (numpy binary) to `folder`. @@ -91,32 +91,34 @@ def to_system(self, data, file_name, set_size=200, prec=np.float64, **kwargs): System data file_name : str The output folder - set_size : int - The max size of set. prec : {numpy.float32, numpy.float64} The floating point precision of the compressed data """ - dpdata.deepmd.mixed.dump(file_name, data, set_size=set_size, comp_prec=prec) + dpdata.deepmd.mixed.dump(file_name, data, comp_prec=prec) def from_labeled_system_mix(self, file_name, type_map=None, **kwargs): return dpdata.deepmd.mixed.to_system_data( file_name, type_map=type_map, labels=True ) - def mix_system(self, *system, **kwargs): - """Mix the systems into mixed_type ones + def mix_system(self, *system, type_map, split_num=200, **kwargs): + """Mix the systems into mixed_type ones according to the unified given type_map. 
Parameters ---------- - file_name : str - file name + *system : System + The systems to mix + type_map : list of str + Maps atom type to name + split_num : int + Number of frames in each system Returns ------- - data: dict - system data + mixed_systems: dict + dict of mixed system with key '{atom_numbs}/sys.xxx' """ - return dpdata.deepmd.mixed.mix_system(*system, **kwargs) + return dpdata.deepmd.mixed.mix_system(*system, type_map=type_map, split_num=split_num, **kwargs) def from_multi_systems(self, directory, **kwargs): """MultiSystems.from diff --git a/dpdata/system.py b/dpdata/system.py index b59b44a14..887aba15f 100644 --- a/dpdata/system.py +++ b/dpdata/system.py @@ -585,7 +585,7 @@ def convert_to_mixed_type(self, type_map=None): ) self.data["real_atom_names"] = type_map natoms = self.get_natoms() - self.data["atom_types"] = np.array([0 for i in range(natoms)]) + self.data["atom_types"] = np.zeros((natoms,), dtype=int) self.data["atom_numbs"] = [natoms] self.data["atom_names"] = ["MIXED_TOKEN"] diff --git a/tests/test_deepmd_mixed.py b/tests/test_deepmd_mixed.py index fac858d46..bd7daf227 100644 --- a/tests/test_deepmd_mixed.py +++ b/tests/test_deepmd_mixed.py @@ -8,40 +8,92 @@ from context import dpdata -class TestMixedMultiSystems(unittest.TestCase, CompLabeledSys, IsNoPBC): +class TestMixedMultiSystems(unittest.TestCase, CompLabeledSys, MultiSystems, IsNoPBC): def setUp(self): self.places = 6 self.e_places = 6 self.f_places = 6 self.v_places = 6 + # C1H4 system_1 = dpdata.LabeledSystem( "gaussian/methane.gaussianlog", fmt="gaussian/log" ) + + # C1H3 system_2 = dpdata.LabeledSystem( - "gaussian/methane_reordered.gaussianlog", fmt="gaussian/log" - ) - system_3 = dpdata.LabeledSystem( "gaussian/methane_sub.gaussianlog", fmt="gaussian/log" ) - system_4 = dpdata.LabeledSystem( - "gaussian/noncoveraged.gaussianlog", fmt="gaussian/log" - ) - self.systems = dpdata.MultiSystems(system_1, system_2, system_3, system_4) - 
self.systems.to_deepmd_mixed("tmp.deepmd.mixed") - mixms = dpdata.MultiSystems().load_systems_from_file( - "tmp.deepmd.mixed", fmt="deepmd/mixed" + tmp_data = system_1.data.copy() + tmp_data['atom_numbs'] = [1, 1, 1, 2] + tmp_data['atom_names'] = ['C', 'H', 'A', 'B'] + tmp_data['atom_types'] = np.array([0, 1, 2, 3, 3]) + # C1H1A1B2 + system_1_modified_type_1 = dpdata.LabeledSystem(data=tmp_data) + + tmp_data = system_1.data.copy() + tmp_data['atom_numbs'] = [1, 1, 2, 1] + tmp_data['atom_names'] = ['C', 'H', 'A', 'B'] + tmp_data['atom_types'] = np.array([0, 1, 2, 2, 3]) + # C1H1A2B1 + system_1_modified_type_2 = dpdata.LabeledSystem(data=tmp_data) + + tmp_data = system_1.data.copy() + tmp_data['atom_numbs'] = [1, 1, 1, 2] + tmp_data['atom_names'] = ['C', 'H', 'A', 'D'] + tmp_data['atom_types'] = np.array([0, 1, 2, 3, 3]) + # C1H1A1C2 + system_1_modified_type_3 = dpdata.LabeledSystem(data=tmp_data) + + self.ms = dpdata.MultiSystems( + system_1, + system_2, + system_1_modified_type_1, + system_1_modified_type_2, + system_1_modified_type_3, ) - self.system_1 = self.systems["C1H3"] - self.system_2 = mixms["C1H3"] - self.places = 6 - self.e_places = 6 - self.f_places = 6 + self.ms.to_deepmd_npy_mixed("tmp.deepmd.mixed") + self.place_holder_ms = dpdata.MultiSystems().load_systems_from_file( + "tmp.deepmd.mixed/5", fmt="deepmd/npy" + ) + self.place_holder_ms += dpdata.MultiSystems().load_systems_from_file( + "tmp.deepmd.mixed/4", fmt="deepmd/npy" + ) + self.systems = dpdata.MultiSystems().load_systems_from_file( + "tmp.deepmd.mixed", fmt="deepmd/npy/mixed" + ) + self.system_1 = self.ms["C1H4A0B0D0"] + self.system_2 = self.systems["C1H4A0B0D0"] + + self.system_names = ['C1H4A0B0D0', 'C1H3A0B0D0', 'C1H1A1B2D0', 'C1H1A2B1D0', 'C1H1A1B0D2'] + self.system_sizes = {'C1H4A0B0D0': 1, 'C1H3A0B0D0': 1, 'C1H1A1B2D0': 1, 'C1H1A2B1D0': 1, 'C1H1A1B0D2': 1} + self.atom_names = ["C", "H", "A", "B", "D"] def tearDown(self): - if os.path.exists("tmp.deepmd.npy"): - 
shutil.rmtree("tmp.deepmd.npy") + if os.path.exists("tmp.deepmd.mixed"): + shutil.rmtree("tmp.deepmd.mixed") + + def test_len(self): + self.assertEqual(len(self.ms), 5) + self.assertEqual(len(self.place_holder_ms), 2) + self.assertEqual(len(self.systems), 5) + + def test_get_nframes(self): + self.assertEqual(self.ms.get_nframes(), 5) + self.assertEqual(self.place_holder_ms.get_nframes(), 5) + self.assertEqual(self.systems.get_nframes(), 5) + + def test_str(self): + self.assertEqual( + str(self.ms), "MultiSystems (5 systems containing 5 frames)" + ) + self.assertEqual( + str(self.place_holder_ms), "MultiSystems (2 systems containing 5 frames)" + ) + self.assertEqual( + str(self.systems), "MultiSystems (5 systems containing 5 frames)" + ) if __name__ == "__main__": From 254b3d2ea6ef6de2bfb0a01f01e82c69011dd026 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 21 Feb 2023 07:25:47 +0000 Subject: [PATCH 4/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dpdata/plugins/deepmd.py | 4 +++- tests/test_deepmd_mixed.py | 38 ++++++++++++++++++++++++-------------- 2 files changed, 27 insertions(+), 15 deletions(-) diff --git a/dpdata/plugins/deepmd.py b/dpdata/plugins/deepmd.py index c8d80cbb0..64cc845ef 100644 --- a/dpdata/plugins/deepmd.py +++ b/dpdata/plugins/deepmd.py @@ -118,7 +118,9 @@ def mix_system(self, *system, type_map, split_num=200, **kwargs): mixed_systems: dict dict of mixed system with key '{atom_numbs}/sys.xxx' """ - return dpdata.deepmd.mixed.mix_system(*system, type_map=type_map, split_num=split_num, **kwargs) + return dpdata.deepmd.mixed.mix_system( + *system, type_map=type_map, split_num=split_num, **kwargs + ) def from_multi_systems(self, directory, **kwargs): """MultiSystems.from diff --git a/tests/test_deepmd_mixed.py b/tests/test_deepmd_mixed.py index bd7daf227..9e6ee9dd1 100644 --- a/tests/test_deepmd_mixed.py +++ 
b/tests/test_deepmd_mixed.py @@ -26,23 +26,23 @@ def setUp(self): ) tmp_data = system_1.data.copy() - tmp_data['atom_numbs'] = [1, 1, 1, 2] - tmp_data['atom_names'] = ['C', 'H', 'A', 'B'] - tmp_data['atom_types'] = np.array([0, 1, 2, 3, 3]) + tmp_data["atom_numbs"] = [1, 1, 1, 2] + tmp_data["atom_names"] = ["C", "H", "A", "B"] + tmp_data["atom_types"] = np.array([0, 1, 2, 3, 3]) # C1H1A1B2 system_1_modified_type_1 = dpdata.LabeledSystem(data=tmp_data) tmp_data = system_1.data.copy() - tmp_data['atom_numbs'] = [1, 1, 2, 1] - tmp_data['atom_names'] = ['C', 'H', 'A', 'B'] - tmp_data['atom_types'] = np.array([0, 1, 2, 2, 3]) + tmp_data["atom_numbs"] = [1, 1, 2, 1] + tmp_data["atom_names"] = ["C", "H", "A", "B"] + tmp_data["atom_types"] = np.array([0, 1, 2, 2, 3]) # C1H1A2B1 system_1_modified_type_2 = dpdata.LabeledSystem(data=tmp_data) tmp_data = system_1.data.copy() - tmp_data['atom_numbs'] = [1, 1, 1, 2] - tmp_data['atom_names'] = ['C', 'H', 'A', 'D'] - tmp_data['atom_types'] = np.array([0, 1, 2, 3, 3]) + tmp_data["atom_numbs"] = [1, 1, 1, 2] + tmp_data["atom_names"] = ["C", "H", "A", "D"] + tmp_data["atom_types"] = np.array([0, 1, 2, 3, 3]) # C1H1A1C2 system_1_modified_type_3 = dpdata.LabeledSystem(data=tmp_data) @@ -66,8 +66,20 @@ def setUp(self): self.system_1 = self.ms["C1H4A0B0D0"] self.system_2 = self.systems["C1H4A0B0D0"] - self.system_names = ['C1H4A0B0D0', 'C1H3A0B0D0', 'C1H1A1B2D0', 'C1H1A2B1D0', 'C1H1A1B0D2'] - self.system_sizes = {'C1H4A0B0D0': 1, 'C1H3A0B0D0': 1, 'C1H1A1B2D0': 1, 'C1H1A2B1D0': 1, 'C1H1A1B0D2': 1} + self.system_names = [ + "C1H4A0B0D0", + "C1H3A0B0D0", + "C1H1A1B2D0", + "C1H1A2B1D0", + "C1H1A1B0D2", + ] + self.system_sizes = { + "C1H4A0B0D0": 1, + "C1H3A0B0D0": 1, + "C1H1A1B2D0": 1, + "C1H1A2B1D0": 1, + "C1H1A1B0D2": 1, + } self.atom_names = ["C", "H", "A", "B", "D"] def tearDown(self): @@ -85,9 +97,7 @@ def test_get_nframes(self): self.assertEqual(self.systems.get_nframes(), 5) def test_str(self): - self.assertEqual( - str(self.ms), 
"MultiSystems (5 systems containing 5 frames)" - ) + self.assertEqual(str(self.ms), "MultiSystems (5 systems containing 5 frames)") self.assertEqual( str(self.place_holder_ms), "MultiSystems (2 systems containing 5 frames)" ) From ddbee97d99bd86023ef43c4fbc01cf4479bac925 Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Thu, 23 Feb 2023 02:29:36 +0800 Subject: [PATCH 5/8] Add docs for deepmd/npy/mixed format --- dpdata/plugins/deepmd.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/dpdata/plugins/deepmd.py b/dpdata/plugins/deepmd.py index 64cc845ef..a4b20fd36 100644 --- a/dpdata/plugins/deepmd.py +++ b/dpdata/plugins/deepmd.py @@ -73,6 +73,21 @@ def from_labeled_system(self, file_name, type_map=None, **kwargs): @Format.register("deepmd/npy/mixed") class DeePMDMixedFormat(Format): + """Mixed type numpy format for DeePMD-kit. + Under this format, systems with the same number of atoms but different formula can be put together + for a larger system, especially when the frame numbers in systems are sparse. + This also helps to mixture the type information together for model training with type embedding network. 
+ + Examples + -------- + Dump a MultiSystems into a mixed type numpy directory: + >>> import dpdata + >>> dpdata.MultiSystems(*systems).to_deepmd_npy_mixed("mixed_dir") + + Load a mixed type data into a MultiSystems: + >>> import dpdata + >>> dpdata.MultiSystems().load_systems_from_file("mixed_dir", fmt="deepmd/npy/mixed") + """ def from_system_mix(self, file_name, type_map=None, **kwargs): return dpdata.deepmd.mixed.to_system_data( file_name, type_map=type_map, labels=False From b4aab8a9f0baf4eed0471c0c2e9a054e0fdf1fe7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 22 Feb 2023 18:29:53 +0000 Subject: [PATCH 6/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- dpdata/plugins/deepmd.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dpdata/plugins/deepmd.py b/dpdata/plugins/deepmd.py index a4b20fd36..dcb9d810c 100644 --- a/dpdata/plugins/deepmd.py +++ b/dpdata/plugins/deepmd.py @@ -88,6 +88,7 @@ class DeePMDMixedFormat(Format): >>> import dpdata >>> dpdata.MultiSystems().load_systems_from_file("mixed_dir", fmt="deepmd/npy/mixed") """ + def from_system_mix(self, file_name, type_map=None, **kwargs): return dpdata.deepmd.mixed.to_system_data( file_name, type_map=type_map, labels=False From a0951a0363921d86a60f9735202d40a2be9dbeb5 Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Thu, 23 Feb 2023 10:09:50 +0800 Subject: [PATCH 7/8] Add docs in README for deepmd/npy/mixed --- README.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/README.md b/README.md index 281920ba4..077f6a88d 100644 --- a/README.md +++ b/README.md @@ -62,6 +62,8 @@ The `System` or `LabeledSystem` can be constructed from the following file forma | deepmd | npy | True | False | System | 'deepmd/npy' | | deepmd | raw | True | True | LabeledSystem | 'deepmd/raw' | | deepmd | npy | True | True | LabeledSystem | 
'deepmd/npy' | +| deepmd | npy | True | True | MultiSystems | 'deepmd/npy/mixed' | +| deepmd | npy | True | False | MultiSystems | 'deepmd/npy/mixed' | | gaussian| log | False | True | LabeledSystem | 'gaussian/log'| | gaussian| log | True | True | LabeledSystem | 'gaussian/md' | | siesta | output | False | True | LabeledSystem | 'siesta/output'| @@ -278,6 +280,28 @@ print(syst.get_charge()) # return the total charge of the system If a valence of 3 is detected on carbon, the formal charge will be assigned to -1. Because for most cases (in alkynyl anion, isonitrile, cyclopentadienyl anion), the formal charge on 3-valence carbon is -1, and this is also consisent with the 8-electron rule. +## Mixed Type Format +The format `deepmd/npy/mixed` is the mixed type numpy format for DeePMD-kit, and can be loaded or dumped through class `dpdata.MultiSystems`. + +Under this format, systems with the same number of atoms but different formulas can be put together +for a larger system, especially when the frame numbers in systems are sparse. + +This also helps to mix the type information together for model training with type embedding network. + +Here are examples using `deepmd/npy/mixed` format: + +- Dump a MultiSystems into a mixed type numpy directory: +```python +import dpdata +dpdata.MultiSystems(*systems).to_deepmd_npy_mixed("mixed_dir") +``` + +- Load a mixed type data into a MultiSystems: +```python +import dpdata +dpdata.MultiSystems().load_systems_from_file("mixed_dir", fmt="deepmd/npy/mixed") +``` + # Plugins One can follow [a simple example](plugin_example/) to add their own format by creating and installing plugins.
It's critical to add the [Format](dpdata/format.py) class to `entry_points['dpdata.plugins']` in [`pyproject.toml`](plugin_example/pyproject.toml): From 17edd353c38c63bc64992f55cbe267895c80598b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 23 Feb 2023 02:10:06 +0000 Subject: [PATCH 8/8] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 077f6a88d..4af23cc0f 100644 --- a/README.md +++ b/README.md @@ -293,12 +293,14 @@ Here are examples using `deepmd/npy/mixed` format: - Dump a MultiSystems into a mixed type numpy directory: ```python import dpdata + dpdata.MultiSystems(*systems).to_deepmd_npy_mixed("mixed_dir") ``` - Load a mixed type data into a MultiSystems: ```python import dpdata + dpdata.MultiSystems().load_systems_from_file("mixed_dir", fmt="deepmd/npy/mixed") ```