From 099e1db8e0df8e69eb29160ed8ece3d12b4f086d Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng
Date: Thu, 7 Oct 2021 21:51:26 -0400
Subject: [PATCH 1/3] add deepmd/hdf5 format

To support deepmodeling/deepmd-kit#1163.

Review fixes folded into this series (hunk line counts are unchanged):
- dump() writes the "nopbc" marker with data=True; a positional True is
  taken by h5py's create_dataset as the dataset shape.
- The plugin class is named DeePMDHDF5Format to match the format it
  registers.
- The tests pass comp_prec, the keyword to_system() accepts; prec was
  swallowed by **kwargs and ignored.
- The per-set group variable no longer shadows the builtin set.
---
 dpdata/deepmd/hdf5.py     | 148 ++++++++++++++++++++++++++++++++++++++
 dpdata/plugins/deepmd.py  |  40 +++++++++++
 setup.py                  |   2 +-
 tests/test_deepmd_hdf5.py |  45 ++++++++++++
 4 files changed, 234 insertions(+), 1 deletion(-)
 create mode 100644 dpdata/deepmd/hdf5.py
 create mode 100644 tests/test_deepmd_hdf5.py

diff --git a/dpdata/deepmd/hdf5.py b/dpdata/deepmd/hdf5.py
new file mode 100644
index 000000000..b569a9d09
--- /dev/null
+++ b/dpdata/deepmd/hdf5.py
@@ -0,0 +1,148 @@
+"""Utils for deepmd/hdf5 format."""
+import h5py
+import numpy as np
+
+from wcmatch.glob import globfilter
+
+
+__all__ = ['to_system_data', 'dump']
+
+def to_system_data(f: h5py.File,
+                   folder: str,
+                   type_map: list = None,
+                   labels: bool = True) :
+    """Load a HDF5 file.
+
+    Parameters
+    ----------
+    f : h5py.File
+        HDF5 file object
+    folder : str
+        path in the HDF5 file
+    type_map : list
+        type map
+    labels : bool
+        labels
+    """
+    g = f[folder] if folder else f
+
+    data = {}
+    data['atom_types'] = g['type.raw'][:]
+    ntypes = np.max(data['atom_types']) + 1
+    natoms = data['atom_types'].size
+    data['atom_numbs'] = []
+    for ii in range (ntypes) :
+        data['atom_numbs'].append(np.count_nonzero(data['atom_types'] == ii))
+    data['atom_names'] = []
+    # if find type_map.raw, use it
+    if 'type_map.raw' in g.keys():
+        my_type_map = list(np.char.decode(g['type_map.raw'][:]))
+    # else try to use arg type_map
+    elif type_map is not None:
+        my_type_map = type_map
+    # in the last case, make artificial atom names
+    else:
+        my_type_map = []
+        for ii in range(ntypes) :
+            my_type_map.append('Type_%d' % ii)
+    assert(len(my_type_map) >= len(data['atom_numbs']))
+    for ii in range(len(data['atom_numbs'])) :
+        data['atom_names'].append(my_type_map[ii])
+
+    data['orig'] = np.zeros([3])
+    if 'nopbc' in g.keys():
+        data['nopbc'] = True
+    sets = globfilter(g.keys(), 'set.*')
+
+    data_types = {
+        'cells': {'labeled': False, 'shape': (3,3), 'required': 'nopbc' not in data},
+        'coords': {'labeled': False, 'shape': (natoms,3), 'required': True},
+        'energies': {'labeled': True, 'shape': tuple(), 'required': False},
+        'forces': {'labeled': True, 'shape': (natoms,3), 'required': False},
+        'virials': {'labeled': True, 'shape': (3,3), 'required': False},
+    }
+
+    for dt, prop in data_types.items():
+        all_data = []
+
+        for ii in sets:
+            set_group = g[ii]
+            fn = '%s.npy' % dt
+            if fn in set_group.keys():
+                dd = set_group[fn][:]
+                nframes = dd.shape[0]
+                all_data.append(np.reshape(dd, (nframes, *prop['shape'])))
+            elif prop['required']:
+                raise RuntimeError("%s/%s/%s not found" % (folder, ii, fn))
+
+        if len(all_data) > 0 :
+            data[dt] = np.concatenate(all_data, axis = 0)
+    return data
+
+def dump(f: h5py.File,
+         folder: str,
+         data: dict,
+         set_size = 5000,
+         comp_prec = np.float32,
+         ) -> None:
+    """Dump data to a HDF5 file.
+
+    Parameters
+    ----------
+    f : h5py.File
+        HDF5 file object
+    folder : str
+        path in the HDF5 file
+    data : dict
+        System or LabeledSystem data
+    set_size : int, default: 5000
+        size of a set
+    comp_prec : np.dtype, default: np.float32
+        precision of data
+    """
+    data_types = ('cells', 'coords', 'energies', 'forces', 'virials')
+
+    # if folder is None, use the root of the file
+    if folder:
+        if folder in f:
+            del f[folder]
+        g = f.create_group(folder)
+    else:
+        g = f
+    # dump raw (array in fact)
+    g.create_dataset('type.raw', data=data['atom_types'])
+    g.create_dataset('type_map.raw', data=np.array(data['atom_names'], dtype='S'))
+    # BondOrder System
+    if "bonds" in data:
+        g.create_dataset("bonds.raw", data=data['bonds'])
+    if "formal_charges" in data:
+        g.create_dataset("formal_charges.raw", data=data['formal_charges'])
+    # reshape frame properties and convert prec
+    nframes = data['cells'].shape[0]
+
+    nopbc = data.get("nopbc", False)
+    reshaped_data = {}
+    for dt in data_types:
+        if dt in data:
+            if dt == 'energies':
+                reshaped_data[dt] = np.reshape(data[dt], (nframes,)).astype(comp_prec)
+            elif nopbc and dt == 'cells':
+                # skip dump cells since deepmd-kit v2.0.2 does not need cells for a nopbc system
+                pass
+            else:
+                reshaped_data[dt] = np.reshape(data[dt], (nframes, -1)).astype(comp_prec)
+
+    # dump frame properties: cell, coord, energy, force and virial
+    nsets = nframes // set_size
+    if set_size * nsets < nframes :
+        nsets += 1
+    for ii in range(nsets) :
+        set_stt = ii * set_size
+        set_end = (ii+1) * set_size
+        set_folder = g.create_group('set.%03d' % ii)
+        for dt in data_types:
+            if dt in reshaped_data:
+                set_folder.create_dataset('%s.npy' % dt, data=reshaped_data[dt][set_stt:set_end])
+
+    if nopbc:
+        g.create_dataset("nopbc", data=True)
diff --git a/dpdata/plugins/deepmd.py b/dpdata/plugins/deepmd.py
index c2636e993..44014543b 100644
--- a/dpdata/plugins/deepmd.py
+++ b/dpdata/plugins/deepmd.py
@@ -1,6 +1,8 @@
 import dpdata.deepmd.raw
 import dpdata.deepmd.comp
+import dpdata.deepmd.hdf5
 import numpy as np
+import h5py
 from dpdata.format import Format
 
 
@@ -54,3 +56,41 @@ def from_labeled_system(self, file_name, type_map=None, **kwargs):
         return dpdata.deepmd.comp.to_system_data(file_name, type_map=type_map, labels=True)
 
     MultiMode = Format.MultiModes.Directory
+
+@Format.register("deepmd/hdf5")
+class DeePMDHDF5Format(Format):
+    def from_system(self, file_name, type_map=None, **kwargs):
+        s = file_name.split("#")
+        name = s[1] if len(s) > 1 else ""
+        with h5py.File(s[0], 'r') as f:
+            return dpdata.deepmd.hdf5.to_system_data(f, name, type_map=type_map, labels=False)
+
+    def from_labeled_system(self, file_name, type_map=None, **kwargs):
+        s = file_name.split("#")
+        name = s[1] if len(s) > 1 else ""
+        with h5py.File(s[0], 'r') as f:
+            return dpdata.deepmd.hdf5.to_system_data(f, name, type_map=type_map, labels=True)
+
+    def to_system(self,
+                  data : dict,
+                  file_name : str,
+                  set_size : int = 5000,
+                  comp_prec : np.dtype = np.float32,
+                  **kwargs):
+        s = file_name.split("#")
+        name = s[1] if len(s) > 1 else ""
+        mode = 'a' if name else 'w'
+        with h5py.File(s[0], mode) as f:
+            dpdata.deepmd.hdf5.dump(f, name, data, set_size = set_size, comp_prec = comp_prec)
+
+    def from_multi_systems(self,
+                           directory,
+                           **kwargs):
+        with h5py.File(directory, 'r') as f:
+            return ["%s#%s" % (directory, ff) for ff in f.keys()]
+
+    def to_multi_systems(self,
+                         formulas,
+                         directory,
+                         **kwargs):
+        return ["%s#%s" % (directory, ff) for ff in formulas]
diff --git a/setup.py b/setup.py
index c0ded7819..99cac1219 100644
--- a/setup.py
+++ b/setup.py
@@ -12,7 +12,7 @@
     readme = f.read()
 
 # install_requires = ['xml']
-install_requires=['numpy>=1.14.3', 'monty', 'scipy']
+install_requires=['numpy>=1.14.3', 'monty', 'scipy', 'h5py', 'wcmatch']
 
 setuptools.setup(
     name="dpdata",
diff --git a/tests/test_deepmd_hdf5.py b/tests/test_deepmd_hdf5.py
new file mode 100644
index 000000000..3c15b45e8
--- /dev/null
+++ b/tests/test_deepmd_hdf5.py
@@ -0,0 +1,45 @@
+import os
+import numpy as np
+import unittest
+from context import dpdata
+from comp_sys import CompLabeledSys, CompSys, IsPBC
+
+class TestDeepmdLoadDumpComp(unittest.TestCase, CompLabeledSys, IsPBC):
+    def setUp (self) :
+        self.system_1 = dpdata.LabeledSystem('poscars/OUTCAR.h2o.md',
+                                             fmt = 'vasp/outcar')
+        self.system_1.to_deepmd_hdf5('tmp.deepmd.hdf5',
+                                     comp_prec = np.float64,
+                                     set_size = 2)
+
+        self.system_2 = dpdata.LabeledSystem('tmp.deepmd.hdf5',
+                                             fmt = 'deepmd/hdf5',
+                                             type_map = ['O', 'H'])
+        self.places = 6
+        self.e_places = 6
+        self.f_places = 6
+        self.v_places = 6
+
+    def tearDown(self) :
+        if os.path.exists('tmp.deepmd.hdf5'):
+            os.remove('tmp.deepmd.hdf5')
+
+
+class TestDeepmdCompNoLabels(unittest.TestCase, CompSys, IsPBC) :
+    def setUp (self) :
+        self.system_1 = dpdata.System('poscars/POSCAR.h2o.md',
+                                      fmt = 'vasp/poscar')
+        self.system_1.to_deepmd_hdf5('tmp.deepmd.hdf5',
+                                     comp_prec = np.float64,
+                                     set_size = 2)
+        self.system_2 = dpdata.System('tmp.deepmd.hdf5',
+                                      fmt = 'deepmd/hdf5',
+                                      type_map = ['O', 'H'])
+        self.places = 6
+        self.e_places = 6
+        self.f_places = 6
+        self.v_places = 6
+
+    def tearDown(self) :
+        if os.path.exists('tmp.deepmd.hdf5'):
+            os.remove('tmp.deepmd.hdf5')

From 00e3af10e29bdb18a5cfea80021fb3bbf6caf5d3 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng
Date: Thu, 7 Oct 2021 22:05:08 -0400
Subject: [PATCH 2/3] bugfix

---
 dpdata/deepmd/hdf5.py | 37 +++++++++++++++++++------------------
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/dpdata/deepmd/hdf5.py b/dpdata/deepmd/hdf5.py
index b569a9d09..ea373ae49 100644
--- a/dpdata/deepmd/hdf5.py
+++ b/dpdata/deepmd/hdf5.py
@@ -55,11 +55,11 @@ def to_system_data(f: h5py.File,
     sets = globfilter(g.keys(), 'set.*')
 
     data_types = {
-        'cells': {'labeled': False, 'shape': (3,3), 'required': 'nopbc' not in data},
-        'coords': {'labeled': False, 'shape': (natoms,3), 'required': True},
-        'energies': {'labeled': True, 'shape': tuple(), 'required': False},
-        'forces': {'labeled': True, 'shape': (natoms,3), 'required': False},
-        'virials': {'labeled': True, 'shape': (3,3), 'required': False},
+        'cells': {'fn': 'box', 'labeled': False, 'shape': (3,3), 'required': 'nopbc' not in data},
+        'coords': {'fn': 'coord', 'labeled': False, 'shape': (natoms,3), 'required': True},
+        'energies': {'fn': 'energy', 'labeled': True, 'shape': tuple(), 'required': False},
+        'forces': {'fn': 'force', 'labeled': True, 'shape': (natoms,3), 'required': False},
+        'virials': {'fn': 'virial', 'labeled': True, 'shape': (3,3), 'required': False},
     }
 
     for dt, prop in data_types.items():
@@ -67,7 +67,7 @@ def to_system_data(f: h5py.File,
 
         for ii in sets:
             set_group = g[ii]
-            fn = '%s.npy' % dt
+            fn = '%s.npy' % prop['fn']
             if fn in set_group.keys():
                 dd = set_group[fn][:]
                 nframes = dd.shape[0]
@@ -100,8 +100,6 @@ def dump(f: h5py.File,
     comp_prec : np.dtype, default: np.float32
         precision of data
     """
-    data_types = ('cells', 'coords', 'energies', 'forces', 'virials')
-
     # if folder is None, use the root of the file
     if folder:
         if folder in f:
@@ -122,15 +120,18 @@ def dump(f: h5py.File,
 
     nopbc = data.get("nopbc", False)
     reshaped_data = {}
-    for dt in data_types:
+
+    data_types = {
+        'cells': {'fn': 'box', 'shape': (nframes, 9), 'dump': not nopbc},
+        'coords': {'fn': 'coord', 'shape': (nframes, -1), 'dump': True},
+        'energies': {'fn': 'energy', 'shape': (nframes,), 'dump': True},
+        'forces': {'fn': 'force', 'shape': (nframes, -1), 'dump': True},
+        'virials': {'fn': 'virial', 'shape': (nframes, 9), 'dump': True},
+    }
+    for dt, prop in data_types.items():
         if dt in data:
-            if dt == 'energies':
-                reshaped_data[dt] = np.reshape(data[dt], (nframes,)).astype(comp_prec)
-            elif nopbc and dt == 'cells':
-                # skip dump cells since deepmd-kit v2.0.2 does not need cells for a nopbc system
-                pass
-            else:
-                reshaped_data[dt] = np.reshape(data[dt], (nframes, -1)).astype(comp_prec)
+            if prop['dump']:
+                reshaped_data[dt] = np.reshape(data[dt], prop['shape']).astype(comp_prec)
 
     # dump frame properties: cell, coord, energy, force and virial
     nsets = nframes // set_size
@@ -140,9 +141,9 @@ def dump(f: h5py.File,
         set_stt = ii * set_size
         set_end = (ii+1) * set_size
         set_folder = g.create_group('set.%03d' % ii)
-        for dt in data_types:
+        for dt, prop in data_types.items():
             if dt in reshaped_data:
-                set_folder.create_dataset('%s.npy' % dt, data=reshaped_data[dt][set_stt:set_end])
+                set_folder.create_dataset('%s.npy' % prop['fn'], data=reshaped_data[dt][set_stt:set_end])
 
     if nopbc:
         g.create_dataset("nopbc", data=True)

From eb14fd32001b2c385f98681cabe8e194a712c8d7 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng
Date: Thu, 7 Oct 2021 22:08:05 -0400
Subject: [PATCH 3/3] add an example

---
 dpdata/plugins/deepmd.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/dpdata/plugins/deepmd.py b/dpdata/plugins/deepmd.py
index 44014543b..6fd209a3f 100644
--- a/dpdata/plugins/deepmd.py
+++ b/dpdata/plugins/deepmd.py
@@ -59,6 +59,14 @@ def from_labeled_system(self, file_name, type_map=None, **kwargs):
 
 @Format.register("deepmd/hdf5")
 class DeePMDHDF5Format(Format):
+    """HDF5 format for DeePMD-kit.
+
+    Examples
+    --------
+    Dump a MultiSystems to a HDF5 file:
+    >>> import dpdata
+    >>> dpdata.MultiSystems().from_deepmd_npy("data").to_deepmd_hdf5("data.hdf5")
+    """
     def from_system(self, file_name, type_map=None, **kwargs):
         s = file_name.split("#")
         name = s[1] if len(s) > 1 else ""