diff --git a/dpdata/deepmd/hdf5.py b/dpdata/deepmd/hdf5.py
new file mode 100644
index 000000000..ea373ae49
--- /dev/null
+++ b/dpdata/deepmd/hdf5.py
@@ -0,0 +1,158 @@
+"""Utils for deepmd/hdf5 format."""
+import h5py
+import numpy as np
+
+from wcmatch.glob import globfilter
+
+
+__all__ = ['to_system_data', 'dump']
+
+def to_system_data(f: h5py.File,
+                   folder: str,
+                   type_map: list = None,
+                   labels: bool = True) :
+    """Load a HDF5 file.
+
+    Parameters
+    ----------
+    f : h5py.File
+        HDF5 file object
+    folder : str
+        path in the HDF5 file; if empty, the root of the file is used
+    type_map : list
+        type map, used only if the group has no type_map.raw dataset
+    labels : bool
+        if False, labeled properties (energies, forces, virials) are skipped
+
+    Returns
+    -------
+    dict
+        system data
+    """
+    g = f[folder] if folder else f
+
+    data = {}
+    data['atom_types'] = g['type.raw'][:]
+    ntypes = np.max(data['atom_types']) + 1
+    natoms = data['atom_types'].size
+    data['atom_numbs'] = []
+    for ii in range (ntypes) :
+        data['atom_numbs'].append(np.count_nonzero(data['atom_types'] == ii))
+    data['atom_names'] = []
+    # if find type_map.raw, use it
+    if 'type_map.raw' in g.keys():
+        my_type_map = list(np.char.decode(g['type_map.raw'][:]))
+    # else try to use arg type_map
+    elif type_map is not None:
+        my_type_map = type_map
+    # in the last case, make artificial atom names
+    else:
+        my_type_map = []
+        for ii in range(ntypes) :
+            my_type_map.append('Type_%d' % ii)
+    assert(len(my_type_map) >= len(data['atom_numbs']))
+    for ii in range(len(data['atom_numbs'])) :
+        data['atom_names'].append(my_type_map[ii])
+
+    data['orig'] = np.zeros([3])
+    if 'nopbc' in g.keys():
+        data['nopbc'] = True
+    sets = globfilter(g.keys(), 'set.*')
+
+    data_types = {
+        'cells': {'fn': 'box', 'labeled': False, 'shape': (3,3), 'required': 'nopbc' not in data},
+        'coords': {'fn': 'coord', 'labeled': False, 'shape': (natoms,3), 'required': True},
+        'energies': {'fn': 'energy', 'labeled': True, 'shape': tuple(), 'required': False},
+        'forces': {'fn': 'force', 'labeled': True, 'shape': (natoms,3), 'required': False},
+        'virials': {'fn': 'virial', 'labeled': True, 'shape': (3,3), 'required': False},
+    }
+
+    for dt, prop in data_types.items():
+        # honor labels: a plain System must not receive labeled properties
+        if prop['labeled'] and not labels:
+            continue
+        all_data = []
+
+        for ii in sets:
+            set_group = g[ii]
+            fn = '%s.npy' % prop['fn']
+            if fn in set_group.keys():
+                dd = set_group[fn][:]
+                nframes = dd.shape[0]
+                all_data.append(np.reshape(dd, (nframes, *prop['shape'])))
+            elif prop['required']:
+                raise RuntimeError("%s/%s/%s not found" % (folder, ii, fn))
+
+        if len(all_data) > 0 :
+            data[dt] = np.concatenate(all_data, axis = 0)
+    return data
+
+def dump(f: h5py.File,
+         folder: str,
+         data: dict,
+         set_size = 5000,
+         comp_prec = np.float32,
+         ) -> None:
+    """Dump data to a HDF5 file.
+
+    Parameters
+    ----------
+    f : h5py.File
+        HDF5 file object
+    folder : str
+        path in the HDF5 file; an existing group at this path is replaced
+    data : dict
+        System or LabeledSystem data
+    set_size : int, default: 5000
+        size of a set
+    comp_prec : np.dtype, default: np.float32
+        precision of data
+    """
+    # if folder is None, use the root of the file
+    if folder:
+        if folder in f:
+            del f[folder]
+        g = f.create_group(folder)
+    else:
+        g = f
+    # dump raw (array in fact)
+    g.create_dataset('type.raw', data=data['atom_types'])
+    g.create_dataset('type_map.raw', data=np.array(data['atom_names'], dtype='S'))
+    # BondOrder System
+    if "bonds" in data:
+        g.create_dataset("bonds.raw", data=data['bonds'])
+    if "formal_charges" in data:
+        g.create_dataset("formal_charges.raw", data=data['formal_charges'])
+    # reshape frame properties and convert prec
+    # coords are always present, whereas cells may be absent for nopbc data
+    nframes = data['coords'].shape[0]
+
+    nopbc = data.get("nopbc", False)
+    reshaped_data = {}
+
+    data_types = {
+        'cells': {'fn': 'box', 'shape': (nframes, 9), 'dump': not nopbc},
+        'coords': {'fn': 'coord', 'shape': (nframes, -1), 'dump': True},
+        'energies': {'fn': 'energy', 'shape': (nframes,), 'dump': True},
+        'forces': {'fn': 'force', 'shape': (nframes, -1), 'dump': True},
+        'virials': {'fn': 'virial', 'shape': (nframes, 9), 'dump': True},
+    }
+    for dt, prop in data_types.items():
+        if dt in data and prop['dump']:
+            reshaped_data[dt] = np.reshape(data[dt], prop['shape']).astype(comp_prec)
+
+    # dump frame properties: cell, coord, energy, force and virial
+    nsets = nframes // set_size
+    if set_size * nsets < nframes :
+        nsets += 1
+    for ii in range(nsets) :
+        set_stt = ii * set_size
+        set_end = (ii+1) * set_size
+        set_folder = g.create_group('set.%03d' % ii)
+        for dt, prop in data_types.items():
+            if dt in reshaped_data:
+                set_folder.create_dataset('%s.npy' % prop['fn'], data=reshaped_data[dt][set_stt:set_end])
+
+    if nopbc:
+        # only the presence of this dataset is checked when loading
+        g.create_dataset("nopbc", data=True)
diff --git a/dpdata/plugins/deepmd.py b/dpdata/plugins/deepmd.py
index c2636e993..6fd209a3f 100644
--- a/dpdata/plugins/deepmd.py
+++ b/dpdata/plugins/deepmd.py
@@ -1,6 +1,8 @@
 import dpdata.deepmd.raw
 import dpdata.deepmd.comp
+import dpdata.deepmd.hdf5
 import numpy as np
+import h5py
 
 from dpdata.format import Format
 
@@ -54,3 +56,54 @@ def from_labeled_system(self, file_name, type_map=None, **kwargs):
         return dpdata.deepmd.comp.to_system_data(file_name, type_map=type_map, labels=True)
 
     MultiMode = Format.MultiModes.Directory
+
+@Format.register("deepmd/hdf5")
+class DeePMDHDF5Format(Format):
+    """HDF5 format for DeePMD-kit.
+
+    A system is addressed as "file_name#group": the part before "#" is the
+    path of the HDF5 file and the part after it is the group inside the file.
+    Without "#", the root of the file is used.
+
+    Examples
+    --------
+    Dump a MultiSystems to a HDF5 file:
+    >>> import dpdata
+    >>> dpdata.MultiSystems().from_deepmd_npy("data").to_deepmd_hdf5("data.hdf5")
+    """
+    def from_system(self, file_name, type_map=None, **kwargs):
+        s = file_name.split("#")
+        name = s[1] if len(s) > 1 else ""
+        with h5py.File(s[0], 'r') as f:
+            return dpdata.deepmd.hdf5.to_system_data(f, name, type_map=type_map, labels=False)
+
+    def from_labeled_system(self, file_name, type_map=None, **kwargs):
+        s = file_name.split("#")
+        name = s[1] if len(s) > 1 else ""
+        with h5py.File(s[0], 'r') as f:
+            return dpdata.deepmd.hdf5.to_system_data(f, name, type_map=type_map, labels=True)
+
+    def to_system(self,
+                  data : dict,
+                  file_name : str,
+                  set_size : int = 5000,
+                  comp_prec : np.dtype = np.float32,
+                  **kwargs):
+        s = file_name.split("#")
+        name = s[1] if len(s) > 1 else ""
+        # append when writing a group so that other groups in the file survive
+        mode = 'a' if name else 'w'
+        with h5py.File(s[0], mode) as f:
+            dpdata.deepmd.hdf5.dump(f, name, data, set_size = set_size, comp_prec = comp_prec)
+
+    def from_multi_systems(self,
+                           directory,
+                           **kwargs):
+        with h5py.File(directory, 'r') as f:
+            return ["%s#%s" % (directory, ff) for ff in f.keys()]
+
+    def to_multi_systems(self,
+                         formulas,
+                         directory,
+                         **kwargs):
+        return ["%s#%s" % (directory, ff) for ff in formulas]
diff --git a/setup.py b/setup.py
index c0ded7819..99cac1219 100644
--- a/setup.py
+++ b/setup.py
@@ -12,7 +12,7 @@
     readme = f.read()
 
 # install_requires = ['xml']
-install_requires=['numpy>=1.14.3', 'monty', 'scipy']
+install_requires=['numpy>=1.14.3', 'monty', 'scipy', 'h5py', 'wcmatch']
 
 setuptools.setup(
     name="dpdata",
diff --git a/tests/test_deepmd_hdf5.py b/tests/test_deepmd_hdf5.py
new file mode 100644
index 000000000..3c15b45e8
--- /dev/null
+++ b/tests/test_deepmd_hdf5.py
@@ -0,0 +1,45 @@
+import os
+import numpy as np
+import unittest
+from context import dpdata
+from comp_sys import CompLabeledSys, CompSys, IsPBC
+
+class TestDeepmdLoadDumpComp(unittest.TestCase, CompLabeledSys, IsPBC):
+    def setUp (self) :
+        self.system_1 = dpdata.LabeledSystem('poscars/OUTCAR.h2o.md',
+                                             fmt = 'vasp/outcar')
+        self.system_1.to_deepmd_hdf5('tmp.deepmd.hdf5',
+                                     comp_prec = np.float64,
+                                     set_size = 2)
+
+        self.system_2 = dpdata.LabeledSystem('tmp.deepmd.hdf5',
+                                             fmt = 'deepmd/hdf5',
+                                             type_map = ['O', 'H'])
+        self.places = 6
+        self.e_places = 6
+        self.f_places = 6
+        self.v_places = 6
+
+    def tearDown(self) :
+        if os.path.exists('tmp.deepmd.hdf5'):
+            os.remove('tmp.deepmd.hdf5')
+
+
+class TestDeepmdCompNoLabels(unittest.TestCase, CompSys, IsPBC) :
+    def setUp (self) :
+        self.system_1 = dpdata.System('poscars/POSCAR.h2o.md',
+                                      fmt = 'vasp/poscar')
+        self.system_1.to_deepmd_hdf5('tmp.deepmd.hdf5',
+                                     comp_prec = np.float64,
+                                     set_size = 2)
+        self.system_2 = dpdata.System('tmp.deepmd.hdf5',
+                                      fmt = 'deepmd/hdf5',
+                                      type_map = ['O', 'H'])
+        self.places = 6
+        self.e_places = 6
+        self.f_places = 6
+        self.v_places = 6
+
+    def tearDown(self) :
+        if os.path.exists('tmp.deepmd.hdf5'):
+            os.remove('tmp.deepmd.hdf5')