From 4648764c2a92d58a56e4ca33e4711d824c100a4e Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 18 Jun 2021 04:25:59 -0400 Subject: [PATCH 1/8] only add inputs_zero node if atom_ener exists (#766) --- deepmd/fit/ener.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/deepmd/fit/ener.py b/deepmd/fit/ener.py index 2708aa7621..03145076cb 100644 --- a/deepmd/fit/ener.py +++ b/deepmd/fit/ener.py @@ -357,7 +357,9 @@ def build (self, initializer = tf.constant_initializer(self.aparam_inv_std)) inputs = tf.cast(tf.reshape(inputs, [-1, self.dim_descrpt * natoms[0]]), self.fitting_precision) - inputs_zero = tf.zeros_like(inputs, dtype=GLOBAL_TF_FLOAT_PRECISION) + if len(self.atom_ener): + # only for atom_ener + inputs_zero = tf.zeros_like(inputs, dtype=GLOBAL_TF_FLOAT_PRECISION) if bias_atom_e is not None : From 0c7f490de24c21bcf57089336cbb5c584cf5b6f8 Mon Sep 17 00:00:00 2001 From: tuoping <80671886+tuoping@users.noreply.github.com> Date: Sat, 19 Jun 2021 17:39:48 +0800 Subject: [PATCH 2/8] add type-embedding developer doc (#762) * add type-embedding developer doc * add link to development/type-embedding.md in train-se-e2-a-tebd.md * changed the link in doc/development/type-embedding.md to relative. 
Co-authored-by: tuoping --- doc/development/index.md | 1 + doc/development/type-embedding.md | 67 +++++++++++++++++++++++++++++++ doc/train-se-e2-a-tebd.md | 6 ++- 3 files changed, 72 insertions(+), 2 deletions(-) create mode 100644 doc/development/type-embedding.md diff --git a/doc/development/index.md b/doc/development/index.md index 6f456a06e6..79fab0d980 100644 --- a/doc/development/index.md +++ b/doc/development/index.md @@ -3,3 +3,4 @@ - [Python API](../api.rst) - [C++ API](../API_CC/api_cc.rst) - [Coding Conventions](coding-conventions.rst) +- [Atom Type Embedding](type-embedding.md) diff --git a/doc/development/type-embedding.md b/doc/development/type-embedding.md new file mode 100644 index 0000000000..17c8a63ba5 --- /dev/null +++ b/doc/development/type-embedding.md @@ -0,0 +1,67 @@ +# Atom Type Embedding +## Overview +Here is an overview of the deepmd-kit algorithm. Given a specific centric atom, we can obtain the matrix describing its local environment, named as `R`. It is consist of the distance between centric atom and its neighbors, as well as a direction vector. We can embed each distance into a vector of M1 dimension by a `embedding net`, so the environment matrix `R` can be embed into matrix `G`. We can thus extract a descriptor vector (of M1*M2 dim) of the centric atom from the `G` by some matrix multiplication, and put the descriptor into `fitting net` to get predicted energy `E`. The vanilla version of deepmd-kit build `embedding net` and `fitting net` relying on the atom type, resulting in O(N) memory usage. After applying atom type embedding, in deepmd-kit v2.0, we can share one `embedding net` and one `fitting net` in total, which decline training complexity largely. + +## Preliminary +In the following chart, you can find the meaning of symbols used to clarify the atom type embedding algorithm. 
+ +Symbol| Meaning +---| :---: +i| Type of centric atom +j| Type of neighbor atom +s_ij| Distance between centric atom and neighbor atom +G_ij(·)| Origin embedding net, take s_ij as input and output embedding vector of M1 dim +G(·) | Shared embedding net +Multi(·) | Matrix multiplication and flattening, output the descriptor vector of M1*M2 dim +F_i(·) | Origin fitting net, take the descriptor vector as input and output energy +F(·) | Shared fitting net +A(·) | Atom type embedding net, input is atom type, output is type embedding vector of dim `nchanl` + +So, we can formulate the training process as follows. +Vanilla deepmd-kit algorithm: +``` +Energy = F_i( Multi( G_ij( s_ij ) ) ) +``` +Deepmd-kit applying atom type embedding: +``` +Energy = F( [ Multi( G_ij( [s_ij, A(i), A(j)] ) ), A(j)] ) +``` +or +``` +Energy = F( [ Multi( G_ij( [s_ij, A(j)] ) ), A(j)] ) +``` +The difference between two variants above is whether using the information of centric atom when generating the descriptor. Users can choose by modifying the `type_one_side` hyper-parameter in the input json file. + +## How to use +A detailed introduction can be found at [`se_e2_a_tebd`](../train-se-e2-a-tebd.md). Looking for a fast start up, you can simply add a `type_embedding` section in the input json file as displayed in the following, and the algorithm will adopt atom type embedding algorithm automatically. +An example of `type_embedding` is like +```json= + "type_embedding":{ + "neuron":[2, 4, 8], + "resnet_dt":false, + "seed":1 + } +``` + + +## Code Modification +Atom type embedding can be applied to varied `embedding net` and `fitting net`, as a result we build a class `TypeEmbedNet` to support this free combination. In the following, we will go through the execution process of the code to explain our code modification. + +### trainer (train/trainer.py) +In trainer.py, it will parse the parameter from the input json file. 
If a `type_embedding` section is detected, it will build a `TypeEmbedNet`, which will be later input in the `model`. `model` will be built in the function `_build_network`. +### model (model/ener.py) +When building the operation graph of the `model` in `model.build`. If a `TypeEmbedNet` is detected, it will build the operation graph of `type embed net`, `embedding net` and `fitting net` by order. The building process of `type embed net` can be found in `TypeEmbedNet.build`, which output the type embedding vector of each atom type (of [ntypes * nchanl] dimension). We then save the type embedding vector into `input_dict`, so that they can be fetched later in `embedding net` and `fitting net`. +### embedding net (descriptor/se*.py) +In `embedding net`, we shall take local environment `R` as input and output matrix `G`. Functions called in this process by order is +``` +build -> _pass_filter -> _filter -> _filter_lower +``` +* `_pass_filter`: It will first detect whether an atom type embedding exists, if so, it will apply atom type embedding algorithm and doesn't divide the input by type. +* `_filter`: It will call `_filter_lower` function to obtain the result of matrix multiplication (`G^T·R` ), do further multiplication involved in Multi(·), and finally output the result of descriptor vector of M1*M2 dim. +* `_filter_lower`: The main function handling input modification. If type embedding exists, it will call `_concat_type_embedding` function to concat the first column of input `R` (the column of s_ij) with the atom type embedding information. It will decide whether using the atom type embedding vector of centric atom according to the value of `type_one_side` (if set **True**, then we only use the vector of the neighbor atom). The modified input will be put into the `fitting net` to get `G` for further matrix multiplication stage. + +### fitting net (fit/ener.py) +In `fitting net`, it take the descriptor vector as input, whose dimension is [natoms, (M1*M2)]. 
Because we need to involve information of centric atom in this step, we need to generate a matrix named as `atype_embed` (of dim [natoms, nchanl]), in which each row is the type embedding vector of the specific centric atom. The input is sorted by type of centric atom, we also know the number of a particular atom type (stored in `natoms[2+i]`), thus we get the type vector of centric atom. In the build phrase of fitting net, it will check whether type embedding exist in `input_dict` and fetch them. After that calling `embed_atom_type` function to lookup embedding vector for type vector of centric atom to obtain `atype_embed`, and concat input with it ([input, atype_embed]). The modified input go through `fitting net` to get predicted energy. + + +**P.S.: You can't apply compression method while using atom type embedding** diff --git a/doc/train-se-e2-a-tebd.md b/doc/train-se-e2-a-tebd.md index e895adc858..2179b8b598 100644 --- a/doc/train-se-e2-a-tebd.md +++ b/doc/train-se-e2-a-tebd.md @@ -35,8 +35,10 @@ The construction of type embedding net is given by `type_embedding`. An example * `seed` gives the random seed that is used to generate random numbers when initializing the model parameters. - A complete training input script of this example can be find in the directory. ```bash $deepmd_source_dir/examples/water/se_e2_a_tebd/input.json -``` \ No newline at end of file +``` +See [here](development/type-embedding.md) for further explanation of `type embedding`. 
+ +**P.S.: You can't apply compression method while using atom type embedding** From f326a86dba80287c8dc3f12db39ea640f86e3eef Mon Sep 17 00:00:00 2001 From: Denghui Lu Date: Sat, 19 Jun 2021 17:40:33 +0800 Subject: [PATCH 3/8] add model compression support for models with exclude_types feature (#754) --- deepmd/descriptor/se_a.py | 2 +- deepmd/utils/tabulate.py | 99 ++++++++++-------- source/tests/test_model_compression.py | 134 +++++++++++++++++++++++++ 3 files changed, 194 insertions(+), 41 deletions(-) diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py index 10ea49abd9..a95f0ca39a 100644 --- a/deepmd/descriptor/se_a.py +++ b/deepmd/descriptor/se_a.py @@ -268,7 +268,7 @@ def enable_compression(self, self.compress = True self.model_file = model_file self.table_config = [table_extrapolate, table_stride_1, table_stride_2, check_frequency] - self.table = DeepTabulate(self.model_file, self.type_one_side) + self.table = DeepTabulate(self.model_file, self.type_one_side, self.exclude_types) self.lower, self.upper \ = self.table.build(min_nbor_dist, table_extrapolate, diff --git a/deepmd/utils/tabulate.py b/deepmd/utils/tabulate.py index 757189c16d..944428bf48 100644 --- a/deepmd/utils/tabulate.py +++ b/deepmd/utils/tabulate.py @@ -19,7 +19,8 @@ class DeepTabulate(): """ def __init__(self, model_file : str, - type_one_side : bool = False) -> None: + type_one_side : bool = False, + exclude_types : List[List[int]] = []) -> None: """ Constructor @@ -29,10 +30,15 @@ def __init__(self, The frozen model type_one_side Try to build N_types tables. 
Otherwise, building N_types^2 tables + exclude_types : list[list[int]] + The Excluded types """ self.model_file = model_file self.type_one_side = type_one_side + self.exclude_types = exclude_types + if self.type_one_side and len(self.exclude_types) != 0: + raise RuntimeError('"type_one_side" is not compatible with "exclude_types"') self.graph, self.graph_def = self._load_graph() self.sess = tf.Session(graph = self.graph) @@ -55,19 +61,26 @@ def __init__(self, self.rcut_smth = self.descrpt.get_attr('rcut_r_smth') self.filter_variable_nodes = self._load_matrix_node() - self.layer_size = int(len(self.filter_variable_nodes) / (self.ntypes * self.ntypes * 2)) + for tt in self.exclude_types: + if (tt[0] not in range(self.ntypes)) or (tt[1] not in range(self.ntypes)): + raise RuntimeError("exclude types" + str(tt) + " must within the number of atomic types " + str(self.ntypes) + "!") + if (self.ntypes * self.ntypes - len(self.exclude_types) == 0): + raise RuntimeError("empty embedding-net are not supported in model compression!") + + self.layer_size = len(self.filter_variable_nodes) // ((self.ntypes * self.ntypes - len(self.exclude_types)) * 2) self.table_size = self.ntypes * self.ntypes if type_one_side : - self.layer_size = int(len(self.filter_variable_nodes) / (self.ntypes * 2)) + self.layer_size = len(self.filter_variable_nodes) // (self.ntypes * 2) self.table_size = self.ntypes # self.value_type = self.filter_variable_nodes["filter_type_0/matrix_1_0"].dtype #"filter_type_0/matrix_1_0" must exit~ # get trained variables self.bias = self._get_bias() self.matrix = self._get_matrix() - self.data_type = type(self.matrix["layer_1"][0][0][0]) - assert self.matrix["layer_1"][0].size > 0, "no matrix exist in matrix array!" 
- self.last_layer_size = self.matrix["layer_" + str(self.layer_size)][0].shape[1] + for item in self.matrix["layer_" + str(self.layer_size)]: + if len(item) != 0: + self.data_type = type(item[0][0]) + self.last_layer_size = item.shape[1] # define tables self.data = {} @@ -91,7 +104,7 @@ def build(self, The uniform stride of the first table stride1 The uniform stride of the second table - + Returns ---------- lower @@ -106,27 +119,27 @@ def build(self, xx = np.append(xx, np.array([extrapolate * upper], dtype = self.data_type)) self.nspline = int((upper - lower) / stride0 + (extrapolate * upper - upper) / stride1) for ii in range(self.table_size): - vv, dd, d2 = self._make_data(xx, ii) - if self.type_one_side: - net = "filter_-1_net_" + str(int(ii)) - else: - net = "filter_" + str(int(ii / self.ntypes)) + "_net_" + str(int(ii % self.ntypes)) - self.data[net] = np.zeros([self.nspline, 6 * self.last_layer_size], dtype = self.data_type) - # for jj in tqdm(range(self.nspline), desc = 'DEEPMD INFO |-> deepmd.utils.tabulate\t\t\t' + net + ', tabulating'): - for jj in range(self.nspline): - for kk in range(self.last_layer_size): - if jj < int((upper - lower) / stride0): - tt = stride0 - else: - tt = stride1 - hh = vv[jj + 1][kk] - vv[jj][kk] - self.data[net][jj][kk * 6 + 0] = vv[jj][kk] - self.data[net][jj][kk * 6 + 1] = dd[jj][kk] - self.data[net][jj][kk * 6 + 2] = 0.5 * d2[jj][kk] - self.data[net][jj][kk * 6 + 3] = (1 / (2 * tt * tt * tt)) * (20 * hh - (8 * dd[jj + 1][kk] + 12 * dd[jj][kk]) * tt - (3 * d2[jj][kk] - d2[jj + 1][kk]) * tt * tt) - self.data[net][jj][kk * 6 + 4] = (1 / (2 * tt * tt * tt * tt)) * (-30 * hh + (14 * dd[jj + 1][kk] + 16 * dd[jj][kk]) * tt + (3 * d2[jj][kk] - 2 * d2[jj + 1][kk]) * tt * tt) - self.data[net][jj][kk * 6 + 5] = (1 / (2 * tt * tt * tt * tt * tt)) * (12 * hh - 6 * (dd[jj + 1][kk] + dd[jj][kk]) * tt + (d2[jj + 1][kk] - d2[jj][kk]) * tt * tt) - self.data[net] + if self.type_one_side or (ii // self.ntypes, int(ii % self.ntypes)) not in 
self.exclude_types: + vv, dd, d2 = self._make_data(xx, ii) + if self.type_one_side: + net = "filter_-1_net_" + str(ii) + else: + net = "filter_" + str(ii // self.ntypes) + "_net_" + str(int(ii % self.ntypes)) + self.data[net] = np.zeros([self.nspline, 6 * self.last_layer_size], dtype = self.data_type) + # for jj in tqdm(range(self.nspline), desc = 'DEEPMD INFO |-> deepmd.utils.tabulate\t\t\t' + net + ', tabulating'): + for jj in range(self.nspline): + for kk in range(self.last_layer_size): + if jj < int((upper - lower) / stride0): + tt = stride0 + else: + tt = stride1 + hh = vv[jj + 1][kk] - vv[jj][kk] + self.data[net][jj][kk * 6 + 0] = vv[jj][kk] + self.data[net][jj][kk * 6 + 1] = dd[jj][kk] + self.data[net][jj][kk * 6 + 2] = 0.5 * d2[jj][kk] + self.data[net][jj][kk * 6 + 3] = (1 / (2 * tt * tt * tt)) * (20 * hh - (8 * dd[jj + 1][kk] + 12 * dd[jj][kk]) * tt - (3 * d2[jj][kk] - d2[jj + 1][kk]) * tt * tt) + self.data[net][jj][kk * 6 + 4] = (1 / (2 * tt * tt * tt * tt)) * (-30 * hh + (14 * dd[jj + 1][kk] + 16 * dd[jj][kk]) * tt + (3 * d2[jj][kk] - 2 * d2[jj + 1][kk]) * tt * tt) + self.data[net][jj][kk * 6 + 5] = (1 / (2 * tt * tt * tt * tt * tt)) * (12 * hh - 6 * (dd[jj + 1][kk] + dd[jj][kk]) * tt + (d2[jj + 1][kk] - d2[jj][kk]) * tt * tt) return lower, upper def _load_graph(self): @@ -165,14 +178,17 @@ def _get_bias(self): bias["layer_" + str(layer)] = [] if self.type_one_side: for ii in range(0, self.ntypes): - tensor_value = np.frombuffer (self.filter_variable_nodes["filter_type_all/bias_" + str(layer) + "_" + str(int(ii))].tensor_content) - tensor_shape = tf.TensorShape(self.filter_variable_nodes["filter_type_all/bias_" + str(layer) + "_" + str(int(ii))].tensor_shape).as_list() + tensor_value = np.frombuffer (self.filter_variable_nodes["filter_type_all/bias_" + str(layer) + "_" + str(ii)].tensor_content) + tensor_shape = tf.TensorShape(self.filter_variable_nodes["filter_type_all/bias_" + str(layer) + "_" + str(ii)].tensor_shape).as_list() bias["layer_" + 
str(layer)].append(np.reshape(tensor_value, tensor_shape)) else: for ii in range(0, self.ntypes * self.ntypes): - tensor_value = np.frombuffer(self.filter_variable_nodes["filter_type_" + str(int(ii / self.ntypes)) + "/bias_" + str(layer) + "_" + str(int(ii % self.ntypes))].tensor_content) - tensor_shape = tf.TensorShape(self.filter_variable_nodes["filter_type_" + str(int(ii / self.ntypes)) + "/bias_" + str(layer) + "_" + str(int(ii % self.ntypes))].tensor_shape).as_list() - bias["layer_" + str(layer)].append(np.reshape(tensor_value, tensor_shape)) + if (ii // self.ntypes, int(ii % self.ntypes)) not in self.exclude_types: + tensor_value = np.frombuffer(self.filter_variable_nodes["filter_type_" + str(ii // self.ntypes) + "/bias_" + str(layer) + "_" + str(int(ii % self.ntypes))].tensor_content) + tensor_shape = tf.TensorShape(self.filter_variable_nodes["filter_type_" + str(ii // self.ntypes) + "/bias_" + str(layer) + "_" + str(int(ii % self.ntypes))].tensor_shape).as_list() + bias["layer_" + str(layer)].append(np.reshape(tensor_value, tensor_shape)) + else: + bias["layer_" + str(layer)].append(np.array([])) return bias def _get_matrix(self): @@ -181,14 +197,17 @@ def _get_matrix(self): matrix["layer_" + str(layer)] = [] if self.type_one_side: for ii in range(0, self.ntypes): - tensor_value = np.frombuffer (self.filter_variable_nodes["filter_type_all/matrix_" + str(layer) + "_" + str(int(ii))].tensor_content) - tensor_shape = tf.TensorShape(self.filter_variable_nodes["filter_type_all/matrix_" + str(layer) + "_" + str(int(ii))].tensor_shape).as_list() + tensor_value = np.frombuffer (self.filter_variable_nodes["filter_type_all/matrix_" + str(layer) + "_" + str(ii)].tensor_content) + tensor_shape = tf.TensorShape(self.filter_variable_nodes["filter_type_all/matrix_" + str(layer) + "_" + str(ii)].tensor_shape).as_list() matrix["layer_" + str(layer)].append(np.reshape(tensor_value, tensor_shape)) else: for ii in range(0, self.ntypes * self.ntypes): - tensor_value = 
np.frombuffer(self.filter_variable_nodes["filter_type_" + str(int(ii / self.ntypes)) + "/matrix_" + str(layer) + "_" + str(int(ii % self.ntypes))].tensor_content) - tensor_shape = tf.TensorShape(self.filter_variable_nodes["filter_type_" + str(int(ii / self.ntypes)) + "/matrix_" + str(layer) + "_" + str(int(ii % self.ntypes))].tensor_shape).as_list() - matrix["layer_" + str(layer)].append(np.reshape(tensor_value, tensor_shape)) + if (ii // self.ntypes, int(ii % self.ntypes)) not in self.exclude_types: + tensor_value = np.frombuffer(self.filter_variable_nodes["filter_type_" + str(ii // self.ntypes) + "/matrix_" + str(layer) + "_" + str(int(ii % self.ntypes))].tensor_content) + tensor_shape = tf.TensorShape(self.filter_variable_nodes["filter_type_" + str(ii // self.ntypes) + "/matrix_" + str(layer) + "_" + str(int(ii % self.ntypes))].tensor_shape).as_list() + matrix["layer_" + str(layer)].append(np.reshape(tensor_value, tensor_shape)) + else: + matrix["layer_" + str(layer)].append(np.array([])) return matrix # one-by-one executions @@ -221,8 +240,8 @@ def _layer_1(self, x, w, b): def _save_data(self): for ii in range(self.ntypes * self.ntypes): - net = "filter_" + str(int(ii / self.ntypes)) + "_net_" + str(int(ii % self.ntypes)) - np.savetxt('data_' + str(int(ii)), self.data[net]) + net = "filter_" + str(ii // self.ntypes) + "_net_" + str(int(ii % self.ntypes)) + np.savetxt('data_' + str(ii), self.data[net]) def _get_env_mat_range(self, min_nbor_dist): diff --git a/source/tests/test_model_compression.py b/source/tests/test_model_compression.py index 91a073c4c2..d67e209c42 100644 --- a/source/tests/test_model_compression.py +++ b/source/tests/test_model_compression.py @@ -384,3 +384,137 @@ def test_ase(self): for ii in range(nframes): self.assertAlmostEqual(ee0.reshape([-1])[ii], ee1.reshape([-1])[ii], places = default_places) +class TestDeepPotAPBCExcludeTypes(unittest.TestCase) : + def setUp(self): + self.data_file = str(tests_path / os.path.join("model_compression", 
"data")) + self.frozen_model = str(tests_path / "dp-original.pb") + self.compressed_model = str(tests_path / "dp-compressed.pb") + self.INPUT = str(tests_path / "input.json") + jdata = j_loader(str(tests_path / os.path.join("model_compression", "input.json"))) + jdata["training"]["training_data"]["systems"] = self.data_file + jdata["training"]["validation_data"]["systems"] = self.data_file + jdata["model"]["descriptor"]["exclude_types"] = [[0, 1]] + with open(self.INPUT, "w") as fp: + json.dump(jdata, fp, indent=4) + + ret = os.system("dp train " + self.INPUT) + assert(ret == 0), "DP train error!" + ret = os.system("dp freeze -o " + self.frozen_model) + assert(ret == 0), "DP freeze error!" + ret = os.system("dp compress " + self.INPUT + " -i " + self.frozen_model + " -o " + self.compressed_model) + assert(ret == 0), "DP model compression error!" + + self.dp_original = DeepPot(self.frozen_model) + self.dp_compressed = DeepPot(self.compressed_model) + self.coords = np.array([12.83, 2.56, 2.18, + 12.09, 2.87, 2.74, + 00.25, 3.32, 1.68, + 3.36, 3.00, 1.81, + 3.51, 2.51, 2.60, + 4.27, 3.22, 1.56]) + self.atype = [0, 1, 1, 0, 1, 1] + self.box = np.array([13., 0., 0., 0., 13., 0., 0., 0., 13.]) + + def tearDown(self): + _file_delete(self.INPUT) + _file_delete(self.frozen_model) + _file_delete(self.compressed_model) + _file_delete("out.json") + _file_delete("compress.json") + _file_delete("checkpoint") + _file_delete("lcurve.out") + _file_delete("model.ckpt.meta") + _file_delete("model.ckpt.index") + _file_delete("model.ckpt.data-00000-of-00001") + + def test_attrs(self): + self.assertEqual(self.dp_original.get_ntypes(), 2) + self.assertAlmostEqual(self.dp_original.get_rcut(), 6.0, places = default_places) + self.assertEqual(self.dp_original.get_type_map(), ['O', 'H']) + self.assertEqual(self.dp_original.get_dim_fparam(), 0) + self.assertEqual(self.dp_original.get_dim_aparam(), 0) + + self.assertEqual(self.dp_compressed.get_ntypes(), 2) + 
self.assertAlmostEqual(self.dp_compressed.get_rcut(), 6.0, places = default_places) + self.assertEqual(self.dp_compressed.get_type_map(), ['O', 'H']) + self.assertEqual(self.dp_compressed.get_dim_fparam(), 0) + self.assertEqual(self.dp_compressed.get_dim_aparam(), 0) + + def test_1frame(self): + ee0, ff0, vv0 = self.dp_original.eval(self.coords, self.box, self.atype, atomic = False) + ee1, ff1, vv1 = self.dp_compressed.eval(self.coords, self.box, self.atype, atomic = False) + # check shape of the returns + nframes = 1 + natoms = len(self.atype) + self.assertEqual(ee0.shape, (nframes,1)) + self.assertEqual(ff0.shape, (nframes,natoms,3)) + self.assertEqual(vv0.shape, (nframes,9)) + self.assertEqual(ee1.shape, (nframes,1)) + self.assertEqual(ff1.shape, (nframes,natoms,3)) + self.assertEqual(vv1.shape, (nframes,9)) + # check values + for ii in range(ff0.size): + self.assertAlmostEqual(ff0.reshape([-1])[ii], ff1.reshape([-1])[ii], places = default_places) + for ii in range(nframes): + self.assertAlmostEqual(ee0.reshape([-1])[ii], ee1.reshape([-1])[ii], places = default_places) + for ii in range(nframes, 9): + self.assertAlmostEqual(vv0.reshape([-1])[ii], vv1.reshape([-1])[ii], places = default_places) + + def test_1frame_atm(self): + ee0, ff0, vv0, ae0, av0 = self.dp_original.eval(self.coords, self.box, self.atype, atomic = True) + ee1, ff1, vv1, ae1, av1 = self.dp_compressed.eval(self.coords, self.box, self.atype, atomic = True) + # check shape of the returns + nframes = 1 + natoms = len(self.atype) + self.assertEqual(ee0.shape, (nframes,1)) + self.assertEqual(ff0.shape, (nframes,natoms,3)) + self.assertEqual(vv0.shape, (nframes,9)) + self.assertEqual(ae0.shape, (nframes,natoms,1)) + self.assertEqual(av0.shape, (nframes,natoms,9)) + self.assertEqual(ee1.shape, (nframes,1)) + self.assertEqual(ff1.shape, (nframes,natoms,3)) + self.assertEqual(vv1.shape, (nframes,9)) + self.assertEqual(ae1.shape, (nframes,natoms,1)) + self.assertEqual(av1.shape, (nframes,natoms,9)) + # 
check values + for ii in range(ff0.size): + self.assertAlmostEqual(ff0.reshape([-1])[ii], ff1.reshape([-1])[ii], places = default_places) + for ii in range(ae0.size): + self.assertAlmostEqual(ae0.reshape([-1])[ii], ae1.reshape([-1])[ii], places = default_places) + for ii in range(av0.size): + self.assertAlmostEqual(av0.reshape([-1])[ii], av1.reshape([-1])[ii], places = default_places) + for ii in range(nframes): + self.assertAlmostEqual(ee0.reshape([-1])[ii], ee1.reshape([-1])[ii], places = default_places) + for ii in range(nframes, 9): + self.assertAlmostEqual(vv0.reshape([-1])[ii], vv1.reshape([-1])[ii], places = default_places) + + def test_2frame_atm(self): + coords2 = np.concatenate((self.coords, self.coords)) + box2 = np.concatenate((self.box, self.box)) + ee0, ff0, vv0, ae0, av0 = self.dp_original.eval(coords2, box2, self.atype, atomic = True) + ee1, ff1, vv1, ae1, av1 = self.dp_compressed.eval(coords2, box2, self.atype, atomic = True) + # check shape of the returns + nframes = 2 + natoms = len(self.atype) + self.assertEqual(ee0.shape, (nframes,1)) + self.assertEqual(ff0.shape, (nframes,natoms,3)) + self.assertEqual(vv0.shape, (nframes,9)) + self.assertEqual(ae0.shape, (nframes,natoms,1)) + self.assertEqual(av0.shape, (nframes,natoms,9)) + self.assertEqual(ee1.shape, (nframes,1)) + self.assertEqual(ff1.shape, (nframes,natoms,3)) + self.assertEqual(vv1.shape, (nframes,9)) + self.assertEqual(ae1.shape, (nframes,natoms,1)) + self.assertEqual(av1.shape, (nframes,natoms,9)) + + # check values + for ii in range(ff0.size): + self.assertAlmostEqual(ff0.reshape([-1])[ii], ff1.reshape([-1])[ii], places = default_places) + for ii in range(ae0.size): + self.assertAlmostEqual(ae0.reshape([-1])[ii], ae1.reshape([-1])[ii], places = default_places) + for ii in range(av0.size): + self.assertAlmostEqual(av0.reshape([-1])[ii], av1.reshape([-1])[ii], places = default_places) + for ii in range(nframes): + self.assertAlmostEqual(ee0.reshape([-1])[ii], ee1.reshape([-1])[ii], 
places = default_places) + for ii in range(nframes, 9): + self.assertAlmostEqual(vv0.reshape([-1])[ii], vv1.reshape([-1])[ii], places = default_places) \ No newline at end of file From 44d49f346dbab84723db0b086e9bac9c5792abd7 Mon Sep 17 00:00:00 2001 From: Denghui Lu Date: Sun, 20 Jun 2021 22:15:01 +0800 Subject: [PATCH 4/8] Add a more detailed introduction for model compression (#772) --- deepmd/entrypoints/compress.py | 16 +++++----- deepmd/entrypoints/main.py | 35 ++++++++++++--------- doc/getting-started.md | 47 ++++++++++++++++++++++------ source/tests/test_argument_parser.py | 8 ++--- 4 files changed, 70 insertions(+), 36 deletions(-) diff --git a/deepmd/entrypoints/compress.py b/deepmd/entrypoints/compress.py index 2193f0c1f9..1222b1e51b 100644 --- a/deepmd/entrypoints/compress.py +++ b/deepmd/entrypoints/compress.py @@ -23,7 +23,7 @@ def compress( input: str, output: str, extrapolate: int, - stride: float, + step: float, frequency: str, checkpoint_folder: str, mpi_log: str, @@ -34,9 +34,9 @@ def compress( """Compress model. The table is composed of fifth-order polynomial coefficients and is assembled from - two sub-tables. The first table takes the stride(parameter) as it's uniform stride, - while the second table takes 10 * stride as it's uniform stride. The range of the - first table is automatically detected by deepmd-kit, while the second table ranges + two sub-tables. The first table takes the step parameter as the domain's uniform step size, + while the second table takes 10 * step as it's uniform step size. The range of the + first table is automatically detected by the code, while the second table ranges from the first table's upper boundary(upper) to the extrapolate(parameter) * upper. 
Parameters @@ -49,8 +49,8 @@ def compress( compressed model filename extrapolate : int scale of model extrapolation - stride : float - uniform stride of tabulation's first table + step : float + uniform step size of the tabulation's first table frequency : str frequency of tabulation overflow check checkpoint_folder : str @@ -71,8 +71,8 @@ def compress( jdata["model"]["compress"]["model_file"] = input jdata["model"]["compress"]["table_config"] = [ extrapolate, - stride, - 10 * stride, + step, + 10 * step, int(frequency), ] # be careful here, if one want to refine the model diff --git a/deepmd/entrypoints/main.py b/deepmd/entrypoints/main.py index e0c1d8d4af..b245053053 100644 --- a/deepmd/entrypoints/main.py +++ b/deepmd/entrypoints/main.py @@ -242,8 +242,8 @@ def parse_args(args: Optional[List[str]] = None): # * compress model ***************************************************************** # Compress a model, which including tabulating the embedding-net. # The table is composed of fifth-order polynomial coefficients and is assembled - # from two sub-tables. The first table takes the stride(parameter) as it's uniform - # stride, while the second table takes 10 * stride as it\s uniform stride + # from two sub-tables. The first table takes the step(parameter) as it's uniform + # step, while the second table takes 10 * step as it\s uniform step #  The range of the first table is automatically detected by deepmd-kit, while the # second table ranges from the first table's upper boundary(upper) to the # extrapolate(parameter) * upper. 
@@ -263,36 +263,43 @@ def parse_args(args: Optional[List[str]] = None): "--input", default="frozen_model.pb", type=str, - help="The original frozen model, which will be compressed by the deepmd-kit", + help="The original frozen model, which will be compressed by the code", ) parser_compress.add_argument( "-o", "--output", - default="frozen_model_compress.pb", + default="frozen_model_compressed.pb", type=str, help="The compressed model", ) + parser_compress.add_argument( + "-s", + "--step", + default=0.01, + type=float, + help="Model compression uses fifth-order polynomials to interpolate the embedding-net. " + "It introduces two tables with different step size to store the parameters of the polynomials. " + "The first table covers the range of the training data, while the second table is an extrapolation of the training data. " + "The domain of each table is uniformly divided by a given step size. " + "And the step(parameter) denotes the step size of the first table and the second table will " + "use 10 * step as it's step size to save the memory. " + "Usually the value ranges from 0.1 to 0.001. " + "Smaller step means higher accuracy and bigger model size", + ) parser_compress.add_argument( "-e", "--extrapolate", default=5, type=int, - help="The scale of model extrapolation", - ) - parser_compress.add_argument( - "-s", - "--stride", - default=0.01, - type=float, - help="The uniform stride of tabulation's first table, the second table will " - "use 10 * stride as it's uniform stride", + help="The domain range of the first table is automatically detected by the code: [d_low, d_up]. 
" + "While the second table ranges from the first table's upper boundary(d_up) to the extrapolate(parameter) * d_up: [d_up, extrapolate * d_up]", ) parser_compress.add_argument( "-f", "--frequency", default=-1, type=int, - help="The frequency of tabulation overflow check(If the input environment " + help="The frequency of tabulation overflow check(Whether the input environment " "matrix overflow the first or second table range). " "By default do not check the overflow", ) diff --git a/doc/getting-started.md b/doc/getting-started.md index a355fc9836..6a10a49eee 100644 --- a/doc/getting-started.md +++ b/doc/getting-started.md @@ -244,23 +244,50 @@ positional arguments: optional arguments: -h, --help show this help message and exit + -v {DEBUG,3,INFO,2,WARNING,1,ERROR,0}, --log-level {DEBUG,3,INFO,2,WARNING,1,ERROR,0} + set verbosity level by string or number, 0=ERROR, + 1=WARNING, 2=INFO and 3=DEBUG (default: INFO) + -l LOG_PATH, --log-path LOG_PATH + set log file to log messages to disk, if not + specified, the logs will only be output to console + (default: None) + -m {master,collect,workers}, --mpi-log {master,collect,workers} + Set the manner of logging when running with MPI. + 'master' logs only on main process, 'collect' + broadcasts logs from workers to master and 'workers' + means each process will output its own log (default: + master) -i INPUT, --input INPUT The original frozen model, which will be compressed by - the deepmd-kit + the code (default: frozen_model.pb) -o OUTPUT, --output OUTPUT - The compressed model + The compressed model (default: + frozen_model_compressed.pb) + -s STEP, --step STEP Model compression uses fifth-order polynomials to + interpolate the embedding-net. It introduces two + tables with different step size to store the + parameters of the polynomials. The first table covers + the range of the training data, while the second table + is an extrapolation of the training data. 
The domain + of each table is uniformly divided by a given step + size. And the step(parameter) denotes the step size of + the first table and the second table will use 10 * + step as it's step size to save the memory. Usually the + value ranges from 0.1 to 0.001. Smaller step means + higher accuracy and bigger model size (default: 0.01) -e EXTRAPOLATE, --extrapolate EXTRAPOLATE - The scale of model extrapolation - -s STRIDE, --stride STRIDE - The uniform stride of tabulation's first table, the - second table will use 10 * stride as it's uniform - stride + The domain range of the first table is automatically + detected by the code: [d_low, d_up]. While the second + table ranges from the first table's upper + boundary(d_up) to the extrapolate(parameter) * d_up: + [d_up, extrapolate * d_up] (default: 5) -f FREQUENCY, --frequency FREQUENCY - The frequency of tabulation overflow check(If the + The frequency of tabulation overflow check(Whether the input environment matrix overflow the first or second table range). By default do not check the overflow - -d FOLDER, --folder FOLDER - path to checkpoint folder + (default: -1) + -c CHECKPOINT_FOLDER, --checkpoint-folder CHECKPOINT_FOLDER + path to checkpoint folder (default: .) 
``` **Parameter explanation** diff --git a/source/tests/test_argument_parser.py b/source/tests/test_argument_parser.py index 1c85728e40..f9f28fb81b 100644 --- a/source/tests/test_argument_parser.py +++ b/source/tests/test_argument_parser.py @@ -272,10 +272,10 @@ def test_parser_compress(self): ARGS = { "INPUT": dict(type=str, value="INFILE"), "--output": dict(type=str, value="OUTFILE"), - "--extrapolate": dict(type=int, value=10), - "--stride": dict(type=float, value=0.1), - "--frequency": dict(type=int, value=1), - "--checkpoint-folder": dict(type=str, value="FOLDER"), + "--extrapolate": dict(type=int, value=5), + "--step": dict(type=float, value=0.1), + "--frequency": dict(type=int, value=-1), + "--checkpoint-folder": dict(type=str, value="."), } self.run_test(command="compress", mapping=ARGS) From b15944d82dac0466f1bb7245fb762a1eefd24cfa Mon Sep 17 00:00:00 2001 From: Yixiao Chen <19890787+y1xiaoc@users.noreply.github.com> Date: Tue, 22 Jun 2021 01:59:07 -0400 Subject: [PATCH 5/8] allow c++ tests to run without internet (#785) * fix bug when trying to find gtest in cmake * link librt explicitly * add script to test cc without installing tf --- source/api_cc/tests/CMakeLists.txt | 8 ++++---- source/install/test_cc_local.sh | 33 ++++++++++++++++++++++++++++++ source/lib/tests/CMakeLists.txt | 2 +- 3 files changed, 38 insertions(+), 5 deletions(-) create mode 100755 source/install/test_cc_local.sh diff --git a/source/api_cc/tests/CMakeLists.txt b/source/api_cc/tests/CMakeLists.txt index 111c7646bb..6768ff2ee6 100644 --- a/source/api_cc/tests/CMakeLists.txt +++ b/source/api_cc/tests/CMakeLists.txt @@ -95,17 +95,17 @@ else() endif() if (USE_CUDA_TOOLKIT) - target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread ${TensorFlow_LIBRARY} deepmd_op_cuda coverage_config) + target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread ${TensorFlow_LIBRARY} rt deepmd_op_cuda coverage_config) 
elseif(USE_ROCM_TOOLKIT) - target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread ${TensorFlow_LIBRARY} deepmd_op_rocm coverage_config) + target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread ${TensorFlow_LIBRARY} rt deepmd_op_rocm coverage_config) else() - target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread ${TensorFlow_LIBRARY} coverage_config) + target_link_libraries(runUnitTests gtest gtest_main ${libname} ${apiname} ${opname} pthread ${TensorFlow_LIBRARY} rt coverage_config) endif() add_test( runUnitTests runUnitTests ) find_package(GTest) -if(NOT GTEST_LIBRARY) +if(NOT GTEST_LIBRARIES) configure_file(../../cmake/googletest.cmake.in googletest-download/CMakeLists.txt) execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . RESULT_VARIABLE result diff --git a/source/install/test_cc_local.sh b/source/install/test_cc_local.sh new file mode 100755 index 0000000000..1f28d7efa8 --- /dev/null +++ b/source/install/test_cc_local.sh @@ -0,0 +1,33 @@ +set -e + +#------------------ + +SCRIPT_PATH=$(dirname $(realpath -s $0)) +NPROC=$(nproc --all) + +#------------------ + +BUILD_TMP_DIR=${SCRIPT_PATH}/../build_tests +mkdir -p ${BUILD_TMP_DIR} +cd ${BUILD_TMP_DIR} +cmake ../lib/tests +make -j${NPROC} + +#------------------ +${BUILD_TMP_DIR}/runUnitTests + + +#------------------ + +echo "try to find tensorflow in ${tensorflow_root}" +BUILD_TMP_DIR=${SCRIPT_PATH}/../build_cc_tests +INSTALL_PREFIX=${SCRIPT_PATH}/../../dp +mkdir -p ${BUILD_TMP_DIR} +mkdir -p ${INSTALL_PREFIX} +cd ${BUILD_TMP_DIR} +cmake -DINSTALL_TENSORFLOW=FALSE -DTENSORFLOW_ROOT=${tensorflow_root} ../api_cc/tests +make -j${NPROC} + +#------------------ +cd ${SCRIPT_PATH}/../api_cc/tests +${BUILD_TMP_DIR}/runUnitTests diff --git a/source/lib/tests/CMakeLists.txt b/source/lib/tests/CMakeLists.txt index b12734af9b..b5a0460c54 100644 --- a/source/lib/tests/CMakeLists.txt +++ 
b/source/lib/tests/CMakeLists.txt @@ -87,7 +87,7 @@ add_test( runUnitTests runUnitTests ) # ) find_package(GTest) -if(NOT GTEST_LIBRARY) +if(NOT GTEST_LIBRARIES) configure_file(../../cmake/googletest.cmake.in googletest-download/CMakeLists.txt) execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . RESULT_VARIABLE result From 45550344a73bbea16febc969459562f401412543 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Tue, 22 Jun 2021 11:33:51 -0400 Subject: [PATCH 6/8] support converting models generated in v1.3 to 2.0 compatibility (#725) * add v1.3 compatibility * remove TestModelMajorCompatability as compatibility was added By the way: Compatability should be compatibility * Also remove TestModelMinorCompatability * Update test_deeppot_a.py * Revert "Update test_deeppot_a.py" This reverts commit a03b5ee62107cfdd4f6fb621db4565c8531c4cd6. * Revert "Also remove TestModelMinorCompatability" This reverts commit 11fdd5c67f7b467bdaa4fb04d6280841caa90ffb. * Revert "remove TestModelMajorCompatability as compatibility was added" This reverts commit 40dd8073b268bf2a557cb4de4e50ae6fcb25f85b. 
* revert allowing 0.0 model * convert from model 1.3 to 2.0 * fix .gitignore --- deepmd/entrypoints/__init__.py | 4 +- deepmd/entrypoints/convert.py | 13 ++++++ deepmd/entrypoints/main.py | 33 +++++++++++++- deepmd/utils/convert.py | 59 ++++++++++++++++++++++++++ source/api_cc/include/common.h | 4 ++ source/api_cc/src/DeepPot.cc | 5 +++ source/api_cc/src/common.cc | 2 +- source/op/prod_env_mat_multi_device.cc | 53 ++++++++++++++++++++++- 8 files changed, 169 insertions(+), 4 deletions(-) create mode 100644 deepmd/entrypoints/convert.py create mode 100644 deepmd/utils/convert.py diff --git a/deepmd/entrypoints/__init__.py b/deepmd/entrypoints/__init__.py index 3beceace3a..4a02b995f3 100644 --- a/deepmd/entrypoints/__init__.py +++ b/deepmd/entrypoints/__init__.py @@ -8,6 +8,7 @@ from .train import train from .transfer import transfer from ..infer.model_devi import make_model_devi +from .convert import convert __all__ = [ "config", @@ -18,5 +19,6 @@ "transfer", "compress", "doc_train_input", - "make_model_devi" + "make_model_devi", + "convert", ] diff --git a/deepmd/entrypoints/convert.py b/deepmd/entrypoints/convert.py new file mode 100644 index 0000000000..4bf514fe51 --- /dev/null +++ b/deepmd/entrypoints/convert.py @@ -0,0 +1,13 @@ +from deepmd.utils.convert import convert_13_to_20 + +def convert( + *, + FROM: str, + input_model: str, + output_model: str, + **kwargs, +): + if FROM == '1.3': + convert_13_to_20(input_model, output_model) + else: + raise RuntimeError('unsupported model version ' + FROM) diff --git a/deepmd/entrypoints/main.py b/deepmd/entrypoints/main.py index b245053053..04dc245271 100644 --- a/deepmd/entrypoints/main.py +++ b/deepmd/entrypoints/main.py @@ -3,7 +3,7 @@ import argparse import logging from pathlib import Path -from typing import List, Optional +from typing import Dict, List, Optional from deepmd.entrypoints import ( compress, @@ -14,6 +14,7 @@ train, transfer, make_model_devi, + convert, ) from deepmd.loggers import set_log_handles @@ 
-359,6 +360,34 @@ def parse_args(args: Optional[List[str]] = None): help="The trajectory frequency of the system" ) + # * convert models + # supported: 1.3->2.0 + parser_transform = subparsers.add_parser( + 'convert-from', + parents=[parser_log], + help='convert lower model version to supported version', + ) + parser_transform.add_argument( + 'FROM', + type = str, + choices = ['1.3'], + help="The original model compatibility", + ) + parser_transform.add_argument( + '-i', + "--input-model", + default = "frozen_model.pb", + type=str, + help = "the input model", + ) + parser_transform.add_argument( + "-o", + "--output-model", + default = "convert_out.pb", + type=str, + help='the output model', + ) + parsed_args = parser.parse_args(args=args) if parsed_args.command is None: parser.print_help() @@ -402,6 +431,8 @@ def main(): doc_train_input() elif args.command == "model-devi": make_model_devi(**dict_args) + elif args.command == "convert-from": + convert(**dict_args) elif args.command is None: pass else: diff --git a/deepmd/utils/convert.py b/deepmd/utils/convert.py new file mode 100644 index 0000000000..0d9c39df88 --- /dev/null +++ b/deepmd/utils/convert.py @@ -0,0 +1,59 @@ +import os +from deepmd.env import tf +from google.protobuf import text_format +from tensorflow.python.platform import gfile + +def convert_13_to_20(input_model: str, output_model: str): + convert_pb_to_pbtxt(input_model, 'frozen_model.pbtxt') + convert_dp13_to_dp20('frozen_model.pbtxt') + convert_pbtxt_to_pb('frozen_model.pbtxt', output_model) + if os.path.isfile('frozen_model.pbtxt'): + os.remove('frozen_model.pbtxt') + print("the converted output model (2.0 support) is saved in %s" % output_model) + +def convert_pb_to_pbtxt(pbfile: str, pbtxtfile: str): + with gfile.FastGFile(pbfile, 'rb') as f: + graph_def = tf.GraphDef() + graph_def.ParseFromString(f.read()) + tf.import_graph_def(graph_def, name='') + tf.train.write_graph(graph_def, './', pbtxtfile, as_text=True) + +def 
convert_pbtxt_to_pb(pbtxtfile: str, pbfile: str): + with tf.gfile.FastGFile(pbtxtfile, 'r') as f: + graph_def = tf.GraphDef() + file_content = f.read() + # Merges the human-readable string in `file_content` into `graph_def`. + text_format.Merge(file_content, graph_def) + tf.train.write_graph(graph_def, './', pbfile, as_text=False) + +def convert_dp13_to_dp20(fname: str): + with open(fname) as fp: + file_content = fp.read() + file_content += """ +node { + name: "model_attr/model_version" + op: "Const" + attr { + key: "dtype" + value { + type: DT_STRING + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_STRING + tensor_shape { + } + string_val: "1.0" + } + } + } +} +""" + file_content = file_content\ + .replace('DescrptSeA', 'ProdEnvMatA')\ + .replace('DescrptSeR', 'ProdEnvMatR') + with open(fname, 'w') as fp: + fp.write(file_content) diff --git a/source/api_cc/include/common.h b/source/api_cc/include/common.h index d59878693e..75fd61a6f7 100644 --- a/source/api_cc/include/common.h +++ b/source/api_cc/include/common.h @@ -87,6 +87,10 @@ void get_env_nthreads(int & num_intra_nthreads, int & num_inter_nthreads); +struct +tf_exception: public std::exception { +}; + /** * @brief Check TensorFlow status. Exit if not OK. * @param[in] status TensorFlow status. diff --git a/source/api_cc/src/DeepPot.cc b/source/api_cc/src/DeepPot.cc index 9400b47691..c862bb84fd 100644 --- a/source/api_cc/src/DeepPot.cc +++ b/source/api_cc/src/DeepPot.cc @@ -254,7 +254,12 @@ init (const std::string & model, const int & gpu_rank, const std::string & file_ if (dfparam < 0) dfparam = 0; if (daparam < 0) daparam = 0; model_type = get_scalar("model_attr/model_type"); + try{ model_version = get_scalar("model_attr/model_version"); + } catch (deepmd::tf_exception& e){ + // no model version defined in old models + model_version = "0.0"; + } if(! 
model_compatable(model_version)){ throw std::runtime_error( "incompatable model: version " + model_version diff --git a/source/api_cc/src/common.cc b/source/api_cc/src/common.cc index 74c317529e..579216cb2c 100644 --- a/source/api_cc/src/common.cc +++ b/source/api_cc/src/common.cc @@ -201,7 +201,7 @@ deepmd:: check_status(const tensorflow::Status& status) { if (!status.ok()) { std::cout << status.ToString() << std::endl; - exit(1); + throw deepmd::tf_exception(); } } diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc index e4e12cac2b..6320f1f501 100644 --- a/source/op/prod_env_mat_multi_device.cc +++ b/source/op/prod_env_mat_multi_device.cc @@ -25,6 +25,26 @@ REGISTER_OP("ProdEnvMatA") .Output("nlist: int32"); // only sel_a and rcut_r uesd. +// an alias of ProdEnvMatA -- Compatible with v1.3 +REGISTER_OP("DescrptSeA") + .Attr("T: {float, double}") + .Input("coord: T") + .Input("type: int32") + .Input("natoms: int32") + .Input("box : T") + .Input("mesh : int32") + .Input("davg: T") + .Input("dstd: T") + .Attr("rcut_a: float") + .Attr("rcut_r: float") + .Attr("rcut_r_smth: float") + .Attr("sel_a: list(int)") + .Attr("sel_r: list(int)") + .Output("descrpt: T") + .Output("descrpt_deriv: T") + .Output("rij: T") + .Output("nlist: int32"); + REGISTER_OP("ProdEnvMatR") .Attr("T: {float, double}") .Input("coord: T") @@ -42,6 +62,23 @@ REGISTER_OP("ProdEnvMatR") .Output("rij: T") .Output("nlist: int32"); +// an alias of ProdEnvMatR -- Compatible with v1.3 +REGISTER_OP("DescrptSeR") + .Attr("T: {float, double}") + .Input("coord: T") + .Input("type: int32") + .Input("natoms: int32") + .Input("box: T") + .Input("mesh: int32") + .Input("davg: T") + .Input("dstd: T") + .Attr("rcut: float") + .Attr("rcut_smth: float") + .Attr("sel: list(int)") + .Output("descrpt: T") + .Output("descrpt_deriv: T") + .Output("rij: T") + .Output("nlist: int32"); template static int @@ -1364,17 +1401,25 @@ _prepare_coord_nlist_gpu_rocm( // Register the CPU 
kernels. +// Compatible with v1.3 #define REGISTER_CPU(T) \ REGISTER_KERNEL_BUILDER( \ Name("ProdEnvMatA").Device(DEVICE_CPU).TypeConstraint("T"), \ ProdEnvMatAOp); \ REGISTER_KERNEL_BUILDER( \ Name("ProdEnvMatR").Device(DEVICE_CPU).TypeConstraint("T"), \ - ProdEnvMatROp); + ProdEnvMatROp); \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdEnvMatAOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeR").Device(DEVICE_CPU).TypeConstraint("T"), \ + ProdEnvMatROp); REGISTER_CPU(float); REGISTER_CPU(double); // Register the GPU kernels. +// Compatible with v1.3 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define REGISTER_GPU(T) \ REGISTER_KERNEL_BUILDER( \ @@ -1382,6 +1427,12 @@ REGISTER_KERNEL_BUILDER( ProdEnvMatAOp); \ REGISTER_KERNEL_BUILDER( \ Name("ProdEnvMatR").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms").HostMemory("box"), \ + ProdEnvMatROp); \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeA").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms").HostMemory("box"), \ + ProdEnvMatAOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("DescrptSeR").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("natoms").HostMemory("box"), \ ProdEnvMatROp); REGISTER_GPU(float); REGISTER_GPU(double); From 7d145c5a0329c683d0151dc6bef50154a9fea86c Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sat, 26 Jun 2021 06:05:24 -0400 Subject: [PATCH 7/8] add documents for conda (#798) * add documentation for conda fix #650, #740, #750 * add docs * Update install.md Co-authored-by: Han Wang --- doc/install.md | 46 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/doc/install.md b/doc/install.md index e2d7859097..cd0c944161 100644 --- a/doc/install.md +++ b/doc/install.md @@ -3,6 +3,7 @@ - [Easy installation methods](#easy-installation-methods) - [Install from source code](#install-from-source-code) - [Install i-PI](#install-i-pi) +- [Building conda 
packages](#building-conda-packages) ## Easy installation methods @@ -18,17 +19,33 @@ After your easy installation, DeePMD-kit (`dp`) and LAMMPS (`lmp`) will be avail ### Install off-line packages Both CPU and GPU version offline packages are avaiable in [the Releases page](https://github.com/deepmodeling/deepmd-kit/releases). +Some packages are splited into two files due to size limit of GitHub. One may merge them into one after downloading: +```bash +cat deepmd-kit-2.0.0-cuda11.1_gpu-Linux-x86_64.sh.0 deepmd-kit-2.0.0-cuda11.1_gpu-Linux-x86_64.sh.1 > deepmd-kit-2.0.0-cuda11.1_gpu-Linux-x86_64.sh +``` + ### Install with conda DeePMD-kit is avaiable with [conda](https://github.com/conda/conda). Install [Anaconda](https://www.anaconda.com/distribution/#download-section) or [Miniconda](https://docs.conda.io/en/latest/miniconda.html) first. -To install the CPU version: +One may create an environment that contains the CPU version of DeePMD-kit and LAMMPS: +```bash +conda create -n deepmd deepmd-kit=*=*cpu lammps-dp=*=*cpu -c https://conda.deepmodeling.org +``` + +Or one may want to create a GPU environment containing [CUDA Toolkit](https://docs.nvidia.com/deploy/cuda-compatibility/index.html#binary-compatibility__table-toolkit-driver): ```bash -conda install deepmd-kit=*=*cpu lammps-dp=*=*cpu -c deepmodeling +conda create -n deepmd deepmd-kit=*=*gpu lammps-dp=*=*gpu cudatoolkit=11.1 -c https://conda.deepmodeling.org -c nvidia ``` +One could change the CUDA Toolkit version from `11.1` to `10.1` or `10.0`. 
-To install the GPU version containing [CUDA 10.1](https://docs.nvidia.com/deploy/cuda-compatibility/index.html#binary-compatibility__table-toolkit-driver): +One may speficy the DeePMD-kit version such as `2.0.0` using ```bash -conda install deepmd-kit=*=*gpu lammps-dp=*=*gpu -c deepmodeling +conda create -n deepmd deepmd-kit=2.0.0=*cpu lammps-dp=2.0.0=*cpu -c https://conda.deepmodeling.org +``` + +One may enable the environment using +```bash +conda activate deepmd ``` ### Install with docker @@ -249,3 +266,24 @@ Test with Pytest: pip install pytest pytest --pyargs ipi.tests ``` + +## Building conda packages + +One may want to keep both convenience and personalization of the DeePMD-kit. To achieve this goal, one can consider builing conda packages. We provide building scripts in [deepmd-kit-recipes organization](https://github.com/deepmd-kit-recipes/). These building tools are driven by [conda-build](https://github.com/conda/conda-build) and [conda-smithy](https://github.com/conda-forge/conda-smithy). + +For example, if one wants to turn on `MPIIO` package in LAMMPS, go to [`lammps-dp-feedstock`](https://github.com/deepmd-kit-recipes/lammps-dp-feedstock/) repository and modify `recipe/build.sh`. `-D PKG_MPIIO=OFF` should be changed to `-D PKG_MPIIO=ON`. Then go to the main directory and executing + +```sh +./build-locally.py +``` + +This requires the Docker has been installed. 
After the building, the packages will be generated in `build_artifacts/linux-64` and `build_artifacts/noarch`, and then one can install then execuating +```sh +conda create -n deepmd lammps-dp -c file:///path/to/build_artifacts -c https://conda.deepmodeling.org -c nvidia +``` + +One may also upload packages to one's Anaconda channel, so they can be installed on other machines: + +```sh +anaconda upload /path/to/build_artifacts/linux-64/*.tar.bz2 /path/to/build_artifacts/noarch/*.tar.bz2 +``` From b145ca336e63a6d2628d8cdd3970ac395e577b98 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sat, 26 Jun 2021 06:10:48 -0400 Subject: [PATCH 8/8] throw a message if tf runtime is incompatible (#797) * throw a message if tf runtime is incompatible fix #557 and #796. * still raise if tf version is correct * detect TF_CXX11_ABI_FLAG * format codes * fix lint * move messages into the function * fix lint * fix lints --- deepmd/env.py | 82 +++++++++++++++++++++++++------ source/cmake/Findtensorflow.cmake | 19 ++++++- source/cmake/tf_version.cpp | 10 ++++ source/config/run_config.ini | 2 + 4 files changed, 98 insertions(+), 15 deletions(-) create mode 100644 source/cmake/tf_version.cpp diff --git a/deepmd/env.py b/deepmd/env.py index 8c6937b7f7..5f5c344031 100644 --- a/deepmd/env.py +++ b/deepmd/env.py @@ -1,13 +1,14 @@ """Module that sets tensorflow working environment and exports inportant constants.""" -import os -from pathlib import Path import logging +import os import platform -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Any -import numpy as np -from imp import reload from configparser import ConfigParser +from imp import reload +from pathlib import Path +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple + +import numpy as np if TYPE_CHECKING: from types import ModuleType @@ -37,6 +38,7 @@ SHARED_LIB_MODULE = "op" + def set_env_if_empty(key: str, value: str, verbose: bool = True): """Set environment variable only if it is empty. 
@@ -74,7 +76,8 @@ def set_mkl(): """ if "mkl_rt" in np.__config__.get_info("blas_mkl_info").get("libraries", []): set_env_if_empty("KMP_BLOCKTIME", "0") - set_env_if_empty("KMP_AFFINITY", "granularity=fine,verbose,compact,1,0") + set_env_if_empty( + "KMP_AFFINITY", "granularity=fine,verbose,compact,1,0") reload(np) @@ -118,8 +121,10 @@ def get_tf_session_config() -> Any: intra_op_parallelism_threads=intra, inter_op_parallelism_threads=inter ) + default_tf_session_config = get_tf_session_config() + def get_module(module_name: str) -> "ModuleType": """Load force module. @@ -149,14 +154,59 @@ def get_module(module_name: str) -> "ModuleType": if not module_file.is_file(): raise FileNotFoundError(f"module {module_name} does not exist") else: - module = tf.load_op_library(str(module_file)) + try: + module = tf.load_op_library(str(module_file)) + except tf.errors.NotFoundError as e: + # check CXX11_ABI_FLAG is compatiblity + # see https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html + # ABI should be the same + if 'CXX11_ABI_FLAG' in tf.__dict__: + tf_cxx11_abi_flag = tf.CXX11_ABI_FLAG + else: + tf_cxx11_abi_flag = tf.sysconfig.CXX11_ABI_FLAG + if TF_CXX11_ABI_FLAG != tf_cxx11_abi_flag: + raise RuntimeError( + "This deepmd-kit package was compiled with " + "CXX11_ABI_FLAG=%d, but TensorFlow runtime was compiled " + "with CXX11_ABI_FLAG=%d. These two library ABIs are " + "incompatible and thus an error is raised when loading %s." + "You need to rebuild deepmd-kit against this TensorFlow " + "runtime." % ( + TF_CXX11_ABI_FLAG, + tf_cxx11_abi_flag, + module_name, + )) from e + + # different versions may cause incompatibility + # see #406, #447, #557, #774, and #796 for example + # throw a message if versions are different + if TF_VERSION != tf.version.VERSION: + raise RuntimeError( + "The version of TensorFlow used to compile this " + "deepmd-kit package is %s, but the version of TensorFlow " + "runtime you are using is %s. 
These two versions are " + "incompatible and thus an error is raised when loading %s. " + "You need to install TensorFlow %s, or rebuild deepmd-kit " + "against TensorFlow %s.\nIf you are using a wheel from " + "pypi, you may consider to install deepmd-kit execuating " + "`pip install deepmd-kit --no-binary deepmd-kit` " + "instead." % ( + TF_VERSION, + tf.version.VERSION, + module_name, + TF_VERSION, + tf.version.VERSION, + )) from e + raise RuntimeError( + "This deepmd-kit package is inconsitent with TensorFlow" + "Runtime, thus an error is raised when loading %s." + "You need to rebuild deepmd-kit against this TensorFlow" + "runtime." % ( + module_name, + )) from e return module -op_module = get_module("libop_abi") -op_grads_module = get_module("libop_grads") - - def _get_package_constants( config_file: Path = Path(__file__).parent / "pkg_config/run_config.ini", ) -> Dict[str, str]: @@ -165,7 +215,7 @@ def _get_package_constants( Parameters ---------- config_file : str, optional - path to CONFIG file, by default "config/run_config.ini" + path to CONFIG file, by default "pkg_config/run_config.ini" Returns ------- @@ -176,8 +226,14 @@ def _get_package_constants( config.read(config_file) return dict(config.items("CONFIG")) + GLOBAL_CONFIG = _get_package_constants() MODEL_VERSION = GLOBAL_CONFIG["model_version"] +TF_VERSION = GLOBAL_CONFIG["tf_version"] +TF_CXX11_ABI_FLAG = int(GLOBAL_CONFIG["tf_cxx11_abi_flag"]) + +op_module = get_module("libop_abi") +op_grads_module = get_module("libop_grads") if GLOBAL_CONFIG["precision"] == "-DHIGH_PREC": GLOBAL_TF_FLOAT_PRECISION = tf.float64 @@ -221,5 +277,3 @@ def global_cvt_2_ener_float(xx: tf.Tensor) -> tf.Tensor: output tensor cast to `GLOBAL_ENER_FLOAT_PRECISION` """ return tf.cast(xx, GLOBAL_ENER_FLOAT_PRECISION) - - diff --git a/source/cmake/Findtensorflow.cmake b/source/cmake/Findtensorflow.cmake index 8901c698b9..91ed0809a3 100644 --- a/source/cmake/Findtensorflow.cmake +++ b/source/cmake/Findtensorflow.cmake @@ 
-137,10 +137,27 @@ else (BUILD_CPP_IF) endif () endif (BUILD_CPP_IF) +# detect TensorFlow version +try_run( + TENSORFLOW_VERSION_RUN_RESULT_VAR TENSORFLOW_VERSION_COMPILE_RESULT_VAR + ${CMAKE_CURRENT_BINARY_DIR}/tf_version + "${CMAKE_CURRENT_LIST_DIR}/tf_version.cpp" + LINK_LIBRARIES ${TensorFlowFramework_LIBRARY} + CMAKE_FLAGS "-DINCLUDE_DIRECTORIES:STRING=${TensorFlow_INCLUDE_DIRS}" + RUN_OUTPUT_VARIABLE TENSORFLOW_VERSION + COMPILE_OUTPUT_VARIABLE TENSORFLOW_VERSION_COMPILE_OUTPUT_VAR +) +if (NOT ${TENSORFLOW_VERSION_COMPILE_RESULT_VAR}) + message(FATAL_ERROR "Failed to compile: \n ${TENSORFLOW_VERSION_COMPILE_OUTPUT_VAR}" ) +endif() +if (NOT ${TENSORFLOW_VERSION_RUN_RESULT_VAR} EQUAL "0") + message(FATAL_ERROR "Failed to run, return code: ${TENSORFLOW_VERSION}" ) +endif() + # print message if (NOT TensorFlow_FIND_QUIETLY) message(STATUS "Found TensorFlow: ${TensorFlow_INCLUDE_DIRS}, ${TensorFlow_LIBRARY}, ${TensorFlowFramework_LIBRARY} " - " in ${TensorFlow_search_PATHS}") + " in ${TensorFlow_search_PATHS} (found version \"${TENSORFLOW_VERSION}\")") endif () unset(TensorFlow_search_PATHS) diff --git a/source/cmake/tf_version.cpp b/source/cmake/tf_version.cpp new file mode 100644 index 0000000000..9d129aefb8 --- /dev/null +++ b/source/cmake/tf_version.cpp @@ -0,0 +1,10 @@ +#include +#include "tensorflow/core/public/version.h" + +int main(int argc, char * argv[]) +{ + // See https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/public/version.h + // TF_VERSION_STRING has been avaiable since TensorFlow v0.6 + std::cout << TF_VERSION_STRING; + return 0; +} diff --git a/source/config/run_config.ini b/source/config/run_config.ini index 3f2e8cc86a..bb04319e47 100644 --- a/source/config/run_config.ini +++ b/source/config/run_config.ini @@ -6,5 +6,7 @@ GIT_DATE = @GIT_DATE@ GIT_BRANCH = @GIT_BRANCH@ TF_INCLUDE_DIR = @TensorFlow_INCLUDE_DIRS@ TF_LIBS = @TensorFlow_LIBRARY@ +TF_VERSION = @TENSORFLOW_VERSION@ +TF_CXX11_ABI_FLAG = @OP_CXX_ABI@ PRECISION = 
@PREC_DEF@ MODEL_VERSION=@MODEL_VERSION@