diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py index ce8af71379..74b12a412a 100644 --- a/deepmd/descriptor/se_a.py +++ b/deepmd/descriptor/se_a.py @@ -323,12 +323,19 @@ def enable_compression(self, suffix : str, optional The suffix of the scope """ + # do some checks before the model compression process assert ( not self.filter_resnet_dt ), "Model compression error: descriptor resnet_dt must be false!" + for tt in self.exclude_types: + if (tt[0] not in range(self.ntypes)) or (tt[1] not in range(self.ntypes)): + raise RuntimeError("exclude types" + str(tt) + " must within the number of atomic types " + str(self.ntypes) + "!") + if (self.ntypes * self.ntypes - len(self.exclude_types) == 0): + raise RuntimeError("empty embedding-net are not supported in model compression!") + self.compress = True self.table = DPTabulate( - model_file, self.type_one_side, self.exclude_types, self.compress_activation_fn, suffix=suffix) + self, self.filter_neuron, model_file, self.type_one_side, self.exclude_types, self.compress_activation_fn, suffix=suffix) self.table_config = [table_extrapolate, table_stride_1, table_stride_2, check_frequency] self.lower, self.upper \ = self.table.build(min_nbor_dist, @@ -686,7 +693,7 @@ def _filter_lower( net = 'filter_-1_net_' + str(type_i) else: net = 'filter_' + str(type_input) + '_net_' + str(type_i) - return op_module.tabulate_fusion(tf.cast(self.table.data[net], self.filter_precision), info, xyz_scatter, tf.reshape(inputs_i, [natom, shape_i[1]//4, 4]), last_layer_size = outputs_size[-1]) + return op_module.tabulate_fusion_se_a(tf.cast(self.table.data[net], self.filter_precision), info, xyz_scatter, tf.reshape(inputs_i, [natom, shape_i[1]//4, 4]), last_layer_size = outputs_size[-1]) else: if (not is_exclude): xyz_scatter = embedding_net( diff --git a/deepmd/descriptor/se_t.py b/deepmd/descriptor/se_t.py index f43fb6f40c..d5ffdf3970 100644 --- a/deepmd/descriptor/se_t.py +++ b/deepmd/descriptor/se_t.py @@ -10,6 +10,8 @@ from 
deepmd.env import default_tf_session_config from deepmd.utils.network import embedding_net, embedding_net_rand_seed_shift from deepmd.utils.sess import run_sess +from deepmd.utils.graph import load_graph_def, get_tensor_by_name_from_graph +from deepmd.utils.tabulate import DPTabulate from .descriptor import Descriptor from .se import DescrptSe @@ -98,6 +100,7 @@ def __init__ (self, self.useBN = False self.dstd = None self.davg = None + self.compress = False self.embedding_net_variables = None self.place_holders = {} @@ -224,6 +227,53 @@ def compute_input_stats (self, self.dstd = np.array(all_dstd) + def enable_compression(self, + min_nbor_dist : float, + model_file : str = 'frozon_model.pb', + table_extrapolate : float = 5, + table_stride_1 : float = 0.01, + table_stride_2 : float = 0.1, + check_frequency : int = -1, + suffix : str = "", + ) -> None: + """ + Receive the statistics (distance, max_nbor_size and env_mat_range) of the training data. + + Parameters + ---------- + min_nbor_dist + The nearest distance between atoms + model_file + The original frozen model, which will be compressed by the program + table_extrapolate + The scale of model extrapolation + table_stride_1 + The uniform stride of the first table + table_stride_2 + The uniform stride of the second table + check_frequency + The overflow check frequency + suffix : str, optional + The suffix of the scope + """ + assert ( + not self.filter_resnet_dt + ), "Model compression error: descriptor resnet_dt must be false!" 
+ self.compress = True + self.table = DPTabulate( + self, self.filter_neuron, model_file, activation_fn = self.filter_activation_fn, suffix=suffix) + self.table_config = [table_extrapolate, table_stride_1 * 10, table_stride_2 * 10, check_frequency] + self.lower, self.upper \ + = self.table.build(min_nbor_dist, + table_extrapolate, + table_stride_1 * 10, + table_stride_2 * 10) + + graph, _ = load_graph_def(model_file) + self.davg = get_tensor_by_name_from_graph(graph, 'descrpt_attr%s/t_avg' % suffix) + self.dstd = get_tensor_by_name_from_graph(graph, 'descrpt_attr%s/t_std' % suffix) + + def build (self, coord_ : tf.Tensor, atype_ : tf.Tensor, @@ -497,25 +547,30 @@ def _filter(self, env_ij = tf.einsum('ijm,ikm->ijk', env_i, env_j) # with (natom x nei_type_i x nei_type_j) ebd_env_ij = tf.reshape(env_ij, [-1, 1]) - # with (natom x nei_type_i x nei_type_j) x out_size - ebd_env_ij = embedding_net(ebd_env_ij, - self.filter_neuron, - self.filter_precision, - activation_fn = activation_fn, - resnet_dt = self.filter_resnet_dt, - name_suffix = f"_{type_i}_{type_j}", - stddev = stddev, - bavg = bavg, - seed = self.seed, - trainable = trainable, - uniform_seed = self.uniform_seed, - initial_variables = self.embedding_net_variables, - ) - if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift - # with natom x nei_type_i x nei_type_j x out_size - ebd_env_ij = tf.reshape(ebd_env_ij, [-1, nei_type_i, nei_type_j, outputs_size[-1]]) - # with natom x out_size - res_ij = tf.einsum('ijk,ijkm->im', env_ij, ebd_env_ij) + if self.compress: + info = [self.lower, self.upper, self.upper * self.table_config[0], self.table_config[1], self.table_config[2], self.table_config[3]] + net = 'filter_' + str(type_i) + '_net_' + str(type_j) + res_ij = op_module.tabulate_fusion_se_t(tf.cast(self.table.data[net], self.filter_precision), info, ebd_env_ij, env_ij, last_layer_size = outputs_size[-1]) + else: + # with (natom x nei_type_i x nei_type_j) x out_size + ebd_env_ij = 
embedding_net(ebd_env_ij, + self.filter_neuron, + self.filter_precision, + activation_fn = activation_fn, + resnet_dt = self.filter_resnet_dt, + name_suffix = f"_{type_i}_{type_j}", + stddev = stddev, + bavg = bavg, + seed = self.seed, + trainable = trainable, + uniform_seed = self.uniform_seed, + initial_variables = self.embedding_net_variables, + ) + if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift + # with natom x nei_type_i x nei_type_j x out_size + ebd_env_ij = tf.reshape(ebd_env_ij, [-1, nei_type_i, nei_type_j, outputs_size[-1]]) + # with natom x out_size + res_ij = tf.einsum('ijk,ijkm->im', env_ij, ebd_env_ij) res_ij = res_ij * (1.0 / float(nei_type_i) / float(nei_type_j)) if result is None: result = res_ij diff --git a/deepmd/entrypoints/compress.py b/deepmd/entrypoints/compress.py index b03e8cf653..8cb8070594 100644 --- a/deepmd/entrypoints/compress.py +++ b/deepmd/entrypoints/compress.py @@ -93,8 +93,6 @@ def compress( name = 'train_attr/min_nbor_dist', dtype = GLOBAL_ENER_FLOAT_PRECISION) jdata["model"]["compress"] = {} - jdata["model"]["compress"]["type"] = 'se_e2_a' - jdata["model"]["compress"]["compress"] = True jdata["model"]["compress"]["model_file"] = input jdata["model"]["compress"]["min_nbor_dist"] = t_min_nbor_dist jdata["model"]["compress"]["table_config"] = [ diff --git a/deepmd/entrypoints/convert.py b/deepmd/entrypoints/convert.py index 25f2271cdb..3f277c5134 100644 --- a/deepmd/entrypoints/convert.py +++ b/deepmd/entrypoints/convert.py @@ -1,4 +1,4 @@ -from deepmd.utils.convert import convert_13_to_20, convert_12_to_20 +from deepmd.utils.convert import convert_20_to_21, convert_13_to_21, convert_12_to_21 def convert( *, @@ -8,8 +8,10 @@ def convert( **kwargs, ): if FROM == '1.2': - convert_12_to_20(input_model, output_model) + convert_12_to_21(input_model, output_model) elif FROM == '1.3': - convert_13_to_20(input_model, output_model) + convert_13_to_21(input_model, output_model) + elif FROM == '2.0': + 
convert_20_to_21(input_model, output_model) else: raise RuntimeError('unsupported model version ' + FROM) diff --git a/deepmd/entrypoints/main.py b/deepmd/entrypoints/main.py index 721eed357c..4f772526c8 100644 --- a/deepmd/entrypoints/main.py +++ b/deepmd/entrypoints/main.py @@ -386,7 +386,7 @@ def parse_args(args: Optional[List[str]] = None): parser_transform.add_argument( 'FROM', type = str, - choices = ['1.2', '1.3'], + choices = ['1.2', '1.3', '2.0'], help="The original model compatibility", ) parser_transform.add_argument( diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index 857dc04e10..db31469c29 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -371,13 +371,11 @@ def modifier_variant_type_args(): # --- model compression configurations: --- # def model_compression(): - doc_compress = f"The name of the frozen model file." doc_model_file = f"The input model file, which will be compressed by the DeePMD-kit." doc_table_config = f"The arguments of model compression, including extrapolate(scale of model extrapolation), stride(uniform stride of tabulation's first and second table), and frequency(frequency of tabulation overflow check)." doc_min_nbor_dist = f"The nearest distance between neighbor atoms saved in the frozen model." 
return [ - Argument("compress", bool, optional = False, doc = doc_compress), Argument("model_file", str, optional = False, doc = doc_model_file), Argument("table_config", list, optional = False, doc = doc_table_config), Argument("min_nbor_dist", float, optional = False, doc = doc_min_nbor_dist), diff --git a/deepmd/utils/convert.py b/deepmd/utils/convert.py index 6ace6a2132..b17178c761 100644 --- a/deepmd/utils/convert.py +++ b/deepmd/utils/convert.py @@ -3,22 +3,32 @@ from google.protobuf import text_format from tensorflow.python.platform import gfile -def convert_13_to_20(input_model: str, output_model: str): +def convert_13_to_21(input_model: str, output_model: str): convert_pb_to_pbtxt(input_model, 'frozen_model.pbtxt') convert_dp13_to_dp20('frozen_model.pbtxt') + convert_dp20_to_dp21('frozen_model.pbtxt') convert_pbtxt_to_pb('frozen_model.pbtxt', output_model) if os.path.isfile('frozen_model.pbtxt'): os.remove('frozen_model.pbtxt') - print("the converted output model (2.0 support) is saved in %s" % output_model) + print("the converted output model (2.1 support) is saved in %s" % output_model) -def convert_12_to_20(input_model: str, output_model: str): +def convert_12_to_21(input_model: str, output_model: str): convert_pb_to_pbtxt(input_model, 'frozen_model.pbtxt') convert_dp12_to_dp13('frozen_model.pbtxt') convert_dp13_to_dp20('frozen_model.pbtxt') + convert_dp20_to_dp21('frozen_model.pbtxt') convert_pbtxt_to_pb('frozen_model.pbtxt', output_model) if os.path.isfile('frozen_model.pbtxt'): os.remove('frozen_model.pbtxt') - print("the converted output model (2.0 support) is saved in %s" % output_model) + print("the converted output model (2.1 support) is saved in %s" % output_model) + +def convert_20_to_21(input_model: str, output_model: str): + convert_pb_to_pbtxt(input_model, 'frozen_model.pbtxt') + convert_dp20_to_dp21('frozen_model.pbtxt') + convert_pbtxt_to_pb('frozen_model.pbtxt', output_model) + if os.path.isfile('frozen_model.pbtxt'): + 
os.remove('frozen_model.pbtxt') + print("the converted output model (2.1 support) is saved in %s" % output_model) def convert_pb_to_pbtxt(pbfile: str, pbtxtfile: str): with gfile.FastGFile(pbfile, 'rb') as f: @@ -88,3 +98,60 @@ def convert_dp13_to_dp20(fname: str): .replace('DescrptSeR', 'ProdEnvMatR') with open(fname, 'w') as fp: fp.write(file_content) + +def convert_dp20_to_dp21(fname: str): + with open(fname) as fp: + file_content = fp.read() + old_model_version_node = """ +node { + name: "model_attr/model_version" + op: "Const" + attr { + key: "dtype" + value { + type: DT_STRING + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_STRING + tensor_shape { + } + string_val: "1.0" + } + } + } +} +""" + new_model_version_node = """ +node { + name: "model_attr/model_version" + op: "Const" + attr { + key: "dtype" + value { + type: DT_STRING + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_STRING + tensor_shape { + } + string_val: "1.1" + } + } + } +} +""" + file_content = file_content\ + .replace(old_model_version_node, new_model_version_node)\ + .replace('TabulateFusion', 'TabulateFusionSeA')\ + .replace('TabulateFusionGrad', 'TabulateFusionSeAGrad')\ + .replace('TabulateFusionGradGrad', 'TabulateFusionSeAGradGrad') + with open(fname, 'w') as fp: + fp.write(file_content) \ No newline at end of file diff --git a/deepmd/utils/tabulate.py b/deepmd/utils/tabulate.py index fe95173890..f0dc571142 100644 --- a/deepmd/utils/tabulate.py +++ b/deepmd/utils/tabulate.py @@ -2,14 +2,17 @@ import math import logging import numpy as np +import deepmd from typing import Callable from typing import Tuple, List +from scipy.special import comb from deepmd.env import tf from deepmd.env import op_module from deepmd.common import ACTIVATION_FN_DICT from deepmd.utils.sess import run_sess from deepmd.utils.graph import get_tensor_by_name_from_graph, load_graph_def from deepmd.utils.graph import get_embedding_net_nodes_from_graph_def +from deepmd.descriptor 
import Descriptor from tensorflow.python.platform import gfile from tensorflow.python.framework import tensor_util @@ -25,6 +28,10 @@ class DPTabulate(): Parameters ---------- + descrpt + Descriptor of the original model + neuron + Number of neurons in each hidden layers of the embedding net :math:`\mathcal{N}` model_file The frozen model type_one_side @@ -38,6 +45,8 @@ class DPTabulate(): The suffix of the scope """ def __init__(self, + descrpt : Descriptor, + neuron : List[int], model_file : str, type_one_side : bool = False, exclude_types : List[List[int]] = [], @@ -47,13 +56,12 @@ def __init__(self, """ Constructor """ - + self.descrpt = descrpt + self.neuron = neuron self.model_file = model_file self.type_one_side = type_one_side self.exclude_types = exclude_types self.suffix = suffix - if self.type_one_side and len(self.exclude_types) != 0: - raise RuntimeError('"type_one_side" is not compatible with "exclude_types"') # functype if activation_fn == ACTIVATION_FN_DICT["tanh"]: @@ -72,42 +80,36 @@ def __init__(self, try: self.sel_a = self.graph.get_operation_by_name('ProdEnvMatA').get_attr('sel_a') - self.descrpt = self.graph.get_operation_by_name ('ProdEnvMatA') + self.prod_env_mat_op = self.graph.get_operation_by_name ('ProdEnvMatA') except Exception: self.sel_a = self.graph.get_operation_by_name('DescrptSeA').get_attr('sel_a') - self.descrpt = self.graph.get_operation_by_name ('DescrptSeA') + self.prod_env_mat_op = self.graph.get_operation_by_name ('DescrptSeA') self.davg = get_tensor_by_name_from_graph(self.graph, f'descrpt_attr{self.suffix}/t_avg') self.dstd = get_tensor_by_name_from_graph(self.graph, f'descrpt_attr{self.suffix}/t_std') self.ntypes = get_tensor_by_name_from_graph(self.graph, 'descrpt_attr/ntypes') - self.rcut = self.descrpt.get_attr('rcut_r') - self.rcut_smth = self.descrpt.get_attr('rcut_r_smth') + self.rcut = self.prod_env_mat_op.get_attr('rcut_r') + self.rcut_smth = self.prod_env_mat_op.get_attr('rcut_r_smth') self.embedding_net_nodes = 
get_embedding_net_nodes_from_graph_def(self.graph_def, suffix=self.suffix) - for tt in self.exclude_types: - if (tt[0] not in range(self.ntypes)) or (tt[1] not in range(self.ntypes)): - raise RuntimeError("exclude types" + str(tt) + " must within the number of atomic types " + str(self.ntypes) + "!") - if (self.ntypes * self.ntypes - len(self.exclude_types) == 0): - raise RuntimeError("empty embedding-net are not supported in model compression!") - - self.layer_size = len(self.embedding_net_nodes) // ((self.ntypes * self.ntypes - len(self.exclude_types)) * 2) - self.table_size = self.ntypes * self.ntypes - if type_one_side : - self.layer_size = len(self.embedding_net_nodes) // (self.ntypes * 2) - self.table_size = self.ntypes - # self.value_type = self.embedding_net_nodes["filter_type_0/matrix_1_0"].dtype #"filter_type_0/matrix_1_0" must exit~ - # get trained variables + # move it to the descriptor class + # for tt in self.exclude_types: + # if (tt[0] not in range(self.ntypes)) or (tt[1] not in range(self.ntypes)): + # raise RuntimeError("exclude types" + str(tt) + " must within the number of atomic types " + str(self.ntypes) + "!") + # if (self.ntypes * self.ntypes - len(self.exclude_types) == 0): + # raise RuntimeError("empty embedding-net are not supported in model compression!") + self.layer_size = self._get_layer_size() + self.table_size = self._get_table_size() + self.bias = self._get_bias() self.matrix = self._get_matrix() - for item in self.matrix["layer_" + str(self.layer_size)]: - if len(item) != 0: - self.data_type = type(item[0][0]) - self.last_layer_size = item.shape[1] - # define tables + self.data_type = self._get_data_type() + self.last_layer_size = self._get_last_layer_size() + self.data = {} @@ -129,6 +131,8 @@ def build(self, The uniform stride of the first table stride1 The uniform stride of the second table + neuron + Number of neurons in each hidden layers of the embedding net :math:`\mathcal{N}` Returns ---------- @@ -139,34 +143,57 @@ def 
build(self, """ # tabulate range [lower, upper] with stride0 'stride0' lower, upper = self._get_env_mat_range(min_nbor_dist) - xx = np.arange(lower, upper, stride0, dtype = self.data_type) - xx = np.append(xx, np.arange(upper, extrapolate * upper, stride1, dtype = self.data_type)) - xx = np.append(xx, np.array([extrapolate * upper], dtype = self.data_type)) - self.nspline = int((upper - lower) / stride0 + (extrapolate * upper - upper) / stride1) - for ii in range(self.table_size): - if self.type_one_side or (ii // self.ntypes, int(ii % self.ntypes)) not in self.exclude_types: - vv, dd, d2 = self._make_data(xx, ii) - if self.type_one_side: - net = "filter_-1_net_" + str(ii) - else: - net = "filter_" + str(ii // self.ntypes) + "_net_" + str(int(ii % self.ntypes)) - self.data[net] = np.zeros([self.nspline, 6 * self.last_layer_size], dtype = self.data_type) - # for jj in tqdm(range(self.nspline), desc = 'DEEPMD INFO |-> deepmd.utils.tabulate\t\t\t' + net + ', tabulating'): - for jj in range(self.nspline): - for kk in range(self.last_layer_size): - if jj < int((upper - lower) / stride0): - tt = stride0 - else: - tt = stride1 - hh = vv[jj + 1][kk] - vv[jj][kk] - self.data[net][jj][kk * 6 + 0] = vv[jj][kk] - self.data[net][jj][kk * 6 + 1] = dd[jj][kk] - self.data[net][jj][kk * 6 + 2] = 0.5 * d2[jj][kk] - self.data[net][jj][kk * 6 + 3] = (1 / (2 * tt * tt * tt)) * (20 * hh - (8 * dd[jj + 1][kk] + 12 * dd[jj][kk]) * tt - (3 * d2[jj][kk] - d2[jj + 1][kk]) * tt * tt) - self.data[net][jj][kk * 6 + 4] = (1 / (2 * tt * tt * tt * tt)) * (-30 * hh + (14 * dd[jj + 1][kk] + 16 * dd[jj][kk]) * tt + (3 * d2[jj][kk] - 2 * d2[jj + 1][kk]) * tt * tt) - self.data[net][jj][kk * 6 + 5] = (1 / (2 * tt * tt * tt * tt * tt)) * (12 * hh - 6 * (dd[jj + 1][kk] + dd[jj][kk]) * tt + (d2[jj + 1][kk] - d2[jj][kk]) * tt * tt) + + if isinstance(self.descrpt, deepmd.descriptor.DescrptSeA): + xx = np.arange(lower, upper, stride0, dtype = self.data_type) + xx = np.append(xx, np.arange(upper, extrapolate * 
upper, stride1, dtype = self.data_type)) + xx = np.append(xx, np.array([extrapolate * upper], dtype = self.data_type)) + self.nspline = int((upper - lower) / stride0 + (extrapolate * upper - upper) / stride1) + for ii in range(self.table_size): + if self.type_one_side or (ii // self.ntypes, ii % self.ntypes) not in self.exclude_types: + if self.type_one_side: + net = "filter_-1_net_" + str(ii) + else: + net = "filter_" + str(ii // self.ntypes) + "_net_" + str(ii % self.ntypes) + self._build_lower(net, xx, ii, upper, lower, stride0, stride1, extrapolate) + elif isinstance(self.descrpt, deepmd.descriptor.DescrptSeT): + xx = np.arange(extrapolate * lower, lower, stride1, dtype = self.data_type) + xx = np.append(xx, np.arange(lower, upper, stride0, dtype = self.data_type)) + xx = np.append(xx, np.arange(upper, extrapolate * upper, stride1, dtype = self.data_type)) + xx = np.append(xx, np.array([extrapolate * upper], dtype = self.data_type)) + self.nspline = int((upper - lower) / stride0 + 2 * ((extrapolate * upper - upper) / stride1)) + idx = 0 + for ii in range(self.ntypes): + for jj in range(ii, self.ntypes): + net = "filter_" + str(ii) + "_net_" + str(jj) + self._build_lower(net, xx, idx, upper, lower, stride0, stride1, extrapolate) + idx += 1 return lower, upper + def _build_lower(self, net, xx, idx, upper, lower, stride0, stride1, extrapolate): + vv, dd, d2 = self._make_data(xx, idx) + self.data[net] = np.zeros([self.nspline, 6 * self.last_layer_size], dtype = self.data_type) + # for jj in tqdm(range(self.nspline), desc = 'DEEPMD INFO |-> deepmd.utils.tabulate\t\t\t' + net + ', tabulating'): + for jj in range(self.nspline): + for kk in range(self.last_layer_size): + if isinstance(self.descrpt, deepmd.descriptor.DescrptSeA): + if jj < int((upper - lower) / stride0): + tt = stride0 + else: + tt = stride1 + elif isinstance(self.descrpt, deepmd.descriptor.DescrptSeT): + if jj > int((lower - extrapolate * lower) / stride1) and jj < (int((lower - extrapolate * lower) / 
stride1) + int((upper - lower) / stride0)): + tt = stride0 + else: + tt = stride1 + hh = vv[jj + 1][kk] - vv[jj][kk] + self.data[net][jj][kk * 6 + 0] = vv[jj][kk] + self.data[net][jj][kk * 6 + 1] = dd[jj][kk] + self.data[net][jj][kk * 6 + 2] = 0.5 * d2[jj][kk] + self.data[net][jj][kk * 6 + 3] = (1 / (2 * tt * tt * tt)) * (20 * hh - (8 * dd[jj + 1][kk] + 12 * dd[jj][kk]) * tt - (3 * d2[jj][kk] - d2[jj + 1][kk]) * tt * tt) + self.data[net][jj][kk * 6 + 4] = (1 / (2 * tt * tt * tt * tt)) * (-30 * hh + (14 * dd[jj + 1][kk] + 16 * dd[jj][kk]) * tt + (3 * d2[jj][kk] - 2 * d2[jj + 1][kk]) * tt * tt) + self.data[net][jj][kk * 6 + 5] = (1 / (2 * tt * tt * tt * tt * tt)) * (12 * hh - 6 * (dd[jj + 1][kk] + dd[jj][kk]) * tt + (d2[jj + 1][kk] - d2[jj][kk]) * tt * tt) + def _load_sub_graph(self): sub_graph_def = tf.GraphDef() with tf.Graph().as_default() as sub_graph: @@ -177,42 +204,46 @@ def _get_bias(self): bias = {} for layer in range(1, self.layer_size + 1): bias["layer_" + str(layer)] = [] - if self.type_one_side: - for ii in range(0, self.ntypes): - node = self.embedding_net_nodes[f"filter_type_all{self.suffix}/bias_{layer}_{ii}"] - tensor_value = np.frombuffer (node.tensor_content, dtype = tf.as_dtype(node.dtype).as_numpy_dtype) - tensor_shape = tf.TensorShape(node.tensor_shape).as_list() - bias["layer_" + str(layer)].append(np.reshape(tensor_value, tensor_shape)) - else: - for ii in range(0, self.ntypes * self.ntypes): - if (ii // self.ntypes, int(ii % self.ntypes)) not in self.exclude_types: - node = self.embedding_net_nodes[f"filter_type_{ii // self.ntypes}{self.suffix}/bias_{layer}_{ii % self.ntypes}"] - tensor_value = np.frombuffer(node.tensor_content, dtype = tf.as_dtype(node.dtype).as_numpy_dtype) - tensor_shape = tf.TensorShape(node.tensor_shape).as_list() - bias["layer_" + str(layer)].append(np.reshape(tensor_value, tensor_shape)) - else: - bias["layer_" + str(layer)].append(np.array([])) + if isinstance(self.descrpt, deepmd.descriptor.DescrptSeA): + if 
self.type_one_side: + for ii in range(0, self.ntypes): + node = self.embedding_net_nodes[f"filter_type_all{self.suffix}/bias_{layer}_{ii}"] + bias["layer_" + str(layer)].append(tf.make_ndarray(node)) + else: + for ii in range(0, self.ntypes * self.ntypes): + if (ii // self.ntypes, ii % self.ntypes) not in self.exclude_types: + node = self.embedding_net_nodes[f"filter_type_{ii // self.ntypes}{self.suffix}/bias_{layer}_{ii % self.ntypes}"] + bias["layer_" + str(layer)].append(tf.make_ndarray(node)) + else: + bias["layer_" + str(layer)].append(np.array([])) + elif isinstance(self.descrpt, deepmd.descriptor.DescrptSeT): + for ii in range(self.ntypes): + for jj in range(ii, self.ntypes): + node = self.embedding_net_nodes[f"filter_type_all{self.suffix}/bias_{layer}_{ii}_{jj}"] + bias["layer_" + str(layer)].append(tf.make_ndarray(node)) return bias def _get_matrix(self): matrix = {} for layer in range(1, self.layer_size + 1): matrix["layer_" + str(layer)] = [] - if self.type_one_side: - for ii in range(0, self.ntypes): - node = self.embedding_net_nodes[f"filter_type_all{self.suffix}/matrix_{layer}_{ii}"] - tensor_value = np.frombuffer (node.tensor_content, dtype = tf.as_dtype(node.dtype).as_numpy_dtype) - tensor_shape = tf.TensorShape(node.tensor_shape).as_list() - matrix["layer_" + str(layer)].append(np.reshape(tensor_value, tensor_shape)) - else: - for ii in range(0, self.ntypes * self.ntypes): - if (ii // self.ntypes, int(ii % self.ntypes)) not in self.exclude_types: - node = self.embedding_net_nodes[f"filter_type_{ii // self.ntypes}{self.suffix}/matrix_{layer}_{ii % self.ntypes}"] - tensor_value = np.frombuffer(node.tensor_content, dtype = tf.as_dtype(node.dtype).as_numpy_dtype) - tensor_shape = tf.TensorShape(node.tensor_shape).as_list() - matrix["layer_" + str(layer)].append(np.reshape(tensor_value, tensor_shape)) - else: - matrix["layer_" + str(layer)].append(np.array([])) + if isinstance(self.descrpt, deepmd.descriptor.DescrptSeA): + if self.type_one_side: + for 
ii in range(0, self.ntypes): + node = self.embedding_net_nodes[f"filter_type_all{self.suffix}/matrix_{layer}_{ii}"] + matrix["layer_" + str(layer)].append(tf.make_ndarray(node)) + else: + for ii in range(0, self.ntypes * self.ntypes): + if (ii // self.ntypes, ii % self.ntypes) not in self.exclude_types: + node = self.embedding_net_nodes[f"filter_type_{ii // self.ntypes}{self.suffix}/matrix_{layer}_{ii % self.ntypes}"] + matrix["layer_" + str(layer)].append(tf.make_ndarray(node)) + else: + matrix["layer_" + str(layer)].append(np.array([])) + elif isinstance(self.descrpt, deepmd.descriptor.DescrptSeT): + for ii in range(self.ntypes): + for jj in range(ii, self.ntypes): + node = self.embedding_net_nodes[f"filter_type_all{self.suffix}/matrix_{layer}_{ii}_{jj}"] + matrix["layer_" + str(layer)].append(tf.make_ndarray(node)) return matrix # one-by-one executions @@ -223,13 +254,28 @@ def _make_data(self, xx, idx): for layer in range(self.layer_size): if layer == 0: xbar = tf.matmul( - xx, self.matrix["layer_" + str(layer + 1)][idx]) + self.bias["layer_" + str(layer + 1)][idx] - yy = self._layer_0( - xx, self.matrix["layer_" + str(layer + 1)][idx], self.bias["layer_" + str(layer + 1)][idx]) - dy = op_module.unaggregated_dy_dx_s( - yy, self.matrix["layer_" + str(layer + 1)][idx], xbar, tf.constant(self.functype)) - dy2 = op_module.unaggregated_dy2_dx_s( - yy, dy, self.matrix["layer_" + str(layer + 1)][idx], xbar, tf.constant(self.functype)) + xx, self.matrix["layer_" + str(layer + 1)][idx]) + self.bias["layer_" + str(layer + 1)][idx] + if self.neuron[0] == 1: + yy = self._layer_0( + xx, self.matrix["layer_" + str(layer + 1)][idx], self.bias["layer_" + str(layer + 1)][idx]) + xx + dy = op_module.unaggregated_dy_dx_s( + yy, self.matrix["layer_" + str(layer + 1)][idx], xbar, tf.constant(self.functype)) + tf.ones([1, 1], yy.dtype) + dy2 = op_module.unaggregated_dy2_dx_s( + yy, dy, self.matrix["layer_" + str(layer + 1)][idx], xbar, tf.constant(self.functype)) + elif 
self.neuron[0] == 2: + tt, yy = self._layer_1( + xx, self.matrix["layer_" + str(layer + 1)][idx], self.bias["layer_" + str(layer + 1)][idx]) + dy = op_module.unaggregated_dy_dx_s( + yy - tt, self.matrix["layer_" + str(layer + 1)][idx], xbar, tf.constant(self.functype)) + tf.ones([1, 2], yy.dtype) + dy2 = op_module.unaggregated_dy2_dx_s( + yy - tt, dy, self.matrix["layer_" + str(layer + 1)][idx], xbar, tf.constant(self.functype)) + else: + yy = self._layer_0( + xx, self.matrix["layer_" + str(layer + 1)][idx], self.bias["layer_" + str(layer + 1)][idx]) + dy = op_module.unaggregated_dy_dx_s( + yy, self.matrix["layer_" + str(layer + 1)][idx], xbar, tf.constant(self.functype)) + dy2 = op_module.unaggregated_dy2_dx_s( + yy, dy, self.matrix["layer_" + str(layer + 1)][idx], xbar, tf.constant(self.functype)) else: ybar = tf.matmul( yy, self.matrix["layer_" + str(layer + 1)][idx]) + self.bias["layer_" + str(layer + 1)][idx] @@ -254,21 +300,19 @@ def _layer_1(self, x, w, b): t = tf.concat([x, x], axis=1) return t, self.activation_fn(tf.matmul(x, w) + b) + t - def _save_data(self): - for ii in range(self.ntypes * self.ntypes): - net = "filter_" + str(ii // self.ntypes) + "_net_" + str(int(ii % self.ntypes)) - np.savetxt('data_' + str(ii), self.data[net]) - + # Change the embedding net range to sw / min_nbor_dist def _get_env_mat_range(self, min_nbor_dist): - lower = 100.0 - upper = -10.0 + lower = +100.0 + upper = -100.0 sw = self._spline5_switch(min_nbor_dist, self.rcut_smth, self.rcut) - for ii in range(self.ntypes): - if lower > -self.davg[ii][0] / self.dstd[ii][0]: - lower = -self.davg[ii][0] / self.dstd[ii][0] - if upper < ((1 / min_nbor_dist) * sw - self.davg[ii][0]) / self.dstd[ii][0]: - upper = ((1 / min_nbor_dist) * sw - self.davg[ii][0]) / self.dstd[ii][0] + if isinstance(self.descrpt, deepmd.descriptor.DescrptSeA): + lower = np.min(-self.davg[:, 0] / self.dstd[:, 0]) + upper = np.max(((1 / min_nbor_dist) * sw - self.davg[:, 0]) / self.dstd[:, 0]) + elif 
isinstance(self.descrpt, deepmd.descriptor.DescrptSeT): + var = np.square(sw / (min_nbor_dist * self.dstd[:, 1:4])) + lower = np.min(-var) + upper = np.max(var) log.info('training data with lower boundary: ' + str(lower)) log.info('training data with upper boundary: ' + str(upper)) return math.floor(lower), math.ceil(upper) @@ -285,3 +329,35 @@ def _spline5_switch(self, else: vv = 0 return vv + + def _get_layer_size(self): + layer_size = 0 + if isinstance(self.descrpt, deepmd.descriptor.DescrptSeA): + layer_size = len(self.embedding_net_nodes) // ((self.ntypes * self.ntypes - len(self.exclude_types)) * 2) + if self.type_one_side : + layer_size = len(self.embedding_net_nodes) // (self.ntypes * 2) + elif isinstance(self.descrpt, deepmd.descriptor.DescrptSeT): + layer_size = len(self.embedding_net_nodes) // int(comb(self.ntypes + 1, 2) * 2) + return layer_size + + def _get_table_size(self): + table_size = 0 + if isinstance(self.descrpt, deepmd.descriptor.DescrptSeA): + table_size = self.ntypes * self.ntypes + if self.type_one_side : + table_size = self.ntypes + elif isinstance(self.descrpt, deepmd.descriptor.DescrptSeT): + table_size = int(comb(self.ntypes + 1, 2)) + return table_size + + def _get_data_type(self): + for item in self.matrix["layer_" + str(self.layer_size)]: + if len(item) != 0: + return type(item[0][0]) + return None + + def _get_last_layer_size(self): + for item in self.matrix["layer_" + str(self.layer_size)]: + if len(item) != 0: + return item.shape[1] + return 0 \ No newline at end of file diff --git a/doc/freeze/compress.md b/doc/freeze/compress.md index c7abd90737..ed55db0c4b 100644 --- a/doc/freeze/compress.md +++ b/doc/freeze/compress.md @@ -68,7 +68,7 @@ optional arguments: **Parameter explanation** Model compression, which including tabulating the embedding-net. -The table is composed of fifth-order polynomial coefficients and is assembled from two sub-tables. 
The first sub-table takes the stride(parameter) as it's uniform stride, while the second sub-table takes 10 * stride as it's uniform stride. +The table is composed of fifth-order polynomial coefficients and is assembled from two sub-tables. For model descriptor with `se_e2_a` type, the first sub-table takes the stride(parameter) as it's uniform stride, while the second sub-table takes 10 * stride as it's uniform stride; For model descriptor with `se_e3` type, the first sub-table takes 10 * stride as it's uniform stride, while the second sub-table takes 100 * stride as it's uniform stride. The range of the first table is automatically detected by deepmd-kit, while the second table ranges from the first table's upper boundary(upper) to the extrapolate(parameter) * upper. Finally, we added a check frequency parameter. It indicates how often the program checks for overflow(if the input environment matrix overflow the first or second table range) during the MD inference. @@ -79,3 +79,7 @@ Model compression, with little loss of accuracy, can greatly speed up MD inferen **Acceptable original model version** The model compression interface requires the version of deepmd-kit used in original model generation should be `2.0.0-alpha.0` or above. If one has a frozen 1.2 or 1.3 model, one can upgrade it through the `dp convert-from` interface.(eg: ```dp convert-from 1.2/1.3 -i old_frozen_model.pb -o new_frozen_model.pb```) + +**Acceptable descriptor type** + +Note only descriptors with `se_e2_a` or `se_e3` type are supported by the model compression feature. Hybrid mixed with above descriptors is also supported. 
diff --git a/doc/troubleshooting/model-compatability.md b/doc/troubleshooting/model-compatability.md index bc1b464047..2b7e46a4b7 100644 --- a/doc/troubleshooting/model-compatability.md +++ b/doc/troubleshooting/model-compatability.md @@ -6,11 +6,11 @@ DeePMD-kit guarantees that the codes with the same major and minor revisions are One can execute `dp convert-from` to convert an old model to a new one. -| Model version | v0.12 | v1.0 | v1.1 | v1.2 | v1.3 | v2.0 | -|:-:|:-----------:|:----------:|:----------:|:----------:|:----------:|:----------:| -| Compatibility | 😢 | 😢 | 😢 | 😊 | 😊 | 😄 | +| Model version | v0.12 | v1.0 | v1.1 | v1.2 | v1.3 | v2.0 | v2.1 | +|:-:|:-----------:|:----------:|:----------:|:----------:|:----------:|:----------:|:----------:| +| Compatibility | 😢 | 😢 | 😢 | 😊 | 😊 | 😄 | 😄 | **Legend**: - 😄: The model is compatible with the DeePMD-kit package. -- 😊: The model is incompatible with the DeePMD-kit package, but one can execute `dp convert-from` to convert an old model to v2.0. +- 😊: The model is incompatible with the DeePMD-kit package, but one can execute `dp convert-from` to convert an old model to v2.1. - 😢: The model is incompatible with the DeePMD-kit package, and there is no way to convert models. 
diff --git a/source/config/MODEL_VER b/source/config/MODEL_VER index d3827e75a5..9459d4ba2a 100644 --- a/source/config/MODEL_VER +++ b/source/config/MODEL_VER @@ -1 +1 @@ -1.0 +1.1 diff --git a/source/lib/include/tabulate.h b/source/lib/include/tabulate.h index ccc0e6fa65..841848380e 100644 --- a/source/lib/include/tabulate.h +++ b/source/lib/include/tabulate.h @@ -3,7 +3,7 @@ namespace deepmd{ template -void tabulate_fusion_cpu( +void tabulate_fusion_se_a_cpu( FPTYPE * out, const FPTYPE * table, const FPTYPE * table_info, @@ -14,7 +14,7 @@ void tabulate_fusion_cpu( const int last_layer_size); template -void tabulate_fusion_grad_cpu( +void tabulate_fusion_se_a_grad_cpu( FPTYPE * dy_dem_x, FPTYPE * dy_dem, const FPTYPE * table, @@ -27,7 +27,7 @@ void tabulate_fusion_grad_cpu( const int last_layer_size); template -void tabulate_fusion_grad_grad_cpu( +void tabulate_fusion_se_a_grad_grad_cpu( FPTYPE * dz_dy, const FPTYPE * table, const FPTYPE * table_info, @@ -39,9 +39,49 @@ void tabulate_fusion_grad_grad_cpu( const int nnei, const int last_layer_size); +template +void tabulate_fusion_se_t_cpu( + FPTYPE * out, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const int nloc, + const int nnei_i, + const int nnei_j, + const int last_layer_size); + +template +void tabulate_fusion_se_t_grad_cpu( + FPTYPE * dy_dem_x, + FPTYPE * dy_dem, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE * dy, + const int nloc, + const int nnei_i, + const int nnei_j, + const int last_layer_size); + +template +void tabulate_fusion_se_t_grad_grad_cpu( + FPTYPE * dz_dy, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE * dz_dy_dem_x, + const FPTYPE * dz_dy_dem, + const int nloc, + const int nnei_i, + const int nnei_j, + const int last_layer_size); + #if GOOGLE_CUDA template -void tabulate_fusion_gpu_cuda( +void 
tabulate_fusion_se_a_gpu_cuda( FPTYPE * out, const FPTYPE * table, const FPTYPE * table_info, @@ -52,7 +92,7 @@ void tabulate_fusion_gpu_cuda( const int last_layer_size); template -void tabulate_fusion_grad_gpu_cuda( +void tabulate_fusion_se_a_grad_gpu_cuda( FPTYPE * dy_dem_x, FPTYPE * dy_dem, const FPTYPE * table, @@ -65,7 +105,7 @@ void tabulate_fusion_grad_gpu_cuda( const int last_layer_size); template -void tabulate_fusion_grad_grad_gpu_cuda( +void tabulate_fusion_se_a_grad_grad_gpu_cuda( FPTYPE * dz_dy, const FPTYPE * table, const FPTYPE * table_info, @@ -76,11 +116,51 @@ void tabulate_fusion_grad_grad_gpu_cuda( const int nloc, const int nnei, const int last_layer_size); + +template +void tabulate_fusion_se_t_gpu_cuda( + FPTYPE * out, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const int nloc, + const int nnei_i, + const int nnei_j, + const int last_layer_size); + +template +void tabulate_fusion_se_t_grad_gpu_cuda( + FPTYPE * dy_dem_x, + FPTYPE * dy_dem, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE * dy, + const int nloc, + const int nnei_i, + const int nnei_j, + const int last_layer_size); + +template +void tabulate_fusion_se_t_grad_grad_gpu_cuda( + FPTYPE * dz_dy, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE * dz_dy_dem_x, + const FPTYPE * dz_dy_dem, + const int nloc, + const int nnei_i, + const int nnei_j, + const int last_layer_size); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM template -void tabulate_fusion_gpu_rocm( +void tabulate_fusion_se_a_gpu_rocm( FPTYPE * out, const FPTYPE * table, const FPTYPE * table_info, @@ -91,7 +171,7 @@ void tabulate_fusion_gpu_rocm( const int last_layer_size); template -void tabulate_fusion_grad_gpu_rocm( +void tabulate_fusion_se_a_grad_gpu_rocm( FPTYPE * dy_dem_x, FPTYPE * dy_dem, const FPTYPE * table, @@ -104,7 +184,7 @@ void 
tabulate_fusion_grad_gpu_rocm( const int last_layer_size); template -void tabulate_fusion_grad_grad_gpu_rocm( +void tabulate_fusion_se_a_grad_grad_gpu_rocm( FPTYPE * dz_dy, const FPTYPE * table, const FPTYPE * table_info, @@ -115,6 +195,46 @@ void tabulate_fusion_grad_grad_gpu_rocm( const int nloc, const int nnei, const int last_layer_size); + +template +void tabulate_fusion_se_t_gpu_rocm( + FPTYPE * out, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const int nloc, + const int nnei_i, + const int nnei_j, + const int last_layer_size); + +template +void tabulate_fusion_se_t_grad_gpu_rocm( + FPTYPE * dy_dem_x, + FPTYPE * dy_dem, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE * dy, + const int nloc, + const int nnei_i, + const int nnei_j, + const int last_layer_size); + +template +void tabulate_fusion_se_t_grad_grad_gpu_rocm( + FPTYPE * dz_dy, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE * dz_dy_dem_x, + const FPTYPE * dz_dy_dem, + const int nloc, + const int nnei_i, + const int nnei_j, + const int last_layer_size); #endif // TENSORFLOW_USE_ROCM } diff --git a/source/lib/src/cuda/tabulate.cu b/source/lib/src/cuda/tabulate.cu index 265a2baffe..47ae73577f 100644 --- a/source/lib/src/cuda/tabulate.cu +++ b/source/lib/src/cuda/tabulate.cu @@ -37,6 +37,42 @@ void locate_xx( } } +template +__forceinline__ __device__ +void locate_xx_se_t( + FPTYPE& xx, + int& table_idx, + const FPTYPE& lower, + const FPTYPE& upper, + const FPTYPE& min, + const FPTYPE& max, + const FPTYPE& stride0, + const FPTYPE& stride1) +{ + if (xx < min) { + table_idx = 0; + xx = 0; + } + else if (xx < lower) { + table_idx = (int)((xx - min) / stride1); + xx -= (table_idx * stride1 + min); + } + else if (xx < upper) { + int first_stride = int((lower - min) / stride1); + table_idx = first_stride + (int)((xx - lower) / 
stride0); + xx -= ((table_idx - first_stride) * stride0 + lower); + } + else if (xx < max) { + int first_stride = int((lower - min) / stride1) + int((upper - lower) / stride0); + table_idx = first_stride + (int)((xx - upper) / stride1); + xx -= ((table_idx - first_stride) * stride1 + upper); + } + else { + table_idx = int((lower - min) / stride1) + int((upper - lower) / stride0) + (int)((max - upper) / stride1) - 1; + xx = 0; + } +} + template __forceinline__ __device__ FPTYPE dot( @@ -60,7 +96,7 @@ template < typename FPTYPE, int MTILE, int KTILE> -__global__ void tabulate_fusion_fifth_order_polynomial( +__global__ void tabulate_fusion_se_a_fifth_order_polynomial( FPTYPE * out, const FPTYPE * table, const FPTYPE * em_x, @@ -111,7 +147,7 @@ template < typename FPTYPE, int MTILE, int KTILE> -__global__ void tabulate_fusion_grad_fifth_order_polynomial( +__global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial( FPTYPE * dy_dem_x, FPTYPE * dy_dem, const FPTYPE * table, @@ -191,7 +227,7 @@ template < typename FPTYPE, int MTILE, int KTILE> -__global__ void tabulate_fusion_grad_grad_fifth_order_polynomial( +__global__ void tabulate_fusion_se_a_grad_grad_fifth_order_polynomial( FPTYPE * dz_dy, const FPTYPE * table, const FPTYPE * em_x, @@ -247,9 +283,191 @@ __global__ void tabulate_fusion_grad_grad_fifth_order_polynomial( } } +template < + typename FPTYPE, + int MTILE, + int KTILE> +__global__ void tabulate_fusion_se_t_fifth_order_polynomial( + FPTYPE * out, + const FPTYPE * table, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE lower, + const FPTYPE upper, + const FPTYPE max, + const FPTYPE stride0, + const FPTYPE stride1, + const int nnei_i, + const int nnei_j, + const int last_layer_size) +{ + const int block_idx = blockIdx.x; // nloc + const int thread_idx = threadIdx.x; // last_layer_size + + FPTYPE sum = 0.f; + for (int ii = 0; ii < nnei_i; ii++) { + FPTYPE ago = __shfl_sync(0xffffffff, em_x[block_idx * nnei_i * nnei_j + ii * nnei_j + nnei_j - 1], 
0); + int breakpoint = nnei_j - 1; + bool unloop = false; + for (int jj = 0; jj < nnei_j; jj++) { + FPTYPE xx = em_x[block_idx * nnei_i * nnei_j + ii * nnei_j + jj]; + FPTYPE tmp = xx; + if (xx == ago) { + unloop = true; + breakpoint = jj - 1; + } + int table_idx = 0; + locate_xx_se_t(xx, table_idx, lower, upper, -max, max, stride0, stride1); + FPTYPE var[6]; + var[0] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 0]; + var[1] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 1]; + var[2] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 2]; + var[3] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 3]; + var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4]; + var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5]; + FPTYPE res = var[0] + (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * xx; + + sum += (nnei_j - breakpoint) * tmp * res; + if (unloop) break; + } + } + out[block_idx * last_layer_size + thread_idx] = sum; +} + +template < + typename FPTYPE, + int MTILE, + int KTILE> +__global__ void tabulate_fusion_se_t_grad_fifth_order_polynomial( + FPTYPE * dy_dem_x, + FPTYPE * dy_dem, + const FPTYPE * table, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE * dy, + const FPTYPE lower, + const FPTYPE upper, + const FPTYPE max, + const FPTYPE stride0, + const FPTYPE stride1, + const int nnei_i, + const int nnei_j, + const int last_layer_size) +{ + extern __shared__ int _data[]; + const int block_idx = blockIdx.x; // nloc + const int thread_idx = threadIdx.x; // KTILE * WARP_SIZE, usally 128 here~ + int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / WARP_SIZE, 0); + int lane_idx = threadIdx.x % WARP_SIZE; + FPTYPE * iteratorA = (FPTYPE *)&_data[0]; // dy + for (int ii = thread_idx; ii < last_layer_size; ii += blockDim.x) { + iteratorA[ii] = dy[block_idx * last_layer_size + ii]; + } + __syncthreads(); + + for (int ii = 0; ii < nnei_i; ii++) { + FPTYPE ago = 
__shfl_sync(0xffffffff, em_x[block_idx * nnei_i * nnei_j + ii * nnei_j + nnei_j - 1], 0); + bool unloop = false; + for (int jj = warp_idx; jj < nnei_j; jj += KTILE) { + FPTYPE xx = em_x[block_idx * nnei_i * nnei_j + ii * nnei_j + jj]; + FPTYPE tmp = xx; + if (ago == xx) { + unloop = true; + } + int table_idx = 0; + locate_xx_se_t(xx, table_idx, lower, upper, -max, max, stride0, stride1); + FPTYPE sum = 0.f; + FPTYPE Csub = 0.f; + for (int kk = lane_idx; kk < last_layer_size; kk += WARP_SIZE) { + FPTYPE var[6]; + // load iteratorB through table + var[0] = table[table_idx * last_layer_size * 6 + 6 * kk + 0]; + var[1] = table[table_idx * last_layer_size * 6 + 6 * kk + 1]; + var[2] = table[table_idx * last_layer_size * 6 + 6 * kk + 2]; + var[3] = table[table_idx * last_layer_size * 6 + 6 * kk + 3]; + var[4] = table[table_idx * last_layer_size * 6 + 6 * kk + 4]; + var[5] = table[table_idx * last_layer_size * 6 + 6 * kk + 5]; + FPTYPE res = var[0] + (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * xx; + + sum += iteratorA[kk] * res; + Csub += iteratorA[kk] * tmp * (var[1] + (2 * var[2] + (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) * xx); + } + __syncwarp(); + warp_reduce(sum); + warp_reduce(Csub); + if (lane_idx == 0) { + dy_dem [block_idx * nnei_i * nnei_j + ii * nnei_j + jj] = sum; + dy_dem_x[block_idx * nnei_i * nnei_j + ii * nnei_j + jj] = Csub; + } + if (unloop) break; + } + } +} + +template < + typename FPTYPE, + int MTILE, + int KTILE> +__global__ void tabulate_fusion_se_t_grad_grad_fifth_order_polynomial( + FPTYPE * dz_dy, + const FPTYPE * table, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE * dz_dy_dem_x, + const FPTYPE * dz_dy_dem, + const FPTYPE lower, + const FPTYPE upper, + const FPTYPE max, + const FPTYPE stride0, + const FPTYPE stride1, + const int nnei, + const int nnei_j, + const int last_layer_size) +{ + extern __shared__ int _data[]; + const int block_idx = blockIdx.x; // nloc + const int thread_idx = 
threadIdx.x; // last_layer_size + FPTYPE ago = __shfl_sync(0xffffffff, em_x[block_idx * nnei + nnei - 1], 0); + bool unloop = false; + int breakpoint = nnei - 1; + FPTYPE * iteratorC = (FPTYPE*) &_data[0]; + for (int kk = 0; kk < MTILE; kk++) + iteratorC[kk * last_layer_size + thread_idx] = 0.f; + __syncthreads(); + + for (int ii = 0; ii < nnei; ii++) { + FPTYPE var[6]; + FPTYPE xx = em_x[block_idx * nnei + ii]; + FPTYPE dz_xx = dz_dy_dem_x[block_idx * nnei + ii]; + if (xx == ago) { + unloop = true; + breakpoint = ii; + } + int table_idx = 0; + locate_xx_se_t(xx, table_idx, lower, upper, -max, max, stride0, stride1); + var[0] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 0]; + var[1] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 1]; + var[2] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 2]; + var[3] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 3]; + var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4]; + var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5]; + FPTYPE res = var[0] + (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * xx; + FPTYPE res_grad = var[1] + (2 * var[2] + (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) * xx; + + for (int kk = 0; kk < MTILE; kk++) { + int em_index = block_idx * nnei * MTILE + ii * MTILE + kk; + iteratorC[kk * last_layer_size + thread_idx] += (nnei - breakpoint) * (em[em_index] * res_grad * dz_xx + dz_dy_dem[em_index] * res); + } + if (unloop) break; + } + for (int ii = 0; ii < MTILE; ii++) { + dz_dy[block_idx * MTILE * last_layer_size + ii * last_layer_size + thread_idx] = iteratorC[ii * last_layer_size + thread_idx]; + } +} + namespace deepmd { template -void tabulate_fusion_gpu_cuda( +void tabulate_fusion_se_a_gpu_cuda( FPTYPE * out, const FPTYPE * table, const FPTYPE * table_info, @@ -260,7 +478,7 @@ void tabulate_fusion_gpu_cuda( const int last_layer_size) { if (nloc <= 0) {return;} - 
tabulate_fusion_fifth_order_polynomial <<>>( + tabulate_fusion_se_a_fifth_order_polynomial <<>>( out, table, em_x, em, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); DPErrcheck(cudaGetLastError()); @@ -268,7 +486,7 @@ void tabulate_fusion_gpu_cuda( } template -void tabulate_fusion_grad_gpu_cuda( +void tabulate_fusion_se_a_grad_gpu_cuda( FPTYPE * dy_dem_x, FPTYPE * dy_dem, const FPTYPE * table, @@ -288,7 +506,7 @@ void tabulate_fusion_grad_gpu_cuda( dy_dem, 0.0, sizeof(FPTYPE) * nloc * nnei * 4)); - tabulate_fusion_grad_fifth_order_polynomial <<>>( + tabulate_fusion_se_a_grad_fifth_order_polynomial <<>>( dy_dem_x, dy_dem, table, em_x, em, dy, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); DPErrcheck(cudaGetLastError()); @@ -296,7 +514,7 @@ void tabulate_fusion_grad_gpu_cuda( } template -void tabulate_fusion_grad_grad_gpu_cuda( +void tabulate_fusion_se_a_grad_grad_gpu_cuda( FPTYPE * dz_dy, const FPTYPE * table, const FPTYPE * table_info, @@ -312,17 +530,98 @@ void tabulate_fusion_grad_grad_gpu_cuda( DPErrcheck(cudaMemset( dz_dy, 0.0, sizeof(FPTYPE) * nloc * 4 * last_layer_size)); - tabulate_fusion_grad_grad_fifth_order_polynomial <<>>( + tabulate_fusion_se_a_grad_grad_fifth_order_polynomial <<>>( dz_dy, table, em_x, em, dz_dy_dem_x, dz_dy_dem, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); } -template void tabulate_fusion_gpu_cuda(float * out, const float * table, const float * table_info, const float * em_x, const float * em, const int nloc, const int nnei, const int last_layer_size); -template void tabulate_fusion_gpu_cuda(double * out, const double * table, const double * table_info, const double * em_x, const double * em, const int nloc, const int nnei, const int last_layer_size); -template void tabulate_fusion_grad_gpu_cuda (float * 
dy_dem_x, float * dy_dem, const float * table, const float * table_info, const float * em_x, const float * em, const float * dy, const int nloc, const int nnei, const int last_layer_size); -template void tabulate_fusion_grad_gpu_cuda (double * dy_dem_x, double * dy_dem, const double * table, const double * table_info, const double * em_x, const double * em, const double * dy, const int nloc, const int nnei, const int last_layer_size); -template void tabulate_fusion_grad_grad_gpu_cuda (float * dz_dy, const float * table, const float * table_info, const float * em_x, const float * em, const float * dz_dy_dem_x, const float * dz_dy_dem, const int nloc, const int nnei, const int last_layer_size); -template void tabulate_fusion_grad_grad_gpu_cuda (double * dz_dy, const double * table, const double * table_info, const double * em_x, const double * em, const double * dz_dy_dem_x, const double * dz_dy_dem, const int nloc, const int nnei, const int last_layer_size); +template +void tabulate_fusion_se_t_gpu_cuda( + FPTYPE * out, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const int nloc, + const int nnei_i, + const int nnei_j, + const int last_layer_size) +{ + if (nloc <= 0) {return;} + tabulate_fusion_se_t_fifth_order_polynomial <<>>( + out, + table, em_x, em, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei_i, nnei_j, last_layer_size); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); +} + +template +void tabulate_fusion_se_t_grad_gpu_cuda( + FPTYPE * dy_dem_x, + FPTYPE * dy_dem, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE * dy, + const int nloc, + const int nnei_i, + const int nnei_j, + const int last_layer_size) +{ + if (nloc <= 0) {return;} + DPErrcheck(cudaMemset( + dy_dem_x, + 0.0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j)); + DPErrcheck(cudaMemset( + dy_dem, + 0.0, sizeof(FPTYPE) * nloc * 
nnei_i * nnei_j)); + + tabulate_fusion_se_t_grad_fifth_order_polynomial <<>>( + dy_dem_x, dy_dem, + table, em_x, em, dy, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei_i, nnei_j, last_layer_size); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); +} + +template +void tabulate_fusion_se_t_grad_grad_gpu_cuda( + FPTYPE * dz_dy, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE * dz_dy_dem_x, + const FPTYPE * dz_dy_dem, + const int nloc, + const int nnei_i, + const int nnei_j, + const int last_layer_size) +{ + if (nloc <= 0) {return;} + DPErrcheck(cudaMemset( + dz_dy, + 0.0, sizeof(FPTYPE) * nloc * last_layer_size)); + tabulate_fusion_se_t_grad_grad_fifth_order_polynomial <<>>( + dz_dy, + table, em_x, em, dz_dy_dem_x, dz_dy_dem, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei_i, nnei_j, last_layer_size); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); +} + +template void tabulate_fusion_se_a_gpu_cuda(float * out, const float * table, const float * table_info, const float * em_x, const float * em, const int nloc, const int nnei, const int last_layer_size); +template void tabulate_fusion_se_a_gpu_cuda(double * out, const double * table, const double * table_info, const double * em_x, const double * em, const int nloc, const int nnei, const int last_layer_size); +template void tabulate_fusion_se_a_grad_gpu_cuda (float * dy_dem_x, float * dy_dem, const float * table, const float * table_info, const float * em_x, const float * em, const float * dy, const int nloc, const int nnei, const int last_layer_size); +template void tabulate_fusion_se_a_grad_gpu_cuda (double * dy_dem_x, double * dy_dem, const double * table, const double * table_info, const double * em_x, const double * em, const double * dy, const int nloc, const int nnei, const int last_layer_size); +template void 
tabulate_fusion_se_a_grad_grad_gpu_cuda (float * dz_dy, const float * table, const float * table_info, const float * em_x, const float * em, const float * dz_dy_dem_x, const float * dz_dy_dem, const int nloc, const int nnei, const int last_layer_size); +template void tabulate_fusion_se_a_grad_grad_gpu_cuda (double * dz_dy, const double * table, const double * table_info, const double * em_x, const double * em, const double * dz_dy_dem_x, const double * dz_dy_dem, const int nloc, const int nnei, const int last_layer_size); + +template void tabulate_fusion_se_t_gpu_cuda(float * out, const float * table, const float * table_info, const float * em_x, const float * em, const int nloc, const int nnei_i, const int nnei_j, const int last_layer_size); +template void tabulate_fusion_se_t_gpu_cuda(double * out, const double * table, const double * table_info, const double * em_x, const double * em, const int nloc, const int nnei_i, const int nnei_j, const int last_layer_size); +template void tabulate_fusion_se_t_grad_gpu_cuda (float * dy_dem_x, float * dy_dem, const float * table, const float * table_info, const float * em_x, const float * em, const float * dy, const int nloc, const int nnei_i, const int nnei_j, const int last_layer_size); +template void tabulate_fusion_se_t_grad_gpu_cuda (double * dy_dem_x, double * dy_dem, const double * table, const double * table_info, const double * em_x, const double * em, const double * dy, const int nloc, const int nnei_i, const int nnei_j, const int last_layer_size); +template void tabulate_fusion_se_t_grad_grad_gpu_cuda (float * dz_dy, const float * table, const float * table_info, const float * em_x, const float * em, const float * dz_dy_dem_x, const float * dz_dy_dem, const int nloc, const int nnei_i, const int nnei_j, const int last_layer_size); +template void tabulate_fusion_se_t_grad_grad_gpu_cuda (double * dz_dy, const double * table, const double * table_info, const double * em_x, const double * em, const double * 
dz_dy_dem_x, const double * dz_dy_dem, const int nloc, const int nnei_i, const int nnei_j, const int last_layer_size); } diff --git a/source/lib/src/rocm/tabulate.hip.cu b/source/lib/src/rocm/tabulate.hip.cu index 050bf1658a..055d52d7b8 100644 --- a/source/lib/src/rocm/tabulate.hip.cu +++ b/source/lib/src/rocm/tabulate.hip.cu @@ -37,6 +37,42 @@ void locate_xx( } } +template +__forceinline__ __device__ +void locate_xx_se_t( + FPTYPE& xx, + int& table_idx, + const FPTYPE& lower, + const FPTYPE& upper, + const FPTYPE& min, + const FPTYPE& max, + const FPTYPE& stride0, + const FPTYPE& stride1) +{ + if (xx < min) { + table_idx = 0; + xx = 0; + } + else if (xx < lower) { + table_idx = (int)((xx - min) / stride1); + xx -= (table_idx * stride1 + min); + } + else if (xx < upper) { + int first_stride = int((lower - min) / stride1); + table_idx = first_stride + (int)((xx - lower) / stride0); + xx -= ((table_idx - first_stride) * stride0 + lower); + } + else if (xx < max) { + int first_stride = int((lower - min) / stride1) + int((upper - lower) / stride0); + table_idx = first_stride + (int)((xx - upper) / stride1); + xx -= ((table_idx - first_stride) * stride1 + upper); + } + else { + table_idx = int((lower - min) / stride1) + int((upper - lower) / stride0) + (int)((max - upper) / stride1) - 1; + xx = 0; + } +} + template __forceinline__ __device__ FPTYPE dot( @@ -60,7 +96,7 @@ template < typename FPTYPE, int MTILE, int KTILE> -__global__ void tabulate_fusion_fifth_order_polynomial( +__global__ void tabulate_fusion_se_a_fifth_order_polynomial( FPTYPE * out, const FPTYPE * table, const FPTYPE * em_x, @@ -115,7 +151,7 @@ template < typename FPTYPE, int MTILE, int KTILE> -__global__ void tabulate_fusion_grad_fifth_order_polynomial( +__global__ void tabulate_fusion_se_a_grad_fifth_order_polynomial( FPTYPE * dy_dem_x, FPTYPE * dy_dem, const FPTYPE * table, @@ -196,7 +232,7 @@ template < typename FPTYPE, int MTILE, int KTILE> -__global__ void 
tabulate_fusion_grad_grad_fifth_order_polynomial( +__global__ void tabulate_fusion_se_a_grad_grad_fifth_order_polynomial( FPTYPE * dz_dy, const FPTYPE * table, const FPTYPE * em_x, @@ -252,9 +288,192 @@ __global__ void tabulate_fusion_grad_grad_fifth_order_polynomial( } } +template < + typename FPTYPE, + int MTILE, + int KTILE> +__global__ void tabulate_fusion_se_t_fifth_order_polynomial( + FPTYPE * out, + const FPTYPE * table, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE lower, + const FPTYPE upper, + const FPTYPE max, + const FPTYPE stride0, + const FPTYPE stride1, + const int nnei_i, + const int nnei_j, + const int last_layer_size) +{ + HIP_DYNAMIC_SHARED( int, _data) + const int block_idx = blockIdx.x; // nloc + const int thread_idx = threadIdx.x; // last_layer_size + + FPTYPE sum = 0.f; + for (int ii = 0; ii < nnei_i; ii++) { + FPTYPE ago = __shfl(em_x[block_idx * nnei_i * nnei_j + ii * nnei_j + nnei_j - 1], 0); + int breakpoint = nnei_j - 1; + bool unloop = false; + for (int jj = 0; jj < nnei_j; jj++) { + FPTYPE xx = em_x[block_idx * nnei_i * nnei_j + ii * nnei_j + jj]; + FPTYPE tmp = xx; + if (xx == ago) { + unloop = true; + breakpoint = jj - 1; + } + int table_idx = 0; + locate_xx_se_t(xx, table_idx, lower, upper, -max, max, stride0, stride1); + FPTYPE var[6]; + var[0] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 0]; + var[1] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 1]; + var[2] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 2]; + var[3] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 3]; + var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4]; + var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5]; + FPTYPE res = var[0] + (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * xx; + + sum += (nnei_j - breakpoint) * tmp * res; + if (unloop) break; + } + } + out[block_idx * last_layer_size + thread_idx] = sum; +} + +template < + typename 
FPTYPE, + int MTILE, + int KTILE> +__global__ void tabulate_fusion_se_t_grad_fifth_order_polynomial( + FPTYPE * dy_dem_x, + FPTYPE * dy_dem, + const FPTYPE * table, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE * dy, + const FPTYPE lower, + const FPTYPE upper, + const FPTYPE max, + const FPTYPE stride0, + const FPTYPE stride1, + const int nnei_i, + const int nnei_j, + const int last_layer_size) +{ + HIP_DYNAMIC_SHARED( int, _data) + const int block_idx = blockIdx.x; // nloc + const int thread_idx = threadIdx.x; // KTILE * WARP_SIZE, usally 128 here~ + int warp_idx = __shfl(threadIdx.x / 64, 0); + int lane_idx = threadIdx.x % 64; + FPTYPE * iteratorA = (FPTYPE *)&_data[0]; // dy + for (int ii = thread_idx; ii < last_layer_size; ii += blockDim.x) { + iteratorA[ii] = dy[block_idx * last_layer_size + ii]; + } + __syncthreads(); + + for (int ii = 0; ii < nnei_i; ii++) { + FPTYPE ago = __shfl(em_x[block_idx * nnei_i * nnei_j + ii * nnei_j + nnei_j - 1], 0); + bool unloop = false; + for (int jj = warp_idx; jj < nnei_j; jj += KTILE) { + FPTYPE xx = em_x[block_idx * nnei_i * nnei_j + ii * nnei_j + jj]; + FPTYPE tmp = xx; + if (ago == xx) { + unloop = true; + } + int table_idx = 0; + locate_xx_se_t(xx, table_idx, lower, upper, -max, max, stride0, stride1); + FPTYPE sum = 0.f; + FPTYPE Csub = 0.f; + for (int kk = lane_idx; kk < last_layer_size; kk += WARP_SIZE) { + FPTYPE var[6]; + // load iteratorB through table + var[0] = table[table_idx * last_layer_size * 6 + 6 * kk + 0]; + var[1] = table[table_idx * last_layer_size * 6 + 6 * kk + 1]; + var[2] = table[table_idx * last_layer_size * 6 + 6 * kk + 2]; + var[3] = table[table_idx * last_layer_size * 6 + 6 * kk + 3]; + var[4] = table[table_idx * last_layer_size * 6 + 6 * kk + 4]; + var[5] = table[table_idx * last_layer_size * 6 + 6 * kk + 5]; + FPTYPE res = var[0] + (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * xx; + + sum += iteratorA[kk] * res; + Csub += iteratorA[kk] * tmp * (var[1] + 
(2 * var[2] + (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) * xx); + } + __syncthreads(); + warp_reduce(sum); + warp_reduce(Csub); + if (lane_idx == 0) { + dy_dem [block_idx * nnei_i * nnei_j + ii * nnei_j + jj] = sum; + dy_dem_x[block_idx * nnei_i * nnei_j + ii * nnei_j + jj] = Csub; + } + if (unloop) break; + } + } +} + +template < + typename FPTYPE, + int MTILE, + int KTILE> +__global__ void tabulate_fusion_se_t_grad_grad_fifth_order_polynomial( + FPTYPE * dz_dy, + const FPTYPE * table, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE * dz_dy_dem_x, + const FPTYPE * dz_dy_dem, + const FPTYPE lower, + const FPTYPE upper, + const FPTYPE max, + const FPTYPE stride0, + const FPTYPE stride1, + const int nnei, + const int nnei_j, + const int last_layer_size) +{ + extern __shared__ int _data[]; + const int block_idx = blockIdx.x; // nloc + const int thread_idx = threadIdx.x; // last_layer_size + FPTYPE ago = __shfl( em_x[block_idx * nnei + nnei - 1], 0); + bool unloop = false; + int breakpoint = nnei - 1; + FPTYPE * iteratorC = (FPTYPE*) &_data[0]; + for (int kk = 0; kk < MTILE; kk++) + iteratorC[kk * last_layer_size + thread_idx] = 0.f; + __syncthreads(); + + for (int ii = 0; ii < nnei; ii++) { + FPTYPE var[6]; + FPTYPE xx = em_x[block_idx * nnei + ii]; + FPTYPE dz_xx = dz_dy_dem_x[block_idx * nnei + ii]; + if (xx == ago) { + unloop = true; + breakpoint = ii; + } + int table_idx = 0; + locate_xx(xx, table_idx, lower, upper, max, stride0, stride1); + var[0] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 0]; + var[1] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 1]; + var[2] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 2]; + var[3] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 3]; + var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4]; + var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5]; + FPTYPE res = var[0] + (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) 
* xx) * xx) * xx) * xx; + FPTYPE res_grad = var[1] + (2 * var[2] + (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) * xx; + + for (int kk = 0; kk < MTILE; kk++) { + int em_index = block_idx * nnei * MTILE + ii * MTILE + kk; + iteratorC[kk * last_layer_size + thread_idx] += (nnei - breakpoint) * (em[em_index] * res_grad * dz_xx + dz_dy_dem[em_index] * res); + } + if (unloop) break; + } + for (int ii = 0; ii < MTILE; ii++) { + dz_dy[block_idx * MTILE * last_layer_size + ii * last_layer_size + thread_idx] = iteratorC[ii * last_layer_size + thread_idx]; + } +} + namespace deepmd { template -void tabulate_fusion_gpu_rocm( +void tabulate_fusion_se_a_gpu_rocm( FPTYPE * out, const FPTYPE * table, const FPTYPE * table_info, @@ -265,7 +484,7 @@ void tabulate_fusion_gpu_rocm( const int last_layer_size) { if(nloc <= 0){return;} - hipLaunchKernelGGL(HIP_KERNEL_NAME(tabulate_fusion_fifth_order_polynomial), nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(tabulate_fusion_se_a_fifth_order_polynomial), nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size, 0, out, table, em_x, em, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); DPErrcheck(hipGetLastError()); @@ -273,7 +492,7 @@ void tabulate_fusion_gpu_rocm( } template -void tabulate_fusion_grad_gpu_rocm( +void tabulate_fusion_se_a_grad_gpu_rocm( FPTYPE * dy_dem_x, FPTYPE * dy_dem, const FPTYPE * table, @@ -293,7 +512,7 @@ void tabulate_fusion_grad_gpu_rocm( dy_dem, 0.0, sizeof(FPTYPE) * nloc * nnei * 4)); - hipLaunchKernelGGL(HIP_KERNEL_NAME(tabulate_fusion_grad_fifth_order_polynomial), nloc, KK * WARP_SIZE, sizeof(FPTYPE) * MM * last_layer_size, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(tabulate_fusion_se_a_grad_fifth_order_polynomial), nloc, KK * WARP_SIZE, sizeof(FPTYPE) * MM * last_layer_size, 0, dy_dem_x, dy_dem, table, em_x, em, dy, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], 
nnei, last_layer_size); DPErrcheck(hipGetLastError()); @@ -301,7 +520,7 @@ void tabulate_fusion_grad_gpu_rocm( } template -void tabulate_fusion_grad_grad_gpu_rocm( +void tabulate_fusion_se_a_grad_grad_gpu_rocm( FPTYPE * dz_dy, const FPTYPE * table, const FPTYPE * table_info, @@ -317,17 +536,98 @@ void tabulate_fusion_grad_grad_gpu_rocm( DPErrcheck(hipMemset( dz_dy, 0.0, sizeof(FPTYPE) * nloc * 4 * last_layer_size)); - hipLaunchKernelGGL(HIP_KERNEL_NAME(tabulate_fusion_grad_grad_fifth_order_polynomial), nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size, 0, + hipLaunchKernelGGL(HIP_KERNEL_NAME(tabulate_fusion_se_a_grad_grad_fifth_order_polynomial), nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size, 0, dz_dy, table, em_x, em, dz_dy_dem_x, dz_dy_dem, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); DPErrcheck(hipGetLastError()); DPErrcheck(hipDeviceSynchronize()); } -template void tabulate_fusion_gpu_rocm(float * out, const float * table, const float * table_info, const float * em_x, const float * em, const int nloc, const int nnei, const int last_layer_size); -template void tabulate_fusion_gpu_rocm(double * out, const double * table, const double * table_info, const double * em_x, const double * em, const int nloc, const int nnei, const int last_layer_size); -template void tabulate_fusion_grad_gpu_rocm (float * dy_dem_x, float * dy_dem, const float * table, const float * table_info, const float * em_x, const float * em, const float * dy, const int nloc, const int nnei, const int last_layer_size); -template void tabulate_fusion_grad_gpu_rocm (double * dy_dem_x, double * dy_dem, const double * table, const double * table_info, const double * em_x, const double * em, const double * dy, const int nloc, const int nnei, const int last_layer_size); -template void tabulate_fusion_grad_grad_gpu_rocm (float * dz_dy, const float * table, const float * table_info, const float * em_x, const float * em, const 
float * dz_dy_dem_x, const float * dz_dy_dem, const int nloc, const int nnei, const int last_layer_size); -template void tabulate_fusion_grad_grad_gpu_rocm (double * dz_dy, const double * table, const double * table_info, const double * em_x, const double * em, const double * dz_dy_dem_x, const double * dz_dy_dem, const int nloc, const int nnei, const int last_layer_size); +template +void tabulate_fusion_se_t_gpu_rocm( + FPTYPE * out, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const int nloc, + const int nnei_i, + const int nnei_j, + const int last_layer_size) +{ + if(nloc <= 0){return;} + hipLaunchKernelGGL(HIP_KERNEL_NAME(tabulate_fusion_se_t_fifth_order_polynomial), nloc, last_layer_size, 0, 0, + out, + table, em_x, em, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei_i, nnei_j, last_layer_size); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); +} + +template +void tabulate_fusion_se_t_grad_gpu_rocm( + FPTYPE * dy_dem_x, + FPTYPE * dy_dem, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE * dy, + const int nloc, + const int nnei_i, + const int nnei_j, + const int last_layer_size) +{ + if(nloc <= 0) {return;} + DPErrcheck(hipMemset( + dy_dem_x, + 0.0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j)); + DPErrcheck(hipMemset( + dy_dem, + 0.0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j)); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(tabulate_fusion_se_t_grad_fifth_order_polynomial), nloc, KK * WARP_SIZE, sizeof(FPTYPE) * last_layer_size, 0, + dy_dem_x, dy_dem, + table, em_x, em, dy, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei_i, nnei_j, last_layer_size); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); +} + +template +void tabulate_fusion_se_t_grad_grad_gpu_rocm( + FPTYPE * dz_dy, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + 
const FPTYPE * em, + const FPTYPE * dz_dy_dem_x, + const FPTYPE * dz_dy_dem, + const int nloc, + const int nnei_i, + const int nnei_j, + const int last_layer_size) +{ + if(nloc <= 0) {return;} + DPErrcheck(hipMemset( + dz_dy, + 0.0, sizeof(FPTYPE) * nloc * last_layer_size)); + hipLaunchKernelGGL(HIP_KERNEL_NAME(tabulate_fusion_se_t_grad_grad_fifth_order_polynomial), nloc, last_layer_size, sizeof(FPTYPE) * last_layer_size, 0, + dz_dy, + table, em_x, em, dz_dy_dem_x, dz_dy_dem, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei_i, nnei_j, last_layer_size); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); +} + +template void tabulate_fusion_se_a_gpu_rocm(float * out, const float * table, const float * table_info, const float * em_x, const float * em, const int nloc, const int nnei, const int last_layer_size); +template void tabulate_fusion_se_a_gpu_rocm(double * out, const double * table, const double * table_info, const double * em_x, const double * em, const int nloc, const int nnei, const int last_layer_size); +template void tabulate_fusion_se_a_grad_gpu_rocm (float * dy_dem_x, float * dy_dem, const float * table, const float * table_info, const float * em_x, const float * em, const float * dy, const int nloc, const int nnei, const int last_layer_size); +template void tabulate_fusion_se_a_grad_gpu_rocm (double * dy_dem_x, double * dy_dem, const double * table, const double * table_info, const double * em_x, const double * em, const double * dy, const int nloc, const int nnei, const int last_layer_size); +template void tabulate_fusion_se_a_grad_grad_gpu_rocm (float * dz_dy, const float * table, const float * table_info, const float * em_x, const float * em, const float * dz_dy_dem_x, const float * dz_dy_dem, const int nloc, const int nnei, const int last_layer_size); +template void tabulate_fusion_se_a_grad_grad_gpu_rocm (double * dz_dy, const double * table, const double * table_info, const double * em_x, const 
double * em, const double * dz_dy_dem_x, const double * dz_dy_dem, const int nloc, const int nnei, const int last_layer_size); + +template void tabulate_fusion_se_t_gpu_rocm(float * out, const float * table, const float * table_info, const float * em_x, const float * em, const int nloc, const int nnei_i, const int nnei_j, const int last_layer_size); +template void tabulate_fusion_se_t_gpu_rocm(double * out, const double * table, const double * table_info, const double * em_x, const double * em, const int nloc, const int nnei_i, const int nnei_j, const int last_layer_size); +template void tabulate_fusion_se_t_grad_gpu_rocm (float * dy_dem_x, float * dy_dem, const float * table, const float * table_info, const float * em_x, const float * em, const float * dy, const int nloc, const int nnei_i, const int nnei_j, const int last_layer_size); +template void tabulate_fusion_se_t_grad_gpu_rocm (double * dy_dem_x, double * dy_dem, const double * table, const double * table_info, const double * em_x, const double * em, const double * dy, const int nloc, const int nnei_i, const int nnei_j, const int last_layer_size); +template void tabulate_fusion_se_t_grad_grad_gpu_rocm (float * dz_dy, const float * table, const float * table_info, const float * em_x, const float * em, const float * dz_dy_dem_x, const float * dz_dy_dem, const int nloc, const int nnei_i, const int nnei_j, const int last_layer_size); +template void tabulate_fusion_se_t_grad_grad_gpu_rocm (double * dz_dy, const double * table, const double * table_info, const double * em_x, const double * em, const double * dz_dy_dem_x, const double * dz_dy_dem, const int nloc, const int nnei_i, const int nnei_j, const int last_layer_size); } diff --git a/source/lib/src/tabulate.cc b/source/lib/src/tabulate.cc index 385c68523b..ffacb96fdb 100644 --- a/source/lib/src/tabulate.cc +++ b/source/lib/src/tabulate.cc @@ -42,6 +42,42 @@ inline void locate_xx( } } + +template +inline void locate_xx_se_t( + const FPTYPE& lower, + const 
FPTYPE& upper, + const FPTYPE& min, + const FPTYPE& max, + const FPTYPE& stride0, + const FPTYPE& stride1, + FPTYPE& xx, + int& table_idx) +{ + if (xx < min) { + table_idx = 0; + xx = 0; + } + else if (xx < lower) { + table_idx = (int)((xx - min) / stride1); + xx -= (table_idx * stride1 + min); + } + else if (xx < upper) { + int first_stride = int((lower - min) / stride1); + table_idx = first_stride + (int)((xx - lower) / stride0); + xx -= ((table_idx - first_stride) * stride0 + lower); + } + else if (xx < max) { + int first_stride = int((lower - min) / stride1) + int((upper - lower) / stride0); + table_idx = first_stride + (int)((xx - upper) / stride1); + xx -= ((table_idx - first_stride) * stride1 + upper); + } + else { + table_idx = int((lower - min) / stride1) + int((upper - lower) / stride0) + (int)((max - upper) / stride1) - 1; + xx = 0; + } +} + template inline FPTYPE dot( FPTYPE a[4], @@ -51,7 +87,7 @@ inline FPTYPE dot( } template -void deepmd::tabulate_fusion_cpu( +void deepmd::tabulate_fusion_se_a_cpu( FPTYPE * out, const FPTYPE * table, const FPTYPE * table_info, @@ -112,7 +148,7 @@ void deepmd::tabulate_fusion_cpu( } template -void deepmd::tabulate_fusion_grad_cpu( +void deepmd::tabulate_fusion_se_a_grad_cpu( FPTYPE * dy_dem_x, FPTYPE * dy_dem, const FPTYPE * table, @@ -187,7 +223,7 @@ void deepmd::tabulate_fusion_grad_cpu( } template -void deepmd::tabulate_fusion_grad_grad_cpu( +void deepmd::tabulate_fusion_se_a_grad_grad_cpu( FPTYPE * dz_dy, const FPTYPE * table, const FPTYPE * table_info, @@ -256,9 +292,207 @@ void deepmd::tabulate_fusion_grad_grad_cpu( } } -template void deepmd::tabulate_fusion_cpu(float * out, const float * table, const float * table_info, const float * em_x, const float * em, const int nloc, const int nnei, const int last_layer_size); -template void deepmd::tabulate_fusion_cpu(double * out, const double * table, const double * table_info, const double * em_x, const double * em, const int nloc, const int nnei, const int 
last_layer_size); -template void deepmd::tabulate_fusion_grad_cpu (float * dy_dem_x, float * dy_dem, const float * table, const float * table_info, const float * em_x, const float * em, const float * dy, const int nloc, const int nnei, const int last_layer_size); -template void deepmd::tabulate_fusion_grad_cpu (double * dy_dem_x, double * dy_dem, const double * table, const double * table_info, const double * em_x, const double * em, const double * dy, const int nloc, const int nnei, const int last_layer_size); -template void deepmd::tabulate_fusion_grad_grad_cpu(float * dz_dy, const float * table, const float * table_info, const float * em_x, const float * em, const float * dz_dy_dem_x, const float * dz_dy_dem, const int nloc, const int nnei, const int last_layer_size); -template void deepmd::tabulate_fusion_grad_grad_cpu(double * dz_dy, const double * table, const double * table_info, const double * em_x, const double * em, const double * dz_dy_dem_x, const double * dz_dy_dem, const int nloc, const int nnei, const int last_layer_size); +template +void deepmd::tabulate_fusion_se_t_cpu( + FPTYPE * out, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const int nloc, + const int nnei_i, + const int nnei_j, + const int last_layer_size) +{ + memset(out, 0.0, sizeof(FPTYPE) * nloc * last_layer_size); + const FPTYPE lower = table_info[0]; + const FPTYPE upper = table_info[1]; + const FPTYPE _max = table_info[2]; + const FPTYPE stride0 = table_info[3]; + const FPTYPE stride1 = table_info[4]; + // for every atom, execute a small manual gemm ~ + // FPTYPE * res = new FPTYPE[4 * last_layer_size]; + #pragma omp parallel for + for (int ii = 0; ii < nloc; ii++) { + for (int jj = 0; jj < nnei_i; jj++) { + FPTYPE ago = em_x[ii * nnei_i * nnei_j + jj * nnei_j + nnei_j - 1]; + bool unloop = false; + for (int kk = 0; kk < nnei_j; kk++) { + FPTYPE xx = em_x[ii * nnei_i * nnei_j + jj * nnei_j + kk]; + FPTYPE ll = xx; + if (ago == xx) 
{ + unloop = true; + } + int table_idx = 0; + locate_xx_se_t(lower, upper, -_max, _max, stride0, stride1, xx, table_idx); + for (int mm = 0; mm < last_layer_size; mm++) { + FPTYPE a0 = table[table_idx * last_layer_size * 6 + 6 * mm + 0]; + FPTYPE a1 = table[table_idx * last_layer_size * 6 + 6 * mm + 1]; + FPTYPE a2 = table[table_idx * last_layer_size * 6 + 6 * mm + 2]; + FPTYPE a3 = table[table_idx * last_layer_size * 6 + 6 * mm + 3]; + FPTYPE a4 = table[table_idx * last_layer_size * 6 + 6 * mm + 4]; + FPTYPE a5 = table[table_idx * last_layer_size * 6 + 6 * mm + 5]; + FPTYPE var = a0 + (a1 + (a2 + (a3 + (a4 + a5 * xx) * xx) * xx) * xx) * xx; + if (unloop) { + out[ii * last_layer_size + mm] += (nnei_j - kk) * var * ll; + } + else { + out[ii * last_layer_size + mm] += var * ll; + } + } + if (unloop) break; + } + } + } +} + +template +void deepmd::tabulate_fusion_se_t_grad_cpu( + FPTYPE * dy_dem_x, + FPTYPE * dy_dem, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE * dy, + const int nloc, + const int nnei_i, + const int nnei_j, + const int last_layer_size) +{ + memset(dy_dem_x, 0.0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j); + memset(dy_dem, 0.0, sizeof(FPTYPE) * nloc * nnei_i * nnei_j); + FPTYPE const lower = table_info[0]; + FPTYPE const upper = table_info[1]; + FPTYPE const _max = table_info[2]; + FPTYPE const stride0 = table_info[3]; + FPTYPE const stride1 = table_info[4]; + // for every atom, execute a small gemm~ + // FPTYPE * res = new FPTYPE[4 * last_layer_size]; + #pragma omp parallel for + for (int ii = 0; ii < nloc; ii++) { + FPTYPE ll = 0; + FPTYPE rr = 0; + for (int jj = 0; jj < nnei_i; jj++) { + FPTYPE ago = em_x[ii * nnei_i * nnei_j + jj * nnei_j + nnei_j - 1]; + bool unloop = false; + for (int kk = 0; kk < nnei_j; kk++) { + // construct the dy/dx + FPTYPE xx = em_x[ii * nnei_i * nnei_j + jj * nnei_j + kk]; + ll = xx; + if (ago == xx) { + unloop = true; + } + int table_idx = 0; + 
locate_xx_se_t(lower, upper, -_max, _max, stride0, stride1, xx, table_idx); + FPTYPE grad = 0.0; + for (int mm = 0; mm < last_layer_size; mm++) { + rr = dy[ii * last_layer_size + mm]; + FPTYPE a0 = table[table_idx * last_layer_size * 6 + 6 * mm + 0]; + FPTYPE a1 = table[table_idx * last_layer_size * 6 + 6 * mm + 1]; + FPTYPE a2 = table[table_idx * last_layer_size * 6 + 6 * mm + 2]; + FPTYPE a3 = table[table_idx * last_layer_size * 6 + 6 * mm + 3]; + FPTYPE a4 = table[table_idx * last_layer_size * 6 + 6 * mm + 4]; + FPTYPE a5 = table[table_idx * last_layer_size * 6 + 6 * mm + 5]; + FPTYPE res = a0 + (a1 + (a2 + (a3 + (a4 + a5 * xx) * xx) * xx) * xx) * xx; + + if (unloop) { + grad += (a1 + (2 * a2 + (3 * a3 + (4 * a4 + 5 * a5 * xx) * xx) * xx) * xx) * ll * rr * (nnei_j - kk); + dy_dem[ii * nnei_i * nnei_j + jj * nnei_j + kk] += res * rr * (nnei_j - kk); + } + else { + grad += (a1 + (2 * a2 + (3 * a3 + (4 * a4 + 5 * a5 * xx) * xx) * xx) * xx) * ll * rr; + dy_dem[ii * nnei_i * nnei_j + jj * nnei_j + kk] += res * rr; + } + } + dy_dem_x[ii * nnei_i * nnei_j + jj * nnei_j + kk] = grad; + if (unloop) break; + } + } + } +} + +template +void deepmd::tabulate_fusion_se_t_grad_grad_cpu( + FPTYPE * dz_dy, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE * dz_dy_dem_x, + const FPTYPE * dz_dy_dem, + const int nloc, + const int nnei, + const int nnei_j, + const int last_layer_size) +{ + memset(dz_dy, 0.0, sizeof(FPTYPE) * nloc * 4 * last_layer_size); + const FPTYPE lower = table_info[0]; + const FPTYPE upper = table_info[1]; + const FPTYPE _max = table_info[2]; + const FPTYPE stride0 = table_info[3]; + const FPTYPE stride1 = table_info[4]; + // for every atom, execute a small manual gemm ~ + // FPTYPE * res = new FPTYPE[4 * last_layer_size]; + #pragma omp parallel for + for (int ii = 0; ii < nloc; ii++) { + FPTYPE ll[4]; + FPTYPE hh[4]; + FPTYPE ago = em_x[ii * nnei + nnei - 1]; + bool unloop = false; + for (int jj = 
0; jj < nnei; jj++) { + ll[0] = em[ii * nnei * 4 + jj * 4 + 0]; + ll[1] = em[ii * nnei * 4 + jj * 4 + 1]; + ll[2] = em[ii * nnei * 4 + jj * 4 + 2]; + ll[3] = em[ii * nnei * 4 + jj * 4 + 3]; + hh[0] = dz_dy_dem[ii * nnei * 4 + jj * 4 + 0]; + hh[1] = dz_dy_dem[ii * nnei * 4 + jj * 4 + 1]; + hh[2] = dz_dy_dem[ii * nnei * 4 + jj * 4 + 2]; + hh[3] = dz_dy_dem[ii * nnei * 4 + jj * 4 + 3]; + FPTYPE xx = em_x[ii * nnei + jj]; + FPTYPE dz_xx = dz_dy_dem_x[ii * nnei + jj]; + if (ago == xx) { + unloop = true; + } + int table_idx = 0; + locate_xx(lower, upper, _max, stride0, stride1, xx, table_idx); + for (int kk = 0; kk < last_layer_size; kk++) { + FPTYPE a0 = table[table_idx * last_layer_size * 6 + 6 * kk + 0]; + FPTYPE a1 = table[table_idx * last_layer_size * 6 + 6 * kk + 1]; + FPTYPE a2 = table[table_idx * last_layer_size * 6 + 6 * kk + 2]; + FPTYPE a3 = table[table_idx * last_layer_size * 6 + 6 * kk + 3]; + FPTYPE a4 = table[table_idx * last_layer_size * 6 + 6 * kk + 4]; + FPTYPE a5 = table[table_idx * last_layer_size * 6 + 6 * kk + 5]; + FPTYPE var = a0 + (a1 + (a2 + (a3 + (a4 + a5 * xx) * xx) * xx) * xx) * xx; + FPTYPE var_grad = a1 + (2 * a2 + (3 * a3 + (4 * a4 + 5 * a5 * xx) * xx) * xx) * xx; + if (unloop) { + dz_dy[ii * last_layer_size * 4 + 0 * last_layer_size + kk] += (nnei - jj) * (var * hh[0] + dz_xx * var_grad * ll[0]); + dz_dy[ii * last_layer_size * 4 + 1 * last_layer_size + kk] += (nnei - jj) * (var * hh[1] + dz_xx * var_grad * ll[1]); + dz_dy[ii * last_layer_size * 4 + 2 * last_layer_size + kk] += (nnei - jj) * (var * hh[2] + dz_xx * var_grad * ll[2]); + dz_dy[ii * last_layer_size * 4 + 3 * last_layer_size + kk] += (nnei - jj) * (var * hh[3] + dz_xx * var_grad * ll[3]); + } + else { + dz_dy[ii * last_layer_size * 4 + 0 * last_layer_size + kk] += var * hh[0] + dz_xx * var_grad * ll[0]; + dz_dy[ii * last_layer_size * 4 + 1 * last_layer_size + kk] += var * hh[1] + dz_xx * var_grad * ll[1]; + dz_dy[ii * last_layer_size * 4 + 2 * last_layer_size + kk] += var * 
hh[2] + dz_xx * var_grad * ll[2]; + dz_dy[ii * last_layer_size * 4 + 3 * last_layer_size + kk] += var * hh[3] + dz_xx * var_grad * ll[3]; + } + } + if (unloop) break; + } + } +} + +template void deepmd::tabulate_fusion_se_a_cpu(float * out, const float * table, const float * table_info, const float * em_x, const float * em, const int nloc, const int nnei, const int last_layer_size); +template void deepmd::tabulate_fusion_se_a_cpu(double * out, const double * table, const double * table_info, const double * em_x, const double * em, const int nloc, const int nnei, const int last_layer_size); +template void deepmd::tabulate_fusion_se_a_grad_cpu (float * dy_dem_x, float * dy_dem, const float * table, const float * table_info, const float * em_x, const float * em, const float * dy, const int nloc, const int nnei, const int last_layer_size); +template void deepmd::tabulate_fusion_se_a_grad_cpu (double * dy_dem_x, double * dy_dem, const double * table, const double * table_info, const double * em_x, const double * em, const double * dy, const int nloc, const int nnei, const int last_layer_size); +template void deepmd::tabulate_fusion_se_a_grad_grad_cpu(float * dz_dy, const float * table, const float * table_info, const float * em_x, const float * em, const float * dz_dy_dem_x, const float * dz_dy_dem, const int nloc, const int nnei, const int last_layer_size); +template void deepmd::tabulate_fusion_se_a_grad_grad_cpu(double * dz_dy, const double * table, const double * table_info, const double * em_x, const double * em, const double * dz_dy_dem_x, const double * dz_dy_dem, const int nloc, const int nnei, const int last_layer_size); + +template void deepmd::tabulate_fusion_se_t_cpu(float * out, const float * table, const float * table_info, const float * em_x, const float * em, const int nloc, const int nnei_i, const int nnei_j, const int last_layer_size); +template void deepmd::tabulate_fusion_se_t_cpu(double * out, const double * table, const double * table_info, const 
double * em_x, const double * em, const int nloc, const int nnei_i, const int nnei_j, const int last_layer_size); +template void deepmd::tabulate_fusion_se_t_grad_cpu (float * dy_dem_x, float * dy_dem, const float * table, const float * table_info, const float * em_x, const float * em, const float * dy, const int nloc, const int nnei_i, const int nnei_j, const int last_layer_size); +template void deepmd::tabulate_fusion_se_t_grad_cpu (double * dy_dem_x, double * dy_dem, const double * table, const double * table_info, const double * em_x, const double * em, const double * dy, const int nloc, const int nnei_i, const int nnei_j, const int last_layer_size); +template void deepmd::tabulate_fusion_se_t_grad_grad_cpu(float * dz_dy, const float * table, const float * table_info, const float * em_x, const float * em, const float * dz_dy_dem_x, const float * dz_dy_dem, const int nloc, const int nnei_i, const int nnei_j, const int last_layer_size); +template void deepmd::tabulate_fusion_se_t_grad_grad_cpu(double * dz_dy, const double * table, const double * table_info, const double * em_x, const double * em, const double * dz_dy_dem_x, const double * dz_dy_dem, const int nloc, const int nnei_i, const int nnei_j, const int last_layer_size); diff --git a/source/lib/tests/test_tabulate.cc b/source/lib/tests/test_tabulate.cc index 43c0ef798e..3fe0b27d78 100644 --- a/source/lib/tests/test_tabulate.cc +++ b/source/lib/tests/test_tabulate.cc @@ -144,10 +144,10 @@ class TestTabulate : public ::testing::Test } }; -TEST_F(TestTabulate, tabulate_fusion_cpu) +TEST_F(TestTabulate, tabulate_fusion_se_a_cpu) { std::vector xyz_scatter(nloc * nnei * last_layer_size); - deepmd::tabulate_fusion_cpu(&xyz_scatter[0], &table[0], &info[0], &em_x[0], &em[0], nloc, nnei, last_layer_size); + deepmd::tabulate_fusion_se_a_cpu(&xyz_scatter[0], &table[0], &info[0], &em_x[0], &em[0], nloc, nnei, last_layer_size); EXPECT_EQ(xyz_scatter.size(), nloc * nnei * last_layer_size); EXPECT_EQ(xyz_scatter.size(), 
expected_xyz_scatter.size()); for (int jj = 0; jj < xyz_scatter.size(); ++jj){ @@ -155,12 +155,12 @@ TEST_F(TestTabulate, tabulate_fusion_cpu) } } -TEST_F(TestTabulate, tabulate_fusion_grad_cpu) +TEST_F(TestTabulate, tabulate_fusion_se_a_grad_cpu) { std::vector dy_dem_x(em_x.size()); std::vector dy_dem(em.size()); std::vector dy(nloc * nnei * last_layer_size, 1.0); - deepmd::tabulate_fusion_grad_cpu(&dy_dem_x[0], &dy_dem[0], &table[0], &info[0], &em_x[0], &em[0], &dy[0], nloc, nnei, last_layer_size); + deepmd::tabulate_fusion_se_a_grad_cpu(&dy_dem_x[0], &dy_dem[0], &table[0], &info[0], &em_x[0], &em[0], &dy[0], nloc, nnei, last_layer_size); EXPECT_EQ(dy_dem_x.size(), nloc * nnei); EXPECT_EQ(dy_dem.size(), nloc * nnei * 4); EXPECT_EQ(dy_dem_x.size(), expected_dy_dem_x.size()); @@ -174,7 +174,7 @@ TEST_F(TestTabulate, tabulate_fusion_grad_cpu) } #if GOOGLE_CUDA -TEST_F(TestTabulate, tabulate_fusion_gpu_cuda) +TEST_F(TestTabulate, tabulate_fusion_se_a_gpu_cuda) { std::vector xyz_scatter(nloc * nnei * last_layer_size, 0.0); @@ -183,7 +183,7 @@ TEST_F(TestTabulate, tabulate_fusion_gpu_cuda) deepmd::malloc_device_memory_sync(table_dev, table); deepmd::malloc_device_memory_sync(em_x_dev, em_x); deepmd::malloc_device_memory_sync(em_dev, em); - deepmd::tabulate_fusion_gpu_cuda(xyz_scatter_dev, table_dev, &info[0], em_x_dev, em_dev, nloc, nnei, last_layer_size); + deepmd::tabulate_fusion_se_a_gpu_cuda(xyz_scatter_dev, table_dev, &info[0], em_x_dev, em_dev, nloc, nnei, last_layer_size); deepmd::memcpy_device_to_host(xyz_scatter_dev, xyz_scatter); deepmd::delete_device_memory(xyz_scatter_dev); deepmd::delete_device_memory(table_dev); @@ -197,7 +197,7 @@ TEST_F(TestTabulate, tabulate_fusion_gpu_cuda) } } -TEST_F(TestTabulate, tabulate_fusion_grad_gpu_cuda) +TEST_F(TestTabulate, tabulate_fusion_se_a_grad_gpu_cuda) { std::vector dy_dem_x(em_x.size(), 0.0); std::vector dy_dem(em.size(), 0.0); @@ -210,7 +210,7 @@ TEST_F(TestTabulate, tabulate_fusion_grad_gpu_cuda) 
deepmd::malloc_device_memory_sync(em_x_dev, em_x); deepmd::malloc_device_memory_sync(em_dev, em); deepmd::malloc_device_memory_sync(dy_dev, dy); - deepmd::tabulate_fusion_grad_gpu_cuda(dy_dem_x_dev, dy_dem_dev, table_dev, &info[0], em_x_dev, em_dev, dy_dev, nloc, nnei, last_layer_size); + deepmd::tabulate_fusion_se_a_grad_gpu_cuda(dy_dem_x_dev, dy_dem_dev, table_dev, &info[0], em_x_dev, em_dev, dy_dev, nloc, nnei, last_layer_size); deepmd::memcpy_device_to_host(dy_dem_x_dev, dy_dem_x); deepmd::memcpy_device_to_host(dy_dem_dev, dy_dem); deepmd::delete_device_memory(dy_dem_x_dev); @@ -234,7 +234,7 @@ TEST_F(TestTabulate, tabulate_fusion_grad_gpu_cuda) #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM -TEST_F(TestTabulate, tabulate_fusion_gpu_rocm) +TEST_F(TestTabulate, tabulate_fusion_se_a_gpu_rocm) { std::vector xyz_scatter(nloc * nnei * last_layer_size, 0.0); @@ -243,7 +243,7 @@ TEST_F(TestTabulate, tabulate_fusion_gpu_rocm) deepmd::malloc_device_memory_sync(table_dev, table); deepmd::malloc_device_memory_sync(em_x_dev, em_x); deepmd::malloc_device_memory_sync(em_dev, em); - deepmd::tabulate_fusion_gpu_rocm(xyz_scatter_dev, table_dev, &info[0], em_x_dev, em_dev, nloc, nnei, last_layer_size); + deepmd::tabulate_fusion_se_a_gpu_rocm(xyz_scatter_dev, table_dev, &info[0], em_x_dev, em_dev, nloc, nnei, last_layer_size); deepmd::memcpy_device_to_host(xyz_scatter_dev, xyz_scatter); deepmd::delete_device_memory(xyz_scatter_dev); deepmd::delete_device_memory(table_dev); @@ -257,7 +257,7 @@ TEST_F(TestTabulate, tabulate_fusion_gpu_rocm) } } -TEST_F(TestTabulate, tabulate_fusion_grad_gpu_rocm) +TEST_F(TestTabulate, tabulate_fusion_se_a_grad_gpu_rocm) { std::vector dy_dem_x(em_x.size(), 0.0); std::vector dy_dem(em.size(), 0.0); @@ -270,7 +270,7 @@ TEST_F(TestTabulate, tabulate_fusion_grad_gpu_rocm) deepmd::malloc_device_memory_sync(em_x_dev, em_x); deepmd::malloc_device_memory_sync(em_dev, em); deepmd::malloc_device_memory_sync(dy_dev, dy); - 
deepmd::tabulate_fusion_grad_gpu_rocm(dy_dem_x_dev, dy_dem_dev, table_dev, &info[0], em_x_dev, em_dev, dy_dev, nloc, nnei, last_layer_size); + deepmd::tabulate_fusion_se_a_grad_gpu_rocm(dy_dem_x_dev, dy_dem_dev, table_dev, &info[0], em_x_dev, em_dev, dy_dev, nloc, nnei, last_layer_size); deepmd::memcpy_device_to_host(dy_dem_x_dev, dy_dem_x); deepmd::memcpy_device_to_host(dy_dem_dev, dy_dem); deepmd::delete_device_memory(dy_dem_x_dev); diff --git a/source/op/_tabulate_grad.py b/source/op/_tabulate_grad.py index f7be9445c7..855d9a470e 100644 --- a/source/op/_tabulate_grad.py +++ b/source/op/_tabulate_grad.py @@ -9,11 +9,23 @@ # from deepmd.DescrptSeATabulate import last_layer_size @ops.RegisterGradient("TabulateFusion") -def _tabulate_fusion_grad_cc (op, dy): - dy_dx, dy_df = op_module.tabulate_fusion_grad(op.inputs[0], op.inputs[1], op.inputs[2], op.inputs[3], dy, op.outputs[0]) +@ops.RegisterGradient("TabulateFusionSeA") +def _tabulate_fusion_se_a_grad_cc (op, dy): + dy_dx, dy_df = op_module.tabulate_fusion_se_a_grad(op.inputs[0], op.inputs[1], op.inputs[2], op.inputs[3], dy, op.outputs[0]) return [None, None, dy_dx, dy_df] @ops.RegisterGradient("TabulateFusionGrad") -def _tabulate_fusion_grad_grad_cc (op, dy, dy_): - dz_dy = op_module.tabulate_fusion_grad_grad(op.inputs[0], op.inputs[1], op.inputs[2], op.inputs[3], dy, dy_, op.inputs[5]) +@ops.RegisterGradient("TabulateFusionSeAGrad") +def _tabulate_fusion_se_a_grad_grad_cc (op, dy, dy_): + dz_dy = op_module.tabulate_fusion_se_a_grad_grad(op.inputs[0], op.inputs[1], op.inputs[2], op.inputs[3], dy, dy_, op.inputs[5]) + return [None, None, None, None, dz_dy, None] + +@ops.RegisterGradient("TabulateFusionSeT") +def _tabulate_fusion_se_t_grad_cc (op, dy): + dy_dx, dy_df = op_module.tabulate_fusion_se_t_grad(op.inputs[0], op.inputs[1], op.inputs[2], op.inputs[3], dy, op.outputs[0]) + return [None, None, dy_dx, dy_df] + +@ops.RegisterGradient("TabulateFusionSeTGrad") +def _tabulate_fusion_se_t_grad_grad_cc (op, dy, 
dy_): + dz_dy = op_module.tabulate_fusion_se_t_grad_grad(op.inputs[0], op.inputs[1], op.inputs[2], op.inputs[3], dy, dy_, op.inputs[5]) return [None, None, None, None, dz_dy, None] \ No newline at end of file diff --git a/source/op/tabulate_multi_device.cc b/source/op/tabulate_multi_device.cc index 3d3019b188..c19c88e48b 100644 --- a/source/op/tabulate_multi_device.cc +++ b/source/op/tabulate_multi_device.cc @@ -32,10 +32,72 @@ REGISTER_OP("TabulateFusionGradGrad") .Input("descriptor: T") .Output("dz_dy: T"); +REGISTER_OP("TabulateFusionSeA") + .Attr("T: {float, double} = DT_DOUBLE") + .Input("table: T") + .Input("table_info: T") + .Input("em_x: T") + .Input("em: T") + .Attr("last_layer_size: int") + .Output("descriptor: T"); + +REGISTER_OP("TabulateFusionSeAGrad") + .Attr("T: {float, double} = DT_DOUBLE") + .Input("table: T") + .Input("table_info: T") + .Input("em_x: T") + .Input("em: T") + .Input("dy: T") + .Input("descriptor: T") + .Output("dy_dem_x: T") + .Output("dy_dem: T"); + +REGISTER_OP("TabulateFusionSeAGradGrad") + .Attr("T: {float, double}") + .Input("table: T") + .Input("table_info: T") + .Input("em_x: T") + .Input("em: T") + .Input("dz_dy_dem_x: T") + .Input("dz_dy_dem: T") + .Input("descriptor: T") + .Output("dz_dy: T"); + +REGISTER_OP("TabulateFusionSeT") + .Attr("T: {float, double} = DT_DOUBLE") + .Input("table: T") + .Input("table_info: T") + .Input("em_x: T") + .Input("em: T") + .Attr("last_layer_size: int") + .Output("descriptor: T"); + +REGISTER_OP("TabulateFusionSeTGrad") + .Attr("T: {float, double} = DT_DOUBLE") + .Input("table: T") + .Input("table_info: T") + .Input("em_x: T") + .Input("em: T") + .Input("dy: T") + .Input("descriptor: T") + .Output("dy_dem_x: T") + .Output("dy_dem: T"); + +REGISTER_OP("TabulateFusionSeTGradGrad") + .Attr("T: {float, double}") + .Input("table: T") + .Input("table_info: T") + .Input("em_x: T") + .Input("em: T") + .Input("dz_dy_dem_x: T") + .Input("dz_dy_dem: T") + .Input("descriptor: T") + .Output("dz_dy: T"); 
+ template -class TabulateFusionOp : public OpKernel { +class TabulateFusionSeAOp : public OpKernel { public: - explicit TabulateFusionOp(OpKernelConstruction* context) : OpKernel(context) { + explicit TabulateFusionSeAOp(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("last_layer_size", &last_layer_size)); } void Compute(OpKernelContext* context) override { @@ -78,19 +140,19 @@ class TabulateFusionOp : public OpKernel { if (device == "GPU") { #if GOOGLE_CUDA - deepmd::tabulate_fusion_gpu_cuda( + deepmd::tabulate_fusion_se_a_gpu_cuda( descriptor, table, table_info, em_x, em, nloc, nnei, last_layer_size); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM - deepmd::tabulate_fusion_gpu_rocm( + deepmd::tabulate_fusion_se_a_gpu_rocm( descriptor, table, table_info, em_x, em, nloc, nnei, last_layer_size); #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { - deepmd::tabulate_fusion_cpu( + deepmd::tabulate_fusion_se_a_cpu( descriptor, table, table_info, em_x, em, nloc, nnei, last_layer_size); } @@ -101,9 +163,9 @@ class TabulateFusionOp : public OpKernel { }; template -class TabulateFusionGradOp : public OpKernel { +class TabulateFusionSeAGradOp : public OpKernel { public: - explicit TabulateFusionGradOp(OpKernelConstruction* context) : OpKernel(context) {} + explicit TabulateFusionSeAGradOp(OpKernelConstruction* context) : OpKernel(context) {} void Compute(OpKernelContext* context) override { deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); } @@ -150,19 +212,19 @@ class TabulateFusionGradOp : public OpKernel { if (device == "GPU") { #if GOOGLE_CUDA - deepmd::tabulate_fusion_grad_gpu_cuda( + deepmd::tabulate_fusion_se_a_grad_gpu_cuda( dy_dem_x, dy_dem, table, table_info, em_x, em, dy, nloc, nnei, last_layer_size); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM - deepmd::tabulate_fusion_grad_gpu_rocm( + deepmd::tabulate_fusion_se_a_grad_gpu_rocm( dy_dem_x, dy_dem, table, table_info, 
em_x, em, dy, nloc, nnei, last_layer_size); #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { - deepmd::tabulate_fusion_grad_cpu( + deepmd::tabulate_fusion_se_a_grad_cpu( dy_dem_x, dy_dem, table, table_info, em_x, em, dy, nloc, nnei, last_layer_size); } @@ -172,9 +234,9 @@ class TabulateFusionGradOp : public OpKernel { }; template -class TabulateFusionGradGradOp : public OpKernel { +class TabulateFusionSeAGradGradOp : public OpKernel { public: - explicit TabulateFusionGradGradOp(OpKernelConstruction* context) : OpKernel(context) {} + explicit TabulateFusionSeAGradGradOp(OpKernelConstruction* context) : OpKernel(context) {} void Compute(OpKernelContext* context) override { // Grab the input tensor int context_input_index = 0; @@ -213,19 +275,19 @@ class TabulateFusionGradGradOp : public OpKernel { if (device == "GPU") { #if GOOGLE_CUDA - deepmd::tabulate_fusion_grad_grad_gpu_cuda( + deepmd::tabulate_fusion_se_a_grad_grad_gpu_cuda( dz_dy, table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc, nnei, last_layer_size); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM - deepmd::tabulate_fusion_grad_grad_gpu_rocm( + deepmd::tabulate_fusion_se_a_grad_grad_gpu_rocm( dz_dy, table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc, nnei, last_layer_size); #endif // TENSORFLOW_USE_ROCM OP_REQUIRES (context, (last_layer_size <= 1024), errors::InvalidArgument ("In the process of model compression, the size of the last layer of embedding net must be less than 1024!")); } else if (device == "CPU") { - deepmd::tabulate_fusion_grad_grad_cpu( + deepmd::tabulate_fusion_se_a_grad_grad_cpu( dz_dy, table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc, nnei, last_layer_size); } @@ -234,30 +296,270 @@ class TabulateFusionGradGradOp : public OpKernel { std::string device; }; -#define REGISTER_CPU(T) \ -REGISTER_KERNEL_BUILDER( \ - Name("TabulateFusion").Device(DEVICE_CPU).TypeConstraint("T").HostMemory("table_info"), \ - TabulateFusionOp); \ -REGISTER_KERNEL_BUILDER( \ - 
Name("TabulateFusionGrad").Device(DEVICE_CPU).TypeConstraint("T").HostMemory("table_info"), \ - TabulateFusionGradOp); \ -REGISTER_KERNEL_BUILDER( \ - Name("TabulateFusionGradGrad").Device(DEVICE_CPU).TypeConstraint("T").HostMemory("table_info"), \ - TabulateFusionGradGradOp); +template +class TabulateFusionSeTOp : public OpKernel { + public: + explicit TabulateFusionSeTOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("last_layer_size", &last_layer_size)); + } + void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { + // Grab the input tensor + int context_input_index = 0; + const Tensor& table_tensor = context->input(context_input_index++); + const Tensor& table_info_tensor = context->input(context_input_index++); + const Tensor& em_x_tensor = context->input(context_input_index++); + const Tensor& em_tensor = context->input(context_input_index++); + // set size of the sample + OP_REQUIRES (context, (table_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of table should be 2")); + OP_REQUIRES (context, (em_x_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of em_x_tensor should be 2")); + OP_REQUIRES (context, (em_tensor.shape().dims() == 3), errors::InvalidArgument ("Dim of em_tensor should be 3")); + TensorShape descriptor_shape; + descriptor_shape.AddDim (em_tensor.shape().dim_size(0)); + descriptor_shape.AddDim (last_layer_size); + int context_output_index = 0; + Tensor* descriptor_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output( + context_output_index++, + descriptor_shape, + &descriptor_tensor)); + DeviceFunctor() ( + device, + context->eigen_device() + ); + // flat the tensors + FPTYPE * descriptor = descriptor_tensor->flat().data(); + const FPTYPE * table = table_tensor.flat().data(); + const FPTYPE * table_info = 
table_info_tensor.flat().data(); + const FPTYPE * em_x = em_x_tensor.flat().data(); + const FPTYPE * em = em_tensor.flat().data(); + const int nloc = em_tensor.shape().dim_size(0); + const int nnei_i = em_tensor.shape().dim_size(1); + const int nnei_j = em_tensor.shape().dim_size(2); + + if (device == "GPU") { + #if GOOGLE_CUDA + deepmd::tabulate_fusion_se_t_gpu_cuda( + descriptor, + table, table_info, em_x, em, nloc, nnei_i, nnei_j, last_layer_size); + #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + deepmd::tabulate_fusion_se_t_gpu_rocm( + descriptor, + table, table_info, em_x, em, nloc, nnei_i, nnei_j, last_layer_size); + #endif // TENSORFLOW_USE_ROCM + } + else if (device == "CPU") { + deepmd::tabulate_fusion_se_t_cpu( + descriptor, + table, table_info, em_x, em, nloc, nnei_i, nnei_j, last_layer_size); + } + } +private: + int last_layer_size; + std::string device; +}; + +template +class TabulateFusionSeTGradOp : public OpKernel { + public: + explicit TabulateFusionSeTGradOp(OpKernelConstruction* context) : OpKernel(context) {} + void Compute(OpKernelContext* context) override { + deepmd::safe_compute(context, [this](OpKernelContext* context) {this->_Compute(context);}); + } + + void _Compute(OpKernelContext* context) { + // Grab the input tensor + int context_input_index = 0; + const Tensor& table_tensor = context->input(context_input_index++); + const Tensor& table_info_tensor = context->input(context_input_index++); + const Tensor& em_x_tensor = context->input(context_input_index++); + const Tensor& em_tensor = context->input(context_input_index++); + const Tensor& dy_tensor = context->input(context_input_index++); + const Tensor& descriptor_tensor = context->input(context_input_index++); + // set size of the sample + OP_REQUIRES (context, (dy_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of dy_tensor should be 2")); + int context_output_index = 0; + Tensor* dy_dem_x_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output( + 
context_output_index++, + em_x_tensor.shape(), + &dy_dem_x_tensor)); + Tensor* dy_dem_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output( + context_output_index++, + em_tensor.shape(), + &dy_dem_tensor)); + DeviceFunctor() ( + device, + context->eigen_device() + ); + + // flat the tensors + FPTYPE * dy_dem_x = dy_dem_x_tensor->flat().data(); + FPTYPE * dy_dem = dy_dem_tensor->flat().data(); + const FPTYPE * descriptor = descriptor_tensor.flat().data(); + const FPTYPE * table = table_tensor.flat().data(); + const FPTYPE * table_info = table_info_tensor.flat().data(); + const FPTYPE * em_x = em_x_tensor.flat().data(); + const FPTYPE * em = em_tensor.flat().data(); + const FPTYPE * dy = dy_tensor.flat().data(); + const int nloc = em_tensor.shape().dim_size(0); + const int nnei_i = em_tensor.shape().dim_size(1); + const int nnei_j = em_tensor.shape().dim_size(2); + const int last_layer_size = descriptor_tensor.shape().dim_size(1); + + if (device == "GPU") { + #if GOOGLE_CUDA + deepmd::tabulate_fusion_se_t_grad_gpu_cuda( + dy_dem_x, dy_dem, + table, table_info, em_x, em, dy, nloc, nnei_i, nnei_j, last_layer_size); + #endif // GOOGLE_CUDA + + #if TENSORFLOW_USE_ROCM + deepmd::tabulate_fusion_se_t_grad_gpu_rocm( + dy_dem_x, dy_dem, + table, table_info, em_x, em, dy, nloc, nnei_i, nnei_j, last_layer_size); + #endif // TENSORFLOW_USE_ROCM + } + else if (device == "CPU") { + deepmd::tabulate_fusion_se_t_grad_cpu( + dy_dem_x, dy_dem, + table, table_info, em_x, em, dy, nloc, nnei_i, nnei_j, last_layer_size); + } + } +private: + std::string device; +}; + +template +class TabulateFusionSeTGradGradOp : public OpKernel { + public: + explicit TabulateFusionSeTGradGradOp(OpKernelConstruction* context) : OpKernel(context) {} + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& table_tensor = context->input(context_input_index++); + const Tensor& table_info_tensor = 
context->input(context_input_index++); + const Tensor& em_x_tensor = context->input(context_input_index++); + const Tensor& em_tensor = context->input(context_input_index++); + const Tensor& dz_dy_dem_x_tensor = context->input(context_input_index++); + const Tensor& dz_dy_dem_tensor = context->input(context_input_index++); + const Tensor& descriptor_tensor = context->input(context_input_index++); + // set size of the sample + OP_REQUIRES (context, (dz_dy_dem_x_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES (context, (dz_dy_dem_tensor.shape().dims() == 3), errors::InvalidArgument ("Dim of input should be 3")); + int context_output_index = 0; + Tensor* dz_dy_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output( + context_output_index++, + descriptor_tensor.shape(), + &dz_dy_tensor)); + DeviceFunctor() ( + device, + context->eigen_device() + ); + + // flat the tensors + FPTYPE * dz_dy = dz_dy_tensor->flat().data(); + const FPTYPE * table = table_tensor.flat().data(); + const FPTYPE * table_info = table_info_tensor.flat().data(); + const FPTYPE * em_x = em_x_tensor.flat().data(); + const FPTYPE * em = em_tensor.flat().data(); + const FPTYPE * dz_dy_dem_x = dz_dy_dem_x_tensor.flat().data(); + const FPTYPE * dz_dy_dem = dz_dy_dem_tensor.flat().data(); + const int nloc = em_tensor.shape().dim_size(0); + const int nnei_i = em_tensor.shape().dim_size(1); + const int nnei_j = em_tensor.shape().dim_size(2); + const int last_layer_size = descriptor_tensor.shape().dim_size(2); + + if (device == "GPU") { + #if GOOGLE_CUDA + deepmd::tabulate_fusion_se_t_grad_grad_gpu_cuda( + dz_dy, + table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc, nnei_i, nnei_j, last_layer_size); + #endif // GOOGLE_CUDA + #if TENSORFLOW_USE_ROCM + deepmd::tabulate_fusion_se_t_grad_grad_gpu_rocm( + dz_dy, + table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc, nnei_i, nnei_j, last_layer_size); + #endif // TENSORFLOW_USE_ROCM + 
OP_REQUIRES (context, (last_layer_size <= 1024), errors::InvalidArgument ("In the process of model compression, the size of the last layer of embedding net must be less than 1024!")); + } + else if (device == "CPU") { + deepmd::tabulate_fusion_se_t_grad_grad_cpu( + dz_dy, + table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc, nnei_i, nnei_j, last_layer_size); + } + } +private: + std::string device; +}; + +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusion").Device(DEVICE_CPU).TypeConstraint("T"), \ + TabulateFusionSeAOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusionGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + TabulateFusionSeAGradOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusionGradGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + TabulateFusionSeAGradGradOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusionSeA").Device(DEVICE_CPU).TypeConstraint("T"), \ + TabulateFusionSeAOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusionSeAGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + TabulateFusionSeAGradOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusionSeAGradGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + TabulateFusionSeAGradGradOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusionSeT").Device(DEVICE_CPU).TypeConstraint("T"), \ + TabulateFusionSeTOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusionSeTGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + TabulateFusionSeTGradOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusionSeTGradGrad").Device(DEVICE_CPU).TypeConstraint("T"), \ + TabulateFusionSeTGradGradOp); REGISTER_CPU(float); REGISTER_CPU(double); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM -#define REGISTER_GPU(T) \ -REGISTER_KERNEL_BUILDER( \ - Name("TabulateFusion").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("table_info"), \ - TabulateFusionOp); \ -REGISTER_KERNEL_BUILDER( \ - Name("TabulateFusionGrad").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("table_info"), \ - 
TabulateFusionGradOp); \ -REGISTER_KERNEL_BUILDER( \ - Name("TabulateFusionGradGrad").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("table_info"), \ - TabulateFusionGradGradOp); +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusion").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("table_info"), \ + TabulateFusionSeAOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusionGrad").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("table_info"), \ + TabulateFusionSeAGradOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusionGradGrad").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("table_info"), \ + TabulateFusionSeAGradGradOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusionSeA").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("table_info"), \ + TabulateFusionSeAOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusionSeAGrad").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("table_info"), \ + TabulateFusionSeAGradOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusionSeAGradGrad").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("table_info"), \ + TabulateFusionSeAGradGradOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusionSeT").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("table_info"), \ + TabulateFusionSeTOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusionSeTGrad").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("table_info"), \ + TabulateFusionSeTGradOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusionSeTGradGrad").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("table_info"), \ + TabulateFusionSeTGradGradOp); REGISTER_GPU(float); REGISTER_GPU(double); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/source/tests/infer/deepdipole.pbtxt b/source/tests/infer/deepdipole.pbtxt index b503c29336..8968da9409 100644 --- a/source/tests/infer/deepdipole.pbtxt +++ b/source/tests/infer/deepdipole.pbtxt @@ -5966,7 +5966,7 @@ node { dtype: DT_STRING tensor_shape { } - string_val: "1.0" + 
string_val: "1.1" } } } diff --git a/source/tests/infer/deepdipole_fake.pbtxt b/source/tests/infer/deepdipole_fake.pbtxt index 5b41d69f20..0ad18beae8 100644 --- a/source/tests/infer/deepdipole_fake.pbtxt +++ b/source/tests/infer/deepdipole_fake.pbtxt @@ -180,7 +180,7 @@ node { dtype: DT_STRING tensor_shape { } - string_val: "1.0" + string_val: "1.1" } } } diff --git a/source/tests/infer/deepdipole_new.pbtxt b/source/tests/infer/deepdipole_new.pbtxt index 4a76d9d79d..ef697e2f5c 100644 --- a/source/tests/infer/deepdipole_new.pbtxt +++ b/source/tests/infer/deepdipole_new.pbtxt @@ -180,7 +180,7 @@ node { dtype: DT_STRING tensor_shape { } - string_val: "1.0" + string_val: "1.1" } } } diff --git a/source/tests/infer/deeppolar.pbtxt b/source/tests/infer/deeppolar.pbtxt index 49b9645b68..22d9bbc71d 100644 --- a/source/tests/infer/deeppolar.pbtxt +++ b/source/tests/infer/deeppolar.pbtxt @@ -6194,7 +6194,7 @@ node { dtype: DT_STRING tensor_shape { } - string_val: "1.0" + string_val: "1.1" } } } diff --git a/source/tests/infer/deeppolar_new.pbtxt b/source/tests/infer/deeppolar_new.pbtxt index f680add745..aa74204142 100644 --- a/source/tests/infer/deeppolar_new.pbtxt +++ b/source/tests/infer/deeppolar_new.pbtxt @@ -180,7 +180,7 @@ node { dtype: DT_STRING tensor_shape { } - string_val: "1.0" + string_val: "1.1" } } } diff --git a/source/tests/infer/deeppot-1.pbtxt b/source/tests/infer/deeppot-1.pbtxt index 0819df4b9e..1dbd60b2e1 100644 --- a/source/tests/infer/deeppot-1.pbtxt +++ b/source/tests/infer/deeppot-1.pbtxt @@ -8891,7 +8891,7 @@ node { dtype: DT_STRING tensor_shape { } - string_val: "1.0" + string_val: "1.1" } } } diff --git a/source/tests/infer/deeppot-r.pbtxt b/source/tests/infer/deeppot-r.pbtxt index c307be00f0..2bf26d40c5 100644 --- a/source/tests/infer/deeppot-r.pbtxt +++ b/source/tests/infer/deeppot-r.pbtxt @@ -8545,7 +8545,7 @@ node { dtype: DT_STRING tensor_shape { } - string_val: "1.0" + string_val: "1.1" } } } diff --git a/source/tests/infer/deeppot.pbtxt 
b/source/tests/infer/deeppot.pbtxt index c7c49e2483..47eac4825f 100644 --- a/source/tests/infer/deeppot.pbtxt +++ b/source/tests/infer/deeppot.pbtxt @@ -8891,7 +8891,7 @@ node { dtype: DT_STRING tensor_shape { } - string_val: "1.0" + string_val: "1.1" } } } diff --git a/source/tests/infer/dipolecharge_d.pbtxt b/source/tests/infer/dipolecharge_d.pbtxt index 6be963119f..7ea390dcb8 100644 --- a/source/tests/infer/dipolecharge_d.pbtxt +++ b/source/tests/infer/dipolecharge_d.pbtxt @@ -7119,7 +7119,7 @@ node { dtype: DT_STRING tensor_shape { } - string_val: "1.0" + string_val: "1.1" } } } diff --git a/source/tests/infer/dipolecharge_e.pbtxt b/source/tests/infer/dipolecharge_e.pbtxt index ec9412a111..891ec09d68 100644 --- a/source/tests/infer/dipolecharge_e.pbtxt +++ b/source/tests/infer/dipolecharge_e.pbtxt @@ -12418,7 +12418,7 @@ node { dtype: DT_STRING tensor_shape { } - string_val: "1.0" + string_val: "1.1" } } } @@ -56051,7 +56051,7 @@ node { dtype: DT_STRING tensor_shape { } - string_val: "1.0" + string_val: "1.1" } } } diff --git a/source/tests/test_model_compression.py b/source/tests/test_model_compression_se_a.py similarity index 100% rename from source/tests/test_model_compression.py rename to source/tests/test_model_compression_se_a.py diff --git a/source/tests/test_model_compression_se_t.py b/source/tests/test_model_compression_se_t.py new file mode 100644 index 0000000000..5d1efc08a2 --- /dev/null +++ b/source/tests/test_model_compression_se_t.py @@ -0,0 +1,420 @@ +import os,sys,platform,shutil,dpdata,json +import numpy as np +import unittest +import subprocess as sp + +from deepmd.infer import DeepPot +from deepmd.env import MODEL_VERSION +# from deepmd.entrypoints.compress import compress +from common import j_loader, tests_path + +from deepmd.env import GLOBAL_NP_FLOAT_PRECISION +if GLOBAL_NP_FLOAT_PRECISION == np.float32 : + default_places = 4 +else : + default_places = 10 + +def _file_delete(file) : + if os.path.isdir(file): + os.rmdir(file) + elif 
os.path.isfile(file): + os.remove(file) + +def _subprocess_run(command): + popen = sp.Popen(command.split(), shell=False, stdout=sp.PIPE, stderr=sp.STDOUT) + for line in iter(popen.stdout.readline, b''): + if hasattr(line, 'decode'): + line = line.decode('utf-8') + line = line.rstrip() + print(line) + popen.wait() + return popen.returncode + +def _init_models(): + data_file = str(tests_path / os.path.join("model_compression", "data")) + frozen_model = str(tests_path / "dp-original-se-t.pb") + compressed_model = str(tests_path / "dp-compressed-se-t.pb") + INPUT = str(tests_path / "input.json") + jdata = j_loader(str(tests_path / os.path.join("model_compression", "input.json"))) + jdata["model"]["descriptor"] = {} + jdata["model"]["descriptor"]["type"] = "se_e3" + jdata["model"]["descriptor"]["sel"] = [46, 92] + jdata["model"]["descriptor"]["rcut_smth"] = 0.5 + jdata["model"]["descriptor"]["rcut"] = 6.0 + jdata["model"]["descriptor"]["neuron"] = [4,8,16] + jdata["model"]["descriptor"]["resnet_dt"] = False + jdata["model"]["descriptor"]["seed"] = 1 + jdata["training"]["training_data"]["systems"] = data_file + jdata["training"]["validation_data"]["systems"] = data_file + with open(INPUT, "w") as fp: + json.dump(jdata, fp, indent=4) + + ret = _subprocess_run("dp train " + INPUT) + np.testing.assert_equal(ret, 0, 'DP train failed!') + ret = _subprocess_run("dp freeze -o " + frozen_model) + np.testing.assert_equal(ret, 0, 'DP freeze failed!') + ret = _subprocess_run("dp compress " + " -i " + frozen_model + " -o " + compressed_model) + np.testing.assert_equal(ret, 0, 'DP model compression failed!') + return INPUT, frozen_model, compressed_model + +INPUT, FROZEN_MODEL, COMPRESSED_MODEL = _init_models() + +class TestDeepPotAPBC(unittest.TestCase) : + @classmethod + def setUpClass(self): + self.dp_original = DeepPot(FROZEN_MODEL) + self.dp_compressed = DeepPot(COMPRESSED_MODEL) + self.coords = np.array([12.83, 2.56, 2.18, + 12.09, 2.87, 2.74, + 00.25, 3.32, 1.68, + 3.36, 
3.00, 1.81, + 3.51, 2.51, 2.60, + 4.27, 3.22, 1.56]) + self.atype = [0, 1, 1, 0, 1, 1] + self.box = np.array([13., 0., 0., 0., 13., 0., 0., 0., 13.]) + + def test_attrs(self): + self.assertEqual(self.dp_original.get_ntypes(), 2) + self.assertAlmostEqual(self.dp_original.get_rcut(), 6.0, places = default_places) + self.assertEqual(self.dp_original.get_type_map(), ['O', 'H']) + self.assertEqual(self.dp_original.get_dim_fparam(), 0) + self.assertEqual(self.dp_original.get_dim_aparam(), 0) + + self.assertEqual(self.dp_compressed.get_ntypes(), 2) + self.assertAlmostEqual(self.dp_compressed.get_rcut(), 6.0, places = default_places) + self.assertEqual(self.dp_compressed.get_type_map(), ['O', 'H']) + self.assertEqual(self.dp_compressed.get_dim_fparam(), 0) + self.assertEqual(self.dp_compressed.get_dim_aparam(), 0) + + def test_1frame(self): + ee0, ff0, vv0 = self.dp_original.eval(self.coords, self.box, self.atype, atomic = False) + ee1, ff1, vv1 = self.dp_compressed.eval(self.coords, self.box, self.atype, atomic = False) + # check shape of the returns + nframes = 1 + natoms = len(self.atype) + self.assertEqual(ee0.shape, (nframes,1)) + self.assertEqual(ff0.shape, (nframes,natoms,3)) + self.assertEqual(vv0.shape, (nframes,9)) + self.assertEqual(ee1.shape, (nframes,1)) + self.assertEqual(ff1.shape, (nframes,natoms,3)) + self.assertEqual(vv1.shape, (nframes,9)) + # check values + np.testing.assert_almost_equal(ff0, ff1, default_places) + np.testing.assert_almost_equal(ee0, ee1, default_places) + np.testing.assert_almost_equal(vv0, vv1, default_places) + + def test_1frame_atm(self): + ee0, ff0, vv0, ae0, av0 = self.dp_original.eval(self.coords, self.box, self.atype, atomic = True) + ee1, ff1, vv1, ae1, av1 = self.dp_compressed.eval(self.coords, self.box, self.atype, atomic = True) + # check shape of the returns + nframes = 1 + natoms = len(self.atype) + self.assertEqual(ee0.shape, (nframes,1)) + self.assertEqual(ff0.shape, (nframes,natoms,3)) + self.assertEqual(vv0.shape, 
(nframes,9)) + self.assertEqual(ae0.shape, (nframes,natoms,1)) + self.assertEqual(av0.shape, (nframes,natoms,9)) + self.assertEqual(ee1.shape, (nframes,1)) + self.assertEqual(ff1.shape, (nframes,natoms,3)) + self.assertEqual(vv1.shape, (nframes,9)) + self.assertEqual(ae1.shape, (nframes,natoms,1)) + self.assertEqual(av1.shape, (nframes,natoms,9)) + # check values + np.testing.assert_almost_equal(ff0, ff1, default_places) + np.testing.assert_almost_equal(ae0, ae1, default_places) + np.testing.assert_almost_equal(av0, av1, default_places) + np.testing.assert_almost_equal(ee0, ee1, default_places) + np.testing.assert_almost_equal(vv0, vv1, default_places) + + def test_2frame_atm(self): + coords2 = np.concatenate((self.coords, self.coords)) + box2 = np.concatenate((self.box, self.box)) + ee0, ff0, vv0, ae0, av0 = self.dp_original.eval(coords2, box2, self.atype, atomic = True) + ee1, ff1, vv1, ae1, av1 = self.dp_compressed.eval(coords2, box2, self.atype, atomic = True) + # check shape of the returns + nframes = 2 + natoms = len(self.atype) + self.assertEqual(ee0.shape, (nframes,1)) + self.assertEqual(ff0.shape, (nframes,natoms,3)) + self.assertEqual(vv0.shape, (nframes,9)) + self.assertEqual(ae0.shape, (nframes,natoms,1)) + self.assertEqual(av0.shape, (nframes,natoms,9)) + self.assertEqual(ee1.shape, (nframes,1)) + self.assertEqual(ff1.shape, (nframes,natoms,3)) + self.assertEqual(vv1.shape, (nframes,9)) + self.assertEqual(ae1.shape, (nframes,natoms,1)) + self.assertEqual(av1.shape, (nframes,natoms,9)) + + # check values + np.testing.assert_almost_equal(ff0, ff1, default_places) + np.testing.assert_almost_equal(ae0, ae1, default_places) + np.testing.assert_almost_equal(av0, av1, default_places) + np.testing.assert_almost_equal(ee0, ee1, default_places) + np.testing.assert_almost_equal(vv0, vv1, default_places) + + +class TestDeepPotANoPBC(unittest.TestCase) : + @classmethod + def setUpClass(self): + self.dp_original = DeepPot(FROZEN_MODEL) + self.dp_compressed = 
DeepPot(COMPRESSED_MODEL) + self.coords = np.array([12.83, 2.56, 2.18, + 12.09, 2.87, 2.74, + 00.25, 3.32, 1.68, + 3.36, 3.00, 1.81, + 3.51, 2.51, 2.60, + 4.27, 3.22, 1.56]) + self.atype = [0, 1, 1, 0, 1, 1] + self.box = None + + def test_1frame(self): + ee0, ff0, vv0 = self.dp_original.eval(self.coords, self.box, self.atype, atomic = False) + ee1, ff1, vv1 = self.dp_compressed.eval(self.coords, self.box, self.atype, atomic = False) + # check shape of the returns + nframes = 1 + natoms = len(self.atype) + self.assertEqual(ee0.shape, (nframes,1)) + self.assertEqual(ff0.shape, (nframes,natoms,3)) + self.assertEqual(vv0.shape, (nframes,9)) + self.assertEqual(ee1.shape, (nframes,1)) + self.assertEqual(ff1.shape, (nframes,natoms,3)) + self.assertEqual(vv1.shape, (nframes,9)) + # check values + np.testing.assert_almost_equal(ff0, ff1, default_places) + np.testing.assert_almost_equal(ee0, ee1, default_places) + np.testing.assert_almost_equal(vv0, vv1, default_places) + + def test_1frame_atm(self): + ee0, ff0, vv0, ae0, av0 = self.dp_original.eval(self.coords, self.box, self.atype, atomic = True) + ee1, ff1, vv1, ae1, av1 = self.dp_compressed.eval(self.coords, self.box, self.atype, atomic = True) + # check shape of the returns + nframes = 1 + natoms = len(self.atype) + self.assertEqual(ee0.shape, (nframes,1)) + self.assertEqual(ff0.shape, (nframes,natoms,3)) + self.assertEqual(vv0.shape, (nframes,9)) + self.assertEqual(ae0.shape, (nframes,natoms,1)) + self.assertEqual(av0.shape, (nframes,natoms,9)) + self.assertEqual(ee1.shape, (nframes,1)) + self.assertEqual(ff1.shape, (nframes,natoms,3)) + self.assertEqual(vv1.shape, (nframes,9)) + self.assertEqual(ae1.shape, (nframes,natoms,1)) + self.assertEqual(av1.shape, (nframes,natoms,9)) + # check values + np.testing.assert_almost_equal(ff0, ff1, default_places) + np.testing.assert_almost_equal(ae0, ae1, default_places) + np.testing.assert_almost_equal(av0, av1, default_places) + np.testing.assert_almost_equal(ee0, ee1, 
default_places) + np.testing.assert_almost_equal(vv0, vv1, default_places) + + def test_2frame_atm(self): + coords2 = np.concatenate((self.coords, self.coords)) + ee0, ff0, vv0, ae0, av0 = self.dp_original.eval(coords2, self.box, self.atype, atomic = True) + ee1, ff1, vv1, ae1, av1 = self.dp_compressed.eval(coords2, self.box, self.atype, atomic = True) + # check shape of the returns + nframes = 2 + natoms = len(self.atype) + self.assertEqual(ee0.shape, (nframes,1)) + self.assertEqual(ff0.shape, (nframes,natoms,3)) + self.assertEqual(vv0.shape, (nframes,9)) + self.assertEqual(ae0.shape, (nframes,natoms,1)) + self.assertEqual(av0.shape, (nframes,natoms,9)) + self.assertEqual(ee1.shape, (nframes,1)) + self.assertEqual(ff1.shape, (nframes,natoms,3)) + self.assertEqual(vv1.shape, (nframes,9)) + self.assertEqual(ae1.shape, (nframes,natoms,1)) + self.assertEqual(av1.shape, (nframes,natoms,9)) + + # check values + np.testing.assert_almost_equal(ff0, ff1, default_places) + np.testing.assert_almost_equal(ae0, ae1, default_places) + np.testing.assert_almost_equal(av0, av1, default_places) + np.testing.assert_almost_equal(ee0, ee1, default_places) + np.testing.assert_almost_equal(vv0, vv1, default_places) + + +class TestDeepPotALargeBoxNoPBC(unittest.TestCase) : + @classmethod + def setUpClass(self): + self.dp_original = DeepPot(FROZEN_MODEL) + self.dp_compressed = DeepPot(COMPRESSED_MODEL) + self.coords = np.array([12.83, 2.56, 2.18, + 12.09, 2.87, 2.74, + 00.25, 3.32, 1.68, + 3.36, 3.00, 1.81, + 3.51, 2.51, 2.60, + 4.27, 3.22, 1.56]) + self.atype = [0, 1, 1, 0, 1, 1] + self.box = np.array([19., 0., 0., 0., 13., 0., 0., 0., 13.]) + + def test_1frame(self): + ee0, ff0, vv0 = self.dp_original.eval(self.coords, self.box, self.atype, atomic = False) + ee1, ff1, vv1 = self.dp_compressed.eval(self.coords, self.box, self.atype, atomic = False) + # check shape of the returns + nframes = 1 + natoms = len(self.atype) + self.assertEqual(ee0.shape, (nframes,1)) + 
self.assertEqual(ff0.shape, (nframes,natoms,3)) + self.assertEqual(vv0.shape, (nframes,9)) + self.assertEqual(ee1.shape, (nframes,1)) + self.assertEqual(ff1.shape, (nframes,natoms,3)) + self.assertEqual(vv1.shape, (nframes,9)) + # check values + np.testing.assert_almost_equal(ff0, ff1, default_places) + np.testing.assert_almost_equal(ee0, ee1, default_places) + np.testing.assert_almost_equal(vv0, vv1, default_places) + + def test_1frame_atm(self): + ee0, ff0, vv0, ae0, av0 = self.dp_original.eval(self.coords, self.box, self.atype, atomic = True) + ee1, ff1, vv1, ae1, av1 = self.dp_compressed.eval(self.coords, self.box, self.atype, atomic = True) + # check shape of the returns + nframes = 1 + natoms = len(self.atype) + self.assertEqual(ee0.shape, (nframes,1)) + self.assertEqual(ff0.shape, (nframes,natoms,3)) + self.assertEqual(vv0.shape, (nframes,9)) + self.assertEqual(ae0.shape, (nframes,natoms,1)) + self.assertEqual(av0.shape, (nframes,natoms,9)) + self.assertEqual(ee1.shape, (nframes,1)) + self.assertEqual(ff1.shape, (nframes,natoms,3)) + self.assertEqual(vv1.shape, (nframes,9)) + self.assertEqual(ae1.shape, (nframes,natoms,1)) + self.assertEqual(av1.shape, (nframes,natoms,9)) + # check values + np.testing.assert_almost_equal(ff0, ff1, default_places) + np.testing.assert_almost_equal(ae0, ae1, default_places) + np.testing.assert_almost_equal(av0, av1, default_places) + np.testing.assert_almost_equal(ee0, ee1, default_places) + np.testing.assert_almost_equal(vv0, vv1, default_places) + + def test_ase(self): + from ase import Atoms + from deepmd.calculator import DP + water0 = Atoms('OHHOHH', + positions=self.coords.reshape((-1,3)), + cell=self.box.reshape((3,3)), + calculator=DP(FROZEN_MODEL)) + water1 = Atoms('OHHOHH', + positions=self.coords.reshape((-1,3)), + cell=self.box.reshape((3,3)), + calculator=DP(COMPRESSED_MODEL)) + ee0 = water0.get_potential_energy() + ff0 = water0.get_forces() + ee1 = water1.get_potential_energy() + ff1 = water1.get_forces() + 
nframes = 1 + np.testing.assert_almost_equal(ff0, ff1, default_places) + np.testing.assert_almost_equal(ee0, ee1, default_places) + +class TestDeepPotAPBCExcludeTypes(unittest.TestCase) : + @classmethod + def setUpClass(self): + self.dp_original = DeepPot(FROZEN_MODEL) + self.dp_compressed = DeepPot(COMPRESSED_MODEL) + self.coords = np.array([12.83, 2.56, 2.18, + 12.09, 2.87, 2.74, + 00.25, 3.32, 1.68, + 3.36, 3.00, 1.81, + 3.51, 2.51, 2.60, + 4.27, 3.22, 1.56]) + self.atype = [0, 1, 1, 0, 1, 1] + self.box = np.array([13., 0., 0., 0., 13., 0., 0., 0., 13.]) + + @classmethod + def tearDownClass(self): + _file_delete(INPUT) + _file_delete(FROZEN_MODEL) + _file_delete(COMPRESSED_MODEL) + _file_delete("out.json") + _file_delete("compress.json") + _file_delete("checkpoint") + _file_delete("model.ckpt.meta") + _file_delete("model.ckpt.index") + _file_delete("model.ckpt.data-00000-of-00001") + _file_delete("model.ckpt-100.meta") + _file_delete("model.ckpt-100.index") + _file_delete("model.ckpt-100.data-00000-of-00001") + _file_delete("model-compression/checkpoint") + _file_delete("model-compression/model.ckpt.meta") + _file_delete("model-compression/model.ckpt.index") + _file_delete("model-compression/model.ckpt.data-00000-of-00001") + _file_delete("model-compression") + + def test_attrs(self): + self.assertEqual(self.dp_original.get_ntypes(), 2) + self.assertAlmostEqual(self.dp_original.get_rcut(), 6.0, places = default_places) + self.assertEqual(self.dp_original.get_type_map(), ['O', 'H']) + self.assertEqual(self.dp_original.get_dim_fparam(), 0) + self.assertEqual(self.dp_original.get_dim_aparam(), 0) + + self.assertEqual(self.dp_compressed.get_ntypes(), 2) + self.assertAlmostEqual(self.dp_compressed.get_rcut(), 6.0, places = default_places) + self.assertEqual(self.dp_compressed.get_type_map(), ['O', 'H']) + self.assertEqual(self.dp_compressed.get_dim_fparam(), 0) + self.assertEqual(self.dp_compressed.get_dim_aparam(), 0) + + def test_1frame(self): + ee0, ff0, vv0 = 
self.dp_original.eval(self.coords, self.box, self.atype, atomic = False) + ee1, ff1, vv1 = self.dp_compressed.eval(self.coords, self.box, self.atype, atomic = False) + # check shape of the returns + nframes = 1 + natoms = len(self.atype) + self.assertEqual(ee0.shape, (nframes,1)) + self.assertEqual(ff0.shape, (nframes,natoms,3)) + self.assertEqual(vv0.shape, (nframes,9)) + self.assertEqual(ee1.shape, (nframes,1)) + self.assertEqual(ff1.shape, (nframes,natoms,3)) + self.assertEqual(vv1.shape, (nframes,9)) + # check values + np.testing.assert_almost_equal(ff0, ff1, default_places) + np.testing.assert_almost_equal(ee0, ee1, default_places) + np.testing.assert_almost_equal(vv0, vv1, default_places) + + def test_1frame_atm(self): + ee0, ff0, vv0, ae0, av0 = self.dp_original.eval(self.coords, self.box, self.atype, atomic = True) + ee1, ff1, vv1, ae1, av1 = self.dp_compressed.eval(self.coords, self.box, self.atype, atomic = True) + # check shape of the returns + nframes = 1 + natoms = len(self.atype) + self.assertEqual(ee0.shape, (nframes,1)) + self.assertEqual(ff0.shape, (nframes,natoms,3)) + self.assertEqual(vv0.shape, (nframes,9)) + self.assertEqual(ae0.shape, (nframes,natoms,1)) + self.assertEqual(av0.shape, (nframes,natoms,9)) + self.assertEqual(ee1.shape, (nframes,1)) + self.assertEqual(ff1.shape, (nframes,natoms,3)) + self.assertEqual(vv1.shape, (nframes,9)) + self.assertEqual(ae1.shape, (nframes,natoms,1)) + self.assertEqual(av1.shape, (nframes,natoms,9)) + # check values + np.testing.assert_almost_equal(ff0, ff1, default_places) + np.testing.assert_almost_equal(ae0, ae1, default_places) + np.testing.assert_almost_equal(av0, av1, default_places) + np.testing.assert_almost_equal(ee0, ee1, default_places) + np.testing.assert_almost_equal(vv0, vv1, default_places) + + def test_2frame_atm(self): + coords2 = np.concatenate((self.coords, self.coords)) + box2 = np.concatenate((self.box, self.box)) + ee0, ff0, vv0, ae0, av0 = self.dp_original.eval(coords2, box2, 
self.atype, atomic = True) + ee1, ff1, vv1, ae1, av1 = self.dp_compressed.eval(coords2, box2, self.atype, atomic = True) + # check shape of the returns + nframes = 2 + natoms = len(self.atype) + self.assertEqual(ee0.shape, (nframes,1)) + self.assertEqual(ff0.shape, (nframes,natoms,3)) + self.assertEqual(vv0.shape, (nframes,9)) + self.assertEqual(ae0.shape, (nframes,natoms,1)) + self.assertEqual(av0.shape, (nframes,natoms,9)) + self.assertEqual(ee1.shape, (nframes,1)) + self.assertEqual(ff1.shape, (nframes,natoms,3)) + self.assertEqual(vv1.shape, (nframes,9)) + self.assertEqual(ae1.shape, (nframes,natoms,1)) + self.assertEqual(av1.shape, (nframes,natoms,9)) + + # check values + np.testing.assert_almost_equal(ff0, ff1, default_places) + np.testing.assert_almost_equal(ae0, ae1, default_places) + np.testing.assert_almost_equal(av0, av1, default_places) + np.testing.assert_almost_equal(ee0, ee1, default_places) + np.testing.assert_almost_equal(vv0, vv1, default_places)