From b60fb14b3d1f2f88c8ed40d9907bf6d8c5bb7cb0 Mon Sep 17 00:00:00 2001 From: denghuilu Date: Fri, 20 Aug 2021 14:27:10 +0800 Subject: [PATCH 1/9] add model compression training support --- deepmd/common.py | 41 +----- deepmd/descriptor/se_a.py | 73 ++++++++++- deepmd/entrypoints/compress.py | 4 +- deepmd/entrypoints/freeze.py | 1 + deepmd/entrypoints/main.py | 7 + deepmd/entrypoints/train.py | 6 +- deepmd/entrypoints/transfer.py | 7 +- deepmd/fit/ener.py | 25 +++- deepmd/model/ener.py | 40 ++++-- deepmd/train/run_options.py | 4 + deepmd/train/trainer.py | 54 ++++++++ deepmd/utils/graph.py | 193 ++++++++++++++++++++++++++++ deepmd/utils/network.py | 36 +++--- deepmd/utils/tabulate.py | 69 ++++------ deepmd/utils/type_embed.py | 1 - source/lib/include/tabulate.h | 39 ++++++ source/lib/src/cuda/tabulate.cu | 86 +++++++++++++ source/lib/src/rocm/tabulate.hip.cu | 129 +++++++++++-------- source/lib/src/tabulate.cc | 72 +++++++++++ source/op/CMakeLists.txt | 2 - source/op/_tabulate_grad.py | 25 +--- source/op/tabulate_multi_device.cc | 107 +++++++++++++-- 22 files changed, 814 insertions(+), 207 deletions(-) create mode 100644 deepmd/utils/graph.py diff --git a/deepmd/common.py b/deepmd/common.py index 60af7b1493..94f2e43cd6 100644 --- a/deepmd/common.py +++ b/deepmd/common.py @@ -42,6 +42,12 @@ "float64": tf.float64, } +PRECISION_MAPPING: Dict[int, type] = { + 1: np.float32, + 2: np.float64, + 19: np.float16, +} + def gelu(x: tf.Tensor) -> tf.Tensor: """Gaussian Error Linear Unit. @@ -485,38 +491,3 @@ def get_np_precision(precision: "_PRECISION") -> np.dtype: return np.float64 else: raise RuntimeError(f"{precision} is not a valid precision") - - -def get_tensor_by_name(model_file: str, - tensor_name: str) -> tf.Tensor: - """Load tensor value from the frozen model(model_file) - - Parameters - ---------- - model_file : str - The input frozen model. - tensor : tensor_name - Indicates which tensor which will be loaded from the frozen model. - - Returns - ------- - tf.Tensor - The tensor which was loaded from the frozen model. - - Raises - ------ - GraphWithoutTensorError - Whether the tensor_name is within the frozen model. - """ - graph_def = tf.GraphDef() - with open(model_file, "rb") as f: - graph_def.ParseFromString(f.read()) - with tf.Graph().as_default() as graph: - tf.import_graph_def(graph_def, name="") - try: - tensor = graph.get_tensor_by_name(tensor_name + ":0") - except KeyError as e: - raise GraphWithoutTensorError() from e - with tf.Session(graph=graph) as sess: - tensor = run_sess(sess, tensor) - return tensor diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py index 60de701886..6e86287946 100644 --- a/deepmd/descriptor/se_a.py +++ b/deepmd/descriptor/se_a.py @@ -1,6 +1,6 @@ import math import numpy as np -from typing import Tuple, List +from typing import Tuple, List, Dict, Any from deepmd.env import tf from deepmd.common import get_activation_func, get_precision, ACTIVATION_FN_DICT, PRECISION_DICT, docstring_parameter, get_np_precision @@ -10,7 +10,7 @@ from deepmd.env import op_module from deepmd.env import default_tf_session_config from deepmd.utils.network import embedding_net, embedding_net_rand_seed_shift -from deepmd.utils.tabulate import DeepTabulate +from deepmd.utils.tabulate import DPTabulate from deepmd.utils.type_embed import embed_atom_type from deepmd.utils.sess import run_sess @@ -267,15 +267,15 @@ def enable_compression(self, The overflow check frequency """ self.compress = True - self.model_file = model_file + self.table = DPTabulate(model_file, self.type_one_side, self.exclude_types) self.table_config = [table_extrapolate, table_stride_1, table_stride_2, check_frequency] - self.table = DeepTabulate(self.model_file, self.type_one_side, self.exclude_types) self.lower, self.upper \ = self.table.build(min_nbor_dist, table_extrapolate, table_stride_1, table_stride_2) + def build (self, coord_ : tf.Tensor, atype_ : tf.Tensor, @@ -392,6 +392,71 @@ def get_rot_mat(self) -> tf.Tensor: """ return self.qmat + def pass_tensors_from_frz_model(self, + descrpt_reshape : tf.Tensor, + descrpt_deriv : tf.Tensor, + rij : tf.Tensor, + nlist : tf.Tensor + ): + """ + Pass the descrpt_reshape tensor as well as descrpt_deriv tensor from the frz graph_def + + Parameters + ---------- + descrpt_reshape + The passed descrpt_reshape tensor + descrpt_deriv + The passed descrpt_deriv tensor + rij + The passed rij tensor + nlist + The passed nlist tensor + """ + self.rij = rij + self.nlist = nlist + self.descrpt_deriv = descrpt_deriv + self.descrpt_reshape = descrpt_reshape + + def get_feed_dict(self, + coord_, + atype_, + natoms, + box, + mesh): + """ + generate the deed_dict for current descriptor + + Parameters + ---------- + coord_ + The coordinate of atoms + atype_ + The type of atoms + natoms + The number of atoms. This tensor has the length of Ntypes + 2 + natoms[0]: number of local atoms + natoms[1]: total number of atoms held by this processor + natoms[i]: 2 <= i < Ntypes+2, number of type i atoms + box + The box. Can be generated by deepmd.model.make_stat_input + mesh + For historical reasons, only the length of the Tensor matters. + if size of mesh == 6, pbc is assumed. + if size of mesh == 0, no-pbc is assumed. + + Returns + ------- + feed_dict + The output feed_dict of current descriptor + """ + feed_dict = { + 't_coord:0' :coord_, + 't_type:0' :atype_, + 't_natoms:0' :natoms, + 't_box:0' :box, + 't_mesh:0' :mesh + } + return feed_dict def prod_force_virial(self, atom_ener : tf.Tensor, diff --git a/deepmd/entrypoints/compress.py b/deepmd/entrypoints/compress.py index 7755156601..35020fdd22 100644 --- a/deepmd/entrypoints/compress.py +++ b/deepmd/entrypoints/compress.py @@ -6,10 +6,11 @@ from typing import Optional from deepmd.env import tf -from deepmd.common import j_loader, get_tensor_by_name, GLOBAL_TF_FLOAT_PRECISION +from deepmd.common import j_loader, GLOBAL_TF_FLOAT_PRECISION from deepmd.utils.argcheck import normalize from deepmd.utils.compat import updata_deepmd_input from deepmd.utils.errors import GraphTooLargeError, GraphWithoutTensorError +from deepmd.utils.graph import get_tensor_by_name from .freeze import freeze from .train import train, get_rcut, get_min_nbor_dist @@ -121,6 +122,7 @@ def compress( INPUT=control_file, init_model=None, restart=None, + init_frz_model=None, output=control_file, mpi_log=mpi_log, log_level=log_level, diff --git a/deepmd/entrypoints/freeze.py b/deepmd/entrypoints/freeze.py index a8a1294a64..c6bb89fb8f 100755 --- a/deepmd/entrypoints/freeze.py +++ b/deepmd/entrypoints/freeze.py @@ -40,6 +40,7 @@ def _make_node_names(model_type: str, modifier_type: Optional[str] = None) -> Li if unknown model type """ nodes = [ + "model_type", "descrpt_attr/rcut", "descrpt_attr/ntypes", "model_attr/tmap", diff --git a/deepmd/entrypoints/main.py b/deepmd/entrypoints/main.py index 5789cfe47b..bef0d2bf58 100644 --- a/deepmd/entrypoints/main.py +++ b/deepmd/entrypoints/main.py @@ -163,6 +163,13 @@ def parse_args(args: Optional[List[str]] = None): default="out.json", help="The output file of the parameters used in training.", ) + parser_train.add_argument( + "-f", + "--init-frz-model", + type=str, + default=None, + help="Initialize the training from the frozen model.", + ) # * freeze script ****************************************************************** parser_frz = subparsers.add_parser( diff --git a/deepmd/entrypoints/train.py b/deepmd/entrypoints/train.py index 3fe0466eae..6a54e6a8dd 100755 --- a/deepmd/entrypoints/train.py +++ b/deepmd/entrypoints/train.py @@ -32,6 +32,7 @@ def train( init_model: Optional[str], restart: Optional[str], output: str, + init_frz_model: str, mpi_log: str, log_level: int, log_path: Optional[str], @@ -50,13 +51,15 @@ def train( path to checkpoint folder or None output : str path for dump file with arguments + init_frz_model : str + path to frozen model or None mpi_log : str mpi logging mode log_level : int logging level defined by int 0-3 log_path : Optional[str] logging file path or None if logs are to be output only to stdout - is_compress: Bool + is_compress: bool indicates whether in the model compress mode Raises @@ -84,6 +87,7 @@ def train( run_opt = RunOptions( init_model=init_model, restart=restart, + init_frz_model=init_frz_model, log_path=log_path, log_level=log_level, mpi_log=mpi_log diff --git a/deepmd/entrypoints/transfer.py b/deepmd/entrypoints/transfer.py index 0af45a4244..9efc07c668 100644 --- a/deepmd/entrypoints/transfer.py +++ b/deepmd/entrypoints/transfer.py @@ -2,6 +2,7 @@ from typing import Dict, Optional, Sequence, Tuple from deepmd.env import tf +from deepmd.common import PRECISION_MAPPING import re import numpy as np import logging @@ -10,12 +11,6 @@ log = logging.getLogger(__name__) -PRECISION_MAPPING: Dict[int, type] = { - 1: np.float32, - 2: np.float64, - 19: np.float16, -} - @np.vectorize def convert_number(number: int) -> float: diff --git a/deepmd/fit/ener.py b/deepmd/fit/ener.py index 03145076cb..bf8cc5bf5b 100644 --- a/deepmd/fit/ener.py +++ b/deepmd/fit/ener.py @@ -117,6 +117,9 @@ def __init__ (self, self.aparam_std = None self.aparam_inv_std = None + self.compress = False + self.fitting_net_variables = None + def get_numb_fparam(self) -> int: """ Get the number of frame parameters @@ -257,7 +260,8 @@ def _build_lower( activation_fn = self.fitting_activation_fn, precision = self.fitting_precision, trainable = self.trainable[ii], - uniform_seed = self.uniform_seed) + uniform_seed = self.uniform_seed, + initial_variables = self.fitting_net_variables) else : layer = one_layer( layer, @@ -268,7 +272,8 @@ def _build_lower( activation_fn = self.fitting_activation_fn, precision = self.fitting_precision, trainable = self.trainable[ii], - uniform_seed = self.uniform_seed) + uniform_seed = self.uniform_seed, + initial_variables = self.fitting_net_variables) if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift final_layer = one_layer( layer, @@ -280,7 +285,8 @@ def _build_lower( seed = self.seed, precision = self.fitting_precision, trainable = self.trainable[-1], - uniform_seed = self.uniform_seed) + uniform_seed = self.uniform_seed, + initial_variables = self.fitting_net_variables) if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift return final_layer @@ -445,3 +451,16 @@ def build (self, return tf.cast(tf.reshape(outs, [-1]), GLOBAL_TF_FLOAT_PRECISION) + def init_variables(self, + fitting_net_variables: dict + ) -> None: + """ + Init the fitting net variables with the given dict + + Parameters + ---------- + fitting_net_variables + The input dict which stores the fitting net variables + """ + self.compress = True + self.fitting_net_variables = fitting_net_variables \ No newline at end of file diff --git a/deepmd/model/ener.py b/deepmd/model/ener.py index 6d7230b6cd..2aa2436a57 100644 --- a/deepmd/model/ener.py +++ b/deepmd/model/ener.py @@ -3,8 +3,9 @@ from deepmd.env import tf from deepmd.utils.pair_tab import PairTab +from deepmd.utils.graph import load_graph_def from deepmd.common import ClassArg -from deepmd.env import global_cvt_2_ener_float, MODEL_VERSION +from deepmd.env import global_cvt_2_ener_float, MODEL_VERSION, GLOBAL_TF_FLOAT_PRECISION from deepmd.env import op_module from .model_stat import make_stat_input, merge_sys_stat @@ -111,6 +112,7 @@ def build (self, box, mesh, input_dict, + frz_model = None, suffix = '', reuse = None): @@ -150,16 +152,29 @@ def build (self, ) input_dict['type_embedding'] = type_embedding - dout \ - = self.descrpt.build(coord_, - atype_, - natoms, - box, - mesh, - input_dict, - suffix = suffix, - reuse = reuse) - dout = tf.identity(dout, name='o_descriptor') + if frz_model == None: + dout \ + = self.descrpt.build(coord_, + atype_, + natoms, + box, + mesh, + input_dict, + suffix = suffix, + reuse = reuse) + dout = tf.identity(dout, name='o_descriptor') + else: + tf.constant(self.rcut, + name = 'descrpt_attr/rcut', + dtype = GLOBAL_TF_FLOAT_PRECISION) + tf.constant(self.ntypes, + name = 'descrpt_attr/ntypes', + dtype = tf.int32) + feed_dict = self.descrpt.get_feed_dict(coord_, atype_, natoms, box, mesh) + return_elements = ['o_rmat:0', 'o_rmat_deriv:0', 'o_rij:0', 'o_nlist:0', 'o_descriptor:0'] + descrpt_reshape, descrpt_deriv, rij, nlist, dout = self._import_graph_def_from_frz_model(frz_model, feed_dict, return_elements) + self.descrpt.pass_tensors_from_frz_model(descrpt_reshape, descrpt_deriv, rij, nlist) + if self.srtab is not None : nlist, rij, sel_a, sel_r = self.descrpt.get_nlist() @@ -249,3 +264,6 @@ def build (self, return model_dict + def _import_graph_def_from_frz_model(self, frz_model, feed_dict, return_elements): + graph, graph_def = load_graph_def(frz_model) + return tf.import_graph_def(graph_def, input_map = feed_dict, return_elements = return_elements) \ No newline at end of file diff --git a/deepmd/train/run_options.py b/deepmd/train/run_options.py index 1ad5fc4574..8cd095323d 100644 --- a/deepmd/train/run_options.py +++ b/deepmd/train/run_options.py @@ -87,6 +87,7 @@ class RunOptions: def __init__( self, init_model: Optional[str] = None, + init_frz_model: Optional[str] = None, restart: Optional[str] = None, log_path: Optional[str] = None, log_level: int = 0, @@ -110,6 +111,9 @@ def __init__( elif init_model is not None: self.init_model = os.path.abspath(init_model) self.init_mode = "init_from_model" + elif init_frz_model is not None: + self.init_frz_model = os.path.abspath(init_frz_model) + self.init_mode = "init_from_frz_model" self._setup_logger(Path(log_path) if log_path else None, log_level, mpi_log) diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index 62545281ba..7b66753376 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -25,9 +25,11 @@ from deepmd.utils.neighbor_stat import NeighborStat from deepmd.utils.sess import run_sess from deepmd.utils.type_embed import TypeEmbedNet +from deepmd.utils.graph import get_tensor_by_name, get_fitting_net_variables from tensorflow.python.client import timeline from deepmd.env import op_module +from deepmd.utils.errors import GraphWithoutTensorError # load grad of force module import deepmd.op @@ -110,6 +112,7 @@ def _init_param(self, jdata): # fitting net fitting_type = fitting_param.get('type', 'ener') + self.fitting_type = fitting_type fitting_param.pop('type', None) fitting_param['descrpt'] = self.descrpt if fitting_type == 'ener': @@ -271,6 +274,11 @@ def _init_param(self, jdata): else: self.valid_numb_batch = 1 + # if init the graph with the frozen model + self.frz_model = None + self.model_type = None + self.init_from_frz_model = False + def build (self, data = None, @@ -292,6 +300,10 @@ def build (self, self.batch_size = data.get_batch_size() self.model.data_stat(data) + # config the init_frz_model command + if self.run_opt.init_mode == 'init_from_frz_model': + self._init_from_frz_model() + self.neighbor_stat \ = NeighborStat(self.ntypes, self.descrpt_param['rcut']) self.min_nbor_dist, self.max_nbor_size \ @@ -305,6 +317,12 @@ def build (self, else : assert 'rcut' in self.descrpt_param, "Error: descriptor must have attr rcut!" self.descrpt.enable_compression(self.model_param['compress']["min_nbor_dist"], self.model_param['compress']['model_file'], self.model_param['compress']['table_config'][0], self.model_param['compress']['table_config'][1], self.model_param['compress']['table_config'][2], self.model_param['compress']['table_config'][3]) + self.fitting.init_variables(get_fitting_net_variables(self.model_param['compress']['model_file'])) + + if self.is_compress == True or self.model_type == 'compressed_model': + tf.constant("compressed_model", name = 'model_type', dtype = tf.string) + else: + tf.constant("original_model", name = 'model_type', dtype = tf.string) self._build_lr() self._build_network(data) @@ -337,6 +355,7 @@ def _build_network(self, data): self.place_holders['box'], self.place_holders['default_mesh'], self.place_holders, + self.frz_model, suffix = "", reuse = False) @@ -392,6 +411,11 @@ def _init_session(self): log.info("restart from model %s" % self.run_opt.restart) run_sess(self.sess, init_op) self.saver.restore (self.sess, self.run_opt.restart) + elif self.run_opt.init_mode == 'init_from_frz_model' : + log.info("initialize training from the frozen model") + run_sess(self.sess, init_op) + fp = open(self.disp_file, "w") + fp.close () else : raise RuntimeError ("unkown init mode") else: @@ -631,3 +655,33 @@ def _get_place_horders(self, data_dict): prec = GLOBAL_ENER_FLOAT_PRECISION self.place_holders[kk] = tf.placeholder(prec, [None], name = 't_' + kk) self.place_holders['find_' + kk] = tf.placeholder(tf.float32, name = 't_find_' + kk) + + def _init_from_frz_model(self): + # get the model type from the frozen model(self.run_opt.init_frz_model) + try: + t_model_type = get_tensor_by_name(self.run_opt.init_frz_model, 'model_type') + self.model_type = bytes.decode(t_model_type) + except GraphWithoutTensorError as e: + # throw runtime error if there's no frozen model + if os.path.exists(self.run_opt.init_frz_model) == False: + raise RuntimeError( + "The input frozen model %s does not exist! Please check the path of the training script. " % (self.run_opt.init_frz_model + "(" + os.path.abspath(self.run_opt.init_frz_model) + ")") + ) from e + # throw runtime error if the frozen_model has no model type information... + else: + raise RuntimeError( + "The input frozen model: %s has no 'model_type' information, " + "which is not supported by the 'dp train init-frz-model' interface. " % self.run_opt.init_frz_model + ) from e + + # self.frz_model will control the self.model to import the descriptor from the given frozen model instead of building from scratch... + # initialize fitting net with the given compressed frozen model + if self.model_type == 'compressed_model' and self.fitting_type == 'ener': + self.init_from_frz_model = True + self.frz_model = self.run_opt.init_frz_model + self.fitting.init_variables(get_fitting_net_variables(self.frz_model)) + tf.constant("compressed_model", name = 'model_type', dtype = tf.string) + elif self.fitting_type != 'ener': + raise RuntimeError("The 'dp train init-frz-model' command only supports the 'ener' type fitting net currently!") + else: + raise RuntimeError("The 'dp train init-frz-model' command only supports the compressed model currently!") diff --git a/deepmd/utils/graph.py b/deepmd/utils/graph.py new file mode 100644 index 0000000000..76f66e5cf9 --- /dev/null +++ b/deepmd/utils/graph.py @@ -0,0 +1,193 @@ +import re +import numpy as np +from deepmd.env import tf +from deepmd.common import PRECISION_MAPPING +from deepmd.utils.sess import run_sess +from deepmd.utils.errors import GraphWithoutTensorError + +def load_graph_def(model_file: str): + """ + Load graph as well as the graph_def from the frozen model(model_file) + + Parameters + ---------- + model_file : str + The input frozen model. + + Returns + ------- + graph + The graph loaded from the frozen model. + graph_def + The graph_def loaded from the frozen model. + """ + graph_def = tf.GraphDef() + with open(model_file, "rb") as f: + graph_def.ParseFromString(f.read()) + with tf.Graph().as_default() as graph: + tf.import_graph_def(graph_def, name = "") + return graph, graph_def + + +def get_tensor_by_name(model_file: str, + tensor_name: str) -> tf.Tensor: + """ + Load tensor value from the frozen model(model_file) + + Parameters + ---------- + model_file : str + The input frozen model. + tensor : tensor_name + Indicates which tensor which will be loaded from the frozen model. + + Returns + ------- + tf.Tensor + The tensor which was loaded from the frozen model. + + Raises + ------ + GraphWithoutTensorError + Whether the tensor_name is within the frozen model. + """ + graph, _ = load_graph_def(model_file) + try: + tensor = graph.get_tensor_by_name(tensor_name + ':0') + except KeyError as e: + raise GraphWithoutTensorError() from e + with tf.Session(graph=graph) as sess: + tensor = run_sess(sess, tensor) + return tensor + + +def get_tensor_by_type(node, + data_type : str): + """ + Get the tensor value within the given node according to the input data_type + + Parameters + ---------- + node + The given tensorflow graph node + data_type + The data type of the node + + Returns + ---------- + tensor + The tensor value of the given node + """ + if data_type == np.float64: + tensor = np.array(node.double_val) + elif data_type == np.float32: + tensor = np.array(node.float_val) + else: + raise RunTimeError('model compression does not support the half precision') + return tensor + + +def get_embedding_net_nodes(model_file: str): + """ + Get the embedding net nodes with the given frozen model(model_file) + + Parameters + ---------- + model_file + The input frozen model. + + Returns + ---------- + embedding_net_nodes + The embedding net nodes with the given frozen model. + """ + _, graph_def = load_graph_def(model_file) + embedding_net_nodes = {} + embedding_net_pattern = "filter_type_\d+/matrix_\d+_\d+|filter_type_\d+/bias_\d+_\d+|filter_type_\d+/idt_\d+_\d+|filter_type_all/matrix_\d+_\d+|filter_type_all/bias_\d+_\d+|filter_type_all/idt_\d+_\d" + for node in graph_def.node: + if re.fullmatch(embedding_net_pattern, node.name) != None: + embedding_net_nodes[node.name] = node.attr["value"].tensor + for key in embedding_net_nodes.keys(): + assert key.find('bias') > 0 or key.find( + 'matrix') > 0, "currently, only support weight matrix and bias matrix at the tabulation op!" + return embedding_net_nodes + + +def get_embedding_net_variables(model_file : str): + """ + Get the embedding net variables with the given frozen model(model_file) + + Parameters + ---------- + model_file + The input frozen model. + + Returns + ---------- + The embedding net variables within the given frozen model. + """ + embedding_net_variables = {} + embedding_net_nodes = get_embedding_net_nodes(model_file) + for item in embedding_net_nodes: + node = embedding_net_nodes[item] + dtype = PRECISION_MAPPING[node.dtype] + tensor_shape = tf.TensorShape(node.tensor_shape).as_list() + if (len(tensor_shape) != 1) or (tensor_shape[0] != 1): + tensor_value = np.frombuffer(node.tensor_content) + else: + tensor_value = get_tensor_by_type(node, dtype) + embedding_net_variables[item] = np.reshape(tensor_value, tensor_shape) + return embedding_net_variables + + +def get_fitting_net_nodes(model_file : str): + """ + Get the fitting net nodes with the given frozen model(model_file) + + Parameters + ---------- + model_file + The input frozen model. + + Returns + ---------- + fitting_net_nodes + The fitting net nodes with the given frozen model. + """ + _, graph_def = load_graph_def(model_file) + fitting_net_nodes = {} + fitting_net_pattern = "layer_\d+_type_\d+/matrix+|layer_\d+_type_\d+/bias+|layer_\d+_type_\d+/idt+|final_layer_type_\d+/matrix+|final_layer_type_\d+/bias" + for node in graph_def.node: + if re.fullmatch(fitting_net_pattern, node.name) != None: + fitting_net_nodes[node.name] = node.attr["value"].tensor + for key in fitting_net_nodes.keys(): + assert key.find('bias') > 0 or key.find('matrix') > 0 or key.find( + 'idt') > 0, "currently, only support weight matrix, bias and idt at the model compression process!" + return fitting_net_nodes + + +def get_fitting_net_variables(model_file : str): + """ + Get the fitting net variables with the given frozen model(model_file) + + Parameters + ---------- + model_file + The input frozen model. + + Returns + ---------- + The fitting net variables within the given frozen model. + """ + fitting_net_variables = {} + fitting_net_nodes = get_fitting_net_nodes(model_file) + for item in fitting_net_nodes: + node = fitting_net_nodes[item] + dtype= PRECISION_MAPPING[node.dtype] + tensor_shape = tf.TensorShape(node.tensor_shape).as_list() + if (len(tensor_shape) != 1) or (tensor_shape[0] != 1): + tensor_value = np.frombuffer(node.tensor_content) + else: + tensor_value = get_tensor_by_type(node, dtype) + fitting_net_variables[item] = np.reshape(tensor_value, tensor_shape) + return fitting_net_variables \ No newline at end of file diff --git a/deepmd/utils/network.py b/deepmd/utils/network.py index b7d2fb24c2..f6b8108831 100644 --- a/deepmd/utils/network.py +++ b/deepmd/utils/network.py @@ -18,38 +18,44 @@ def one_layer(inputs, use_timestep = False, trainable = True, useBN = False, - uniform_seed = False): + uniform_seed = False, + initial_variables = None): with tf.variable_scope(name, reuse=reuse): shape = inputs.get_shape().as_list() + w_initializer = tf.random_normal_initializer( + stddev=stddev / np.sqrt(shape[1] + outputs_size), + seed=seed if (seed is None or uniform_seed) else seed + 0) + b_initializer = tf.random_normal_initializer( + stddev=stddev, + mean=bavg, + seed=seed if (seed is None or uniform_seed) else seed + 1) + if initial_variables is not None: + w_initializer = tf.constant_initializer(initial_variables[name + '/matrix']) + b_initializer = tf.constant_initializer(initial_variables[name + '/bias']) w = tf.get_variable('matrix', [shape[1], outputs_size], precision, - tf.random_normal_initializer( - stddev=stddev/np.sqrt(shape[1]+outputs_size), - seed = seed if (seed is None or uniform_seed) else seed + 0 - ), + w_initializer, trainable = trainable) variable_summaries(w, 'matrix') b = tf.get_variable('bias', [outputs_size], precision, - tf.random_normal_initializer( - stddev=stddev, - mean = bavg, - seed = seed if (seed is None or uniform_seed) else seed + 1 - ), + b_initializer, trainable = trainable) variable_summaries(b, 'bias') hidden = tf.matmul(inputs, w) + b if activation_fn != None and use_timestep : + idt_initializer = tf.random_normal_initializer( + stddev=0.001, + mean=0.1, + seed=seed if (seed is None or uniform_seed) else seed + 2) + if initial_variables is not None: + idt_initializer = tf.constant_initializer(initial_variables[name + '/idt']) idt = tf.get_variable('idt', [outputs_size], precision, - tf.random_normal_initializer( - stddev=0.001, - mean = 0.1, - seed = seed if (seed is None or uniform_seed) else seed + 2 - ), + idt_initializer, trainable = trainable) variable_summaries(idt, 'idt') if activation_fn != None: diff --git a/deepmd/utils/tabulate.py b/deepmd/utils/tabulate.py index 6a2f54e33e..439c28fa75 100644 --- a/deepmd/utils/tabulate.py +++ b/deepmd/utils/tabulate.py @@ -6,12 +6,15 @@ from deepmd.env import tf from deepmd.env import op_module from deepmd.utils.sess import run_sess +from deepmd.utils.graph import get_tensor_by_name, load_graph_def +from deepmd.utils.graph import get_fitting_net_nodes, get_embedding_net_nodes +from deepmd.utils.graph import get_fitting_net_variables, get_embedding_net_variables from tensorflow.python.platform import gfile from tensorflow.python.framework import tensor_util log = logging.getLogger(__name__) -class DeepTabulate(): +class DPTabulate(): """ Class for tabulation. Compress a model, which including tabulating the embedding-net. @@ -41,7 +44,7 @@ def __init__(self, if self.type_one_side and len(self.exclude_types) != 0: raise RunTimeError('"type_one_side" is not compatible with "exclude_types"') - self.graph, self.graph_def = self._load_graph() + self.graph, self.graph_def = load_graph_def(self.model_file) self.sess = tf.Session(graph = self.graph) self.sub_graph, self.sub_graph_def = self._load_sub_graph() @@ -53,27 +56,30 @@ def __init__(self, except Exception: self.sel_a = self.graph.get_operation_by_name('DescrptSeA').get_attr('sel_a') self.descrpt = self.graph.get_operation_by_name ('DescrptSeA') - self.ntypes = self._get_tensor_value(self.graph.get_tensor_by_name ('descrpt_attr/ntypes:0')) - self.davg = self._get_tensor_value(self.graph.get_tensor_by_name ('descrpt_attr/t_avg:0')) - self.dstd = self._get_tensor_value(self.graph.get_tensor_by_name ('descrpt_attr/t_std:0')) + + self.davg = get_tensor_by_name(self.model_file, 'descrpt_attr/t_avg') + self.dstd = get_tensor_by_name(self.model_file, 'descrpt_attr/t_std') + self.ntypes = get_tensor_by_name(self.model_file, 'descrpt_attr/ntypes') self.rcut = self.descrpt.get_attr('rcut_r') self.rcut_smth = self.descrpt.get_attr('rcut_r_smth') - self.filter_variable_nodes = self._load_matrix_node() + self.fitting_net_nodes = get_fitting_net_nodes(self.model_file) + self.embedding_net_nodes = get_embedding_net_nodes(self.model_file) + for tt in self.exclude_types: if (tt[0] not in range(self.ntypes)) or (tt[1] not in range(self.ntypes)): raise RuntimeError("exclude types" + str(tt) + " must within the number of atomic types " + str(self.ntypes) + "!") if (self.ntypes * self.ntypes - len(self.exclude_types) == 0): raise RuntimeError("empty embedding-net are not supported in model compression!") - self.layer_size = len(self.filter_variable_nodes) // ((self.ntypes * self.ntypes - len(self.exclude_types)) * 2) + self.layer_size = len(self.embedding_net_nodes) // ((self.ntypes * self.ntypes - len(self.exclude_types)) * 2) self.table_size = self.ntypes * self.ntypes if type_one_side : - self.layer_size = len(self.filter_variable_nodes) // (self.ntypes * 2) + self.layer_size = len(self.embedding_net_nodes) // (self.ntypes * 2) self.table_size = self.ntypes - # self.value_type = self.filter_variable_nodes["filter_type_0/matrix_1_0"].dtype #"filter_type_0/matrix_1_0" must exit~ + # self.value_type = self.embedding_net_nodes["filter_type_0/matrix_1_0"].dtype #"filter_type_0/matrix_1_0" must exit~ # get trained variables self.bias = self._get_bias() self.matrix = self._get_matrix() @@ -85,7 +91,8 @@ def __init__(self, # define tables self.data = {} - # TODO: Need a check function to determine if the current model is properly + self.fitting_net_variables = get_fitting_net_variables(self.model_file) + self.embedding_net_variables = get_embedding_net_variables(self.model_file) def build(self, min_nbor_dist : float, @@ -143,50 +150,26 @@ def build(self, self.data[net][jj][kk * 6 + 5] = (1 / (2 * tt * tt * tt * tt * tt)) * (12 * hh - 6 * (dd[jj + 1][kk] + dd[jj][kk]) * tt + (d2[jj + 1][kk] - d2[jj][kk]) * tt * tt) return lower, upper - def _load_graph(self): - graph_def = tf.GraphDef() - with open(self.model_file, "rb") as f: - graph_def.ParseFromString(f.read()) - with tf.Graph().as_default() as graph: - tf.import_graph_def(graph_def, name = "") - return graph, graph_def - def _load_sub_graph(self): sub_graph_def = tf.GraphDef() with tf.Graph().as_default() as sub_graph: tf.import_graph_def(sub_graph_def, name = "") return sub_graph, sub_graph_def - def _get_tensor_value(self, tensor) : - with self.sess.as_default(): - run_sess(self.sess, tensor) - value = tensor.eval() - return value - - def _load_matrix_node(self): - matrix_node = {} - matrix_node_pattern = "filter_type_\d+/matrix_\d+_\d+|filter_type_\d+/bias_\d+_\d+|filter_type_\d+/idt_\d+_\d+|filter_type_all/matrix_\d+_\d+|filter_type_all/bias_\d+_\d+|filter_type_all/idt_\d+_\d" - for node in self.graph_def.node: - if re.fullmatch(matrix_node_pattern, node.name) != None: - matrix_node[node.name] = node.attr["value"].tensor - for key in matrix_node.keys() : - assert key.find('bias') > 0 or key.find('matrix') > 0, "currently, only support weight matrix and bias matrix at the tabulation op!" - return matrix_node - def _get_bias(self): bias = {} for layer in range(1, self.layer_size + 1): bias["layer_" + str(layer)] = [] if self.type_one_side: for ii in range(0, self.ntypes): - tensor_value = np.frombuffer (self.filter_variable_nodes["filter_type_all/bias_" + str(layer) + "_" + str(ii)].tensor_content) - tensor_shape = tf.TensorShape(self.filter_variable_nodes["filter_type_all/bias_" + str(layer) + "_" + str(ii)].tensor_shape).as_list() + tensor_value = np.frombuffer (self.embedding_net_nodes["filter_type_all/bias_" + str(layer) + "_" + str(ii)].tensor_content) + tensor_shape = tf.TensorShape(self.embedding_net_nodes["filter_type_all/bias_" + str(layer) + "_" + str(ii)].tensor_shape).as_list() bias["layer_" + str(layer)].append(np.reshape(tensor_value, tensor_shape)) else: for ii in range(0, self.ntypes * self.ntypes): if (ii // self.ntypes, int(ii % self.ntypes)) not in self.exclude_types: - tensor_value = np.frombuffer(self.filter_variable_nodes["filter_type_" + str(ii // self.ntypes) + "/bias_" + str(layer) + "_" + str(int(ii % self.ntypes))].tensor_content) - tensor_shape = tf.TensorShape(self.filter_variable_nodes["filter_type_" + str(ii // self.ntypes) + "/bias_" + str(layer) + "_" + str(int(ii % self.ntypes))].tensor_shape).as_list() + tensor_value = np.frombuffer(self.embedding_net_nodes["filter_type_" + str(ii // self.ntypes) + "/bias_" + str(layer) + "_" + str(int(ii % self.ntypes))].tensor_content) + tensor_shape = tf.TensorShape(self.embedding_net_nodes["filter_type_" + str(ii // self.ntypes) + "/bias_" + str(layer) + "_" + str(int(ii % self.ntypes))].tensor_shape).as_list() bias["layer_" + str(layer)].append(np.reshape(tensor_value, tensor_shape)) else: bias["layer_" + str(layer)].append(np.array([])) @@ -198,14 +181,14 @@ def _get_matrix(self): matrix["layer_" + str(layer)] = [] if self.type_one_side: for ii in range(0, self.ntypes): - tensor_value = np.frombuffer (self.filter_variable_nodes["filter_type_all/matrix_" + str(layer) + "_" + str(ii)].tensor_content) - tensor_shape = tf.TensorShape(self.filter_variable_nodes["filter_type_all/matrix_" + str(layer) + "_" + str(ii)].tensor_shape).as_list() + tensor_value = np.frombuffer (self.embedding_net_nodes["filter_type_all/matrix_" + str(layer) + "_" + str(ii)].tensor_content) + tensor_shape = tf.TensorShape(self.embedding_net_nodes["filter_type_all/matrix_" + str(layer) + "_" + str(ii)].tensor_shape).as_list() matrix["layer_" + str(layer)].append(np.reshape(tensor_value, tensor_shape)) else: for ii in range(0, self.ntypes * self.ntypes): if (ii // self.ntypes, int(ii % self.ntypes)) not in self.exclude_types: - tensor_value = np.frombuffer(self.filter_variable_nodes["filter_type_" + str(ii // self.ntypes) + "/matrix_" + str(layer) + "_" + str(int(ii % self.ntypes))].tensor_content) - tensor_shape = tf.TensorShape(self.filter_variable_nodes["filter_type_" + str(ii // self.ntypes) + "/matrix_" + str(layer) + "_" + str(int(ii % self.ntypes))].tensor_shape).as_list() + tensor_value = np.frombuffer(self.embedding_net_nodes["filter_type_" + str(ii // self.ntypes) + "/matrix_" + str(layer) + "_" + str(int(ii % self.ntypes))].tensor_content) + tensor_shape = tf.TensorShape(self.embedding_net_nodes["filter_type_" + str(ii // self.ntypes) + "/matrix_" + str(layer) + "_" + str(int(ii % self.ntypes))].tensor_shape).as_list() matrix["layer_" + str(layer)].append(np.reshape(tensor_value, tensor_shape)) else: matrix["layer_" + str(layer)].append(np.array([])) @@ -269,4 +252,4 @@ def _spline5_switch(self, vv = uu*uu*uu * (-6 * uu*uu + 15 * uu - 10) + 1 else: vv = 0 - return vv \ No newline at end of file + return vv diff --git a/deepmd/utils/type_embed.py b/deepmd/utils/type_embed.py index 1c36d522b4..14f8114e9c 100644 --- a/deepmd/utils/type_embed.py +++ b/deepmd/utils/type_embed.py @@ -12,7 +12,6 @@ import math from deepmd.common import get_activation_func, get_precision, ACTIVATION_FN_DICT, PRECISION_DICT, docstring_parameter, get_np_precision from deepmd.utils.argcheck import list_to_doc -from deepmd.utils.tabulate import DeepTabulate def embed_atom_type( diff --git a/source/lib/include/tabulate.h b/source/lib/include/tabulate.h index bce0913ebd..ccc0e6fa65 100644 --- a/source/lib/include/tabulate.h +++ b/source/lib/include/tabulate.h @@ -26,6 +26,19 @@ void tabulate_fusion_grad_cpu( const int nnei, const int last_layer_size); +template +void tabulate_fusion_grad_grad_cpu( + FPTYPE * dz_dy, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE * dz_dy_dem_x, + const FPTYPE * dz_dy_dem, + const int nloc, + const int nnei, + const int last_layer_size); + #if GOOGLE_CUDA template void tabulate_fusion_gpu_cuda( @@ -50,6 +63,19 @@ void tabulate_fusion_grad_gpu_cuda( const int nloc, const int nnei, const int last_layer_size); + +template +void tabulate_fusion_grad_grad_gpu_cuda( + FPTYPE * dz_dy, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE * dz_dy_dem_x, + const FPTYPE * dz_dy_dem, + const int nloc, + const int nnei, + const int last_layer_size); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM @@ -76,6 +102,19 @@ void tabulate_fusion_grad_gpu_rocm( const int nloc, const int nnei, const int last_layer_size); + +template +void tabulate_fusion_grad_grad_gpu_rocm( + FPTYPE * dz_dy, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE * dz_dy_dem_x, + const FPTYPE * dz_dy_dem, + const int nloc, + const int nnei, + const int last_layer_size); #endif // TENSORFLOW_USE_ROCM } diff --git a/source/lib/src/cuda/tabulate.cu b/source/lib/src/cuda/tabulate.cu index 2cab6d75a7..d501474bc3 100644 --- a/source/lib/src/cuda/tabulate.cu +++ b/source/lib/src/cuda/tabulate.cu @@ -187,6 +187,66 @@ __global__ void tabulate_fusion_grad_fifth_order_polynomial( } } +template < + typename FPTYPE, + int MTILE, + int KTILE> +__global__ void tabulate_fusion_grad_grad_fifth_order_polynomial( + FPTYPE * dz_dy, + const FPTYPE * table, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE * dz_dy_dem_x, + const FPTYPE * dz_dy_dem, + const FPTYPE lower, + const FPTYPE upper, + const FPTYPE max, + const FPTYPE stride0, + const FPTYPE stride1, + const int nnei, + const int last_layer_size) +{ + extern __shared__ int _data[]; + const int block_idx = blockIdx.x; // nloc + const int thread_idx = threadIdx.x; // last_layer_size + FPTYPE ago = __shfl_sync(0xffffffff, em_x[block_idx * nnei + nnei - 1], 0); + bool unloop = false; + int breakpoint = nnei - 1; + FPTYPE * iteratorC = (FPTYPE*) &_data[0]; + for (int kk = 0; kk < MTILE; kk++) + iteratorC[kk * last_layer_size + thread_idx] = 0.f; + __syncthreads(); + + for (int ii = 0; ii < nnei; ii++) { + FPTYPE var[6]; + FPTYPE xx = em_x[block_idx * nnei + ii]; + FPTYPE dz_xx = dz_dy_dem_x[block_idx * nnei + ii]; + if (xx == ago) { + unloop = true; + breakpoint = ii; + } + int table_idx = 0; + locate_xx(xx, table_idx, lower, upper, max, stride0, stride1); + var[0] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 0]; + var[1] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 1]; + var[2] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 2]; + var[3] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 3]; + var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4]; + var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5]; + FPTYPE res = var[0] + (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * xx; + FPTYPE res_grad = var[1] + (2 * var[2] + (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) * xx; + + for (int kk = 0; kk < MTILE; kk++) { + int em_index = block_idx * nnei * MTILE + ii * MTILE + kk; + iteratorC[kk * last_layer_size + thread_idx] += (nnei - breakpoint) * (em[em_index] * res_grad * dz_xx + dz_dy_dem[em_index] * res); + } + if (unloop) break; + } + for (int ii = 0; ii < MTILE; ii++) { + dz_dy[block_idx * MTILE * last_layer_size + ii * last_layer_size + thread_idx] = iteratorC[ii * last_layer_size + thread_idx]; + } +} + namespace deepmd { template void tabulate_fusion_gpu_cuda( @@ -235,8 +295,34 @@ void tabulate_fusion_grad_gpu_cuda( DPErrcheck(cudaDeviceSynchronize()); } +template +void tabulate_fusion_grad_grad_gpu_cuda( + FPTYPE * dz_dy, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE * dz_dy_dem_x, + const FPTYPE * dz_dy_dem, + const int nloc, + const int nnei, + const int last_layer_size) +{ + if (nloc <= 0) {return;} + DPErrcheck(cudaMemset( + dz_dy, + 0.0, sizeof(FPTYPE) * nloc * 4 * last_layer_size)); + tabulate_fusion_grad_grad_fifth_order_polynomial <<>>( + dz_dy, + table, em_x, em, dz_dy_dem_x, dz_dy_dem, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); +} + template void tabulate_fusion_gpu_cuda(float * out, const float * table, const float * table_info, const float * em_x, const float * em, const int nloc, const int nnei, const int last_layer_size); template void tabulate_fusion_gpu_cuda(double * out, const double * table, const double * table_info, const double * em_x, const double * em, const int nloc, const int nnei, const int last_layer_size); template void tabulate_fusion_grad_gpu_cuda (float * dy_dem_x, float * dy_dem, const float * table, const float * table_info, const float * em_x, const float * em, const float * dy, const int nloc, const int nnei, const int last_layer_size); template void tabulate_fusion_grad_gpu_cuda (double * dy_dem_x, double * dy_dem, const double * table, const double * table_info, const double * em_x, const double * em, const double * dy, const int nloc, const int nnei, const int last_layer_size); +template void tabulate_fusion_grad_grad_gpu_cuda (float * dz_dy, const float * table, const float * table_info, const float * em_x, const float * em, const float * dz_dy_dem_x, const float * dz_dy_dem, const int nloc, const int nnei, const int last_layer_size); +template void tabulate_fusion_grad_grad_gpu_cuda (double * dz_dy, const double * table, const double * table_info, const double * em_x, const double * em, const double * dz_dy_dem_x, const double * dz_dy_dem, const int nloc, const int nnei, const int last_layer_size); } diff --git a/source/lib/src/rocm/tabulate.hip.cu b/source/lib/src/rocm/tabulate.hip.cu index 497f4931cd..ea8a30f6b5 100644 --- a/source/lib/src/rocm/tabulate.hip.cu +++ b/source/lib/src/rocm/tabulate.hip.cu @@ -197,56 +197,81 @@ __global__ void tabulate_fusion_grad_fifth_order_polynomial( } namespace deepmd { - template - void tabulate_fusion_gpu_rocm( - FPTYPE * out, - const FPTYPE * table, - const FPTYPE * table_info, - const FPTYPE * em_x, - const FPTYPE * em, - const int nloc, - const int nnei, - const int last_layer_size) - { - if(nloc <= 0){return;} - hipLaunchKernelGGL(HIP_KERNEL_NAME(tabulate_fusion_fifth_order_polynomial), nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size, 0, - out, - table, em_x, em, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); - } - - template - void tabulate_fusion_grad_gpu_rocm( - FPTYPE * dy_dem_x, - FPTYPE * dy_dem, - const FPTYPE * table, - const FPTYPE * table_info, - const FPTYPE * em_x, - const FPTYPE * em, - const FPTYPE * dy, - const int nloc, - const int nnei, - const int last_layer_size) - { - if( nloc<=0 ) { return;} - DPErrcheck(hipMemset( - dy_dem_x, - 0.0, sizeof(FPTYPE) * nloc * nnei)); - DPErrcheck(hipMemset( - dy_dem, - 0.0, sizeof(FPTYPE) * nloc * nnei * 4)); - - hipLaunchKernelGGL(HIP_KERNEL_NAME(tabulate_fusion_grad_fifth_order_polynomial), nloc, KK * WARP_SIZE, sizeof(FPTYPE) * MM * last_layer_size, 0, - dy_dem_x, dy_dem, - table, em_x, em, dy, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); - DPErrcheck(hipGetLastError()); - DPErrcheck(hipDeviceSynchronize()); - } - - template void tabulate_fusion_gpu_rocm(float * out, const float * table, const float * table_info, const float * em_x, const float * em, const int nloc, const int nnei, const int last_layer_size); - template void tabulate_fusion_gpu_rocm(double * out, const double * table, const double * table_info, const double * em_x, const double * em, const int nloc, const int nnei, const int last_layer_size); - template void tabulate_fusion_grad_gpu_rocm (float * dy_dem_x, float * dy_dem, const float * table, const float * table_info, const float * em_x, const float * em, const float * dy, const int nloc, const int nnei, const int last_layer_size); - template void tabulate_fusion_grad_gpu_rocm (double * dy_dem_x, double * dy_dem, const double * table, const double * table_info, const double * em_x, const double * em, const double * dy, const int nloc, const int nnei, const int last_layer_size); - } +void tabulate_fusion_gpu_rocm( + FPTYPE * out, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const int nloc, + const int nnei, + const int last_layer_size) +{ + if(nloc <= 0){return;} + hipLaunchKernelGGL(HIP_KERNEL_NAME(tabulate_fusion_fifth_order_polynomial), nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size, 0, + out, + table, em_x, em, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); +} + +template +void tabulate_fusion_grad_gpu_rocm( + FPTYPE * dy_dem_x, + FPTYPE * dy_dem, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE * dy, + const int nloc, + const int nnei, + const int last_layer_size) +{ + if(nloc <= 0) {return;} + DPErrcheck(hipMemset( + dy_dem_x, + 0.0, sizeof(FPTYPE) * nloc * nnei)); + DPErrcheck(hipMemset( + dy_dem, + 0.0, sizeof(FPTYPE) * nloc * nnei * 4)); + + hipLaunchKernelGGL(HIP_KERNEL_NAME(tabulate_fusion_grad_fifth_order_polynomial), nloc, KK * WARP_SIZE, sizeof(FPTYPE) * MM * last_layer_size, 0, + dy_dem_x, dy_dem, + table, em_x, em, dy, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); +} + +template +void tabulate_fusion_grad_grad_gpu_rocm( + FPTYPE * dz_dy, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE * dz_dy_dem_x, + const FPTYPE * dz_dy_dem, + const int nloc, + const int nnei, + const int last_layer_size) +{ + if(nloc <= 0) {return;} + DPErrcheck(hipMemset( + dz_dy, + 0.0, sizeof(FPTYPE) * nloc * 4 * last_layer_size)); + hipLaunchKernelGGL(HIP_KERNEL_NAME(tabulate_fusion_grad_grad_fifth_order_polynomial), nloc, last_layer_size, sizeof(FPTYPE) * MM * last_layer_size, 0, + dz_dy, + table, em_x, em, dz_dy_dem_x, dz_dy_dem, table_info[0], table_info[1], table_info[2], table_info[3], table_info[4], nnei, last_layer_size); + DPErrcheck(hipGetLastError()); + DPErrcheck(hipDeviceSynchronize()); +} + +template void tabulate_fusion_gpu_rocm(float * out, const float * table, const float * table_info, const float * em_x, const float * em, const int nloc, const int nnei, const int last_layer_size); +template void tabulate_fusion_gpu_rocm(double * out, const double * table, const double * table_info, const double * em_x, const double * em, const int nloc, const int nnei, const int last_layer_size); +template void tabulate_fusion_grad_gpu_rocm (float * dy_dem_x, float * dy_dem, const float * table, const float * table_info, const float * em_x, const float * em, const float * dy, const int nloc, const int nnei, const int last_layer_size); +template void tabulate_fusion_grad_gpu_rocm (double * dy_dem_x, double * dy_dem, const double * table, const double * table_info, const double * em_x, const double * em, const double * dy, const int nloc, const int nnei, const int last_layer_size); +template void tabulate_fusion_grad_grad_gpu_rocm (float * dz_dy, const float * table, const float * table_info, const float * em_x, const float * em, const float * dz_dy_dem_x, const float * dz_dy_dem, const int nloc, const int nnei, const int last_layer_size); +template void tabulate_fusion_grad_grad_gpu_rocm (double * dz_dy, const double * table, const double * table_info, const double * em_x, const double * em, const double * dz_dy_dem_x, const double * dz_dy_dem, const int nloc, const int nnei, const int last_layer_size); +} diff --git a/source/lib/src/tabulate.cc b/source/lib/src/tabulate.cc index b1049226d3..385c68523b 100644 --- a/source/lib/src/tabulate.cc +++ b/source/lib/src/tabulate.cc @@ -186,7 +186,79 @@ void deepmd::tabulate_fusion_grad_cpu( } } +template +void deepmd::tabulate_fusion_grad_grad_cpu( + FPTYPE * dz_dy, + const FPTYPE * table, + const FPTYPE * table_info, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE * dz_dy_dem_x, + const FPTYPE * dz_dy_dem, + const int nloc, + const int nnei, + const int last_layer_size) +{ + memset(dz_dy, 0.0, sizeof(FPTYPE) * nloc * 4 * last_layer_size); + const FPTYPE lower = table_info[0]; + const FPTYPE upper = table_info[1]; + const FPTYPE _max = table_info[2]; + const FPTYPE stride0 = table_info[3]; + const FPTYPE stride1 = table_info[4]; + // for every atom, execute a small manual gemm ~ + // FPTYPE * res = new FPTYPE[4 * last_layer_size]; + #pragma omp parallel for + for (int ii = 0; ii < nloc; ii++) { + FPTYPE ll[4]; + FPTYPE hh[4]; + FPTYPE ago = em_x[ii * nnei + nnei - 1]; + bool unloop = false; + for (int jj = 0; jj < nnei; jj++) { + ll[0] = em[ii * nnei * 4 + jj * 4 + 0]; + ll[1] = em[ii * nnei * 4 + jj * 4 + 1]; + ll[2] = em[ii * nnei * 4 + jj * 4 + 2]; + ll[3] = em[ii * nnei * 4 + jj * 4 + 3]; + hh[0] = dz_dy_dem[ii * nnei * 4 + jj * 4 + 0]; + hh[1] = dz_dy_dem[ii * nnei * 4 + jj * 4 + 1]; + hh[2] = dz_dy_dem[ii * nnei * 4 + jj * 4 + 2]; + hh[3] = dz_dy_dem[ii * nnei * 4 + jj * 4 + 3]; + FPTYPE xx = em_x[ii * nnei + jj]; + FPTYPE dz_xx = dz_dy_dem_x[ii * nnei + jj]; + if (ago == xx) { + unloop = true; + } + int table_idx = 0; + locate_xx(lower, upper, _max, stride0, stride1, xx, table_idx); + for (int kk = 0; kk < last_layer_size; kk++) { + FPTYPE a0 = table[table_idx * last_layer_size * 6 + 6 * kk + 0]; + FPTYPE a1 = table[table_idx * last_layer_size * 6 + 6 * kk + 1]; + FPTYPE a2 = table[table_idx * last_layer_size * 6 + 6 * kk + 2]; + FPTYPE a3 = table[table_idx * last_layer_size * 6 + 6 * kk + 3]; + FPTYPE a4 = table[table_idx * last_layer_size * 6 + 6 * kk + 4]; + FPTYPE a5 = table[table_idx * last_layer_size * 6 + 6 * kk + 5]; + FPTYPE var = a0 + (a1 + (a2 + (a3 + (a4 + a5 * xx) * xx) * xx) * xx) * xx; + FPTYPE var_grad = a1 + (2 * a2 + (3 * a3 + (4 * a4 + 5 * a5 * xx) * xx) * xx) * xx; + if (unloop) { + dz_dy[ii * last_layer_size * 4 + 0 * last_layer_size + kk] += (nnei - jj) * (var * hh[0] + dz_xx * var_grad * ll[0]); + dz_dy[ii * last_layer_size * 4 + 1 * last_layer_size + kk] += (nnei - jj) * (var * hh[1] + dz_xx * var_grad * ll[1]); + dz_dy[ii * last_layer_size * 4 + 2 * last_layer_size + kk] += (nnei - jj) * (var * hh[2] + dz_xx * var_grad * ll[2]); + dz_dy[ii * last_layer_size * 4 + 3 * last_layer_size + kk] += (nnei - jj) * (var * hh[3] + dz_xx * var_grad * ll[3]); + } + else { + dz_dy[ii * last_layer_size * 4 + 0 * last_layer_size + kk] += var * hh[0] + dz_xx * var_grad * ll[0]; + dz_dy[ii * last_layer_size * 4 + 1 * last_layer_size + kk] += var * hh[1] + dz_xx * var_grad * ll[1]; + dz_dy[ii * last_layer_size * 4 + 2 * last_layer_size + kk] += var * hh[2] + dz_xx * var_grad * ll[2]; + dz_dy[ii * last_layer_size * 4 + 3 * last_layer_size + kk] += var * hh[3] + dz_xx * var_grad * ll[3]; + } + } + if (unloop) break; + } + } +} + template void deepmd::tabulate_fusion_cpu(float * out, const float * table, const float * table_info, const float * em_x, const float * em, const int nloc, const int nnei, const int last_layer_size); template void deepmd::tabulate_fusion_cpu(double * out, const double * table, const double * table_info, const double * em_x, const double * em, const int nloc, const int nnei, const int last_layer_size); template void deepmd::tabulate_fusion_grad_cpu (float * dy_dem_x, float * dy_dem, const float * table, const float * table_info, const float * em_x, const float * em, const float * dy, const int nloc, const int nnei, const int last_layer_size); template void deepmd::tabulate_fusion_grad_cpu (double * dy_dem_x, double * dy_dem, const double * table, const double * table_info, const double * em_x, const double * em, const double * dy, const int nloc, const int nnei, const int last_layer_size); +template void deepmd::tabulate_fusion_grad_grad_cpu(float * dz_dy, const float * table, const float * table_info, const float * em_x, const float * em, const float * dz_dy_dem_x, const float * dz_dy_dem, const int nloc, const int nnei, const int last_layer_size); +template void deepmd::tabulate_fusion_grad_grad_cpu(double * dz_dy, const double * table, const double * table_info, const double * em_x, const double * em, const double * dz_dy_dem_x, const double * dz_dy_dem, const int nloc, const int nnei, const int last_layer_size); diff --git a/source/op/CMakeLists.txt b/source/op/CMakeLists.txt index abe81d6f32..07d030ef01 100644 --- a/source/op/CMakeLists.txt +++ b/source/op/CMakeLists.txt @@ -4,8 +4,6 @@ set(OP_LIB ${PROJECT_SOURCE_DIR}/lib/src/SimulationRegion.cpp ${PROJECT_SOURCE_D set (OP_CXX_FLAG -D_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI} ) file(GLOB OP_SRC custom_op.cc prod_force.cc prod_virial.cc descrpt.cc descrpt_se_a_ef.cc descrpt_se_a_ef.cc descrpt_se_a_ef_para.cc descrpt_se_a_ef_vert.cc pair_tab.cc prod_force_multi_device.cc prod_virial_multi_device.cc soft_min.cc soft_min_force.cc soft_min_virial.cc ewald_recp.cc gelu_multi_device.cc map_aparam.cc neighbor_stat.cc unaggregated_grad.cc tabulate_multi_device.cc prod_env_mat_multi_device.cc) -file(GLOB OP_CUDA_SRC custom_op.cc prod_force.cc prod_virial.cc descrpt.cc prod_env_mat_multi_device.cc pair_tab.cc prod_force_multi_device.cc prod_virial_multi_device.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_multi_device.cc tabulate_multi_device.cc) -file(GLOB OP_ROCM_SRC custom_op.cc prod_force.cc prod_virial.cc descrpt.cc prod_env_mat_multi_device.cc pair_tab.cc prod_force_multi_device.cc prod_virial_multi_device.cc soft_min.cc soft_min_force.cc soft_min_virial.cc gelu_multi_device.cc tabulate_multi_device.cc) file(GLOB OP_GRADS_SRC custom_op.cc prod_force_grad.cc prod_force_grad_multi_device.cc prod_virial_grad.cc prod_virial_grad_multi_device.cc soft_min_force_grad.cc soft_min_virial_grad.cc ) file(GLOB OP_PY *.py) diff --git a/source/op/_tabulate_grad.py b/source/op/_tabulate_grad.py index 6f8ba1f8bc..f7be9445c7 100644 --- a/source/op/_tabulate_grad.py +++ b/source/op/_tabulate_grad.py @@ -8,25 +8,12 @@ from deepmd.env import tf # from deepmd.DescrptSeATabulate import last_layer_size -# refine is needed! -# accurate gradient is needed! -# 'tabulate_one_side' is needed! -@ops.RegisterGradient("TabulateGrad") -def _tabulate_grad_cc (op, dy): - return [None, dy] - -@ops.RegisterGradient("TabulateFusionGrad") -def _tabulate_grad_cc (op, dy, dy_): - return [None, None, dy, dy_, None, None] - -# old implementations here. - -@ops.RegisterGradient("Tabulate") -def _tabulate_grad_cc (op, dy, dy_): - dy = op_module.tabulate_grad(dy, op.outputs[1]) - return [None, None, dy] - @ops.RegisterGradient("TabulateFusion") def _tabulate_fusion_grad_cc (op, dy): dy_dx, dy_df = op_module.tabulate_fusion_grad(op.inputs[0], op.inputs[1], op.inputs[2], op.inputs[3], dy, op.outputs[0]) - return [None, None, dy_dx, dy_df] \ No newline at end of file + return [None, None, dy_dx, dy_df] + +@ops.RegisterGradient("TabulateFusionGrad") +def _tabulate_fusion_grad_grad_cc (op, dy, dy_): + dz_dy = op_module.tabulate_fusion_grad_grad(op.inputs[0], op.inputs[1], op.inputs[2], op.inputs[3], dy, dy_, op.inputs[5]) + return [None, None, None, None, dz_dy, None] \ No newline at end of file diff --git a/source/op/tabulate_multi_device.cc b/source/op/tabulate_multi_device.cc index 3d5765b843..d0c16fd122 100644 --- a/source/op/tabulate_multi_device.cc +++ b/source/op/tabulate_multi_device.cc @@ -21,6 +21,17 @@ REGISTER_OP("TabulateFusionGrad") .Output("dy_dem_x: T") .Output("dy_dem: T"); +REGISTER_OP("TabulateFusionGradGrad") + .Attr("T: {float, double}") + .Input("table: T") + .Input("table_info: T") + .Input("em_x: T") + .Input("em: T") + .Input("dz_dy_dem_x: T") + .Input("dz_dy_dem: T") + .Input("descriptor: T") + .Output("dz_dy: T"); + template class TabulateFusionOp : public OpKernel { public: @@ -160,24 +171,92 @@ class TabulateFusionGradOp : public OpKernel { std::string device; }; -#define REGISTER_CPU(T) \ -REGISTER_KERNEL_BUILDER( \ - Name("TabulateFusion").Device(DEVICE_CPU).TypeConstraint("T").HostMemory("table_info"), \ - TabulateFusionOp); \ -REGISTER_KERNEL_BUILDER( \ - Name("TabulateFusionGrad").Device(DEVICE_CPU).TypeConstraint("T").HostMemory("table_info"), \ - TabulateFusionGradOp); +template +class TabulateFusionGradGradOp : public OpKernel { + public: + explicit TabulateFusionGradGradOp(OpKernelConstruction* context) : OpKernel(context) {} + void Compute(OpKernelContext* context) override { + // Grab the input tensor + int context_input_index = 0; + const Tensor& table_tensor = context->input(context_input_index++); + const Tensor& table_info_tensor = context->input(context_input_index++); + const Tensor& em_x_tensor = context->input(context_input_index++); + const Tensor& em_tensor = context->input(context_input_index++); + const Tensor& dz_dy_dem_x_tensor = context->input(context_input_index++); + const Tensor& dz_dy_dem_tensor = context->input(context_input_index++); + const Tensor& descriptor_tensor = context->input(context_input_index++); + // set size of the sample + OP_REQUIRES (context, (dz_dy_dem_x_tensor.shape().dims() == 2), errors::InvalidArgument ("Dim of input should be 2")); + OP_REQUIRES (context, (dz_dy_dem_tensor.shape().dims() == 3), errors::InvalidArgument ("Dim of input should be 3")); + int context_output_index = 0; + Tensor* dz_dy_tensor = NULL; + OP_REQUIRES_OK(context, context->allocate_output( + context_output_index++, + descriptor_tensor.shape(), + &dz_dy_tensor)); + DeviceFunctor() ( + device, + context->eigen_device() + ); + + // flat the tensors + FPTYPE * dz_dy = dz_dy_tensor->flat().data(); + const FPTYPE * table = table_tensor.flat().data(); + const FPTYPE * table_info = table_info_tensor.flat().data(); + const FPTYPE * em_x = em_x_tensor.flat().data(); + const FPTYPE * em = em_tensor.flat().data(); + const FPTYPE * dz_dy_dem_x = dz_dy_dem_x_tensor.flat().data(); + const FPTYPE * dz_dy_dem = dz_dy_dem_tensor.flat().data(); + const int nloc = em_tensor.shape().dim_size(0); + const int nnei = em_tensor.shape().dim_size(1); + const int last_layer_size = descriptor_tensor.shape().dim_size(2); + + if (device == "GPU") { + #if GOOGLE_CUDA + deepmd::tabulate_fusion_grad_grad_gpu_cuda( + dz_dy, + table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc, nnei, last_layer_size); + #endif // GOOGLE_CUDA + #if TENSORFLOW_USE_ROCM + deepmd::tabulate_fusion_grad_grad_gpu_rocm( + dz_dy, + table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc, nnei, last_layer_size); + #endif // TENSORFLOW_USE_ROCM + } + else if (device == "CPU") { + deepmd::tabulate_fusion_grad_grad_cpu( + dz_dy, + table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc, nnei, last_layer_size); + } + } +private: + std::string device; +}; + +#define REGISTER_CPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusion").Device(DEVICE_CPU).TypeConstraint("T").HostMemory("table_info"), \ + TabulateFusionOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusionGrad").Device(DEVICE_CPU).TypeConstraint("T").HostMemory("table_info"), \ + TabulateFusionGradOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusionGradGrad").Device(DEVICE_CPU).TypeConstraint("T").HostMemory("table_info"), \ + TabulateFusionGradGradOp); REGISTER_CPU(float); REGISTER_CPU(double); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM -#define REGISTER_GPU(T) \ -REGISTER_KERNEL_BUILDER( \ - Name("TabulateFusion").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("table_info"), \ - TabulateFusionOp); \ -REGISTER_KERNEL_BUILDER( \ - Name("TabulateFusionGrad").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("table_info"), \ - TabulateFusionGradOp); +#define REGISTER_GPU(T) \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusion").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("table_info"), \ + TabulateFusionOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusionGrad").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("table_info"), \ + TabulateFusionGradOp); \ +REGISTER_KERNEL_BUILDER( \ + Name("TabulateFusionGradGrad").Device(DEVICE_GPU).TypeConstraint("T").HostMemory("table_info"), \ + TabulateFusionGradGradOp); REGISTER_GPU(float); REGISTER_GPU(double); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM From a2c6fba8546fd2cbf8999a075d113da5838b1f16 Mon Sep 17 00:00:00 2001 From: denghuilu Date: Fri, 20 Aug 2021 19:15:25 +0800 Subject: [PATCH 2/9] fix UT error --- deepmd/model/tensor.py | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/deepmd/model/tensor.py b/deepmd/model/tensor.py index 4cd41b2540..14eb4a1211 100644 --- a/deepmd/model/tensor.py +++ b/deepmd/model/tensor.py @@ -5,6 +5,7 @@ from deepmd.common import ClassArg from deepmd.env import global_cvt_2_ener_float, MODEL_VERSION from deepmd.env import op_module +from deepmd.utils.graph import load_graph_def from .model_stat import make_stat_input, merge_sys_stat class TensorModel() : @@ -93,6 +94,7 @@ def build (self, box, mesh, input_dict, + frz_model = None, suffix = '', reuse = None): with tf.variable_scope('model_attr' + suffix, reuse = reuse) : @@ -115,16 +117,29 @@ def build (self, natomsel = sum(natoms[2+type_i] for type_i in self.get_sel_type()) nout = self.get_out_size() - dout \ - = self.descrpt.build(coord_, - atype_, - natoms, - box, - mesh, - input_dict, - suffix = suffix, - reuse = reuse) - dout = tf.identity(dout, name='o_descriptor') + if frz_model == None: + dout \ + = self.descrpt.build(coord_, + atype_, + natoms, + box, + mesh, + input_dict, + suffix = suffix, + reuse = reuse) + dout = tf.identity(dout, name='o_descriptor') + else: + tf.constant(self.rcut, + name = 'descrpt_attr/rcut', + dtype = GLOBAL_TF_FLOAT_PRECISION) + tf.constant(self.ntypes, + name = 'descrpt_attr/ntypes', + dtype = tf.int32) + feed_dict = self.descrpt.get_feed_dict(coord_, atype_, natoms, box, mesh) + return_elements = ['o_rmat:0', 'o_rmat_deriv:0', 'o_rij:0', 'o_nlist:0', 'o_descriptor:0'] + descrpt_reshape, descrpt_deriv, rij, nlist, dout = self._import_graph_def_from_frz_model(frz_model, feed_dict, return_elements) + self.descrpt.pass_tensors_from_frz_model(descrpt_reshape, descrpt_deriv, rij, nlist) + rot_mat = self.descrpt.get_rot_mat() rot_mat = tf.identity(rot_mat, name = 'o_rot_mat'+suffix) @@ -170,6 +185,9 @@ def build (self, return model_dict + def _import_graph_def_from_frz_model(self, frz_model, feed_dict, return_elements): + graph, graph_def = load_graph_def(frz_model) + return tf.import_graph_def(graph_def, input_map = feed_dict, return_elements = return_elements) class WFCModel(TensorModel): def __init__( From 2e2f6b6806916fd0eb41c00d1e9b0b9c56ebb8c1 Mon Sep 17 00:00:00 2001 From: denghuilu Date: Fri, 20 Aug 2021 20:23:29 +0800 Subject: [PATCH 3/9] address comments --- deepmd/entrypoints/compress.py | 4 ++-- deepmd/entrypoints/train.py | 6 +++--- deepmd/model/ener.py | 3 ++- deepmd/model/tensor.py | 3 ++- deepmd/train/trainer.py | 8 ++++---- deepmd/utils/graph.py | 4 ++-- 6 files changed, 15 insertions(+), 13 deletions(-) diff --git a/deepmd/entrypoints/compress.py b/deepmd/entrypoints/compress.py index 35020fdd22..5114d6410e 100644 --- a/deepmd/entrypoints/compress.py +++ b/deepmd/entrypoints/compress.py @@ -78,9 +78,9 @@ def compress( "Please consider using the --training-script command within the model compression interface to provide the training script of the input frozen model. " "Note that the input training script must contain the correct path to the training data." % input ) from e - elif os.path.exists(training_script) == False: + elif not os.path.exists(training_script): raise RuntimeError( - "The input training script %s does not exist! Please check the path of the training script. " % (input + "(" + os.path.abspath(input) + ")") + "The input training script %s (%s) does not exist! Please check the path of the training script. " % (input, os.path.abspath(input)) ) from e else: log.info("stage 0: compute the min_nbor_dist") diff --git a/deepmd/entrypoints/train.py b/deepmd/entrypoints/train.py index 6a54e6a8dd..01ddfa4752 100755 --- a/deepmd/entrypoints/train.py +++ b/deepmd/entrypoints/train.py @@ -74,7 +74,7 @@ def train( jdata = normalize(jdata) - if is_compress == False: + if not is_compress: jdata = update_sel(jdata) with open(output, "w") as fp: @@ -145,7 +145,7 @@ def _do_work(jdata: Dict[str, Any], run_opt: RunOptions, is_compress: bool = Fal # decouple the training data from the model compress process train_data = None valid_data = None - if is_compress == False: + if not is_compress: # init data train_data = get_data(jdata["training"]["training_data"], rcut, ipt_type_map, modifier) train_data.print_summary("training") @@ -157,7 +157,7 @@ def _do_work(jdata: Dict[str, Any], run_opt: RunOptions, is_compress: bool = Fal stop_batch = j_must_have(jdata["training"], "numb_steps") model.build(train_data, stop_batch) - if is_compress == False: + if not is_compress: # train the model with the provided systems in a cyclic way start_time = time.time() model.train(train_data, valid_data) diff --git a/deepmd/model/ener.py b/deepmd/model/ener.py index 2aa2436a57..0f4865ce7c 100644 --- a/deepmd/model/ener.py +++ b/deepmd/model/ener.py @@ -172,7 +172,8 @@ def build (self, dtype = tf.int32) feed_dict = self.descrpt.get_feed_dict(coord_, atype_, natoms, box, mesh) return_elements = ['o_rmat:0', 'o_rmat_deriv:0', 'o_rij:0', 'o_nlist:0', 'o_descriptor:0'] - descrpt_reshape, descrpt_deriv, rij, nlist, dout = self._import_graph_def_from_frz_model(frz_model, feed_dict, return_elements) + descrpt_reshape, descrpt_deriv, rij, nlist, dout \ + = self._import_graph_def_from_frz_model(frz_model, feed_dict, return_elements) self.descrpt.pass_tensors_from_frz_model(descrpt_reshape, descrpt_deriv, rij, nlist) diff --git a/deepmd/model/tensor.py b/deepmd/model/tensor.py index 14eb4a1211..3b94dff601 100644 --- a/deepmd/model/tensor.py +++ b/deepmd/model/tensor.py @@ -137,7 +137,8 @@ def build (self, dtype = tf.int32) feed_dict = self.descrpt.get_feed_dict(coord_, atype_, natoms, box, mesh) return_elements = ['o_rmat:0', 'o_rmat_deriv:0', 'o_rij:0', 'o_nlist:0', 'o_descriptor:0'] - descrpt_reshape, descrpt_deriv, rij, nlist, dout = self._import_graph_def_from_frz_model(frz_model, feed_dict, return_elements) + descrpt_reshape, descrpt_deriv, rij, nlist, dout \ + = self._import_graph_def_from_frz_model(frz_model, feed_dict, return_elements) self.descrpt.pass_tensors_from_frz_model(descrpt_reshape, descrpt_deriv, rij, nlist) rot_mat = self.descrpt.get_rot_mat() diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index 7b66753376..8d3cfe2b42 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -291,7 +291,7 @@ def build (self, else: log.info("training without frame parameter") - if self.is_compress == False: + if not self.is_compress: # Usually, the type number of the model should be equal to that of the data # However, nt_model > nt_data should be allowed, since users may only want to # train using a dataset that only have some of elements @@ -319,7 +319,7 @@ def build (self, self.descrpt.enable_compression(self.model_param['compress']["min_nbor_dist"], self.model_param['compress']['model_file'], self.model_param['compress']['table_config'][0], self.model_param['compress']['table_config'][1], self.model_param['compress']['table_config'][2], self.model_param['compress']['table_config'][3]) self.fitting.init_variables(get_fitting_net_variables(self.model_param['compress']['model_file'])) - if self.is_compress == True or self.model_type == 'compressed_model': + if self.is_compress or self.model_type == 'compressed_model': tf.constant("compressed_model", name = 'model_type', dtype = tf.string) else: tf.constant("original_model", name = 'model_type', dtype = tf.string) @@ -663,9 +663,9 @@ def _init_from_frz_model(self): self.model_type = bytes.decode(t_model_type) except GraphWithoutTensorError as e: # throw runtime error if there's no frozen model - if os.path.exists(self.run_opt.init_frz_model) == False: + if not os.path.exists(self.run_opt.init_frz_model): raise RuntimeError( - "The input frozen model %s does not exist! Please check the path of the training script. " % (self.run_opt.init_frz_model + "(" + os.path.abspath(self.run_opt.init_frz_model) + ")") + "The input frozen model %s (%s) does not exist! Please check the path of the frozen model. " % (self.run_opt.init_frz_model, os.path.abspath(self.run_opt.init_frz_model)) ) from e # throw runtime error if the frozen_model has no model type information... else: diff --git a/deepmd/utils/graph.py b/deepmd/utils/graph.py index 76f66e5cf9..b343e54ff2 100644 --- a/deepmd/utils/graph.py +++ b/deepmd/utils/graph.py @@ -38,7 +38,7 @@ def get_tensor_by_name(model_file: str, ---------- model_file : str The input frozen model. - tensor : tensor_name + tensor_name : str Indicates which tensor which will be loaded from the frozen model. Returns @@ -62,7 +62,7 @@ def get_tensor_by_name(model_file: str, def get_tensor_by_type(node, - data_type : str): + data_type : np.dtype): """ Get the tensor value within the given node according to the input data_type From 792d74c3e9a5defc9357ecb86a74ab2b5353428e Mon Sep 17 00:00:00 2001 From: denghuilu Date: Fri, 20 Aug 2021 22:50:01 +0800 Subject: [PATCH 4/9] address comments --- deepmd/utils/graph.py | 180 ++++++++++++++++++++++++++++++--------- deepmd/utils/tabulate.py | 18 ++-- 2 files changed, 146 insertions(+), 52 deletions(-) diff --git a/deepmd/utils/graph.py b/deepmd/utils/graph.py index b343e54ff2..8d228ab13a 100644 --- a/deepmd/utils/graph.py +++ b/deepmd/utils/graph.py @@ -1,25 +1,26 @@ import re import numpy as np +from typing import Tuple, Dict from deepmd.env import tf from deepmd.common import PRECISION_MAPPING from deepmd.utils.sess import run_sess from deepmd.utils.errors import GraphWithoutTensorError -def load_graph_def(model_file: str): +def load_graph_def(model_file: str) -> Tuple[tf.Graph, tf.GraphDef]: """ Load graph as well as the graph_def from the frozen model(model_file) Parameters ---------- model_file : str - The input frozen model. + The input frozen model path Returns ------- - graph - The graph loaded from the frozen model. - graph_def - The graph_def loaded from the frozen model. + tf.Graph + The graph loaded from the frozen model + tf.GraphDef + The graph_def loaded from the frozen model """ graph_def = tf.GraphDef() with open(model_file, "rb") as f: @@ -29,29 +30,28 @@ def load_graph_def(model_file: str): return graph, graph_def -def get_tensor_by_name(model_file: str, - tensor_name: str) -> tf.Tensor: +def get_tensor_by_name_from_graph(graph: tf.Graph, + tensor_name: str) -> tf.Tensor: """ - Load tensor value from the frozen model(model_file) + Load tensor value from the given tf.Graph object Parameters ---------- - model_file : str - The input frozen model. + graph : tf.Graph + The input TensorFlow graph tensor_name : str - Indicates which tensor which will be loaded from the frozen model. + Indicates which tensor which will be loaded from the frozen model Returns ------- tf.Tensor - The tensor which was loaded from the frozen model. + The tensor which was loaded from the frozen model Raises ------ GraphWithoutTensorError - Whether the tensor_name is within the frozen model. + Whether the tensor_name is within the frozen model """ - graph, _ = load_graph_def(model_file) try: tensor = graph.get_tensor_by_name(tensor_name + ':0') except KeyError as e: @@ -61,8 +61,34 @@ def get_tensor_by_name(model_file: str, return tensor +def get_tensor_by_name(model_file: str, + tensor_name: str) -> tf.Tensor: + """ + Load tensor value from the frozen model(model_file) + + Parameters + ---------- + model_file : str + The input frozen model path +pathpath tensor_name : str + Indicates which tensor which will be loaded from the frozen model + + Returns + ------- + tf.Tensor + The tensor which was loaded from the frozen model + + Raises + ------ + GraphWithoutTensorError + Whether the tensor_name is within the frozen model + """ + graph, _ = load_graph_def(model_file) + return get_tensor_by_name_from_graph(graph, tensor_name) + + def get_tensor_by_type(node, - data_type : np.dtype): + data_type : np.dtype) -> tf.Tensor: """ Get the tensor value within the given node according to the input data_type @@ -75,7 +101,7 @@ def get_tensor_by_type(node, Returns ---------- - tensor + tf.Tensor The tensor value of the given node """ if data_type == np.float64: @@ -87,21 +113,20 @@ def get_tensor_by_type(node, return tensor -def get_embedding_net_nodes(model_file: str): +def get_embedding_net_nodes_from_graph_def(graph_def: tf.GraphDef) -> Dict: """ - Get the embedding net nodes with the given frozen model(model_file) + Get the embedding net nodes with the given tf.GraphDef object Parameters ---------- - model_file - The input frozen model. + graph_def + The input tf.GraphDef object Returns ---------- - embedding_net_nodes - The embedding net nodes with the given frozen model. + Dict + The embedding net nodes within the given tf.GraphDef object """ - _, graph_def = load_graph_def(model_file) embedding_net_nodes = {} embedding_net_pattern = "filter_type_\d+/matrix_\d+_\d+|filter_type_\d+/bias_\d+_\d+|filter_type_\d+/idt_\d+_\d+|filter_type_all/matrix_\d+_\d+|filter_type_all/bias_\d+_\d+|filter_type_all/idt_\d+_\d" for node in graph_def.node: @@ -113,21 +138,40 @@ def get_embedding_net_nodes(model_file: str): return embedding_net_nodes -def get_embedding_net_variables(model_file : str): +def get_embedding_net_nodes(model_file: str) -> Dict: """ - Get the embedding net variables with the given frozen model(model_file) + Get the embedding net nodes with the given frozen model(model_file) Parameters ---------- model_file - The input frozen model. + The input frozen model path +pathpath + Returns + ---------- + Dict + The embedding net nodes with the given frozen model + """ + _, graph_def = load_graph_def(model_file) + return get_embedding_net_nodes_from_graph_def(graph_def) + + +def get_embedding_net_variables_from_graph_def(graph_def : tf.GraphDef) -> Dict: + """ + Get the embedding net variables with the given tf.GraphDef object + + Parameters + ---------- + graph_def + The input tf.GraphDef object Returns ---------- - The embedding net variables within the given frozen model. + Dict + The embedding net variables within the given tf.GraphDef object """ embedding_net_variables = {} - embedding_net_nodes = get_embedding_net_nodes(model_file) + embedding_net_nodes = get_embedding_net_nodes_from_graph_def(graph_def) for item in embedding_net_nodes: node = embedding_net_nodes[item] dtype = PRECISION_MAPPING[node.dtype] @@ -139,22 +183,38 @@ def get_embedding_net_variables(model_file : str): embedding_net_variables[item] = np.reshape(tensor_value, tensor_shape) return embedding_net_variables - -def get_fitting_net_nodes(model_file : str): +def get_embedding_net_variables(model_file : str) -> Dict: """ - Get the fitting net nodes with the given frozen model(model_file) + Get the embedding net variables with the given frozen model(model_file) Parameters ---------- model_file - The input frozen model. - + The input frozen model path +pathpath Returns ---------- - fitting_net_nodes - The fitting net nodes with the given frozen model. + Dict + The embedding net variables within the given frozen model """ _, graph_def = load_graph_def(model_file) + return get_embedding_net_variables_from_graph_def(graph_def) + + +def get_fitting_net_nodes_from_graph_def(graph_def: tf.GraphDef) -> Dict: + """ + Get the fitting net nodes with the given tf.GraphDef object + + Parameters + ---------- + graph_def + The input tf.GraphDef object + + Returns + ---------- + Dict + The fitting net nodes within the given tf.GraphDef object + """ fitting_net_nodes = {} fitting_net_pattern = "layer_\d+_type_\d+/matrix+|layer_\d+_type_\d+/bias+|layer_\d+_type_\d+/idt+|final_layer_type_\d+/matrix+|final_layer_type_\d+/bias" for node in graph_def.node: @@ -166,21 +226,40 @@ def get_fitting_net_nodes(model_file : str): return fitting_net_nodes -def get_fitting_net_variables(model_file : str): +def get_fitting_net_nodes(model_file : str) -> Dict: """ - Get the fitting net variables with the given frozen model(model_file) + Get the fitting net nodes with the given frozen model(model_file) Parameters ---------- model_file - The input frozen model. + The input frozen model path + + Returns + ---------- + Dict + The fitting net nodes with the given frozen model + """ + _, graph_def = load_graph_def(model_file) + return get_fitting_net_nodes_from_graph_def(graph_def) + + +def get_fitting_net_variables_from_graph_def(graph_def : tf.GraphDef) -> Dict: + """ + Get the fitting net variables with the given tf.GraphDef object + + Parameters + ---------- + graph_def + The input tf.GraphDef object Returns ---------- - The fitting net variables within the given frozen model. + Dict + The fitting net variables within the given tf.GraphDef object """ fitting_net_variables = {} - fitting_net_nodes = get_fitting_net_nodes(model_file) + fitting_net_nodes = get_fitting_net_nodes_from_graph_def(graph_def) for item in fitting_net_nodes: node = fitting_net_nodes[item] dtype= PRECISION_MAPPING[node.dtype] @@ -190,4 +269,21 @@ def get_fitting_net_variables(model_file : str): else: tensor_value = get_tensor_by_type(node, dtype) fitting_net_variables[item] = np.reshape(tensor_value, tensor_shape) - return fitting_net_variables \ No newline at end of file + return fitting_net_variables + +def get_fitting_net_variables(model_file : str) -> Dict: + """ + Get the fitting net variables with the given frozen model(model_file) + + Parameters + ---------- + model_file + The input frozen model path + + Returns + ---------- + Dict + The fitting net variables within the given frozen model + """ + _, graph_def = load_graph_def(model_file) + return get_fitting_net_variables_from_graph_def(graph_def) diff --git a/deepmd/utils/tabulate.py b/deepmd/utils/tabulate.py index 439c28fa75..9db80d2db5 100644 --- a/deepmd/utils/tabulate.py +++ b/deepmd/utils/tabulate.py @@ -6,9 +6,9 @@ from deepmd.env import tf from deepmd.env import op_module from deepmd.utils.sess import run_sess -from deepmd.utils.graph import get_tensor_by_name, load_graph_def -from deepmd.utils.graph import get_fitting_net_nodes, get_embedding_net_nodes -from deepmd.utils.graph import get_fitting_net_variables, get_embedding_net_variables +from deepmd.utils.graph import get_tensor_by_name_from_graph, load_graph_def +from deepmd.utils.graph import get_embedding_net_nodes_from_graph_def +from deepmd.utils.graph import get_fitting_net_variables_from_graph_def from tensorflow.python.platform import gfile from tensorflow.python.framework import tensor_util @@ -57,16 +57,15 @@ def __init__(self, self.sel_a = self.graph.get_operation_by_name('DescrptSeA').get_attr('sel_a') self.descrpt = self.graph.get_operation_by_name ('DescrptSeA') - self.davg = get_tensor_by_name(self.model_file, 'descrpt_attr/t_avg') - self.dstd = get_tensor_by_name(self.model_file, 'descrpt_attr/t_std') - self.ntypes = get_tensor_by_name(self.model_file, 'descrpt_attr/ntypes') + self.davg = get_tensor_by_name_from_graph(self.graph, 'descrpt_attr/t_avg') + self.dstd = get_tensor_by_name_from_graph(self.graph, 'descrpt_attr/t_std') + self.ntypes = get_tensor_by_name_from_graph(self.graph, 'descrpt_attr/ntypes') self.rcut = self.descrpt.get_attr('rcut_r') self.rcut_smth = self.descrpt.get_attr('rcut_r_smth') - self.fitting_net_nodes = get_fitting_net_nodes(self.model_file) - self.embedding_net_nodes = get_embedding_net_nodes(self.model_file) + self.embedding_net_nodes = get_embedding_net_nodes_from_graph_def(self.graph_def) for tt in self.exclude_types: if (tt[0] not in range(self.ntypes)) or (tt[1] not in range(self.ntypes)): @@ -91,8 +90,7 @@ def __init__(self, # define tables self.data = {} - self.fitting_net_variables = get_fitting_net_variables(self.model_file) - self.embedding_net_variables = get_embedding_net_variables(self.model_file) + self.fitting_net_variables = get_fitting_net_variables_from_graph_def(self.graph_def) def build(self, min_nbor_dist : float, From df7aaf6cf4f6620cc9b1c304278ada7d717583f1 Mon Sep 17 00:00:00 2001 From: denghuilu Date: Fri, 20 Aug 2021 22:52:14 +0800 Subject: [PATCH 5/9] rm fitting_net_variables from class DPTabulate --- deepmd/utils/tabulate.py | 1 - 1 file changed, 1 deletion(-) diff --git a/deepmd/utils/tabulate.py b/deepmd/utils/tabulate.py index 9db80d2db5..a503ad8b8a 100644 --- a/deepmd/utils/tabulate.py +++ b/deepmd/utils/tabulate.py @@ -90,7 +90,6 @@ def __init__(self, # define tables self.data = {} - self.fitting_net_variables = get_fitting_net_variables_from_graph_def(self.graph_def) def build(self, min_nbor_dist : float, From 4d9c7ec1cc35e10f861a2c5370c604c4f734ab4b Mon Sep 17 00:00:00 2001 From: denghuilu Date: Fri, 20 Aug 2021 22:53:05 +0800 Subject: [PATCH 6/9] clean class DPTabulate --- deepmd/utils/tabulate.py | 1 - 1 file changed, 1 deletion(-) diff --git a/deepmd/utils/tabulate.py b/deepmd/utils/tabulate.py index a503ad8b8a..9f46a7f30e 100644 --- a/deepmd/utils/tabulate.py +++ b/deepmd/utils/tabulate.py @@ -8,7 +8,6 @@ from deepmd.utils.sess import run_sess from deepmd.utils.graph import get_tensor_by_name_from_graph, load_graph_def from deepmd.utils.graph import get_embedding_net_nodes_from_graph_def -from deepmd.utils.graph import get_fitting_net_variables_from_graph_def from tensorflow.python.platform import gfile from tensorflow.python.framework import tensor_util From 3ee56f88c78c631a943b72a91be28afed6b41e6b Mon Sep 17 00:00:00 2001 From: denghuilu Date: Sat, 21 Aug 2021 08:11:23 +0800 Subject: [PATCH 7/9] fix typo --- deepmd/train/trainer.py | 1 - deepmd/utils/graph.py | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index 8d3cfe2b42..49c6a5a537 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -317,7 +317,6 @@ def build (self, else : assert 'rcut' in self.descrpt_param, "Error: descriptor must have attr rcut!" self.descrpt.enable_compression(self.model_param['compress']["min_nbor_dist"], self.model_param['compress']['model_file'], self.model_param['compress']['table_config'][0], self.model_param['compress']['table_config'][1], self.model_param['compress']['table_config'][2], self.model_param['compress']['table_config'][3]) - self.fitting.init_variables(get_fitting_net_variables(self.model_param['compress']['model_file'])) if self.is_compress or self.model_type == 'compressed_model': tf.constant("compressed_model", name = 'model_type', dtype = tf.string) diff --git a/deepmd/utils/graph.py b/deepmd/utils/graph.py index 8d228ab13a..ace87f5438 100644 --- a/deepmd/utils/graph.py +++ b/deepmd/utils/graph.py @@ -70,7 +70,7 @@ def get_tensor_by_name(model_file: str, ---------- model_file : str The input frozen model path -pathpath tensor_name : str + tensor_name : str Indicates which tensor which will be loaded from the frozen model Returns @@ -146,7 +146,7 @@ def get_embedding_net_nodes(model_file: str) -> Dict: ---------- model_file The input frozen model path -pathpath + Returns ---------- Dict @@ -191,7 +191,7 @@ def get_embedding_net_variables(model_file : str) -> Dict: ---------- model_file The input frozen model path -pathpath + Returns ---------- Dict From e16d49f5348eec511c3ff30c24c0b1db71e04381 Mon Sep 17 00:00:00 2001 From: Denghui Lu Date: Sat, 21 Aug 2021 08:39:08 +0800 Subject: [PATCH 8/9] add doc for init-frz-model add doc for init-frz-model at training-advanced.md --- doc/train/training-advanced.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/train/training-advanced.md b/doc/train/training-advanced.md index 1f52534672..32deaca906 100644 --- a/doc/train/training-advanced.md +++ b/doc/train/training-advanced.md @@ -95,15 +95,22 @@ positional arguments: optional arguments: -h, --help show this help message and exit + --init-model INIT_MODEL Initialize a model by the provided checkpoint + --restart RESTART Restart the training from the provided checkpoint + + --init-frz-model INIT_FRZ_MODEL + Initialize the training from the frozen model. ``` **`--init-model model.ckpt`**, initializes the model training with an existing model that is stored in the checkpoint `model.ckpt`, the network architectures should match. **`--restart model.ckpt`**, continues the training from the checkpoint `model.ckpt`. +**`--init-frz-model frozen_model.pb`**, initializes the training with an existing model that is stored in `frozen_model.pb`. Note that the `init-frz-model` command only supports the compressed model currently! + On some resources limited machines, one may want to control the number of threads used by DeePMD-kit. This is achieved by three environmental variables: `OMP_NUM_THREADS`, `TF_INTRA_OP_PARALLELISM_THREADS` and `TF_INTER_OP_PARALLELISM_THREADS`. `OMP_NUM_THREADS` controls the multithreading of DeePMD-kit implemented operations. `TF_INTRA_OP_PARALLELISM_THREADS` and `TF_INTER_OP_PARALLELISM_THREADS` controls `intra_op_parallelism_threads` and `inter_op_parallelism_threads`, which are Tensorflow configurations for multithreading. An explanation is found [here](https://stackoverflow.com/questions/41233635/meaning-of-inter-op-parallelism-threads-and-intra-op-parallelism-threads). For example if you wish to use 3 cores of 2 CPUs on one node, you may set the environmental variables and run DeePMD-kit as follows: From 5222cfb18c0c1cdcee2f584553f231241c6b4c06 Mon Sep 17 00:00:00 2001 From: denghuilu Date: Sat, 21 Aug 2021 11:42:48 +0800 Subject: [PATCH 9/9] fix rocm error --- source/lib/src/rocm/tabulate.hip.cu | 60 +++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/source/lib/src/rocm/tabulate.hip.cu b/source/lib/src/rocm/tabulate.hip.cu index ea8a30f6b5..4492daeb73 100644 --- a/source/lib/src/rocm/tabulate.hip.cu +++ b/source/lib/src/rocm/tabulate.hip.cu @@ -196,6 +196,66 @@ __global__ void tabulate_fusion_grad_fifth_order_polynomial( } } +template < + typename FPTYPE, + int MTILE, + int KTILE> +__global__ void tabulate_fusion_grad_grad_fifth_order_polynomial( + FPTYPE * dz_dy, + const FPTYPE * table, + const FPTYPE * em_x, + const FPTYPE * em, + const FPTYPE * dz_dy_dem_x, + const FPTYPE * dz_dy_dem, + const FPTYPE lower, + const FPTYPE upper, + const FPTYPE max, + const FPTYPE stride0, + const FPTYPE stride1, + const int nnei, + const int last_layer_size) +{ + extern __shared__ int _data[]; + const int block_idx = blockIdx.x; // nloc + const int thread_idx = threadIdx.x; // last_layer_size + FPTYPE ago = __shfl( em_x[block_idx * nnei + nnei - 1], 0); + bool unloop = false; + int breakpoint = nnei - 1; + FPTYPE * iteratorC = (FPTYPE*) &_data[0]; + for (int kk = 0; kk < MTILE; kk++) + iteratorC[kk * last_layer_size + thread_idx] = 0.f; + __syncthreads(); + + for (int ii = 0; ii < nnei; ii++) { + FPTYPE var[6]; + FPTYPE xx = em_x[block_idx * nnei + ii]; + FPTYPE dz_xx = dz_dy_dem_x[block_idx * nnei + ii]; + if (xx == ago) { + unloop = true; + breakpoint = ii; + } + int table_idx = 0; + locate_xx(xx, table_idx, lower, upper, max, stride0, stride1); + var[0] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 0]; + var[1] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 1]; + var[2] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 2]; + var[3] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 3]; + var[4] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 4]; + var[5] = table[table_idx * last_layer_size * 6 + thread_idx * 6 + 5]; + FPTYPE res = var[0] + (var[1] + (var[2] + (var[3] + (var[4] + var[5] * xx) * xx) * xx) * xx) * xx; + FPTYPE res_grad = var[1] + (2 * var[2] + (3 * var[3] + (4 * var[4] + 5 * var[5] * xx) * xx) * xx) * xx; + + for (int kk = 0; kk < MTILE; kk++) { + int em_index = block_idx * nnei * MTILE + ii * MTILE + kk; + iteratorC[kk * last_layer_size + thread_idx] += (nnei - breakpoint) * (em[em_index] * res_grad * dz_xx + dz_dy_dem[em_index] * res); + } + if (unloop) break; + } + for (int ii = 0; ii < MTILE; ii++) { + dz_dy[block_idx * MTILE * last_layer_size + ii * last_layer_size + thread_idx] = iteratorC[ii * last_layer_size + thread_idx]; + } +} + namespace deepmd { template void tabulate_fusion_gpu_rocm(