diff --git a/deepmd/descriptor/descriptor.py b/deepmd/descriptor/descriptor.py index d179660a9d..231f3abe1e 100644 --- a/deepmd/descriptor/descriptor.py +++ b/deepmd/descriptor/descriptor.py @@ -262,6 +262,24 @@ def enable_compression(self, raise NotImplementedError( "Descriptor %s doesn't support compression!" % type(self).__name__) + def enable_mixed_precision(self, mixed_prec: dict = None) -> None: + """ + Receive the mixed precision setting. + + Parameters + ---------- + mixed_prec + The mixed precision setting used in the embedding net + + Notes + ----- + Descriptors that support mixed precision training must override this method. + """ + raise NotImplementedError( + "Descriptor %s doesn't support mixed precision training!" + % type(self).__name__ + ) + @abstractmethod def prod_force_virial(self, atom_ener: tf.Tensor, diff --git a/deepmd/descriptor/hybrid.py b/deepmd/descriptor/hybrid.py index 9d8967faee..39cb9fed0e 100644 --- a/deepmd/descriptor/hybrid.py +++ b/deepmd/descriptor/hybrid.py @@ -264,6 +264,20 @@ def enable_compression(self, for idx, ii in enumerate(self.descrpt_list): ii.enable_compression(min_nbor_dist, model_file, table_extrapolate, table_stride_1, table_stride_2, check_frequency, suffix=f"{suffix}_{idx}") + + def enable_mixed_precision(self, mixed_prec : dict = None) -> None: + """ + Receive the mixed precision setting. 
+ + Parameters + ---------- + mixed_prec + The mixed precision setting used in the embedding net + """ + for idx, ii in enumerate(self.descrpt_list): + ii.enable_mixed_precision(mixed_prec) + + def init_variables(self, model_file : str, suffix : str = "", diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py index 74b12a412a..91843a47d3 100644 --- a/deepmd/descriptor/se_a.py +++ b/deepmd/descriptor/se_a.py @@ -160,6 +160,7 @@ def __init__ (self, self.davg = None self.compress = False self.embedding_net_variables = None + self.mixed_prec = None self.place_holders = {} nei_type = np.array([]) for ii in range(self.ntypes): @@ -348,6 +349,18 @@ def enable_compression(self, self.dstd = get_tensor_by_name_from_graph(graph, 'descrpt_attr%s/t_std' % suffix) + def enable_mixed_precision(self, mixed_prec : dict = None) -> None: + """ + Reveive the mixed precision setting. + + Parameters + ---------- + mixed_prec + The mixed precision setting used in the embedding net + """ + self.mixed_prec = mixed_prec + self.filter_precision = get_precision(mixed_prec['output_prec']) + def build (self, coord_ : tf.Tensor, @@ -708,7 +721,8 @@ def _filter_lower( seed = self.seed, trainable = trainable, uniform_seed = self.uniform_seed, - initial_variables = self.embedding_net_variables) + initial_variables = self.embedding_net_variables, + mixed_prec = self.mixed_prec) if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift else: # we can safely return the final xyz_scatter filled with zero directly @@ -735,6 +749,8 @@ def _filter( name='linear', reuse=None, trainable = True): + if self.mixed_prec is not None: + inputs = tf.cast(inputs, get_precision(self.mixed_prec['compute_prec'])) nframes = tf.shape(tf.reshape(inputs, [-1, natoms[0], self.ndescrpt]))[0] # natom x (nei x 4) shape = inputs.get_shape().as_list() diff --git a/deepmd/env.py b/deepmd/env.py index 6e6543697e..577da78ed5 100644 --- a/deepmd/env.py +++ b/deepmd/env.py @@ -39,6 +39,7 @@ 
"TRANSFER_PATTERN", "FITTING_NET_PATTERN", "EMBEDDING_NET_PATTERN", + "TF_VERSION" ] SHARED_LIB_MODULE = "op" diff --git a/deepmd/fit/dipole.py b/deepmd/fit/dipole.py index 6c115e3fb3..7c2d5dea86 100644 --- a/deepmd/fit/dipole.py +++ b/deepmd/fit/dipole.py @@ -77,6 +77,7 @@ def __init__ (self, self.dim_rot_mat = self.dim_rot_mat_1 * 3 self.useBN = False self.fitting_net_variables = None + self.mixed_prec = None def get_sel_type(self) -> int: """ @@ -141,12 +142,12 @@ def build (self, layer = inputs_i for ii in range(0,len(self.n_neuron)) : if ii >= 1 and self.n_neuron[ii] == self.n_neuron[ii-1] : - layer+= one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, use_timestep = self.resnet_dt, activation_fn = self.fitting_activation_fn, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables) + layer+= one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, use_timestep = self.resnet_dt, activation_fn = self.fitting_activation_fn, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables, mixed_prec = self.mixed_prec) else : - layer = one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, activation_fn = self.fitting_activation_fn, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables) + layer = one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, activation_fn = self.fitting_activation_fn, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables, mixed_prec = self.mixed_prec) if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift # (nframes x natoms) 
x naxis - final_layer = one_layer(layer, self.dim_rot_mat_1, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables) + final_layer = one_layer(layer, self.dim_rot_mat_1, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables, mixed_prec = self.mixed_prec, final_layer = True) if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift # (nframes x natoms) x 1 * naxis final_layer = tf.reshape(final_layer, [tf.shape(inputs)[0] * natoms[2+type_i], 1, self.dim_rot_mat_1]) @@ -177,4 +178,17 @@ def init_variables(self, model_file : str The input frozen model file """ - self.fitting_net_variables = get_fitting_net_variables(model_file) \ No newline at end of file + self.fitting_net_variables = get_fitting_net_variables(model_file) + + + def enable_mixed_precision(self, mixed_prec : dict = None) -> None: + """ + Reveive the mixed precision setting. 
+ + Parameters + ---------- + mixed_prec + The mixed precision setting used in the embedding net + """ + self.mixed_prec = mixed_prec + self.fitting_precision = get_precision(mixed_prec['output_prec']) \ No newline at end of file diff --git a/deepmd/fit/ener.py b/deepmd/fit/ener.py index 0afcf26de2..e6b0d0a763 100644 --- a/deepmd/fit/ener.py +++ b/deepmd/fit/ener.py @@ -150,6 +150,7 @@ def __init__ (self, self.aparam_inv_std = None self.fitting_net_variables = None + self.mixed_prec = None def get_numb_fparam(self) -> int: """ @@ -293,7 +294,8 @@ def _build_lower( precision = self.fitting_precision, trainable = self.trainable[ii], uniform_seed = self.uniform_seed, - initial_variables = self.fitting_net_variables) + initial_variables = self.fitting_net_variables, + mixed_prec = self.mixed_prec) else : layer = one_layer( layer, @@ -305,7 +307,8 @@ def _build_lower( precision = self.fitting_precision, trainable = self.trainable[ii], uniform_seed = self.uniform_seed, - initial_variables = self.fitting_net_variables) + initial_variables = self.fitting_net_variables, + mixed_prec = self.mixed_prec) if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift final_layer = one_layer( layer, @@ -318,7 +321,9 @@ def _build_lower( precision = self.fitting_precision, trainable = self.trainable[-1], uniform_seed = self.uniform_seed, - initial_variables = self.fitting_net_variables) + initial_variables = self.fitting_net_variables, + mixed_prec = self.mixed_prec, + final_layer = True) if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift return final_layer @@ -494,4 +499,17 @@ def init_variables(self, model_file : str The input frozen model file """ - self.fitting_net_variables = get_fitting_net_variables(model_file) \ No newline at end of file + self.fitting_net_variables = get_fitting_net_variables(model_file) + + + def enable_mixed_precision(self, mixed_prec : dict = None) -> None: + """ + Reveive the mixed precision 
setting. + + Parameters + ---------- + mixed_prec + The mixed precision setting used in the embedding net + """ + self.mixed_prec = mixed_prec + self.fitting_precision = get_precision(mixed_prec['output_prec']) \ No newline at end of file diff --git a/deepmd/fit/polar.py b/deepmd/fit/polar.py index 65b1ff6aef..5f6ddd7525 100644 --- a/deepmd/fit/polar.py +++ b/deepmd/fit/polar.py @@ -79,7 +79,7 @@ def build (self, else : layer = one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, activation_fn = self.fitting_activation_fn, precision = self.fitting_precision) # (nframes x natoms) x 9 - final_layer = one_layer(layer, 9, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, precision = self.fitting_precision) + final_layer = one_layer(layer, 9, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, precision = self.fitting_precision, final_layer = True) # (nframes x natoms) x 3 x 3 final_layer = tf.reshape(final_layer, [tf.shape(inputs)[0] * natoms[2+type_i], 3, 3]) # (nframes x natoms) x 3 x 3 @@ -194,6 +194,7 @@ def __init__ (self, self.dim_rot_mat = self.dim_rot_mat_1 * 3 self.useBN = False self.fitting_net_variables = None + self.mixed_prec = None def get_sel_type(self) -> List[int]: """ @@ -324,9 +325,9 @@ def build (self, layer = inputs_i for ii in range(0,len(self.n_neuron)) : if ii >= 1 and self.n_neuron[ii] == self.n_neuron[ii-1] : - layer+= one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, use_timestep = self.resnet_dt, activation_fn = self.fitting_activation_fn, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables) + layer+= one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, use_timestep = self.resnet_dt, 
activation_fn = self.fitting_activation_fn, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables, mixed_prec = self.mixed_prec) else : - layer = one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, activation_fn = self.fitting_activation_fn, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables) + layer = one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, activation_fn = self.fitting_activation_fn, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables, mixed_prec = self.mixed_prec) if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift if self.fit_diag : bavg = np.zeros(self.dim_rot_mat_1) @@ -334,7 +335,7 @@ def build (self, # bavg[1] = self.avgeig[1] # bavg[2] = self.avgeig[2] # (nframes x natoms) x naxis - final_layer = one_layer(layer, self.dim_rot_mat_1, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, bavg = bavg, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables) + final_layer = one_layer(layer, self.dim_rot_mat_1, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, bavg = bavg, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables, mixed_prec = self.mixed_prec, final_layer = True) if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift # (nframes x natoms) x naxis final_layer = tf.reshape(final_layer, [tf.shape(inputs)[0] * natoms[2+type_i], self.dim_rot_mat_1]) @@ -346,7 +347,7 @@ def build (self, # bavg[1*self.dim_rot_mat_1+1] = self.avgeig[1] # 
bavg[2*self.dim_rot_mat_1+2] = self.avgeig[2] # (nframes x natoms) x (naxis x naxis) - final_layer = one_layer(layer, self.dim_rot_mat_1*self.dim_rot_mat_1, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, bavg = bavg, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables) + final_layer = one_layer(layer, self.dim_rot_mat_1*self.dim_rot_mat_1, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, bavg = bavg, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables, mixed_prec = self.mixed_prec, final_layer = True) if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift # (nframes x natoms) x naxis x naxis final_layer = tf.reshape(final_layer, [tf.shape(inputs)[0] * natoms[2+type_i], self.dim_rot_mat_1, self.dim_rot_mat_1]) @@ -387,6 +388,19 @@ def init_variables(self, self.fitting_net_variables = get_fitting_net_variables(model_file) + def enable_mixed_precision(self, mixed_prec : dict = None) -> None: + """ + Reveive the mixed precision setting. + + Parameters + ---------- + mixed_prec + The mixed precision setting used in the embedding net + """ + self.mixed_prec = mixed_prec + self.fitting_precision = get_precision(mixed_prec['output_prec']) + + class GlobalPolarFittingSeA () : """ Fit the system polarizability with descriptor se_a @@ -509,3 +523,14 @@ def init_variables(self, """ self.polar_fitting.init_variables(model_file) + + def enable_mixed_precision(self, mixed_prec : dict = None) -> None: + """ + Reveive the mixed precision setting. 
+ + Parameters + ---------- + mixed_prec + The mixed precision setting used in the embedding net + """ + self.polar_fitting.enable_mixed_precision(mixed_prec) diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index 16d1234112..0e1ed6d3e8 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -7,6 +7,8 @@ import shutil import google.protobuf.message import numpy as np +from packaging.version import Version + from deepmd.env import tf from deepmd.env import get_tf_session_config from deepmd.env import GLOBAL_TF_FLOAT_PRECISION @@ -23,13 +25,13 @@ from deepmd.utils.graph import get_tensor_by_name from tensorflow.python.client import timeline -from deepmd.env import op_module +from deepmd.env import op_module, TF_VERSION from deepmd.utils.errors import GraphWithoutTensorError # load grad of force module import deepmd.op -from deepmd.common import j_must_have, ClassArg, data_requirement +from deepmd.common import j_must_have, ClassArg, data_requirement, get_precision log = logging.getLogger(__name__) @@ -227,6 +229,13 @@ def _init_param(self, jdata): self.tensorboard = self.run_opt.is_chief and tr_data.get('tensorboard', False) self.tensorboard_log_dir = tr_data.get('tensorboard_log_dir', 'log') self.tensorboard_freq = tr_data.get('tensorboard_freq', 1) + self.mixed_prec = tr_data.get('mixed_precision', None) + if self.mixed_prec is not None: + if (self.mixed_prec['compute_prec'] != 'float16' or self.mixed_prec['output_prec'] != 'float32'): + raise RuntimeError( + "Unsupported mixed precision option [output_prec, compute_prec]: [%s, %s], " + " Supported: [float32, float16], Please set mixed precision option correctly!" 
+ % (self.mixed_prec['output_prec'], self.mixed_prec['compute_prec'])) # self.sys_probs = tr_data['sys_probs'] # self.auto_prob_style = tr_data['auto_prob'] self.useBN = False @@ -289,6 +298,10 @@ def build (self, tf.constant("compressed_model", name = 'model_type', dtype = tf.string) else: tf.constant("original_model", name = 'model_type', dtype = tf.string) + + if self.mixed_prec is not None: + self.descrpt.enable_mixed_precision(self.mixed_prec) + self.fitting.enable_mixed_precision(self.mixed_prec) self._build_lr() self._build_network(data) @@ -332,6 +345,8 @@ def _build_network(self, data): self.place_holders, suffix = "test") + if self.mixed_prec is not None: + self.l2_l = tf.cast(self.l2_l, get_precision(self.mixed_prec['output_prec'])) log.info("built network") def _build_training(self): @@ -345,6 +360,15 @@ def _build_training(self): optimizer = self.run_opt._HVD.DistributedOptimizer(optimizer) else: optimizer = tf.train.AdamOptimizer(learning_rate = self.learning_rate) + if self.mixed_prec is not None: + _TF_VERSION = Version(TF_VERSION) + # check the TF_VERSION, when TF < 1.12, mixed precision is not allowed + if _TF_VERSION < Version('1.12.0'): + raise RuntimeError("TensorFlow version %s is not compatible with the mixed precision setting. Please consider upgrading your TF version!" % TF_VERSION) + elif _TF_VERSION < Version('2.4.0'): + optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer) + else: + optimizer = tf.mixed_precision.enable_mixed_precision_graph_rewrite(optimizer) apply_op = optimizer.minimize(loss=self.l2_l, global_step=self.global_step, var_list=trainable_variables, diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index 36e9eb2ee6..847eccc52e 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -600,6 +600,24 @@ def validation_data_args(): # ! 
added by Ziyao: new specification style for dat sub_fields=args, sub_variants=[], doc=doc_validation_data) +def mixed_precision_args(): # ! added by Denghui. + doc_output_prec = 'The precision for mixed precision params. " \ + "The trainable variables precision during the mixed precision training process, " \ + "supported options are float32 only currently.' + doc_compute_prec = 'The precision for mixed precision compute. " \ + "The compute precision during the mixed precision training process, "" \ + "supported options are float16 only currently.' + + args = [ + Argument("output_prec", str, optional=True, default="float32", doc=doc_output_prec), + Argument("compute_prec", str, optional=False, default="float16", doc=doc_compute_prec), + ] + + doc_mixed_precision = "Configurations of mixed precision." + return Argument("mixed_precision", dict, optional=True, + sub_fields=args, sub_variants=[], doc=doc_mixed_precision) + + def training_args(): # ! modified by Ziyao: data configuration isolated. doc_numb_steps = 'Number of training batch. Each training uses one batch of data.' doc_seed = 'The random seed for getting frames from the training data set.' @@ -617,10 +635,12 @@ def training_args(): # ! modified by Ziyao: data configuration isolated. 
arg_training_data = training_data_args() arg_validation_data = validation_data_args() + mixed_precision_data = mixed_precision_args() args = [ arg_training_data, arg_validation_data, + mixed_precision_data, Argument("numb_steps", int, optional=False, doc=doc_numb_steps, alias=["stop_batch"]), Argument("seed", [int,None], optional=True, doc=doc_seed), Argument("disp_file", str, optional=True, default='lcurve.out', doc=doc_disp_file), diff --git a/deepmd/utils/network.py b/deepmd/utils/network.py index 5c78031167..c82721becb 100644 --- a/deepmd/utils/network.py +++ b/deepmd/utils/network.py @@ -2,6 +2,7 @@ from deepmd.env import tf from deepmd.env import GLOBAL_TF_FLOAT_PRECISION +from deepmd.common import get_precision def one_layer_rand_seed_shift(): return 3 @@ -19,7 +20,12 @@ def one_layer(inputs, trainable = True, useBN = False, uniform_seed = False, - initial_variables = None): + initial_variables = None, + mixed_prec = None, + final_layer = False): + # For good accuracy, the last layer of the fitting network uses a higher precision neuron network. 
+ if mixed_prec is not None and final_layer: + inputs = tf.cast(inputs, get_precision(mixed_prec['output_prec'])) with tf.variable_scope(name, reuse=reuse): shape = inputs.get_shape().as_list() w_initializer = tf.random_normal_initializer( @@ -44,6 +50,12 @@ def one_layer(inputs, b_initializer, trainable = trainable) variable_summaries(b, 'bias') + + if mixed_prec is not None and not final_layer: + inputs = tf.cast(inputs, get_precision(mixed_prec['compute_prec'])) + w = tf.cast(w, get_precision(mixed_prec['compute_prec'])) + b = tf.cast(b, get_precision(mixed_prec['compute_prec'])) + hidden = tf.matmul(inputs, w) + b if activation_fn != None and use_timestep : idt_initializer = tf.random_normal_initializer( @@ -65,6 +77,8 @@ def one_layer(inputs, # return activation_fn(hidden_bn) else: if use_timestep : + if mixed_prec is not None and not final_layer: + idt = tf.cast(idt, get_precision(mixed_prec['compute_prec'])) return tf.reshape(activation_fn(hidden), [-1, outputs_size]) * idt else : return tf.reshape(activation_fn(hidden), [-1, outputs_size]) @@ -93,7 +107,8 @@ def embedding_net(xx, seed = None, trainable = True, uniform_seed = False, - initial_variables = None): + initial_variables = None, + mixed_prec = None): r"""The embedding network. 
The embedding network function :math:`\mathcal{N}` is constructed by is the @@ -146,6 +161,8 @@ def embedding_net(xx, Only for the purpose of backward compatibility, retrieves the old behavior of using the random seed initial_variables : dict The input dict which stores the embedding net variables + mixed_prec + The input dict which stores the mixed precision setting for the embedding net References @@ -185,6 +202,10 @@ def embedding_net(xx, trainable = trainable) variable_summaries(b, 'bias_'+str(ii)+name_suffix) + if mixed_prec is not None: + xx = tf.cast(xx, get_precision(mixed_prec['compute_prec'])) + w = tf.cast(w, get_precision(mixed_prec['compute_prec'])) + b = tf.cast(b, get_precision(mixed_prec['compute_prec'])) hidden = tf.reshape(activation_fn(tf.matmul(xx, w) + b), [-1, outputs_size[ii]]) if resnet_dt : idt_initializer = tf.random_normal_initializer( @@ -201,6 +222,8 @@ def embedding_net(xx, idt_initializer, trainable = trainable) variable_summaries(idt, 'idt_'+str(ii)+name_suffix) + if mixed_prec is not None: + idt = tf.cast(idt, get_precision(mixed_prec['compute_prec'])) if outputs_size[ii] == outputs_size[ii-1]: if resnet_dt : @@ -214,7 +237,6 @@ def embedding_net(xx, xx = tf.concat([xx,xx], 1) + hidden else: xx = hidden - return xx def variable_summaries(var: tf.Variable, name: str): diff --git a/doc/train/training-advanced.md b/doc/train/training-advanced.md index ea9e1e8075..7226b57847 100644 --- a/doc/train/training-advanced.md +++ b/doc/train/training-advanced.md @@ -36,6 +36,10 @@ Other training parameters are given in the `training` section. "batch_size": 1, "numb_btch": 3 }, + "mixed_precision": { + "output_prec": "float32", + "compute_prec": "float16" + }, "numb_step": 1000000, "seed": 1, @@ -75,6 +79,13 @@ The sections `"training_data"` and `"validation_data"` give the training dataset * `"auto:N"`: automatically determines the batch size so that the `batch_size` times the number of atoms in the system is no less than `N`. 
* The key `numb_batch` in `validate_data` gives the number of batches of model validation. Note that the batches may not be from the same system +The section `mixed_precision` specifies the mixed precision settings, which will enable the mixed precision training workflow for deepmd-kit. The keys are explained below: +* `output_prec` The precision used for the output tensors; only `float32` is supported currently. +* `compute_prec` The precision used for the computing tensors; only `float16` is supported currently. +Note that there are several limitations on mixed precision training: +* Only the `se_e2_a` descriptor type is supported by the mixed precision training workflow. +* The precision of the embedding net and the fitting net is forced to `float32`. + Other keys in the `training` section are explained below: * `numb_step` The number of training steps. * `seed` The random seed for getting frames from the training data set. diff --git a/examples/water/se_e2_a_mixed_prec/input.json b/examples/water/se_e2_a_mixed_prec/input.json new file mode 100644 index 0000000000..889abedabf --- /dev/null +++ b/examples/water/se_e2_a_mixed_prec/input.json @@ -0,0 +1,70 @@ +{ + "_comment": " model parameters", + "model": { + "type_map": ["O", "H"], + "descriptor" :{ + "type": "se_e2_a", + "sel": [46, 92], + "rcut_smth": 0.50, + "rcut": 6.00, + "neuron": [25, 50, 100], + "resnet_dt": false, + "axis_neuron": 16, + "seed": 1, + "_comment": " that's all" + }, + "fitting_net" : { + "neuron": [240, 240, 240], + "resnet_dt": true, + "seed": 1, + "_comment": " that's all" + }, + "_comment": " that's all" + }, + + "learning_rate" :{ + "type": "exp", + "decay_steps": 5000, + "start_lr": 0.001, + "stop_lr": 3.51e-8, + "_comment": "that's all" + }, + + "loss" :{ + "type": "ener", + "start_pref_e": 0.02, + "limit_pref_e": 1, + "start_pref_f": 1000, + "limit_pref_f": 1, + "start_pref_v": 0, + "limit_pref_v": 0, + "_comment": " that's all" + }, + + "training" : { + "training_data": { + "systems": 
["../data/data_0/", "../data/data_1/", "../data/data_2/"], + "batch_size": "auto", + "_comment": "that's all" + }, + "validation_data":{ + "systems": ["../data/data_3"], + "batch_size": 1, + "numb_btch": 3, + "_comment": "that's all" + }, + "mixed_precision": { + "compute_prec": "float16", + "output_prec": "float32" + }, + "numb_steps": 1000000, + "seed": 10, + "disp_file": "lcurve.out", + "disp_freq": 100, + "save_freq": 1000, + "_comment": "that's all" + }, + + "_comment": "that's all" +} + diff --git a/requirements.txt b/requirements.txt index f3ead805b8..06b71f825c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ python-hostlist >= 1.21 typing_extensions; python_version < "3.7" h5py wcmatch +packaging