From 05086f5877e14d344a648ad472326af16f47c13e Mon Sep 17 00:00:00 2001 From: denghuilu Date: Tue, 16 Nov 2021 01:34:59 +0800 Subject: [PATCH 01/16] enable mixed precision support for dp --- deepmd/descriptor/se_a.py | 3 ++ deepmd/env.py | 79 +++++++++++++++++++++++++++++++++++++++ deepmd/train/trainer.py | 6 +++ deepmd/utils/network.py | 24 +++++++++++- 4 files changed, 111 insertions(+), 1 deletion(-) diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py index 74b12a412a..5ee3aa70f1 100644 --- a/deepmd/descriptor/se_a.py +++ b/deepmd/descriptor/se_a.py @@ -9,6 +9,7 @@ from deepmd.env import GLOBAL_NP_FLOAT_PRECISION from deepmd.env import op_module from deepmd.env import default_tf_session_config +from deepmd.env import DP_ENABLE_MIXED_PRECISION, cast_to_compute from deepmd.utils.network import embedding_net, embedding_net_rand_seed_shift from deepmd.utils.tabulate import DPTabulate from deepmd.utils.type_embed import embed_atom_type @@ -735,6 +736,8 @@ def _filter( name='linear', reuse=None, trainable = True): + if DP_ENABLE_MIXED_PRECISION: + inputs = cast_to_compute(inputs) nframes = tf.shape(tf.reshape(inputs, [-1, natoms[0], self.ndescrpt]))[0] # natom x (nei x 4) shape = inputs.get_shape().as_list() diff --git a/deepmd/env.py b/deepmd/env.py index 6e6543697e..c593e9e3b6 100644 --- a/deepmd/env.py +++ b/deepmd/env.py @@ -28,6 +28,9 @@ "GLOBAL_NP_FLOAT_PRECISION", "GLOBAL_ENER_FLOAT_PRECISION", "global_float_prec", + "DP_ENABLE_MIXED_PRECISION", + "DP_MIXED_OUTPUT_PRECISION", + "DP_MIXED_COMPUTE_PRECISION", "global_cvt_2_tf_float", "global_cvt_2_ener_float", "MODEL_VERSION", @@ -310,6 +313,82 @@ def _get_package_constants( "DP_INTERFACE_PREC." % dp_float_prec ) +# MIXED_PREC +# only support tf.float16 mixed precision training. 
+dp_mixed_prec = os.environ.get("DP_ENABLE_MIXED_PREC", "").lower() +if dp_mixed_prec is "fp16": + # default setting of the global precision + GLOBAL_TF_FLOAT_PRECISION = tf.float32 + GLOBAL_NP_FLOAT_PRECISION = np.float32 + GLOBAL_ENER_FLOAT_PRECISION = np.float64 + global_float_prec = "half" + # + DP_ENABLE_MIXED_PRECISION = True + DP_MIXED_OUTPUT_PRECISION = tf.float32 + DP_MIXED_COMPUTE_PRECISION = tf.float16 +elif dp_mixed_prec is "": + DP_ENABLE_MIXED_PRECISION = False + DP_MIXED_OUTPUT_PRECISION = None + DP_MIXED_COMPUTE_PRECISION = None +else: + raise RuntimeError( + "Unsupported mixed precision option: %s. Supported: fp16. " + "Please set mixed precision training with environmental variable " + "DP_ENABLE_MIXED_PREC." % dp_mixed_prec + ) + + +def cast_to_compute(xx: tf.Tensor) -> tf.Tensor: + """Cast tensor to compute precision. + + Parameters + ---------- + xx : tf.Tensor + input tensor + + Returns + ------- + tf.Tensor + output tensor cast to compute precision + + Raises + ------ + RuntimeError + if mixed precision training mode is on + """ + if DP_MIXED_COMPUTE_PRECISION is None: + raise RuntimeError( + "'cast_to_compute' function only support the mixed precision mode." + "Please set mixed precision training with environmental variable " + "DP_ENABLE_MIXED_PREC." + ) + return tf.cast(xx, DP_MIXED_COMPUTE_PRECISION) + +def cast_to_output(xx: tf.Tensor) -> tf.Tensor: + """Cast tensor to output precision. + + Parameters + ---------- + xx : tf.Tensor + input tensor + + Returns + ------- + tf.Tensor + output tensor cast to output precision + + Raises + ------ + RuntimeError + if mixed precision training mode is on + """ + if DP_MIXED_COMPUTE_PRECISION is None: + raise RuntimeError( + "'cast_to_output' function only support the mixed precision mode." + "Please set mixed precision training with environmental variable " + "DP_ENABLE_MIXED_PREC." 
+ ) + return tf.cast(xx, DP_MIXED_OUTPUT_PRECISION) def global_cvt_2_tf_float(xx: tf.Tensor) -> tf.Tensor: """Cast tensor to globally set TF precision. diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index 16d1234112..d8751a506f 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -11,6 +11,7 @@ from deepmd.env import get_tf_session_config from deepmd.env import GLOBAL_TF_FLOAT_PRECISION from deepmd.env import GLOBAL_ENER_FLOAT_PRECISION +from deepmd.env import DP_ENABLE_MIXED_PRECISION from deepmd.fit import EnerFitting, WFCFitting, PolarFittingLocFrame, PolarFittingSeA, GlobalPolarFittingSeA, DipoleFittingSeA from deepmd.descriptor import Descriptor from deepmd.model import EnerModel, WFCModel, DipoleModel, PolarModel, GlobalPolarModel @@ -332,6 +333,8 @@ def _build_network(self, data): self.place_holders, suffix = "test") + if DP_ENABLE_MIXED_PRECISION: + self.l2_l = tf.cast(self.l2_l, GLOBAL_TF_FLOAT_PRECISION) log.info("built network") def _build_training(self): @@ -345,6 +348,9 @@ def _build_training(self): optimizer = self.run_opt._HVD.DistributedOptimizer(optimizer) else: optimizer = tf.train.AdamOptimizer(learning_rate = self.learning_rate) + if DP_ENABLE_MIXED_PRECISION: + # enable dynamic loss scale of the gradients + optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer) apply_op = optimizer.minimize(loss=self.l2_l, global_step=self.global_step, var_list=trainable_variables, diff --git a/deepmd/utils/network.py b/deepmd/utils/network.py index 5c78031167..3ded5db063 100644 --- a/deepmd/utils/network.py +++ b/deepmd/utils/network.py @@ -2,6 +2,7 @@ from deepmd.env import tf from deepmd.env import GLOBAL_TF_FLOAT_PRECISION +from deepmd.env import DP_ENABLE_MIXED_PRECISION, cast_to_compute, cast_to_output def one_layer_rand_seed_shift(): return 3 @@ -20,6 +21,12 @@ def one_layer(inputs, useBN = False, uniform_seed = False, initial_variables = None): + # Do mixed precision training check + if 
DP_ENABLE_MIXED_PRECISION and precision is not tf.float32: + raise RuntimeError("The network precision %s does not match the mixed precision training settting! Please check the input training script. " % (precision)) + # For good accuracy, the last layer of the fitting network uses a single-precision neuron network. + if DP_ENABLE_MIXED_PRECISION and outputs_size is 1: + inputs = cast_to_output(inputs) with tf.variable_scope(name, reuse=reuse): shape = inputs.get_shape().as_list() w_initializer = tf.random_normal_initializer( @@ -37,13 +44,17 @@ def one_layer(inputs, precision, w_initializer, trainable = trainable) - variable_summaries(w, 'matrix') + variable_summaries(w, 'matrix') b = tf.get_variable('bias', [outputs_size], precision, b_initializer, trainable = trainable) variable_summaries(b, 'bias') + if DP_ENABLE_MIXED_PRECISION and outputs_size is not 1: + inputs = cast_to_compute(inputs) + w = cast_to_compute(w) + b = cast_to_compute(b) hidden = tf.matmul(inputs, w) + b if activation_fn != None and use_timestep : idt_initializer = tf.random_normal_initializer( @@ -65,6 +76,8 @@ def one_layer(inputs, # return activation_fn(hidden_bn) else: if use_timestep : + if DP_ENABLE_MIXED_PRECISION and outputs_size is not 1: + idt = cast_to_compute(idt) return tf.reshape(activation_fn(hidden), [-1, outputs_size]) * idt else : return tf.reshape(activation_fn(hidden), [-1, outputs_size]) @@ -154,6 +167,9 @@ def embedding_net(xx, in deep residual networks. InComputer Vision – ECCV 2016,pages 630–645. Springer International Publishing, 2016. """ + # Do mixed precision training check + if DP_ENABLE_MIXED_PRECISION and precision is not tf.float32: + raise RuntimeError("The network precision %s does not match the mixed precision training settting! Please check the input training script. 
" % (precision)) input_shape = xx.get_shape().as_list() outputs_size = [input_shape[1]] + network_size @@ -185,6 +201,10 @@ def embedding_net(xx, trainable = trainable) variable_summaries(b, 'bias_'+str(ii)+name_suffix) + if DP_ENABLE_MIXED_PRECISION: + xx = cast_to_compute(xx) + w = cast_to_compute(w) + b = cast_to_compute(b) hidden = tf.reshape(activation_fn(tf.matmul(xx, w) + b), [-1, outputs_size[ii]]) if resnet_dt : idt_initializer = tf.random_normal_initializer( @@ -201,6 +221,8 @@ def embedding_net(xx, idt_initializer, trainable = trainable) variable_summaries(idt, 'idt_'+str(ii)+name_suffix) + if DP_ENABLE_MIXED_PRECISION: + idt = cast_to_compute(idt) if outputs_size[ii] == outputs_size[ii-1]: if resnet_dt : From e1cc674f9ece2d273d31c225a35542bd130c4146 Mon Sep 17 00:00:00 2001 From: denghuilu Date: Tue, 16 Nov 2021 02:12:29 +0800 Subject: [PATCH 02/16] set the default embedding net & fitting net precision --- deepmd/env.py | 4 ++++ deepmd/utils/argcheck.py | 15 ++++++++------- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/deepmd/env.py b/deepmd/env.py index c593e9e3b6..5494f10a19 100644 --- a/deepmd/env.py +++ b/deepmd/env.py @@ -24,6 +24,7 @@ __all__ = [ "GLOBAL_CONFIG", + "GLOBAL_FLOAT_PRECISION", "GLOBAL_TF_FLOAT_PRECISION", "GLOBAL_NP_FLOAT_PRECISION", "GLOBAL_ENER_FLOAT_PRECISION", @@ -297,11 +298,13 @@ def _get_package_constants( dp_float_prec = os.environ.get("DP_INTERFACE_PREC", "high").lower() if dp_float_prec in ("high", ""): # default is high + GLOBAL_FLOAT_PRECISION = "float64" GLOBAL_TF_FLOAT_PRECISION = tf.float64 GLOBAL_NP_FLOAT_PRECISION = np.float64 GLOBAL_ENER_FLOAT_PRECISION = np.float64 global_float_prec = "double" elif dp_float_prec == "low": + GLOBAL_FLOAT_PRECISION = "float32" GLOBAL_TF_FLOAT_PRECISION = tf.float32 GLOBAL_NP_FLOAT_PRECISION = np.float32 GLOBAL_ENER_FLOAT_PRECISION = np.float64 @@ -318,6 +321,7 @@ def _get_package_constants( dp_mixed_prec = os.environ.get("DP_ENABLE_MIXED_PREC", "").lower() if 
dp_mixed_prec is "fp16": # default setting of the global precision + GLOBAL_FLOAT_PRECISION = "float32" GLOBAL_TF_FLOAT_PRECISION = tf.float32 GLOBAL_NP_FLOAT_PRECISION = np.float32 GLOBAL_ENER_FLOAT_PRECISION = np.float64 diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index 36e9eb2ee6..736d2f894f 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -4,6 +4,7 @@ from deepmd import descriptor from deepmd.common import ACTIVATION_FN_DICT, PRECISION_DICT from deepmd.utils.plugin import Plugin +from deepmd.env import GLOBAL_FLOAT_PRECISION import json @@ -35,7 +36,7 @@ def type_embedding_args(): Argument("neuron", list, optional = True, default = [2, 4, 8], doc = doc_neuron), Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function), Argument("resnet_dt", bool, optional = True, default = False, doc = doc_resnet_dt), - Argument("precision", str, optional = True, default = "float64", doc = doc_precision), + Argument("precision", str, optional = True, default = GLOBAL_FLOAT_PRECISION, doc = doc_precision), Argument("trainable", bool, optional = True, default = True, doc = doc_trainable), Argument("seed", [int,None], optional = True, doc = doc_seed), ] @@ -138,7 +139,7 @@ def descrpt_se_a_args(): Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function), Argument("resnet_dt", bool, optional = True, default = False, doc = doc_resnet_dt), Argument("type_one_side", bool, optional = True, default = False, doc = doc_type_one_side), - Argument("precision", str, optional = True, default = "float64", doc = doc_precision), + Argument("precision", str, optional = True, default = GLOBAL_FLOAT_PRECISION, doc = doc_precision), Argument("trainable", bool, optional = True, default = True, doc = doc_trainable), Argument("seed", [int,None], optional = True, doc = doc_seed), Argument("exclude_types", list, optional = True, default = [], doc = doc_exclude_types), @@ 
-168,7 +169,7 @@ def descrpt_se_t_args(): Argument("neuron", list, optional = True, default = [10,20,40], doc = doc_neuron), Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function), Argument("resnet_dt", bool, optional = True, default = False, doc = doc_resnet_dt), - Argument("precision", str, optional = True, default = "float64", doc = doc_precision), + Argument("precision", str, optional = True, default = GLOBAL_FLOAT_PRECISION, doc = doc_precision), Argument("trainable", bool, optional = True, default = True, doc = doc_trainable), Argument("seed", [int,None], optional = True, doc = doc_seed), Argument("set_davg_zero", bool, optional = True, default = False, doc = doc_set_davg_zero) @@ -214,7 +215,7 @@ def descrpt_se_r_args(): Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function), Argument("resnet_dt", bool, optional = True, default = False, doc = doc_resnet_dt), Argument("type_one_side", bool, optional = True, default = False, doc = doc_type_one_side), - Argument("precision", str, optional = True, default = "float64", doc = doc_precision), + Argument("precision", str, optional = True, default = GLOBAL_FLOAT_PRECISION, doc = doc_precision), Argument("trainable", bool, optional = True, default = True, doc = doc_trainable), Argument("seed", [int,None], optional = True, doc = doc_seed), Argument("exclude_types", list, optional = True, default = [], doc = doc_exclude_types), @@ -269,7 +270,7 @@ def fitting_ener(): Argument("numb_aparam", int, optional = True, default = 0, doc = doc_numb_aparam), Argument("neuron", list, optional = True, default = [120,120,120], alias = ['n_neuron'], doc = doc_neuron), Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function), - Argument("precision", str, optional = True, default = 'float64', doc = doc_precision), + Argument("precision", str, optional = True, default = GLOBAL_FLOAT_PRECISION, doc 
= doc_precision), Argument("resnet_dt", bool, optional = True, default = True, doc = doc_resnet_dt), Argument("trainable", [list,bool], optional = True, default = True, doc = doc_trainable), Argument("rcond", float, optional = True, default = 1e-3, doc = doc_rcond), @@ -296,7 +297,7 @@ def fitting_polar(): Argument("neuron", list, optional = True, default = [120,120,120], alias = ['n_neuron'], doc = doc_neuron), Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function), Argument("resnet_dt", bool, optional = True, default = True, doc = doc_resnet_dt), - Argument("precision", str, optional = True, default = 'float64', doc = doc_precision), + Argument("precision", str, optional = True, default = GLOBAL_FLOAT_PRECISION, doc = doc_precision), Argument("fit_diag", bool, optional = True, default = True, doc = doc_fit_diag), Argument("scale", [list,float], optional = True, default = 1.0, doc = doc_scale), #Argument("diag_shift", [list,float], optional = True, default = 0.0, doc = doc_diag_shift), @@ -321,7 +322,7 @@ def fitting_dipole(): Argument("neuron", list, optional = True, default = [120,120,120], alias = ['n_neuron'], doc = doc_neuron), Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function), Argument("resnet_dt", bool, optional = True, default = True, doc = doc_resnet_dt), - Argument("precision", str, optional = True, default = 'float64', doc = doc_precision), + Argument("precision", str, optional = True, default = GLOBAL_FLOAT_PRECISION, doc = doc_precision), Argument("sel_type", [list,int,None], optional = True, alias = ['dipole_type'], doc = doc_sel_type), Argument("seed", [int,None], optional = True, doc = doc_seed) ] From 1589b12a513b604d320e0006adb8d1fb798cc96f Mon Sep 17 00:00:00 2001 From: Denghui Lu Date: Tue, 16 Nov 2021 02:28:46 +0800 Subject: [PATCH 03/16] add doc for mixed precision --- doc/train/training-advanced.md | 1 + 1 file changed, 1 insertion(+) diff 
--git a/doc/train/training-advanced.md b/doc/train/training-advanced.md index ea9e1e8075..184a61f1d7 100644 --- a/doc/train/training-advanced.md +++ b/doc/train/training-advanced.md @@ -126,3 +126,4 @@ One can set other environmental variables: | Environment variables | Allowed value | Default value | Usage | | --------------------- | ---------------------- | ------------- | -------------------------- | | DP_INTERFACE_PREC | `high`, `low` | `high` | Control high (double) or low (float) precision of training. | +| DP_ENABLE_MIXED_PREC | `fp16` | | Control mixed precision(fp16) of training and inference. | From fb48b01398c59b2b59689664d553b0a903d1d611 Mon Sep 17 00:00:00 2001 From: denghuilu Date: Tue, 16 Nov 2021 02:32:07 +0800 Subject: [PATCH 04/16] fix typo --- deepmd/utils/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepmd/utils/network.py b/deepmd/utils/network.py index 3ded5db063..45df6357a1 100644 --- a/deepmd/utils/network.py +++ b/deepmd/utils/network.py @@ -44,7 +44,7 @@ def one_layer(inputs, precision, w_initializer, trainable = trainable) - variable_summaries(w, 'matrix') + variable_summaries(w, 'matrix') b = tf.get_variable('bias', [outputs_size], precision, From 4aae04b08f149258fec4b97b59a8e5db97be317e Mon Sep 17 00:00:00 2001 From: denghuilu Date: Tue, 16 Nov 2021 03:29:20 +0800 Subject: [PATCH 05/16] fix UT bug --- deepmd/env.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deepmd/env.py b/deepmd/env.py index 5494f10a19..26cbde003a 100644 --- a/deepmd/env.py +++ b/deepmd/env.py @@ -319,7 +319,7 @@ def _get_package_constants( # MIXED_PREC # only support tf.float16 mixed precision training. 
dp_mixed_prec = os.environ.get("DP_ENABLE_MIXED_PREC", "").lower() -if dp_mixed_prec is "fp16": +if dp_mixed_prec == "fp16": # default setting of the global precision GLOBAL_FLOAT_PRECISION = "float32" GLOBAL_TF_FLOAT_PRECISION = tf.float32 @@ -330,7 +330,7 @@ def _get_package_constants( DP_ENABLE_MIXED_PRECISION = True DP_MIXED_OUTPUT_PRECISION = tf.float32 DP_MIXED_COMPUTE_PRECISION = tf.float16 -elif dp_mixed_prec is "": +elif dp_mixed_prec == "": DP_ENABLE_MIXED_PRECISION = False DP_MIXED_OUTPUT_PRECISION = None DP_MIXED_COMPUTE_PRECISION = None From 5b633a813281004bd7a5c9361613e7b7cdfbef38 Mon Sep 17 00:00:00 2001 From: denghuilu Date: Sun, 21 Nov 2021 21:12:18 +0800 Subject: [PATCH 06/16] use input script to control the mixed precision workflow --- deepmd/descriptor/descriptor.py | 18 +++++++ deepmd/descriptor/se_a.py | 21 +++++++-- deepmd/env.py | 83 --------------------------------- deepmd/fit/dipole.py | 22 +++++++-- deepmd/fit/ener.py | 25 ++++++++-- deepmd/fit/polar.py | 33 +++++++++++-- deepmd/train/trainer.py | 20 ++++++-- deepmd/utils/argcheck.py | 35 ++++++++++---- deepmd/utils/network.py | 51 ++++++++++---------- 9 files changed, 170 insertions(+), 138 deletions(-) diff --git a/deepmd/descriptor/descriptor.py b/deepmd/descriptor/descriptor.py index d179660a9d..0642779985 100644 --- a/deepmd/descriptor/descriptor.py +++ b/deepmd/descriptor/descriptor.py @@ -262,6 +262,24 @@ def enable_compression(self, raise NotImplementedError( "Descriptor %s doesn't support compression!" % type(self).__name__) + + def enable_mixed_precision(self, mixed_prec : dict = None) -> None: + """ + Receive the mixed precision setting. + + Parameters + ---------- + mixed_prec + The mixed precision setting used in the embedding net + + Notes + ----- + This method is called by others when the descriptor supports mixed precision training. + """ + raise NotImplementedError( + "Descriptor %s doesn't support mixed precision training!" 
% type(self).__name__) + + @abstractmethod def prod_force_virial(self, atom_ener: tf.Tensor, diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py index 5ee3aa70f1..91843a47d3 100644 --- a/deepmd/descriptor/se_a.py +++ b/deepmd/descriptor/se_a.py @@ -9,7 +9,6 @@ from deepmd.env import GLOBAL_NP_FLOAT_PRECISION from deepmd.env import op_module from deepmd.env import default_tf_session_config -from deepmd.env import DP_ENABLE_MIXED_PRECISION, cast_to_compute from deepmd.utils.network import embedding_net, embedding_net_rand_seed_shift from deepmd.utils.tabulate import DPTabulate from deepmd.utils.type_embed import embed_atom_type @@ -161,6 +160,7 @@ def __init__ (self, self.davg = None self.compress = False self.embedding_net_variables = None + self.mixed_prec = None self.place_holders = {} nei_type = np.array([]) for ii in range(self.ntypes): @@ -349,6 +349,18 @@ def enable_compression(self, self.dstd = get_tensor_by_name_from_graph(graph, 'descrpt_attr%s/t_std' % suffix) + def enable_mixed_precision(self, mixed_prec : dict = None) -> None: + """ + Receive the mixed precision setting. 
+ + Parameters + ---------- + mixed_prec + The mixed precision setting used in the embedding net + """ + self.mixed_prec = mixed_prec + self.filter_precision = get_precision(mixed_prec['output_prec']) + def build (self, coord_ : tf.Tensor, @@ -709,7 +721,8 @@ def _filter_lower( seed = self.seed, trainable = trainable, uniform_seed = self.uniform_seed, - initial_variables = self.embedding_net_variables) + initial_variables = self.embedding_net_variables, + mixed_prec = self.mixed_prec) if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift else: # we can safely return the final xyz_scatter filled with zero directly @@ -736,8 +749,8 @@ def _filter( name='linear', reuse=None, trainable = True): - if DP_ENABLE_MIXED_PRECISION: - inputs = cast_to_compute(inputs) + if self.mixed_prec is not None: + inputs = tf.cast(inputs, get_precision(self.mixed_prec['compute_prec'])) nframes = tf.shape(tf.reshape(inputs, [-1, natoms[0], self.ndescrpt]))[0] # natom x (nei x 4) shape = inputs.get_shape().as_list() diff --git a/deepmd/env.py b/deepmd/env.py index 26cbde003a..6e6543697e 100644 --- a/deepmd/env.py +++ b/deepmd/env.py @@ -24,14 +24,10 @@ __all__ = [ "GLOBAL_CONFIG", - "GLOBAL_FLOAT_PRECISION", "GLOBAL_TF_FLOAT_PRECISION", "GLOBAL_NP_FLOAT_PRECISION", "GLOBAL_ENER_FLOAT_PRECISION", "global_float_prec", - "DP_ENABLE_MIXED_PRECISION", - "DP_MIXED_OUTPUT_PRECISION", - "DP_MIXED_COMPUTE_PRECISION", "global_cvt_2_tf_float", "global_cvt_2_ener_float", "MODEL_VERSION", @@ -298,13 +294,11 @@ def _get_package_constants( dp_float_prec = os.environ.get("DP_INTERFACE_PREC", "high").lower() if dp_float_prec in ("high", ""): # default is high - GLOBAL_FLOAT_PRECISION = "float64" GLOBAL_TF_FLOAT_PRECISION = tf.float64 GLOBAL_NP_FLOAT_PRECISION = np.float64 GLOBAL_ENER_FLOAT_PRECISION = np.float64 global_float_prec = "double" elif dp_float_prec == "low": - GLOBAL_FLOAT_PRECISION = "float32" GLOBAL_TF_FLOAT_PRECISION = tf.float32 GLOBAL_NP_FLOAT_PRECISION = 
np.float32 GLOBAL_ENER_FLOAT_PRECISION = np.float64 @@ -316,83 +310,6 @@ def _get_package_constants( "DP_INTERFACE_PREC." % dp_float_prec ) -# MIXED_PREC -# only support tf.float16 mixed precision training. -dp_mixed_prec = os.environ.get("DP_ENABLE_MIXED_PREC", "").lower() -if dp_mixed_prec == "fp16": - # default setting of the global precision - GLOBAL_FLOAT_PRECISION = "float32" - GLOBAL_TF_FLOAT_PRECISION = tf.float32 - GLOBAL_NP_FLOAT_PRECISION = np.float32 - GLOBAL_ENER_FLOAT_PRECISION = np.float64 - global_float_prec = "half" - # - DP_ENABLE_MIXED_PRECISION = True - DP_MIXED_OUTPUT_PRECISION = tf.float32 - DP_MIXED_COMPUTE_PRECISION = tf.float16 -elif dp_mixed_prec == "": - DP_ENABLE_MIXED_PRECISION = False - DP_MIXED_OUTPUT_PRECISION = None - DP_MIXED_COMPUTE_PRECISION = None -else: - raise RuntimeError( - "Unsupported mixed precision option: %s. Supported: fp16. " - "Please set mixed precision training with environmental variable " - "DP_ENABLE_MIXED_PREC." % dp_mixed_prec - ) - - -def cast_to_compute(xx: tf.Tensor) -> tf.Tensor: - """Cast tensor to compute precision. - - Parameters - ---------- - xx : tf.Tensor - input tensor - - Returns - ------- - tf.Tensor - output tensor cast to compute precision - - Raises - ------ - RuntimeError - if mixed precision training mode is on - """ - if DP_MIXED_COMPUTE_PRECISION is None: - raise RuntimeError( - "'cast_to_compute' function only support the mixed precision mode." - "Please set mixed precision training with environmental variable " - "DP_ENABLE_MIXED_PREC." - ) - return tf.cast(xx, DP_MIXED_COMPUTE_PRECISION) - -def cast_to_output(xx: tf.Tensor) -> tf.Tensor: - """Cast tensor to output precision. 
- - Parameters - ---------- - xx : tf.Tensor - input tensor - - Returns - ------- - tf.Tensor - output tensor cast to output precision - - Raises - ------ - RuntimeError - if mixed precision training mode is on - """ - if DP_MIXED_COMPUTE_PRECISION is None: - raise RuntimeError( - "'cast_to_output' function only support the mixed precision mode." - "Please set mixed precision training with environmental variable " - "DP_ENABLE_MIXED_PREC." - ) - return tf.cast(xx, DP_MIXED_OUTPUT_PRECISION) def global_cvt_2_tf_float(xx: tf.Tensor) -> tf.Tensor: """Cast tensor to globally set TF precision. diff --git a/deepmd/fit/dipole.py b/deepmd/fit/dipole.py index 6c115e3fb3..5dfa5fab0f 100644 --- a/deepmd/fit/dipole.py +++ b/deepmd/fit/dipole.py @@ -77,6 +77,7 @@ def __init__ (self, self.dim_rot_mat = self.dim_rot_mat_1 * 3 self.useBN = False self.fitting_net_variables = None + self.mixed_prec = None def get_sel_type(self) -> int: """ @@ -141,12 +142,12 @@ def build (self, layer = inputs_i for ii in range(0,len(self.n_neuron)) : if ii >= 1 and self.n_neuron[ii] == self.n_neuron[ii-1] : - layer+= one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, use_timestep = self.resnet_dt, activation_fn = self.fitting_activation_fn, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables) + layer+= one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, use_timestep = self.resnet_dt, activation_fn = self.fitting_activation_fn, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables, mixed_prec = self.mixed_prec) else : - layer = one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, activation_fn = self.fitting_activation_fn, precision = self.fitting_precision, uniform_seed = 
self.uniform_seed, initial_variables = self.fitting_net_variables) + layer = one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, activation_fn = self.fitting_activation_fn, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables, mixed_prec = self.mixed_prec) if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift # (nframes x natoms) x naxis - final_layer = one_layer(layer, self.dim_rot_mat_1, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables) + final_layer = one_layer(layer, self.dim_rot_mat_1, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables, mixed_prec = self.mixed_prec) if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift # (nframes x natoms) x 1 * naxis final_layer = tf.reshape(final_layer, [tf.shape(inputs)[0] * natoms[2+type_i], 1, self.dim_rot_mat_1]) @@ -177,4 +178,17 @@ def init_variables(self, model_file : str The input frozen model file """ - self.fitting_net_variables = get_fitting_net_variables(model_file) \ No newline at end of file + self.fitting_net_variables = get_fitting_net_variables(model_file) + + + def enable_mixed_precision(self, mixed_prec : dict = None) -> None: + """ + Receive the mixed precision setting. 
+ + Parameters + ---------- + mixed_prec + The mixed precision setting used in the embedding net + """ + self.mixed_prec = mixed_prec + self.fitting_precision = get_precision(mixed_prec['output_prec']) \ No newline at end of file diff --git a/deepmd/fit/ener.py b/deepmd/fit/ener.py index 0afcf26de2..3fe4013616 100644 --- a/deepmd/fit/ener.py +++ b/deepmd/fit/ener.py @@ -150,6 +150,7 @@ def __init__ (self, self.aparam_inv_std = None self.fitting_net_variables = None + self.mixed_prec = None def get_numb_fparam(self) -> int: """ @@ -293,7 +294,8 @@ def _build_lower( precision = self.fitting_precision, trainable = self.trainable[ii], uniform_seed = self.uniform_seed, - initial_variables = self.fitting_net_variables) + initial_variables = self.fitting_net_variables, + mixed_prec = self.mixed_prec) else : layer = one_layer( layer, @@ -305,7 +307,8 @@ def _build_lower( precision = self.fitting_precision, trainable = self.trainable[ii], uniform_seed = self.uniform_seed, - initial_variables = self.fitting_net_variables) + initial_variables = self.fitting_net_variables, + mixed_prec = self.mixed_prec) if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift final_layer = one_layer( layer, @@ -318,7 +321,8 @@ def _build_lower( precision = self.fitting_precision, trainable = self.trainable[-1], uniform_seed = self.uniform_seed, - initial_variables = self.fitting_net_variables) + initial_variables = self.fitting_net_variables, + mixed_prec = self.mixed_prec) if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift return final_layer @@ -494,4 +498,17 @@ def init_variables(self, model_file : str The input frozen model file """ - self.fitting_net_variables = get_fitting_net_variables(model_file) \ No newline at end of file + self.fitting_net_variables = get_fitting_net_variables(model_file) + + + def enable_mixed_precision(self, mixed_prec : dict = None) -> None: + """ + Receive the mixed precision setting. 
+ + Parameters + ---------- + mixed_prec + The mixed precision setting used in the embedding net + """ + self.mixed_prec = mixed_prec + self.fitting_precision = get_precision(mixed_prec['output_prec']) \ No newline at end of file diff --git a/deepmd/fit/polar.py b/deepmd/fit/polar.py index 65b1ff6aef..d858d37ac1 100644 --- a/deepmd/fit/polar.py +++ b/deepmd/fit/polar.py @@ -194,6 +194,7 @@ def __init__ (self, self.dim_rot_mat = self.dim_rot_mat_1 * 3 self.useBN = False self.fitting_net_variables = None + self.mixed_prec = None def get_sel_type(self) -> List[int]: """ @@ -324,9 +325,9 @@ def build (self, layer = inputs_i for ii in range(0,len(self.n_neuron)) : if ii >= 1 and self.n_neuron[ii] == self.n_neuron[ii-1] : - layer+= one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, use_timestep = self.resnet_dt, activation_fn = self.fitting_activation_fn, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables) + layer+= one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, use_timestep = self.resnet_dt, activation_fn = self.fitting_activation_fn, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables, mixed_prec = self.mixed_prec) else : - layer = one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, activation_fn = self.fitting_activation_fn, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables) + layer = one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, activation_fn = self.fitting_activation_fn, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables, mixed_prec = 
self.mixed_prec) if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift if self.fit_diag : bavg = np.zeros(self.dim_rot_mat_1) @@ -334,7 +335,7 @@ def build (self, # bavg[1] = self.avgeig[1] # bavg[2] = self.avgeig[2] # (nframes x natoms) x naxis - final_layer = one_layer(layer, self.dim_rot_mat_1, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, bavg = bavg, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables) + final_layer = one_layer(layer, self.dim_rot_mat_1, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, bavg = bavg, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables, mixed_prec = self.mixed_prec) if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift # (nframes x natoms) x naxis final_layer = tf.reshape(final_layer, [tf.shape(inputs)[0] * natoms[2+type_i], self.dim_rot_mat_1]) @@ -346,7 +347,7 @@ def build (self, # bavg[1*self.dim_rot_mat_1+1] = self.avgeig[1] # bavg[2*self.dim_rot_mat_1+2] = self.avgeig[2] # (nframes x natoms) x (naxis x naxis) - final_layer = one_layer(layer, self.dim_rot_mat_1*self.dim_rot_mat_1, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, bavg = bavg, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables) + final_layer = one_layer(layer, self.dim_rot_mat_1*self.dim_rot_mat_1, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, bavg = bavg, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables, mixed_prec = self.mixed_prec) if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift # (nframes x natoms) x 
naxis x naxis final_layer = tf.reshape(final_layer, [tf.shape(inputs)[0] * natoms[2+type_i], self.dim_rot_mat_1, self.dim_rot_mat_1]) @@ -387,6 +388,19 @@ def init_variables(self, self.fitting_net_variables = get_fitting_net_variables(model_file) + def enable_mixed_precision(self, mixed_prec : dict = None) -> None: + """ + Receive the mixed precision setting. + + Parameters + ---------- + mixed_prec + The mixed precision setting used in the fitting net + """ + self.mixed_prec = mixed_prec + self.fitting_precision = get_precision(mixed_prec['output_prec']) + + class GlobalPolarFittingSeA () : """ Fit the system polarizability with descriptor se_a @@ -509,3 +523,14 @@ def init_variables(self, """ self.polar_fitting.init_variables(model_file) + + def enable_mixed_precision(self, mixed_prec : dict = None) -> None: + """ + Receive the mixed precision setting. + + Parameters + ---------- + mixed_prec + The mixed precision setting used in the fitting net + """ + self.polar_fitting.enable_mixed_precision(mixed_prec) diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index d8751a506f..c1ab19ed05 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -11,7 +11,6 @@ from deepmd.env import get_tf_session_config from deepmd.env import GLOBAL_TF_FLOAT_PRECISION from deepmd.env import GLOBAL_ENER_FLOAT_PRECISION -from deepmd.env import DP_ENABLE_MIXED_PRECISION from deepmd.fit import EnerFitting, WFCFitting, PolarFittingLocFrame, PolarFittingSeA, GlobalPolarFittingSeA, DipoleFittingSeA from deepmd.descriptor import Descriptor from deepmd.model import EnerModel, WFCModel, DipoleModel, PolarModel, GlobalPolarModel @@ -30,7 +29,7 @@ # load grad of force module import deepmd.op -from deepmd.common import j_must_have, ClassArg, data_requirement +from deepmd.common import j_must_have, ClassArg, data_requirement, get_precision log = logging.getLogger(__name__) @@ -228,6 +227,13 @@ def _init_param(self, jdata): self.tensorboard = self.run_opt.is_chief and
tr_data.get('tensorboard', False) self.tensorboard_log_dir = tr_data.get('tensorboard_log_dir', 'log') self.tensorboard_freq = tr_data.get('tensorboard_freq', 1) + self.mixed_prec = tr_data.get('mixed_precision', None) + if self.mixed_prec is not None: + if (self.mixed_prec['compute_prec'] != 'float16' or self.mixed_prec['output_prec'] != 'float32'): + raise RuntimeError( + "Unsupported mixed precision option [output_prec, compute_prec]: [%s, %s], " + " Supported: [float32, float16], Please set mixed precision option correctly!" + % (self.mixed_prec['output_prec'], self.mixed_prec['compute_prec'])) # self.sys_probs = tr_data['sys_probs'] # self.auto_prob_style = tr_data['auto_prob'] self.useBN = False @@ -290,6 +296,10 @@ def build (self, tf.constant("compressed_model", name = 'model_type', dtype = tf.string) else: tf.constant("original_model", name = 'model_type', dtype = tf.string) + + if self.mixed_prec is not None: + self.descrpt.enable_mixed_precision(self.mixed_prec) + self.fitting.enable_mixed_precision(self.mixed_prec) self._build_lr() self._build_network(data) @@ -333,8 +343,8 @@ def _build_network(self, data): self.place_holders, suffix = "test") - if DP_ENABLE_MIXED_PRECISION: - self.l2_l = tf.cast(self.l2_l, GLOBAL_TF_FLOAT_PRECISION) + if self.mixed_prec is not None: + self.l2_l = tf.cast(self.l2_l, get_precision(self.mixed_prec['output_prec'])) log.info("built network") def _build_training(self): @@ -348,7 +358,7 @@ def _build_training(self): optimizer = self.run_opt._HVD.DistributedOptimizer(optimizer) else: optimizer = tf.train.AdamOptimizer(learning_rate = self.learning_rate) - if DP_ENABLE_MIXED_PRECISION: + if self.mixed_prec is not None: # enable dynamic loss scale of the gradients optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer) apply_op = optimizer.minimize(loss=self.l2_l, diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index 736d2f894f..847eccc52e 100644 --- a/deepmd/utils/argcheck.py +++ 
b/deepmd/utils/argcheck.py @@ -4,7 +4,6 @@ from deepmd import descriptor from deepmd.common import ACTIVATION_FN_DICT, PRECISION_DICT from deepmd.utils.plugin import Plugin -from deepmd.env import GLOBAL_FLOAT_PRECISION import json @@ -36,7 +35,7 @@ def type_embedding_args(): Argument("neuron", list, optional = True, default = [2, 4, 8], doc = doc_neuron), Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function), Argument("resnet_dt", bool, optional = True, default = False, doc = doc_resnet_dt), - Argument("precision", str, optional = True, default = GLOBAL_FLOAT_PRECISION, doc = doc_precision), + Argument("precision", str, optional = True, default = "float64", doc = doc_precision), Argument("trainable", bool, optional = True, default = True, doc = doc_trainable), Argument("seed", [int,None], optional = True, doc = doc_seed), ] @@ -139,7 +138,7 @@ def descrpt_se_a_args(): Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function), Argument("resnet_dt", bool, optional = True, default = False, doc = doc_resnet_dt), Argument("type_one_side", bool, optional = True, default = False, doc = doc_type_one_side), - Argument("precision", str, optional = True, default = GLOBAL_FLOAT_PRECISION, doc = doc_precision), + Argument("precision", str, optional = True, default = "float64", doc = doc_precision), Argument("trainable", bool, optional = True, default = True, doc = doc_trainable), Argument("seed", [int,None], optional = True, doc = doc_seed), Argument("exclude_types", list, optional = True, default = [], doc = doc_exclude_types), @@ -169,7 +168,7 @@ def descrpt_se_t_args(): Argument("neuron", list, optional = True, default = [10,20,40], doc = doc_neuron), Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function), Argument("resnet_dt", bool, optional = True, default = False, doc = doc_resnet_dt), - Argument("precision", str, optional = True, 
default = GLOBAL_FLOAT_PRECISION, doc = doc_precision), + Argument("precision", str, optional = True, default = "float64", doc = doc_precision), Argument("trainable", bool, optional = True, default = True, doc = doc_trainable), Argument("seed", [int,None], optional = True, doc = doc_seed), Argument("set_davg_zero", bool, optional = True, default = False, doc = doc_set_davg_zero) @@ -215,7 +214,7 @@ def descrpt_se_r_args(): Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function), Argument("resnet_dt", bool, optional = True, default = False, doc = doc_resnet_dt), Argument("type_one_side", bool, optional = True, default = False, doc = doc_type_one_side), - Argument("precision", str, optional = True, default = GLOBAL_FLOAT_PRECISION, doc = doc_precision), + Argument("precision", str, optional = True, default = "float64", doc = doc_precision), Argument("trainable", bool, optional = True, default = True, doc = doc_trainable), Argument("seed", [int,None], optional = True, doc = doc_seed), Argument("exclude_types", list, optional = True, default = [], doc = doc_exclude_types), @@ -270,7 +269,7 @@ def fitting_ener(): Argument("numb_aparam", int, optional = True, default = 0, doc = doc_numb_aparam), Argument("neuron", list, optional = True, default = [120,120,120], alias = ['n_neuron'], doc = doc_neuron), Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function), - Argument("precision", str, optional = True, default = GLOBAL_FLOAT_PRECISION, doc = doc_precision), + Argument("precision", str, optional = True, default = 'float64', doc = doc_precision), Argument("resnet_dt", bool, optional = True, default = True, doc = doc_resnet_dt), Argument("trainable", [list,bool], optional = True, default = True, doc = doc_trainable), Argument("rcond", float, optional = True, default = 1e-3, doc = doc_rcond), @@ -297,7 +296,7 @@ def fitting_polar(): Argument("neuron", list, optional = True, default 
= [120,120,120], alias = ['n_neuron'], doc = doc_neuron), Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function), Argument("resnet_dt", bool, optional = True, default = True, doc = doc_resnet_dt), - Argument("precision", str, optional = True, default = GLOBAL_FLOAT_PRECISION, doc = doc_precision), + Argument("precision", str, optional = True, default = 'float64', doc = doc_precision), Argument("fit_diag", bool, optional = True, default = True, doc = doc_fit_diag), Argument("scale", [list,float], optional = True, default = 1.0, doc = doc_scale), #Argument("diag_shift", [list,float], optional = True, default = 0.0, doc = doc_diag_shift), @@ -322,7 +321,7 @@ def fitting_dipole(): Argument("neuron", list, optional = True, default = [120,120,120], alias = ['n_neuron'], doc = doc_neuron), Argument("activation_function", str, optional = True, default = 'tanh', doc = doc_activation_function), Argument("resnet_dt", bool, optional = True, default = True, doc = doc_resnet_dt), - Argument("precision", str, optional = True, default = GLOBAL_FLOAT_PRECISION, doc = doc_precision), + Argument("precision", str, optional = True, default = 'float64', doc = doc_precision), Argument("sel_type", [list,int,None], optional = True, alias = ['dipole_type'], doc = doc_sel_type), Argument("seed", [int,None], optional = True, doc = doc_seed) ] @@ -601,6 +600,24 @@ def validation_data_args(): # ! added by Ziyao: new specification style for dat sub_fields=args, sub_variants=[], doc=doc_validation_data) +def mixed_precision_args(): # ! added by Denghui. + doc_output_prec = 'The precision for mixed precision params. " \ + "The trainable variables precision during the mixed precision training process, " \ + "supported options are float32 only currently.' + doc_compute_prec = 'The precision for mixed precision compute. " \ + "The compute precision during the mixed precision training process, "" \ + "supported options are float16 only currently.' 
+ + args = [ + Argument("output_prec", str, optional=True, default="float32", doc=doc_output_prec), + Argument("compute_prec", str, optional=False, default="float16", doc=doc_compute_prec), + ] + + doc_mixed_precision = "Configurations of mixed precision." + return Argument("mixed_precision", dict, optional=True, + sub_fields=args, sub_variants=[], doc=doc_mixed_precision) + + def training_args(): # ! modified by Ziyao: data configuration isolated. doc_numb_steps = 'Number of training batch. Each training uses one batch of data.' doc_seed = 'The random seed for getting frames from the training data set.' @@ -618,10 +635,12 @@ def training_args(): # ! modified by Ziyao: data configuration isolated. arg_training_data = training_data_args() arg_validation_data = validation_data_args() + mixed_precision_data = mixed_precision_args() args = [ arg_training_data, arg_validation_data, + mixed_precision_data, Argument("numb_steps", int, optional=False, doc=doc_numb_steps, alias=["stop_batch"]), Argument("seed", [int,None], optional=True, doc=doc_seed), Argument("disp_file", str, optional=True, default='lcurve.out', doc=doc_disp_file), diff --git a/deepmd/utils/network.py b/deepmd/utils/network.py index 45df6357a1..cadcc7964b 100644 --- a/deepmd/utils/network.py +++ b/deepmd/utils/network.py @@ -2,7 +2,7 @@ from deepmd.env import tf from deepmd.env import GLOBAL_TF_FLOAT_PRECISION -from deepmd.env import DP_ENABLE_MIXED_PRECISION, cast_to_compute, cast_to_output +from deepmd.common import get_precision def one_layer_rand_seed_shift(): return 3 @@ -17,16 +17,14 @@ def one_layer(inputs, reuse=None, seed=None, use_timestep = False, - trainable = True, + trainable = False, useBN = False, uniform_seed = False, - initial_variables = None): - # Do mixed precision training check - if DP_ENABLE_MIXED_PRECISION and precision is not tf.float32: - raise RuntimeError("The network precision %s does not match the mixed precision training settting! Please check the input training script. 
" % (precision)) - # For good accuracy, the last layer of the fitting network uses a single-precision neuron network. - if DP_ENABLE_MIXED_PRECISION and outputs_size is 1: - inputs = cast_to_output(inputs) + initial_variables = None, + mixed_prec = None): + # For good accuracy, the last layer of the fitting network uses a higher precision neuron network. + if mixed_prec is not None and outputs_size == 1: + inputs = tf.cast(inputs, get_precision(mixed_prec['output_prec'])) with tf.variable_scope(name, reuse=reuse): shape = inputs.get_shape().as_list() w_initializer = tf.random_normal_initializer( @@ -51,10 +49,12 @@ def one_layer(inputs, b_initializer, trainable = trainable) variable_summaries(b, 'bias') - if DP_ENABLE_MIXED_PRECISION and outputs_size is not 1: - inputs = cast_to_compute(inputs) - w = cast_to_compute(w) - b = cast_to_compute(b) + + if mixed_prec is not None and outputs_size != 1: + inputs = tf.cast(inputs, get_precision(mixed_prec['compute_prec'])) + w = tf.cast(w, get_precision(mixed_prec['compute_prec'])) + b = tf.cast(b, get_precision(mixed_prec['compute_prec'])) + hidden = tf.matmul(inputs, w) + b if activation_fn != None and use_timestep : idt_initializer = tf.random_normal_initializer( @@ -76,8 +76,8 @@ def one_layer(inputs, # return activation_fn(hidden_bn) else: if use_timestep : - if DP_ENABLE_MIXED_PRECISION and outputs_size is not 1: - idt = cast_to_compute(idt) + if mixed_prec is not None and outputs_size != 1: + idt = tf.cast(idt, get_precision(mixed_prec['compute_prec'])) return tf.reshape(activation_fn(hidden), [-1, outputs_size]) * idt else : return tf.reshape(activation_fn(hidden), [-1, outputs_size]) @@ -106,7 +106,8 @@ def embedding_net(xx, seed = None, trainable = True, uniform_seed = False, - initial_variables = None): + initial_variables = None, + mixed_prec = None): r"""The embedding network. 
The embedding network function :math:`\mathcal{N}` is constructed by is the @@ -159,6 +160,8 @@ def embedding_net(xx, Only for the purpose of backward compatibility, retrieves the old behavior of using the random seed initial_variables : dict The input dict which stores the embedding net variables + mixed_prec + The input dict which stores the mixed precision setting for the embedding net References @@ -167,9 +170,6 @@ def embedding_net(xx, in deep residual networks. InComputer Vision – ECCV 2016,pages 630–645. Springer International Publishing, 2016. """ - # Do mixed precision training check - if DP_ENABLE_MIXED_PRECISION and precision is not tf.float32: - raise RuntimeError("The network precision %s does not match the mixed precision training settting! Please check the input training script. " % (precision)) input_shape = xx.get_shape().as_list() outputs_size = [input_shape[1]] + network_size @@ -201,10 +201,10 @@ def embedding_net(xx, trainable = trainable) variable_summaries(b, 'bias_'+str(ii)+name_suffix) - if DP_ENABLE_MIXED_PRECISION: - xx = cast_to_compute(xx) - w = cast_to_compute(w) - b = cast_to_compute(b) + if mixed_prec is not None: + xx = tf.cast(xx, get_precision(mixed_prec['compute_prec'])) + w = tf.cast(w, get_precision(mixed_prec['compute_prec'])) + b = tf.cast(b, get_precision(mixed_prec['compute_prec'])) hidden = tf.reshape(activation_fn(tf.matmul(xx, w) + b), [-1, outputs_size[ii]]) if resnet_dt : idt_initializer = tf.random_normal_initializer( @@ -221,8 +221,8 @@ def embedding_net(xx, idt_initializer, trainable = trainable) variable_summaries(idt, 'idt_'+str(ii)+name_suffix) - if DP_ENABLE_MIXED_PRECISION: - idt = cast_to_compute(idt) + if mixed_prec is not None: + idt = tf.cast(idt, get_precision(mixed_prec['compute_prec'])) if outputs_size[ii] == outputs_size[ii-1]: if resnet_dt : @@ -236,7 +236,6 @@ def embedding_net(xx, xx = tf.concat([xx,xx], 1) + hidden else: xx = hidden - return xx def variable_summaries(var: tf.Variable, name: str): 
From b47c56dd97757dd64f04ea5dc3013a36e88496eb Mon Sep 17 00:00:00 2001 From: denghuilu Date: Sun, 21 Nov 2021 21:35:44 +0800 Subject: [PATCH 07/16] add tf version check for mixed precision --- deepmd/env.py | 1 + deepmd/train/trainer.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/deepmd/env.py b/deepmd/env.py index 6e6543697e..577da78ed5 100644 --- a/deepmd/env.py +++ b/deepmd/env.py @@ -39,6 +39,7 @@ "TRANSFER_PATTERN", "FITTING_NET_PATTERN", "EMBEDDING_NET_PATTERN", + "TF_VERSION" ] SHARED_LIB_MODULE = "op" diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index c1ab19ed05..75d36cf246 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -23,7 +23,7 @@ from deepmd.utils.graph import get_tensor_by_name from tensorflow.python.client import timeline -from deepmd.env import op_module +from deepmd.env import op_module, TF_VERSION from deepmd.utils.errors import GraphWithoutTensorError # load grad of force module @@ -359,6 +359,9 @@ def _build_training(self): else: optimizer = tf.train.AdamOptimizer(learning_rate = self.learning_rate) if self.mixed_prec is not None: + # check the TF_VERSION, when TF < 1.12, mixed precision is not allowed + if TF_VERSION < "1.12": + raise RuntimeError("TensorFlow version %s is not compatible with the mixed precision setting. Please consider upgrade your TF version!" 
% TF_VERSION) # enable dynamic loss scale of the gradients optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer) apply_op = optimizer.minimize(loss=self.l2_l, From af3fcfb0872ef6d3da9da409ea9eea74e57d354f Mon Sep 17 00:00:00 2001 From: Denghui Lu Date: Sun, 21 Nov 2021 21:54:12 +0800 Subject: [PATCH 08/16] Update training-advanced.md --- doc/train/training-advanced.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/train/training-advanced.md b/doc/train/training-advanced.md index 184a61f1d7..7226b57847 100644 --- a/doc/train/training-advanced.md +++ b/doc/train/training-advanced.md @@ -36,6 +36,10 @@ Other training parameters are given in the `training` section. "batch_size": 1, "numb_btch": 3 }, + "mixed_precision": { + "output_prec": "float32", + "compute_prec": "float16" + }, "numb_step": 1000000, "seed": 1, @@ -75,6 +79,13 @@ The sections `"training_data"` and `"validation_data"` give the training dataset * `"auto:N"`: automatically determines the batch size so that the `batch_size` times the number of atoms in the system is no less than `N`. * The key `numb_batch` in `validate_data` gives the number of batches of model validation. Note that the batches may not be from the same system +The section `mixed_precision` specifies the mixed precision settings, which will enable the mixed precision training workflow for deepmd-kit. The keys are explained below: +* `output_prec` precision used in the output tensors, only `float32` is supported currently. +* `compute_prec` precision used in the computing tensors, only `float16` is supported currently. +Note there are several limitations of mixed precision training: +* Only 'se_e2_a' type descriptor is supported by the mixed precision training workflow. +* The precision of embedding net and fitting net are forced to be set to `float32`. + Other keys in the `training` section are explained below: * `numb_step` The number of training steps.
* `seed` The random seed for getting frames from the training data set. @@ -126,4 +137,3 @@ One can set other environmental variables: | Environment variables | Allowed value | Default value | Usage | | --------------------- | ---------------------- | ------------- | -------------------------- | | DP_INTERFACE_PREC | `high`, `low` | `high` | Control high (double) or low (float) precision of training. | -| DP_ENABLE_MIXED_PREC | `fp16` | | Control mixed precision(fp16) of training and inference. | From 646233ea39d129a2acc6e593c97eb60d132e68f2 Mon Sep 17 00:00:00 2001 From: denghuilu Date: Sun, 21 Nov 2021 21:58:48 +0800 Subject: [PATCH 09/16] fix typo --- deepmd/train/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index 75d36cf246..78ef1e3b9a 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -361,7 +361,7 @@ def _build_training(self): if self.mixed_prec is not None: # check the TF_VERSION, when TF < 1.12, mixed precision is not allowed if TF_VERSION < "1.12": - raise RuntimeError("TensorFlow version %s is not compatible with the mixed precision setting. Please consider upgrade your TF version!" % TF_VERSION) + raise RuntimeError("TensorFlow version %s is not compatible with the mixed precision setting. Please consider upgrading your TF version!" 
% TF_VERSION) # enable dynamic loss scale of the gradients optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer) apply_op = optimizer.minimize(loss=self.l2_l, From e945ed01f7f2bb602ba9c7f4eb461a6d925a4778 Mon Sep 17 00:00:00 2001 From: denghuilu Date: Sun, 21 Nov 2021 23:22:42 +0800 Subject: [PATCH 10/16] fix TF_VERSION control --- deepmd/train/trainer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index 78ef1e3b9a..ae9a8a62a5 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -362,8 +362,10 @@ def _build_training(self): # check the TF_VERSION, when TF < 1.12, mixed precision is not allowed if TF_VERSION < "1.12": raise RuntimeError("TensorFlow version %s is not compatible with the mixed precision setting. Please consider upgrading your TF version!" % TF_VERSION) - # enable dynamic loss scale of the gradients - optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer) + elif TF_VERSION < "2.4": + optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer) + else: + optimizer = tf.mixed_precision.enable_mixed_precision_graph_rewrite(optimizer) apply_op = optimizer.minimize(loss=self.l2_l, global_step=self.global_step, var_list=trainable_variables, From 972a5b19fa746295d7716a9e958792e6832483c7 Mon Sep 17 00:00:00 2001 From: denghuilu Date: Sun, 21 Nov 2021 23:44:23 +0800 Subject: [PATCH 11/16] fix TF_VERSION comparison --- deepmd/train/trainer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index ae9a8a62a5..6c38dff376 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -359,10 +359,11 @@ def _build_training(self): else: optimizer = tf.train.AdamOptimizer(learning_rate = self.learning_rate) if self.mixed_prec is not None: + TF_VERSION_LIST = [int(item) for item in TF_VERSION.split('.')] # check the 
TF_VERSION, when TF < 1.12, mixed precision is not allowed - if TF_VERSION < "1.12": + if TF_VERSION_LIST < [1, 12, 0]: raise RuntimeError("TensorFlow version %s is not compatible with the mixed precision setting. Please consider upgrading your TF version!" % TF_VERSION) - elif TF_VERSION < "2.4": + elif TF_VERSION_LIST < [2, 4, 0]: optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer) else: optimizer = tf.mixed_precision.enable_mixed_precision_graph_rewrite(optimizer) From fcdfb31832badcff1c31fa20a131ed74ffc8cca5 Mon Sep 17 00:00:00 2001 From: denghuilu Date: Mon, 22 Nov 2021 09:02:57 +0800 Subject: [PATCH 12/16] enable mixed precision for hybrid descriptor --- deepmd/descriptor/hybrid.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/deepmd/descriptor/hybrid.py b/deepmd/descriptor/hybrid.py index 9d8967faee..39cb9fed0e 100644 --- a/deepmd/descriptor/hybrid.py +++ b/deepmd/descriptor/hybrid.py @@ -264,6 +264,20 @@ def enable_compression(self, for idx, ii in enumerate(self.descrpt_list): ii.enable_compression(min_nbor_dist, model_file, table_extrapolate, table_stride_1, table_stride_2, check_frequency, suffix=f"{suffix}_{idx}") + + def enable_mixed_precision(self, mixed_prec : dict = None) -> None: + """ + Receive the mixed precision setting.
+ + Parameters + ---------- + mixed_prec + The mixed precision setting used in the embedding net + """ + for idx, ii in enumerate(self.descrpt_list): + ii.enable_mixed_precision(mixed_prec) + + def init_variables(self, model_file : str, suffix : str = "", From b868ea3ffe7faf4727aae1795b1b760a53693a04 Mon Sep 17 00:00:00 2001 From: denghuilu Date: Mon, 22 Nov 2021 09:28:54 +0800 Subject: [PATCH 13/16] Update network.py --- deepmd/utils/network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepmd/utils/network.py b/deepmd/utils/network.py index cadcc7964b..824af455d4 100644 --- a/deepmd/utils/network.py +++ b/deepmd/utils/network.py @@ -17,7 +17,7 @@ def one_layer(inputs, reuse=None, seed=None, use_timestep = False, - trainable = False, + trainable = True, useBN = False, uniform_seed = False, initial_variables = None, From 6d517edaaf76de6233149191c05e5c07e03d747b Mon Sep 17 00:00:00 2001 From: denghuilu Date: Mon, 22 Nov 2021 10:01:11 +0800 Subject: [PATCH 14/16] use parameter to control the network mixed precision output precision --- deepmd/fit/dipole.py | 2 +- deepmd/fit/ener.py | 3 ++- deepmd/fit/polar.py | 6 +++--- deepmd/utils/network.py | 9 +++++---- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/deepmd/fit/dipole.py b/deepmd/fit/dipole.py index 5dfa5fab0f..7c2d5dea86 100644 --- a/deepmd/fit/dipole.py +++ b/deepmd/fit/dipole.py @@ -147,7 +147,7 @@ def build (self, layer = one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, activation_fn = self.fitting_activation_fn, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables, mixed_prec = self.mixed_prec) if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift # (nframes x natoms) x naxis - final_layer = one_layer(layer, self.dim_rot_mat_1, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, 
seed = self.seed, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables, mixed_prec = self.mixed_prec) + final_layer = one_layer(layer, self.dim_rot_mat_1, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables, mixed_prec = self.mixed_prec, final_layer = True) if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift # (nframes x natoms) x 1 * naxis final_layer = tf.reshape(final_layer, [tf.shape(inputs)[0] * natoms[2+type_i], 1, self.dim_rot_mat_1]) diff --git a/deepmd/fit/ener.py b/deepmd/fit/ener.py index 3fe4013616..e6b0d0a763 100644 --- a/deepmd/fit/ener.py +++ b/deepmd/fit/ener.py @@ -322,7 +322,8 @@ def _build_lower( trainable = self.trainable[-1], uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables, - mixed_prec = self.mixed_prec) + mixed_prec = self.mixed_prec, + final_layer = True) if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift return final_layer diff --git a/deepmd/fit/polar.py b/deepmd/fit/polar.py index d858d37ac1..5f6ddd7525 100644 --- a/deepmd/fit/polar.py +++ b/deepmd/fit/polar.py @@ -79,7 +79,7 @@ def build (self, else : layer = one_layer(layer, self.n_neuron[ii], name='layer_'+str(ii)+'_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, activation_fn = self.fitting_activation_fn, precision = self.fitting_precision) # (nframes x natoms) x 9 - final_layer = one_layer(layer, 9, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, precision = self.fitting_precision) + final_layer = one_layer(layer, 9, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, precision = self.fitting_precision, final_layer = True) # (nframes x natoms) x 3 x 3 
final_layer = tf.reshape(final_layer, [tf.shape(inputs)[0] * natoms[2+type_i], 3, 3]) # (nframes x natoms) x 3 x 3 @@ -335,7 +335,7 @@ def build (self, # bavg[1] = self.avgeig[1] # bavg[2] = self.avgeig[2] # (nframes x natoms) x naxis - final_layer = one_layer(layer, self.dim_rot_mat_1, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, bavg = bavg, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables, mixed_prec = self.mixed_prec) + final_layer = one_layer(layer, self.dim_rot_mat_1, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, bavg = bavg, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables, mixed_prec = self.mixed_prec, final_layer = True) if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift # (nframes x natoms) x naxis final_layer = tf.reshape(final_layer, [tf.shape(inputs)[0] * natoms[2+type_i], self.dim_rot_mat_1]) @@ -347,7 +347,7 @@ def build (self, # bavg[1*self.dim_rot_mat_1+1] = self.avgeig[1] # bavg[2*self.dim_rot_mat_1+2] = self.avgeig[2] # (nframes x natoms) x (naxis x naxis) - final_layer = one_layer(layer, self.dim_rot_mat_1*self.dim_rot_mat_1, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, bavg = bavg, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables, mixed_prec = self.mixed_prec) + final_layer = one_layer(layer, self.dim_rot_mat_1*self.dim_rot_mat_1, activation_fn = None, name='final_layer_type_'+str(type_i)+suffix, reuse=reuse, seed = self.seed, bavg = bavg, precision = self.fitting_precision, uniform_seed = self.uniform_seed, initial_variables = self.fitting_net_variables, mixed_prec = self.mixed_prec, final_layer = True) if (not self.uniform_seed) and (self.seed is not 
None): self.seed += self.seed_shift # (nframes x natoms) x naxis x naxis final_layer = tf.reshape(final_layer, [tf.shape(inputs)[0] * natoms[2+type_i], self.dim_rot_mat_1, self.dim_rot_mat_1]) diff --git a/deepmd/utils/network.py b/deepmd/utils/network.py index 824af455d4..c82721becb 100644 --- a/deepmd/utils/network.py +++ b/deepmd/utils/network.py @@ -21,9 +21,10 @@ def one_layer(inputs, useBN = False, uniform_seed = False, initial_variables = None, - mixed_prec = None): + mixed_prec = None, + final_layer = False): # For good accuracy, the last layer of the fitting network uses a higher precision neuron network. - if mixed_prec is not None and outputs_size == 1: + if mixed_prec is not None and final_layer: inputs = tf.cast(inputs, get_precision(mixed_prec['output_prec'])) with tf.variable_scope(name, reuse=reuse): shape = inputs.get_shape().as_list() @@ -50,7 +51,7 @@ def one_layer(inputs, trainable = trainable) variable_summaries(b, 'bias') - if mixed_prec is not None and outputs_size != 1: + if mixed_prec is not None and not final_layer: inputs = tf.cast(inputs, get_precision(mixed_prec['compute_prec'])) w = tf.cast(w, get_precision(mixed_prec['compute_prec'])) b = tf.cast(b, get_precision(mixed_prec['compute_prec'])) @@ -76,7 +77,7 @@ def one_layer(inputs, # return activation_fn(hidden_bn) else: if use_timestep : - if mixed_prec is not None and outputs_size != 1: + if mixed_prec is not None and not final_layer: idt = tf.cast(idt, get_precision(mixed_prec['compute_prec'])) return tf.reshape(activation_fn(hidden), [-1, outputs_size]) * idt else : From e447ab5b75b3927fb9e5840c2d05d2e6510b8a49 Mon Sep 17 00:00:00 2001 From: denghuilu Date: Mon, 22 Nov 2021 13:52:58 +0800 Subject: [PATCH 15/16] add example for mixed precision training workflow --- deepmd/descriptor/descriptor.py | 4 +- deepmd/train/trainer.py | 8 ++- examples/water/se_e2_a_mixed_prec/input.json | 70 ++++++++++++++++++++ requirements.txt | 1 + 4 files changed, 79 insertions(+), 4 deletions(-) create 
mode 100644 examples/water/se_e2_a_mixed_prec/input.json diff --git a/deepmd/descriptor/descriptor.py b/deepmd/descriptor/descriptor.py index 0642779985..a500826808 100644 --- a/deepmd/descriptor/descriptor.py +++ b/deepmd/descriptor/descriptor.py @@ -277,7 +277,9 @@ def enable_mixed_precision(self, mixed_prec : dict = None) -> None: This method is called by others when the descriptor supported compression. """ raise NotImplementedError( - "Descriptor %s doesn't support mixed precision training!" % type(self).__name__) + "Descriptor %s doesn't support mixed precision training!" + % type(self).__name__ + ) @abstractmethod diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index 6c38dff376..0e1ed6d3e8 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -7,6 +7,8 @@ import shutil import google.protobuf.message import numpy as np +from packaging.version import Version + from deepmd.env import tf from deepmd.env import get_tf_session_config from deepmd.env import GLOBAL_TF_FLOAT_PRECISION @@ -359,11 +361,11 @@ def _build_training(self): else: optimizer = tf.train.AdamOptimizer(learning_rate = self.learning_rate) if self.mixed_prec is not None: - TF_VERSION_LIST = [int(item) for item in TF_VERSION.split('.')] + _TF_VERSION = Version(TF_VERSION) # check the TF_VERSION, when TF < 1.12, mixed precision is not allowed - if TF_VERSION_LIST < [1, 12, 0]: + if _TF_VERSION < Version('1.12.0'): raise RuntimeError("TensorFlow version %s is not compatible with the mixed precision setting. Please consider upgrading your TF version!" 
% TF_VERSION) - elif TF_VERSION_LIST < [2, 4, 0]: + elif _TF_VERSION < Version('2.4.0'): optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer) else: optimizer = tf.mixed_precision.enable_mixed_precision_graph_rewrite(optimizer) diff --git a/examples/water/se_e2_a_mixed_prec/input.json b/examples/water/se_e2_a_mixed_prec/input.json new file mode 100644 index 0000000000..889abedabf --- /dev/null +++ b/examples/water/se_e2_a_mixed_prec/input.json @@ -0,0 +1,70 @@ +{ + "_comment": " model parameters", + "model": { + "type_map": ["O", "H"], + "descriptor" :{ + "type": "se_e2_a", + "sel": [46, 92], + "rcut_smth": 0.50, + "rcut": 6.00, + "neuron": [25, 50, 100], + "resnet_dt": false, + "axis_neuron": 16, + "seed": 1, + "_comment": " that's all" + }, + "fitting_net" : { + "neuron": [240, 240, 240], + "resnet_dt": true, + "seed": 1, + "_comment": " that's all" + }, + "_comment": " that's all" + }, + + "learning_rate" :{ + "type": "exp", + "decay_steps": 5000, + "start_lr": 0.001, + "stop_lr": 3.51e-8, + "_comment": "that's all" + }, + + "loss" :{ + "type": "ener", + "start_pref_e": 0.02, + "limit_pref_e": 1, + "start_pref_f": 1000, + "limit_pref_f": 1, + "start_pref_v": 0, + "limit_pref_v": 0, + "_comment": " that's all" + }, + + "training" : { + "training_data": { + "systems": ["../data/data_0/", "../data/data_1/", "../data/data_2/"], + "batch_size": "auto", + "_comment": "that's all" + }, + "validation_data":{ + "systems": ["../data/data_3"], + "batch_size": 1, + "numb_btch": 3, + "_comment": "that's all" + }, + "mixed_precision": { + "compute_prec": "float16", + "output_prec": "float32" + }, + "numb_steps": 1000000, + "seed": 10, + "disp_file": "lcurve.out", + "disp_freq": 100, + "save_freq": 1000, + "_comment": "that's all" + }, + + "_comment": "that's all" +} + diff --git a/requirements.txt b/requirements.txt index f3ead805b8..06b71f825c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ python-hostlist >= 1.21 
typing_extensions; python_version < "3.7" h5py wcmatch +packaging From 6fa19c9404e01c8f0b5396af6e9c8220d01eee98 Mon Sep 17 00:00:00 2001 From: denghuilu Date: Mon, 22 Nov 2021 15:23:52 +0800 Subject: [PATCH 16/16] fix lint errors --- deepmd/descriptor/descriptor.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/deepmd/descriptor/descriptor.py b/deepmd/descriptor/descriptor.py index a500826808..231f3abe1e 100644 --- a/deepmd/descriptor/descriptor.py +++ b/deepmd/descriptor/descriptor.py @@ -262,8 +262,7 @@ def enable_compression(self, raise NotImplementedError( "Descriptor %s doesn't support compression!" % type(self).__name__) - - def enable_mixed_precision(self, mixed_prec : dict = None) -> None: + def enable_mixed_precision(self, mixed_prec: dict = None) -> None: """ Reveive the mixed precision setting. @@ -271,7 +270,7 @@ def enable_mixed_precision(self, mixed_prec : dict = None) -> None: ---------- mixed_prec The mixed precision setting used in the embedding net - + Notes ----- This method is called by others when the descriptor supported compression. @@ -281,7 +280,6 @@ def enable_mixed_precision(self, mixed_prec : dict = None) -> None: % type(self).__name__ ) - @abstractmethod def prod_force_virial(self, atom_ener: tf.Tensor,