From ae56a4e8d5bd25077ca2b2110644e6f629366618 Mon Sep 17 00:00:00 2001 From: denghuilu Date: Fri, 11 Feb 2022 11:35:00 +0800 Subject: [PATCH 1/6] fix bug of mixed precision training --- deepmd/descriptor/se_a.py | 2 +- source/tests/test_mixed_prec_training.py | 55 ++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 source/tests/test_mixed_prec_training.py diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py index cf218309bd..fd47b139f2 100644 --- a/deepmd/descriptor/se_a.py +++ b/deepmd/descriptor/se_a.py @@ -738,7 +738,7 @@ def _filter_lower( # we can safely return the final xyz_scatter filled with zero directly return tf.cast(tf.fill((natom, 4, outputs_size[-1]), 0.), self.filter_precision) # natom x nei_type_i x out_size - xyz_scatter = tf.reshape(xyz_scatter, (-1, shape_i[1]//4, outputs_size[-1])) + xyz_scatter = tf.cast(tf.reshape(xyz_scatter, (-1, shape_i[1]//4, outputs_size[-1])), self.filter_precision) # When using tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]) below # [588 24] -> [588 6 4] correct # but if sel is zero diff --git a/source/tests/test_mixed_prec_training.py b/source/tests/test_mixed_prec_training.py new file mode 100644 index 0000000000..2152541a7f --- /dev/null +++ b/source/tests/test_mixed_prec_training.py @@ -0,0 +1,55 @@ +import os,json +import numpy as np +import unittest +import subprocess as sp + +from deepmd.infer import DeepPot +# from deepmd.entrypoints.compress import compress +from common import j_loader, tests_path + + +def _file_delete(file) : + if os.path.isdir(file): + os.rmdir(file) + elif os.path.isfile(file): + os.remove(file) + +def _subprocess_run(command): + popen = sp.Popen(command.split(), shell=False, stdout=sp.PIPE, stderr=sp.STDOUT) + for line in iter(popen.stdout.readline, b''): + if hasattr(line, 'decode'): + line = line.decode('utf-8') + line = line.rstrip() + print(line) + popen.wait() + return popen.returncode + +class TestMixedPrecTraining(unittest.TestCase): + def setUp(self): + data_file = str(tests_path / os.path.join("model_compression", "data")) + self.INPUT = str(tests_path / "input.json") + jdata = j_loader(str(tests_path / os.path.join("model_compression", "input.json"))) + jdata["training"]["training_data"]["systems"] = data_file + jdata["training"]["validation_data"]["systems"] = data_file + jdata["training"]["mixed_precision"] = {} + jdata["training"]["mixed_precision"]["compute_prec"] = "float16" + jdata["training"]["mixed_precision"]["output_prec"] = "float32" + with open(self.INPUT, "w") as fp: + json.dump(jdata, fp, indent=4) + + def test_training(self): + ret = _subprocess_run("dp train " + self.INPUT) + np.testing.assert_equal(ret, 0, 'DP train failed!') + + def tearDown(self): + _file_delete(self.INPUT) + _file_delete("out.json") + _file_delete("checkpoint") + _file_delete("model.ckpt.meta") + _file_delete("model.ckpt.index") + _file_delete("model.ckpt.data-00000-of-00001") + _file_delete("model.ckpt-100.meta") + _file_delete("model.ckpt-100.index") + _file_delete("model.ckpt-100.data-00000-of-00001") + _file_delete("input_v2_compat.json") + _file_delete("lcurve.out") From b9157ba9ad4fca2a18528532173107dd4663be58 Mon Sep 17 00:00:00 2001 From: denghuilu Date: Fri, 11 Feb 2022 11:53:24 +0800 Subject: [PATCH 2/6] Update test_mixed_prec_training.py --- source/tests/test_mixed_prec_training.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/source/tests/test_mixed_prec_training.py b/source/tests/test_mixed_prec_training.py index 2152541a7f..975cdfbb51 100644 --- a/source/tests/test_mixed_prec_training.py +++ b/source/tests/test_mixed_prec_training.py @@ -2,10 +2,11 @@ import numpy as np import unittest import subprocess as sp +from packaging.version import Version from deepmd.infer import DeepPot # from deepmd.entrypoints.compress import compress -from common import j_loader, tests_path +from common import j_loader, tests_path, TF_VERSION def _file_delete(file) : @@ -38,8 +39,11 @@ def setUp(self): json.dump(jdata, fp, indent=4) def test_training(self): - ret = _subprocess_run("dp train " + self.INPUT) - np.testing.assert_equal(ret, 0, 'DP train failed!') + _TF_VERSION = Version(TF_VERSION) + # check the TF_VERSION, when TF < 1.12, mixed precision is not allowed + if _TF_VERSION >= Version('1.12.0'): + ret = _subprocess_run("dp train " + self.INPUT) + np.testing.assert_equal(ret, 0, 'DP train failed!') def tearDown(self): _file_delete(self.INPUT) From 813b163c724e8d1e82c9dcb3d6f9411412917b7d Mon Sep 17 00:00:00 2001 From: denghuilu Date: Fri, 11 Feb 2022 14:11:06 +0800 Subject: [PATCH 3/6] fix UT error --- source/tests/test_mixed_prec_training.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/tests/test_mixed_prec_training.py b/source/tests/test_mixed_prec_training.py index 975cdfbb51..03153213df 100644 --- a/source/tests/test_mixed_prec_training.py +++ b/source/tests/test_mixed_prec_training.py @@ -6,7 +6,8 @@ from deepmd.infer import DeepPot # from deepmd.entrypoints.compress import compress -from common import j_loader, tests_path, TF_VERSION +from common import j_loader, tests_path +from deepmd.env import TF_VERSION def _file_delete(file) : From 7c51aeab62a9af9e85caded505cf267067949c47 Mon Sep 17 00:00:00 2001 From: denghuilu Date: Fri, 11 Feb 2022 14:56:18 +0800 Subject: [PATCH 4/6] fix UT error --- source/tests/test_mixed_prec_training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/tests/test_mixed_prec_training.py b/source/tests/test_mixed_prec_training.py index 03153213df..28a1d485f7 100644 --- a/source/tests/test_mixed_prec_training.py +++ b/source/tests/test_mixed_prec_training.py @@ -42,7 +42,7 @@ def setUp(self): def test_training(self): _TF_VERSION = Version(TF_VERSION) # check the TF_VERSION, when TF < 1.12, mixed precision is not allowed - if _TF_VERSION >= Version('1.12.0'): + if _TF_VERSION >= Version('1.14.0'): ret = _subprocess_run("dp train " + self.INPUT) np.testing.assert_equal(ret, 0, 'DP train failed!') From a731e596fa02106b08700fa2f1a33b0451544198 Mon Sep 17 00:00:00 2001 From: denghuilu Date: Fri, 11 Feb 2022 16:01:21 +0800 Subject: [PATCH 5/6] fix UT error --- deepmd/train/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index 21f078b763..2b3f8a249c 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -372,7 +372,7 @@ def _build_training(self): if self.mixed_prec is not None: _TF_VERSION = Version(TF_VERSION) # check the TF_VERSION, when TF < 1.12, mixed precision is not allowed - if _TF_VERSION < Version('1.12.0'): + if _TF_VERSION < Version('1.14.0'): raise RuntimeError("TensorFlow version %s is not compatible with the mixed precision setting. Please consider upgrading your TF version!" % TF_VERSION) elif _TF_VERSION < Version('2.4.0'): optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer) From 22be0f516601f9ea47f81ef6002bdedc5450ab8a Mon Sep 17 00:00:00 2001 From: denghuilu Date: Mon, 14 Feb 2022 09:36:01 +0800 Subject: [PATCH 6/6] address comment --- deepmd/descriptor/se_a.py | 2 +- deepmd/utils/network.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py index fd47b139f2..cf218309bd 100644 --- a/deepmd/descriptor/se_a.py +++ b/deepmd/descriptor/se_a.py @@ -738,7 +738,7 @@ def _filter_lower( # we can safely return the final xyz_scatter filled with zero directly return tf.cast(tf.fill((natom, 4, outputs_size[-1]), 0.), self.filter_precision) # natom x nei_type_i x out_size - xyz_scatter = tf.cast(tf.reshape(xyz_scatter, (-1, shape_i[1]//4, outputs_size[-1])), self.filter_precision) + xyz_scatter = tf.reshape(xyz_scatter, (-1, shape_i[1]//4, outputs_size[-1])) # When using tf.reshape(inputs_i, [-1, shape_i[1]//4, 4]) below # [588 24] -> [588 6 4] correct # but if sel is zero diff --git a/deepmd/utils/network.py b/deepmd/utils/network.py index 57dd90f893..befd571f24 100644 --- a/deepmd/utils/network.py +++ b/deepmd/utils/network.py @@ -79,15 +79,13 @@ def one_layer(inputs, if use_timestep : if mixed_prec is not None and not final_layer: idt = tf.cast(idt, get_precision(mixed_prec['compute_prec'])) - return tf.reshape(activation_fn(hidden), [-1, outputs_size]) * idt + hidden = tf.reshape(activation_fn(hidden), [-1, outputs_size]) * idt else : - return tf.reshape(activation_fn(hidden), [-1, outputs_size]) - else: - if useBN: - None - # return self._batch_norm(hidden, name=name+'_normalization', reuse=reuse) - else: - return hidden + hidden = tf.reshape(activation_fn(hidden), [-1, outputs_size]) + + if mixed_prec is not None: + hidden = tf.cast(hidden, get_precision(mixed_prec['output_prec'])) + return hidden def embedding_net_rand_seed_shift( @@ -237,6 +235,8 @@ def embedding_net(xx, xx = tf.concat([xx,xx], 1) + hidden else: xx = hidden + if mixed_prec is not None: + xx = tf.cast(xx, get_precision(mixed_prec['output_prec'])) return xx def variable_summaries(var: tf.Variable, name: str):