From 3dda8e1ea23637d8c2ed165b2e953b56cb30d30f Mon Sep 17 00:00:00 2001 From: HuangJiameng <105633685+HuangJiameng@users.noreply.github.com> Date: Thu, 2 Feb 2023 20:02:40 +0800 Subject: [PATCH 01/27] Support learning rate dict in multi-task mode. --- deepmd/train/trainer.py | 157 +++++++++++++++++++++++++++------------ deepmd/utils/argcheck.py | 19 +++++ 2 files changed, 129 insertions(+), 47 deletions(-) diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index 1ca69641b3..3bddac41a4 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -216,21 +216,42 @@ def fitting_net_init(fitting_type_, descrpt_type_, params): ) # learning rate - lr_param = j_must_have(jdata, 'learning_rate') - scale_by_worker = lr_param.get('scale_by_worker', 'linear') - if scale_by_worker == 'linear': - self.scale_lr_coef = float(self.run_opt.world_size) - elif scale_by_worker == 'sqrt': - self.scale_lr_coef = np.sqrt(self.run_opt.world_size).real + if not self.multi_task_mode: + lr_param = j_must_have(jdata, 'learning_rate') + scale_by_worker = lr_param.get('scale_by_worker', 'linear') + if scale_by_worker == 'linear': + self.scale_lr_coef = float(self.run_opt.world_size) + elif scale_by_worker == 'sqrt': + self.scale_lr_coef = np.sqrt(self.run_opt.world_size).real + else: + self.scale_lr_coef = 1. + lr_type = lr_param.get('type', 'exp') + if lr_type == 'exp': + self.lr = LearningRateExp(lr_param['start_lr'], + lr_param['stop_lr'], + lr_param['decay_steps']) + else : + raise RuntimeError('unknown learning_rate type ' + lr_type) else: - self.scale_lr_coef = 1. - lr_type = lr_param.get('type', 'exp') - if lr_type == 'exp': - self.lr = LearningRateExp(lr_param['start_lr'], - lr_param['stop_lr'], - lr_param['decay_steps']) - else : - raise RuntimeError('unknown learning_rate type ' + lr_type) + self.lr_dict = {} + self.scale_lr_coef_dict = {} + lr_param_dict = jdata.get('learning_rate_dict', {}) + for fitting_key in self.fitting_type_dict: + lr_param = lr_param_dict.get(fitting_key, {}) + scale_by_worker = lr_param.get('scale_by_worker', 'linear') + if scale_by_worker == 'linear': + self.scale_lr_coef_dict[fitting_key] = float(self.run_opt.world_size) + elif scale_by_worker == 'sqrt': + self.scale_lr_coef_dict[fitting_key] = np.sqrt(self.run_opt.world_size).real + else: + self.scale_lr_coef_dict[fitting_key] = 1. 
+ lr_type = lr_param.get('type', 'exp') + if lr_type == 'exp': + self.lr_dict[fitting_key] = LearningRateExp(lr_param['start_lr'], + lr_param['stop_lr'], + lr_param['decay_steps']) + else : + raise RuntimeError('unknown learning_rate type ' + lr_type) # loss # infer loss type by fitting_type @@ -283,7 +304,7 @@ def loss_init(_loss_param, _fitting_type, _fitting, _lr): for fitting_key in self.fitting_type_dict: loss_param = loss_param_dict.get(fitting_key, {}) self.loss_dict[fitting_key] = loss_init(loss_param, self.fitting_type_dict[fitting_key], - self.fitting_dict[fitting_key], self.lr) + self.fitting_dict[fitting_key], self.lr_dict[fitting_key]) # training tr_data = jdata['training'] @@ -449,7 +470,13 @@ def build (self, def _build_lr(self): self._extra_train_ops = [] self.global_step = tf.train.get_or_create_global_step() - self.learning_rate = self.lr.build(self.global_step, self.stop_batch) + if not self.multi_task_mode: + self.learning_rate = self.lr.build(self.global_step, self.stop_batch) + else: + self.learning_rate_dict = {} + for fitting_key in self.fitting_type_dict: + self.learning_rate_dict[fitting_key] = self.lr_dict[fitting_key].build(self.global_step, self.stop_batch) + log.info("built lr") def _build_network(self, data, suffix=""): @@ -493,7 +520,7 @@ def _build_network(self, data, suffix=""): self.l2_l, self.l2_more = {}, {} for fitting_key in self.fitting_type_dict: self.l2_l[fitting_key], self.l2_more[fitting_key]\ - = self.loss_dict[fitting_key].build(self.learning_rate, + = self.loss_dict[fitting_key].build(self.learning_rate_dict[fitting_key], self.place_holders['natoms_vec'], self.model_pred[fitting_key], self.place_holders, @@ -506,25 +533,26 @@ def _build_network(self, data, suffix=""): def _build_training(self): trainable_variables = tf.trainable_variables() - if self.run_opt.is_distrib: - if self.scale_lr_coef > 1.: - log.info('Scale learning rate by coef: %f', self.scale_lr_coef) - optimizer = tf.train.AdamOptimizer(self.learning_rate*self.scale_lr_coef) - else: - optimizer = tf.train.AdamOptimizer(self.learning_rate) - optimizer = self.run_opt._HVD.DistributedOptimizer(optimizer) - else: - optimizer = tf.train.AdamOptimizer(learning_rate = self.learning_rate) - if self.mixed_prec is not None: - _TF_VERSION = Version(TF_VERSION) - # check the TF_VERSION, when TF < 1.12, mixed precision is not allowed - if _TF_VERSION < Version('1.14.0'): - raise RuntimeError("TensorFlow version %s is not compatible with the mixed precision setting. Please consider upgrading your TF version!" % TF_VERSION) - elif _TF_VERSION < Version('2.4.0'): - optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer) - else: - optimizer = tf.mixed_precision.enable_mixed_precision_graph_rewrite(optimizer) if not self.multi_task_mode: + if self.run_opt.is_distrib: + if self.scale_lr_coef > 1.: + log.info('Scale learning rate by coef: %f', self.scale_lr_coef) + optimizer = tf.train.AdamOptimizer(self.learning_rate*self.scale_lr_coef) + else: + optimizer = tf.train.AdamOptimizer(self.learning_rate) + optimizer = self.run_opt._HVD.DistributedOptimizer(optimizer) + else: + optimizer = tf.train.AdamOptimizer(learning_rate = self.learning_rate) + if self.mixed_prec is not None: + _TF_VERSION = Version(TF_VERSION) + # check the TF_VERSION, when TF < 1.12, mixed precision is not allowed + if _TF_VERSION < Version('1.14.0'): + raise RuntimeError("TensorFlow version %s is not compatible with the mixed precision setting. Please consider upgrading your TF version!" 
% TF_VERSION) + elif _TF_VERSION < Version('2.4.0'): + optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer) + else: + optimizer = tf.mixed_precision.enable_mixed_precision_graph_rewrite(optimizer) + apply_op = optimizer.minimize(loss=self.l2_l, global_step=self.global_step, var_list=trainable_variables, @@ -533,7 +561,25 @@ def _build_training(self): self.train_op = tf.group(*train_ops) else: self.train_op = {} - for fitting_key in self.fitting_type_dict: + for fitting_key in self.fitting_type_dict: + if self.run_opt.is_distrib: + if self.scale_lr_coef_dict[fitting_key] > 1.: + log.info('Scale learning rate by coef: %f', self.scale_lr_coef_dict[fitting_key]) + optimizer = tf.train.AdamOptimizer(self.learning_rate_dict[fitting_key]*self.scale_lr_coef_dict[fitting_key]) + else: + optimizer = tf.train.AdamOptimizer(self.learning_rate_dict[fitting_key]) + optimizer = self.run_opt._HVD.DistributedOptimizer(optimizer) + else: + optimizer = tf.train.AdamOptimizer(learning_rate = self.learning_rate_dict[fitting_key]) + if self.mixed_prec is not None: + _TF_VERSION = Version(TF_VERSION) + # check the TF_VERSION, when TF < 1.12, mixed precision is not allowed + if _TF_VERSION < Version('1.14.0'): + raise RuntimeError("TensorFlow version %s is not compatible with the mixed precision setting. Please consider upgrading your TF version!" % TF_VERSION) + elif _TF_VERSION < Version('2.4.0'): + optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer) + else: + optimizer = tf.mixed_precision.enable_mixed_precision_graph_rewrite(optimizer) apply_op = optimizer.minimize(loss=self.l2_l[fitting_key], global_step=self.global_step, var_list=trainable_variables, @@ -612,13 +658,25 @@ def train (self, train_data = None, valid_data=None) : cur_batch = run_sess(self.sess, self.global_step) is_first_step = True self.cur_batch = cur_batch - log.info("start training at lr %.2e (== %.2e), decay_step %d, decay_rate %f, final lr will be %.2e" % - (run_sess(self.sess, self.learning_rate), - self.lr.value(cur_batch), - self.lr.decay_steps_, - self.lr.decay_rate_, - self.lr.value(stop_batch)) - ) + if not self.multi_task_mode: + log.info("start training at lr %.2e (== %.2e), decay_step %d, decay_rate %f, final lr will be %.2e" % + (run_sess(self.sess, self.learning_rate), + self.lr.value(cur_batch), + self.lr.decay_steps_, + self.lr.decay_rate_, + self.lr.value(stop_batch)) + ) + else: + for fitting_key in self.fitting_type_dict: + log.info("%s: start training at lr %.2e (== %.2e), decay_step %d, decay_rate %f, final lr will be %.2e" % + (fitting_key, + run_sess(self.sess, self.learning_rate_dict[fitting_key]), + self.lr_dict[fitting_key].value(cur_batch), + self.lr_dict[fitting_key].decay_steps_, + self.lr_dict[fitting_key].decay_rate_, + self.lr_dict[fitting_key].value(stop_batch)) + ) + prf_options = None prf_run_metadata = None @@ -678,7 +736,7 @@ def train (self, train_data = None, valid_data=None) : valid_batches[fitting_key] = [valid_data[fitting_key].get_batch() for ii in range(self.valid_numb_batch_dict[fitting_key])] \ if fitting_key in valid_data else None - self.valid_on_the_fly(fp, train_batches, valid_batches, print_header=True) + self.valid_on_the_fly(fp, train_batches, valid_batches, print_header=True, fitting_key=fitting_key) is_first_step = False if self.timing_in_training: tic = time.time() @@ -713,7 +771,7 @@ def train (self, train_data = None, valid_data=None) : valid_batches[fitting_key] = [valid_data[fitting_key].get_batch() for ii in 
range(self.valid_numb_batch_dict[fitting_key])] \ if fitting_key in valid_data else None - self.valid_on_the_fly(fp, train_batches, valid_batches) + self.valid_on_the_fly(fp, train_batches, valid_batches, fitting_key=fitting_key) if self.timing_in_training: toc = time.time() test_time = toc - tic @@ -792,12 +850,17 @@ def valid_on_the_fly(self, fp, train_batches, valid_batches, - print_header=False): + print_header=False, + fitting_key=None): train_results = self.get_evaluation_results(train_batches) valid_results = self.get_evaluation_results(valid_batches) cur_batch = self.cur_batch - current_lr = run_sess(self.sess, self.learning_rate) + if not self.multi_task_mode: + current_lr = run_sess(self.sess, self.learning_rate) + else: + assert fitting_key is not None, "Fitting key must be assigned in validation!" + current_lr = run_sess(self.sess, self.learning_rate_dict[fitting_key]) if print_header: self.print_header(fp, train_results, valid_results, self.multi_task_mode) self.print_on_training(fp, train_results, valid_results, cur_batch, current_lr, self.multi_task_mode) diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index a6d176750d..4b5d898f65 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -807,10 +807,12 @@ def normalize_multi_task(data): single_training_data = "training_data" in data["training"].keys() single_valid_data = "validation_data" in data["training"].keys() single_loss = "loss" in data.keys() + single_learning_rate = "learning_rate" in data.keys() multi_fitting_net = "fitting_net_dict" in data["model"].keys() multi_training_data = "data_dict" in data["training"].keys() multi_loss = "loss_dict" in data.keys() multi_fitting_weight = "fitting_weight" in data["training"].keys() + multi_learning_rate = "learning_rate_dict" in data.keys() assert (single_fitting_net == single_training_data) and \ (multi_fitting_net == multi_training_data), \ "In single-task mode, 'model/fitting_net' and 'training/training_data' must be defined at the same time! " \ @@ -824,11 +826,14 @@ def normalize_multi_task(data): assert not single_valid_data, "In multi-task mode, 'training/validation_data' should not appear " \ "outside 'training/data_dict'! Please check your input script." assert not single_loss, "In multi-task mode, please use 'model/loss_dict' in stead of 'model/loss'! " + assert not single_learning_rate, "In multi-task mode, please use 'model/leaning_rate_dict' in stead of 'model/learning_rate'! " assert "type_map" in data["model"], "In multi-task mode, 'model/type_map' must be defined! " data["model"]["fitting_net_dict"] = normalize_fitting_net_dict(data["model"]["fitting_net_dict"]) data["training"]["data_dict"] = normalize_data_dict(data["training"]["data_dict"]) data["loss_dict"] = normalize_loss_dict(data["model"]["fitting_net_dict"].keys(), data["loss_dict"]) if multi_loss else {} + data["learning_rate_dict"] = normalize_learning_rate_dict(data["model"]["fitting_net_dict"].keys(), + data["learning_rate_dict"]) if multi_learning_rate else {} fitting_weight = data["training"]["fitting_weight"] if multi_fitting_weight else None data["training"]["fitting_weight"] = \ normalize_fitting_weight(data["model"]["fitting_net_dict"].keys(), @@ -836,6 +841,7 @@ def normalize_multi_task(data): fitting_weight=fitting_weight) else: assert not multi_loss, "In single-task mode, please use 'model/loss' in stead of 'model/loss_dict'! " + assert not multi_learning_rate, "In single-task mode, please use 'model/learning_rate' in stead of 'model/learning_rate_dict'! 
" return data @@ -873,6 +879,19 @@ def normalize_loss_dict(fitting_keys, loss_dict): new_dict[item] = data return new_dict +def normalize_learning_rate_dict(fitting_keys, learning_rate_dict): + # check the learning_rate dict + failed_learning_rate_keys = [item for item in learning_rate_dict if item not in fitting_keys] + assert not failed_learning_rate_keys, \ + "Learning rate dict key(s) {} not have corresponding fitting keys in {}! ".format( + str(failed_learning_rate_keys), str(list(fitting_keys))) + new_dict = {} + base = Argument('base', dict, [], [learning_rate_variant_type_args()], doc="") + for item in learning_rate_dict: + data = base.normalize_value(learning_rate_dict[item], trim_pattern="_*") + base.check_value(data, strict=True) + new_dict[item] = data + return new_dict def normalize_fitting_weight(fitting_keys, data_keys, fitting_weight=None): # check the mapping From e0e7bc15921cb4e7b59bd61826f0cefffb48eb1b Mon Sep 17 00:00:00 2001 From: HuangJiameng <105633685+HuangJiameng@users.noreply.github.com> Date: Mon, 6 Feb 2023 15:24:48 +0800 Subject: [PATCH 02/27] update function learning_rate_dict_args --- deepmd/utils/argcheck.py | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index 13c351731f..3b1396a5a6 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -522,8 +522,16 @@ def learning_rate_args(): return Argument("learning_rate", dict, [Argument("scale_by_worker", str, optional=True, default='linear', doc=doc_scale_by_worker)], [learning_rate_variant_type_args()], + optional = True, doc = doc_lr) +def learning_rate_dict_args(): + doc_learning_rate_dict = 'The dictionary of definitions of learning rates in multi-task mode. 
' \ + 'Each learning_rate_dict[fitting_key], with user-defined name `fitting_key` in `model/fitting_net_dict`, is the single definition of learning rate.\n' + ca = Argument('learning_rate_dict', dict, [], [], + optional = True, + doc = doc_learning_rate_dict) + return ca # --- Loss configurations: --- # def start_pref(item): @@ -762,6 +770,7 @@ def gen_doc(*, make_anchor=True, make_link=True, **kwargs): make_anchor = True ma = model_args() lra = learning_rate_args() + lrda = learning_rate_dict_args() la = loss_args() lda = loss_dict_args() ta = training_args() @@ -771,6 +780,7 @@ def gen_doc(*, make_anchor=True, make_link=True, **kwargs): ptr.append(la.gen_doc(make_anchor=make_anchor, make_link=make_link, **kwargs)) ptr.append(lda.gen_doc(make_anchor=make_anchor, make_link=make_link, **kwargs)) ptr.append(lra.gen_doc(make_anchor=make_anchor, make_link=make_link, **kwargs)) + ptr.append(lrda.gen_doc(make_anchor=make_anchor, make_link=make_link, **kwargs)) ptr.append(ta.gen_doc(make_anchor=make_anchor, make_link=make_link, **kwargs)) ptr.append(nvnmda.gen_doc(make_anchor=make_anchor, make_link=make_link, **kwargs)) @@ -815,10 +825,12 @@ def normalize_multi_task(data): single_training_data = "training_data" in data["training"].keys() single_valid_data = "validation_data" in data["training"].keys() single_loss = "loss" in data.keys() + single_learning_rate = "learning_rate" in data.keys() multi_fitting_net = "fitting_net_dict" in data["model"].keys() multi_training_data = "data_dict" in data["training"].keys() multi_loss = "loss_dict" in data.keys() multi_fitting_weight = "fitting_weight" in data["training"].keys() + multi_learning_rate = "learning_rate_dict" in data.keys() assert (single_fitting_net == single_training_data) and \ (multi_fitting_net == multi_training_data), \ "In single-task mode, 'model/fitting_net' and 'training/training_data' must be defined at the same time! " \ @@ -832,11 +844,14 @@ def normalize_multi_task(data): assert not single_valid_data, "In multi-task mode, 'training/validation_data' should not appear " \ "outside 'training/data_dict'! Please check your input script." assert not single_loss, "In multi-task mode, please use 'model/loss_dict' in stead of 'model/loss'! " + assert not single_learning_rate, "In multi-task mode, please use 'model/leaning_rate_dict' in stead of 'model/learning_rate'! " assert "type_map" in data["model"], "In multi-task mode, 'model/type_map' must be defined! " data["model"]["fitting_net_dict"] = normalize_fitting_net_dict(data["model"]["fitting_net_dict"]) data["training"]["data_dict"] = normalize_data_dict(data["training"]["data_dict"]) data["loss_dict"] = normalize_loss_dict(data["model"]["fitting_net_dict"].keys(), data["loss_dict"]) if multi_loss else {} + data["learning_rate_dict"] = normalize_learning_rate_dict(data["model"]["fitting_net_dict"].keys(), + data["learning_rate_dict"]) if multi_learning_rate else {} fitting_weight = data["training"]["fitting_weight"] if multi_fitting_weight else None data["training"]["fitting_weight"] = \ normalize_fitting_weight(data["model"]["fitting_net_dict"].keys(), @@ -844,6 +859,7 @@ def normalize_multi_task(data): fitting_weight=fitting_weight) else: assert not multi_loss, "In single-task mode, please use 'model/loss' in stead of 'model/loss_dict'! " + assert not multi_learning_rate, "In single-task mode, please use 'model/learning_rate' in stead of 'model/learning_rate_dict'! 
" return data @@ -881,6 +897,19 @@ def normalize_loss_dict(fitting_keys, loss_dict): new_dict[item] = data return new_dict +def normalize_learning_rate_dict(fitting_keys, learning_rate_dict): + # check the learning_rate dict + failed_learning_rate_keys = [item for item in learning_rate_dict if item not in fitting_keys] + assert not failed_learning_rate_keys, \ + "Learning rate dict key(s) {} not have corresponding fitting keys in {}! ".format( + str(failed_learning_rate_keys), str(list(fitting_keys))) + new_dict = {} + base = Argument('base', dict, [], [learning_rate_variant_type_args()], doc="") + for item in learning_rate_dict: + data = base.normalize_value(learning_rate_dict[item], trim_pattern="_*") + base.check_value(data, strict=True) + new_dict[item] = data + return new_dict def normalize_fitting_weight(fitting_keys, data_keys, fitting_weight=None): # check the mapping @@ -940,12 +969,13 @@ def normalize(data): data = normalize_multi_task(data) ma = model_args() lra = learning_rate_args() + lrda = learning_rate_dict_args() la = loss_args() lda = loss_dict_args() ta = training_args() nvnmda = nvnmd_args() - base = Argument("base", dict, [ma, lra, la, lda, ta, nvnmda]) + base = Argument("base", dict, [ma, lra, lrda, la, lda, ta, nvnmda]) data = base.normalize_value(data, trim_pattern="_*") base.check_value(data, strict=True) From 035278db3d7778f3d97aa1f3c2c33406823eca14 Mon Sep 17 00:00:00 2001 From: HuangJiameng <105633685+HuangJiameng@users.noreply.github.com> Date: Wed, 22 Feb 2023 10:59:32 +0800 Subject: [PATCH 03/27] print learning rate in multi-task mode --- deepmd/train/trainer.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index 3f84196a63..0104fe1a11 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -868,7 +868,11 @@ def valid_on_the_fly(self, current_lr = run_sess(self.sess, self.learning_rate_dict[fitting_key]) if print_header: self.print_header(fp, train_results, valid_results, self.multi_task_mode) - self.print_on_training(fp, train_results, valid_results, cur_batch, current_lr, self.multi_task_mode) + if not self.multi_task_mode: + self.print_on_training(fp, train_results, valid_results, cur_batch, current_lr, self.multi_task_mode) + else: + assert fitting_key is not None, "Fitting key must be assigned when printing learning rate!" 
+ self.print_on_training(fp, train_results, valid_results, cur_batch, current_lr, self.multi_task_mode, fitting_key) @staticmethod def print_header(fp, train_results, valid_results, multi_task_mode=False): @@ -883,6 +887,7 @@ def print_header(fp, train_results, valid_results, multi_task_mode=False): prop_fmt = ' %11s' for k in train_results.keys(): print_str += prop_fmt % (k + '_trn') + print_str += ' %8s\n' % (k + '_lr') else: for fitting_key in train_results: if valid_results[fitting_key] is not None: @@ -893,12 +898,12 @@ def print_header(fp, train_results, valid_results, multi_task_mode=False): prop_fmt = ' %11s' for k in train_results[fitting_key].keys(): print_str += prop_fmt % (k + '_trn') - print_str += ' %8s\n' % 'lr' + print_str += ' %8s\n' % (k + '_lr') fp.write(print_str) fp.flush() @staticmethod - def print_on_training(fp, train_results, valid_results, cur_batch, cur_lr, multi_task_mode=False): + def print_on_training(fp, train_results, valid_results, cur_batch, cur_lr, multi_task_mode=False, cur_fitting_key = None): print_str = '' print_str += "%7d" % cur_batch if not multi_task_mode: @@ -922,7 +927,10 @@ def print_on_training(fp, train_results, valid_results, cur_batch, cur_lr, multi prop_fmt = " %11.2e" for k in train_results[fitting_key].keys(): print_str += prop_fmt % (train_results[fitting_key][k]) - print_str += " %8.1e\n" % cur_lr + if fitting_key == cur_fitting_key: + print_str += " %8.1e\n" % cur_lr + else: + print_str += " %8.1e\n" % 0 fp.write(print_str) fp.flush() From 677c00047bacb9c2366e6856b7c0211549b6d32e Mon Sep 17 00:00:00 2001 From: HuangJiameng <105633685+HuangJiameng@users.noreply.github.com> Date: Wed, 22 Feb 2023 15:58:52 +0800 Subject: [PATCH 04/27] Update trainer.py change fitting_key to fitting_key_ii to avoid overwriting --- deepmd/train/trainer.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index 0104fe1a11..61890bc8f7 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -736,11 +736,12 @@ def train (self, train_data = None, valid_data=None) : train_batches = {} valid_batches = {} # valid_numb_batch_dict - for fitting_key in train_data: - train_batches[fitting_key] = [train_data[fitting_key].get_batch()] - valid_batches[fitting_key] = [valid_data[fitting_key].get_batch() - for ii in range(self.valid_numb_batch_dict[fitting_key])] \ - if fitting_key in valid_data else None + for fitting_key_ii in train_data: + # enumerate fitting key as fitting_key_ii + train_batches[fitting_key_ii] = [train_data[fitting_key_ii].get_batch()] + valid_batches[fitting_key_ii] = [valid_data[fitting_key_ii].get_batch() + for ii in range(self.valid_numb_batch_dict[fitting_key_ii])] \ + if fitting_key_ii in valid_data else None self.valid_on_the_fly(fp, train_batches, valid_batches, print_header=True, fitting_key=fitting_key) is_first_step = False From 50503f5aeb935de9e7be41f7128d1ad6f6376e1b Mon Sep 17 00:00:00 2001 From: HuangJiameng <105633685+HuangJiameng@users.noreply.github.com> Date: Wed, 22 Feb 2023 16:11:46 +0800 Subject: [PATCH 05/27] enumerate fitting_key as fitting_key_ii to avoid overwriting --- deepmd/train/trainer.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index 61890bc8f7..c2df3b24b8 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -772,11 +772,11 @@ def train (self, train_data = None, valid_data=None) : else: train_batches = {} valid_batches = {} - 
for fitting_key in train_data: - train_batches[fitting_key] = [train_data[fitting_key].get_batch()] - valid_batches[fitting_key] = [valid_data[fitting_key].get_batch() - for ii in range(self.valid_numb_batch_dict[fitting_key])] \ - if fitting_key in valid_data else None + for fitting_key_ii in train_data: + train_batches[fitting_key_ii] = [train_data[fitting_key_ii].get_batch()] + valid_batches[fitting_key_ii] = [valid_data[fitting_key_ii].get_batch() + for ii in range(self.valid_numb_batch_dict[fitting_key_ii])] \ + if fitting_key_ii in valid_data else None self.valid_on_the_fly(fp, train_batches, valid_batches, fitting_key=fitting_key) if self.timing_in_training: toc = time.time() @@ -899,7 +899,7 @@ def print_header(fp, train_results, valid_results, multi_task_mode=False): prop_fmt = ' %11s' for k in train_results[fitting_key].keys(): print_str += prop_fmt % (k + '_trn') - print_str += ' %8s\n' % (k + '_lr') + print_str += ' %8s\n' % (fitting_key + '_lr') fp.write(print_str) fp.flush() From 306c4d0c96526b9aa7d4768152610e0b5f7230ae Mon Sep 17 00:00:00 2001 From: HuangJiameng <105633685+HuangJiameng@users.noreply.github.com> Date: Sun, 26 Feb 2023 15:20:55 +0800 Subject: [PATCH 06/27] fix print_on_training for lr_dict --- deepmd/train/trainer.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index c2df3b24b8..c4df558333 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -866,14 +866,18 @@ def valid_on_the_fly(self, current_lr = run_sess(self.sess, self.learning_rate) else: assert fitting_key is not None, "Fitting key must be assigned in validation!" - current_lr = run_sess(self.sess, self.learning_rate_dict[fitting_key]) + current_lr = None + # current_lr can be used as the learning rate of descriptor in the future + current_lr_dict = {} + for fitting_key_ii in train_batches: + current_lr_dict[fitting_key_ii] = run_sess(self.sess, self.learning_rate_dict[fitting_key_ii]) if print_header: self.print_header(fp, train_results, valid_results, self.multi_task_mode) if not self.multi_task_mode: self.print_on_training(fp, train_results, valid_results, cur_batch, current_lr, self.multi_task_mode) else: assert fitting_key is not None, "Fitting key must be assigned when printing learning rate!" 
- self.print_on_training(fp, train_results, valid_results, cur_batch, current_lr, self.multi_task_mode, fitting_key) + self.print_on_training(fp, train_results, valid_results, cur_batch, current_lr, self.multi_task_mode, current_lr_dict) @staticmethod def print_header(fp, train_results, valid_results, multi_task_mode=False): @@ -904,7 +908,7 @@ def print_header(fp, train_results, valid_results, multi_task_mode=False): fp.flush() @staticmethod - def print_on_training(fp, train_results, valid_results, cur_batch, cur_lr, multi_task_mode=False, cur_fitting_key = None): + def print_on_training(fp, train_results, valid_results, cur_batch, cur_lr, multi_task_mode=False, cur_lr_dict=None): print_str = '' print_str += "%7d" % cur_batch if not multi_task_mode: @@ -928,10 +932,7 @@ def print_on_training(fp, train_results, valid_results, cur_batch, cur_lr, multi prop_fmt = " %11.2e" for k in train_results[fitting_key].keys(): print_str += prop_fmt % (train_results[fitting_key][k]) - if fitting_key == cur_fitting_key: - print_str += " %8.1e\n" % cur_lr - else: - print_str += " %8.1e\n" % 0 + print_str += " %8.1e\n" % cur_lr_dict[fitting_key] fp.write(print_str) fp.flush() From 1b95621422620626ae71985e85a1d3490b57d202 Mon Sep 17 00:00:00 2001 From: HuangJiameng <105633685+HuangJiameng@users.noreply.github.com> Date: Sun, 26 Feb 2023 15:21:20 +0800 Subject: [PATCH 07/27] update unittest for lr_dict --- source/tests/water_multi.json | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/source/tests/water_multi.json b/source/tests/water_multi.json index 6202e8ebd2..24817aab6d 100644 --- a/source/tests/water_multi.json +++ b/source/tests/water_multi.json @@ -32,12 +32,22 @@ } } }, - "learning_rate" :{ - "type": "exp", - "start_lr": 0.001, - "decay_steps": 5000, - "decay_rate": 0.95, - "_comment": "that's all" + "learning_rate_dict" : + { + "water_ener": { + "type": "exp", + "start_lr": 0.001, + "decay_steps": 5000, + "decay_rate": 0.95, + "_comment": "that's all" + }, + "water_dipole": { + "type": "exp", + "start_lr": 0.001, + "decay_steps": 5000, + "decay_rate": 0.95, + "_comment": "that's all" + } }, "loss_dict" : { From 964fe47cdf753e115b0672940b92f5d2c05a5701 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 26 Feb 2023 07:47:12 +0000 Subject: [PATCH 08/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/train/trainer.py | 269 ++++++++++++++++++++++------------ deepmd/utils/argcheck.py | 2 +- source/tests/water_multi.json | 34 ++--- 3 files changed, 195 insertions(+), 110 deletions(-) diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index caaf880e0d..558681c177 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -270,41 +270,47 @@ def fitting_net_init(fitting_type_, descrpt_type_, params): # learning rate if not self.multi_task_mode: - lr_param = j_must_have(jdata, 'learning_rate') - scale_by_worker = lr_param.get('scale_by_worker', 'linear') - if scale_by_worker == 'linear': + lr_param = j_must_have(jdata, "learning_rate") + scale_by_worker = lr_param.get("scale_by_worker", "linear") + if scale_by_worker == "linear": self.scale_lr_coef = float(self.run_opt.world_size) - elif scale_by_worker == 'sqrt': + elif scale_by_worker == "sqrt": self.scale_lr_coef = np.sqrt(self.run_opt.world_size).real else: - self.scale_lr_coef = 1. 
- lr_type = lr_param.get('type', 'exp') - if lr_type == 'exp': - self.lr = LearningRateExp(lr_param['start_lr'], - lr_param['stop_lr'], - lr_param['decay_steps']) - else : - raise RuntimeError('unknown learning_rate type ' + lr_type) + self.scale_lr_coef = 1.0 + lr_type = lr_param.get("type", "exp") + if lr_type == "exp": + self.lr = LearningRateExp( + lr_param["start_lr"], lr_param["stop_lr"], lr_param["decay_steps"] + ) + else: + raise RuntimeError("unknown learning_rate type " + lr_type) else: self.lr_dict = {} self.scale_lr_coef_dict = {} - lr_param_dict = jdata.get('learning_rate_dict', {}) + lr_param_dict = jdata.get("learning_rate_dict", {}) for fitting_key in self.fitting_type_dict: lr_param = lr_param_dict.get(fitting_key, {}) - scale_by_worker = lr_param.get('scale_by_worker', 'linear') - if scale_by_worker == 'linear': - self.scale_lr_coef_dict[fitting_key] = float(self.run_opt.world_size) - elif scale_by_worker == 'sqrt': - self.scale_lr_coef_dict[fitting_key] = np.sqrt(self.run_opt.world_size).real + scale_by_worker = lr_param.get("scale_by_worker", "linear") + if scale_by_worker == "linear": + self.scale_lr_coef_dict[fitting_key] = float( + self.run_opt.world_size + ) + elif scale_by_worker == "sqrt": + self.scale_lr_coef_dict[fitting_key] = np.sqrt( + self.run_opt.world_size + ).real + else: + self.scale_lr_coef_dict[fitting_key] = 1.0 + lr_type = lr_param.get("type", "exp") + if lr_type == "exp": + self.lr_dict[fitting_key] = LearningRateExp( + lr_param["start_lr"], + lr_param["stop_lr"], + lr_param["decay_steps"], + ) else: - self.scale_lr_coef_dict[fitting_key] = 1. - lr_type = lr_param.get('type', 'exp') - if lr_type == 'exp': - self.lr_dict[fitting_key] = LearningRateExp(lr_param['start_lr'], - lr_param['stop_lr'], - lr_param['decay_steps']) - else : - raise RuntimeError('unknown learning_rate type ' + lr_type) + raise RuntimeError("unknown learning_rate type " + lr_type) # loss # infer loss type by fitting_type @@ -367,9 +373,9 @@ def loss_init(_loss_param, _fitting_type, _fitting, _lr): for fitting_key in self.fitting_type_dict: loss_param = loss_param_dict.get(fitting_key, {}) self.loss_dict[fitting_key] = loss_init( - loss_param, + loss_param, self.fitting_type_dict[fitting_key], - self.fitting_dict[fitting_key], + self.fitting_dict[fitting_key], self.lr_dict[fitting_key], ) @@ -585,8 +591,10 @@ def _build_lr(self): else: self.learning_rate_dict = {} for fitting_key in self.fitting_type_dict: - self.learning_rate_dict[fitting_key] = self.lr_dict[fitting_key].build(self.global_step, self.stop_batch) - + self.learning_rate_dict[fitting_key] = self.lr_dict[fitting_key].build( + self.global_step, self.stop_batch + ) + log.info("built lr") def _build_network(self, data, suffix=""): @@ -644,10 +652,10 @@ def _build_network(self, data, suffix=""): fitting_key ].build( self.learning_rate_dict[fitting_key], - self.place_holders['natoms_vec'], + self.place_holders["natoms_vec"], self.model_pred[fitting_key], self.place_holders, - suffix=fitting_key + suffix=fitting_key, ) if self.mixed_prec is not None: self.l2_l[fitting_key] = tf.cast( @@ -661,63 +669,90 @@ def _build_training(self): trainable_variables = tf.trainable_variables() if not self.multi_task_mode: if self.run_opt.is_distrib: - if self.scale_lr_coef > 1.: - log.info('Scale learning rate by coef: %f', self.scale_lr_coef) - optimizer = tf.train.AdamOptimizer(self.learning_rate*self.scale_lr_coef) + if self.scale_lr_coef > 1.0: + log.info("Scale learning rate by coef: %f", self.scale_lr_coef) + optimizer = 
tf.train.AdamOptimizer( + self.learning_rate * self.scale_lr_coef + ) else: optimizer = tf.train.AdamOptimizer(self.learning_rate) optimizer = self.run_opt._HVD.DistributedOptimizer(optimizer) else: - optimizer = tf.train.AdamOptimizer(learning_rate = self.learning_rate) + optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate) if self.mixed_prec is not None: _TF_VERSION = Version(TF_VERSION) - # check the TF_VERSION, when TF < 1.12, mixed precision is not allowed - if _TF_VERSION < Version('1.14.0'): + # check the TF_VERSION, when TF < 1.12, mixed precision is not allowed + if _TF_VERSION < Version("1.14.0"): raise RuntimeError( "TensorFlow version %s is not compatible with the mixed precision setting. Please consider upgrading your TF version!" % TF_VERSION ) - elif _TF_VERSION < Version('2.4.0'): - optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite( - optimizer + elif _TF_VERSION < Version("2.4.0"): + optimizer = ( + tf.train.experimental.enable_mixed_precision_graph_rewrite( + optimizer + ) ) else: optimizer = tf.mixed_precision.enable_mixed_precision_graph_rewrite( optimizer ) - apply_op = optimizer.minimize(loss=self.l2_l, - global_step=self.global_step, - var_list=trainable_variables, - name='train_step') + apply_op = optimizer.minimize( + loss=self.l2_l, + global_step=self.global_step, + var_list=trainable_variables, + name="train_step", + ) train_ops = [apply_op] + self._extra_train_ops self.train_op = tf.group(*train_ops) else: self.train_op = {} - for fitting_key in self.fitting_type_dict: + for fitting_key in self.fitting_type_dict: if self.run_opt.is_distrib: - if self.scale_lr_coef_dict[fitting_key] > 1.: - log.info('Scale learning rate by coef: %f', self.scale_lr_coef_dict[fitting_key]) - optimizer = tf.train.AdamOptimizer(self.learning_rate_dict[fitting_key]*self.scale_lr_coef_dict[fitting_key]) + if self.scale_lr_coef_dict[fitting_key] > 1.0: + log.info( + "Scale learning rate by coef: %f", + self.scale_lr_coef_dict[fitting_key], + ) + optimizer = tf.train.AdamOptimizer( + self.learning_rate_dict[fitting_key] + * self.scale_lr_coef_dict[fitting_key] + ) else: - optimizer = tf.train.AdamOptimizer(self.learning_rate_dict[fitting_key]) + optimizer = tf.train.AdamOptimizer( + self.learning_rate_dict[fitting_key] + ) optimizer = self.run_opt._HVD.DistributedOptimizer(optimizer) else: - optimizer = tf.train.AdamOptimizer(learning_rate = self.learning_rate_dict[fitting_key]) + optimizer = tf.train.AdamOptimizer( + learning_rate=self.learning_rate_dict[fitting_key] + ) if self.mixed_prec is not None: _TF_VERSION = Version(TF_VERSION) - # check the TF_VERSION, when TF < 1.12, mixed precision is not allowed - if _TF_VERSION < Version('1.14.0'): - raise RuntimeError("TensorFlow version %s is not compatible with the mixed precision setting. Please consider upgrading your TF version!" % TF_VERSION) - elif _TF_VERSION < Version('2.4.0'): - optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer) + # check the TF_VERSION, when TF < 1.12, mixed precision is not allowed + if _TF_VERSION < Version("1.14.0"): + raise RuntimeError( + "TensorFlow version %s is not compatible with the mixed precision setting. Please consider upgrading your TF version!" 
+ % TF_VERSION + ) + elif _TF_VERSION < Version("2.4.0"): + optimizer = ( + tf.train.experimental.enable_mixed_precision_graph_rewrite( + optimizer + ) + ) else: - optimizer = tf.mixed_precision.enable_mixed_precision_graph_rewrite(optimizer) + optimizer = ( + tf.mixed_precision.enable_mixed_precision_graph_rewrite( + optimizer + ) + ) apply_op = optimizer.minimize( loss=self.l2_l[fitting_key], global_step=self.global_step, var_list=trainable_variables, - name='train_step_{}'.format(fitting_key) + name=f"train_step_{fitting_key}", ) train_ops = [apply_op] + self._extra_train_ops self.train_op[fitting_key] = tf.group(*train_ops) @@ -794,27 +829,28 @@ def train(self, train_data=None, valid_data=None): is_first_step = True self.cur_batch = cur_batch if not self.multi_task_mode: - log.info("start training at lr %.2e (== %.2e), decay_step %d, decay_rate %f, final lr will be %.2e" + log.info( + "start training at lr %.2e (== %.2e), decay_step %d, decay_rate %f, final lr will be %.2e" % ( run_sess(self.sess, self.learning_rate), - self.lr.value(cur_batch), + self.lr.value(cur_batch), self.lr.decay_steps_, self.lr.decay_rate_, - self.lr.value(stop_batch) - ) + self.lr.value(stop_batch), + ) ) else: for fitting_key in self.fitting_type_dict: log.info( - "%s: start training at lr %.2e (== %.2e), decay_step %d, decay_rate %f, final lr will be %.2e" + "%s: start training at lr %.2e (== %.2e), decay_step %d, decay_rate %f, final lr will be %.2e" % ( fitting_key, run_sess(self.sess, self.learning_rate_dict[fitting_key]), - self.lr_dict[fitting_key].value(cur_batch), + self.lr_dict[fitting_key].value(cur_batch), self.lr_dict[fitting_key].decay_steps_, self.lr_dict[fitting_key].decay_rate_, - self.lr_dict[fitting_key].value(stop_batch) - ) + self.lr_dict[fitting_key].value(stop_batch), + ) ) prf_options = None @@ -889,10 +925,23 @@ def train(self, train_data=None, valid_data=None): train_batches[fitting_key_ii] = [ train_data[fitting_key_ii].get_batch() ] - valid_batches[fitting_key_ii] = [valid_data[fitting_key_ii].get_batch() - for ii in range(self.valid_numb_batch_dict[fitting_key_ii])] \ - if fitting_key_ii in valid_data else None - self.valid_on_the_fly(fp, train_batches, valid_batches, print_header=True, fitting_key=fitting_key) + valid_batches[fitting_key_ii] = ( + [ + valid_data[fitting_key_ii].get_batch() + for ii in range( + self.valid_numb_batch_dict[fitting_key_ii] + ) + ] + if fitting_key_ii in valid_data + else None + ) + self.valid_on_the_fly( + fp, + train_batches, + valid_batches, + print_header=True, + fitting_key=fitting_key, + ) is_first_step = False if self.timing_in_training: @@ -943,11 +992,22 @@ def train(self, train_data=None, valid_data=None): train_batches = {} valid_batches = {} for fitting_key_ii in train_data: - train_batches[fitting_key_ii] = [train_data[fitting_key_ii].get_batch()] - valid_batches[fitting_key_ii] = [valid_data[fitting_key_ii].get_batch() - for ii in range(self.valid_numb_batch_dict[fitting_key_ii])] \ - if fitting_key_ii in valid_data else None - self.valid_on_the_fly(fp, train_batches, valid_batches, fitting_key=fitting_key) + train_batches[fitting_key_ii] = [ + train_data[fitting_key_ii].get_batch() + ] + valid_batches[fitting_key_ii] = ( + [ + valid_data[fitting_key_ii].get_batch() + for ii in range( + self.valid_numb_batch_dict[fitting_key_ii] + ) + ] + if fitting_key_ii in valid_data + else None + ) + self.valid_on_the_fly( + fp, train_batches, valid_batches, fitting_key=fitting_key + ) if self.timing_in_training: toc = time.time() test_time = toc - 
tic @@ -1051,12 +1111,9 @@ def get_global_step(self): # fp.write(print_str) # fp.close () - def valid_on_the_fly(self, - fp, - train_batches, - valid_batches, - print_header=False, - fitting_key=None): + def valid_on_the_fly( + self, fp, train_batches, valid_batches, print_header=False, fitting_key=None + ): train_results = self.get_evaluation_results(train_batches) valid_results = self.get_evaluation_results(valid_batches) @@ -1064,20 +1121,40 @@ def valid_on_the_fly(self, if not self.multi_task_mode: current_lr = run_sess(self.sess, self.learning_rate) else: - assert fitting_key is not None, "Fitting key must be assigned in validation!" + assert ( + fitting_key is not None + ), "Fitting key must be assigned in validation!" current_lr = None # current_lr can be used as the learning rate of descriptor in the future current_lr_dict = {} for fitting_key_ii in train_batches: - current_lr_dict[fitting_key_ii] = run_sess(self.sess, self.learning_rate_dict[fitting_key_ii]) + current_lr_dict[fitting_key_ii] = run_sess( + self.sess, self.learning_rate_dict[fitting_key_ii] + ) if print_header: self.print_header(fp, train_results, valid_results, self.multi_task_mode) if not self.multi_task_mode: - self.print_on_training(fp, train_results, valid_results, cur_batch, current_lr, self.multi_task_mode) + self.print_on_training( + fp, + train_results, + valid_results, + cur_batch, + current_lr, + self.multi_task_mode, + ) else: - assert fitting_key is not None, "Fitting key must be assigned when printing learning rate!" - self.print_on_training(fp, train_results, valid_results, cur_batch, current_lr, self.multi_task_mode, current_lr_dict) - + assert ( + fitting_key is not None + ), "Fitting key must be assigned when printing learning rate!" + self.print_on_training( + fp, + train_results, + valid_results, + cur_batch, + current_lr, + self.multi_task_mode, + current_lr_dict, + ) @staticmethod def print_header(fp, train_results, valid_results, multi_task_mode=False): @@ -1091,8 +1168,8 @@ def print_header(fp, train_results, valid_results, multi_task_mode=False): else: prop_fmt = " %11s" for k in train_results.keys(): - print_str += prop_fmt % (k + '_trn') - print_str += ' %8s\n' % (k + '_lr') + print_str += prop_fmt % (k + "_trn") + print_str += " %8s\n" % (k + "_lr") else: for fitting_key in train_results: if valid_results[fitting_key] is not None: @@ -1102,13 +1179,21 @@ def print_header(fp, train_results, valid_results, multi_task_mode=False): else: prop_fmt = " %11s" for k in train_results[fitting_key].keys(): - print_str += prop_fmt % (k + '_trn') - print_str += ' %8s\n' % (fitting_key + '_lr') + print_str += prop_fmt % (k + "_trn") + print_str += " %8s\n" % (fitting_key + "_lr") fp.write(print_str) fp.flush() @staticmethod - def print_on_training(fp, train_results, valid_results, cur_batch, cur_lr, multi_task_mode=False, cur_lr_dict=None): + def print_on_training( + fp, + train_results, + valid_results, + cur_batch, + cur_lr, + multi_task_mode=False, + cur_lr_dict=None, + ): print_str = "" print_str += "%7d" % cur_batch if not multi_task_mode: diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index c343210b97..322a16a159 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -789,7 +789,7 @@ def learning_rate_variant_type_args(): def learning_rate_args(): doc_scale_by_worker = 'When parallel training or batch size scaled, how to alter learning rate. Valid values are `linear`(default), `sqrt` or `none`.' 
- doc_lr = "The definitio of learning rate" + doc_lr = "The definitio of learning rate" return Argument("learning_rate", dict, [Argument("scale_by_worker", str, optional=True, default='linear', doc=doc_scale_by_worker)], [learning_rate_variant_type_args()], diff --git a/source/tests/water_multi.json b/source/tests/water_multi.json index 817e00df80..e74b724157 100644 --- a/source/tests/water_multi.json +++ b/source/tests/water_multi.json @@ -44,24 +44,24 @@ "seed": 1 } } - }, - "learning_rate_dict" : - { - "water_ener": { - "type": "exp", - "start_lr": 0.001, - "decay_steps": 5000, - "decay_rate": 0.95, - "_comment": "that's all" + }, + "learning_rate_dict": + { + "water_ener": { + "type": "exp", + "start_lr": 0.001, + "decay_steps": 5000, + "decay_rate": 0.95, + "_comment": "that's all" + }, + "water_dipole": { + "type": "exp", + "start_lr": 0.001, + "decay_steps": 5000, + "decay_rate": 0.95, + "_comment": "that's all" + } }, - "water_dipole": { - "type": "exp", - "start_lr": 0.001, - "decay_steps": 5000, - "decay_rate": 0.95, - "_comment": "that's all" - } - }, "loss_dict": { "water_ener": { From b379d4fc1e9d591ee970d73653372423b8d42575 Mon Sep 17 00:00:00 2001 From: HuangJiameng <105633685+HuangJiameng@users.noreply.github.com> Date: Sun, 26 Feb 2023 17:11:53 +0800 Subject: [PATCH 09/27] fix SyntaxError --- deepmd/utils/argcheck.py | 1 - 1 file changed, 1 deletion(-) diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index 322a16a159..12d8fa057b 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -1314,7 +1314,6 @@ def normalize_multi_task(data): assert not (single_fitting_net and multi_fitting_net), \ "Single-task mode and multi-task mode can not be performed together. " \ "Please check your input script and choose just one format! " - ) assert ( single_fitting_net or multi_fitting_net ), "Please define your fitting net and training data! " From 9271dbb003b49b8efc1f992ed8ccfffd32c6d255 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 26 Feb 2023 09:12:18 +0000 Subject: [PATCH 10/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/utils/argcheck.py | 135 +++++++++++++++++++++++++++------------ 1 file changed, 94 insertions(+), 41 deletions(-) diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index 12d8fa057b..e2d5a42524 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -788,22 +788,37 @@ def learning_rate_variant_type_args(): def learning_rate_args(): - doc_scale_by_worker = 'When parallel training or batch size scaled, how to alter learning rate. Valid values are `linear`(default), `sqrt` or `none`.' + doc_scale_by_worker = "When parallel training or batch size scaled, how to alter learning rate. Valid values are `linear`(default), `sqrt` or `none`." doc_lr = "The definitio of learning rate" - return Argument("learning_rate", dict, - [Argument("scale_by_worker", str, optional=True, default='linear', doc=doc_scale_by_worker)], - [learning_rate_variant_type_args()], - optional = True, - doc = doc_lr) + return Argument( + "learning_rate", + dict, + [ + Argument( + "scale_by_worker", + str, + optional=True, + default="linear", + doc=doc_scale_by_worker, + ) + ], + [learning_rate_variant_type_args()], + optional=True, + doc=doc_lr, + ) + def learning_rate_dict_args(): - doc_learning_rate_dict = 'The dictionary of definitions of learning rates in multi-task mode. 
' \ - 'Each learning_rate_dict[fitting_key], with user-defined name `fitting_key` in `model/fitting_net_dict`, is the single definition of learning rate.\n' - ca = Argument('learning_rate_dict', dict, [], [], - optional = True, - doc = doc_learning_rate_dict) + doc_learning_rate_dict = ( + "The dictionary of definitions of learning rates in multi-task mode. " + "Each learning_rate_dict[fitting_key], with user-defined name `fitting_key` in `model/fitting_net_dict`, is the single definition of learning rate.\n" + ) + ca = Argument( + "learning_rate_dict", dict, [], [], optional=True, doc=doc_learning_rate_dict + ) return ca + # --- Loss configurations: --- # def start_pref(item): return f"The prefactor of {item} loss at the start of the training. Should be larger than or equal to 0. If set to none-zero value, the {item} label should be provided by file {item}.npy in each data system. If both start_pref_{item} and limit_pref_{item} are set to 0, then the {item} will be ignored." @@ -1306,37 +1321,69 @@ def normalize_multi_task(data): multi_loss = "loss_dict" in data.keys() multi_fitting_weight = "fitting_weight" in data["training"].keys() multi_learning_rate = "learning_rate_dict" in data.keys() - assert (single_fitting_net == single_training_data) and \ - (multi_fitting_net == multi_training_data), \ - "In single-task mode, 'model/fitting_net' and 'training/training_data' must be defined at the same time! " \ - "While in multi-task mode, 'model/fitting_net_dict', 'training/data_dict' " \ - "must be defined at the same time! Please check your input script. " - assert not (single_fitting_net and multi_fitting_net), \ - "Single-task mode and multi-task mode can not be performed together. " \ + assert (single_fitting_net == single_training_data) and ( + multi_fitting_net == multi_training_data + ), ( + "In single-task mode, 'model/fitting_net' and 'training/training_data' must be defined at the same time! " + "While in multi-task mode, 'model/fitting_net_dict', 'training/data_dict' " + "must be defined at the same time! Please check your input script. " + ) + assert not (single_fitting_net and multi_fitting_net), ( + "Single-task mode and multi-task mode can not be performed together. " "Please check your input script and choose just one format! " + ) assert ( single_fitting_net or multi_fitting_net ), "Please define your fitting net and training data! " if multi_fitting_net: - assert not single_valid_data, "In multi-task mode, 'training/validation_data' should not appear " \ - "outside 'training/data_dict'! Please check your input script." - assert not single_loss, "In multi-task mode, please use 'model/loss_dict' in stead of 'model/loss'! " - assert not single_learning_rate, "In multi-task mode, please use 'model/leaning_rate_dict' in stead of 'model/learning_rate'! " - assert "type_map" in data["model"], "In multi-task mode, 'model/type_map' must be defined! 
" - data["model"]["fitting_net_dict"] = normalize_fitting_net_dict(data["model"]["fitting_net_dict"]) - data["training"]["data_dict"] = normalize_data_dict(data["training"]["data_dict"]) - data["loss_dict"] = normalize_loss_dict(data["model"]["fitting_net_dict"].keys(), - data["loss_dict"]) if multi_loss else {} - data["learning_rate_dict"] = normalize_learning_rate_dict(data["model"]["fitting_net_dict"].keys(), - data["learning_rate_dict"]) if multi_learning_rate else {} - fitting_weight = data["training"]["fitting_weight"] if multi_fitting_weight else None - data["training"]["fitting_weight"] = \ - normalize_fitting_weight(data["model"]["fitting_net_dict"].keys(), - data["training"]["data_dict"].keys(), - fitting_weight=fitting_weight) + assert not single_valid_data, ( + "In multi-task mode, 'training/validation_data' should not appear " + "outside 'training/data_dict'! Please check your input script." + ) + assert ( + not single_loss + ), "In multi-task mode, please use 'model/loss_dict' in stead of 'model/loss'! " + assert ( + not single_learning_rate + ), "In multi-task mode, please use 'model/leaning_rate_dict' in stead of 'model/learning_rate'! " + assert ( + "type_map" in data["model"] + ), "In multi-task mode, 'model/type_map' must be defined! " + data["model"]["fitting_net_dict"] = normalize_fitting_net_dict( + data["model"]["fitting_net_dict"] + ) + data["training"]["data_dict"] = normalize_data_dict( + data["training"]["data_dict"] + ) + data["loss_dict"] = ( + normalize_loss_dict( + data["model"]["fitting_net_dict"].keys(), data["loss_dict"] + ) + if multi_loss + else {} + ) + data["learning_rate_dict"] = ( + normalize_learning_rate_dict( + data["model"]["fitting_net_dict"].keys(), data["learning_rate_dict"] + ) + if multi_learning_rate + else {} + ) + fitting_weight = ( + data["training"]["fitting_weight"] if multi_fitting_weight else None + ) + data["training"]["fitting_weight"] = normalize_fitting_weight( + data["model"]["fitting_net_dict"].keys(), + data["training"]["data_dict"].keys(), + fitting_weight=fitting_weight, + ) else: - assert not multi_loss, "In single-task mode, please use 'model/loss' in stead of 'model/loss_dict'! " - assert not multi_learning_rate, "In single-task mode, please use 'model/learning_rate' in stead of 'model/learning_rate_dict'! " + assert ( + not multi_loss + ), "In single-task mode, please use 'model/loss' in stead of 'model/loss_dict'! " + assert ( + not multi_learning_rate + ), "In single-task mode, please use 'model/learning_rate' in stead of 'model/learning_rate_dict'! " return data @@ -1380,20 +1427,26 @@ def normalize_loss_dict(fitting_keys, loss_dict): new_dict[item] = data return new_dict + def normalize_learning_rate_dict(fitting_keys, learning_rate_dict): # check the learning_rate dict - failed_learning_rate_keys = [item for item in learning_rate_dict if item not in fitting_keys] - assert not failed_learning_rate_keys, \ - "Learning rate dict key(s) {} not have corresponding fitting keys in {}! ".format( - str(failed_learning_rate_keys), str(list(fitting_keys))) + failed_learning_rate_keys = [ + item for item in learning_rate_dict if item not in fitting_keys + ] + assert ( + not failed_learning_rate_keys + ), "Learning rate dict key(s) {} not have corresponding fitting keys in {}! 
".format( + str(failed_learning_rate_keys), str(list(fitting_keys)) + ) new_dict = {} - base = Argument('base', dict, [], [learning_rate_variant_type_args()], doc="") + base = Argument("base", dict, [], [learning_rate_variant_type_args()], doc="") for item in learning_rate_dict: data = base.normalize_value(learning_rate_dict[item], trim_pattern="_*") base.check_value(data, strict=True) new_dict[item] = data return new_dict + def normalize_fitting_weight(fitting_keys, data_keys, fitting_weight=None): # check the mapping failed_data_keys = [item for item in data_keys if item not in fitting_keys] From 32f614cea728ef52af21db6d9046228f1b42355f Mon Sep 17 00:00:00 2001 From: HuangJiameng <105633685+HuangJiameng@users.noreply.github.com> Date: Thu, 9 Mar 2023 13:31:01 +0800 Subject: [PATCH 11/27] update_lr_dict when "learning_rate_dict" is empty, the "learning_rate" setting is extended to "learning_rate_dict" by default --- deepmd/utils/argcheck.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index e2d5a42524..9dc1b781ca 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -1343,9 +1343,6 @@ def normalize_multi_task(data): assert ( not single_loss ), "In multi-task mode, please use 'model/loss_dict' in stead of 'model/loss'! " - assert ( - not single_learning_rate - ), "In multi-task mode, please use 'model/leaning_rate_dict' in stead of 'model/learning_rate'! " assert ( "type_map" in data["model"] ), "In multi-task mode, 'model/type_map' must be defined! " @@ -1362,13 +1359,18 @@ def normalize_multi_task(data): if multi_loss else {} ) - data["learning_rate_dict"] = ( - normalize_learning_rate_dict( - data["model"]["fitting_net_dict"].keys(), data["learning_rate_dict"] + if multi_learning_rate: + data["learning_rate_dict"] = ( + normalize_learning_rate_dict( + data["model"]["fitting_net_dict"].keys(), data["learning_rate_dict"] + ) + ) + elif single_learning_rate: + data["learning_rate_dict"] = ( + normalize_learning_rate_dict( + data["model"]["fitting_net_dict"].keys(), data["learning_rate"] + ) ) - if multi_learning_rate - else {} - ) fitting_weight = ( data["training"]["fitting_weight"] if multi_fitting_weight else None ) @@ -1446,6 +1448,14 @@ def normalize_learning_rate_dict(fitting_keys, learning_rate_dict): new_dict[item] = data return new_dict +def normalize_learning_rate_dict_with_single_learning_rate(fitting_keys, learning_rate): + new_dict = {} + base = Argument("base", dict, [], [learning_rate_variant_type_args()], doc="") + data = base.normalize_value(learning_rate, trim_pattern="_*") + base.check_value(data, strict=True) + for fitting_key in fitting_keys: + new_dict[fitting_key] = data + return new_dict def normalize_fitting_weight(fitting_keys, data_keys, fitting_weight=None): # check the mapping From 1556f888a09ac9c95179d89fa47f7f5b45d46d22 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Mar 2023 05:31:27 +0000 Subject: [PATCH 12/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/utils/argcheck.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index 9dc1b781ca..e77ed85a58 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -1360,16 +1360,12 @@ def normalize_multi_task(data): else {} ) if multi_learning_rate: - 
data["learning_rate_dict"] = ( - normalize_learning_rate_dict( - data["model"]["fitting_net_dict"].keys(), data["learning_rate_dict"] - ) + data["learning_rate_dict"] = normalize_learning_rate_dict( + data["model"]["fitting_net_dict"].keys(), data["learning_rate_dict"] ) elif single_learning_rate: - data["learning_rate_dict"] = ( - normalize_learning_rate_dict( - data["model"]["fitting_net_dict"].keys(), data["learning_rate"] - ) + data["learning_rate_dict"] = normalize_learning_rate_dict( + data["model"]["fitting_net_dict"].keys(), data["learning_rate"] ) fitting_weight = ( data["training"]["fitting_weight"] if multi_fitting_weight else None @@ -1448,6 +1444,7 @@ def normalize_learning_rate_dict(fitting_keys, learning_rate_dict): new_dict[item] = data return new_dict + def normalize_learning_rate_dict_with_single_learning_rate(fitting_keys, learning_rate): new_dict = {} base = Argument("base", dict, [], [learning_rate_variant_type_args()], doc="") @@ -1457,6 +1454,7 @@ def normalize_learning_rate_dict_with_single_learning_rate(fitting_keys, learnin new_dict[fitting_key] = data return new_dict + def normalize_fitting_weight(fitting_keys, data_keys, fitting_weight=None): # check the mapping failed_data_keys = [item for item in data_keys if item not in fitting_keys] From 58db96b0f5114fd8fb981660eb98f1375d5d0614 Mon Sep 17 00:00:00 2001 From: HuangJiameng <105633685+HuangJiameng@users.noreply.github.com> Date: Thu, 9 Mar 2023 13:46:25 +0800 Subject: [PATCH 13/27] update lr_dict --- deepmd/utils/argcheck.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index 9dc1b781ca..90fbf72939 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -1367,7 +1367,7 @@ def normalize_multi_task(data): ) elif single_learning_rate: data["learning_rate_dict"] = ( - normalize_learning_rate_dict( + normalize_learning_rate_dict_with_single_learning_rate( data["model"]["fitting_net_dict"].keys(), data["learning_rate"] ) ) From 5127ccc635d0137362b82ff7c265dbb2669cc9db Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Mar 2023 05:47:55 +0000 Subject: [PATCH 14/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/utils/argcheck.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index cc07728c31..9c30957491 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -1364,10 +1364,10 @@ def normalize_multi_task(data): data["model"]["fitting_net_dict"].keys(), data["learning_rate_dict"] ) elif single_learning_rate: - data["learning_rate_dict"] = ( - normalize_learning_rate_dict_with_single_learning_rate( - data["model"]["fitting_net_dict"].keys(), data["learning_rate"] - ) + data[ + "learning_rate_dict" + ] = normalize_learning_rate_dict_with_single_learning_rate( + data["model"]["fitting_net_dict"].keys(), data["learning_rate"] ) fitting_weight = ( data["training"]["fitting_weight"] if multi_fitting_weight else None From 036832fca29a491b2fd61e1641cfde74fb798e84 Mon Sep 17 00:00:00 2001 From: HuangJiameng <105633685+HuangJiameng@users.noreply.github.com> Date: Thu, 9 Mar 2023 13:53:44 +0800 Subject: [PATCH 15/27] add lr_dict to gen_json and gen_args --- deepmd/utils/argcheck.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index cc07728c31..2dabc9fbee 100644 
--- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -1279,6 +1279,7 @@ def gen_json(**kwargs): ( model_args(), learning_rate_args(), + learning_rate_dict_args(), loss_args(), loss_dict_args(), training_args(), @@ -1292,6 +1293,7 @@ def gen_args(**kwargs): return [ model_args(), learning_rate_args(), + learning_rate_dict_args(), loss_args(), loss_dict_args(), training_args(), From 51156741b3752401c81b48e0d74cdfcef636d347 Mon Sep 17 00:00:00 2001 From: HuangJiameng <105633685+HuangJiameng@users.noreply.github.com> Date: Thu, 9 Mar 2023 14:37:44 +0800 Subject: [PATCH 16/27] merge duplicated codes to methods --- deepmd/train/trainer.py | 103 +++++++++++++--------------------- deepmd/utils/learning_rate.py | 19 ++++++- 2 files changed, 57 insertions(+), 65 deletions(-) diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index 558681c177..bd1f5e9794 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -66,6 +66,7 @@ ) from deepmd.utils.learning_rate import ( LearningRateExp, + get_lr_and_coef, ) from deepmd.utils.neighbor_stat import ( NeighborStat, @@ -271,46 +272,14 @@ def fitting_net_init(fitting_type_, descrpt_type_, params): # learning rate if not self.multi_task_mode: lr_param = j_must_have(jdata, "learning_rate") - scale_by_worker = lr_param.get("scale_by_worker", "linear") - if scale_by_worker == "linear": - self.scale_lr_coef = float(self.run_opt.world_size) - elif scale_by_worker == "sqrt": - self.scale_lr_coef = np.sqrt(self.run_opt.world_size).real - else: - self.scale_lr_coef = 1.0 - lr_type = lr_param.get("type", "exp") - if lr_type == "exp": - self.lr = LearningRateExp( - lr_param["start_lr"], lr_param["stop_lr"], lr_param["decay_steps"] - ) - else: - raise RuntimeError("unknown learning_rate type " + lr_type) + self.lr, self.scale_lr_coef = self.get_lr_and_coef(lr_param) else: self.lr_dict = {} self.scale_lr_coef_dict = {} lr_param_dict = jdata.get("learning_rate_dict", {}) for fitting_key in self.fitting_type_dict: lr_param = lr_param_dict.get(fitting_key, {}) - scale_by_worker = lr_param.get("scale_by_worker", "linear") - if scale_by_worker == "linear": - self.scale_lr_coef_dict[fitting_key] = float( - self.run_opt.world_size - ) - elif scale_by_worker == "sqrt": - self.scale_lr_coef_dict[fitting_key] = np.sqrt( - self.run_opt.world_size - ).real - else: - self.scale_lr_coef_dict[fitting_key] = 1.0 - lr_type = lr_param.get("type", "exp") - if lr_type == "exp": - self.lr_dict[fitting_key] = LearningRateExp( - lr_param["start_lr"], - lr_param["stop_lr"], - lr_param["decay_steps"], - ) - else: - raise RuntimeError("unknown learning_rate type " + lr_type) + self.lr_dict[fitting_key], self.scale_lr_coef_dict[fitting_key] = self.get_lr_and_coef(lr_param) # loss # infer loss type by fitting_type @@ -597,6 +566,41 @@ def _build_lr(self): log.info("built lr") + def _build_optimizer(self): + if not self.multi_task_mode: + l2_l, l2_more = self.loss.build( + self.learning_rate, + self.place_holders["natoms_vec"], + self.model_pred, + self.place_holders, + suffix="test" + ) + + if self.mixed_prec is not None: + l2_l = tf.cast(l2_l, get_precision(self.mixed_prec["output_prec"])) + else: + l2_l, l2_more = {}, {} + for fitting_key in self.fitting_type_dict: + lr = self.learning_rate_dict[fitting_key] + model = self.model_pred[fitting_key] + loss_dict = self.loss_dict[fitting_key] + + l2_l[fitting_key], l2_more[fitting_key] = loss_dict.build( + lr, + self.place_holders["natoms_vec"], + model, + self.place_holders, + suffix=fitting_key + ) + + if 
self.mixed_prec is not None: + l2_l[fitting_key] = tf.cast( + l2_l[fitting_key], + get_precision(self.mixed_prec["output_prec"]) + ) + + return l2_l, l2_more + def _build_network(self, data, suffix=""): self.place_holders = {} if self.is_compress: @@ -632,36 +636,7 @@ def _build_network(self, data, suffix=""): reuse=False, ) - if not self.multi_task_mode: - self.l2_l, self.l2_more = self.loss.build( - self.learning_rate, - self.place_holders["natoms_vec"], - self.model_pred, - self.place_holders, - suffix="test", - ) - - if self.mixed_prec is not None: - self.l2_l = tf.cast( - self.l2_l, get_precision(self.mixed_prec["output_prec"]) - ) - else: - self.l2_l, self.l2_more = {}, {} - for fitting_key in self.fitting_type_dict: - self.l2_l[fitting_key], self.l2_more[fitting_key] = self.loss_dict[ - fitting_key - ].build( - self.learning_rate_dict[fitting_key], - self.place_holders["natoms_vec"], - self.model_pred[fitting_key], - self.place_holders, - suffix=fitting_key, - ) - if self.mixed_prec is not None: - self.l2_l[fitting_key] = tf.cast( - self.l2_l[fitting_key], - get_precision(self.mixed_prec["output_prec"]), - ) + self.l2_l, self.l2_more = self._build_optimizer() log.info("built network") diff --git a/deepmd/utils/learning_rate.py b/deepmd/utils/learning_rate.py index 6bc85a61f9..eb66dbe636 100644 --- a/deepmd/utils/learning_rate.py +++ b/deepmd/utils/learning_rate.py @@ -4,6 +4,23 @@ tf, ) +def get_lr_and_coef(self, lr_param): + scale_by_worker = lr_param.get("scale_by_worker", "linear") + if scale_by_worker == "linear": + scale_lr_coef = float(self.run_opt.world_size) + elif scale_by_worker == "sqrt": + scale_lr_coef = np.sqrt(self.run_opt.world_size).real + else: + scale_lr_coef = 1.0 + lr_type = lr_param.get("type", "exp") + if lr_type == "exp": + lr = LearningRateExp( + lr_param["start_lr"], lr_param["stop_lr"], lr_param["decay_steps"] + ) + else: + raise RuntimeError("unknown learning_rate type " + lr_type) + return lr, scale_lr_coef + class LearningRateExp: r"""The exponentially decaying learning rate. 
@@ -97,4 +114,4 @@ def start_lr(self) -> float: def value(self, step: int) -> float: """Get the lr at a certain step.""" - return self.start_lr_ * np.power(self.decay_rate_, (step // self.decay_steps_)) + return self.start_lr_ * np.power(self.decay_rate_, (step // self.decay_steps_)) \ No newline at end of file From 27730498c8cbaff9b66df9d8352c8d766a926ef2 Mon Sep 17 00:00:00 2001 From: HuangJiameng <105633685+HuangJiameng@users.noreply.github.com> Date: Thu, 9 Mar 2023 14:45:39 +0800 Subject: [PATCH 17/27] fix get_lr_and_coef --- deepmd/train/trainer.py | 23 ++++++++++++++++++++--- deepmd/utils/learning_rate.py | 18 ------------------ 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index bd1f5e9794..36855fddf2 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -66,7 +66,6 @@ ) from deepmd.utils.learning_rate import ( LearningRateExp, - get_lr_and_coef, ) from deepmd.utils.neighbor_stat import ( NeighborStat, @@ -269,17 +268,35 @@ def fitting_net_init(fitting_type_, descrpt_type_, params): model_param.get("sw_rmax"), ) + + def get_lr_and_coef(self, lr_param): + scale_by_worker = lr_param.get("scale_by_worker", "linear") + if scale_by_worker == "linear": + scale_lr_coef = float(self.run_opt.world_size) + elif scale_by_worker == "sqrt": + scale_lr_coef = np.sqrt(self.run_opt.world_size).real + else: + scale_lr_coef = 1.0 + lr_type = lr_param.get("type", "exp") + if lr_type == "exp": + lr = LearningRateExp( + lr_param["start_lr"], lr_param["stop_lr"], lr_param["decay_steps"] + ) + else: + raise RuntimeError("unknown learning_rate type " + lr_type) + return lr, scale_lr_coef + # learning rate if not self.multi_task_mode: lr_param = j_must_have(jdata, "learning_rate") - self.lr, self.scale_lr_coef = self.get_lr_and_coef(lr_param) + self.lr, self.scale_lr_coef = get_lr_and_coef(lr_param) else: self.lr_dict = {} self.scale_lr_coef_dict = {} lr_param_dict = jdata.get("learning_rate_dict", {}) for fitting_key in self.fitting_type_dict: lr_param = lr_param_dict.get(fitting_key, {}) - self.lr_dict[fitting_key], self.scale_lr_coef_dict[fitting_key] = self.get_lr_and_coef(lr_param) + self.lr_dict[fitting_key], self.scale_lr_coef_dict[fitting_key] = get_lr_and_coef(lr_param) # loss # infer loss type by fitting_type diff --git a/deepmd/utils/learning_rate.py b/deepmd/utils/learning_rate.py index eb66dbe636..76d56226af 100644 --- a/deepmd/utils/learning_rate.py +++ b/deepmd/utils/learning_rate.py @@ -4,24 +4,6 @@ tf, ) -def get_lr_and_coef(self, lr_param): - scale_by_worker = lr_param.get("scale_by_worker", "linear") - if scale_by_worker == "linear": - scale_lr_coef = float(self.run_opt.world_size) - elif scale_by_worker == "sqrt": - scale_lr_coef = np.sqrt(self.run_opt.world_size).real - else: - scale_lr_coef = 1.0 - lr_type = lr_param.get("type", "exp") - if lr_type == "exp": - lr = LearningRateExp( - lr_param["start_lr"], lr_param["stop_lr"], lr_param["decay_steps"] - ) - else: - raise RuntimeError("unknown learning_rate type " + lr_type) - return lr, scale_lr_coef - - class LearningRateExp: r"""The exponentially decaying learning rate. 
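
For readers following the refactor above, the behaviour that get_lr_and_coef and LearningRateExp implement can be sketched as one standalone function. This is only an illustration of the decay rule visible in the hunks, not the DeePMD-kit class itself: the world_size and stop_step arguments, and the way decay_rate is derived from start_lr/stop_lr, are assumptions made for the example.

import numpy as np

def sketch_lr_and_coef(lr_param, world_size, stop_step):
    # Worker scaling mirrors the "scale_by_worker" options in the patch:
    # "linear" -> world_size, "sqrt" -> sqrt(world_size), anything else -> 1.0.
    scale_by_worker = lr_param.get("scale_by_worker", "linear")
    if scale_by_worker == "linear":
        scale_lr_coef = float(world_size)
    elif scale_by_worker == "sqrt":
        scale_lr_coef = float(np.sqrt(world_size))
    else:
        scale_lr_coef = 1.0

    if lr_param.get("type", "exp") != "exp":
        raise RuntimeError("unknown learning_rate type " + lr_param["type"])

    start_lr = lr_param["start_lr"]
    stop_lr = lr_param["stop_lr"]
    decay_steps = lr_param["decay_steps"]
    # Assumed derivation: decay_rate is chosen so the rate reaches stop_lr at stop_step.
    decay_rate = (stop_lr / start_lr) ** (decay_steps / stop_step)

    def value(step):
        # Step-wise exponential decay, matching LearningRateExp.value() in the hunk above.
        return start_lr * np.power(decay_rate, step // decay_steps)

    return value, scale_lr_coef

With lr_param = {"start_lr": 1e-3, "stop_lr": 1e-8, "decay_steps": 5000} and stop_step = 1_000_000, value(0) returns 1e-3 and value(1_000_000) returns roughly 1e-8, which is the intended end point of the schedule.
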
From 3a6ce32aba5c5bcc70e3de0d711649967e76f194 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Mar 2023 06:46:04 +0000 Subject: [PATCH 18/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/train/trainer.py | 39 ++++++++++++++++++----------------- deepmd/utils/learning_rate.py | 3 ++- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index 36855fddf2..04b9856425 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -268,7 +268,6 @@ def fitting_net_init(fitting_type_, descrpt_type_, params): model_param.get("sw_rmax"), ) - def get_lr_and_coef(self, lr_param): scale_by_worker = lr_param.get("scale_by_worker", "linear") if scale_by_worker == "linear": @@ -285,7 +284,7 @@ def get_lr_and_coef(self, lr_param): else: raise RuntimeError("unknown learning_rate type " + lr_type) return lr, scale_lr_coef - + # learning rate if not self.multi_task_mode: lr_param = j_must_have(jdata, "learning_rate") @@ -296,7 +295,10 @@ def get_lr_and_coef(self, lr_param): lr_param_dict = jdata.get("learning_rate_dict", {}) for fitting_key in self.fitting_type_dict: lr_param = lr_param_dict.get(fitting_key, {}) - self.lr_dict[fitting_key], self.scale_lr_coef_dict[fitting_key] = get_lr_and_coef(lr_param) + ( + self.lr_dict[fitting_key], + self.scale_lr_coef_dict[fitting_key], + ) = get_lr_and_coef(lr_param) # loss # infer loss type by fitting_type @@ -586,13 +588,13 @@ def _build_lr(self): def _build_optimizer(self): if not self.multi_task_mode: l2_l, l2_more = self.loss.build( - self.learning_rate, - self.place_holders["natoms_vec"], - self.model_pred, - self.place_holders, - suffix="test" + self.learning_rate, + self.place_holders["natoms_vec"], + self.model_pred, + self.place_holders, + suffix="test", ) - + if self.mixed_prec is not None: l2_l = tf.cast(l2_l, get_precision(self.mixed_prec["output_prec"])) else: @@ -601,21 +603,20 @@ def _build_optimizer(self): lr = self.learning_rate_dict[fitting_key] model = self.model_pred[fitting_key] loss_dict = self.loss_dict[fitting_key] - + l2_l[fitting_key], l2_more[fitting_key] = loss_dict.build( - lr, - self.place_holders["natoms_vec"], - model, - self.place_holders, - suffix=fitting_key + lr, + self.place_holders["natoms_vec"], + model, + self.place_holders, + suffix=fitting_key, ) - + if self.mixed_prec is not None: l2_l[fitting_key] = tf.cast( - l2_l[fitting_key], - get_precision(self.mixed_prec["output_prec"]) + l2_l[fitting_key], get_precision(self.mixed_prec["output_prec"]) ) - + return l2_l, l2_more def _build_network(self, data, suffix=""): diff --git a/deepmd/utils/learning_rate.py b/deepmd/utils/learning_rate.py index 76d56226af..6bc85a61f9 100644 --- a/deepmd/utils/learning_rate.py +++ b/deepmd/utils/learning_rate.py @@ -4,6 +4,7 @@ tf, ) + class LearningRateExp: r"""The exponentially decaying learning rate. 
@@ -96,4 +97,4 @@ def start_lr(self) -> float: def value(self, step: int) -> float: """Get the lr at a certain step.""" - return self.start_lr_ * np.power(self.decay_rate_, (step // self.decay_steps_)) \ No newline at end of file + return self.start_lr_ * np.power(self.decay_rate_, (step // self.decay_steps_)) From a511a67047648d5a12508c3da747a937bf301ad1 Mon Sep 17 00:00:00 2001 From: HuangJiameng <105633685+HuangJiameng@users.noreply.github.com> Date: Thu, 9 Mar 2023 15:01:33 +0800 Subject: [PATCH 19/27] fix get_lr_and_coef --- deepmd/train/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index 36855fddf2..6e823c1798 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -269,7 +269,7 @@ def fitting_net_init(fitting_type_, descrpt_type_, params): ) - def get_lr_and_coef(self, lr_param): + def get_lr_and_coef(lr_param): scale_by_worker = lr_param.get("scale_by_worker", "linear") if scale_by_worker == "linear": scale_lr_coef = float(self.run_opt.world_size) From 219f6e2c2d44644466d6605bebcdf39cdf7033e4 Mon Sep 17 00:00:00 2001 From: HuangJiameng <105633685+HuangJiameng@users.noreply.github.com> Date: Thu, 9 Mar 2023 16:37:23 +0800 Subject: [PATCH 20/27] add lr dict setting to multi init --- deepmd/train/trainer.py | 1 - deepmd/utils/multi_init.py | 2 +- source/tests/test_init_frz_model_multi.py | 5 +++++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index 6fb74999f3..2f964a8905 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -269,7 +269,6 @@ def fitting_net_init(fitting_type_, descrpt_type_, params): ) def get_lr_and_coef(lr_param): - scale_by_worker = lr_param.get("scale_by_worker", "linear") if scale_by_worker == "linear": scale_lr_coef = float(self.run_opt.world_size) diff --git a/deepmd/utils/multi_init.py b/deepmd/utils/multi_init.py index f2984ce145..5a20da6713 100644 --- a/deepmd/utils/multi_init.py +++ b/deepmd/utils/multi_init.py @@ -143,7 +143,7 @@ def replace_model_params_with_frz_multi_model( # Change other multi-task configurations log.info("Change the training configurations according to the pretrained one...") - for config_key in ["loss_dict", "training/data_dict"]: + for config_key in ["loss_dict", "learning_rate_dict", "training/data_dict"]: cur_jdata = jdata target_jdata = pretrained_jdata for sub_key in config_key.split("/"): diff --git a/source/tests/test_init_frz_model_multi.py b/source/tests/test_init_frz_model_multi.py index 3405ab1544..99e2a7b3a6 100644 --- a/source/tests/test_init_frz_model_multi.py +++ b/source/tests/test_init_frz_model_multi.py @@ -61,6 +61,7 @@ def _init_models(): jdata = j_loader(str(tests_path / os.path.join("init_frz_model", "input.json"))) fitting_config = jdata["model"].pop("fitting_net") loss_config = jdata.pop("loss") + learning_rate_config = jdata.pop("learning_rate") training_data_config = jdata["training"].pop("training_data") validation_data_config = jdata["training"].pop("validation_data") jdata["training"]["data_dict"] = {} @@ -78,6 +79,8 @@ def _init_models(): jdata["model"]["fitting_net_dict"]["water_ener"] = fitting_config jdata["loss_dict"] = {} jdata["loss_dict"]["water_ener"] = loss_config + jdata["learning_rate_dict"] = {} + jdata["learning_rate_dict"]["water_ener"] = learning_rate_config with open(INPUT, "w") as fp: json.dump(jdata, fp, indent=4) ret = run_dp("dp train " + INPUT) @@ -95,6 +98,8 @@ def _init_models(): 
jdata["model"]["fitting_net_dict"]["water_ener_new"] = fitting_config jdata["loss_dict"] = {} jdata["loss_dict"]["water_ener_new"] = loss_config + jdata["learning_rate_dict"] = {} + jdata["learning_rate_dict"]["water_ener_new"] = learning_rate_config jdata["training"]["data_dict"] = {} jdata["training"]["data_dict"]["water_ener_new"] = {} jdata["training"]["data_dict"]["water_ener_new"][ From ef1be2adf1ac118c90d570caa316cf3db468a0e3 Mon Sep 17 00:00:00 2001 From: HuangJiameng <105633685+HuangJiameng@users.noreply.github.com> Date: Fri, 24 Mar 2023 09:34:48 +0800 Subject: [PATCH 21/27] name _build_loss, extract _build_optimizer --- deepmd/train/trainer.py | 108 +++++++++++++++++----------------------- 1 file changed, 47 insertions(+), 61 deletions(-) diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index 2f964a8905..00412f98df 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -585,7 +585,7 @@ def _build_lr(self): log.info("built lr") - def _build_optimizer(self): + def _build_loss(self): if not self.multi_task_mode: l2_l, l2_more = self.loss.build( self.learning_rate, @@ -654,14 +654,13 @@ def _build_network(self, data, suffix=""): reuse=False, ) - self.l2_l, self.l2_more = self._build_optimizer() + self.l2_l, self.l2_more = self._build_loss() log.info("built network") - - def _build_training(self): - trainable_variables = tf.trainable_variables() - if not self.multi_task_mode: - if self.run_opt.is_distrib: + + def _build_optimizer(self, fitting_key=None): + if self.run_opt.is_distrib: + if fitting_key is None: if self.scale_lr_coef > 1.0: log.info("Scale learning rate by coef: %f", self.scale_lr_coef) optimizer = tf.train.AdamOptimizer( @@ -671,26 +670,51 @@ def _build_training(self): optimizer = tf.train.AdamOptimizer(self.learning_rate) optimizer = self.run_opt._HVD.DistributedOptimizer(optimizer) else: - optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate) - if self.mixed_prec is not None: - _TF_VERSION = Version(TF_VERSION) - # check the TF_VERSION, when TF < 1.12, mixed precision is not allowed - if _TF_VERSION < Version("1.14.0"): - raise RuntimeError( - "TensorFlow version %s is not compatible with the mixed precision setting. Please consider upgrading your TF version!" - % TF_VERSION + if self.scale_lr_coef_dict[fitting_key] > 1.0: + log.info( + "Scale learning rate by coef: %f", + self.scale_lr_coef_dict[fitting_key], ) - elif _TF_VERSION < Version("2.4.0"): - optimizer = ( - tf.train.experimental.enable_mixed_precision_graph_rewrite( - optimizer - ) + optimizer = tf.train.AdamOptimizer( + self.learning_rate_dict[fitting_key] + * self.scale_lr_coef_dict[fitting_key] ) else: - optimizer = tf.mixed_precision.enable_mixed_precision_graph_rewrite( - optimizer + optimizer = tf.train.AdamOptimizer( + learning_rate=self.learning_rate_dict[fitting_key] ) + optimizer = self.run_opt._HVD.DistributedOptimizer(optimizer) + else: + if fitting_key is None: + optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate) + else: + optimizer = tf.train.AdamOptimizer( + learning_rate=self.learning_rate_dict[fitting_key] + ) + + if self.mixed_prec is not None: + _TF_VERSION = Version(TF_VERSION) + if _TF_VERSION < Version("1.14.0"): + raise RuntimeError( + "TensorFlow version %s is not compatible with the mixed precision setting. Please consider upgrading your TF version!" 
+ % TF_VERSION + ) + elif _TF_VERSION < Version("2.4.0"): + optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite( + optimizer + ) + else: + optimizer = tf.mixed_precision.enable_mixed_precision_graph_rewrite( + optimizer + ) + return optimizer + + + def _build_training(self): + trainable_variables = tf.trainable_variables() + if not self.multi_task_mode: + optimizer = self._build_optimizer() apply_op = optimizer.minimize( loss=self.l2_l, global_step=self.global_step, @@ -702,45 +726,7 @@ def _build_training(self): else: self.train_op = {} for fitting_key in self.fitting_type_dict: - if self.run_opt.is_distrib: - if self.scale_lr_coef_dict[fitting_key] > 1.0: - log.info( - "Scale learning rate by coef: %f", - self.scale_lr_coef_dict[fitting_key], - ) - optimizer = tf.train.AdamOptimizer( - self.learning_rate_dict[fitting_key] - * self.scale_lr_coef_dict[fitting_key] - ) - else: - optimizer = tf.train.AdamOptimizer( - self.learning_rate_dict[fitting_key] - ) - optimizer = self.run_opt._HVD.DistributedOptimizer(optimizer) - else: - optimizer = tf.train.AdamOptimizer( - learning_rate=self.learning_rate_dict[fitting_key] - ) - if self.mixed_prec is not None: - _TF_VERSION = Version(TF_VERSION) - # check the TF_VERSION, when TF < 1.12, mixed precision is not allowed - if _TF_VERSION < Version("1.14.0"): - raise RuntimeError( - "TensorFlow version %s is not compatible with the mixed precision setting. Please consider upgrading your TF version!" - % TF_VERSION - ) - elif _TF_VERSION < Version("2.4.0"): - optimizer = ( - tf.train.experimental.enable_mixed_precision_graph_rewrite( - optimizer - ) - ) - else: - optimizer = ( - tf.mixed_precision.enable_mixed_precision_graph_rewrite( - optimizer - ) - ) + optimizer = self._build_optimizer(fitting_key=fitting_key) apply_op = optimizer.minimize( loss=self.l2_l[fitting_key], global_step=self.global_step, From d1e687fcb35ade4f559351e3bc85f3c362b46aca Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 24 Mar 2023 01:35:18 +0000 Subject: [PATCH 22/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/train/trainer.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index 00412f98df..84fed27c49 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -657,7 +657,7 @@ def _build_network(self, data, suffix=""): self.l2_l, self.l2_more = self._build_loss() log.info("built network") - + def _build_optimizer(self, fitting_key=None): if self.run_opt.is_distrib: if fitting_key is None: @@ -709,7 +709,6 @@ def _build_optimizer(self, fitting_key=None): ) return optimizer - def _build_training(self): trainable_variables = tf.trainable_variables() From 599cc055b9d4a57354ed95a5c10b43d7d3cc8611 Mon Sep 17 00:00:00 2001 From: HuangJiameng <105633685+HuangJiameng@users.noreply.github.com> Date: Fri, 24 Mar 2023 09:47:58 +0800 Subject: [PATCH 23/27] fix: print lr in standard mode --- deepmd/train/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index 00412f98df..0a8ffffb7c 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -709,7 +709,6 @@ def _build_optimizer(self, fitting_key=None): ) return optimizer - def _build_training(self): trainable_variables = tf.trainable_variables() @@ -1185,6 +1184,7 @@ def print_on_training( prop_fmt = " %11.2e" for k in 
train_results.keys(): print_str += prop_fmt % (train_results[k]) + print_str += " %8.1e\n" % cur_lr else: for fitting_key in train_results: if valid_results[fitting_key] is not None: From 0ea116006ea6fee2701acfd2a1aabb57fefb8f1b Mon Sep 17 00:00:00 2001 From: HuangJiameng <105633685+HuangJiameng@users.noreply.github.com> Date: Mon, 27 Mar 2023 14:12:32 +0800 Subject: [PATCH 24/27] fix multi init --- deepmd/utils/multi_init.py | 2 +- source/tests/test_init_frz_model_multi.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/deepmd/utils/multi_init.py b/deepmd/utils/multi_init.py index 5a20da6713..f2984ce145 100644 --- a/deepmd/utils/multi_init.py +++ b/deepmd/utils/multi_init.py @@ -143,7 +143,7 @@ def replace_model_params_with_frz_multi_model( # Change other multi-task configurations log.info("Change the training configurations according to the pretrained one...") - for config_key in ["loss_dict", "learning_rate_dict", "training/data_dict"]: + for config_key in ["loss_dict", "training/data_dict"]: cur_jdata = jdata target_jdata = pretrained_jdata for sub_key in config_key.split("/"): diff --git a/source/tests/test_init_frz_model_multi.py b/source/tests/test_init_frz_model_multi.py index 99e2a7b3a6..0a1378d2b3 100644 --- a/source/tests/test_init_frz_model_multi.py +++ b/source/tests/test_init_frz_model_multi.py @@ -99,6 +99,7 @@ def _init_models(): jdata["loss_dict"] = {} jdata["loss_dict"]["water_ener_new"] = loss_config jdata["learning_rate_dict"] = {} + jdata["learning_rate_dict"]["water_ener"] = learning_rate_config jdata["learning_rate_dict"]["water_ener_new"] = learning_rate_config jdata["training"]["data_dict"] = {} jdata["training"]["data_dict"]["water_ener_new"] = {} From 82f2d6c54543c16f711083c7afbd31ae114395b9 Mon Sep 17 00:00:00 2001 From: HuangJiameng <105633685+HuangJiameng@users.noreply.github.com> Date: Mon, 27 Mar 2023 16:58:43 +0800 Subject: [PATCH 25/27] when using init model, set learning rate dict by order: User-provided > Pre-trained model > Default --- deepmd/entrypoints/freeze.py | 5 +++++ deepmd/utils/multi_init.py | 25 +++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/deepmd/entrypoints/freeze.py b/deepmd/entrypoints/freeze.py index 9f92c2f682..3385d1af77 100755 --- a/deepmd/entrypoints/freeze.py +++ b/deepmd/entrypoints/freeze.py @@ -158,6 +158,11 @@ def _modify_model_suffix(output_graph_def, out_suffix, freeze_type): loss_dict = jdata.pop("loss_dict") if out_suffix in loss_dict: jdata["loss"] = loss_dict[out_suffix] + #learning_rate + if "learning_rate_dict" in jdata: + learning_rate_dict = jdata.pop("learning_rate_dict") + if out_suffix in learning_rate_dict: + jdata["learning_rate"] = learning_rate_dict[out_suffix] # fitting weight if "fitting_weight" in jdata["training"]: jdata["training"].pop("fitting_weight") diff --git a/deepmd/utils/multi_init.py b/deepmd/utils/multi_init.py index f2984ce145..dfa2f0dbdd 100644 --- a/deepmd/utils/multi_init.py +++ b/deepmd/utils/multi_init.py @@ -156,6 +156,31 @@ def replace_model_params_with_frz_multi_model( log.info( f"Add '{config_key}/{fitting_key}' configurations from the pretrained frozen model." 
) + + # learning rate dict keep backward compatibility + config_key = "learning_rate_dict" + single_config_key = "learning_rate" + cur_jdata = jdata + target_jdata = pretrained_jdata + if ((single_config_key not in cur_jdata) + and (config_key in cur_jdata)): + cur_jdata = cur_jdata[config_key] + if config_key in target_jdata: + target_jdata = target_jdata[config_key] + for fitting_key in reused_fittings: + if fitting_key not in cur_jdata: + target_para = target_jdata[fitting_key] + cur_jdata[fitting_key] = target_para + log.info( + f"Add '{config_key}/{fitting_key}' configurations from the pretrained frozen model." + ) + else : + for fitting_key in reused_fittings: + if fitting_key not in cur_jdata: + cur_jdata[fitting_key] = {} + log.info( + f"Add '{config_key}/{fitting_key}' configurations as default." + ) return jdata From e519ce271d41401dd6562d6813a88ae94b1ca5ee Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 27 Mar 2023 08:59:11 +0000 Subject: [PATCH 26/27] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/entrypoints/freeze.py | 2 +- deepmd/utils/multi_init.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/deepmd/entrypoints/freeze.py b/deepmd/entrypoints/freeze.py index 3385d1af77..e2ca820218 100755 --- a/deepmd/entrypoints/freeze.py +++ b/deepmd/entrypoints/freeze.py @@ -158,7 +158,7 @@ def _modify_model_suffix(output_graph_def, out_suffix, freeze_type): loss_dict = jdata.pop("loss_dict") if out_suffix in loss_dict: jdata["loss"] = loss_dict[out_suffix] - #learning_rate + # learning_rate if "learning_rate_dict" in jdata: learning_rate_dict = jdata.pop("learning_rate_dict") if out_suffix in learning_rate_dict: diff --git a/deepmd/utils/multi_init.py b/deepmd/utils/multi_init.py index dfa2f0dbdd..108dab103c 100644 --- a/deepmd/utils/multi_init.py +++ b/deepmd/utils/multi_init.py @@ -156,14 +156,13 @@ def replace_model_params_with_frz_multi_model( log.info( f"Add '{config_key}/{fitting_key}' configurations from the pretrained frozen model." ) - + # learning rate dict keep backward compatibility config_key = "learning_rate_dict" single_config_key = "learning_rate" cur_jdata = jdata target_jdata = pretrained_jdata - if ((single_config_key not in cur_jdata) - and (config_key in cur_jdata)): + if (single_config_key not in cur_jdata) and (config_key in cur_jdata): cur_jdata = cur_jdata[config_key] if config_key in target_jdata: target_jdata = target_jdata[config_key] @@ -174,7 +173,7 @@ def replace_model_params_with_frz_multi_model( log.info( f"Add '{config_key}/{fitting_key}' configurations from the pretrained frozen model." 
) - else : + else: for fitting_key in reused_fittings: if fitting_key not in cur_jdata: cur_jdata[fitting_key] = {} From 1334840d9484f27f7791ccd5c37808892c7e1395 Mon Sep 17 00:00:00 2001 From: HuangJiameng <105633685+HuangJiameng@users.noreply.github.com> Date: Mon, 27 Mar 2023 17:01:19 +0800 Subject: [PATCH 27/27] Update test_init_frz_model_multi.py --- source/tests/test_init_frz_model_multi.py | 1 - 1 file changed, 1 deletion(-) diff --git a/source/tests/test_init_frz_model_multi.py b/source/tests/test_init_frz_model_multi.py index 0a1378d2b3..99e2a7b3a6 100644 --- a/source/tests/test_init_frz_model_multi.py +++ b/source/tests/test_init_frz_model_multi.py @@ -99,7 +99,6 @@ def _init_models(): jdata["loss_dict"] = {} jdata["loss_dict"]["water_ener_new"] = loss_config jdata["learning_rate_dict"] = {} - jdata["learning_rate_dict"]["water_ener"] = learning_rate_config jdata["learning_rate_dict"]["water_ener_new"] = learning_rate_config jdata["training"]["data_dict"] = {} jdata["training"]["data_dict"]["water_ener_new"] = {}
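
Taken together, the series lets a multi-task input script carry one learning-rate section per fitting key, with a plain "learning_rate" section accepted as a fallback that is broadcast to every key. A hedged sketch of the resulting user-facing configuration, following the pattern of test_init_frz_model_multi.py (the "water_ener"/"water_ener_new" keys and the numeric values are placeholders, not defaults):

# Minimal excerpt of a multi-task input dict; fitting nets and data_dict are elided.
jdata = {
    "model": {
        "fitting_net_dict": {"water_ener": {}, "water_ener_new": {}},  # elided
    },
    "learning_rate_dict": {
        "water_ener":     {"type": "exp", "start_lr": 1.0e-3, "stop_lr": 1.0e-8, "decay_steps": 5000},
        "water_ener_new": {"type": "exp", "start_lr": 5.0e-4, "stop_lr": 1.0e-8, "decay_steps": 5000},
    },
}
# If "learning_rate_dict" is omitted, a single "learning_rate" section is extended to
# every fitting key during argcheck (normalize_learning_rate_dict_with_single_learning_rate).

When initializing from a frozen multi-task model, missing learning-rate entries for reused fittings are filled in the order user-provided > pretrained model > default, as handled in multi_init.py.
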