diff --git a/deepmd/descriptor/loc_frame.py b/deepmd/descriptor/loc_frame.py index d8cd4f6366..9038e3f26f 100644 --- a/deepmd/descriptor/loc_frame.py +++ b/deepmd/descriptor/loc_frame.py @@ -6,6 +6,7 @@ from deepmd.env import GLOBAL_NP_FLOAT_PRECISION from deepmd.env import op_module from deepmd.env import default_tf_session_config +from deepmd.utils.sess import run_sess class DescrptLocFrame () : def __init__(self, @@ -327,7 +328,7 @@ def _compute_dstats_sys_nonsmth (self, natoms_vec, mesh) : dd_all \ - = self.sub_sess.run(self.stat_descrpt, + = run_sess(self.sub_sess, self.stat_descrpt, feed_dict = { self.place_holders['coord']: data_coord, self.place_holders['type']: data_atype, diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py index a95f0ca39a..3b5d1d0922 100644 --- a/deepmd/descriptor/se_a.py +++ b/deepmd/descriptor/se_a.py @@ -12,6 +12,7 @@ from deepmd.utils.network import embedding_net, embedding_net_rand_seed_shift from deepmd.utils.tabulate import DeepTabulate from deepmd.utils.type_embed import embed_atom_type +from deepmd.utils.sess import run_sess class DescrptSeA (): @docstring_parameter(list_to_doc(ACTIVATION_FN_DICT.keys()), list_to_doc(PRECISION_DICT.keys())) @@ -491,7 +492,7 @@ def _compute_dstats_sys_smth (self, natoms_vec, mesh) : dd_all \ - = self.sub_sess.run(self.stat_descrpt, + = run_sess(self.sub_sess, self.stat_descrpt, feed_dict = { self.place_holders['coord']: data_coord, self.place_holders['type']: data_atype, diff --git a/deepmd/descriptor/se_a_ef.py b/deepmd/descriptor/se_a_ef.py index bdba470d39..aefd058e39 100644 --- a/deepmd/descriptor/se_a_ef.py +++ b/deepmd/descriptor/se_a_ef.py @@ -4,6 +4,7 @@ from deepmd.env import tf from deepmd.common import add_data_requirement,get_activation_func, get_precision, ACTIVATION_FN_DICT, PRECISION_DICT, docstring_parameter from deepmd.utils.argcheck import list_to_doc +from deepmd.utils.sess import run_sess from deepmd.env import GLOBAL_TF_FLOAT_PRECISION from deepmd.env import 
GLOBAL_NP_FLOAT_PRECISION from deepmd.env import op_module @@ -518,7 +519,7 @@ def _compute_dstats_sys_smth (self, mesh, data_efield) : dd_all \ - = self.sub_sess.run(self.stat_descrpt, + = run_sess(self.sub_sess, self.stat_descrpt, feed_dict = { self.place_holders['coord']: data_coord, self.place_holders['type']: data_atype, diff --git a/deepmd/descriptor/se_r.py b/deepmd/descriptor/se_r.py index 6dad7947fa..40bbc21593 100644 --- a/deepmd/descriptor/se_r.py +++ b/deepmd/descriptor/se_r.py @@ -9,6 +9,7 @@ from deepmd.env import op_module from deepmd.env import default_tf_session_config from deepmd.utils.network import embedding_net, embedding_net_rand_seed_shift +from deepmd.utils.sess import run_sess class DescrptSeR (): @docstring_parameter(list_to_doc(ACTIVATION_FN_DICT.keys()), list_to_doc(PRECISION_DICT.keys())) @@ -401,7 +402,7 @@ def _compute_dstats_sys_se_r (self, natoms_vec, mesh) : dd_all \ - = self.sub_sess.run(self.stat_descrpt, + = run_sess(self.sub_sess, self.stat_descrpt, feed_dict = { self.place_holders['coord']: data_coord, self.place_holders['type']: data_atype, diff --git a/deepmd/descriptor/se_t.py b/deepmd/descriptor/se_t.py index 3206990402..0e264e59f1 100644 --- a/deepmd/descriptor/se_t.py +++ b/deepmd/descriptor/se_t.py @@ -9,6 +9,7 @@ from deepmd.env import op_module from deepmd.env import default_tf_session_config from deepmd.utils.network import embedding_net, embedding_net_rand_seed_shift +from deepmd.utils.sess import run_sess class DescrptSeT (): @docstring_parameter(list_to_doc(ACTIVATION_FN_DICT.keys()), list_to_doc(PRECISION_DICT.keys())) @@ -394,7 +395,7 @@ def _compute_dstats_sys_smth (self, natoms_vec, mesh) : dd_all \ - = self.sub_sess.run(self.stat_descrpt, + = run_sess(self.sub_sess, self.stat_descrpt, feed_dict = { self.place_holders['coord']: data_coord, self.place_holders['type']: data_atype, diff --git a/deepmd/entrypoints/freeze.py b/deepmd/entrypoints/freeze.py index e11ac0c906..b7ed983cd0 100755 --- 
a/deepmd/entrypoints/freeze.py +++ b/deepmd/entrypoints/freeze.py @@ -8,6 +8,7 @@ from deepmd.env import tf from deepmd.env import op_module +from deepmd.utils.sess import run_sess from os.path import abspath # load grad of force module @@ -154,9 +155,9 @@ def freeze( # We start a session and restore the graph weights with tf.Session() as sess: saver.restore(sess, input_checkpoint) - model_type = sess.run("model_attr/model_type:0", feed_dict={}).decode("utf-8") + model_type = run_sess(sess, "model_attr/model_type:0", feed_dict={}).decode("utf-8") if "modifier_attr/type" in nodes: - modifier_type = sess.run("modifier_attr/type:0", feed_dict={}).decode( + modifier_type = run_sess(sess, "modifier_attr/type:0", feed_dict={}).decode( "utf-8" ) else: diff --git a/deepmd/entrypoints/train.py b/deepmd/entrypoints/train.py index 1ff0cff02e..9fa8344a8b 100755 --- a/deepmd/entrypoints/train.py +++ b/deepmd/entrypoints/train.py @@ -18,6 +18,7 @@ from deepmd.utils.argcheck import normalize from deepmd.utils.compat import updata_deepmd_input from deepmd.utils.data_system import DeepmdDataSystem +from deepmd.utils.sess import run_sess if TYPE_CHECKING: from deepmd.run_options import TFServerV1 @@ -74,7 +75,7 @@ def wait_done_queue( """ with tf.Session(server.target) as sess: for i in range(cluster_spec.num_tasks("worker")): - sess.run(queue.dequeue()) + run_sess(sess, queue.dequeue()) log.debug(f"ps:{task_index:d} received done from worker:{i:d}") log.debug(f"ps:{task_index:f} quitting") @@ -127,7 +128,7 @@ def fill_done_queue( """ with tf.Session(server.target) as sess: for i in range(cluster_spec.num_tasks("ps")): - sess.run(done_ops[i]) + run_sess(sess, done_ops[i]) log.debug(f"worker:{task_index:d} sending done to ps:{i:d}") diff --git a/deepmd/infer/data_modifier.py b/deepmd/infer/data_modifier.py index c6fec62564..ea4bf6ade8 100644 --- a/deepmd/infer/data_modifier.py +++ b/deepmd/infer/data_modifier.py @@ -12,6 +12,7 @@ from deepmd.env import global_cvt_2_tf_float from 
deepmd.env import global_cvt_2_ener_float from deepmd.env import op_module +from deepmd.utils.sess import run_sess class DipoleChargeModifier(DeepDipole): @@ -57,7 +58,7 @@ def __init__(self, self.ext_dim = 3 self.t_ndesc = self.graph.get_tensor_by_name(os.path.join(self.modifier_prefix, 'descrpt_attr/ndescrpt:0')) self.t_sela = self.graph.get_tensor_by_name(os.path.join(self.modifier_prefix, 'descrpt_attr/sel:0')) - [self.ndescrpt, self.sel_a] = self.sess.run([self.t_ndesc, self.t_sela]) + [self.ndescrpt, self.sel_a] = run_sess(self.sess, [self.t_ndesc, self.t_sela]) self.sel_r = [ 0 for ii in range(len(self.sel_a)) ] self.nnei_a = np.cumsum(self.sel_a)[-1] self.nnei_r = np.cumsum(self.sel_r)[-1] @@ -335,9 +336,9 @@ def _eval_fv(self, coords, cells, atom_types, ext_f) : feed_dict_test[self.t_box ] = cells.reshape([-1]) feed_dict_test[self.t_mesh ] = default_mesh.reshape([-1]) feed_dict_test[self.t_ef ] = ext_f.reshape([-1]) - # print(self.sess.run(tf.shape(self.t_tensor), feed_dict = feed_dict_test)) + # print(run_sess(self.sess, tf.shape(self.t_tensor), feed_dict = feed_dict_test)) fout, vout, avout \ - = self.sess.run([self.force, self.virial, self.av], + = run_sess(self.sess, [self.force, self.virial, self.av], feed_dict = feed_dict_test) # print('fout: ', fout.shape, fout) fout = self.reverse_map(np.reshape(fout, [nframes,-1,3]), imap) diff --git a/deepmd/infer/deep_eval.py b/deepmd/infer/deep_eval.py index 92a45e3cd4..20d4f8dd7b 100644 --- a/deepmd/infer/deep_eval.py +++ b/deepmd/infer/deep_eval.py @@ -4,6 +4,7 @@ import numpy as np from deepmd.common import make_default_mesh from deepmd.env import default_tf_session_config, tf, MODEL_VERSION +from deepmd.utils.sess import run_sess if TYPE_CHECKING: from pathlib import Path @@ -43,7 +44,7 @@ def model_type(self) -> str: if not self._model_type: t_mt = self._get_tensor("model_attr/model_type:0") sess = tf.Session(graph=self.graph, config=default_tf_session_config) - [mt] = sess.run([t_mt], feed_dict={}) + [mt] 
= run_sess(sess, [t_mt], feed_dict={}) self._model_type = mt.decode("utf-8") return self._model_type @@ -57,7 +58,7 @@ def model_version(self) -> str: try: t_mt = self._get_tensor("model_attr/model_version:0") sess = tf.Session(graph=self.graph, config=default_tf_session_config) - [mt] = sess.run([t_mt], feed_dict={}) + [mt] = run_sess(sess, [t_mt], feed_dict={}) self._model_version = mt.decode("utf-8") except KeyError: # For deepmd-kit version 0.x - 1.x, set model version to 0.0 diff --git a/deepmd/infer/deep_pot.py b/deepmd/infer/deep_pot.py index a8e70d5a72..6ffa12e888 100644 --- a/deepmd/infer/deep_pot.py +++ b/deepmd/infer/deep_pot.py @@ -6,6 +6,7 @@ from deepmd.env import default_tf_session_config, tf from deepmd.infer.data_modifier import DipoleChargeModifier from deepmd.infer.deep_eval import DeepEval +from deepmd.utils.sess import run_sess if TYPE_CHECKING: from pathlib import Path @@ -113,7 +114,7 @@ def __init__( # setup modifier try: t_modifier_type = self._get_tensor("modifier_attr/type:0") - self.modifier_type = self.sess.run(t_modifier_type).decode("UTF-8") + self.modifier_type = run_sess(self.sess, t_modifier_type).decode("UTF-8") except (ValueError, KeyError): self.modifier_type = None @@ -123,13 +124,13 @@ def __init__( t_sys_charge_map = self._get_tensor("modifier_attr/sys_charge_map:0") t_ewald_h = self._get_tensor("modifier_attr/ewald_h:0") t_ewald_beta = self._get_tensor("modifier_attr/ewald_beta:0") - [mdl_name, mdl_charge_map, sys_charge_map, ewald_h, ewald_beta] = self.sess.run([t_mdl_name, t_mdl_charge_map, t_sys_charge_map, t_ewald_h, t_ewald_beta]) + [mdl_name, mdl_charge_map, sys_charge_map, ewald_h, ewald_beta] = run_sess(self.sess, [t_mdl_name, t_mdl_charge_map, t_sys_charge_map, t_ewald_h, t_ewald_beta]) mdl_charge_map = [int(ii) for ii in mdl_charge_map.decode("UTF-8").split()] sys_charge_map = [int(ii) for ii in sys_charge_map.decode("UTF-8").split()] self.dm = DipoleChargeModifier(mdl_name, mdl_charge_map, sys_charge_map, ewald_h 
= ewald_h, ewald_beta = ewald_beta) def _run_default_sess(self): - [self.ntypes, self.rcut, self.dfparam, self.daparam, self.tmap] = self.sess.run( + [self.ntypes, self.rcut, self.dfparam, self.daparam, self.tmap] = run_sess(self.sess, [self.t_ntypes, self.t_rcut, self.t_dfparam, self.t_daparam, self.t_tmap] ) diff --git a/deepmd/infer/deep_tensor.py b/deepmd/infer/deep_tensor.py index 24a7832a32..1a8b62d855 100644 --- a/deepmd/infer/deep_tensor.py +++ b/deepmd/infer/deep_tensor.py @@ -5,6 +5,7 @@ from deepmd.common import make_default_mesh from deepmd.env import default_tf_session_config, tf from deepmd.infer.deep_eval import DeepEval +from deepmd.utils.sess import run_sess if TYPE_CHECKING: from pathlib import Path @@ -63,7 +64,7 @@ def __init__( def _run_default_sess(self): [self.ntypes, self.rcut, self.tmap, self.tselt, self.output_dim] \ - = self.sess.run( + = run_sess(self.sess, [self.t_ntypes, self.t_rcut, self.t_tmap, self.t_sel_type, self.t_ouput_dim] ) diff --git a/deepmd/infer/ewald_recp.py b/deepmd/infer/ewald_recp.py index 68ee06c552..28f4352ba5 100644 --- a/deepmd/infer/ewald_recp.py +++ b/deepmd/infer/ewald_recp.py @@ -10,6 +10,7 @@ from deepmd.env import global_cvt_2_ener_float from deepmd.env import op_module from deepmd.env import default_tf_session_config +from deepmd.utils.sess import run_sess class EwaldRecp () : """ @@ -79,7 +80,7 @@ def eval(self, box = np.reshape(box, [nframes * 9]) [energy, force, virial] \ - = self.sess.run([self.t_energy, self.t_force, self.t_virial], + = run_sess(self.sess, [self.t_energy, self.t_force, self.t_virial], feed_dict = { self.t_coord: coord, self.t_charge: charge, diff --git a/deepmd/loss/ener.py b/deepmd/loss/ener.py index f4979e9b1a..f3edac8800 100644 --- a/deepmd/loss/ener.py +++ b/deepmd/loss/ener.py @@ -4,6 +4,7 @@ from deepmd.env import global_cvt_2_tf_float from deepmd.env import global_cvt_2_ener_float +from deepmd.utils.sess import run_sess class EnerStdLoss () : """ @@ -136,7 +137,7 @@ def 
eval(self, sess, feed_dict, natoms): self.l2_more['l2_atom_ener_loss'], self.l2_more['l2_pref_force_loss'] ] - error, error_e, error_f, error_v, error_ae, error_pf = sess.run(run_data, feed_dict=feed_dict) + error, error_e, error_f, error_v, error_ae, error_pf = run_sess(sess, run_data, feed_dict=feed_dict) results = {"natoms": natoms[0], "rmse": np.sqrt(error)} if self.has_e: results["rmse_e"] = np.sqrt(error_e) / natoms[0] @@ -184,7 +185,7 @@ def print_on_training(self, ] # first train data - train_out = sess.run(run_data, feed_dict=feed_dict_batch) + train_out = run_sess(sess, run_data, feed_dict=feed_dict_batch) error_train, error_e_train, error_f_train, error_v_train, error_ae_train, error_pf_train = train_out # than test data, if tensorboard log writter is present, commpute summary @@ -193,7 +194,7 @@ def print_on_training(self, summary_merged_op = tf.summary.merge([self.l2_loss_summary, self.l2_loss_ener_summary, self.l2_loss_force_summary, self.l2_loss_virial_summary]) run_data.insert(0, summary_merged_op) - test_out = sess.run(run_data, feed_dict=feed_dict_test) + test_out = run_sess(sess, run_data, feed_dict=feed_dict_test) if tb_writer: summary = test_out.pop(0) @@ -297,7 +298,7 @@ def eval(self, sess, feed_dict, natoms): self.l2_more['l2_ener_loss'], self.l2_more['l2_ener_dipole_loss'] ] - error, error_e, error_ed = sess.run(run_data, feed_dict=feed_dict) + error, error_e, error_ed = run_sess(sess, run_data, feed_dict=feed_dict) results = { 'natoms': natoms[0], 'rmse': np.sqrt(error), @@ -330,7 +331,7 @@ def print_on_training(self, ] # first train data - train_out = sess.run(run_data, feed_dict=feed_dict_batch) + train_out = run_sess(sess, run_data, feed_dict=feed_dict_batch) error_train, error_e_train, error_ed_train = train_out # than test data, if tensorboard log writter is present, commpute summary @@ -343,7 +344,7 @@ def print_on_training(self, ]) run_data.insert(0, summary_merged_op) - test_out = sess.run(run_data, feed_dict=feed_dict_test) + 
test_out = run_sess(sess, run_data, feed_dict=feed_dict_test) if tb_writer: summary = test_out.pop(0) diff --git a/deepmd/loss/tensor.py b/deepmd/loss/tensor.py index 72a7046d10..8fbd286ad5 100644 --- a/deepmd/loss/tensor.py +++ b/deepmd/loss/tensor.py @@ -4,6 +4,7 @@ from deepmd.env import global_cvt_2_tf_float from deepmd.env import global_cvt_2_ener_float +from deepmd.utils.sess import run_sess class TensorLoss () : """ @@ -123,7 +124,7 @@ def eval(self, sess, feed_dict, natoms): atoms = natoms[0] run_data = [self.l2_l, self.l2_more['local_loss'], self.l2_more['global_loss']] - error, error_lc, error_gl = sess.run(run_data, feed_dict=feed_dict) + error, error_lc, error_gl = run_sess(sess, run_data, feed_dict=feed_dict) results = {"natoms": atoms, "rmse": np.sqrt(error)} if self.local_weight > 0.0: @@ -166,7 +167,7 @@ def print_on_training(self, summary_list.append(self.l2_loss_global_summary) # first train data - error_train = sess.run(run_data, feed_dict=feed_dict_batch) + error_train = run_sess(sess, run_data, feed_dict=feed_dict_batch) # than test data, if tensorboard log writter is present, commpute summary # and write tensorboard logs @@ -175,7 +176,7 @@ def print_on_training(self, summary_merged_op = tf.summary.merge(summary_list) run_data.insert(0, summary_merged_op) - test_out = sess.run(run_data, feed_dict=feed_dict_test) + test_out = run_sess(sess, run_data, feed_dict=feed_dict_test) if tb_writer: summary = test_out.pop(0) diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index 36fc60f029..7fa29635cf 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -21,6 +21,7 @@ from deepmd.loss import EnerStdLoss, EnerDipoleLoss, TensorLoss from deepmd.utils.learning_rate import LearningRateExp from deepmd.utils.neighbor_stat import NeighborStat +from deepmd.utils.sess import run_sess from deepmd.utils.type_embed import TypeEmbedNet from tensorflow.python.client import timeline @@ -382,21 +383,21 @@ def _init_sess_serial(self) : if 
self.run_opt.init_mode == 'init_from_scratch' : log.info("initialize model from scratch") init_op = tf.global_variables_initializer() - self.sess.run(init_op) + run_sess(self.sess, init_op) fp = open(self.disp_file, "w") fp.close () elif self.run_opt.init_mode == 'init_from_model' : log.info("initialize from model %s" % self.run_opt.init_model) init_op = tf.global_variables_initializer() - self.sess.run(init_op) + run_sess(self.sess, init_op) saver.restore (self.sess, self.run_opt.init_model) - self.sess.run(self.global_step.assign(0)) + run_sess(self.sess, self.global_step.assign(0)) fp = open(self.disp_file, "w") fp.close () elif self.run_opt.init_mode == 'restart' : log.info("restart from model %s" % self.run_opt.restart) init_op = tf.global_variables_initializer() - self.sess.run(init_op) + run_sess(self.sess, init_op) saver.restore (self.sess, self.run_opt.restart) else : raise RuntimeError ("unkown init mode") @@ -462,11 +463,11 @@ def train (self, train_data, valid_data=None) : if self.run_opt.is_chief : fp = open(self.disp_file, "a") - cur_batch = self.sess.run(self.global_step) + cur_batch = run_sess(self.sess, self.global_step) is_first_step = True self.cur_batch = cur_batch log.info("start training at lr %.2e (== %.2e), decay_step %d, decay_rate %f, final lr will be %.2e" % - (self.sess.run(self.learning_rate), + (run_sess(self.sess, self.learning_rate), self.lr.value(cur_batch), self.lr.decay_steps_, self.lr.decay_rate_, @@ -516,15 +517,15 @@ def train (self, train_data, valid_data=None) : # use tensorboard to visualize the training of deepmd-kit # it will takes some extra execution time to generate the tensorboard data if self.tensorboard : - summary, _ = self.sess.run([summary_merged_op, self.train_op], feed_dict=train_feed_dict, + summary, _ = run_sess(self.sess, [summary_merged_op, self.train_op], feed_dict=train_feed_dict, options=prf_options, run_metadata=prf_run_metadata) tb_train_writer.add_summary(summary, cur_batch) else : - 
self.sess.run([self.train_op], feed_dict=train_feed_dict, + run_sess(self.sess, [self.train_op], feed_dict=train_feed_dict, options=prf_options, run_metadata=prf_run_metadata) if self.timing_in_training: toc = time.time() if self.timing_in_training: train_time += toc - tic - cur_batch = self.sess.run(self.global_step) + cur_batch = run_sess(self.sess, self.global_step) self.cur_batch = cur_batch # on-the-fly validation @@ -568,7 +569,7 @@ def get_feed_dict(self, batch, is_training): return feed_dict def get_global_step(self): - return self.sess.run(self.global_step) + return run_sess(self.sess, self.global_step) # def print_head (self) : # depreciated # if self.run_opt.is_chief: @@ -588,7 +589,7 @@ def valid_on_the_fly(self, valid_results = self.get_evaluation_results(valid_batches) cur_batch = self.cur_batch - current_lr = self.sess.run(self.learning_rate) + current_lr = run_sess(self.sess, self.learning_rate) if print_header: self.print_header(fp, train_results, valid_results) self.print_on_training(fp, train_results, valid_results, cur_batch, current_lr) diff --git a/deepmd/utils/neighbor_stat.py b/deepmd/utils/neighbor_stat.py index 98d90bf8d6..11a466faac 100644 --- a/deepmd/utils/neighbor_stat.py +++ b/deepmd/utils/neighbor_stat.py @@ -7,6 +7,7 @@ from deepmd.env import default_tf_session_config from deepmd.env import GLOBAL_NP_FLOAT_PRECISION from deepmd.utils.data_system import DeepmdDataSystem +from deepmd.utils.sess import run_sess log = logging.getLogger(__name__) @@ -73,7 +74,7 @@ def get_stat(self, data_set = data.data_systems[ii]._load_set(jj) for kk in range(np.array(data_set['type']).shape[0]): mn, dt \ - = self.sub_sess.run([self._max_nbor_size, self._min_nbor_dist], + = run_sess(self.sub_sess, [self._max_nbor_size, self._min_nbor_dist], feed_dict = { self.place_holders['coord']: np.array(data_set['coord'])[kk].reshape([-1, data.natoms[ii] * 3]), self.place_holders['type']: np.array(data_set['type'])[kk].reshape([-1, data.natoms[ii]]), diff --git 
a/deepmd/utils/sess.py b/deepmd/utils/sess.py new file mode 100644 index 0000000000..21f1581d35 --- /dev/null +++ b/deepmd/utils/sess.py @@ -0,0 +1,38 @@ +import os + +from deepmd.env import tf + + +def run_sess(sess: tf.Session, *args, **kwargs): + """Run session with errors caught. + + Parameters + ---------- + sess: tf.Session + TensorFlow Session + + Returns + ------- + the result of sess.run() + """ + try: + # https://www.tensorflow.org/api_docs/python/tf/compat/v1/Session#run + return sess.run(*args, **kwargs) + except tf.errors.ResourceExhaustedError as e: + MESSAGE = ( + "Your memory may not be enough, thus an error has been raised " + "above. You need to take the following actions:\n" + "1. Check if the network size of the model is too large.\n" + "2. Check if the batch size of training or testing is too large." + " You can set the training batch size to `auto`.\n" + "3. Check if the number of atoms is too large.\n" + ) + if tf.test.is_built_with_cuda(): + MESSAGE += ( + "4. Check if another program is using the same GPU by " + "executing `nvidia-smi`. 
The usage of GPUs is " + "controlled by `CUDA_VISIBLE_DEVICES` environment " + "variable (current value: %s).\n" % ( + os.getenv("CUDA_VISIBLE_DEVICES", None), + )) + raise RuntimeError(MESSAGE) from e diff --git a/deepmd/utils/tabulate.py b/deepmd/utils/tabulate.py index 944428bf48..6a2f54e33e 100644 --- a/deepmd/utils/tabulate.py +++ b/deepmd/utils/tabulate.py @@ -5,6 +5,7 @@ from typing import Tuple, List from deepmd.env import tf from deepmd.env import op_module +from deepmd.utils.sess import run_sess from tensorflow.python.platform import gfile from tensorflow.python.framework import tensor_util @@ -158,7 +159,7 @@ def _load_sub_graph(self): def _get_tensor_value(self, tensor) : with self.sess.as_default(): - self.sess.run(tensor) + run_sess(self.sess, tensor) value = tensor.eval() return value diff --git a/source/lib/include/gpu_cuda.h b/source/lib/include/gpu_cuda.h index 904ba5fafa..8464a8f46c 100644 --- a/source/lib/include/gpu_cuda.h +++ b/source/lib/include/gpu_cuda.h @@ -9,6 +9,19 @@ inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true) { if (code != cudaSuccess) { fprintf(stderr,"cuda assert: %s %s %d\n", cudaGetErrorString(code), file, line); + if (code == 2) { + // out of memory + // TODO: I have no idea how to throw errors back to Python interface + fprintf(stderr, "Your memory is not enough, thus an error has been raised " \ + "above. You need to take the following actions:\n" \ + "1. Check if the network size of the model is too large.\n" \ + "2. Check if the batch size of training or testing is too large. " \ + "You can set the training batch size to `auto`.\n" \ + "3. Check if the number of atoms is too large.\n" \ + "4. Check if another program is using the same GPU by executing `nvidia-smi`. 
" \ + "The usage of GPUs is controlled by `CUDA_VISIBLE_DEVICES` " \ + "environment variable.\n"); + } if (abort) exit(code); } } @@ -17,6 +30,19 @@ inline void nborAssert(cudaError_t code, const char *file, int line, bool abort=true) { if (code != cudaSuccess) { fprintf(stderr,"cuda assert: %s %s %d\n", "DeePMD-kit:\tillegal nbor list sorting", file, line); + if (code == 2) { + // out of memory + // TODO: I have no idea how to throw errors back to Python interface + fprintf(stderr, "Your memory is not enough, thus an error has been raised " \ + "above. You need to take the following actions:\n" \ + "1. Check if the network size of the model is too large.\n" \ + "2. Check if the batch size of training or testing is too large. " \ + "You can set the training batch size to `auto`.\n" \ + "3. Check if the number of atoms is too large.\n" \ + "4. Check if another program is using the same GPU by executing `nvidia-smi`. " \ + "The usage of GPUs is controlled by `CUDA_VISIBLE_DEVICES` " \ + "environment variable.\n"); + } if (abort) exit(code); } }