diff --git a/deepmd/common.py b/deepmd/common.py index e4613aeabb..60af7b1493 100644 --- a/deepmd/common.py +++ b/deepmd/common.py @@ -21,6 +21,8 @@ from deepmd.env import op_module, tf from deepmd.env import GLOBAL_TF_FLOAT_PRECISION, GLOBAL_NP_FLOAT_PRECISION +from deepmd.utils.sess import run_sess +from deepmd.utils.errors import GraphWithoutTensorError if TYPE_CHECKING: _DICT_VAL = TypeVar("_DICT_VAL") @@ -483,3 +485,38 @@ def get_np_precision(precision: "_PRECISION") -> np.dtype: return np.float64 else: raise RuntimeError(f"{precision} is not a valid precision") + + +def get_tensor_by_name(model_file: str, + tensor_name: str) -> tf.Tensor: + """Load tensor value from the frozen model (model_file). + + Parameters + ---------- + model_file : str + The input frozen model. + tensor_name : str + Indicates which tensor will be loaded from the frozen model. + + Returns + ------- + tf.Tensor + The tensor which was loaded from the frozen model. + + Raises + ------ + GraphWithoutTensorError + If the tensor_name is not within the frozen model. 
+ """ + graph_def = tf.GraphDef() + with open(model_file, "rb") as f: + graph_def.ParseFromString(f.read()) + with tf.Graph().as_default() as graph: + tf.import_graph_def(graph_def, name="") + try: + tensor = graph.get_tensor_by_name(tensor_name + ":0") + except KeyError as e: + raise GraphWithoutTensorError() from e + with tf.Session(graph=graph) as sess: + tensor = run_sess(sess, tensor) + return tensor diff --git a/deepmd/entrypoints/compress.py b/deepmd/entrypoints/compress.py index 6b85999426..b0f0f42729 100644 --- a/deepmd/entrypoints/compress.py +++ b/deepmd/entrypoints/compress.py @@ -4,10 +4,11 @@ import logging from typing import Optional -from deepmd.common import j_loader +from deepmd.env import tf +from deepmd.common import j_loader, get_tensor_by_name, GLOBAL_TF_FLOAT_PRECISION from deepmd.utils.argcheck import normalize from deepmd.utils.compat import updata_deepmd_input -from deepmd.utils.errors import GraphTooLargeError +from deepmd.utils.errors import GraphTooLargeError, GraphWithoutTensorError from .freeze import freeze from .train import train @@ -20,7 +21,6 @@ def compress( *, - INPUT: str, input: str, output: str, extrapolate: int, @@ -42,8 +42,6 @@ Parameters ---------- - INPUT : str - input json/yaml control file input : str frozen model file to compress output : str @@ -63,21 +61,30 @@ log_level : int logging level """ - jdata = j_loader(INPUT) - if "model" not in jdata.keys(): - jdata = updata_deepmd_input(jdata, warning=True, dump="input_v2_compat.json") + try: + t_jdata = get_tensor_by_name(input, 'train_attr/training_script') + t_min_nbor_dist = get_tensor_by_name(input, 'train_attr/min_nbor_dist') + except GraphWithoutTensorError as e: + raise RuntimeError( + "The input frozen model: %s has no training script or min_nbor_dist information, " + "which is not supported by the model compression program. " 
+ "Please consider using the dp convert-from interface to upgrade the model" % input + ) from e + tf.constant(t_min_nbor_dist, + name = 'train_attr/min_nbor_dist', + dtype = GLOBAL_TF_FLOAT_PRECISION) + jdata = json.loads(t_jdata) jdata["model"]["compress"] = {} jdata["model"]["compress"]["type"] = 'se_e2_a' jdata["model"]["compress"]["compress"] = True jdata["model"]["compress"]["model_file"] = input + jdata["model"]["compress"]["min_nbor_dist"] = t_min_nbor_dist jdata["model"]["compress"]["table_config"] = [ extrapolate, step, 10 * step, int(frequency), ] - # be careful here, if one want to refine the model - jdata["training"]["numb_steps"] = jdata["training"]["save_freq"] jdata = normalize(jdata) # check the descriptor info of the input file @@ -90,7 +97,7 @@ def compress( # stage 1: training or refining the model with tabulation log.info("\n\n") - log.info("stage 1: train or refine the model with tabulation") + log.info("stage 1: compress the model") control_file = "compress.json" with open(control_file, "w") as fp: json.dump(jdata, fp, indent=4) @@ -103,6 +110,7 @@ def compress( mpi_log=mpi_log, log_level=log_level, log_path=log_path, + is_compress=True, ) except GraphTooLargeError as e: raise RuntimeError( diff --git a/deepmd/entrypoints/freeze.py b/deepmd/entrypoints/freeze.py index a6206abef4..a8a1294a64 100755 --- a/deepmd/entrypoints/freeze.py +++ b/deepmd/entrypoints/freeze.py @@ -45,6 +45,8 @@ def _make_node_names(model_type: str, modifier_type: Optional[str] = None) -> Li "model_attr/tmap", "model_attr/model_type", "model_attr/model_version", + "train_attr/min_nbor_dist", + "train_attr/training_script", ] if model_type == "ener": diff --git a/deepmd/entrypoints/main.py b/deepmd/entrypoints/main.py index 9557976bf6..d28022ac1f 100644 --- a/deepmd/entrypoints/main.py +++ b/deepmd/entrypoints/main.py @@ -254,11 +254,6 @@ def parse_args(args: Optional[List[str]] = None): help="compress a model", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) - 
parser_compress.add_argument( - "INPUT", - help="The input parameter file in json or yaml format, which should be " - "consistent with the original model parameter file", - ) parser_compress.add_argument( "-i", "--input", diff --git a/deepmd/entrypoints/train.py b/deepmd/entrypoints/train.py index bb0b6d20c0..ba20f5f7e8 100755 --- a/deepmd/entrypoints/train.py +++ b/deepmd/entrypoints/train.py @@ -11,7 +11,7 @@ import numpy as np from deepmd.common import data_requirement, expand_sys_str, j_loader, j_must_have -from deepmd.env import reset_default_tf_session_config +from deepmd.env import tf, reset_default_tf_session_config from deepmd.infer.data_modifier import DipoleChargeModifier from deepmd.train.run_options import BUILD, CITATION, WELCOME, RunOptions from deepmd.train.trainer import DPTrainer @@ -35,6 +35,7 @@ def train( mpi_log: str, log_level: int, log_path: Optional[str], + is_compress: bool = False, **kwargs, ): """Run DeePMD model training. @@ -55,6 +56,8 @@ def train( logging level defined by int 0-3 log_path : Optional[str] logging file path or None if logs are to be output only to stdout + is_compress: Bool + indicates whether in the model compress mode Raises ------ @@ -68,11 +71,15 @@ def train( jdata = normalize(jdata) - jdata = update_sel(jdata) + if is_compress == False: + jdata = update_sel(jdata) with open(output, "w") as fp: json.dump(jdata, fp, indent=4) + # save the training script into the graph + tf.constant(json.dumps(jdata), name='train_attr/training_script', dtype=tf.string) + # run options run_opt = RunOptions( init_model=init_model, @@ -86,10 +93,10 @@ def train( log.info(message) run_opt.print_resource_summary() - _do_work(jdata, run_opt) + _do_work(jdata, run_opt, is_compress) -def _do_work(jdata: Dict[str, Any], run_opt: RunOptions): +def _do_work(jdata: Dict[str, Any], run_opt: RunOptions, is_compress: bool = False): """Run serial model training. 
Parameters @@ -98,6 +105,8 @@ def _do_work(jdata: Dict[str, Any], run_opt: RunOptions): arguments read form json/yaml control file run_opt : RunOptions object with run configuration + is_compress : Bool + indicates whether in model compress mode Raises ------ @@ -112,7 +121,7 @@ def _do_work(jdata: Dict[str, Any], run_opt: RunOptions): reset_default_tf_session_config(cpu_only=True) # init the model - model = DPTrainer(jdata, run_opt=run_opt) + model = DPTrainer(jdata, run_opt=run_opt, is_compress = is_compress) rcut = model.model.get_rcut() type_map = model.model.get_type_map() if len(type_map) == 0: @@ -129,25 +138,31 @@ def _do_work(jdata: Dict[str, Any], run_opt: RunOptions): # setup data modifier modifier = get_modifier(jdata["model"].get("modifier", None)) - # init data - train_data = get_data(jdata["training"]["training_data"], rcut, ipt_type_map, modifier) - train_data.print_summary("training") - if jdata["training"].get("validation_data", None) is not None: - valid_data = get_data(jdata["training"]["validation_data"], rcut, ipt_type_map, modifier) - valid_data.print_summary("validation") - else: - valid_data = None + # decouple the training data from the model compress process + train_data = None + valid_data = None + if is_compress == False: + # init data + train_data = get_data(jdata["training"]["training_data"], rcut, ipt_type_map, modifier) + train_data.print_summary("training") + if jdata["training"].get("validation_data", None) is not None: + valid_data = get_data(jdata["training"]["validation_data"], rcut, ipt_type_map, modifier) + valid_data.print_summary("validation") # get training info stop_batch = j_must_have(jdata["training"], "numb_steps") model.build(train_data, stop_batch) - # train the model with the provided systems in a cyclic way - start_time = time.time() - model.train(train_data, valid_data) - end_time = time.time() - log.info("finished training") - log.info(f"wall time: {(end_time - start_time):.3f} s") + if is_compress == False: + # 
train the model with the provided systems in a cyclic way + start_time = time.time() + model.train(train_data, valid_data) + end_time = time.time() + log.info("finished training") + log.info(f"wall time: {(end_time - start_time):.3f} s") + else: + model.save_compressed() + log.info("finished compressing") def get_data(jdata: Dict[str, Any], rcut, type_map, modifier): diff --git a/deepmd/train/trainer.py b/deepmd/train/trainer.py index 4526c2d469..f6ea8aa8a6 100644 --- a/deepmd/train/trainer.py +++ b/deepmd/train/trainer.py @@ -32,7 +32,7 @@ # load grad of force module import deepmd.op -from deepmd.common import j_must_have, ClassArg +from deepmd.common import j_must_have, ClassArg, data_requirement log = logging.getLogger(__name__) @@ -79,9 +79,11 @@ def _generate_descrpt_from_param_dict(descrpt_param): class DPTrainer (object): def __init__(self, jdata, - run_opt): + run_opt, + is_compress = False): self.run_opt = run_opt self._init_param(jdata) + self.is_compress = is_compress def _init_param(self, jdata): # model config @@ -280,33 +282,38 @@ def _init_param(self, jdata): def build (self, - data, + data = None, stop_batch = 0) : self.ntypes = self.model.get_ntypes() - # Usually, the type number of the model should be equal to that of the data - # However, nt_model > nt_data should be allowed, since users may only want to - # train using a dataset that only have some of elements - assert (self.ntypes >= data.get_ntypes()), "ntypes should match that found in data" self.stop_batch = stop_batch - self.batch_size = data.get_batch_size() - if self.numb_fparam > 0 : log.info("training with %d frame parameter(s)" % self.numb_fparam) else: log.info("training without frame parameter") - self.type_map = data.get_type_map() - - self.model.data_stat(data) + if self.is_compress == False: + # Usually, the type number of the model should be equal to that of the data + # However, nt_model > nt_data should be allowed, since users may only want to + # train using a dataset that 
only have some of elements + assert (self.ntypes >= data.get_ntypes()), "ntypes should match that found in data" + self.type_map = data.get_type_map() + self.batch_size = data.get_batch_size() + self.model.data_stat(data) - if 'compress' in self.model_param and self.model_param['compress']['compress']: - assert 'rcut' in self.descrpt_param, "Error: descriptor must have attr rcut!" self.neighbor_stat \ = NeighborStat(self.ntypes, self.descrpt_param['rcut']) self.min_nbor_dist, self.max_nbor_size \ = self.neighbor_stat.get_stat(data) - self.descrpt.enable_compression(self.min_nbor_dist, self.model_param['compress']['model_file'], self.model_param['compress']['table_config'][0], self.model_param['compress']['table_config'][1], self.model_param['compress']['table_config'][2], self.model_param['compress']['table_config'][3]) + tf.constant(self.min_nbor_dist, + name = 'train_attr/min_nbor_dist', + dtype = GLOBAL_TF_FLOAT_PRECISION) + tf.constant(self.max_nbor_size, + name = 'train_attr/max_nbor_size', + dtype = GLOBAL_TF_FLOAT_PRECISION) + else : + assert 'rcut' in self.descrpt_param, "Error: descriptor must have attr rcut!" 
+ self.descrpt.enable_compression(self.model_param['compress']["min_nbor_dist"], self.model_param['compress']['model_file'], self.model_param['compress']['table_config'][0], self.model_param['compress']['table_config'][1], self.model_param['compress']['table_config'][2], self.model_param['compress']['table_config'][3]) self._build_lr() self._build_network(data) @@ -321,15 +328,12 @@ def _build_lr(self): def _build_network(self, data): self.place_holders = {} - data_dict = data.get_data_dict() - for kk in data_dict.keys(): - if kk == 'type': - continue - prec = GLOBAL_TF_FLOAT_PRECISION - if data_dict[kk]['high_prec'] : - prec = GLOBAL_ENER_FLOAT_PRECISION - self.place_holders[kk] = tf.placeholder(prec, [None], name = 't_' + kk) - self.place_holders['find_'+kk] = tf.placeholder(tf.float32, name = 't_find_' + kk) + if self.is_compress : + for kk in ['coord', 'box']: + self.place_holders[kk] = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None], 't_' + kk) + self._get_place_horders(data_requirement) + else : + self._get_place_horders(data.get_data_dict()) self.place_holders['type'] = tf.placeholder(tf.int32, [None], name='t_type') self.place_holders['natoms_vec'] = tf.placeholder(tf.int32, [self.ntypes+2], name='t_natoms') @@ -412,7 +416,7 @@ def _init_session(self): log.info('receive global variables from task#0') run_sess(self.sess, bcast_op) - def train (self, train_data, valid_data=None) : + def train (self, train_data = None, valid_data=None) : # if valid_data is None: # no validation set specified. # valid_data = train_data # using training set as validation set. 
@@ -466,6 +470,7 @@ def train (self, train_data, valid_data=None) : tb_valid_writer = None train_time = 0 + while cur_batch < stop_batch : # first round validation: @@ -507,7 +512,7 @@ def train (self, train_data, valid_data=None) : train_time = 0 if self.save_freq > 0 and cur_batch % self.save_freq == 0 and self.saver is not None: try: - self.saver.save (self.sess, os.getcwd() + "/" + self.save_ckpt) + self.saver.save (self.sess, os.path.join(os.getcwd(), self.save_ckpt)) except google.protobuf.message.DecodeError as e: raise GraphTooLargeError( "The graph size exceeds 2 GB, the hard limitation of protobuf." @@ -617,3 +622,21 @@ def get_evaluation_results(self, batch_list): sum_results[k] = sum_results.get(k, 0.) + v * results["natoms"] avg_results = {k: v / sum_natoms for k, v in sum_results.items() if not k == "natoms"} return avg_results + + def save_compressed(self): + """ + Save the compressed graph + """ + self._init_session() + if self.is_compress: + self.saver.save (self.sess, os.path.join(os.getcwd(), self.save_ckpt)) + + def _get_place_horders(self, data_dict): + for kk in data_dict.keys(): + if kk == 'type': + continue + prec = GLOBAL_TF_FLOAT_PRECISION + if data_dict[kk]['high_prec'] : + prec = GLOBAL_ENER_FLOAT_PRECISION + self.place_holders[kk] = tf.placeholder(prec, [None], name = 't_' + kk) + self.place_holders['find_' + kk] = tf.placeholder(tf.float32, name = 't_find_' + kk) diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index 3f75b39394..20b7667b1d 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -332,14 +332,16 @@ def modifier_variant_type_args(): # --- model compression configurations: --- # def model_compression(): - doc_compress = "The name of the frozen model file." + doc_compress = f"The name of the frozen model file." doc_model_file = f"The input model file, which will be compressed by the DeePMD-kit." 
doc_table_config = f"The arguments of model compression, including extrapolate(scale of model extrapolation), stride(uniform stride of tabulation's first and second table), and frequency(frequency of tabulation overflow check)." + doc_min_nbor_dist = f"The nearest distance between neighbor atoms saved in the frozen model." return [ - Argument("compress", bool, optional = False, default = True, doc = doc_compress), - Argument("model_file", str, optional = False, default = 'frozen_model.pb', doc = doc_model_file), - Argument("table_config", list, optional = False, default = [5, 0.01, 0.1, -1], doc = doc_table_config), + Argument("compress", bool, optional = False, doc = doc_compress), + Argument("model_file", str, optional = False, doc = doc_model_file), + Argument("table_config", list, optional = False, doc = doc_table_config), + Argument("min_nbor_dist", float, optional = False, doc = doc_min_nbor_dist), ] # --- model compression configurations: --- # diff --git a/deepmd/utils/errors.py b/deepmd/utils/errors.py index d7b62383f1..4a6617c055 100644 --- a/deepmd/utils/errors.py +++ b/deepmd/utils/errors.py @@ -1,2 +1,5 @@ class GraphTooLargeError(Exception): pass + +class GraphWithoutTensorError(Exception): + pass diff --git a/doc/getting-started.md b/doc/getting-started.md index 7b028d7165..0c00588092 100644 --- a/doc/getting-started.md +++ b/doc/getting-started.md @@ -277,22 +277,18 @@ For more details with respect to definition of model deviation and its applicati Once the frozen model is obtained from deepmd-kit, we can get the neural network structure and its parameters (weights, biases, etc.) from the trained model, and compress it in the following way: ```bash -dp compress input.json -i graph.pb -o graph-compress.pb +dp compress -i graph.pb -o graph-compress.pb ``` -where input.json denotes the original training input script, `-i` gives the original frozen model, `-o` gives the compressed model. 
Several other command line options can be passed to `dp compress`, which can be checked with +where `-i` gives the original frozen model, `-o` gives the compressed model. Several other command line options can be passed to `dp compress`, which can be checked with ```bash $ dp compress --help ``` An explanation will be provided ``` -usage: dp compress [-h] [-i INPUT] [-o OUTPUT] [-e EXTRAPOLATE] [-s STRIDE] - [-f FREQUENCY] [-d FOLDER] - INPUT - -positional arguments: - INPUT The input parameter file in json or yaml format, which - should be consistent with the original model parameter - file +usage: dp compress [-h] [-v {DEBUG,3,INFO,2,WARNING,1,ERROR,0}] [-l LOG_PATH] + [-m {master,collect,workers}] [-i INPUT] [-o OUTPUT] + [-s STEP] [-e EXTRAPOLATE] [-f FREQUENCY] + [-c CHECKPOINT_FOLDER] optional arguments: -h, --help show this help message and exit diff --git a/source/tests/test_argument_parser.py b/source/tests/test_argument_parser.py index 615199603a..e26a7d7a42 100644 --- a/source/tests/test_argument_parser.py +++ b/source/tests/test_argument_parser.py @@ -196,7 +196,7 @@ def test_parser_log(self): } for parser in ("config", "transfer", "train", "freeze", "test", "compress"): - if parser in ("compress", "train"): + if parser in ("train",): args = {**{"INPUT": dict(type=str, value="INFILE")}, **ARGS} else: args = ARGS @@ -208,7 +208,7 @@ def test_parser_mpi(self): ARGS = {"--mpi-log": dict(type=str, value="master")} for parser in ("train", "compress"): - if parser in ("train", "compress"): + if parser in ("train",): args = {**{"INPUT": dict(type=str, value="INFILE")}, **ARGS} else: args = ARGS @@ -270,7 +270,6 @@ def test_parser_test(self): def test_parser_compress(self): """Test compress subparser.""" ARGS = { - "INPUT": dict(type=str, value="INFILE"), "--output": dict(type=str, value="OUTFILE"), "--extrapolate": dict(type=int, value=5), "--step": dict(type=float, value=0.1), diff --git a/source/tests/test_model_compression.py 
b/source/tests/test_model_compression.py index d67e209c42..78dfbb26ac 100644 --- a/source/tests/test_model_compression.py +++ b/source/tests/test_model_compression.py @@ -33,7 +33,7 @@ def setUp(self): assert(ret == 0), "DP train error!" ret = os.system("dp freeze -o " + self.frozen_model) assert(ret == 0), "DP freeze error!" - ret = os.system("dp compress " + self.INPUT + " -i " + self.frozen_model + " -o " + self.compressed_model) + ret = os.system("dp compress " + " -i " + self.frozen_model + " -o " + self.compressed_model) assert(ret == 0), "DP model compression error!" self.dp_original = DeepPot(self.frozen_model) @@ -168,7 +168,7 @@ def setUp(self): assert(ret == 0), "DP train error!" ret = os.system("dp freeze -o " + self.frozen_model) assert(ret == 0), "DP freeze error!" - ret = os.system("dp compress " + self.INPUT + " -i " + self.frozen_model + " -o " + self.compressed_model) + ret = os.system("dp compress " + " -i " + self.frozen_model + " -o " + self.compressed_model) assert(ret == 0), "DP model compression error!" self.dp_original = DeepPot(self.frozen_model) @@ -289,7 +289,7 @@ def setUp(self): assert(ret == 0), "DP train error!" ret = os.system("dp freeze -o " + self.frozen_model) assert(ret == 0), "DP freeze error!" - ret = os.system("dp compress " + self.INPUT + " -i " + self.frozen_model + " -o " + self.compressed_model) + ret = os.system("dp compress " + " -i " + self.frozen_model + " -o " + self.compressed_model) assert(ret == 0), "DP model compression error!" self.dp_original = DeepPot(self.frozen_model) @@ -401,7 +401,7 @@ def setUp(self): assert(ret == 0), "DP train error!" ret = os.system("dp freeze -o " + self.frozen_model) assert(ret == 0), "DP freeze error!" - ret = os.system("dp compress " + self.INPUT + " -i " + self.frozen_model + " -o " + self.compressed_model) + ret = os.system("dp compress " + " -i " + self.frozen_model + " -o " + self.compressed_model) assert(ret == 0), "DP model compression error!" 
self.dp_original = DeepPot(self.frozen_model)