diff --git a/README.md b/README.md
index b59725ec9..c833ed059 100644
--- a/README.md
+++ b/README.md
@@ -499,9 +499,8 @@ The bold notation of key (such as **type_map**) means that it's a necessary key
| **use_ele_temp** | int | 0 | Currently only support fp_style vasp. 0(default): no electron temperature. 1: eletron temperature as frame parameter. 2: electron temperature as atom parameter.
| *#Data*
| init_data_prefix | String | "/sharedext4/.../data/" | Prefix of initial data directories
- | ***init_data_sys*** | List of string|["CH4.POSCAR.01x01x01/.../deepmd"] |Directories of initial data. You may use either absolute or relative path here.
+ | ***init_data_sys*** | List of string|["CH4.POSCAR.01x01x01/.../deepmd"] |Directories of initial data. You may use either absolute or relative path here. Systems will be detected recursively in the directories.
| ***sys_format*** | String | "vasp/poscar" | Format of initial data. It will be `vasp/poscar` if not set.
- | init_multi_systems | Boolean | false | If set to `true`, `init_data_sys` directories should contain sub-directories of various systems. DP-GEN will regard all of these sub-directories as inital data systems.
| init_batch_size | String of integer | [8] | Each number is the batch_size of corresponding system for training in `init_data_sys`. One recommended rule for setting the `sys_batch_size` and `init_batch_size` is that `batch_size` mutiply number of atoms ot the stucture should be larger than 32. If set to `auto`, batch size will be 32 divided by number of atoms. |
| sys_configs_prefix | String | "/sharedext4/.../data/" | Prefix of `sys_configs`
| **sys_configs** | List of list of string | [
["/sharedext4/.../POSCAR"],
["....../POSCAR"]
] | Containing directories of structures to be explored in iterations.Wildcard characters are supported here. |
@@ -1086,7 +1085,6 @@ Here is an example of `param.json` for QM7 dataset:
},
"_comment": "that's all"
},
- "use_clusters": true,
"fp_style": "gaussian",
"shuffle_poscar": false,
"fp_task_max": 1000,
@@ -1109,7 +1107,7 @@ Here is an example of `param.json` for QM7 dataset:
}
```
-Here `pick_data` is the data to simplify and currently only supports `MultiSystems` containing `System` with `deepmd/npy` format, and `use_clusters` should always be `true`. `init_pick_number` and `iter_pick_number` are the numbers of picked frames. `e_trust_lo`, `e_trust_hi` mean the range of the deviation of the frame energy, and `f_trust_lo` and `f_trust_hi` mean the range of the max deviation of atomic forces in a frame. `fp_style` can only be `gaussian` currently. Other parameters are as the same as those of generator.
+Here `pick_data` is the directory of the data to simplify, in which the program recursively detects systems `System` with `deepmd/npy` format. `init_pick_number` and `iter_pick_number` are the numbers of picked frames. `e_trust_lo` and `e_trust_hi` mean the range of the deviation of the frame energy, and `f_trust_lo` and `f_trust_hi` mean the range of the max deviation of atomic forces in a frame. `fp_style` can only be `gaussian` currently. Other parameters are the same as those of the generator.
## Set up machine
diff --git a/dpgen/generator/run.py b/dpgen/generator/run.py
index dc7a91d3b..41b5aa299 100644
--- a/dpgen/generator/run.py
+++ b/dpgen/generator/run.py
@@ -60,7 +60,7 @@
from dpgen.generator.lib.ele_temp import NBandsEsti
from dpgen.remote.decide_machine import convert_mdata
from dpgen.dispatcher.Dispatcher import Dispatcher, _split_tasks, make_dispatcher, make_submission
-from dpgen.util import sepline
+from dpgen.util import sepline, expand_sys_str
from dpgen import ROOT_PATH
from pymatgen.io.vasp import Incar,Kpoints,Potcar
from dpgen.auto_test.lib.vasp import make_kspacing_kpoints
@@ -287,13 +287,10 @@ def make_train (iter_index,
# make sure all init_data_sys has the batch size -- for the following `zip`
assert (len(init_data_sys_) <= len(init_batch_size_))
for ii, ss in zip(init_data_sys_, init_batch_size_) :
- if jdata.get('init_multi_systems', False):
- for single_sys in os.listdir(os.path.join(work_path, 'data.init', ii)):
- init_data_sys.append(os.path.join('..', 'data.init', ii, single_sys))
- init_batch_size.append(detect_batch_size(ss, os.path.join(work_path, 'data.init', ii, single_sys)))
- else:
- init_data_sys.append(os.path.join('..', 'data.init', ii))
- init_batch_size.append(detect_batch_size(ss, os.path.join(work_path, 'data.init', ii)))
+ sys_paths = expand_sys_str(os.path.join(init_data_prefix, ii))
+ for single_sys in sys_paths:
+ init_data_sys.append(os.path.normpath(os.path.join('..', 'data.init', ii, os.path.relpath(single_sys, os.path.join(init_data_prefix, ii)))))
+ init_batch_size.append(detect_batch_size(ss, single_sys))
old_range = None
if iter_index > 0 :
for ii in range(iter_index) :
@@ -307,25 +304,16 @@ def make_train (iter_index,
sys_batch_size = ["auto" for aa in range(len(sys_list))]
for jj in fp_data_sys :
sys_idx = int(jj.split('.')[-1])
- if jdata.get('use_clusters', False):
- nframes = 0
- for sys_single in os.listdir(jj):
- tmp_box = np.loadtxt(os.path.join(jj, sys_single, 'box.raw'))
- tmp_box = np.reshape(tmp_box, [-1,9])
- nframes += tmp_box.shape[0]
- if nframes < fp_task_min :
- log_task('nframes (%d) in data sys %s is too small, skip' % (nframes, jj))
- continue
- for sys_single in os.listdir(jj):
- init_data_sys.append(os.path.join('..', 'data.iters', jj, sys_single))
- init_batch_size.append(detect_batch_size(sys_batch_size[sys_idx], os.path.join(jj, sys_single)))
- else:
- nframes = dpdata.System(jj, 'deepmd/npy').get_nframes()
- if nframes < fp_task_min :
- log_task('nframes (%d) in data sys %s is too small, skip' % (nframes, jj))
- continue
- init_data_sys.append(os.path.join('..', 'data.iters', jj))
- init_batch_size.append(detect_batch_size(sys_batch_size[sys_idx], jj))
+ sys_paths = expand_sys_str(jj)
+ nframes = 0
+ for sys_single in sys_paths:
+ nframes += dpdata.LabeledSystem(sys_single, fmt="deepmd/npy").get_nframes()
+ if nframes < fp_task_min :
+ log_task('nframes (%d) in data sys %s is too small, skip' % (nframes, jj))
+ continue
+ for sys_single in sys_paths:
+ init_data_sys.append(os.path.normpath(os.path.join('..', 'data.iters', sys_single)))
+ init_batch_size.append(detect_batch_size(sys_batch_size[sys_idx], sys_single))
# establish tasks
jinput = jdata['default_training_param']
try:
@@ -567,25 +555,17 @@ def run_train (iter_index,
os.chdir(work_path)
fp_data = glob.glob(os.path.join('data.iters', 'iter.*', '02.fp', 'data.*'))
for ii in init_data_sys :
- if jdata.get('init_multi_systems', False):
- for single_sys in os.listdir(os.path.join(ii)):
- trans_comm_data += glob.glob(os.path.join(ii, single_sys, 'set.*'))
- trans_comm_data += glob.glob(os.path.join(ii, single_sys, 'type*.raw'))
- trans_comm_data += glob.glob(os.path.join(ii, single_sys, 'nopbc'))
- else:
- trans_comm_data += glob.glob(os.path.join(ii, 'set.*'))
- trans_comm_data += glob.glob(os.path.join(ii, 'type*.raw'))
- trans_comm_data += glob.glob(os.path.join(ii, 'nopbc'))
+ sys_paths = expand_sys_str(ii)
+ for single_sys in sys_paths:
+ trans_comm_data += glob.glob(os.path.join(single_sys, 'set.*'))
+ trans_comm_data += glob.glob(os.path.join(single_sys, 'type*.raw'))
+ trans_comm_data += glob.glob(os.path.join(single_sys, 'nopbc'))
for ii in fp_data :
- if jdata.get('use_clusters', False):
- for single_sys in os.listdir(os.path.join(ii)):
- trans_comm_data += glob.glob(os.path.join(ii, single_sys, 'set.*'))
- trans_comm_data += glob.glob(os.path.join(ii, single_sys, 'type*.raw'))
- trans_comm_data += glob.glob(os.path.join(ii, single_sys, 'nopbc'))
- else:
- trans_comm_data += glob.glob(os.path.join(ii, 'set.*'))
- trans_comm_data += glob.glob(os.path.join(ii, 'type*.raw'))
- trans_comm_data += glob.glob(os.path.join(ii, 'nopbc'))
+ sys_paths = expand_sys_str(ii)
+ for single_sys in sys_paths:
+ trans_comm_data += glob.glob(os.path.join(single_sys, 'set.*'))
+ trans_comm_data += glob.glob(os.path.join(single_sys, 'type*.raw'))
+ trans_comm_data += glob.glob(os.path.join(single_sys, 'nopbc'))
os.chdir(cwd)
try:
diff --git a/dpgen/simplify/simplify.py b/dpgen/simplify/simplify.py
index 982db3114..529401519 100644
--- a/dpgen/simplify/simplify.py
+++ b/dpgen/simplify/simplify.py
@@ -9,6 +9,7 @@
02: fp (optional, if the original dataset do not have fp data, same as generator)
"""
import logging
+import warnings
import queue
import os
import json
@@ -21,7 +22,7 @@
from dpgen import dlog
from dpgen import SHORT_CMD
-from dpgen.util import sepline
+from dpgen.util import sepline, expand_sys_str
from distutils.version import LooseVersion
from dpgen.dispatcher.Dispatcher import Dispatcher, _split_tasks, make_dispatcher, make_submission
from dpgen.generator.run import make_train, run_train, post_train, run_fp, post_fp, fp_name, model_devi_name, train_name, train_task_fmt, sys_link_fp_vasp_pp, make_fp_vasp_incar, make_fp_vasp_kp, make_fp_vasp_cp_cvasp, data_system_fmt, model_devi_task_fmt, fp_task_fmt
@@ -38,17 +39,6 @@
sys_name_fmt = 'sys.' + data_system_fmt
sys_name_pattern = 'sys.[0-9]*[0-9]'
-def expand_sys_str(root_dir):
- matches = []
- for root, dirnames, filenames in os.walk(root_dir, followlinks=True):
- for filename in fnmatch.filter(filenames, 'type.raw'):
- matches.append(root)
- matches.sort()
- dirnames = [os.path.basename(ii) for ii in matches]
- if (len(list(set(dirnames))) != len(matches)) :
- raise RuntimeError('duplicated system name: it is highly recommend to place all systems in the same level of directory and has different names')
- return matches
-
def get_system_cls(jdata):
if jdata.get("labeled", False):
@@ -58,28 +48,12 @@ def get_system_cls(jdata):
def get_multi_system(path, jdata):
system = get_system_cls(jdata)
+ system_paths = expand_sys_str(path)
systems = dpdata.MultiSystems(
- *[system(os.path.join(path, s), fmt='deepmd/npy') for s in os.listdir(path)])
- return systems
-
-
-def get_systems(path, jdata):
- system_cls = get_system_cls(jdata)
- system_paths = expand_sys_str(path)
- systems = {}
- for ii in system_paths:
- systems[os.path.basename(ii)] = system_cls(ii, fmt='deepmd/npy')
+ *[system(s, fmt='deepmd/npy') for s in system_paths])
return systems
-def get_system_idx(path):
- system_paths = expand_sys_str(path)
- sys_idx_map = {}
- for idx,ii in enumerate(system_paths):
- sys_idx_map[os.path.basename(ii)] = idx
- return sys_idx_map
-
-
def init_model(iter_index, jdata, mdata):
training_init_model = jdata.get('training_init_model', False)
if not training_init_model:
@@ -111,20 +85,13 @@ def init_pick(iter_index, jdata, mdata):
"""pick up init data from dataset randomly"""
pick_data = jdata['pick_data']
init_pick_number = jdata['init_pick_number']
- use_clusters = jdata.get('use_clusters', False)
# use MultiSystems with System
# TODO: support System and LabeledSystem
# TODO: support other format
- if use_clusters:
- systems = get_multi_system(pick_data, jdata)
- else:
- systems = get_systems(pick_data, jdata)
+ systems = get_multi_system(pick_data, jdata)
# label the system
labels = []
- if use_clusters:
- items = systems.systems.items()
- else:
- items = systems.items()
+ items = systems.systems.items()
for key, system in items:
labels.extend([(key, j) for j in range(len(system))])
@@ -146,48 +113,18 @@ def init_pick(iter_index, jdata, mdata):
_init_dump_selected_frames(systems, labels, rest_idx, sys_data_path, jdata)
-def _add_system(systems, key, system):
- if key in systems.keys():
- systems[key].append(system)
- else:
- systems[key] = system
- return systems
-
-
def _init_dump_selected_frames(systems, labels, selc_idx, sys_data_path, jdata):
- pick_data = jdata['pick_data']
- use_clusters = jdata.get('use_clusters', False)
- if use_clusters:
- selc_systems = dpdata.MultiSystems()
- for j in selc_idx:
- sys_name, sys_id = labels[j]
- selc_systems.append(systems[sys_name][sys_id])
- selc_systems.to_deepmd_raw(sys_data_path)
- selc_systems.to_deepmd_npy(sys_data_path, set_size=selc_idx.size)
- else:
- selc_systems = {}
- for j in selc_idx:
- sys_name, sys_id = labels[j]
- selc_systems = _add_system(selc_systems, sys_name, systems[sys_name][sys_id])
- sys_idx_map = get_system_idx(pick_data)
- for kk in selc_systems.keys():
- sub_path = os.path.join(sys_data_path, sys_name_fmt % sys_idx_map[kk])
- selc_systems[kk].to_deepmd_raw(sub_path)
- selc_systems[kk].to_deepmd_npy(sub_path, set_size=selc_idx.size)
- with open(os.path.join(sys_data_path, 'sys_idx_map.json'), 'w') as fp:
- json.dump(sys_idx_map, fp, indent=4)
-
-def _dump_system_dict(systems, path):
- for kk in systems:
- sub_path = os.path.join(path, sys_name_fmt % (int(kk)))
- systems[kk].to_deepmd_raw(sub_path)
- systems[kk].to_deepmd_npy(sub_path, set_size=systems[kk].get_nframes())
+ selc_systems = dpdata.MultiSystems()
+ for j in selc_idx:
+ sys_name, sys_id = labels[j]
+ selc_systems.append(systems[sys_name][sys_id])
+ selc_systems.to_deepmd_raw(sys_data_path)
+ selc_systems.to_deepmd_npy(sys_data_path, set_size=selc_idx.size)
def make_model_devi(iter_index, jdata, mdata):
"""calculate the model deviation of the rest idx"""
pick_data = jdata['pick_data']
- use_clusters = jdata.get('use_clusters', False)
iter_name = make_iter_name(iter_index)
work_path = os.path.join(iter_name, model_devi_name)
create_path(work_path)
@@ -203,25 +140,7 @@ def make_model_devi(iter_index, jdata, mdata):
rest_data_path = os.path.join(last_iter_name, model_devi_name, rest_data_name)
if not os.path.exists(rest_data_path):
return False
- if use_clusters:
- for jj, subsystem in enumerate(os.listdir(rest_data_path)):
- task_name = "task." + model_devi_task_fmt % (0, jj)
- task_path = os.path.join(work_path, task_name)
- create_path(task_path)
- os.symlink(os.path.abspath(os.path.join(rest_data_path, subsystem)),
- os.path.abspath(os.path.join(task_path, rest_data_name)))
- else:
- rest_data_path = os.path.abspath(rest_data_path)
- sys_path = glob.glob(os.path.join(rest_data_path, sys_name_pattern))
- cwd = os.getcwd()
- for ii in sys_path:
- task_name = "task." + model_devi_task_fmt % (int(os.path.basename(ii).split('.')[1]), 0)
- task_path = os.path.join(work_path, task_name)
- create_path(task_path)
- os.chdir(task_path)
- os.symlink(os.path.relpath(ii), rest_data_name)
- os.chdir(cwd)
- os.chdir(cwd)
+ os.symlink(os.path.abspath(rest_data_path), os.path.join(work_path, rest_data_name + ".old"))
return True
@@ -231,43 +150,28 @@ def run_model_devi(iter_index, jdata, mdata):
work_path = os.path.join(iter_name, model_devi_name)
# generate command
commands = []
- tasks = glob.glob(os.path.join(work_path, "task.*"))
- run_tasks = [os.path.basename(ii) for ii in tasks]
+ run_tasks = ["."]
# get models
models = glob.glob(os.path.join(work_path, "graph*pb"))
model_names = [os.path.basename(ii) for ii in models]
task_model_list = []
for ii in model_names:
- task_model_list.append(os.path.join('..', ii))
- # get max data size
- data_size = max([len(dpdata.System(os.path.join(
- task, rest_data_name), fmt="deepmd/npy")) for task in tasks])
+ task_model_list.append(os.path.join('.', ii))
# models
commands = []
- detail_file_names = []
- for ii, mm in enumerate(task_model_list):
- detail_file_name = "{prefix}-{ii}".format(
- prefix=detail_file_name_prefix,
- ii=ii,
- )
- # TODO: support 0.x?
- command = "{python} -m deepmd test -m {model} -s {system} -n {numb_test} -d {detail_file}".format(
- python=mdata['python_test_path'],
- model=mm,
- system=rest_data_name,
- numb_test=data_size,
- detail_file=detail_file_name,
- )
- commands.append(command)
- detail_file_names.append(detail_file_name)
+ detail_file_name = detail_file_name_prefix
+ command = "{dp} model-devi -m {model} -s {system} -o {detail_file}".format(
+ dp=mdata.get('model_devi_command', 'dp'),
+ model=" ".join(task_model_list),
+ system=rest_data_name + ".old",
+ detail_file=detail_file_name,
+ )
+ commands = [command]
# submit
- try:
- model_devi_group_size = mdata['model_devi_group_size']
- except Exception:
- model_devi_group_size = 1
+ model_devi_group_size = mdata.get('model_devi_group_size', 1)
- forward_files = [rest_data_name]
- backward_files = sum([[pf+".e.out", pf+".f.out", pf+".v.out"] for pf in detail_file_names], [])
+ forward_files = [rest_data_name + ".old"]
+ backward_files = [detail_file_name]
api_version = mdata.get('api_version', '0.9')
if LooseVersion(api_version) < LooseVersion('1.0'):
@@ -303,102 +207,50 @@ def run_model_devi(iter_index, jdata, mdata):
def post_model_devi(iter_index, jdata, mdata):
"""calculate the model deviation"""
- use_clusters = jdata.get('use_clusters', False)
iter_name = make_iter_name(iter_index)
work_path = os.path.join(iter_name, model_devi_name)
- tasks = glob.glob(os.path.join(work_path, "task.*"))
- tasks.sort()
-
- e_trust_lo = jdata['e_trust_lo']
- e_trust_hi = jdata['e_trust_hi']
- f_trust_lo = jdata['f_trust_lo']
- f_trust_hi = jdata['f_trust_hi']
-
- if use_clusters:
- sys_accurate = dpdata.MultiSystems()
- sys_candinate = dpdata.MultiSystems()
- sys_failed = dpdata.MultiSystems()
- else:
- sys_accurate = {}
- sys_candinate = {}
- sys_failed = {}
- all_names = set()
-
- for task in tasks:
- if not use_clusters:
- sys_name = os.path.basename(task).split('.')[1]
- all_names.add(sys_name)
- # e.out
- details_e = glob.glob(os.path.join(task, "{}-*.e.out".format(detail_file_name_prefix)))
- e_all = np.array([np.loadtxt(detail_e, ndmin=2)[:, 1] for detail_e in details_e])
- e_std = np.std(e_all, axis=0)
- n_frame = e_std.size
-
- # f.out
- details_f = glob.glob(os.path.join(task, "{}-*.f.out".format(detail_file_name_prefix)))
- f_all = np.array([np.loadtxt(detail_f, ndmin=2)[:, 3:6].reshape((n_frame, -1, 3)) for detail_f in details_f])
- # (n_model, n_frame, n_atom, 3)
- f_std = np.std(f_all, axis=0)
- # (n_frame, n_atom, 3)
- f_std = np.linalg.norm(f_std, axis=2)
- # (n_frame, n_atom)
- f_std = np.max(f_std, axis=1)
- # (n_frame,)
-
- system_cls = get_system_cls(jdata)
- for subsys, e_devi, f_devi in zip(system_cls(os.path.join(task, rest_data_name), fmt='deepmd/npy'), e_std, f_std):
- if (e_devi < e_trust_hi and e_devi >= e_trust_lo) or (f_devi < f_trust_hi and f_devi >= f_trust_lo) :
- if use_clusters:
+
+ f_trust_lo = jdata['model_devi_f_trust_lo']
+ f_trust_hi = jdata['model_devi_f_trust_hi']
+
+ sys_accurate = dpdata.MultiSystems()
+ sys_candinate = dpdata.MultiSystems()
+ sys_failed = dpdata.MultiSystems()
+
+ sys_entire = dpdata.MultiSystems().from_deepmd_npy(os.path.join(work_path, rest_data_name + ".old"))
+
+ detail_file_name = detail_file_name_prefix
+ with open(os.path.join(work_path, detail_file_name)) as f:
+ for line in f:
+ if line.startswith("# data.rest.old"):
+ name = (line.split()[1]).split("/")[-1]
+ elif line.startswith("#"):
+ pass
+ else:
+ idx = int(line.split()[0])
+ f_devi = float(line.split()[4])
+ subsys = sys_entire[name][idx]
+ if f_trust_lo <= f_devi < f_trust_hi:
sys_candinate.append(subsys)
- else:
- sys_candinate = _add_system(sys_candinate, sys_name, subsys)
- elif (e_devi >= e_trust_hi ) or (f_devi >= f_trust_hi ):
- if use_clusters:
+ elif f_devi >= f_trust_hi:
sys_failed.append(subsys)
- else:
- sys_failed = _add_system(sys_failed, sys_name, subsys)
- elif (e_devi < e_trust_lo and f_devi < f_trust_lo ):
- if use_clusters:
+ elif f_devi < f_trust_lo:
sys_accurate.append(subsys)
else:
- sys_accurate = _add_system(sys_accurate, sys_name, subsys)
- else:
- raise RuntimeError('reach a place that should NOT be reached...')
- if use_clusters:
- counter = {"candidate": sys_candinate.get_nframes(), "accurate": sys_accurate.get_nframes(), "failed": sys_failed.get_nframes()}
- fp_sum = sum(counter.values())
- for cc_key, cc_value in counter.items():
- dlog.info("{0:9s} : {1:6d} in {2:6d} {3:6.2f} %".format(cc_key, cc_value, fp_sum, cc_value/fp_sum*100))
- else:
- all_names = list(all_names)
- all_names.sort()
- counter = {"candidate": 0, "accurate": 0, "failed": 0}
- for kk in all_names:
- sys_counter = {"candidate": 0, "accurate": 0, "failed": 0}
- if kk in sys_candinate.keys():
- sys_counter['candidate'] += sys_candinate[kk].get_nframes()
- if kk in sys_accurate.keys():
- sys_counter['accurate'] += sys_accurate[kk].get_nframes()
- if kk in sys_failed.keys():
- sys_counter['failed'] += sys_failed[kk].get_nframes()
- fp_sum = sum(sys_counter.values())
- for cc_key, cc_value in sys_counter.items():
- if fp_sum != 0:
- dlog.info("sys{0:s} {1:9s} : {2:6d} in {3:6d} {4:6.2f} %".format(kk, cc_key, cc_value, fp_sum, cc_value/fp_sum*100))
- else:
- dlog.info("sys{0:s} {1:9s} : {2:6d} in {3:6d} {4:6.2f} %".format(kk, cc_key, cc_value, fp_sum, 0*100))
- for ii in ['candidate', 'accurate', 'failed']:
- counter[ii] += sys_counter[ii]
+ raise RuntimeError('reach a place that should NOT be reached...')
+
+ counter = {"candidate": sys_candinate.get_nframes(), "accurate": sys_accurate.get_nframes(), "failed": sys_failed.get_nframes()}
+ fp_sum = sum(counter.values())
+ for cc_key, cc_value in counter.items():
+ dlog.info("{0:9s} : {1:6d} in {2:6d} {3:6.2f} %".format(cc_key, cc_value, fp_sum, cc_value/fp_sum*100))
if counter['candidate'] == 0 and counter['failed'] > 0:
raise RuntimeError('no candidate but still have failed cases, stop. You may want to refine the training or to increase the trust level hi')
# label the candidate system
labels = []
- if use_clusters:
- items = sys_candinate.systems.items()
- else:
- items = sys_candinate.items()
+ items = sys_candinate.systems.items()
+
for key, system in items:
labels.extend([(key, j) for j in range(len(system))])
# candinate: pick up randomly
@@ -412,112 +264,61 @@ def post_model_devi(iter_index, jdata, mdata):
(counter['candidate'], len(pick_idx), float(len(pick_idx))/counter['candidate']*100., len(rest_idx), float(len(rest_idx))/counter['candidate']*100.))
# dump the picked candinate data
- if use_clusters:
- picked_systems = dpdata.MultiSystems()
- for j in pick_idx:
- sys_name, sys_id = labels[j]
- picked_systems.append(sys_candinate[sys_name][sys_id])
- sys_data_path = os.path.join(work_path, picked_data_name)
- picked_systems.to_deepmd_raw(sys_data_path)
- picked_systems.to_deepmd_npy(sys_data_path, set_size=iter_pick_number)
- else:
- selc_systems = {}
- for j in pick_idx:
- sys_name, sys_id = labels[j]
- selc_systems = _add_system(selc_systems, sys_name, sys_candinate[sys_name][sys_id])
- sys_data_path = os.path.join(work_path, picked_data_name)
- _dump_system_dict(selc_systems, sys_data_path)
+ picked_systems = dpdata.MultiSystems()
+ for j in pick_idx:
+ sys_name, sys_id = labels[j]
+ picked_systems.append(sys_candinate[sys_name][sys_id])
+ sys_data_path = os.path.join(work_path, picked_data_name)
+ picked_systems.to_deepmd_raw(sys_data_path)
+ picked_systems.to_deepmd_npy(sys_data_path, set_size=iter_pick_number)
+
# dump the rest data (not picked candinate data and failed data)
- if use_clusters:
- rest_systems = dpdata.MultiSystems()
- for j in rest_idx:
- sys_name, sys_id = labels[j]
- rest_systems.append(sys_candinate[sys_name][sys_id])
- rest_systems += sys_failed
- sys_data_path = os.path.join(work_path, rest_data_name)
- rest_systems.to_deepmd_raw(sys_data_path)
+ rest_systems = dpdata.MultiSystems()
+ for j in rest_idx:
+ sys_name, sys_id = labels[j]
+ rest_systems.append(sys_candinate[sys_name][sys_id])
+ rest_systems += sys_failed
+ sys_data_path = os.path.join(work_path, rest_data_name)
+ rest_systems.to_deepmd_raw(sys_data_path)
+ if rest_idx.size:
rest_systems.to_deepmd_npy(sys_data_path, set_size=rest_idx.size)
- else:
- selc_systems = {}
- for j in rest_idx:
- sys_name, sys_id = labels[j]
- selc_systems = _add_system(selc_systems, sys_name, sys_candinate[sys_name][sys_id])
- for kk in sys_failed.keys():
- selc_systems = _add_system(selc_systems, kk, sys_failed[kk])
- sys_data_path = os.path.join(work_path, rest_data_name)
- _dump_system_dict(selc_systems, sys_data_path)
+
# dump the accurate data -- to another directory
- if use_clusters:
- sys_data_path = os.path.join(work_path, accurate_data_name)
- sys_accurate.to_deepmd_raw(sys_data_path)
- sys_accurate.to_deepmd_npy(sys_data_path, set_size=sys_accurate.get_nframes())
- else:
- sys_data_path = os.path.join(work_path, accurate_data_name)
- _dump_system_dict(sys_accurate, sys_data_path)
+ sys_data_path = os.path.join(work_path, accurate_data_name)
+ sys_accurate.to_deepmd_raw(sys_data_path)
+ sys_accurate.to_deepmd_npy(sys_data_path, set_size=sys_accurate.get_nframes())
def make_fp_labeled(iter_index, jdata):
dlog.info("already labeled, skip make_fp and link data directly")
pick_data = jdata['pick_data']
- use_clusters = jdata.get('use_clusters', False)
iter_name = make_iter_name(iter_index)
work_path = os.path.join(iter_name, fp_name)
create_path(work_path)
picked_data_path = os.path.join(iter_name, model_devi_name, picked_data_name)
- if use_clusters:
- os.symlink(os.path.abspath(picked_data_path), os.path.abspath(
- os.path.join(work_path, "task." + data_system_fmt % 0)))
- os.symlink(os.path.abspath(picked_data_path), os.path.abspath(
- os.path.join(work_path, "data." + data_system_fmt % 0)))
- else:
- picked_data_path = os.path.abspath(picked_data_path)
- sys_path = glob.glob(os.path.join(picked_data_path, sys_name_pattern))
- cwd = os.getcwd()
- os.chdir(work_path)
- for ii in sys_path:
- sys_idx = os.path.basename(ii).split('.')[1]
- data_dir = 'data.' + data_system_fmt % int(sys_idx)
- task_dir = 'task.' + data_system_fmt % int(sys_idx)
- os.symlink(os.path.relpath(ii), data_dir)
- os.symlink(os.path.relpath(ii), task_dir)
- os.chdir(cwd)
+ os.symlink(os.path.abspath(picked_data_path), os.path.abspath(
+ os.path.join(work_path, "task." + data_system_fmt % 0)))
+ os.symlink(os.path.abspath(picked_data_path), os.path.abspath(
+ os.path.join(work_path, "data." + data_system_fmt % 0)))
def make_fp_configs(iter_index, jdata):
pick_data = jdata['pick_data']
- use_clusters = jdata.get('use_clusters', False)
iter_name = make_iter_name(iter_index)
work_path = os.path.join(iter_name, fp_name)
create_path(work_path)
picked_data_path = os.path.join(iter_name, model_devi_name, picked_data_name)
- if use_clusters:
- systems = get_multi_system(picked_data_path, jdata)
- jj = 0
- for system in systems:
- for subsys in system:
- task_name = "task." + fp_task_fmt % (0, jj)
- task_path = os.path.join(work_path, task_name)
- create_path(task_path)
- subsys.to('vasp/poscar', os.path.join(task_path, 'POSCAR'))
- jj += 1
- else:
- picked_data_path = os.path.abspath(picked_data_path)
- sys_path = glob.glob(os.path.join(picked_data_path, sys_name_pattern))
- for ii in sys_path:
- tmp_sys = dpdata.System(ii, fmt = 'deepmd/npy')
- sys_idx = os.path.basename(ii).split('.')[1]
- jj = 0
- for ss in tmp_sys:
- task_name = "task." + fp_task_fmt % (int(sys_idx), jj)
- task_path = os.path.join(work_path, task_name)
- create_path(task_path)
- ss.to('vasp/poscar', os.path.join(task_path, 'POSCAR'))
- job = {}
- with open(os.path.join(task_path, 'job.json'), 'w') as fp:
- json.dump(job, fp, indent=4)
- jj += 1
+ systems = get_multi_system(picked_data_path, jdata)
+ jj = 0
+ for system in systems:
+ for subsys in system:
+ task_name = "task." + fp_task_fmt % (0, jj)
+ task_path = os.path.join(work_path, task_name)
+ create_path(task_path)
+ subsys.to('vasp/poscar', os.path.join(task_path, 'POSCAR'))
+ jj += 1
def make_fp_gaussian(iter_index, jdata):
diff --git a/dpgen/util.py b/dpgen/util.py
index aa805e7e5..9491cdc30 100644
--- a/dpgen/util.py
+++ b/dpgen/util.py
@@ -1,5 +1,7 @@
#!/usr/bin/env python
# coding: utf-8
+from typing import Union, List
+from pathlib import Path
from dpgen import dlog
@@ -25,3 +27,23 @@ def box_center(ch='',fill=' ',sp="|"):
'''
strs=ch.center(Len,fill)
dlog.info(sp+strs[1:len(strs)-1:]+sp)
+
+
+def expand_sys_str(root_dir: Union[str, Path]) -> List[str]:
+ """Recursively iterate over directories taking those that contain `type.raw` file.
+
+ Parameters
+ ----------
+ root_dir : Union[str, Path]
+ starting directory
+
+ Returns
+ -------
+ List[str]
+ list of string pointing to system directories
+ """
+ root_dir = Path(root_dir)
+ matches = [str(d) for d in root_dir.rglob("*") if (d / "type.raw").is_file()]
+ if (root_dir / "type.raw").is_file():
+ matches.append(str(root_dir))
+ return matches