Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion deepmd/utils/argcheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -972,7 +972,8 @@ def training_data_args(): # ! added by Ziyao: new specification style for data
- list: the length of which is the same as the {link_sys}. The batch size of each system is given by the elements of the list.\n\n\
- int: all {link_sys} use the same batch size.\n\n\
- string "auto": automatically determines the batch size so that the batch_size times the number of atoms in the system is no less than 32.\n\n\
- string "auto:N": automatically determines the batch size so that the batch_size times the number of atoms in the system is no less than N.'
- string "auto:N": automatically determines the batch size so that the batch_size times the number of atoms in the system is no less than N.\n\n\
- string "mixed:N": the batch data will be sampled from all systems and merged into a mixed system with the batch size N. Only support the se_atten descriptor.'
doc_auto_prob_style = 'Determine the probability of systems automatically. The method is assigned by this key and can be\n\n\
- "prob_uniform" : the probability all the systems are equal, namely 1.0/self.get_nsystems()\n\n\
- "prob_sys_size" : the probability of a system is proportional to the number of batches in the system\n\n\
Expand Down
106 changes: 104 additions & 2 deletions deepmd/utils/data_system.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ def __init__(
# batch size
self.batch_size = batch_size
is_auto_bs = False
self.mixed_systems = False
if isinstance(self.batch_size, int):
self.batch_size = self.batch_size * np.ones(self.nsystems, dtype=int)
elif isinstance(self.batch_size, str):
Expand All @@ -121,9 +122,16 @@ def __init__(
rule = 32
if len(words) == 2:
rule = int(words[1])
self.batch_size = self._make_auto_bs(rule)
elif "mixed" == words[0]:
self.mixed_systems = True
if len(words) == 2:
rule = int(words[1])
else:
raise RuntimeError("batch size must be specified for mixed systems")
self.batch_size = rule * np.ones(self.nsystems, dtype=int)
else:
raise RuntimeError("unknown batch_size rule " + words[0])
self.batch_size = self._make_auto_bs(rule)
elif isinstance(self.batch_size, list):
pass
else:
Expand Down Expand Up @@ -361,7 +369,7 @@ def _get_sys_probs(self, sys_probs, auto_prob_style): # depreciated
prob = self._process_sys_probs(sys_probs)
return prob

def get_batch(self, sys_idx: Optional[int] = None):
def get_batch(self, sys_idx: Optional[int] = None) -> dict:
# batch generation style altered by Ziyao Li:
# one should specify the "sys_prob" and "auto_prob_style" params
# via set_sys_prob() function. The sys_probs this function uses is
Expand All @@ -375,9 +383,36 @@ def get_batch(self, sys_idx: Optional[int] = None):
The index of the system from which the batch is drawn.
If sys_idx is not None, `sys_probs` and `auto_prob_style` are ignored
If sys_idx is None, automatically determine the system according to `sys_probs` or `auto_prob_style`, see the following.
This option does not work for mixed systems.

Returns
-------
dict
The batch data
"""
if not hasattr(self, "default_mesh"):
self._make_default_mesh()
if not self.mixed_systems:
b_data = self.get_batch_standard(sys_idx)
else:
b_data = self.get_batch_mixed()
return b_data

def get_batch_standard(self, sys_idx: Optional[int] = None) -> dict:
"""Get a batch of data from the data systems in the standard way.

Parameters
----------
sys_idx : int
The index of the system from which the batch is drawn.
If sys_idx is not None, `sys_probs` and `auto_prob_style` are ignored
If sys_idx is None, automatically determine the system according to `sys_probs` or `auto_prob_style`, see the following.

Returns
-------
dict
The batch data
"""
if sys_idx is not None:
self.pick_idx = sys_idx
else:
Expand All @@ -390,6 +425,73 @@ def get_batch(self, sys_idx: Optional[int] = None):
b_data["default_mesh"] = self.default_mesh[self.pick_idx]
return b_data

def get_batch_mixed(self) -> dict:
    """Assemble one batch by sampling single frames across all systems.

    Each frame is drawn from a system picked at random according to
    ``self.sys_probs``; the frames are then combined by
    ``self._merge_batch_data``.

    Returns
    -------
    dict
        The merged batch data
    """
    # in mixed mode every entry of self.batch_size carries the same global
    # value, so the first entry is used as the number of frames to draw
    n_frames = self.batch_size[0]
    candidates = np.arange(self.nsystems)
    frames = []
    for _ in range(n_frames):
        # the most recently chosen system stays recorded on the instance
        self.pick_idx = dp_random.choice(candidates, p=self.sys_probs)
        chosen = self.pick_idx
        frame = self.data_systems[chosen].get_batch(1)
        frame["natoms_vec"] = self.natoms_vec[chosen]
        frame["default_mesh"] = self.default_mesh[chosen]
        frames.append(frame)
    return self._merge_batch_data(frames)

def _merge_batch_data(self, batch_data: List[dict]) -> dict:
    """Combine single-frame batches from several systems into one batch.

    Frames may contain different numbers of atoms, so ``type`` and every
    atomic data item are padded up to the largest atom count found in
    ``batch_data`` (``-1`` for types, zeros for data).

    Parameters
    ----------
    batch_data : list of dict
        Per-frame batch dictionaries. Each one is read for ``natoms_vec``,
        ``type`` and ``default_mesh`` in addition to its data items.

    Returns
    -------
    dict
        The merged batch data.
    """
    widest = max(frame["natoms_vec"][0] for frame in batch_data)
    n_frames = len(batch_data)
    head = batch_data[0]
    merged = {}

    # natoms_vec: the first three slots all hold the padded atom count,
    # the remaining slots stay zero
    padded_natoms = np.zeros(2 + self.get_ntypes(), dtype=int)
    padded_natoms[:3] = widest
    merged["natoms_vec"] = padded_natoms

    # real_natoms_vec: one row per frame with that frame's own natoms_vec
    merged["real_natoms_vec"] = np.vstack(
        [frame["natoms_vec"] for frame in batch_data]
    )

    # type: slots beyond a frame's own atoms keep the -1 filler
    padded_types = np.full((n_frames, widest), -1, dtype=int)
    for row, frame in enumerate(batch_data):
        own_types = frame["type"]
        padded_types[row, : own_types.shape[1]] = own_types[0]
    merged["type"] = padded_types

    # default_mesh: element-wise average over the frames
    meshes = [frame["default_mesh"] for frame in batch_data]
    merged["default_mesh"] = np.mean(meshes, axis=0)

    # remaining items, as described by the data dict of system 0
    for key, spec in self.get_data_dict(0).items():
        if key not in head:
            continue
        flag = "find_" + key
        merged[flag] = head[flag]
        if spec["atomic"]:
            # per-atom item: zero-pad each frame to the common width
            width = widest * spec["ndof"] * spec["repeat"]
            padded = np.zeros((n_frames, width), dtype=head[key].dtype)
            for row, frame in enumerate(batch_data):
                values = frame[key]
                padded[row, : values.shape[1]] = values[0]
            merged[key] = padded
        else:
            # per-frame item: stack the frames along the first axis
            merged[key] = np.concatenate(
                [frame[key] for frame in batch_data], axis=0
            )
    return merged

# ! altered by Marián Rynik
def get_test(self, sys_idx: Optional[int] = None, n_test: int = -1): # deprecated
"""Get test data from the data systems.
Expand Down
71 changes: 71 additions & 0 deletions examples/nopbc/mixed/input.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
{
"_comment": " model parameters",
"model": {
"type_map": [
"C",
"H",
"O"
],
"descriptor": {
"type": "se_atten",
"sel": 120,
"rcut_smth": 1.00,
"rcut": 6.00,
"neuron": [
25,
50,
100
],
"resnet_dt": false,
"axis_neuron": 12,
"seed": 1,
"_comment": " that's all"
},
"fitting_net": {
"neuron": [
240,
240,
240
],
"resnet_dt": true,
"seed": 1,
"_comment": " that's all"
},
"_comment": " that's all"
},

"learning_rate": {
"type": "exp",
"decay_steps": 4000,
"start_lr": 0.001,
"stop_lr": 3.51e-8,
"_comment": "that's all"
},

"loss": {
"type": "ener",
"start_pref_e": 0.02,
"limit_pref_e": 1,
"start_pref_f": 1000,
"limit_pref_f": 1,
"start_pref_v": 0,
"limit_pref_v": 0,
"_comment": " that's all"
},

"training": {
"training_data": {
"systems": "../data/",
"batch_size": "mixed:4",
"_comment": "that's all"
},
"numb_steps": 4000000,
"seed": 10,
"disp_file": "lcurve.out",
"disp_freq": 100,
"save_freq": 1000,
"_comment": "that's all"
},

"_comment": "that's all"
}
24 changes: 24 additions & 0 deletions source/tests/test_deepmd_data_sys.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
from deepmd.env import (
GLOBAL_NP_FLOAT_PRECISION,
)
from deepmd.utils import (
random,
)
from deepmd.utils.data_system import (
DeepmdDataSystem,
)
Expand Down Expand Up @@ -398,3 +401,24 @@ def test_sys_prob_floating_point_error(self):
]
ds = DeepmdDataSystem(self.sys_name, 3, 2, 2.0, sys_probs=sys_probs)
self.assertEqual(ds.sys_probs.size, len(sys_probs))

def test_get_mixed_batch(self):
    """Check that ``get_batch`` merges and pads frames for a "mixed:N" batch size."""
    ndof = self.test_ndof
    check = np.testing.assert_equal

    ds = DeepmdDataSystem(self.sys_name, "mixed:3", 2, 2.0)
    for key, required in (("test", True), ("null", False)):
        ds.add(key, ndof, atomic=True, must=required)

    # the seed pins the sampled frames: their atom counts are 3, 6 and 6
    random.seed(114514)
    merged = ds.get_batch()

    check(merged["natoms_vec"], np.array([6, 6, 6, 0, 0]))
    check(merged["real_natoms_vec"][:, 0], np.array([3, 6, 6]))
    # the first frame has only 3 atoms, so atom slots 3..5 are padding
    check(merged["type"][0, 3:6], np.array([-1, -1, -1]))
    check(merged["coord"][0, 9:18], np.zeros(9))
    padding = slice(3 * ndof, 6 * ndof)
    for key in ("test", "null"):
        check(merged[key][0, padding], np.zeros(3 * ndof))