From 8b0468d09bcdb712e11b5b1de0c055184f0a9e2a Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Fri, 9 Jan 2026 18:16:43 +0800 Subject: [PATCH 1/5] add cosine lr --- deepmd/dpmodel/utils/learning_rate.py | 38 +++++++++++++++++++++++++++ deepmd/pt/train/training.py | 14 ++++++---- deepmd/pt/utils/learning_rate.py | 2 ++ deepmd/utils/argcheck.py | 22 +++++++++++++++- 4 files changed, 70 insertions(+), 6 deletions(-) diff --git a/deepmd/dpmodel/utils/learning_rate.py b/deepmd/dpmodel/utils/learning_rate.py index 10f7ec8d04..777d518a3c 100644 --- a/deepmd/dpmodel/utils/learning_rate.py +++ b/deepmd/dpmodel/utils/learning_rate.py @@ -55,3 +55,41 @@ def value(self, step: int) -> np.float64: if step_lr < self.min_lr: step_lr = self.min_lr return step_lr + + +class LearningRateCosine: + def __init__( + self, + start_lr: float, + stop_lr: float, + stop_steps: int, + **kwargs: Any, + ) -> None: + """ + Defines a cosine annealing learning rate schedule. + The learning rate starts at `start_lr` and gradually decreases to `stop_lr` + following a cosine curve over the training steps. + + Parameters + ---------- + start_lr + The initial learning rate at the beginning of training. + stop_lr + The final learning rate at the end of training. + stop_steps + The total number of training steps over which the learning rate + will be annealed from start_lr to stop_lr. + """ + self.start_lr = start_lr + self.lr_min_factor = stop_lr / start_lr + self.stop_steps = stop_steps + + def value(self, step: int) -> np.float64: + if step >= self.stop_steps: + return self.start_lr * self.lr_min_factor + return self.start_lr * ( + self.lr_min_factor + + 0.5 + * (1 - self.lr_min_factor) + * (1 + np.cos(np.pi * (step / self.stop_steps))) + ) diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index d98b23d25c..713ee59a23 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -63,6 +63,7 @@ SAMPLER_RECORD, ) from deepmd.pt.utils.learning_rate import ( + LearningRateCosine, LearningRateExp, ) from deepmd.pt.utils.stat import ( @@ -267,12 +268,15 @@ def get_sample() -> Any: return get_sample def get_lr(lr_params: dict[str, Any]) -> LearningRateExp: - assert lr_params.get("type", "exp") == "exp", ( - "Only learning rate `exp` is supported!" - ) + lr_type = lr_params.get("type", "exp") lr_params["stop_steps"] = self.num_steps - self.warmup_steps - lr_exp = LearningRateExp(**lr_params) - return lr_exp + if lr_type == "exp": + lr_schedule = LearningRateExp(**lr_params) + elif lr_type == "cosine": + lr_schedule = LearningRateCosine(**lr_params) + else: + raise ValueError(f"Not supported learning rate type '{lr_type}'!") + return lr_schedule # Optimizer if self.multi_task and training_params.get("optim_dict", None) is not None: diff --git a/deepmd/pt/utils/learning_rate.py b/deepmd/pt/utils/learning_rate.py index 3502434bc0..31ae1c3152 100644 --- a/deepmd/pt/utils/learning_rate.py +++ b/deepmd/pt/utils/learning_rate.py @@ -1,8 +1,10 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from deepmd.dpmodel.utils.learning_rate import ( + LearningRateCosine, LearningRateExp, ) __all__ = [ + "LearningRateCosine", "LearningRateExp", ] diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index 7fcc117ab5..22b71b0183 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -2509,12 +2509,32 @@ def learning_rate_exp() -> list[Argument]: return args +def learning_rate_cosine() -> list[Argument]: + """ + Defines a cosine annealing learning rate schedule. + + The learning rate starts at `start_lr` and gradually decreases to `stop_lr` + following a cosine curve over the training steps. + """ + doc_start_lr = "The learning rate at the start of the training." + doc_stop_lr = "The desired learning rate at the end of the training. " + + args = [ + Argument("start_lr", float, optional=True, default=1e-3, doc=doc_start_lr), + Argument("stop_lr", float, optional=True, default=1e-5, doc=doc_stop_lr), + ] + return args + + def learning_rate_variant_type_args() -> Variant: doc_lr = "The type of the learning rate." return Variant( "type", - [Argument("exp", dict, learning_rate_exp())], + [ + Argument("exp", dict, learning_rate_exp()), + Argument("cosine", dict, learning_rate_cosine()), + ], optional=True, default_tag="exp", doc=doc_lr, From f7ac57cf66f6024e0c952216c6eb0000c59f5f82 Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Fri, 9 Jan 2026 18:31:28 +0800 Subject: [PATCH 2/5] add ut --- deepmd/utils/argcheck.py | 2 +- source/tests/pt/test_lr.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index 22b71b0183..09814beb59 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -2533,7 +2533,7 @@ def learning_rate_variant_type_args() -> Variant: "type", [ Argument("exp", dict, learning_rate_exp()), - Argument("cosine", dict, learning_rate_cosine()), + Argument("cosine", dict, learning_rate_cosine(), doc=doc_only_pt_supported), ], optional=True, default_tag="exp", diff --git a/source/tests/pt/test_lr.py b/source/tests/pt/test_lr.py index 2d6bf156e1..75f663f041 100644 --- a/source/tests/pt/test_lr.py +++ b/source/tests/pt/test_lr.py @@ -7,6 +7,7 @@ tf.disable_eager_execution() from deepmd.pt.utils.learning_rate import ( + LearningRateCosine, LearningRateExp, ) from deepmd.tf.utils import ( @@ -102,5 +103,21 @@ def decay_rate_pt(self) -> None: ) +class TestLearningRateCosine(unittest.TestCase): + def test_basic_curve(self) -> None: + start_lr = 1.0 + stop_lr = 0.1 + stop_steps = 10 + lr = LearningRateCosine(start_lr, stop_lr, stop_steps) + + self.assertTrue(np.allclose(lr.value(0), start_lr)) + self.assertTrue(np.allclose(lr.value(stop_steps), stop_lr)) + self.assertTrue(np.allclose(lr.value(stop_steps + 5), stop_lr)) + + mid_step = stop_steps // 2 + expected_mid = stop_lr + (start_lr - stop_lr) * 0.5 + self.assertTrue(np.allclose(lr.value(mid_step), expected_mid)) + + if __name__ == "__main__": unittest.main() From b0df231841c1fd4086f5db6e9598b028509c1f43 Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Fri, 9 Jan 2026 18:49:22 +0800 Subject: [PATCH 3/5] Update learning_rate.py --- deepmd/dpmodel/utils/learning_rate.py | 41 +++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/deepmd/dpmodel/utils/learning_rate.py b/deepmd/dpmodel/utils/learning_rate.py index 777d518a3c..971dd3391f 100644 --- a/deepmd/dpmodel/utils/learning_rate.py +++ b/deepmd/dpmodel/utils/learning_rate.py @@ -1,4 +1,8 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from abc import ( + ABC, + abstractmethod, +) from typing import ( Any, ) @@ -6,7 +10,33 @@ import numpy as np -class LearningRateExp: +class LearningRateSchedule(ABC): + def __init__( + self, start_lr: float, stop_lr: float, stop_steps: int, **kwargs: Any + ) -> None: + """ + Base class for learning rate schedules. + + Parameters + ---------- + start_lr + The initial learning rate. + stop_lr + The final learning rate. + stop_steps + The total training steps for learning rate scheduler. + """ + self.start_lr = start_lr + self.stop_lr = stop_lr + self.stop_steps = stop_steps + + @abstractmethod + def value(self, step: int) -> np.float64: + """Get the learning rate at the given step.""" + pass + + +class LearningRateExp(LearningRateSchedule): def __init__( self, start_lr: float, @@ -37,7 +67,7 @@ def __init__( If provided, the decay rate will be set instead of calculating it through interpolation between start_lr and stop_lr. """ - self.start_lr = start_lr + super().__init__(start_lr, stop_lr, stop_steps, **kwargs) default_ds = 100 if stop_steps // 10 > 100 else stop_steps // 100 + 1 self.decay_steps = decay_steps if self.decay_steps >= stop_steps: @@ -47,7 +77,7 @@ def __init__( ) if decay_rate is not None: self.decay_rate = decay_rate - self.min_lr = stop_lr + self.min_lr = self.stop_lr def value(self, step: int) -> np.float64: """Get the learning rate at the given step.""" @@ -57,7 +87,7 @@ def value(self, step: int) -> np.float64: return step_lr -class LearningRateCosine: +class LearningRateCosine(LearningRateSchedule): def __init__( self, start_lr: float, @@ -80,9 +110,8 @@ def __init__( The total number of training steps over which the learning rate will be annealed from start_lr to stop_lr. """ - self.start_lr = start_lr + super().__init__(start_lr, stop_lr, stop_steps, **kwargs) self.lr_min_factor = stop_lr / start_lr - self.stop_steps = stop_steps def value(self, step: int) -> np.float64: if step >= self.stop_steps: From 55d482725e809f2453032204789949be4fa31d56 Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Fri, 9 Jan 2026 19:01:28 +0800 Subject: [PATCH 4/5] Update argcheck.py --- deepmd/utils/argcheck.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index 09814beb59..1809b19083 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -2477,6 +2477,10 @@ def linear_ener_model_args() -> Argument: # --- Learning rate configurations: --- # +lr_args_plugin = ArgsPlugin() + + +@lr_args_plugin.register("exp") def learning_rate_exp() -> list[Argument]: doc_start_lr = "The learning rate at the start of the training." doc_stop_lr = ( @@ -2509,6 +2513,7 @@ def learning_rate_exp() -> list[Argument]: return args +@lr_args_plugin.register("cosine", doc=doc_only_pt_supported) def learning_rate_cosine() -> list[Argument]: """ Defines a cosine annealing learning rate schedule. @@ -2531,10 +2536,7 @@ def learning_rate_variant_type_args() -> Variant: return Variant( "type", - [ - Argument("exp", dict, learning_rate_exp()), - Argument("cosine", dict, learning_rate_cosine(), doc=doc_only_pt_supported), - ], + lr_args_plugin.get_all_argument(), optional=True, default_tag="exp", doc=doc_lr, From d0f3d710e888410f6e976fa27114a1a3c6ace5ea Mon Sep 17 00:00:00 2001 From: Duo <50307526+iProzd@users.noreply.github.com> Date: Fri, 9 Jan 2026 19:22:44 +0800 Subject: [PATCH 5/5] make lr plugin --- deepmd/dpmodel/utils/learning_rate.py | 21 ++++++++++++++++++--- deepmd/pt/train/training.py | 13 +++---------- deepmd/pt/utils/learning_rate.py | 2 ++ 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/deepmd/dpmodel/utils/learning_rate.py b/deepmd/dpmodel/utils/learning_rate.py index 971dd3391f..f82a42660b 100644 --- a/deepmd/dpmodel/utils/learning_rate.py +++ b/deepmd/dpmodel/utils/learning_rate.py @@ -9,8 +9,21 @@ import numpy as np +from deepmd.common import ( + j_get_type, +) +from deepmd.utils.plugin import ( + PluginVariant, + make_plugin_registry, +) + + +class BaseLR(ABC, PluginVariant, make_plugin_registry("lr")): + def __new__(cls: type, *args: Any, **kwargs: Any) -> Any: + if cls is BaseLR: + cls = cls.get_class_by_type(j_get_type(kwargs, cls.__name__)) + return super().__new__(cls) -class LearningRateSchedule(ABC): def __init__( self, start_lr: float, stop_lr: float, stop_steps: int, **kwargs: Any ) -> None: @@ -36,7 +49,8 @@ def value(self, step: int) -> np.float64: pass -class LearningRateExp(LearningRateSchedule): +@BaseLR.register("exp") +class LearningRateExp(BaseLR): def __init__( self, start_lr: float, @@ -87,7 +101,8 @@ def value(self, step: int) -> np.float64: return step_lr -class LearningRateCosine(LearningRateSchedule): +@BaseLR.register("cosine") +class LearningRateCosine(BaseLR): def __init__( self, start_lr: float, diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index 713ee59a23..7d768cf66b 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -63,8 +63,7 @@ SAMPLER_RECORD, ) from deepmd.pt.utils.learning_rate import ( - LearningRateCosine, - LearningRateExp, + BaseLR, ) from deepmd.pt.utils.stat import ( make_stat_input, @@ -267,15 +266,9 @@ def get_sample() -> Any: _stat_file_path.root.close() return get_sample - def get_lr(lr_params: dict[str, Any]) -> LearningRateExp: - lr_type = lr_params.get("type", "exp") + def get_lr(lr_params: dict[str, Any]) -> BaseLR: lr_params["stop_steps"] = self.num_steps - self.warmup_steps - if lr_type == "exp": - lr_schedule = LearningRateExp(**lr_params) - elif lr_type == "cosine": - lr_schedule = LearningRateCosine(**lr_params) - else: - raise ValueError(f"Not supported learning rate type '{lr_type}'!") + lr_schedule = BaseLR(**lr_params) return lr_schedule # Optimizer diff --git a/deepmd/pt/utils/learning_rate.py b/deepmd/pt/utils/learning_rate.py index 31ae1c3152..ff7d4f7ec7 100644 --- a/deepmd/pt/utils/learning_rate.py +++ b/deepmd/pt/utils/learning_rate.py @@ -1,10 +1,12 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from deepmd.dpmodel.utils.learning_rate import ( + BaseLR, LearningRateCosine, LearningRateExp, ) __all__ = [ + "BaseLR", "LearningRateCosine", "LearningRateExp", ]