From 2f905f733bad3b016db5141623b736ce773a2355 Mon Sep 17 00:00:00 2001
From: Parth Chadha <pchadha@nvidia.com>
Date: Sat, 29 Mar 2025 08:06:27 -0700
Subject: [PATCH 1/2] feat: add capability to set min/max eps separately as
 proposed in the DAPO paper

Signed-off-by: Parth Chadha <pchadha@nvidia.com>
---
 .gitignore                                   |  1 +
 examples/configs/grpo_math_1B.yaml           |  3 ++-
 nemo_reinforcer/algorithms/loss_functions.py | 13 ++++++++++---
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/.gitignore b/.gitignore
index 79a00631e6..0d7a81c424 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,3 +28,4 @@ docker/
 wandb/
 checkpoints/
 results/
+code_snapshots/
diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml
index 72aad000ce..62325d7a03 100644
--- a/examples/configs/grpo_math_1B.yaml
+++ b/examples/configs/grpo_math_1B.yaml
@@ -12,7 +12,8 @@ grpo:
 
 loss_fn:
   reference_policy_kl_penalty: 0.01
-  ratio_eps: 0.2
+  ratio_eps_min: 0.2
+  ratio_eps_max: 0.2
 
 checkpointing:
   enabled: true
diff --git a/nemo_reinforcer/algorithms/loss_functions.py b/nemo_reinforcer/algorithms/loss_functions.py
index 8504dac007..4894a53c20 100644
--- a/nemo_reinforcer/algorithms/loss_functions.py
+++ b/nemo_reinforcer/algorithms/loss_functions.py
@@ -25,7 +25,8 @@
 
 class ClippedPGLossConfig(TypedDict):
     reference_policy_kl_penalty: float
-    ratio_eps: float
+    ratio_eps_min: float
+    ratio_eps_max: float
 
 
 class ClippedPGLossDataDict(TypedDict):
@@ -57,6 +58,9 @@ class ClippedPGLossFn(LossFunction):
     - r_t(θ) = π_θ(a_t|s_t) / π_θ_old(a_t|s_t) is the probability ratio
     - A_t is the advantage estimate
     - ε is the clip parameter (ratio_eps)
+        - As proposed in the DAPO paper (https://arxiv.org/pdf/2503.14476), we allow setting a minimum and maximum value for the clip parameter
+            - ratio_eps_min: minimum value for the clip parameter
+            - ratio_eps_max: maximum value for the clip parameter
     - β is the KL penalty coefficient (reference_policy_kl_penalty)
     - KL(π_θ || π_ref) is the KL divergence between the current policy and reference policy (Schulman Approx.)
 
@@ -65,7 +69,8 @@ class ClippedPGLossFn(LossFunction):
     """
 
     def __init__(self, cfg: ClippedPGLossConfig):
-        self.ratio_eps = cfg["ratio_eps"]
+        self.ratio_eps_min = cfg["ratio_eps_min"]
+        self.ratio_eps_max = cfg["ratio_eps_max"]
         self.reference_policy_kl_penalty = cfg["reference_policy_kl_penalty"]
         self.disable_ppo_ratio = cfg.get("disable_ppo_ratio", False)
 
@@ -108,7 +113,9 @@ def __call__(
         # Calculate clipped loss function if ppo ratio is enabled.
         if not self.disable_ppo_ratio:
             ratios = (curr_logprobs - prev_logprobs).exp()
-            ratios_clamped = ratios.clamp(1.0 - self.ratio_eps, 1.0 + self.ratio_eps)
+            ratios_clamped = ratios.clamp(
+                1.0 - self.ratio_eps_min, 1.0 + self.ratio_eps_max
+            )
         else:
             ratios = curr_logprobs
             ratios_clamped = curr_logprobs

From 077762b1e40738228051b1631deac99fb8015d0d Mon Sep 17 00:00:00 2001
From: Sahil Jain <48468750+SahilJain314@users.noreply.github.com>
Date: Mon, 31 Mar 2025 11:46:07 -0700
Subject: [PATCH 2/2] Clarify clip parameter description in comments

Signed-off-by: Parth Chadha <pchadha@nvidia.com>
---
 nemo_reinforcer/algorithms/loss_functions.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nemo_reinforcer/algorithms/loss_functions.py b/nemo_reinforcer/algorithms/loss_functions.py
index 4894a53c20..6a5d6b593d 100644
--- a/nemo_reinforcer/algorithms/loss_functions.py
+++ b/nemo_reinforcer/algorithms/loss_functions.py
@@ -58,7 +58,8 @@ class ClippedPGLossFn(LossFunction):
     - r_t(θ) = π_θ(a_t|s_t) / π_θ_old(a_t|s_t) is the probability ratio
     - A_t is the advantage estimate
     - ε is the clip parameter (ratio_eps)
-        - As proposed in the DAPO paper (https://arxiv.org/pdf/2503.14476), we allow setting a minimum and maximum value for the clip parameter
+        - As proposed in the DAPO paper (https://arxiv.org/pdf/2503.14476),
+          we allow the lower and upper clip bounds to be set independently, i.e. ratios are clipped to [1 - ratio_eps_min, 1 + ratio_eps_max] (set both to the same value for PPO/GRPO/etc.)
             - ratio_eps_min: minimum value for the clip parameter
             - ratio_eps_max: maximum value for the clip parameter
     - β is the KL penalty coefficient (reference_policy_kl_penalty)