NVIDIA-NeMo · terrykong · Jul 17, 2025 · Jul 17, 2025
@@ -901,7 +901,9 @@ def train(
 
         if not eval_mode:
             # take one LR step every rollout batch
-            self.scheduler.step(increment=1)
+            # we need to scale the step by gbs to counteract the fact that NeMo automatically
+            # scales lr_warmup_steps by gbs during init
+            self.scheduler.step(increment=gbs)
 
         # Aggregate metrics across all microbatches
         mb_metrics = defaultdict(list)