From 9eef557308a47d98a8ec85ce93d554d9bfbf9b39 Mon Sep 17 00:00:00 2001
From: Yuki Huang <yukih@nvidia.com>
Date: Thu, 2 Apr 2026 06:22:26 -0700
Subject: [PATCH 1/6] revert logprob_batch_size to keep same perf as before

Signed-off-by: Yuki Huang <yukih@nvidia.com>
---
 .../configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml   | 1 +
 .../recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.yaml    | 1 +
 tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh       | 2 ++
 .../test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh  | 3 +++
 4 files changed, 7 insertions(+)

diff --git a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml
index 6e0aa5cd81..8b4118a7d7 100755
--- a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml
+++ b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml
@@ -8,6 +8,7 @@ checkpointing:
 policy:
   model_name: Qwen/Qwen3-30B-A3B
   train_micro_batch_size: 1
+  logprob_batch_size: 4
   max_total_sequence_length: 4096
   dtensor_cfg:
     enabled: false
diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.yaml
index 5d1c236584..3b4f22ffbd 100644
--- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.yaml
@@ -8,6 +8,7 @@ checkpointing:
 policy:
   model_name: Qwen/Qwen3-30B-A3B
   train_micro_batch_size: 1
+  logprob_batch_size: 4
   max_total_sequence_length: 40960
   dtensor_cfg:
     enabled: false
diff --git a/tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh b/tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh
index ad369c4395..45d321a47b 100755
--- a/tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh
+++ b/tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh
@@ -1,6 +1,8 @@
 #!/bin/bash
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
 source $SCRIPT_DIR/common.env
+# ignore tensor parallel accuracy check
+export NRL_IGNORE_TP_ACCURACY_CHECK=1
 
 # ===== BEGIN CONFIG =====
 NUM_NODES=8
diff --git a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh
index 02dea5a4a5..f312eee392 100755
--- a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh
+++ b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh
@@ -1,6 +1,8 @@
 #!/bin/bash
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
 source $SCRIPT_DIR/common.env
+# ignore tensor parallel accuracy check to avoid FP8 precision issue
+export NRL_IGNORE_TP_ACCURACY_CHECK=1
 
 # ===== BEGIN CONFIG =====
 NUM_NODES=4
@@ -24,6 +26,7 @@ uv run examples/run_grpo.py \
     logger.monitor_gpus=True \
     logger.tensorboard_enabled=True \
     checkpointing.enabled=True \
+    checkpointing.save_period=5 \
     checkpointing.checkpoint_dir=$CKPT_DIR \
     $@ \
     2>&1 | tee $RUN_LOG

From 6d876a2ff8b8183d0ffe886b0073e1fa1adf83dd Mon Sep 17 00:00:00 2001
From: Yuki Huang <yukih@nvidia.com>
Date: Thu, 2 Apr 2026 06:36:35 -0700
Subject: [PATCH 2/6] skip configs run with NRL_IGNORE_TP_ACCURACY_CHECK=1 in TP accuracy unit test

Signed-off-by: Yuki Huang <yukih@nvidia.com>
---
 tests/unit/test_config_validation.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tests/unit/test_config_validation.py b/tests/unit/test_config_validation.py
index 79d9535371..df16c7f13d 100644
--- a/tests/unit/test_config_validation.py
+++ b/tests/unit/test_config_validation.py
@@ -144,6 +144,15 @@ def test_all_config_no_tp_size_accuracy_issues(config_file):
     Related document: https://docs.nvidia.com/nemo/rl/latest/guides/dtensor-tp-accuracy.html#root-cause.
     """
 
+    skip_config_files = [
+        "grpo-qwen3-30ba3b-4n8g-40K.yaml",
+        "grpo-qwen3-30ba3b-8n8g-megatron.yaml",
+    ]
+    if os.path.basename(config_file) in skip_config_files:
+        pytest.skip(
+            f"Skipping config file {config_file} because it sets NRL_IGNORE_TP_ACCURACY_CHECK=1"
+        )
+
     print(f"\nValidating config file: {config_file}")
 
     # Load the config file with inheritance

From 0010c2c5a76bcef1463a0264c69f9ba0c9f7ec85 Mon Sep 17 00:00:00 2001
From: Yuki Huang <yukih@nvidia.com>
Date: Thu, 2 Apr 2026 06:44:09 -0700
Subject: [PATCH 3/6] fix comment

Signed-off-by: Yuki Huang <yukih@nvidia.com>
---
 tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh
index f312eee392..7fc06bbf52 100755
--- a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh
+++ b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
 source $SCRIPT_DIR/common.env
-# ignore tensor parallel accuracy check to avoid FP8 precision issue
+# ignore tensor parallel accuracy check
 export NRL_IGNORE_TP_ACCURACY_CHECK=1
 
 # ===== BEGIN CONFIG =====

From 945bce8ed442668d0b9b68a1d7fb5e3a9edb1e79 Mon Sep 17 00:00:00 2001
From: Yuki Huang <yukih@nvidia.com>
Date: Thu, 2 Apr 2026 06:45:56 -0700
Subject: [PATCH 4/6] fix STEPS_PER_RUN

Signed-off-by: Yuki Huang <yukih@nvidia.com>
---
 tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh
index 7fc06bbf52..9653640dc7 100755
--- a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh
+++ b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh
@@ -6,7 +6,7 @@ export NRL_IGNORE_TP_ACCURACY_CHECK=1
 
 # ===== BEGIN CONFIG =====
 NUM_NODES=4
-STEPS_PER_RUN=10
+STEPS_PER_RUN=5
 MAX_STEPS=10
 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN ))  # Round up
 NUM_MINUTES=100

From 98252a89f057d736fb8244aae0a817701b84814c Mon Sep 17 00:00:00 2001
From: Yuki Huang <yukih@nvidia.com>
Date: Fri, 3 Apr 2026 00:43:24 -0700
Subject: [PATCH 5/6] revert grpo-qwen3-32b-4n8g.yaml

Signed-off-by: Yuki Huang <yukih@nvidia.com>
---
 .../configs/recipes/llm/performance/grpo-qwen3-32b-4n8g.yaml    | 1 +
 tests/test_suites/llm/performance/grpo-qwen3-32b-4n8g.sh        | 2 ++
 tests/unit/test_config_validation.py                            | 1 +
 3 files changed, 4 insertions(+)

diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n8g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n8g.yaml
index ad780ebc50..1df1bc851c 100644
--- a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n8g.yaml
+++ b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n8g.yaml
@@ -8,6 +8,7 @@ checkpointing:
 policy:
   model_name: Qwen/Qwen3-32B
   train_micro_batch_size: 1
+  logprob_batch_size: 4
   max_total_sequence_length: 4096
   dtensor_cfg:
     enabled: false
diff --git a/tests/test_suites/llm/performance/grpo-qwen3-32b-4n8g.sh b/tests/test_suites/llm/performance/grpo-qwen3-32b-4n8g.sh
index 2f14541b6f..b537d8d9eb 100755
--- a/tests/test_suites/llm/performance/grpo-qwen3-32b-4n8g.sh
+++ b/tests/test_suites/llm/performance/grpo-qwen3-32b-4n8g.sh
@@ -1,6 +1,8 @@
 #!/bin/bash
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
 source $SCRIPT_DIR/common.env
+# ignore tensor parallel accuracy check
+export NRL_IGNORE_TP_ACCURACY_CHECK=1
 
 # ===== BEGIN CONFIG =====
 NUM_NODES=4
diff --git a/tests/unit/test_config_validation.py b/tests/unit/test_config_validation.py
index df16c7f13d..fd173b1f52 100644
--- a/tests/unit/test_config_validation.py
+++ b/tests/unit/test_config_validation.py
@@ -147,6 +147,7 @@ def test_all_config_no_tp_size_accuracy_issues(config_file):
     skip_config_files = [
         "grpo-qwen3-30ba3b-4n8g-40K.yaml",
         "grpo-qwen3-30ba3b-8n8g-megatron.yaml",
+        "grpo-qwen3-32b-4n8g.yaml",
     ]
     if os.path.basename(config_file) in skip_config_files:
         pytest.skip(

From 69bfd70b00cc5b3c057de27c128a5be2a19a4080 Mon Sep 17 00:00:00 2001
From: Yuki Huang <yukih@nvidia.com>
Date: Fri, 3 Apr 2026 01:22:46 -0700
Subject: [PATCH 6/6] skip grpo-qwen3-32b-8n8g-async-1off.yaml

Signed-off-by: Yuki Huang <yukih@nvidia.com>
---
 .../llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh           | 2 ++
 tests/unit/test_config_validation.py                            | 1 +
 2 files changed, 3 insertions(+)

diff --git a/tests/test_suites/llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh
index 44dbe4b337..da31ff7e3d 100755
--- a/tests/test_suites/llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh
+++ b/tests/test_suites/llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh
@@ -1,6 +1,8 @@
 #!/bin/bash
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)
 source $SCRIPT_DIR/common.env
+# ignore tensor parallel accuracy check
+export NRL_IGNORE_TP_ACCURACY_CHECK=1
 
 # ===== BEGIN CONFIG =====
 NUM_NODES=8
diff --git a/tests/unit/test_config_validation.py b/tests/unit/test_config_validation.py
index fd173b1f52..06f750f25c 100644
--- a/tests/unit/test_config_validation.py
+++ b/tests/unit/test_config_validation.py
@@ -148,6 +148,7 @@ def test_all_config_no_tp_size_accuracy_issues(config_file):
         "grpo-qwen3-30ba3b-4n8g-40K.yaml",
         "grpo-qwen3-30ba3b-8n8g-megatron.yaml",
         "grpo-qwen3-32b-4n8g.yaml",
+        "grpo-qwen3-32b-8n8g-async-1off.yaml",
     ]
     if os.path.basename(config_file) in skip_config_files:
         pytest.skip(