From 9eef557308a47d98a8ec85ce93d554d9bfbf9b39 Mon Sep 17 00:00:00 2001 From: Yuki Huang Date: Thu, 2 Apr 2026 06:22:26 -0700 Subject: [PATCH 1/6] revert logprob_batch_size to keep same perf as before Signed-off-by: Yuki Huang --- .../configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml | 1 + .../recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.yaml | 1 + tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh | 2 ++ .../test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh | 3 +++ 4 files changed, 7 insertions(+) diff --git a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml index 6e0aa5cd81..8b4118a7d7 100755 --- a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml @@ -8,6 +8,7 @@ checkpointing: policy: model_name: Qwen/Qwen3-30B-A3B train_micro_batch_size: 1 + logprob_batch_size: 4 max_total_sequence_length: 4096 dtensor_cfg: enabled: false diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.yaml index 5d1c236584..3b4f22ffbd 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.yaml @@ -8,6 +8,7 @@ checkpointing: policy: model_name: Qwen/Qwen3-30B-A3B train_micro_batch_size: 1 + logprob_batch_size: 4 max_total_sequence_length: 40960 dtensor_cfg: enabled: false diff --git a/tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh b/tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh index ad369c4395..45d321a47b 100755 --- a/tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh +++ b/tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh @@ -1,6 +1,8 @@ #!/bin/bash SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) source $SCRIPT_DIR/common.env +# ignore tensor parallel accuracy check +export NRL_IGNORE_TP_ACCURACY_CHECK=1 # ===== BEGIN CONFIG ===== NUM_NODES=8 diff --git a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh index 02dea5a4a5..f312eee392 100755 --- a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh +++ b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh @@ -1,6 +1,8 @@ #!/bin/bash SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) source $SCRIPT_DIR/common.env +# ignore tensor parallel accuracy check to avoid FP8 precision issue +export NRL_IGNORE_TP_ACCURACY_CHECK=1 # ===== BEGIN CONFIG ===== NUM_NODES=4 @@ -24,6 +26,7 @@ uv run examples/run_grpo.py \ logger.monitor_gpus=True \ logger.tensorboard_enabled=True \ checkpointing.enabled=True \ + checkpointing.save_period=5 \ checkpointing.checkpoint_dir=$CKPT_DIR \ $@ \ 2>&1 | tee $RUN_LOG From 6d876a2ff8b8183d0ffe886b0073e1fa1adf83dd Mon Sep 17 00:00:00 2001 From: Yuki Huang Date: Thu, 2 Apr 2026 06:36:35 -0700 Subject: [PATCH 2/6] skip NRL_IGNORE_TP_ACCURACY_CHECK=1 in test Signed-off-by: Yuki Huang --- tests/unit/test_config_validation.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/unit/test_config_validation.py b/tests/unit/test_config_validation.py index 79d9535371..df16c7f13d 100644 --- a/tests/unit/test_config_validation.py +++ b/tests/unit/test_config_validation.py @@ -144,6 +144,15 @@ def test_all_config_no_tp_size_accuracy_issues(config_file): Related document: https://docs.nvidia.com/nemo/rl/latest/guides/dtensor-tp-accuracy.html#root-cause. """ + skip_config_files = [ + "grpo-qwen3-30ba3b-4n8g-40K.yaml", + "grpo-qwen3-30ba3b-8n8g-megatron.yaml", + ] + if os.path.basename(config_file) in skip_config_files: + pytest.skip( + f"Skipping config file {config_file} because it sets NRL_IGNORE_TP_ACCURACY_CHECK=1" + ) + print(f"\nValidating config file: {config_file}") # Load the config file with inheritance From 0010c2c5a76bcef1463a0264c69f9ba0c9f7ec85 Mon Sep 17 00:00:00 2001 From: Yuki Huang Date: Thu, 2 Apr 2026 06:44:09 -0700 Subject: [PATCH 3/6] fix comment Signed-off-by: Yuki Huang --- tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh index f312eee392..7fc06bbf52 100755 --- a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh +++ b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh @@ -1,7 +1,7 @@ #!/bin/bash SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) source $SCRIPT_DIR/common.env -# ignore tensor parallel accuracy check to avoid FP8 precision issue +# ignore tensor parallel accuracy check export NRL_IGNORE_TP_ACCURACY_CHECK=1 # ===== BEGIN CONFIG ===== From 945bce8ed442668d0b9b68a1d7fb5e3a9edb1e79 Mon Sep 17 00:00:00 2001 From: Yuki Huang Date: Thu, 2 Apr 2026 06:45:56 -0700 Subject: [PATCH 4/6] fix STEPS_PER_RUN Signed-off-by: Yuki Huang --- tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh index 7fc06bbf52..9653640dc7 100755 --- a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh +++ b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh @@ -6,7 +6,7 @@ export NRL_IGNORE_TP_ACCURACY_CHECK=1 # ===== BEGIN CONFIG ===== NUM_NODES=4 -STEPS_PER_RUN=10 +STEPS_PER_RUN=5 MAX_STEPS=10 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=100 From 98252a89f057d736fb8244aae0a817701b84814c Mon Sep 17 00:00:00 2001 From: Yuki Huang Date: Fri, 3 Apr 2026 00:43:24 -0700 Subject: [PATCH 5/6] revert grpo-qwen3-32b-4n8g.yaml Signed-off-by: Yuki Huang --- .../configs/recipes/llm/performance/grpo-qwen3-32b-4n8g.yaml | 1 + tests/test_suites/llm/performance/grpo-qwen3-32b-4n8g.sh | 2 ++ tests/unit/test_config_validation.py | 1 + 3 files changed, 4 insertions(+) diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n8g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n8g.yaml index ad780ebc50..1df1bc851c 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n8g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n8g.yaml @@ -8,6 +8,7 @@ checkpointing: policy: model_name: Qwen/Qwen3-32B train_micro_batch_size: 1 + logprob_batch_size: 4 max_total_sequence_length: 4096 dtensor_cfg: enabled: false diff --git a/tests/test_suites/llm/performance/grpo-qwen3-32b-4n8g.sh b/tests/test_suites/llm/performance/grpo-qwen3-32b-4n8g.sh index 2f14541b6f..b537d8d9eb 100755 --- a/tests/test_suites/llm/performance/grpo-qwen3-32b-4n8g.sh +++ b/tests/test_suites/llm/performance/grpo-qwen3-32b-4n8g.sh @@ -1,6 +1,8 @@ #!/bin/bash SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) source $SCRIPT_DIR/common.env +# ignore tensor parallel accuracy check +export NRL_IGNORE_TP_ACCURACY_CHECK=1 # ===== BEGIN CONFIG ===== NUM_NODES=4 diff --git a/tests/unit/test_config_validation.py b/tests/unit/test_config_validation.py index df16c7f13d..fd173b1f52 100644 --- a/tests/unit/test_config_validation.py +++ b/tests/unit/test_config_validation.py @@ -147,6 +147,7 @@ def test_all_config_no_tp_size_accuracy_issues(config_file): skip_config_files = [ "grpo-qwen3-30ba3b-4n8g-40K.yaml", "grpo-qwen3-30ba3b-8n8g-megatron.yaml", + "grpo-qwen3-32b-4n8g.yaml", ] if os.path.basename(config_file) in skip_config_files: pytest.skip( From 69bfd70b00cc5b3c057de27c128a5be2a19a4080 Mon Sep 17 00:00:00 2001 From: Yuki Huang Date: Fri, 3 Apr 2026 01:22:46 -0700 Subject: [PATCH 6/6] skip grpo-qwen3-32b-8n8g-async-1off.yaml Signed-off-by: Yuki Huang --- .../llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh | 2 ++ tests/unit/test_config_validation.py | 1 + 2 files changed, 3 insertions(+) diff --git a/tests/test_suites/llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh index 44dbe4b337..da31ff7e3d 100755 --- a/tests/test_suites/llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh +++ b/tests/test_suites/llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh @@ -1,6 +1,8 @@ #!/bin/bash SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) source $SCRIPT_DIR/common.env +# ignore tensor parallel accuracy check +export NRL_IGNORE_TP_ACCURACY_CHECK=1 # ===== BEGIN CONFIG ===== NUM_NODES=8 diff --git a/tests/unit/test_config_validation.py b/tests/unit/test_config_validation.py index fd173b1f52..06f750f25c 100644 --- a/tests/unit/test_config_validation.py +++ b/tests/unit/test_config_validation.py @@ -148,6 +148,7 @@ def test_all_config_no_tp_size_accuracy_issues(config_file): "grpo-qwen3-30ba3b-4n8g-40K.yaml", "grpo-qwen3-30ba3b-8n8g-megatron.yaml", "grpo-qwen3-32b-4n8g.yaml", + "grpo-qwen3-32b-8n8g-async-1off.yaml", ] if os.path.basename(config_file) in skip_config_files: pytest.skip(