diff --git a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml index 6e0aa5cd81..8b4118a7d7 100755 --- a/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml +++ b/examples/configs/recipes/llm/grpo-qwen3-30ba3b-8n8g-megatron.yaml @@ -8,6 +8,7 @@ checkpointing: policy: model_name: Qwen/Qwen3-30B-A3B train_micro_batch_size: 1 + logprob_batch_size: 4 max_total_sequence_length: 4096 dtensor_cfg: enabled: false diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.yaml index 5d1c236584..3b4f22ffbd 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.yaml @@ -8,6 +8,7 @@ checkpointing: policy: model_name: Qwen/Qwen3-30B-A3B train_micro_batch_size: 1 + logprob_batch_size: 4 max_total_sequence_length: 40960 dtensor_cfg: enabled: false diff --git a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n8g.yaml b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n8g.yaml index ad780ebc50..1df1bc851c 100644 --- a/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n8g.yaml +++ b/examples/configs/recipes/llm/performance/grpo-qwen3-32b-4n8g.yaml @@ -8,6 +8,7 @@ checkpointing: policy: model_name: Qwen/Qwen3-32B train_micro_batch_size: 1 + logprob_batch_size: 4 max_total_sequence_length: 4096 dtensor_cfg: enabled: false diff --git a/tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh b/tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh index ad369c4395..45d321a47b 100755 --- a/tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh +++ b/tests/test_suites/llm/grpo-qwen3-30ba3b-8n8g-megatron.sh @@ -1,6 +1,8 @@ #!/bin/bash SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) source $SCRIPT_DIR/common.env +# ignore tensor parallel accuracy check +export NRL_IGNORE_TP_ACCURACY_CHECK=1 # ===== BEGIN CONFIG ===== NUM_NODES=8 diff --git a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh index 02dea5a4a5..9653640dc7 100755 --- a/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh +++ b/tests/test_suites/llm/performance/grpo-qwen3-30ba3b-4n8g-40K.sh @@ -1,10 +1,12 @@ #!/bin/bash SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) source $SCRIPT_DIR/common.env +# ignore tensor parallel accuracy check +export NRL_IGNORE_TP_ACCURACY_CHECK=1 # ===== BEGIN CONFIG ===== NUM_NODES=4 -STEPS_PER_RUN=10 +STEPS_PER_RUN=5 MAX_STEPS=10 NUM_RUNS=$(( (MAX_STEPS + STEPS_PER_RUN - 1) / STEPS_PER_RUN )) # Round up NUM_MINUTES=100 @@ -24,6 +26,7 @@ uv run examples/run_grpo.py \ logger.monitor_gpus=True \ logger.tensorboard_enabled=True \ checkpointing.enabled=True \ + checkpointing.save_period=5 \ checkpointing.checkpoint_dir=$CKPT_DIR \ $@ \ 2>&1 | tee $RUN_LOG diff --git a/tests/test_suites/llm/performance/grpo-qwen3-32b-4n8g.sh b/tests/test_suites/llm/performance/grpo-qwen3-32b-4n8g.sh index 2f14541b6f..b537d8d9eb 100755 --- a/tests/test_suites/llm/performance/grpo-qwen3-32b-4n8g.sh +++ b/tests/test_suites/llm/performance/grpo-qwen3-32b-4n8g.sh @@ -1,6 +1,8 @@ #!/bin/bash SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) source $SCRIPT_DIR/common.env +# ignore tensor parallel accuracy check +export NRL_IGNORE_TP_ACCURACY_CHECK=1 # ===== BEGIN CONFIG ===== NUM_NODES=4 diff --git a/tests/test_suites/llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh b/tests/test_suites/llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh index 44dbe4b337..da31ff7e3d 100755 --- a/tests/test_suites/llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh +++ b/tests/test_suites/llm/performance/grpo-qwen3-32b-8n8g-async-1off.sh @@ -1,6 +1,8 @@ #!/bin/bash SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) source $SCRIPT_DIR/common.env +# ignore tensor parallel accuracy check +export NRL_IGNORE_TP_ACCURACY_CHECK=1 # ===== BEGIN CONFIG ===== NUM_NODES=8 diff --git a/tests/unit/test_config_validation.py b/tests/unit/test_config_validation.py index 79d9535371..06f750f25c 100644 --- a/tests/unit/test_config_validation.py +++ b/tests/unit/test_config_validation.py @@ -144,6 +144,17 @@ def test_all_config_no_tp_size_accuracy_issues(config_file): Related document: https://docs.nvidia.com/nemo/rl/latest/guides/dtensor-tp-accuracy.html#root-cause. """ + skip_config_files = [ + "grpo-qwen3-30ba3b-4n8g-40K.yaml", + "grpo-qwen3-30ba3b-8n8g-megatron.yaml", + "grpo-qwen3-32b-4n8g.yaml", + "grpo-qwen3-32b-8n8g-async-1off.yaml", + ] + if os.path.basename(config_file) in skip_config_files: + pytest.skip( + f"Skipping config file {config_file} because it sets NRL_IGNORE_TP_ACCURACY_CHECK=1" + ) + print(f"\nValidating config file: {config_file}") # Load the config file with inheritance