From 006d9aaf20a7c563c8475388bcc1ead8ee9a9739 Mon Sep 17 00:00:00 2001 From: Sahil Jain Date: Mon, 28 Apr 2025 13:08:13 -0700 Subject: [PATCH 1/4] Remove 'last 100' hack for math verifier Signed-off-by: Sahil Jain --- nemo_rl/environments/math_environment.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/nemo_rl/environments/math_environment.py b/nemo_rl/environments/math_environment.py index 8da0528652..edf79b94b1 100644 --- a/nemo_rl/environments/math_environment.py +++ b/nemo_rl/environments/math_environment.py @@ -15,7 +15,8 @@ import ray import torch -from math_verify import parse, verify +from math_verify.metric import math_metric +from math_verify.parser import ExprExtractionConfig, LatexExtractionConfig from nemo_rl.distributed.batched_data_dict import BatchedDataDict from nemo_rl.distributed.virtual_cluster import PY_EXECUTABLES @@ -53,9 +54,18 @@ def verify( results = [] for response, ground_truth in zip(pred_responses, ground_truths): try: - gold = parse(ground_truth) - pred = parse(response[-100:]) # avoid looking at the whole string - results.append(float(verify(gold, pred))) + verify_func = math_metric( + gold_extraction_target=(LatexExtractionConfig(),), + pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig()), + ) + + ground_truth_parsable = "\\boxed{" + ground_truth + "}" + try: + ret_score, _ = verify_func([ground_truth_parsable], [response]) + except Exception: + ret_score = 0.0 + + results.append(float(ret_score)) except Exception: results.append(0) return results From 124418ae4cbf8ebcb06dcf94fe2dd013dcd500c7 Mon Sep 17 00:00:00 2001 From: Sahil Jain Date: Mon, 28 Apr 2025 13:25:09 -0700 Subject: [PATCH 2/4] lint Signed-off-by: Sahil Jain --- nemo_rl/environments/math_environment.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nemo_rl/environments/math_environment.py b/nemo_rl/environments/math_environment.py index edf79b94b1..28cdb8a9fb 100644 --- a/nemo_rl/environments/math_environment.py +++ b/nemo_rl/environments/math_environment.py @@ -56,7 +56,10 @@ def verify( try: verify_func = math_metric( gold_extraction_target=(LatexExtractionConfig(),), - pred_extraction_target=(ExprExtractionConfig(), LatexExtractionConfig()), + pred_extraction_target=( + ExprExtractionConfig(), + LatexExtractionConfig(), + ), ) ground_truth_parsable = "\\boxed{" + ground_truth + "}" From 32f0227892400e421916644152010b6c8ae2912e Mon Sep 17 00:00:00 2001 From: Sahil Jain Date: Mon, 28 Apr 2025 13:28:49 -0700 Subject: [PATCH 3/4] comment Signed-off-by: Sahil Jain --- nemo_rl/environments/math_environment.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nemo_rl/environments/math_environment.py b/nemo_rl/environments/math_environment.py index 28cdb8a9fb..fd968298b0 100644 --- a/nemo_rl/environments/math_environment.py +++ b/nemo_rl/environments/math_environment.py @@ -54,6 +54,8 @@ def verify( results = [] for response, ground_truth in zip(pred_responses, ground_truths): try: + # Use Latex and plain math extraction from predictions + # https://github.com/huggingface/Math-Verify?tab=readme-ov-file#extraction-targets verify_func = math_metric( gold_extraction_target=(LatexExtractionConfig(),), pred_extraction_target=( From 7391d7ae4699e5d773c18366e6d5924a7a2d229d Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Tue, 29 Apr 2025 15:15:10 -0700 Subject: [PATCH 4/4] rename experiments Signed-off-by: Terry Kong --- ...po-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v2.yaml} | 0 ...=> grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.yaml} | 0 ...-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.v2.yaml} | 0 ... grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.v2.yaml} | 0 ...aml => grpo-qwen2.5-7b-instruct-4n8g-fsdp1.v2.yaml} | 0 ...> grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v2.yaml} | 0 ...o-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v2.yaml} | 0 ...grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v2.sh} | 0 ...h => grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.sh} | 0 ...po-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.v2.sh} | 0 ...=> grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.v2.sh} | 0 ...p1.sh => grpo-qwen2.5-7b-instruct-4n8g-fsdp1.v2.sh} | 0 ... => grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v2.sh} | 0 ...rpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v2.sh} | 0 tests/test_suites/nightly.txt | 10 +++++----- tests/test_suites/release.txt | 4 ++-- 16 files changed, 7 insertions(+), 7 deletions(-) rename examples/configs/recipes/llm/{grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.yaml => grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v2.yaml} (100%) rename examples/configs/recipes/llm/{grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.yaml => grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.yaml} (100%) rename examples/configs/recipes/llm/{grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.yaml => grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.v2.yaml} (100%) rename examples/configs/recipes/llm/{grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.yaml => grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.v2.yaml} (100%) rename examples/configs/recipes/llm/{grpo-qwen2.5-7b-instruct-4n8g-fsdp1.yaml => grpo-qwen2.5-7b-instruct-4n8g-fsdp1.v2.yaml} (100%) rename examples/configs/recipes/llm/{grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.yaml => grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v2.yaml} (100%) rename examples/configs/recipes/llm/{grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.yaml => grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v2.yaml} (100%) rename tests/test_suites/llm/{grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh => grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v2.sh} (100%) rename tests/test_suites/llm/{grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh => grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.sh} (100%) rename tests/test_suites/llm/{grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh => grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.v2.sh} (100%) rename tests/test_suites/llm/{grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh => grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.v2.sh} (100%) rename tests/test_suites/llm/{grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh => grpo-qwen2.5-7b-instruct-4n8g-fsdp1.v2.sh} (100%) rename tests/test_suites/llm/{grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh => grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v2.sh} (100%) rename tests/test_suites/llm/{grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh => grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v2.sh} (100%) diff --git a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.yaml b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v2.yaml similarity index 100% rename from examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.yaml rename to examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v2.yaml diff --git a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.yaml b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.yaml similarity index 100% rename from examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.yaml rename to examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.yaml diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.v2.yaml similarity index 100% rename from examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.yaml rename to examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.v2.yaml diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.v2.yaml similarity index 100% rename from examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.yaml rename to examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.v2.yaml diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.v2.yaml similarity index 100% rename from examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.yaml rename to examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.v2.yaml diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v2.yaml similarity index 100% rename from examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.yaml rename to examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v2.yaml diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v2.yaml similarity index 100% rename from examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.yaml rename to examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v2.yaml diff --git a/tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh b/tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v2.sh similarity index 100% rename from tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh rename to tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v2.sh diff --git a/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh b/tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.sh similarity index 100% rename from tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh rename to tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.sh diff --git a/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh b/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.v2.sh similarity index 100% rename from tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh rename to tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.v2.sh diff --git a/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh b/tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.v2.sh similarity index 100% rename from tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh rename to tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.v2.sh diff --git a/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.v2.sh similarity index 100% rename from tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh rename to tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.v2.sh diff --git a/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh b/tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v2.sh similarity index 100% rename from tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh rename to tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v2.sh diff --git a/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh b/tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v2.sh similarity index 100% rename from tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh rename to tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v2.sh diff --git a/tests/test_suites/nightly.txt b/tests/test_suites/nightly.txt index 4c609d5bff..b80a7ad545 100644 --- a/tests/test_suites/nightly.txt +++ b/tests/test_suites/nightly.txt @@ -3,15 +3,15 @@ ######## # Short 1N/1B runs (go past 200 steps - usually divergence happens by now) -- going to 4 nodes doesn't help that much -tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh -tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh +tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v2.sh +tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.sh # FSDP1 vs Dtensor (Qwen/Qwen2.5-7B-Instruct) -tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh -tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh +tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.v2.sh +tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v2.sh # Functional 32b run -tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh +tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.v2.sh ####### # SFT # diff --git a/tests/test_suites/release.txt b/tests/test_suites/release.txt index 69735cb0cb..42e9c49d00 100644 --- a/tests/test_suites/release.txt +++ b/tests/test_suites/release.txt @@ -3,10 +3,10 @@ ######## # Long 8b run -tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh +tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v2.sh # Long 32b run -tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh +tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.v2.sh ####### # SFT #