From 405acc22bf36087001c05ffb508cb1b89208dedf Mon Sep 17 00:00:00 2001 From: Xuehan Xiong Date: Tue, 24 Jun 2025 06:15:18 +0000 Subject: [PATCH 01/44] Adds multiple choice eval datasets. Signed-off-by: Xuehan Xiong Signed-off-by: Xuehan --- nemo_rl/data/eval_datasets/__init__.py | 0 nemo_rl/data/eval_datasets/gpqa.py | 44 ++++++++++++++++++++++++++ nemo_rl/data/eval_datasets/math.py | 29 +++++++++++++++++ nemo_rl/data/eval_datasets/mmlu.py | 33 +++++++++++++++++++ nemo_rl/data/eval_datasets/mmlu_pro.py | 31 ++++++++++++++++++ 5 files changed, 137 insertions(+) create mode 100644 nemo_rl/data/eval_datasets/__init__.py create mode 100644 nemo_rl/data/eval_datasets/gpqa.py create mode 100644 nemo_rl/data/eval_datasets/math.py create mode 100644 nemo_rl/data/eval_datasets/mmlu.py create mode 100644 nemo_rl/data/eval_datasets/mmlu_pro.py diff --git a/nemo_rl/data/eval_datasets/__init__.py b/nemo_rl/data/eval_datasets/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/nemo_rl/data/eval_datasets/gpqa.py b/nemo_rl/data/eval_datasets/gpqa.py new file mode 100644 index 0000000000..1287e446a0 --- /dev/null +++ b/nemo_rl/data/eval_datasets/gpqa.py @@ -0,0 +1,44 @@ +"""GPQA dataset and its variants.""" + +import random +from typing import Any, Optional + +from datasets import load_dataset + +from nemo_rl.data.interfaces import TaskDataSpec + + + +class GPQADataset: + def __init__(self, variant: str = "diamond", prompt_file: Optional[str]=None, system_prompt_file: Optional[str]=None): + ds = load_dataset("csv", data_files=f"https://openaipublic.blob.core.windows.net/simple-evals/gpqa_{variant}.csv", split="train") + self._rng = random.Random() + self.rekeyed_ds = ds.map(self._rekey, remove_columns=ds.column_names) + self.task_spec = TaskDataSpec( + task_name=f"GPQA_{variant}", + prompt_file=prompt_file, + system_prompt_file=system_prompt_file, + ) + + def _rekey(self, data: dict[str, Any]): + choices = [ + data["Correct Answer"], + data["Incorrect Answer 
1"], + data["Incorrect Answer 2"], + data["Incorrect Answer 3"], + ] + permutation = self._rng.sample(range(4), 4) + choices = [choices[i] for i in permutation] + correct_index = choices.index(data["Correct Answer"]) + correct_answer = "ABCD"[correct_index] + return { + "question": data["Question"], + "options": dict( + A=choices[0], + B=choices[1], + C=choices[2], + D=choices[3], + ), + "answer": correct_answer, + } + diff --git a/nemo_rl/data/eval_datasets/math.py b/nemo_rl/data/eval_datasets/math.py new file mode 100644 index 0000000000..cbd7cb3577 --- /dev/null +++ b/nemo_rl/data/eval_datasets/math.py @@ -0,0 +1,29 @@ +"""Math dataset and its variants.""" + +from typing import Any, Literal, Optional + +from datasets import load_dataset + +from nemo_rl.data.interfaces import TaskDataSpec + + +class MathDataset: + def __init__(self, + variant: Literal["math_test", "math_500_test"] = "math_test", + prompt_file: Optional[str]=None, + system_prompt_file: Optional[str]=None, + ): + ds = load_dataset('csv', data_files=f"https://openaipublic.blob.core.windows.net/simple-evals/{variant}.csv", split='train') + self.rekeyed_ds = ds.map(self._rekey, remove_columns=ds.column_names) + self.task_spec = TaskDataSpec( + task_name=f'{variant}', + prompt_file=prompt_file, + system_prompt_file=system_prompt_file, + ) + + def _rekey(self, data: dict[str, Any]): + return { + 'problem': data['Question'], + 'expected_answer': data['Answer'], + } + diff --git a/nemo_rl/data/eval_datasets/mmlu.py b/nemo_rl/data/eval_datasets/mmlu.py new file mode 100644 index 0000000000..f0f126850a --- /dev/null +++ b/nemo_rl/data/eval_datasets/mmlu.py @@ -0,0 +1,33 @@ +"""MMLU dataset and its variants.""" + +from typing import Any, Optional + +from datasets import load_dataset + +from nemo_rl.data.interfaces import TaskDataSpec + + +class MMLUDataset: + def __init__(self, prompt_file: str, system_prompt_file: Optional[str] = None): + ds = load_dataset('csv', 
data_files=f"https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", split='train') + self.rekeyed_ds = ds.map(self._rekey, remove_columns=ds.column_names) + + self.task_spec = TaskDataSpec( + task_name='MMLU', + prompt_file=prompt_file, + system_prompt_file=system_prompt_file, + ) + + def _rekey(self, data: dict[str, Any]): + return { + 'question': data['Question'], + 'options': dict( + A=data['A'], + B=data['B'], + C=data['C'], + D=data['D'], + ), + 'answer': data['Answer'], + 'subject': data['Subject'], + } + diff --git a/nemo_rl/data/eval_datasets/mmlu_pro.py b/nemo_rl/data/eval_datasets/mmlu_pro.py new file mode 100644 index 0000000000..da990a90c5 --- /dev/null +++ b/nemo_rl/data/eval_datasets/mmlu_pro.py @@ -0,0 +1,31 @@ +"""MMLU-Pro dataset.""" + +from typing import Any, Optional + +from datasets import load_dataset + +from nemo_rl.data.interfaces import TaskDataSpec + + +class MMLUProDataset: + def __init__(self, prompt_file: str, system_prompt_file: Optional[str] = None): + ds = load_dataset('TIGER-Lab/MMLU-Pro', split='test') + self.rekeyed_ds = ds.map(self._rekey, remove_columns=ds.column_names) + + self.task_spec = TaskDataSpec( + task_name='MMLU-Pro', + prompt_file=prompt_file, + system_prompt_file=system_prompt_file, + ) + + def _rekey(self, data: dict[str, Any]): + options = { + chr(ord('A') + i) : op for i, op in enumerate(data['options']) + } + return { + 'question': data['question'], + 'options': options, + 'answer': data['answer'], + 'subject': data['category'], + } + From 67aae53a686dfcb2cdf4e8bad783b377508a597a Mon Sep 17 00:00:00 2001 From: Xuehan Xiong Date: Tue, 24 Jun 2025 18:28:37 +0000 Subject: [PATCH 02/44] Add a verify worker for multiple-choice problems. 
Signed-off-by: Xuehan Xiong Signed-off-by: Xuehan --- nemo_rl/environments/math_environment.py | 36 +++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/nemo_rl/environments/math_environment.py b/nemo_rl/environments/math_environment.py index e8a47db06f..3f2c7cf7af 100644 --- a/nemo_rl/environments/math_environment.py +++ b/nemo_rl/environments/math_environment.py @@ -14,6 +14,7 @@ import contextlib import io import logging +import re from typing import Any, Optional, TypedDict import ray @@ -32,11 +33,13 @@ calculate_pass_rate_per_prompt, ) from nemo_rl.environments.utils import chunk_list_to_workers +from nemo_rl.evals import answer_parsing class MathEnvConfig(TypedDict): num_workers: int stop_strings: Optional[list[str]] # Default stop strings for this env + verifier_type: Optional[str] @contextlib.contextmanager @@ -97,6 +100,36 @@ def verify( return results +@ray.remote +class MultichoiceVerifyWorker: + + def verify( + self, pred_responses: list[str], ground_truths: list[str] + ) -> list[float]: + """Verify the correctness of the predicted responses against the ground truth. + + Args: + pred_responses: list[str]. The predicted responses from the LLM. + ground_truths: list[str]. The ground truth responses. + + Returns: + list[float]. The rewards for each predicted response. 
+ """ + results = [] + for response, ground_truth in zip(pred_responses, ground_truths): + response = answer_parsing.normalize_response(response) + extracted_answer = None + for answer_regex in answer_parsing.MULTILINGUAL_ANSWER_REGEXES: + regex = answer_parsing.MULTILINGUAL_ANSWER_PATTERN_TEMPLATE.format(answer_regex) + match = re.search(regex, response) + if match: + extracted_answer = answer_parsing.normalize_extracted_answer(match.group(1)) + break + score = 1.0 if extracted_answer == ground_truth else 0.0 + results.append(score) + return results + + class MathEnvironmentMetadata(TypedDict): ground_truth: str @@ -106,8 +139,9 @@ class MathEnvironment(EnvironmentInterface): def __init__(self, cfg: MathEnvConfig): self.cfg = cfg self.num_workers = cfg["num_workers"] + worker_cls = MultichoiceVerifyWorker if cfg.get("verifier_type", "math") == "multichoice" else HFVerifyWorker self.workers = [ - HFVerifyWorker.options( # type: ignore # (decorated with @ray.remote) + worker_cls.options( # type: ignore # (decorated with @ray.remote) runtime_env={"py_executable": PY_EXECUTABLES.SYSTEM} ).remote() for _ in range(self.num_workers) From 4134fcb087b1bdba0c87905acc80d5e456634cf2 Mon Sep 17 00:00:00 2001 From: Xuehan Xiong Date: Tue, 24 Jun 2025 18:59:43 +0000 Subject: [PATCH 03/44] add prompts for MMLU and GPQA. Signed-off-by: Xuehan Xiong Signed-off-by: Xuehan --- examples/prompts/gpqa.txt | 1 + examples/prompts/mmlu.txt | 1 + 2 files changed, 2 insertions(+) create mode 100644 examples/prompts/gpqa.txt create mode 100644 examples/prompts/mmlu.txt diff --git a/examples/prompts/gpqa.txt b/examples/prompts/gpqa.txt new file mode 100644 index 0000000000..04ea20d553 --- /dev/null +++ b/examples/prompts/gpqa.txt @@ -0,0 +1 @@ +Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering. 
diff --git a/examples/prompts/mmlu.txt b/examples/prompts/mmlu.txt new file mode 100644 index 0000000000..04ea20d553 --- /dev/null +++ b/examples/prompts/mmlu.txt @@ -0,0 +1 @@ +Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering. From 0ca559faeadb6375a2bc399bdfb6582b66b9f643 Mon Sep 17 00:00:00 2001 From: Xuehan Xiong Date: Tue, 24 Jun 2025 19:03:46 +0000 Subject: [PATCH 04/44] modifies eval script to support multiple-choice questions. Signed-off-by: Xuehan Xiong Signed-off-by: Xuehan --- examples/run_eval.py | 163 ++++++++++++++++++++++++++------ nemo_rl/evals/answer_parsing.py | 94 ++++++++++++++++++ 2 files changed, 226 insertions(+), 31 deletions(-) create mode 100644 nemo_rl/evals/answer_parsing.py diff --git a/examples/run_eval.py b/examples/run_eval.py index 6f7f60cc44..ae86046dbc 100644 --- a/examples/run_eval.py +++ b/examples/run_eval.py @@ -16,19 +16,25 @@ import os import pprint import sys +from typing import Any, cast + +import torch sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from datasets import load_dataset from omegaconf import OmegaConf -from transformers import AutoTokenizer +from transformers import AutoTokenizer, PreTrainedTokenizerBase from examples.run_grpo_math import math_data_processor from nemo_rl.algorithms.utils import get_tokenizer -from nemo_rl.data import MathDataConfig from nemo_rl.data.datasets import AllTaskProcessedDataset -from nemo_rl.data.interfaces import TaskDataSpec -from nemo_rl.data.llm_message_utils import remap_dataset_keys +from nemo_rl.data.eval_datasets import ( + gpqa, + math, + mmlu, + mmlu_pro, +) +from nemo_rl.data.interfaces import DatumSpec, TaskDataSpec from nemo_rl.distributed.ray_actor_environment_registry import ( get_actor_python_env, ) @@ -37,6 +43,8 @@ from nemo_rl.evals.eval import MasterConfig, run_env_eval, 
setup from nemo_rl.models.generation import configure_generation_config +TokenizerType = PreTrainedTokenizerBase + def parse_args(): """Parse command line arguments.""" @@ -54,28 +62,119 @@ def parse_args(): return args, overrides -def setup_data(tokenizer: AutoTokenizer, data_config: MathDataConfig, env_configs): - print("\n▶ Setting up data...") - math_task_spec = TaskDataSpec( - task_name="math", - prompt_file=data_config["prompt_file"], - system_prompt_file=data_config["system_prompt_file"], +def _construct_prompt(prompt: str, question: str, options: dict[str, str]) -> str: + """Construct prompt from question and options.""" + output = prompt + output += f"\n\nQuestion: {question}\nOptions:\n" + output += "\n".join( + [ + f"{letter}) {option}" + for letter, option in options.items() + if option is not None + ] ) - - # load dataset - base_dataset = load_dataset(data_config["dataset_name"]) - if data_config["dataset_key"] is not None: - base_dataset = base_dataset[data_config["dataset_key"]] - # remap problem and solution keys - remapped_dataset = remap_dataset_keys( - base_dataset, - mapping_dict={ - data_config["problem_key"]: "problem", - data_config["solution_key"]: "expected_answer", - }, + return output + + +def multichoice_qa_processor( + datum_dict: dict[str, Any], + task_data_spec: TaskDataSpec, + tokenizer: TokenizerType, + max_seq_length: int, + idx: int, +) -> DatumSpec: + """Process a datum dictionary (directly loaded from dataset) into a DatumSpec for multiple-choice problems.""" + question = datum_dict["question"] + answer = str(datum_dict["answer"]) + options = datum_dict["options"] + extra_env_info = {"ground_truth": answer} + if "subject" in datum_dict: + extra_env_info.update({"subject": datum_dict["subject"]}) + + message_log = [] + + # system prompt + if task_data_spec.system_prompt: + sys_prompt: dict[str, str | torch.Tensor] = { + "role": "system", + "content": task_data_spec.system_prompt, + } + sys = tokenizer.apply_chat_template( + 
[cast(dict[str, str], sys_prompt)], + tokenize=False, + add_generation_prompt=False, + add_special_tokens=False, + ) + sys_prompt["token_ids"] = tokenizer(sys, return_tensors="pt")["input_ids"][0] + message_log.append(sys_prompt) + + # user prompt + if task_data_spec.prompt: + problem = _construct_prompt(task_data_spec.prompt, question, options) + user_message = {"role": "user", "content": problem} + message = tokenizer.apply_chat_template( + [user_message], + tokenize=False, + add_generation_prompt=True, + add_special_tokens=False, ) + user_message["token_ids"] = tokenizer(message, return_tensors="pt")["input_ids"][0] + user_message["content"] = message + message_log.append(user_message) + + length = sum(len(m["token_ids"]) for m in message_log) + output: DatumSpec = { + "message_log": message_log, + "length": length, + "extra_env_info": extra_env_info, + "loss_multiplier": 1.0, + "idx": idx, + } + if "task_name" in datum_dict: + output["task_name"] = datum_dict["task_name"] + return output + + +def setup_data(tokenizer: AutoTokenizer, data_config, env_configs): + print("Setting up data...") - math_env = MathEnvironment.options( + # load dataset + dataset_name = data_config["dataset_name"] + data_processor_fn = multichoice_qa_processor + if dataset_name == "mmlu": + base_dataset = mmlu.MMLUDataset( + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + elif dataset_name == "gpqa": + base_dataset = gpqa.GPQADataset( + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + elif dataset_name == "mmlu_pro": + base_dataset = mmlu_pro.MMLUProDataset( + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + elif dataset_name == "math": + base_dataset = math.MathDataset( + variant="math_test", + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + data_processor_fn = 
math_data_processor + elif dataset_name == "math500": + base_dataset = math.MathDataset( + variant="math_500_test", + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + data_processor_fn = math_data_processor + else: + raise ValueError(f"Unknown dataset {dataset_name}.") + rekeyed_ds = base_dataset.rekeyed_ds + + env = MathEnvironment.options( runtime_env={ "py_executable": get_actor_python_env( "nemo_rl.environments.math_environment.MathEnvironment" @@ -84,14 +183,14 @@ def setup_data(tokenizer: AutoTokenizer, data_config: MathDataConfig, env_config ).remote(env_configs["math"]) dataset = AllTaskProcessedDataset( - dataset=remapped_dataset, + dataset=rekeyed_ds, tokenizer=tokenizer, - default_task_data_spec=math_task_spec, - task_data_processors=math_data_processor, + default_task_data_spec=base_dataset.task_spec, + task_data_processors=data_processor_fn, max_seq_length=data_config["max_input_seq_length"], ) - return dataset, math_env, tokenizer + return dataset, env, tokenizer def main(): @@ -100,7 +199,9 @@ def main(): args, overrides = parse_args() if not args.config: - args.config = os.path.join(os.path.dirname(__file__), "configs", "eval.yaml") + args.config = os.path.join( + os.path.dirname(__file__), "configs", "mmlu_eval.yaml" + ) config = OmegaConf.load(args.config) print(f"Loaded configuration from: {args.config}") @@ -129,7 +230,7 @@ def main(): # Setup data ( dataset, - math_env, + env, tokenizer, ) = setup_data(tokenizer, config["data"], config["env"]) @@ -144,7 +245,7 @@ def main(): run_env_eval( vllm_generation, dataloader, - math_env, + env, master_config, ) diff --git a/nemo_rl/evals/answer_parsing.py b/nemo_rl/evals/answer_parsing.py new file mode 100644 index 0000000000..d4e2fddd6f --- /dev/null +++ b/nemo_rl/evals/answer_parsing.py @@ -0,0 +1,94 @@ +"""Contains utility functions for answer parsing.""" + + +MULTILINGUAL_ANSWER_PATTERN_TEMPLATE = ( + "(?i){}[ 
\t]*([A-D]|[أ-د]|[অ]|[ব]|[ড]|[ঢ]|[A]|[B]|[C]|[D])" +) +# All the different ways "Answer" is written in different languages +MULTILINGUAL_ANSWER_REGEXES = [ + "Answer\s*:", + "Answer\s*:​​​​​​", # Korean invisible character + "উত্তর\s*:", + "उत्तर\s*:", + "উত্তরঃ", + "উত্তর\s*:", + "Antwort\s*:", + "답변\s*:", + "정답\s*:", + "답\s*:", + "答案\s*:", + "答案\s*:", + "答\s*:", + "答\s*:", + "答复\s*:", + "答曰\s*:", + "الإجابة:", + "الجواب:", + "إجابة:", + "الإجابة النهائية:", + "الإجابة الصحيحة:", + "الإجابة الصحيحة هي:", + "الإجابة هي:", + "الجواب النهائي:", + "Respuesta\s*:", + "Risposta\s*:", + "答え\s*:", + "答え\s*:", + "回答\s*:", + "回答\s*:", + "解答\s*:", + "Jawaban\s*:", + "Réponse\s*:", + "Resposta\s*:", + "Jibu\s*:", + "Idahun\s*:", + "Ìdáhùn\s*:", + "Idáhùn\s*:", + "Àmọ̀nà\s*:", + "Àdáhùn\s*:", + "Ànúgọ\s*:", + "Àṣàyàn\s*:", +] + + +def normalize_extracted_answer(extracted_answer: str) -> str: + return ( + # In arabic these are the letters used for A-D in multiple choice questions + extracted_answer.replace("أ", " A") + .replace("ب", " B") + .replace("ج", " C") + .replace("د", " D") + # In Bengali these are the letters used for A-D in multiple choice questions + .replace("অ", " A") + .replace("ব", " B") + .replace("ড", " C") + .replace("ঢ", " D") + # In Japanese these are the letters sometimes used for A-D in multiple choice questions + .replace("A", " A") + .replace("B", " B") + .replace("C", " C") + .replace("D", " D") + .strip() + ) + + +def normalize_response(response: str) -> str: + """ + Normalize the response by removing markdown and LaTeX formatting that may prevent a match. 
+ """ + + return ( + response.replace("**", "") + .replace("$\\boxed{", "") + .replace("}$", "") + .replace("\\$", "") + .replace("$\\text{", "") + .replace("$", "") + .replace("\\mathrm{", "") + .replace("\\{", "") + .replace("\\text", "") + .replace("\\(", "") + .replace("\\mathbf{", "") + .replace("{", "") + .replace("\\boxed", "") + ) From 2163cbf2047af52152524a9d2362046ac3b9c5c1 Mon Sep 17 00:00:00 2001 From: Xuehan Xiong Date: Tue, 24 Jun 2025 19:04:15 +0000 Subject: [PATCH 05/44] add eval config files. Signed-off-by: Xuehan Xiong Signed-off-by: Xuehan --- examples/configs/{ => evals}/eval.yaml | 0 examples/configs/evals/gpqa_eval.yaml | 42 ++++++++++++++++++++++++++ examples/configs/evals/math_eval.yaml | 41 +++++++++++++++++++++++++ 3 files changed, 83 insertions(+) rename examples/configs/{ => evals}/eval.yaml (100%) create mode 100644 examples/configs/evals/gpqa_eval.yaml create mode 100644 examples/configs/evals/math_eval.yaml diff --git a/examples/configs/eval.yaml b/examples/configs/evals/eval.yaml similarity index 100% rename from examples/configs/eval.yaml rename to examples/configs/evals/eval.yaml diff --git a/examples/configs/evals/gpqa_eval.yaml b/examples/configs/evals/gpqa_eval.yaml new file mode 100644 index 0000000000..93b991b185 --- /dev/null +++ b/examples/configs/evals/gpqa_eval.yaml @@ -0,0 +1,42 @@ +# Evaluation Configuration +eval: + metric: "pass@1" # only pass@1 is supported now + num_tests_per_prompt: 1 # every prompt will be tested num_tests_per_prompt times and use the average score as the final score + seed: 42 + +generation: + backend: "vllm" # only vllm is supported for evaluation + max_new_tokens: ${generation.vllm_cfg.max_model_len} + temperature: 0.0 + top_p: 1.0 + top_k: -1 # -1 means disable + num_prompts_per_step: 16 # -1 means pass all prompts at once + model_name: "Qwen/Qwen2.5-7B-Instruct" + stop_token_ids: null + stop_strings: null + vllm_cfg: + async_engine: false + precision: "bfloat16" + tensor_parallel_size: 1 + 
pipeline_parallel_size: 1 + gpu_memory_utilization: 0.9 + max_model_len: 3072 + +tokenizer: + name: ${generation.model_name} ## specify if you'd like to use a tokenizer different from the model's default + chat_template: "default" + +data: + max_input_seq_length: ${generation.vllm_cfg.max_model_len} # useless since we directly use prompts in evaluation + prompt_file: "examples/prompts/gpqa.txt" + system_prompt_file: null + dataset_name: "gpqa" + +env: + math: + num_workers: 8 + verifier_type: "multichoice" + +cluster: + gpus_per_node: 1 + num_nodes: 1 diff --git a/examples/configs/evals/math_eval.yaml b/examples/configs/evals/math_eval.yaml new file mode 100644 index 0000000000..32a4a3281c --- /dev/null +++ b/examples/configs/evals/math_eval.yaml @@ -0,0 +1,41 @@ +# Evaluation Configuration +eval: + metric: "pass@1" # only pass@1 is supported now + num_tests_per_prompt: 1 # every prompt will be tested num_tests_per_prompt times and use the average score as the final score + seed: 42 + +generation: + backend: "vllm" # only vllm is supported for evaluation + max_new_tokens: ${generation.vllm_cfg.max_model_len} + temperature: 0.0 + top_p: 1.0 + top_k: -1 # -1 means disable + num_prompts_per_step: 16 # -1 means pass all prompts at once + model_name: "Qwen/Qwen2.5-7B-Instruct" + stop_token_ids: null + stop_strings: null + vllm_cfg: + async_engine: false + precision: "bfloat16" + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + gpu_memory_utilization: 0.9 + max_model_len: 2048 + +tokenizer: + name: ${generation.model_name} ## specify if you'd like to use a tokenizer different from the model's default + chat_template: "default" + +data: + max_input_seq_length: ${generation.vllm_cfg.max_model_len} # useless since we directly use prompts in evaluation + prompt_file: "examples/prompts/cot.txt" + system_prompt_file: null + dataset_name: "math" + +env: + math: + num_workers: 8 + +cluster: + gpus_per_node: 1 + num_nodes: 1 From d9dd544d1da60536dd3ae3787f09dea1355105d5 Mon 
Sep 17 00:00:00 2001 From: Xuehan Xiong Date: Tue, 24 Jun 2025 21:14:18 +0000 Subject: [PATCH 06/44] add unit tests. Signed-off-by: Xuehan Xiong xxman@google.com Signed-off-by: Xuehan Xiong Signed-off-by: Xuehan --- nemo_rl/data/eval_datasets/mmlu.py | 4 +- tests/unit/data/eval_datasets/test_gpqa.py | 40 +++++++++ tests/unit/data/eval_datasets/test_math.py | 39 +++++++++ tests/unit/data/eval_datasets/test_mmlu.py | 41 ++++++++++ .../environments/test_math_environment.py | 81 +++++++++++++++++++ 5 files changed, 203 insertions(+), 2 deletions(-) create mode 100644 tests/unit/data/eval_datasets/test_gpqa.py create mode 100644 tests/unit/data/eval_datasets/test_math.py create mode 100644 tests/unit/data/eval_datasets/test_mmlu.py diff --git a/nemo_rl/data/eval_datasets/mmlu.py b/nemo_rl/data/eval_datasets/mmlu.py index f0f126850a..f6ab075886 100644 --- a/nemo_rl/data/eval_datasets/mmlu.py +++ b/nemo_rl/data/eval_datasets/mmlu.py @@ -8,8 +8,8 @@ class MMLUDataset: - def __init__(self, prompt_file: str, system_prompt_file: Optional[str] = None): - ds = load_dataset('csv', data_files=f"https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", split='train') + def __init__(self, prompt_file: Optional[str] = None, system_prompt_file: Optional[str] = None): + ds = load_dataset('csv', data_files="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", split='train') self.rekeyed_ds = ds.map(self._rekey, remove_columns=ds.column_names) self.task_spec = TaskDataSpec( diff --git a/tests/unit/data/eval_datasets/test_gpqa.py b/tests/unit/data/eval_datasets/test_gpqa.py new file mode 100644 index 0000000000..033a11b6ff --- /dev/null +++ b/tests/unit/data/eval_datasets/test_gpqa.py @@ -0,0 +1,40 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pytest +from transformers import AutoTokenizer + +from nemo_rl.data.eval_datasets.gpqa import GPQADataset + + +@pytest.mark.skip(reason="dataset download is flaky") +def test_gpqa_dataset(): + tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct") + gpqa_dataset = GPQADataset() + + # check that the dataset is formatted correctly + for example in gpqa_dataset.rekeyed_ds.take(5): + assert "question" in example + assert "options" in example + assert "answer" in example + + ## check that applying chat template works as expected + default_templated = tokenizer.apply_chat_template( + [{"role": "user", "content": example["question"]}], + tokenize=False, + add_generation_prompt=False, + add_special_tokens=False, + ) + + assert default_templated == f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n{example["question"]}<|im_end|>\n" + diff --git a/tests/unit/data/eval_datasets/test_math.py b/tests/unit/data/eval_datasets/test_math.py new file mode 100644 index 0000000000..7a524654fa --- /dev/null +++ b/tests/unit/data/eval_datasets/test_math.py @@ -0,0 +1,39 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pytest +from transformers import AutoTokenizer + +from nemo_rl.data.eval_datasets.math import MathDataset + + +@pytest.mark.skip(reason="dataset download is flaky") +def test_math_dataset(): + tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct") + math_dataset = MathDataset() + + # check that the dataset is formatted correctly + for example in math_dataset.rekeyed_ds.take(5): + assert "problem" in example + assert "expected_answer" in example + + ## check that applying chat template works as expected + default_templated = tokenizer.apply_chat_template( + [{"role": "user", "content": example["problem"]}], + tokenize=False, + add_generation_prompt=False, + add_special_tokens=False, + ) + + assert default_templated == f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n{example["problem"]}<|im_end|>\n" + diff --git a/tests/unit/data/eval_datasets/test_mmlu.py b/tests/unit/data/eval_datasets/test_mmlu.py new file mode 100644 index 0000000000..df5dabaef9 --- /dev/null +++ b/tests/unit/data/eval_datasets/test_mmlu.py @@ -0,0 +1,41 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pytest +from transformers import AutoTokenizer + +from nemo_rl.data.eval_datasets.mmlu import MMLUDataset + + +@pytest.mark.skip(reason="dataset download is flaky") +def test_mmlu_dataset(): + tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct") + mmlu_dataset = MMLUDataset() + + # check that the dataset is formatted correctly + for example in mmlu_dataset.rekeyed_ds.take(5): + assert "question" in example + assert "options" in example + assert "answer" in example + assert "subject" in example + + ## check that applying chat template works as expected + default_templated = tokenizer.apply_chat_template( + [{"role": "user", "content": example["question"]}], + tokenize=False, + add_generation_prompt=False, + add_special_tokens=False, + ) + + assert default_templated == f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\n<|im_start|>user\n{example["question"]}<|im_end|>\n" + diff --git a/tests/unit/environments/test_math_environment.py b/tests/unit/environments/test_math_environment.py index 386a21e2f8..ed599bcd5e 100644 --- a/tests/unit/environments/test_math_environment.py +++ b/tests/unit/environments/test_math_environment.py @@ -42,6 +42,25 @@ def math_env(): time.sleep(0.1) +@pytest.fixture(scope="module") +def multichoice_env(): + """Create a MathEnvironment actor for testing.""" + env = MathEnvironment.options( + runtime_env={ + "py_executable": get_actor_python_env( + "nemo_rl.environments.math_environment.MathEnvironment" + ), + "env_vars": dict(os.environ), + } + ).remote({"num_workers": 2, "verifier_type": "multichoice"}) + yield env + # Clean up the actor and wait for it to be killed + env.shutdown.remote() + ray.kill(env) + # Give some time for cleanup + time.sleep(0.1) + + @pytest.fixture def basic_test_data(): """Common test data for basic math problems.""" @@ -68,6 +87,32 @@ def basic_test_data(): } +@pytest.fixture +def basic_multichoice_test_data(): + """Common test data for basic multichoice problems.""" + return { + "message_log_batch": [ + [ + {"role": "user", "content": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD"}, + {"role": "assistant", "content": "\nAnswer: C"}, + ], + [ + {"role": "user", "content": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD"}, + {"role": "assistant", "content": "\nAnswer: B"}, + ], + [ + {"role": "user", "content": "Answer the following multiple choice question. 
 The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD"}, + {"role": "assistant", "content": "\nAnswer: D"}, + ], + ], + "metadata": [ + {"ground_truth": "C"}, + {"ground_truth": "B"}, + {"ground_truth": "B"}, + ], + } + + @pytest.fixture def mixed_test_data(): """Test data with mix of correct and incorrect responses.""" @@ -148,6 +193,42 @@ def test_math_env_step_basic(math_env, basic_test_data): assert all(result.terminateds == 1.0), "All terminated flags should be 1.0" +def test_multichoice_env_step_basic(multichoice_env, basic_multichoice_test_data): + """Test basic functionality of MathEnvironment step with multichoice verifier.""" + result = ray.get( + multichoice_env.step.remote( + basic_multichoice_test_data["message_log_batch"], basic_multichoice_test_data["metadata"] + ) + ) + + # Check observations using field access + assert len(result.observations) == 3, ( + "Should return observations for all 3 messages" + ) + assert all(obs["role"] == "environment" for obs in result.observations), ( + "All observations should be from environment" + ) + assert all( + obs["content"] == "Environment: correct" for obs in result.observations[:2] + ), "The first two responses should be correct" + assert result.observations[2]["content"] == "Environment: incorrect", "The third response should be incorrect" + + # Check metadata + assert len(result.metadata) == 3, "Should return metadata for all 3 messages" + assert result.metadata == basic_multichoice_test_data["metadata"], ( + "Metadata should be unchanged" + ) + + # Check rewards and done flags + assert result.rewards.shape == (3,), "Rewards should be a tensor of shape (3,)" + assert all(result.rewards[:2] == 1.0), "The first two rewards should be 1.0 for correct answers" + assert result.rewards[2] == 0.0, "The third reward should be 0.0 for wrong answer" + assert result.terminateds.shape == (3,), ( + "Terminated flags should be a tensor of shape 
(3,)" + ) + assert all(result.terminateds == 1.0), "All terminated flags should be 1.0" + + def test_math_env_step_mixed(math_env, mixed_test_data): """Test MathEnvironment step with a mix of correct and incorrect responses.""" result = ray.get( From 11a1de5a9d12d3e6afd0c19578cece2cb17d1a0e Mon Sep 17 00:00:00 2001 From: Xuehan Xiong Date: Tue, 24 Jun 2025 21:39:26 +0000 Subject: [PATCH 07/44] add AIME 2024 dataset. Signed-off-by: Xuehan Xiong Signed-off-by: Xuehan --- examples/configs/evals/eval.yaml | 5 +---- examples/run_eval.py | 9 ++++++++- nemo_rl/data/eval_datasets/aime2024.py | 27 ++++++++++++++++++++++++++ 3 files changed, 36 insertions(+), 5 deletions(-) create mode 100644 nemo_rl/data/eval_datasets/aime2024.py diff --git a/examples/configs/evals/eval.yaml b/examples/configs/evals/eval.yaml index 263342306f..1c21af99c4 100644 --- a/examples/configs/evals/eval.yaml +++ b/examples/configs/evals/eval.yaml @@ -30,10 +30,7 @@ data: max_input_seq_length: ${generation.vllm_cfg.max_model_len} # useless since we directly use prompts in evaluation prompt_file: null system_prompt_file: null - dataset_name: "HuggingFaceH4/aime_2024" - dataset_key: "train" - problem_key: "problem" - solution_key: "answer" + dataset_name: "aime2024" env: math: diff --git a/examples/run_eval.py b/examples/run_eval.py index ae86046dbc..b9fb7e89bb 100644 --- a/examples/run_eval.py +++ b/examples/run_eval.py @@ -29,6 +29,7 @@ from nemo_rl.algorithms.utils import get_tokenizer from nemo_rl.data.datasets import AllTaskProcessedDataset from nemo_rl.data.eval_datasets import ( + aime2024, gpqa, math, mmlu, @@ -146,6 +147,12 @@ def setup_data(tokenizer: AutoTokenizer, data_config, env_configs): prompt_file=data_config["prompt_file"], system_prompt_file=data_config["system_prompt_file"], ) + elif dataset_name == "aime2024": + base_dataset = aime2024.AIME2024Dataset( + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + data_processor_fn = 
math_data_processor elif dataset_name == "gpqa": base_dataset = gpqa.GPQADataset( prompt_file=data_config["prompt_file"], @@ -200,7 +207,7 @@ def main(): if not args.config: args.config = os.path.join( - os.path.dirname(__file__), "configs", "mmlu_eval.yaml" + os.path.dirname(__file__), "configs", "evals", "eval.yaml" ) config = OmegaConf.load(args.config) diff --git a/nemo_rl/data/eval_datasets/aime2024.py b/nemo_rl/data/eval_datasets/aime2024.py new file mode 100644 index 0000000000..1eff661718 --- /dev/null +++ b/nemo_rl/data/eval_datasets/aime2024.py @@ -0,0 +1,27 @@ +"""AIME 2024 dataset.""" + +from typing import Any, Literal, Optional + +from datasets import load_dataset + +from nemo_rl.data.interfaces import TaskDataSpec + + +class AIME2024Dataset: + def __init__(self, + prompt_file: Optional[str]=None, + system_prompt_file: Optional[str]=None, + ): + ds = load_dataset("HuggingFaceH4/aime_2024", split="train") + self.rekeyed_ds = ds.map(self._rekey, remove_columns=ds.column_names) + self.task_spec = TaskDataSpec( + task_name="aime2024", + prompt_file=prompt_file, + system_prompt_file=system_prompt_file, + ) + + def _rekey(self, data: dict[str, Any]): + return { + 'problem': data['problem'], + 'expected_answer': data['answer'], + } From 4da0c4315108f1fbe0e68e3a9e959bd384ce42c0 Mon Sep 17 00:00:00 2001 From: Xuehan Xiong Date: Wed, 25 Jun 2025 17:16:14 +0000 Subject: [PATCH 08/44] add GPQA main version. 
Signed-off-by: Xuehan Xiong xxman@google.com Signed-off-by: Xuehan Xiong Signed-off-by: Xuehan --- examples/configs/evals/gpqa_eval.yaml | 2 +- examples/run_eval.py | 7 +++++++ nemo_rl/data/eval_datasets/gpqa.py | 22 ++++++++++++---------- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/examples/configs/evals/gpqa_eval.yaml b/examples/configs/evals/gpqa_eval.yaml index 93b991b185..b882c1acd8 100644 --- a/examples/configs/evals/gpqa_eval.yaml +++ b/examples/configs/evals/gpqa_eval.yaml @@ -10,7 +10,7 @@ generation: temperature: 0.0 top_p: 1.0 top_k: -1 # -1 means disable - num_prompts_per_step: 16 # -1 means pass all prompts at once + num_prompts_per_step: -1 # -1 means pass all prompts at once model_name: "Qwen/Qwen2.5-7B-Instruct" stop_token_ids: null stop_strings: null diff --git a/examples/run_eval.py b/examples/run_eval.py index b9fb7e89bb..117db1deab 100644 --- a/examples/run_eval.py +++ b/examples/run_eval.py @@ -155,6 +155,13 @@ def setup_data(tokenizer: AutoTokenizer, data_config, env_configs): data_processor_fn = math_data_processor elif dataset_name == "gpqa": base_dataset = gpqa.GPQADataset( + variant="main", + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + elif dataset_name == "gpqa_diamond": + base_dataset = gpqa.GPQADataset( + variant="diamond", prompt_file=data_config["prompt_file"], system_prompt_file=data_config["system_prompt_file"], ) diff --git a/nemo_rl/data/eval_datasets/gpqa.py b/nemo_rl/data/eval_datasets/gpqa.py index 1287e446a0..0662a1a5e2 100644 --- a/nemo_rl/data/eval_datasets/gpqa.py +++ b/nemo_rl/data/eval_datasets/gpqa.py @@ -1,25 +1,28 @@ """GPQA dataset and its variants.""" import random -from typing import Any, Optional +from typing import Any, Literal, Optional from datasets import load_dataset from nemo_rl.data.interfaces import TaskDataSpec - class GPQADataset: - def __init__(self, variant: str = "diamond", prompt_file: Optional[str]=None, system_prompt_file: 
Optional[str]=None): - ds = load_dataset("csv", data_files=f"https://openaipublic.blob.core.windows.net/simple-evals/gpqa_{variant}.csv", split="train") + def __init__(self, + variant: Literal["diamond", "main"] = "diamond", + prompt_file: Optional[str] = None, + system_prompt_file: Optional[str]=None, + ): + ds = load_dataset("Idavidrein/gpqa", f"gpqa_{variant}", split="train") self._rng = random.Random() self.rekeyed_ds = ds.map(self._rekey, remove_columns=ds.column_names) self.task_spec = TaskDataSpec( - task_name=f"GPQA_{variant}", + task_name=f'GPQA_{variant}', prompt_file=prompt_file, system_prompt_file=system_prompt_file, ) - + def _rekey(self, data: dict[str, Any]): choices = [ data["Correct Answer"], @@ -32,13 +35,12 @@ def _rekey(self, data: dict[str, Any]): correct_index = choices.index(data["Correct Answer"]) correct_answer = "ABCD"[correct_index] return { - "question": data["Question"], - "options": dict( + 'question': data['Question'], + 'options': dict( A=choices[0], B=choices[1], C=choices[2], D=choices[3], ), - "answer": correct_answer, + 'answer': correct_answer, } - From 5870e46466f901540f0bf60fa35aacef86b1a504 Mon Sep 17 00:00:00 2001 From: yuki <48991475+yuki-666@users.noreply.github.com> Date: Fri, 27 Jun 2025 01:20:58 +0800 Subject: [PATCH 09/44] fix: remove reference_model_buffers in fsdp2 (#558) Signed-off-by: Yuki Huang Signed-off-by: Xuehan --- .../models/policy/dtensor_policy_worker.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/nemo_rl/models/policy/dtensor_policy_worker.py b/nemo_rl/models/policy/dtensor_policy_worker.py index 70e5617040..61dcd9a127 100644 --- a/nemo_rl/models/policy/dtensor_policy_worker.py +++ b/nemo_rl/models/policy/dtensor_policy_worker.py @@ -235,9 +235,6 @@ def __init__( self.reference_model_state_dict = get_cpu_state_dict( self.model.state_dict().items(), pin_memory=True ) - self.reference_model_buffers = get_cpu_state_dict( - self.model.named_buffers(), pin_memory=True 
- ) if init_optimizer: optimizer_cls = import_class_from_path(self.cfg["optimizer"]["name"]) @@ -768,32 +765,26 @@ def use_reference_model(self) -> Generator[None, None, None]: """ with torch.no_grad(): try: + # Save train model state_dict curr_state_dict = get_cpu_state_dict( self.model.state_dict().items(), pin_memory=True ) - curr_buffers = get_cpu_state_dict( - self.model.named_buffers(), pin_memory=True - ) + # Swap reference model state_dict to self.model for k, v in self.model.state_dict().items(): val = to_local_if_dtensor(v) val.copy_(self.reference_model_state_dict[k]) - for k, v in self.model.named_buffers(): - val = to_local_if_dtensor(v) - val.copy_(self.reference_model_buffers[k]) - + # - self.model is the original reference_model, now on CUDA + # - curr_state_dict is the train model, now on CPU yield finally: + # Restore train model state_dict for k, v in self.model.state_dict().items(): val = to_local_if_dtensor(v) val.copy_(curr_state_dict[k]) - for k, v in self.model.named_buffers(): - val = to_local_if_dtensor(v) - val.copy_(curr_buffers[k]) - def get_reference_policy_logprobs( self, data: BatchedDataDict[Any], micro_batch_size: Optional[int] = None ) -> BatchedDataDict[ReferenceLogprobOutputSpec]: From 79690a11de285f990fea4e1e4d5b40a0167f9b1b Mon Sep 17 00:00:00 2001 From: Parth Chadha Date: Thu, 26 Jun 2025 13:00:18 -0700 Subject: [PATCH 10/44] fix: Add assertion if async is disabled when using pp with vllm (#565) Signed-off-by: Parth Chadha Signed-off-by: Xuehan --- examples/configs/grpo_math_1B.yaml | 2 +- nemo_rl/models/generation/vllm.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml index 283a3d9c31..85cc620b62 100644 --- a/examples/configs/grpo_math_1B.yaml +++ b/examples/configs/grpo_math_1B.yaml @@ -101,7 +101,7 @@ policy: stop_token_ids: null stop_strings: null vllm_cfg: - async_engine: false # Only for internal testing, will be enabled by 
https://github.com/NVIDIA/NeMo-RL/issues/447. + async_engine: false precision: ${policy.precision} tensor_parallel_size: 1 pipeline_parallel_size: 1 diff --git a/nemo_rl/models/generation/vllm.py b/nemo_rl/models/generation/vllm.py index 3bf64b2652..0b0bb00ad6 100644 --- a/nemo_rl/models/generation/vllm.py +++ b/nemo_rl/models/generation/vllm.py @@ -1100,6 +1100,12 @@ def __init__( """Initialize a vLLM policy with distributed workers.""" # Store config self.cfg = config + if self.cfg["vllm_cfg"]["pipeline_parallel_size"] > 1: + assert self.cfg["vllm_cfg"]["async_engine"], ( + "When pipeline_parallel_size > 1, async_engine must be set to True in the vLLM configuration. " + "You can enable it by adding `policy.generation.vllm_cfg.async_engine=true` to your command." + ) + # Ensure all required VllmConfig fields are present missing_keys = [ key for key in VllmConfig.__required_keys__ if key not in self.cfg From 940049f1e1c1661b312df00a02cebf3a19b02dfc Mon Sep 17 00:00:00 2001 From: Parth Chadha Date: Thu, 26 Jun 2025 13:14:17 -0700 Subject: [PATCH 11/44] fix: remove visualization code (#566) Signed-off-by: Yuki Huang Signed-off-by: Parth Chadha Co-authored-by: yuki <48991475+yuki-666@users.noreply.github.com> Signed-off-by: Xuehan --- nemo_rl/distributed/virtual_cluster.py | 241 +----------------- nemo_rl/distributed/worker_groups.py | 7 - .../distributed/test_cluster_visualization.py | 132 ---------- 3 files changed, 1 insertion(+), 379 deletions(-) delete mode 100644 tests/unit/distributed/test_cluster_visualization.py diff --git a/nemo_rl/distributed/virtual_cluster.py b/nemo_rl/distributed/virtual_cluster.py index 3c7a557c24..22fe0bd670 100644 --- a/nemo_rl/distributed/virtual_cluster.py +++ b/nemo_rl/distributed/virtual_cluster.py @@ -15,7 +15,7 @@ import os import sys import time -from typing import Any, Optional, TypedDict +from typing import Optional, TypedDict import ray from ray.util.placement_group import ( @@ -395,245 +395,6 @@ def shutdown(self) -> bool: 
return True - def _create_visualization_grid( - self, worker_groups: Optional[Any] = None, is_global_view: bool = False - ) -> dict[str, Any]: - """Create a visualization grid for the cluster with optional worker groups. - - Args: - worker_groups: Single worker group, list of worker groups, or None - is_global_view: Whether this is a global view (multiple worker groups) or single view - - Returns: - dict: A dictionary containing the grid data for display - """ - # Convert single worker group to list for uniform processing - if worker_groups is not None and not isinstance(worker_groups, list): - worker_groups = [worker_groups] - elif worker_groups is None: - worker_groups = [] - - # Find the maximum number of GPUs per node for grid layout - max_gpus_per_node = ( - max(self._bundle_ct_per_node_list) if self._bundle_ct_per_node_list else 0 - ) - if max_gpus_per_node == 0: - return {"empty": True} - - # Number of nodes with GPUs - active_nodes = sum(1 for count in self._bundle_ct_per_node_list if count > 0) - - # Determine cell width based on view type - cell_width = 12 if is_global_view else 7 - - # Create horizontal divider based on max GPUs per node - h_divider = "+" + "+".join(["-" * cell_width] * max_gpus_per_node) + "+" - - # Build the grid data - grid_data = { - "active_nodes": active_nodes, - "total_gpus": self.world_size(), - "worker_groups": worker_groups, - "max_gpus_per_node": max_gpus_per_node, - "cell_width": cell_width, - "h_divider": h_divider, - "is_global_view": is_global_view, - "rows": [], - } - - # For each node, create its row in the grid - for node_idx, bundle_count in enumerate(self._bundle_ct_per_node_list): - if bundle_count == 0: - continue - - # Initialize row data - node_row = { - "node_idx": node_idx, - "bundle_count": bundle_count, - "gpu_cells": [], - "worker_cells": [], - } - - # Initialize worker cells arrays (one per worker group) - for i in range(len(worker_groups)): - node_row["worker_cells"].append([]) # type: ignore - - # Process 
each GPU position in the row - for gpu_idx in range(max_gpus_per_node): - if gpu_idx < bundle_count: - # This is a real GPU - gpu_cell = f" {node_idx}.{gpu_idx} " - - # Process worker assignments for this GPU - worker_cells = self._get_worker_cells( - node_idx, gpu_idx, worker_groups, cell_width, is_global_view - ) - else: - # Empty cell (no GPU) - gpu_cell = " " * cell_width - worker_cells = [" " * cell_width] * len(worker_groups) - - # Add cells to the row - node_row["gpu_cells"].append(gpu_cell) # type: ignore - for i, cell in enumerate(worker_cells): - if i < len(node_row["worker_cells"]): # type: ignore - node_row["worker_cells"][i].append(cell) # type: ignore - - # Add the completed row to the grid - grid_data["rows"].append(node_row) - - return grid_data - - def _get_worker_cells( - self, - node_idx: int, - gpu_idx: int, - worker_groups: list[Any], - cell_width: int, - is_global_view: bool, - ) -> list[str]: - """Get the worker cell content for each worker group at a specific GPU location. 
- - Args: - node_idx: The node index - gpu_idx: The GPU index within the node - worker_groups: List of worker groups to check - cell_width: Width of each cell for formatting - is_global_view: Whether this is a global view with multiple worker groups - - Returns: - list: List of formatted worker cells, one per worker group - """ - worker_cells = [] - - for wg_idx, worker_group in enumerate(worker_groups): - # Default empty worker cell - worker_cell = " " * cell_width - - # Find workers from this group assigned to this GPU - for worker_id, metadata in enumerate(worker_group.worker_metadata): - if ( - metadata["node_idx"] == node_idx - and metadata["local_rank"] == gpu_idx - ): - if is_global_view: - # Use group numbering in global view - worker_cell = f" G{wg_idx}:W{worker_id:<2d} " - else: - # Use simple worker IDs in single group view - worker_cell = f" W {worker_id:<2d} " - break - - worker_cells.append(worker_cell) - - return worker_cells - - def _print_visualization(self, grid_data: dict[str, Any]) -> None: - """Print the visualization based on the grid data. 
- - Args: - grid_data: The grid data generated by _create_visualization_grid - """ - if grid_data.get("empty", False): - print("\nEmpty Ray Cluster (no GPUs)") - return - - # Print header - if grid_data["is_global_view"]: - # Global view header - wg_summary = "" - if grid_data["worker_groups"]: - wg_summary = f", Worker Groups: {len(grid_data['worker_groups'])}" - - print( - f"\nRay Cluster Global View: {grid_data['active_nodes']} nodes, {grid_data['total_gpus']} GPUs{wg_summary}" - ) - else: - # Single view header - wg_info = "" - if grid_data["worker_groups"]: - worker_group = grid_data["worker_groups"][0] - wg_name = getattr(worker_group, "name_prefix", "Default") or "Default" - wg_info = ( - f", Worker Group: {wg_name} ({worker_group.world_size} workers)" - ) - - print( - f"\nRay Cluster: {grid_data['active_nodes']} nodes, {grid_data['total_gpus']} GPUs{wg_info}" - ) - - # Print the top border - print(grid_data["h_divider"]) - - # Print each row of the grid - for row in grid_data["rows"]: - # Print GPU row - gpu_row = ["|"] - for cell in row["gpu_cells"]: - gpu_row.append(cell.ljust(grid_data["cell_width"])) - gpu_row.append("|") - print("".join(gpu_row)) - - # Print worker rows - for wg_idx, worker_cells in enumerate(row["worker_cells"]): - worker_row = ["|"] - for cell in worker_cells: - worker_row.append(cell.ljust(grid_data["cell_width"])) - worker_row.append("|") - print("".join(worker_row)) - - # Print divider between nodes - print(grid_data["h_divider"]) - - # Print legend - self._print_legend(grid_data) - - def _print_legend(self, grid_data: dict[str, Any]) -> None: - """Print the legend for the visualization.""" - if grid_data["is_global_view"]: - # Legend for global view - if grid_data["worker_groups"]: - print("Legend:") - for wg_idx, wg in enumerate(grid_data["worker_groups"]): - wg_name = getattr(wg, "name_prefix", "unnamed") or "unnamed" - wg_count = wg.world_size - print(f"G{wg_idx}: {wg_name} ({wg_count} workers)") - print("W##: Worker ID within 
its group") - else: - # Legend for single worker group view - if grid_data["worker_groups"]: - wg_name = ( - getattr(grid_data["worker_groups"][0], "name_prefix", "") or "" - ) - print(f"W## = Worker ID in '{wg_name}' worker group") - - print("#.#: Node.GPU identifier") - - def print_cluster_grid(self, worker_group: Optional[Any] = None) -> None: - """Prints a compact grid visualization of the virtual cluster, similar to JAX's visualize_array_sharding. - - If a worker_group is provided, it will also show worker assignments on each device. - - Args: - worker_group: Optional RayWorkerGroup instance to visualize worker assignments - """ - grid_data = self._create_visualization_grid(worker_group, is_global_view=False) - self._print_visualization(grid_data) - - def print_all_worker_groups( - self, worker_groups: Optional[list[Any]] = None - ) -> None: - """Prints a visualization showing all worker groups in the cluster. - - This provides a global view of all workers across all worker groups. - - Args: - worker_groups: List of RayWorkerGroup instances to visualize. If None, - no worker assignments will be shown. - """ - grid_data = self._create_visualization_grid(worker_groups, is_global_view=True) - self._print_visualization(grid_data) - def __del__(self) -> None: """Shutsdown the virtual cluster when the object is deleted or is garbage collected. diff --git a/nemo_rl/distributed/worker_groups.py b/nemo_rl/distributed/worker_groups.py index 71190d8774..a283e6b18c 100644 --- a/nemo_rl/distributed/worker_groups.py +++ b/nemo_rl/distributed/worker_groups.py @@ -896,10 +896,3 @@ def shutdown( self._worker_metadata = [] return success - - def print_worker_layout(self) -> None: - """Prints a visual representation of the worker layout across the virtual cluster. - - This shows which workers are assigned to which nodes and GPUs. 
- """ - self.cluster.print_cluster_grid(self) diff --git a/tests/unit/distributed/test_cluster_visualization.py b/tests/unit/distributed/test_cluster_visualization.py deleted file mode 100644 index d6dc31e1a5..0000000000 --- a/tests/unit/distributed/test_cluster_visualization.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from unittest.mock import MagicMock, patch - -import pytest - -from nemo_rl.distributed.virtual_cluster import RayVirtualCluster - - -@pytest.fixture(autouse=True) -def mock_virtual_cluster_pg(): - # Mock the _init_placement_groups and get_placement_groups methods to avoid actually initializing placement groups - with ( - patch( - "nemo_rl.distributed.virtual_cluster.RayVirtualCluster.get_placement_groups" - ) as mock_get_pg, - patch( - "nemo_rl.distributed.virtual_cluster.RayVirtualCluster._init_placement_groups" - ) as mock_init_pg, - ): - mock_get_pg.return_value = [] - mock_init_pg.return_value = [] - yield - - -def test_empty_cluster_visualization(capsys): - """Test visualization of an empty cluster.""" - # Create a empty cluster - cluster = RayVirtualCluster( - bundle_ct_per_node_list=[], - use_gpus=False, - name="test-empty", - ) - - # Test visualization - cluster.print_cluster_grid() - - # Capture the output - out, _ = capsys.readouterr() - assert "Empty Ray Cluster" in out - - -def test_cluster_grid(capsys): - """Test visualization of a 
cluster grid.""" - # Create a cluster with a configuration but don't actually allocate resources - cluster = RayVirtualCluster( - bundle_ct_per_node_list=[2, 3], - use_gpus=False, - name="test-visual", - max_colocated_worker_groups=1, - ) - - cluster.print_cluster_grid() - - # Capture the output - out, _ = capsys.readouterr() - print(out) - assert "Ray Cluster: 2 nodes, 5 GPUs" in out - assert "0.0" in out # First node, first GPU - assert "0.1" in out # First node, second GPU - assert "1.0" in out # Second node, first GPU - assert "1.2" in out # Second node, third GPU - - -def test_global_visualization_formatting(capsys): - """Test global visualization formatting without actual worker groups.""" - cluster = RayVirtualCluster( - bundle_ct_per_node_list=[2, 2], - use_gpus=False, - name="test-global", - max_colocated_worker_groups=1, - ) - - cluster.print_all_worker_groups([]) - - # Capture the output - out, _ = capsys.readouterr() - print(out) - assert "Ray Cluster Global View: 2 nodes, 4 GPUs" in out - - -def test_with_mock_worker_groups(capsys): - """Test visualization with mock worker groups.""" - # Create a cluster with a configuration - cluster = RayVirtualCluster( - bundle_ct_per_node_list=[2, 3], - use_gpus=False, - name="test-workers", - max_colocated_worker_groups=1, - ) - - worker_group1 = MagicMock() - worker_group1.name_prefix = "policy" - worker_group1.world_size = 2 - worker_group1.worker_metadata = [ - {"node_idx": 0, "local_rank": 0}, # First worker on node 0, GPU 0 - {"node_idx": 1, "local_rank": 0}, # Second worker on node 1, GPU 0 - ] - - worker_group2 = MagicMock() - worker_group2.name_prefix = "policy_generate" - worker_group2.world_size = 3 - worker_group2.worker_metadata = [ - {"node_idx": 0, "local_rank": 1}, # First worker on node 0, GPU 1 - {"node_idx": 1, "local_rank": 1}, # Second worker on node 1, GPU 1 - {"node_idx": 1, "local_rank": 2}, # Third worker on node 1, GPU 2 - ] - - cluster.print_all_worker_groups([worker_group1, 
worker_group2]) - - # Capture the output - out, _ = capsys.readouterr() - print(out) - - # Check for key elements in the output - assert "Ray Cluster Global View: 2 nodes, 5 GPUs" in out - assert "G0" in out # First worker group - assert "G1" in out # Second worker group - assert "policy" in out # First worker group name - assert "policy_generate" in out # Second worker group name From 745790ce8befa88f3610ec79854f4fecb546b8b8 Mon Sep 17 00:00:00 2001 From: Zhaocheng Zhu Date: Thu, 26 Jun 2025 15:09:12 -0700 Subject: [PATCH 12/44] Allow uneven shards for multi-GPU inference in vllm backend (#494) Signed-off-by: KiddoZhu Signed-off-by: Sahil Jain <48468750+SahilJain314@users.noreply.github.com> Co-authored-by: Sahil Jain <48468750+SahilJain314@users.noreply.github.com> Signed-off-by: Xuehan --- nemo_rl/models/generation/vllm.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/nemo_rl/models/generation/vllm.py b/nemo_rl/models/generation/vllm.py index 0b0bb00ad6..58795e5031 100644 --- a/nemo_rl/models/generation/vllm.py +++ b/nemo_rl/models/generation/vllm.py @@ -1387,12 +1387,11 @@ def generate_text( f"data must be a BatchedDataDict, got type: {type(data)}" ) - # Get total batch size - batch_size = len(data["prompts"]) - # Shard the data across the tied worker groups dp_size = self.sharding_annotations.get_axis_size("data_parallel") - sharded_data = data.shard_by_batch_size(dp_size, batch_size=batch_size) + sharded_data: list[SlicedDataDict] = data.shard_by_batch_size( + dp_size, allow_uneven_shards=True + ) future_bundle = self.worker_group.run_all_workers_sharded_data( "generate_text", sharded_data, From 9c59083d4124456a5ac941581ce237cb3dabaaef Mon Sep 17 00:00:00 2001 From: Xuehan Xiong Date: Wed, 25 Jun 2025 17:16:14 +0000 Subject: [PATCH 13/44] add GPQA main version. 
Signed-off-by: Xuehan Xiong xxman@google.com Signed-off-by: Xuehan --- examples/configs/evals/gpqa_eval.yaml | 31 +--- examples/configs/evals/local_eval.yaml | 14 ++ examples/configs/evals/math_eval.yaml | 36 +--- examples/run_eval.py | 138 +--------------- examples/run_grpo_math.py | 71 +------- nemo_rl/data/eval_datasets/__init__.py | 88 ++++++++++ nemo_rl/data/eval_datasets/aime2024.py | 17 +- nemo_rl/data/eval_datasets/gpqa.py | 7 +- .../data/eval_datasets/local_math_dataset.py | 40 +++++ nemo_rl/data/eval_datasets/math.py | 28 ++-- nemo_rl/data/eval_datasets/mmlu.py | 33 ++-- nemo_rl/data/eval_datasets/mmlu_pro.py | 19 +-- nemo_rl/data/processors.py | 155 ++++++++++++++++++ 13 files changed, 369 insertions(+), 308 deletions(-) create mode 100644 examples/configs/evals/local_eval.yaml create mode 100644 nemo_rl/data/eval_datasets/local_math_dataset.py create mode 100644 nemo_rl/data/processors.py diff --git a/examples/configs/evals/gpqa_eval.yaml b/examples/configs/evals/gpqa_eval.yaml index b882c1acd8..463702d3a4 100644 --- a/examples/configs/evals/gpqa_eval.yaml +++ b/examples/configs/evals/gpqa_eval.yaml @@ -1,42 +1,15 @@ -# Evaluation Configuration -eval: - metric: "pass@1" # only pass@1 is supported now - num_tests_per_prompt: 1 # every prompt will be tested num_tests_per_prompt times and use the average score as the final score - seed: 42 +# GPQA evaluation Configuration +defaults: "eval.yaml" generation: - backend: "vllm" # only vllm is supported for evaluation - max_new_tokens: ${generation.vllm_cfg.max_model_len} - temperature: 0.0 - top_p: 1.0 - top_k: -1 # -1 means disable - num_prompts_per_step: -1 # -1 means pass all prompts at once model_name: "Qwen/Qwen2.5-7B-Instruct" - stop_token_ids: null - stop_strings: null vllm_cfg: - async_engine: false - precision: "bfloat16" - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - gpu_memory_utilization: 0.9 max_model_len: 3072 -tokenizer: - name: ${generation.model_name} ## specify if you'd like to use a 
tokenizer different from the model's default - chat_template: "default" - data: - max_input_seq_length: ${generation.vllm_cfg.max_model_len} # useless since we directly use prompts in evaluation prompt_file: "examples/prompts/gpqa.txt" - system_prompt_file: null dataset_name: "gpqa" env: math: - num_workers: 8 verifier_type: "multichoice" - -cluster: - gpus_per_node: 1 - num_nodes: 1 diff --git a/examples/configs/evals/local_eval.yaml b/examples/configs/evals/local_eval.yaml new file mode 100644 index 0000000000..ad9def2112 --- /dev/null +++ b/examples/configs/evals/local_eval.yaml @@ -0,0 +1,14 @@ +# Evaluation Configuration from local files. +defaults: "eval.yaml" + +generation: + model_name: "Qwen/Qwen2.5-7B-Instruct" + +data: + prompt_file: "examples/prompts/cot.txt" + dataset_name: "local" + problem_key: "Question" + solution_key: "Answer" + split: "train" + data_paths: "https:\/\/openaipublic.blob.core.windows.net\/simple-evals\/math_500_test.csv" + file_format: "csv" diff --git a/examples/configs/evals/math_eval.yaml b/examples/configs/evals/math_eval.yaml index 32a4a3281c..b42956866d 100644 --- a/examples/configs/evals/math_eval.yaml +++ b/examples/configs/evals/math_eval.yaml @@ -1,41 +1,9 @@ -# Evaluation Configuration -eval: - metric: "pass@1" # only pass@1 is supported now - num_tests_per_prompt: 1 # every prompt will be tested num_tests_per_prompt times and use the average score as the final score - seed: 42 +# Math evaluation Configuration +defaults: "eval.yaml" generation: - backend: "vllm" # only vllm is supported for evaluation - max_new_tokens: ${generation.vllm_cfg.max_model_len} - temperature: 0.0 - top_p: 1.0 - top_k: -1 # -1 means disable - num_prompts_per_step: 16 # -1 means pass all prompts at once model_name: "Qwen/Qwen2.5-7B-Instruct" - stop_token_ids: null - stop_strings: null - vllm_cfg: - async_engine: false - precision: "bfloat16" - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - gpu_memory_utilization: 0.9 - max_model_len: 2048 
- -tokenizer: - name: ${generation.model_name} ## specify if you'd like to use a tokenizer different from the model's default - chat_template: "default" data: - max_input_seq_length: ${generation.vllm_cfg.max_model_len} # useless since we directly use prompts in evaluation prompt_file: "examples/prompts/cot.txt" - system_prompt_file: null dataset_name: "math" - -env: - math: - num_workers: 8 - -cluster: - gpus_per_node: 1 - num_nodes: 1 diff --git a/examples/run_eval.py b/examples/run_eval.py index 117db1deab..89e2ede395 100644 --- a/examples/run_eval.py +++ b/examples/run_eval.py @@ -16,26 +16,15 @@ import os import pprint import sys -from typing import Any, cast - -import torch sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from omegaconf import OmegaConf from transformers import AutoTokenizer, PreTrainedTokenizerBase -from examples.run_grpo_math import math_data_processor from nemo_rl.algorithms.utils import get_tokenizer from nemo_rl.data.datasets import AllTaskProcessedDataset -from nemo_rl.data.eval_datasets import ( - aime2024, - gpqa, - math, - mmlu, - mmlu_pro, -) -from nemo_rl.data.interfaces import DatumSpec, TaskDataSpec +from nemo_rl.data.eval_datasets import load_eval_dataset from nemo_rl.distributed.ray_actor_environment_registry import ( get_actor_python_env, ) @@ -43,6 +32,7 @@ from nemo_rl.environments.math_environment import MathEnvironment from nemo_rl.evals.eval import MasterConfig, run_env_eval, setup from nemo_rl.models.generation import configure_generation_config +from nemo_rl.utils.config import load_config TokenizerType = PreTrainedTokenizerBase @@ -63,129 +53,11 @@ def parse_args(): return args, overrides -def _construct_prompt(prompt: str, question: str, options: dict[str, str]) -> str: - """Construct prompt from question and options.""" - output = prompt - output += f"\n\nQuestion: {question}\nOptions:\n" - output += "\n".join( - [ - f"{letter}) {option}" - for letter, option in options.items() - if option 
is not None - ] - ) - return output - - -def multichoice_qa_processor( - datum_dict: dict[str, Any], - task_data_spec: TaskDataSpec, - tokenizer: TokenizerType, - max_seq_length: int, - idx: int, -) -> DatumSpec: - """Process a datum dictionary (directly loaded from dataset) into a DatumSpec for multiple-choice problems.""" - question = datum_dict["question"] - answer = str(datum_dict["answer"]) - options = datum_dict["options"] - extra_env_info = {"ground_truth": answer} - if "subject" in datum_dict: - extra_env_info.update({"subject": datum_dict["subject"]}) - - message_log = [] - - # system prompt - if task_data_spec.system_prompt: - sys_prompt: dict[str, str | torch.Tensor] = { - "role": "system", - "content": task_data_spec.system_prompt, - } - sys = tokenizer.apply_chat_template( - [cast(dict[str, str], sys_prompt)], - tokenize=False, - add_generation_prompt=False, - add_special_tokens=False, - ) - sys_prompt["token_ids"] = tokenizer(sys, return_tensors="pt")["input_ids"][0] - message_log.append(sys_prompt) - - # user prompt - if task_data_spec.prompt: - problem = _construct_prompt(task_data_spec.prompt, question, options) - user_message = {"role": "user", "content": problem} - message = tokenizer.apply_chat_template( - [user_message], - tokenize=False, - add_generation_prompt=True, - add_special_tokens=False, - ) - user_message["token_ids"] = tokenizer(message, return_tensors="pt")["input_ids"][0] - user_message["content"] = message - message_log.append(user_message) - - length = sum(len(m["token_ids"]) for m in message_log) - output: DatumSpec = { - "message_log": message_log, - "length": length, - "extra_env_info": extra_env_info, - "loss_multiplier": 1.0, - "idx": idx, - } - if "task_name" in datum_dict: - output["task_name"] = datum_dict["task_name"] - return output - - def setup_data(tokenizer: AutoTokenizer, data_config, env_configs): print("Setting up data...") # load dataset - dataset_name = data_config["dataset_name"] - data_processor_fn = 
multichoice_qa_processor - if dataset_name == "mmlu": - base_dataset = mmlu.MMLUDataset( - prompt_file=data_config["prompt_file"], - system_prompt_file=data_config["system_prompt_file"], - ) - elif dataset_name == "aime2024": - base_dataset = aime2024.AIME2024Dataset( - prompt_file=data_config["prompt_file"], - system_prompt_file=data_config["system_prompt_file"], - ) - data_processor_fn = math_data_processor - elif dataset_name == "gpqa": - base_dataset = gpqa.GPQADataset( - variant="main", - prompt_file=data_config["prompt_file"], - system_prompt_file=data_config["system_prompt_file"], - ) - elif dataset_name == "gpqa_diamond": - base_dataset = gpqa.GPQADataset( - variant="diamond", - prompt_file=data_config["prompt_file"], - system_prompt_file=data_config["system_prompt_file"], - ) - elif dataset_name == "mmlu_pro": - base_dataset = mmlu_pro.MMLUProDataset( - prompt_file=data_config["prompt_file"], - system_prompt_file=data_config["system_prompt_file"], - ) - elif dataset_name == "math": - base_dataset = math.MathDataset( - variant="math_test", - prompt_file=data_config["prompt_file"], - system_prompt_file=data_config["system_prompt_file"], - ) - data_processor_fn = math_data_processor - elif dataset_name == "math500": - base_dataset = math.MathDataset( - variant="math_500_test", - prompt_file=data_config["prompt_file"], - system_prompt_file=data_config["system_prompt_file"], - ) - data_processor_fn = math_data_processor - else: - raise ValueError(f"Unknown dataset {dataset_name}.") + base_dataset = load_eval_dataset(data_config) rekeyed_ds = base_dataset.rekeyed_ds env = MathEnvironment.options( @@ -200,7 +72,7 @@ def setup_data(tokenizer: AutoTokenizer, data_config, env_configs): dataset=rekeyed_ds, tokenizer=tokenizer, default_task_data_spec=base_dataset.task_spec, - task_data_processors=data_processor_fn, + task_data_processors=base_dataset.processor, max_seq_length=data_config["max_input_seq_length"], ) @@ -217,7 +89,7 @@ def main(): 
os.path.dirname(__file__), "configs", "evals", "eval.yaml" ) - config = OmegaConf.load(args.config) + config = load_config(args.config) print(f"Loaded configuration from: {args.config}") if overrides: diff --git a/examples/run_grpo_math.py b/examples/run_grpo_math.py index 4a64d3c13b..673322eb61 100644 --- a/examples/run_grpo_math.py +++ b/examples/run_grpo_math.py @@ -16,9 +16,8 @@ import os import pprint from collections import defaultdict -from typing import Any, Optional, cast +from typing import Any, Optional -import torch from omegaconf import OmegaConf from transformers import PreTrainedTokenizerBase @@ -116,74 +115,6 @@ def hf_data_processor( return output -# Example of a generic math data processor -# TaskDataProcessFnCallable -def math_data_processor( - datum_dict: dict[str, Any], - task_data_spec: TaskDataSpec, - tokenizer: TokenizerType, - max_seq_length: int, - idx: int, -) -> DatumSpec: - """Process a datum dictionary (directly loaded from dataset) into a DatumSpec for the Math Environment.""" - problem = datum_dict["problem"] - solution = str(datum_dict["expected_answer"]) - extra_env_info = {"ground_truth": solution} - - message_log: LLMMessageLogType = [] - - # system prompt - if task_data_spec.system_prompt: - sys_prompt: dict[str, str | torch.Tensor] = { - "role": "system", - "content": task_data_spec.system_prompt, - } - sys = tokenizer.apply_chat_template( - [cast(dict[str, str], sys_prompt)], - tokenize=False, - add_generation_prompt=False, - add_special_tokens=False, - ) - sys_prompt["token_ids"] = tokenizer(sys, return_tensors="pt")["input_ids"][0] - message_log.append(sys_prompt) - - # user prompt - if task_data_spec.prompt: - problem = task_data_spec.prompt.format(problem) - user_message = {"role": "user", "content": problem} - message = tokenizer.apply_chat_template( - [user_message], - tokenize=False, - add_generation_prompt=True, - add_special_tokens=False, - ) - user_message["token_ids"] = tokenizer(message, 
return_tensors="pt")["input_ids"][0] - user_message["content"] = message - message_log.append(user_message) - - length = sum(len(m["token_ids"]) for m in message_log) - - loss_multiplier = 1.0 - if length > max_seq_length: - # make smaller and mask out - for indiv_message in message_log: - indiv_message["token_ids"] = indiv_message["token_ids"][ - : min(4, max_seq_length // len(message_log)) - ] - loss_multiplier = 0.0 - - output: DatumSpec = { - "message_log": message_log, - "length": length, - "extra_env_info": extra_env_info, - "loss_multiplier": loss_multiplier, - "idx": idx, - } - if "task_name" in datum_dict: - output["task_name"] = datum_dict["task_name"] - return output - - def setup_data( tokenizer: TokenizerType, data_config: DataConfig, diff --git a/nemo_rl/data/eval_datasets/__init__.py b/nemo_rl/data/eval_datasets/__init__.py index e69de29bb2..2e5ba97974 100644 --- a/nemo_rl/data/eval_datasets/__init__.py +++ b/nemo_rl/data/eval_datasets/__init__.py @@ -0,0 +1,88 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from nemo_rl.data.eval_datasets.aime2024 import AIME2024Dataset +from nemo_rl.data.eval_datasets.gpqa import GPQADataset +from nemo_rl.data.eval_datasets.local_math_dataset import LocalMathDataset +from nemo_rl.data.eval_datasets.math import MathDataset +from nemo_rl.data.eval_datasets.mmlu import MMLUDataset +from nemo_rl.data.eval_datasets.mmlu_pro import MMLUProDataset + + +def load_eval_dataset(data_config): + """Loads evaluation dataset.""" + dataset_name = data_config["dataset_name"] + if dataset_name == "mmlu": + base_dataset = MMLUDataset( + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + elif dataset_name == "aime2024": + base_dataset = AIME2024Dataset( + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + elif dataset_name == "gpqa": + base_dataset = GPQADataset( + variant="main", + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + elif dataset_name == "gpqa_diamond": + base_dataset = GPQADataset( + variant="diamond", + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + elif dataset_name == "mmlu_pro": + base_dataset = MMLUProDataset( + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + elif dataset_name == "math": + base_dataset = MathDataset( + variant="math_test", + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + elif dataset_name == "math500": + base_dataset = MathDataset( + variant="math_500_test", + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + elif dataset_name == "local": + base_dataset = LocalMathDataset( + name=dataset_name, + data_paths=data_config["data_paths"], + problem_key=data_config["problem_key"], + solution_key=data_config["solution_key"], + 
file_format=data_config["file_format"], + split=data_config["split"], + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + else: + raise ValueError(f"Unknown dataset {dataset_name}.") + return base_dataset + + +__all__ = [ + "AIME2024Dataset", + "GPQADataset", + "LocalMathDataset", + "MathDataset", + "MMLUDataset", + "MMLUProDataset", +] diff --git a/nemo_rl/data/eval_datasets/aime2024.py b/nemo_rl/data/eval_datasets/aime2024.py index 1eff661718..b73bd34dbf 100644 --- a/nemo_rl/data/eval_datasets/aime2024.py +++ b/nemo_rl/data/eval_datasets/aime2024.py @@ -1,17 +1,19 @@ """AIME 2024 dataset.""" -from typing import Any, Literal, Optional +from typing import Any, Optional from datasets import load_dataset +from nemo_rl.data import processors from nemo_rl.data.interfaces import TaskDataSpec class AIME2024Dataset: - def __init__(self, - prompt_file: Optional[str]=None, - system_prompt_file: Optional[str]=None, - ): + def __init__( + self, + prompt_file: Optional[str] = None, + system_prompt_file: Optional[str] = None, + ): ds = load_dataset("HuggingFaceH4/aime_2024", split="train") self.rekeyed_ds = ds.map(self._rekey, remove_columns=ds.column_names) self.task_spec = TaskDataSpec( @@ -19,9 +21,10 @@ def __init__(self, prompt_file=prompt_file, system_prompt_file=system_prompt_file, ) + self.processor = processors.math_data_processor def _rekey(self, data: dict[str, Any]): return { - 'problem': data['problem'], - 'expected_answer': data['answer'], + "problem": data["problem"], + "expected_answer": data["answer"], } diff --git a/nemo_rl/data/eval_datasets/gpqa.py b/nemo_rl/data/eval_datasets/gpqa.py index 0662a1a5e2..4eb05014c6 100644 --- a/nemo_rl/data/eval_datasets/gpqa.py +++ b/nemo_rl/data/eval_datasets/gpqa.py @@ -5,14 +5,16 @@ from datasets import load_dataset +from nemo_rl.data import processors from nemo_rl.data.interfaces import TaskDataSpec class GPQADataset: - def __init__(self, + def __init__( + self, variant: 
Literal["diamond", "main"] = "diamond", prompt_file: Optional[str] = None, - system_prompt_file: Optional[str]=None, + system_prompt_file: Optional[str] = None, ): ds = load_dataset("Idavidrein/gpqa", f"gpqa_{variant}", split="train") self._rng = random.Random() @@ -22,6 +24,7 @@ def __init__(self, prompt_file=prompt_file, system_prompt_file=system_prompt_file, ) + self.processor = processors.multichoice_qa_processor def _rekey(self, data: dict[str, Any]): choices = [ diff --git a/nemo_rl/data/eval_datasets/local_math_dataset.py b/nemo_rl/data/eval_datasets/local_math_dataset.py new file mode 100644 index 0000000000..d78b99565f --- /dev/null +++ b/nemo_rl/data/eval_datasets/local_math_dataset.py @@ -0,0 +1,40 @@ +"""Local math dataset.""" + +from typing import Any, Literal, Optional + +from datasets import load_dataset + +from nemo_rl.data import processors +from nemo_rl.data.interfaces import TaskDataSpec + + +class LocalMathDataset: + def __init__( + self, + data_paths: str | list[str], + problem_key: str, + solution_key: str, + name: str, + split: Optional[str] = None, + file_format: Literal["csv", "json"] = "csv", + prompt_file: Optional[str] = None, + system_prompt_file: Optional[str] = None, + ): + ds = load_dataset(file_format, data_files=data_paths) + if split is not None: + ds = ds[split] + self._problem_key = problem_key + self._solution_key = solution_key + self.rekeyed_ds = ds.map(self._rekey, remove_columns=ds.column_names) + self.task_spec = TaskDataSpec( + task_name=name, + prompt_file=prompt_file, + system_prompt_file=system_prompt_file, + ) + self.processor = processors.math_data_processor + + def _rekey(self, data: dict[str, Any]): + return { + "problem": data[self._problem_key], + "expected_answer": data[self._solution_key], + } diff --git a/nemo_rl/data/eval_datasets/math.py b/nemo_rl/data/eval_datasets/math.py index cbd7cb3577..a1c489a148 100644 --- a/nemo_rl/data/eval_datasets/math.py +++ b/nemo_rl/data/eval_datasets/math.py @@ -4,26 +4,32 @@ 
from datasets import load_dataset +from nemo_rl.data import processors from nemo_rl.data.interfaces import TaskDataSpec class MathDataset: - def __init__(self, - variant: Literal["math_test", "math_500_test"] = "math_test", - prompt_file: Optional[str]=None, - system_prompt_file: Optional[str]=None, - ): - ds = load_dataset('csv', data_files=f"https://openaipublic.blob.core.windows.net/simple-evals/{variant}.csv", split='train') + def __init__( + self, + variant: Literal["math_test", "math_500_test"] = "math_test", + prompt_file: Optional[str] = None, + system_prompt_file: Optional[str] = None, + ): + ds = load_dataset( + "csv", + data_files=f"https://openaipublic.blob.core.windows.net/simple-evals/{variant}.csv", + split="train", + ) self.rekeyed_ds = ds.map(self._rekey, remove_columns=ds.column_names) self.task_spec = TaskDataSpec( - task_name=f'{variant}', + task_name=f"{variant}", prompt_file=prompt_file, system_prompt_file=system_prompt_file, ) - + self.processor = processors.math_data_processor + def _rekey(self, data: dict[str, Any]): return { - 'problem': data['Question'], - 'expected_answer': data['Answer'], + "problem": data["Question"], + "expected_answer": data["Answer"], } - diff --git a/nemo_rl/data/eval_datasets/mmlu.py b/nemo_rl/data/eval_datasets/mmlu.py index f6ab075886..86acbcc9a6 100644 --- a/nemo_rl/data/eval_datasets/mmlu.py +++ b/nemo_rl/data/eval_datasets/mmlu.py @@ -4,30 +4,39 @@ from datasets import load_dataset +from nemo_rl.data import processors from nemo_rl.data.interfaces import TaskDataSpec class MMLUDataset: - def __init__(self, prompt_file: Optional[str] = None, system_prompt_file: Optional[str] = None): - ds = load_dataset('csv', data_files="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", split='train') + def __init__( + self, + prompt_file: Optional[str] = None, + system_prompt_file: Optional[str] = None, + ): + ds = load_dataset( + "csv", + 
data_files="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", + split="train", + ) self.rekeyed_ds = ds.map(self._rekey, remove_columns=ds.column_names) self.task_spec = TaskDataSpec( - task_name='MMLU', + task_name="MMLU", prompt_file=prompt_file, system_prompt_file=system_prompt_file, ) + self.processor = processors.multichoice_qa_processor def _rekey(self, data: dict[str, Any]): return { - 'question': data['Question'], - 'options': dict( - A=data['A'], - B=data['B'], - C=data['C'], - D=data['D'], + "question": data["Question"], + "options": dict( + A=data["A"], + B=data["B"], + C=data["C"], + D=data["D"], ), - 'answer': data['Answer'], - 'subject': data['Subject'], + "answer": data["Answer"], + "subject": data["Subject"], } - diff --git a/nemo_rl/data/eval_datasets/mmlu_pro.py b/nemo_rl/data/eval_datasets/mmlu_pro.py index da990a90c5..4dd094e322 100644 --- a/nemo_rl/data/eval_datasets/mmlu_pro.py +++ b/nemo_rl/data/eval_datasets/mmlu_pro.py @@ -4,28 +4,27 @@ from datasets import load_dataset +from nemo_rl.data import processors from nemo_rl.data.interfaces import TaskDataSpec class MMLUProDataset: def __init__(self, prompt_file: str, system_prompt_file: Optional[str] = None): - ds = load_dataset('TIGER-Lab/MMLU-Pro', split='test') + ds = load_dataset("TIGER-Lab/MMLU-Pro", split="test") self.rekeyed_ds = ds.map(self._rekey, remove_columns=ds.column_names) self.task_spec = TaskDataSpec( - task_name='MMLU-Pro', + task_name="MMLU-Pro", prompt_file=prompt_file, system_prompt_file=system_prompt_file, ) + self.processor = processors.multichoice_qa_processor def _rekey(self, data: dict[str, Any]): - options = { - chr(ord('A') + i) : op for i, op in enumerate(data['options']) - } + options = {chr(ord("A") + i): op for i, op in enumerate(data["options"])} return { - 'question': data['question'], - 'options': options, - 'answer': data['answer'], - 'subject': data['category'], + "question": data["question"], + "options": options, + "answer": data["answer"], 
+ "subject": data["category"], } - diff --git a/nemo_rl/data/processors.py b/nemo_rl/data/processors.py new file mode 100644 index 0000000000..5fd35d4078 --- /dev/null +++ b/nemo_rl/data/processors.py @@ -0,0 +1,155 @@ +"""Contains data processors for evaluation.""" + +from typing import Any, cast + +import torch +from transformers import PreTrainedTokenizerBase + +from nemo_rl.data.interfaces import DatumSpec, LLMMessageLogType, TaskDataSpec + +TokenizerType = PreTrainedTokenizerBase + + +# Example of a generic math data processor +# TaskDataProcessFnCallable +def math_data_processor( + datum_dict: dict[str, Any], + task_data_spec: TaskDataSpec, + tokenizer: TokenizerType, + max_seq_length: int, + idx: int, +) -> DatumSpec: + """Process a datum dictionary (directly loaded from dataset) into a DatumSpec for the Math Environment.""" + problem = datum_dict["problem"] + solution = str(datum_dict["expected_answer"]) + extra_env_info = {"ground_truth": solution} + + message_log: LLMMessageLogType = [] + + # system prompt + if task_data_spec.system_prompt: + sys_prompt: dict[str, str | torch.Tensor] = { + "role": "system", + "content": task_data_spec.system_prompt, + } + sys = tokenizer.apply_chat_template( + [cast(dict[str, str], sys_prompt)], + tokenize=False, + add_generation_prompt=False, + add_special_tokens=False, + ) + sys_prompt["token_ids"] = tokenizer(sys, return_tensors="pt")["input_ids"][0] + message_log.append(sys_prompt) + + # user prompt + if task_data_spec.prompt: + problem = task_data_spec.prompt.format(problem) + user_message = {"role": "user", "content": problem} + message = tokenizer.apply_chat_template( + [user_message], + tokenize=False, + add_generation_prompt=True, + add_special_tokens=False, + ) + user_message["token_ids"] = tokenizer(message, return_tensors="pt")["input_ids"][0] + user_message["content"] = message + message_log.append(user_message) + + length = sum(len(m["token_ids"]) for m in message_log) + + loss_multiplier = 1.0 + if length > 
max_seq_length: + # make smaller and mask out + for indiv_message in message_log: + indiv_message["token_ids"] = indiv_message["token_ids"][ + : min(4, max_seq_length // len(message_log)) + ] + loss_multiplier = 0.0 + + output: DatumSpec = { + "message_log": message_log, + "length": length, + "extra_env_info": extra_env_info, + "loss_multiplier": loss_multiplier, + "idx": idx, + } + if "task_name" in datum_dict: + output["task_name"] = datum_dict["task_name"] + return output + + +def _construct_multichoice_prompt( + prompt: str, question: str, options: dict[str, str] +) -> str: + """Construct prompt from question and options.""" + output = prompt + output += f"\n\nQuestion: {question}\nOptions:\n" + output += "\n".join( + [ + f"{letter}) {option}" + for letter, option in options.items() + if option is not None + ] + ) + return output + + +def multichoice_qa_processor( + datum_dict: dict[str, Any], + task_data_spec: TaskDataSpec, + tokenizer: TokenizerType, + max_seq_length: int, + idx: int, +) -> DatumSpec: + """Process a datum dictionary (directly loaded from dataset) into a DatumSpec for multiple-choice problems.""" + question = datum_dict["question"] + answer = str(datum_dict["answer"]) + options = datum_dict["options"] + extra_env_info = {"ground_truth": answer} + if "subject" in datum_dict: + extra_env_info.update({"subject": datum_dict["subject"]}) + + message_log = [] + + # system prompt + if task_data_spec.system_prompt: + sys_prompt: dict[str, str | torch.Tensor] = { + "role": "system", + "content": task_data_spec.system_prompt, + } + sys = tokenizer.apply_chat_template( + [cast(dict[str, str], sys_prompt)], + tokenize=False, + add_generation_prompt=False, + add_special_tokens=False, + ) + sys_prompt["token_ids"] = tokenizer(sys, return_tensors="pt")["input_ids"][0] + message_log.append(sys_prompt) + + # user prompt + if task_data_spec.prompt: + question = _construct_multichoice_prompt( + task_data_spec.prompt, question, options + ) + user_message = 
{"role": "user", "content": question} + message = tokenizer.apply_chat_template( + [user_message], + tokenize=False, + add_generation_prompt=True, + add_special_tokens=False, + ) + user_message["token_ids"] = tokenizer(message, return_tensors="pt")["input_ids"][0] + user_message["content"] = message + message_log.append(user_message) + + length = sum(len(m["token_ids"]) for m in message_log) + output: DatumSpec = { + "message_log": message_log, + "length": length, + "extra_env_info": extra_env_info, + "loss_multiplier": 1.0, + "idx": idx, + } + if "task_name" in datum_dict: + output["task_name"] = datum_dict["task_name"] + return output From 628ef2d98e7e1f34bd28d24b03df2357373cbb7c Mon Sep 17 00:00:00 2001 From: Xuehan Date: Fri, 27 Jun 2025 21:33:08 +0000 Subject: [PATCH 14/44] updates doc. Signed-off-by: Xuehan Xiong Signed-off-by: Xuehan --- docs/guides/grpo.md | 2 +- tests/unit/data/test_data_processor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/guides/grpo.md b/docs/guides/grpo.md index 1f63df559d..f577820a21 100644 --- a/docs/guides/grpo.md +++ b/docs/guides/grpo.md @@ -67,7 +67,7 @@ def my_data_processor( ) -> DatumSpec: ``` -We have an example of this as `math_data_processor` in [run_grpo_math.py](../../examples/run_grpo_math.py) +We have an example of this as `math_data_processor` in [processors.py](../../nemo_rl/data/processors.py) #### Putting it all together diff --git a/tests/unit/data/test_data_processor.py b/tests/unit/data/test_data_processor.py index 302dfece77..dc88bebee3 100644 --- a/tests/unit/data/test_data_processor.py +++ b/tests/unit/data/test_data_processor.py @@ -20,10 +20,10 @@ abspath = os.path.abspath(__file__) sys.path.append("/".join(abspath.split("/")[:-4])) -from examples.run_grpo_math import math_data_processor from nemo_rl.algorithms.utils import get_tokenizer from nemo_rl.data.datasets import AllTaskProcessedDataset from nemo_rl.data.interfaces import TaskDataSpec +from nemo_rl.data.processors 
import math_data_processor from nemo_rl.models.policy import TokenizerConfig basic_tokenizer_test_config: TokenizerConfig = { From f431d48f56b3b49ef402fb3b368e41e598b71bcd Mon Sep 17 00:00:00 2001 From: Luis Vega Date: Thu, 26 Jun 2025 15:49:08 -0700 Subject: [PATCH 15/44] feat: vllm Model diagnostic test checking long generation quality (#516) Signed-off-by: Luis Vega Signed-off-by: Terry Kong Signed-off-by: Luis Vega <2478335+vegaluisjose@users.noreply.github.com> Co-authored-by: Terry Kong Co-authored-by: Luis Vega <2478335+vegaluisjose@users.noreply.github.com> Signed-off-by: Xuehan --- docs/adding-new-models.md | 13 ++- .../2.long_generation_decode_vs_prefill.py | 102 ++++++++++++++++++ 2 files changed, 114 insertions(+), 1 deletion(-) create mode 100644 tools/model_diagnostics/2.long_generation_decode_vs_prefill.py diff --git a/docs/adding-new-models.md b/docs/adding-new-models.md index c73d494907..155a012f47 100644 --- a/docs/adding-new-models.md +++ b/docs/adding-new-models.md @@ -140,4 +140,15 @@ uv run --extra vllm tools/model_diagnostics/1.max_model_len_respected.py Qwen/Qw # Generated tokens: 12 # Total tokens: 20 # [Qwen/Qwen2.5-1.5B] ALL GOOD! -``` \ No newline at end of file +``` + +## [2.long_generation_decode_vs_prefill](https://github.com/NVIDIA/NeMo-RL/blob/main/tools/model_diagnostics/2.long_generation_decode_vs_prefill.py) + +Test that vLLM yields near-identical token log-probabilities when comparing decoding with a single prefill pass across multiple prompts. + +```sh +# Run that is expected to pass +uv run --extra vllm tools/model_diagnostics/2.long_generation_decode_vs_prefill.py Qwen/Qwen2.5-1.5B +# ... +# [Qwen/Qwen2.5-1.5B] ALL GOOD! 
+``` diff --git a/tools/model_diagnostics/2.long_generation_decode_vs_prefill.py b/tools/model_diagnostics/2.long_generation_decode_vs_prefill.py new file mode 100644 index 0000000000..69c153fd53 --- /dev/null +++ b/tools/model_diagnostics/2.long_generation_decode_vs_prefill.py @@ -0,0 +1,102 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse + +import torch +from vllm import LLM, SamplingParams + + +def extract_logprobs(logprobs): + output = [] + for lp in logprobs: + if lp is not None: + output.append(list(lp.values())[0].logprob) + return output + + +def calculate_error(a, b): + return torch.exp(torch.abs(a - b)).mean().item() + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--model", type=str, nargs="?", default="nvidia/Nemotron-H-8B-Base-8K" + ) + args = parser.parse_args() + + seed = 0 + + sampling_params = SamplingParams( + temperature=1.0, + top_p=1.0, + max_tokens=8192, + prompt_logprobs=0, + logprobs=0, + seed=seed, + ) + + # Examples as of 0.9.1 + # model="meta-llama/Meta-Llama-3-8B", # pass + # model="nvidia/Nemotron-H-8B-Base-8K", # fail + # model="ibm-ai-platform/Bamba-9B-v1", # pass + llm = LLM( + model=args.model, + enforce_eager=True, + trust_remote_code=True, + enable_prefix_caching=False, + enable_chunked_prefill=False, + tensor_parallel_size=2, + gpu_memory_utilization=0.8, + seed=seed, + ) + + num_batches = 2 + + prompts = [ + "Hello, my 
name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + + outputs = llm.generate(prompts * num_batches, sampling_params) + + for i, output in enumerate(outputs): + sequence = output.prompt_token_ids + list(output.outputs[0].token_ids) + prompt_logprobs = extract_logprobs(output.prompt_logprobs) + logprobs = extract_logprobs(output.outputs[0].logprobs) + decode_lp = prompt_logprobs + logprobs + decode_lp = torch.tensor(decode_lp) + + sampling_params = SamplingParams( + temperature=0.0, max_tokens=1, prompt_logprobs=0 + ) + score = llm.generate({"prompt_token_ids": sequence}, sampling_params) + + prefill_lp = extract_logprobs(score[0].prompt_logprobs) + prefill_lp = torch.tensor(prefill_lp) + + lp_error = calculate_error(decode_lp, prefill_lp) + max_abs_error = torch.abs(decode_lp - prefill_lp).max().item() + print( + f"Processed sequence length {len(sequence)} with lp error {lp_error} and max abs error {max_abs_error}" + ) + assert lp_error < 1.05, f"lp error is higher than expected (1.0636): {lp_error}" + + print(f"[{args.model}] ALL GOOD!") + + +if __name__ == "__main__": + main() From f6b948dc180b245a63a66f2d28f569ee96de4d3a Mon Sep 17 00:00:00 2001 From: Yi-Fu Wu Date: Thu, 26 Jun 2025 16:25:04 -0700 Subject: [PATCH 16/44] feat: Log code in wandb (#175) Signed-off-by: Yi-Fu Wu Signed-off-by: Parth Chadha Co-authored-by: Parth Chadha Signed-off-by: Xuehan --- nemo_rl/utils/logger.py | 111 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) diff --git a/nemo_rl/utils/logger.py b/nemo_rl/utils/logger.py index a15e9bebd7..b99ebcc858 100644 --- a/nemo_rl/utils/logger.py +++ b/nemo_rl/utils/logger.py @@ -18,6 +18,7 @@ import logging import os import re +import subprocess import threading import time from abc import ABC, abstractmethod @@ -138,10 +139,120 @@ class WandbLogger(LoggerInterface): def __init__(self, cfg: WandbConfig, log_dir: Optional[str] = None): self.run = wandb.init(**cfg, 
dir=log_dir) + self._log_code() + self._log_diffs() print( f"Initialized WandbLogger for project {cfg.get('project')}, run {cfg.get('name')} at {log_dir}" ) + def _log_diffs(self): + """Log git diffs to wandb. + + This function captures and logs two types of diffs: + 1. Uncommitted changes (working tree diff against HEAD) + 2. All changes (including uncommitted) against the main branch + + Each diff is saved as a text file in a wandb artifact. + """ + try: + branch_result = subprocess.run( + ["git", "rev-parse", "--abbrev-ref", "HEAD"], + capture_output=True, + text=True, + check=True, + ) + current_branch = branch_result.stdout.strip() + + diff_artifact = wandb.Artifact( + name=f"git-diffs-{self.run.project}-{self.run.id}", type="git-diffs" + ) + + # 1. Log uncommitted changes (working tree diff) + uncommitted_result = subprocess.run( + ["git", "diff", "HEAD"], capture_output=True, text=True, check=True + ) + uncommitted_diff = uncommitted_result.stdout + + if uncommitted_diff: + diff_path = os.path.join( + wandb.run.dir if wandb.run else ".", "uncommitted_changes_diff.txt" + ) + with open(diff_path, "w") as f: + f.write(uncommitted_diff) + + # Add file to artifact + diff_artifact.add_file(diff_path, name="uncommitted_changes_diff.txt") + print("Logged uncommitted changes diff to wandb") + else: + print("No uncommitted changes found") + + # 2. 
Log diff against main branch (if current branch is not main) + if current_branch != "main": + # Log diff between main and working tree (includes uncommitted changes) + working_diff_result = subprocess.run( + ["git", "diff", "main"], capture_output=True, text=True, check=True + ) + working_diff = working_diff_result.stdout + + if working_diff: + # Save diff to a temporary file + diff_path = os.path.join( + wandb.run.dir if wandb.run else ".", "main_diff.txt" + ) + with open(diff_path, "w") as f: + f.write(working_diff) + + # Add file to artifact + diff_artifact.add_file(diff_path, name="main_diff.txt") + print("Logged diff against main branch") + else: + print("No differences found between main and working tree") + + self.run.log_artifact(diff_artifact) + + except subprocess.CalledProcessError as e: + print(f"Error during git operations: {e}") + except Exception as e: + print(f"Unexpected error during git diff logging: {e}") + + def _log_code(self): + """Log code that is tracked by git to wandb. + + This function gets a list of all files tracked by git in the project root + and manually uploads them to the current wandb run as an artifact. + """ + try: + result = subprocess.run( + ["git", "ls-files"], capture_output=True, text=True, check=True + ) + + tracked_files = result.stdout.strip().split("\n") + + if not tracked_files: + print( + "Warning: No git repository found. Wandb logs will not track code changes for reproducibility." 
+ ) + return + + code_artifact = wandb.Artifact( + name=f"source-code-{self.run.project}", type="code" + ) + + for file_path in tracked_files: + if os.path.isfile(file_path): + try: + code_artifact.add_file(file_path, name=file_path) + except Exception as e: + print(f"Error adding file {file_path}: {e}") + + self.run.log_artifact(code_artifact) + print(f"Logged {len(tracked_files)} git-tracked files to wandb") + + except subprocess.CalledProcessError as e: + print(f"Error getting git-tracked files: {e}") + except Exception as e: + print(f"Unexpected error during git code logging: {e}") + def define_metric( self, name: str, From 4265fedd37e617a10e7f2e83ab9a218c2660c6ec Mon Sep 17 00:00:00 2001 From: Anna Shors Date: Thu, 26 Jun 2025 18:35:36 -0700 Subject: [PATCH 17/44] fix: add dynamic_batching key to SFT OpenMathInstruct config (#570) Signed-off-by: ashors1 Signed-off-by: Xuehan --- examples/configs/sft_openmathinstruct2.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/configs/sft_openmathinstruct2.yaml b/examples/configs/sft_openmathinstruct2.yaml index e934f7aa29..2040bdd5ff 100644 --- a/examples/configs/sft_openmathinstruct2.yaml +++ b/examples/configs/sft_openmathinstruct2.yaml @@ -37,6 +37,9 @@ policy: context_parallel_size: 1 custom_parallel_plan: null + dynamic_batching: + enabled: false + # makes the training sequence length divisible by the tensor parallel size # this is useful for sequence parallel training make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size} From 7c8367d141fef1c6e3e51c541d5591bca4347cab Mon Sep 17 00:00:00 2001 From: yuki <48991475+yuki-666@users.noreply.github.com> Date: Fri, 27 Jun 2025 11:24:51 +0800 Subject: [PATCH 18/44] feat: support async in non-colocated (#523) Signed-off-by: Yuki Huang Signed-off-by: Xuehan --- nemo_rl/models/generation/vllm.py | 103 +++++++++++++++++- .../models/generation/test_vllm_generation.py | 19 +++- 2 files changed, 111 insertions(+), 11 deletions(-) diff --git 
a/nemo_rl/models/generation/vllm.py b/nemo_rl/models/generation/vllm.py index 58795e5031..f0cd5eb50b 100644 --- a/nemo_rl/models/generation/vllm.py +++ b/nemo_rl/models/generation/vllm.py @@ -359,6 +359,19 @@ def init_collective(self, data: int, ip: str, port: int, world_size: int) -> Non ), ) + async def init_collective_async( + self, data: int, ip: str, port: int, world_size: int + ) -> None: + await self.llm.collective_rpc( + "init_collective", + args=( + data, + ip, + port, + world_size, + ), + ) + def llm(self): return self.llm @@ -979,7 +992,7 @@ def update_weights_from_collective(self, data: dict[str, Any]) -> bool: if self.cfg["vllm_cfg"]["async_engine"]: raise RuntimeError( - "update_weights_from_collective cannot be used with async_engine=True. Use update_weights_from_ipc_handles_async instead." + "update_weights_from_collective can only be used with async_engine=False. Use update_weights_from_collective_async instead." ) result_or_coro = self.llm.collective_rpc( @@ -1000,12 +1013,72 @@ def update_weights_from_collective(self, data: dict[str, Any]) -> bool: traceback.print_exc() return False + async def update_weights_from_collective_async(self, data: dict[str, Any]) -> bool: + """Async version of update_weights_from_collective.""" + try: + assert self.llm is not None, ( + "Attempting to update weights with either an uninitialized vLLM or non-model-owner" + ) + + if not self.cfg["vllm_cfg"]["async_engine"]: + raise RuntimeError( + "update_weights_from_collective_async can only be used with async_engine=True. Use update_weights_from_collective instead." + ) + + result_or_coro = await self.llm.collective_rpc( + "update_weights_from_collective", args=(data,) + ) + + if asyncio.iscoroutine(result_or_coro): + worker_results = await result_or_coro + else: + worker_results = result_or_coro + + worker_result = worker_results[0] + + if not worker_result: + print( + f"Error: Worker failed to update weights. 
Result: {worker_result}" + ) + return False + return True + except Exception as e: + print(f"Exception during collective_rpc for weight update: {e}") + import traceback + + traceback.print_exc() + return False + def reset_prefix_cache(self): """Reset the prefix cache of vLLM engine.""" + assert self.llm is not None, ( + "Attempting to reset prefix cache with either an uninitialized vLLM or non-model-owner" + ) + + if self.cfg["vllm_cfg"]["async_engine"]: + raise RuntimeError( + "reset_prefix_cache can only be used with async_engine=False. Use reset_prefix_cache_async instead." + ) + self.llm.llm_engine.reset_prefix_cache() gc.collect() torch.cuda.empty_cache() + async def reset_prefix_cache_async(self): + """Async version of reset_prefix_cache.""" + assert self.llm is not None, ( + "Attempting to reset prefix cache with either an uninitialized vLLM or non-model-owner" + ) + + if not self.cfg["vllm_cfg"]["async_engine"]: + raise RuntimeError( + "reset_prefix_cache_async can only be used with async_engine=True. Use reset_prefix_cache instead." 
+ ) + + await self.llm.reset_prefix_cache() + gc.collect() + torch.cuda.empty_cache() + def sleep(self): """Put the vLLM engine to sleep.""" assert self.llm is not None, ( @@ -1311,6 +1384,13 @@ def init_collective( if not self.worker_group or not self.worker_group.workers: raise RuntimeError("Worker group is not initialized") + # Choose the appropriate method based on async_engine setting + method_name = ( + "init_collective_async" + if self.cfg["vllm_cfg"]["async_engine"] + else "init_collective" + ) + # Prepare rank total_workers = len(self.worker_group.workers) if self.dp_size == 0: @@ -1322,7 +1402,7 @@ def init_collective( # Send world_size and rank for init collective to all workers futures = self.worker_group.run_all_workers_multiple_data( - "init_collective", + method_name, data=rank_prefix_list, run_rank_0_only_axes=["tensor_parallel", "pipeline_parallel"], common_kwargs={"ip": ip, "port": port, "world_size": world_size}, @@ -1563,12 +1643,16 @@ def finish_generation(self, *args: Any, **kwargs: Any) -> bool: try: # Choose the appropriate method based on setting # non-colocated only needs reset prefix cache, no need to sleep. 
- if not self.cfg["colocated"]["enabled"]: - method_name = "reset_prefix_cache" - else: + if self.cfg["colocated"]["enabled"]: method_name = ( "sleep_async" if self.cfg["vllm_cfg"]["async_engine"] else "sleep" ) + else: + method_name = ( + "reset_prefix_cache_async" + if self.cfg["vllm_cfg"]["async_engine"] + else "reset_prefix_cache" + ) # Use run_all_workers_single_data for methods that don't need data futures = self.worker_group.run_all_workers_single_data( method_name, @@ -1641,9 +1725,16 @@ def update_weights_from_collective( if not self.worker_group or not self.worker_group.workers: raise RuntimeError("Worker group is not initialized") + # Choose the appropriate method based on async_engine setting + method_name = ( + "update_weights_from_collective_async" + if self.cfg["vllm_cfg"]["async_engine"] + else "update_weights_from_collective" + ) + # Use run_all_workers_single_data to send data to all workers futures = self.worker_group.run_all_workers_single_data( - "update_weights_from_collective", + method_name, data=info, run_rank_0_only_axes=["tensor_parallel", "pipeline_parallel"], ) diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py index 266be74264..dc1de1b123 100644 --- a/tests/unit/models/generation/test_vllm_generation.py +++ b/tests/unit/models/generation/test_vllm_generation.py @@ -379,13 +379,13 @@ def test_vllm_policy_generation(policy, test_input_data, tokenizer): ) -async def _generate_async(vllm_policy, tokenizer, test_input_data): +async def _generate_async(vllm_policy, tokenizer, test_input_data, greedy=False): collected_indexed_outputs = [] # generate_async is restricted to handle only single samples input_generator = test_input_data.make_microbatch_iterator(microbatch_size=1) for single_item_input in input_generator: async for original_idx, single_item_output in vllm_policy.generate_async( - single_item_input + single_item_input, greedy=greedy ): 
collected_indexed_outputs.append((original_idx, single_item_output)) @@ -691,7 +691,7 @@ async def test_vllm_generation_with_hf_training( print("Using vLLM policy for fast generation...") if async_engine: generation_results = await _generate_async( - vllm_policy, tokenizer, test_input_data + vllm_policy, tokenizer, test_input_data, greedy=True ) else: generation_results = vllm_policy.generate(test_input_data, greedy=True) @@ -1174,11 +1174,14 @@ def test_vllm_non_divisible_batch_handling(policy): ) -def test_vllm_refit_non_collocated_handles_update( +@pytest.mark.asyncio +@pytest.mark.parametrize("async_engine", [True, False]) +async def test_vllm_refit_non_collocated_update_weights( policy_cluster_separate, generation_cluster_separate, tokenizer, test_input_data, + async_engine, ): if ( policy_cluster_separate.num_gpus_per_node < 1 @@ -1197,6 +1200,7 @@ def test_vllm_refit_non_collocated_handles_update( # Create VllmGeneration policy on its own cluster vllm_config = deepcopy(basic_vllm_test_config) vllm_config = configure_generation_config(vllm_config, tokenizer, is_eval=True) + vllm_config["vllm_cfg"]["async_engine"] = async_engine vllm_config["vllm_cfg"]["tensor_parallel_size"] = 1 vllm_config["colocated"]["enabled"] = False vllm_generation = VllmGeneration(generation_cluster_separate, vllm_config) @@ -1213,7 +1217,12 @@ def test_vllm_refit_non_collocated_handles_update( ) # test generate - outputs = vllm_generation.generate(test_input_data, greedy=True) + if async_engine: + outputs = await _generate_async( + vllm_generation, tokenizer, test_input_data, greedy=True + ) + else: + outputs = vllm_generation.generate(test_input_data, greedy=True) output_ids = outputs["output_ids"] generated_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True) assert generated_texts == [ From d0dca5b9004ebf211917f65db7ef9b463b5c6cd5 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 27 Jun 2025 10:09:17 -0700 Subject: [PATCH 19/44] fix: correct mcore dtype + 
assertion on activation_func (#572) Signed-off-by: Terry Kong Signed-off-by: Xuehan --- .../models/policy/megatron_policy_worker.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/nemo_rl/models/policy/megatron_policy_worker.py b/nemo_rl/models/policy/megatron_policy_worker.py index e0bd4373be..3b6ce13e30 100644 --- a/nemo_rl/models/policy/megatron_policy_worker.py +++ b/nemo_rl/models/policy/megatron_policy_worker.py @@ -459,15 +459,27 @@ def __init__( ) model_cfg.bf16 = self.dtype == torch.bfloat16 model_cfg.fp16 = self.dtype == torch.float16 - model_cfg.params_dtype = dtype_map[ - self.cfg["megatron_cfg"]["optimizer"]["params_dtype"] - ] # FP32 for amp + if model_cfg.fp16: + assert not model_cfg.bf16, "fp16 and bf16 cannot be used together" + model_cfg.params_dtype = torch.float16 + elif model_cfg.bf16: + assert not model_cfg.fp16, "fp16 and bf16 cannot be used together" + model_cfg.params_dtype = torch.bfloat16 + else: + model_cfg.params_dtype = torch.float32 model_cfg.pipeline_dtype = dtype_map[self.cfg["megatron_cfg"]["pipeline_dtype"]] model_cfg.parallel_output = True if self.cfg["megatron_cfg"]["activation_checkpointing"]: model_cfg.activations_checkpoint_granularity = "full" model_cfg.activations_checkpoint_method = "uniform" model_cfg.activations_checkpoint_num_layers = 1 + if not model_cfg.gated_linear_unit: + assert model_cfg.activation_func is not None, ( + "activation_func must be set if not using gated_linear_unit. This likely " + "indicates an issue in configuration conversion (e.g. activation func was " + "a lambda and couldn't be serialized). This is based on this check " + "https://github.com/NVIDIA/Megatron-LM/blob/1ab876ddc4c1893c76f26d775226a8d1dcdfb3d2/megatron/core/transformer/mlp.py#L174." 
+ ) checkpoint_config = CheckpointConfig( save_interval=100, From e257d881bd5f333f02928abd94f095531f842538 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 27 Jun 2025 10:27:08 -0700 Subject: [PATCH 20/44] fix: move core ray port from 6379 -> 54258 to reduce port collision freq (#574) Signed-off-by: Terry Kong Signed-off-by: Xuehan --- ray.sub | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray.sub b/ray.sub index 7544d3b7b7..8c0cba32b1 100644 --- a/ray.sub +++ b/ray.sub @@ -40,7 +40,7 @@ DASHBOARD_AGENT_GRPC_PORT=${DASHBOARD_AGENT_GRPC_PORT:-53007} METRICS_EXPORT_PORT=${METRICS_EXPORT_PORT:-53009} # Ports for the head node -PORT=${PORT:-6379} +PORT=${PORT:-54258} RAY_CLIENT_SERVER_PORT=${RAY_CLIENT_SERVER_PORT:-10001} #REDIT_SHARD_PORTS=${REDIT_SHARD_PORTS:-"random"} ?? DASHBOARD_GRPC_PORT=${DASHBOARD_GRPC_PORT:-52367} From c27ff44276c3b947d5674778ae66f174d96870d7 Mon Sep 17 00:00:00 2001 From: Anna Shors Date: Fri, 27 Jun 2025 13:52:45 -0700 Subject: [PATCH 21/44] fix: fix overlap param gather (#561) Signed-off-by: ashors1 Signed-off-by: Xuehan --- examples/configs/dpo.yaml | 2 +- examples/configs/grpo_math_1B_megatron.yaml | 2 +- ...po-llama3.1-8b-instruct-4n8g-megatron.yaml | 2 +- ...8b-instruct-4n8g-megatrontp2pp2-quick.yaml | 2 +- ...ft-llama3.1-8b-instruct-1n8g-megatron.yaml | 2 +- examples/configs/sft.yaml | 2 +- .../models/policy/megatron_policy_worker.py | 32 +++++++++++++------ 7 files changed, 29 insertions(+), 15 deletions(-) diff --git a/examples/configs/dpo.yaml b/examples/configs/dpo.yaml index ccddde43b0..db6fb7fa6d 100755 --- a/examples/configs/dpo.yaml +++ b/examples/configs/dpo.yaml @@ -134,7 +134,7 @@ policy: distributed_data_parallel_config: grad_reduce_in_fp32: false overlap_grad_reduce: true - overlap_param_gather: false + overlap_param_gather: true average_in_collective: true data_parallel_sharding_strategy: "optim_grads_params" diff --git a/examples/configs/grpo_math_1B_megatron.yaml 
b/examples/configs/grpo_math_1B_megatron.yaml index 5b14a7ff56..6b07317ed6 100644 --- a/examples/configs/grpo_math_1B_megatron.yaml +++ b/examples/configs/grpo_math_1B_megatron.yaml @@ -115,7 +115,7 @@ policy: distributed_data_parallel_config: grad_reduce_in_fp32: false overlap_grad_reduce: true - overlap_param_gather: false + overlap_param_gather: true average_in_collective: true use_custom_fsdp: false data_parallel_sharding_strategy: "optim_grads_params" diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.yaml index 03bd0d7077..1fd336d0b4 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.yaml @@ -91,7 +91,7 @@ policy: distributed_data_parallel_config: grad_reduce_in_fp32: false overlap_grad_reduce: true - overlap_param_gather: false + overlap_param_gather: true average_in_collective: true data_parallel_sharding_strategy: "optim_grads_params" diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml index 74c93bbae0..73008f3154 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml @@ -91,7 +91,7 @@ policy: distributed_data_parallel_config: grad_reduce_in_fp32: false overlap_grad_reduce: true - overlap_param_gather: false + overlap_param_gather: true average_in_collective: true data_parallel_sharding_strategy: "optim_grads_params" diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-megatron.yaml index f6ab46c997..ddd53920e6 100644 --- 
a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-megatron.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-megatron.yaml @@ -79,7 +79,7 @@ policy: distributed_data_parallel_config: grad_reduce_in_fp32: false overlap_grad_reduce: true - overlap_param_gather: false + overlap_param_gather: true average_in_collective: true data_parallel_sharding_strategy: "optim_grads_params" diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml index 5be4451d3b..e3c614e2a7 100644 --- a/examples/configs/sft.yaml +++ b/examples/configs/sft.yaml @@ -109,7 +109,7 @@ policy: distributed_data_parallel_config: grad_reduce_in_fp32: false overlap_grad_reduce: true - overlap_param_gather: false + overlap_param_gather: true average_in_collective: true data_parallel_sharding_strategy: "optim_grads_params" diff --git a/nemo_rl/models/policy/megatron_policy_worker.py b/nemo_rl/models/policy/megatron_policy_worker.py index 3b6ce13e30..89eb263674 100644 --- a/nemo_rl/models/policy/megatron_policy_worker.py +++ b/nemo_rl/models/policy/megatron_policy_worker.py @@ -421,15 +421,6 @@ def __init__( pretrained_path, "iter_0000000/run_config.yaml" ) - assert not ( - self.cfg["megatron_cfg"]["distributed_data_parallel_config"][ - "overlap_param_gather" - ] - and self.cfg["megatron_cfg"]["optimizer"]["use_distributed_optimizer"] - ), ( - "Using overlap param gather together with distributed optimizer has known convergence issues. Please disable overlap param gather." 
- ) - self.tokenizer = tokenizer if self.tokenizer.pad_token is None: self.tokenizer.pad_token = self.tokenizer.eos_token @@ -645,6 +636,13 @@ def __init__( self._held_gather_buffer = None self.megatron_to_hf_converter = MegatronToHFConverter(hf_model_name, self.model) + self.should_disable_forward_pre_hook = ( + self.cfg["megatron_cfg"]["optimizer"]["use_distributed_optimizer"] + and self.cfg["megatron_cfg"]["distributed_data_parallel_config"][ + "overlap_param_gather" + ] + ) + def configure_worker(self, num_gpus: int, bundle_indices: Optional[tuple] = None): USE_EXPANDABLE_SEGMENTS = False # Disabling this right now as it seems to cause vLLM refit issues with Ampere if USE_EXPANDABLE_SEGMENTS: @@ -662,6 +660,14 @@ def get_gpu_info(self): """Return information about the GPU being used by this worker.""" return get_gpu_info(self.model) + def enable_forward_pre_hook(self): + assert isinstance(self.model, DistributedDataParallel) + self.model.enable_forward_pre_hook() + + def disable_forward_pre_hook(self, param_sync=True): + assert isinstance(self.model, DistributedDataParallel) + self.model.disable_forward_pre_hook(param_sync=param_sync) + def train( self, data: BatchedDataDict, @@ -1001,6 +1007,10 @@ def use_reference_model(self): On entry: Moves model to CPU, moves reference_model to CUDA. 
Swaps the references On exit: Restores original references and re-flips cuda/cpu """ + ## disable overlap param gather when swapping weights + if self.should_disable_forward_pre_hook: + self.disable_forward_pre_hook() + with torch.no_grad(): try: # Save original references @@ -1035,6 +1045,10 @@ def use_reference_model(self): gc.collect() torch.cuda.empty_cache() + ## re-enable overlap param gather after weight swap + if self.should_disable_forward_pre_hook: + self.enable_forward_pre_hook() + # Temporary fix, 'data' is a kwarg due to some sort of ray bug def get_reference_policy_logprobs( self, *, data: BatchedDataDict[Any], micro_batch_size: Optional[int] = None From 16ac698dcb2a6fc88f6f0f6497069a9c202c93fe Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 27 Jun 2025 14:15:25 -0700 Subject: [PATCH 22/44] docs: fix some typos on nsys/model-quirk pages (#560) Signed-off-by: Terry Kong Signed-off-by: Xuehan --- docs/model-quirks.md | 2 +- docs/nsys-profiling.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/model-quirks.md b/docs/model-quirks.md index f0f1c961f9..fa2b181c7e 100644 --- a/docs/model-quirks.md +++ b/docs/model-quirks.md @@ -32,7 +32,7 @@ NeMo-RL uses the vLLM V1 runtime for both synchronous and asynchronous inference ### Context Parallel with FSDP2 NeMo-RL implemented this feature based on torch CP [implementation](https://github.com/pytorch/pytorch/blob/main/torch/distributed/tensor/experimental/_attention.py). And we inherit its limitations. -Whether model level support CP only depends on arguments passed to `torch.nn.functional.scaled_dot_product_attention`. Current NeMo-RL passed all ones attention mask to `model.forward`. For Gemma-3, it won't ignore attention mask as result `attn_base` is not None which is not supported by torch CP. Please see [assertion](https://github.com/pytorch/pytorch/blob/134179474539648ba7dee1317959529fbd0e7f89/torch/distributed/tensor/experimental/_attention.py#L262) . 
+Whether model level support CP only depends on arguments passed to `torch.nn.functional.scaled_dot_product_attention`. Current NeMo-RL passed all ones attention mask to `model.forward`. For Gemma-3, it won't ignore attention mask as result `attn_bias` is not None which is not supported by torch CP. Please see [assertion](https://github.com/pytorch/pytorch/blob/134179474539648ba7dee1317959529fbd0e7f89/torch/distributed/tensor/experimental/_attention.py#L262) . ## vLLM Async Rollout Timeout diff --git a/docs/nsys-profiling.md b/docs/nsys-profiling.md index 1ee914c842..3c5ccd0c3a 100644 --- a/docs/nsys-profiling.md +++ b/docs/nsys-profiling.md @@ -69,7 +69,7 @@ To profile a Megatron worker, you should set `LD_LIBRARY_PATH` as follows, other ```bash LD_LIBRARY_PATH="/usr/local/cuda/targets/x86_64-linux/lib:/usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/nvidia/lib64:/usr/local/nvidia/lib:/usr/lib/x86_64-linux-gnu" \ -NRL_NSYS_PROFILE_STEP_RANGE=2:3 NRL_NSYS_WORKER_PATTERNS="dtensor_policy_worker,vllm_generation_worker" uv run --config examples/configs/grpo_math_1B_megatron.yaml examples/run_grpo_math.py grpo.max_num_steps=5 +NRL_NSYS_PROFILE_STEP_RANGE=2:3 NRL_NSYS_WORKER_PATTERNS="megatron_policy_worker,vllm_generation_worker" uv run examples/run_grpo_math.py --config examples/configs/grpo_math_1B_megatron.yaml grpo.max_num_steps=5 ``` ## Profile Output From 9b79e1e791ba5af3c78c7f80d3e060b5657ba8d8 Mon Sep 17 00:00:00 2001 From: Anna Shors Date: Fri, 27 Jun 2025 16:00:38 -0700 Subject: [PATCH 23/44] feat: Add megatron to hf converter (#555) Signed-off-by: Anna Shors Signed-off-by: ashors1 Signed-off-by: Xuehan --- 3rdparty/NeMo-workspace/NeMo | 2 +- docs/design-docs/checkpointing.md | 4 +- docs/guides/eval.md | 4 +- docs/guides/grpo-deepscaler.md | 2 +- docs/guides/sft-openmathinstruct2.md | 2 +- .../{ => converters}/convert_dcp_to_hf.py | 0 examples/converters/convert_megatron_to_hf.py | 67 ++++ nemo_rl/models/megatron/community_import.py | 45 ++- 
tests/functional/test_converter_roundtrip.py | 369 ++++++++++++++++++ tests/functional/test_converters.sh | 1 + 10 files changed, 488 insertions(+), 8 deletions(-) rename examples/{ => converters}/convert_dcp_to_hf.py (100%) create mode 100644 examples/converters/convert_megatron_to_hf.py create mode 100644 tests/functional/test_converter_roundtrip.py create mode 100644 tests/functional/test_converters.sh diff --git a/3rdparty/NeMo-workspace/NeMo b/3rdparty/NeMo-workspace/NeMo index bab66472d2..4b7ded58d8 160000 --- a/3rdparty/NeMo-workspace/NeMo +++ b/3rdparty/NeMo-workspace/NeMo @@ -1 +1 @@ -Subproject commit bab66472d2f2eb05ab621dbad66ad6031e4ee19e +Subproject commit 4b7ded58d804bf3470499c6cfa385c6fa915879d diff --git a/docs/design-docs/checkpointing.md b/docs/design-docs/checkpointing.md index de7fb64fbe..5d3feae680 100644 --- a/docs/design-docs/checkpointing.md +++ b/docs/design-docs/checkpointing.md @@ -5,7 +5,7 @@ NeMo RL provides two checkpoint formats for Hugging Face models: Torch distribut A checkpoint converter is provided to convert a Torch distributed checkpoint checkpoint to Hugging Face format after training: ```sh -uv run examples/convert_dcp_to_hf.py --config= --dcp-ckpt-path= --hf-ckpt-path= +uv run examples/converters/convert_dcp_to_hf.py --config= --dcp-ckpt-path= --hf-ckpt-path= ``` Usually Hugging Face checkpoints keep the weights and tokenizer together (which we also recommend for provenance). You can copy it afterwards. 
Here's an end-to-end example: @@ -14,6 +14,6 @@ Usually Hugging Face checkpoints keep the weights and tokenizer together (which # Change to your appropriate checkpoint directory CKPT_DIR=results/sft/step_10 -uv run examples/convert_dcp_to_hf.py --config=$CKPT_DIR/config.yaml --dcp-ckpt-path=$CKPT_DIR/policy/weights --hf-ckpt-path=${CKPT_DIR}-hf +uv run examples/converters/convert_dcp_to_hf.py --config=$CKPT_DIR/config.yaml --dcp-ckpt-path=$CKPT_DIR/policy/weights --hf-ckpt-path=${CKPT_DIR}-hf rsync -ahP $CKPT_DIR/policy/tokenizer ${CKPT_DIR}-hf/ ``` diff --git a/docs/guides/eval.md b/docs/guides/eval.md index b6e312f574..0281bb21f7 100644 --- a/docs/guides/eval.md +++ b/docs/guides/eval.md @@ -9,11 +9,11 @@ To prepare for evaluation, first ensure your model is in the correct format, whi ### Convert DCP to HF (Optional) If you have trained a model and saved the checkpoint in the Pytorch DCP format, you first need to convert it to the Hugging Face format before running evaluation. -Use the `examples/convert_dcp_to_hf.py` script. You'll need the path to the training configuration file (`config.yaml`), the DCP checkpoint directory, and specify an output path for the HF format model. +Use the `examples/converters/convert_dcp_to_hf.py` script. You'll need the path to the training configuration file (`config.yaml`), the DCP checkpoint directory, and specify an output path for the HF format model. 
```sh # Example for a GRPO checkpoint at step 170 -uv run python examples/convert_dcp_to_hf.py \ +uv run python examples/converters/convert_dcp_to_hf.py \ --config results/grpo/step_170/config.yaml \ --dcp-ckpt-path results/grpo/step_170/policy/weights/ \ --hf-ckpt-path results/grpo/hf diff --git a/docs/guides/grpo-deepscaler.md b/docs/guides/grpo-deepscaler.md index 5beddf1689..456b2f2d8b 100644 --- a/docs/guides/grpo-deepscaler.md +++ b/docs/guides/grpo-deepscaler.md @@ -16,7 +16,7 @@ uv run examples/run_grpo_math.py --config=examples/configs/grpo-deepscaler-1.5b- At the end of each stage, you need to specify the Hugging Face checkpoint to continue training with. To get this checkpoint, we convert a model checkpoint to a Hugging Face checkpoint with the following command: ```sh -uv run examples/convert_dcp_to_hf.py --config=results/grpo-deepscaler-1.5b-8K/step_240/config.yaml --dcp-ckpt-path=results/grpo-deepscaler-1.5b-8K/step_240/policy/weights --hf-ckpt-path=results/grpo-deepscaler-1.5b-8K/step_240/hf +uv run examples/converters/convert_dcp_to_hf.py --config=results/grpo-deepscaler-1.5b-8K/step_240/config.yaml --dcp-ckpt-path=results/grpo-deepscaler-1.5b-8K/step_240/policy/weights --hf-ckpt-path=results/grpo-deepscaler-1.5b-8K/step_240/hf ``` When running the next command, we use the Hugging Face checkpoint as the initial checkpoint. We train with an 8K context window for 240 steps, a 16K context window for 290 steps, and a 24K context window for 50 steps. We run all experiments on a single 8XH100 80GB node or on a single 8XA100 80GB node. 
diff --git a/docs/guides/sft-openmathinstruct2.md b/docs/guides/sft-openmathinstruct2.md index dae8e8846d..6698c12bc0 100644 --- a/docs/guides/sft-openmathinstruct2.md +++ b/docs/guides/sft-openmathinstruct2.md @@ -26,7 +26,7 @@ The default config uses 8 GPUs (`cluster.gpus_per_node`) on 1 node (`cluster.num Throughout training, the checkpoints of the model will be saved to the `results/sft_openmathinstruct2` folder (specified by `checkpointing.checkpoint_dir`). To evaluate the model, we first need to convert the PyTorch distributed checkpoint to Hugging Face format: ``` -uv run examples/convert_dcp_to_hf.py \ +uv run examples/converters/convert_dcp_to_hf.py \ --config=results/sft_openmathinstruct2/step_1855/config.yaml \ --dcp-ckpt-path=results/sft_openmathinstruct2/step_1855/policy/weights \ --hf-ckpt-path=results/sft_openmathinstruct2/step_1855/hf diff --git a/examples/convert_dcp_to_hf.py b/examples/converters/convert_dcp_to_hf.py similarity index 100% rename from examples/convert_dcp_to_hf.py rename to examples/converters/convert_dcp_to_hf.py diff --git a/examples/converters/convert_megatron_to_hf.py b/examples/converters/convert_megatron_to_hf.py new file mode 100644 index 0000000000..ea4501286e --- /dev/null +++ b/examples/converters/convert_megatron_to_hf.py @@ -0,0 +1,67 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse + +import yaml + +from nemo_rl.models.megatron.community_import import export_model_from_megatron + + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + description="Convert Torch DCP checkpoint to HF checkpoint" + ) + parser.add_argument( + "--config", + type=str, + default=None, + help="Path to config.yaml file in the checkpoint directory", + ) + parser.add_argument( + "--megatron-ckpt-path", + type=str, + default=None, + help="Path to Megatron checkpoint", + ) + parser.add_argument( + "--hf-ckpt-path", type=str, default=None, help="Path to save HF checkpoint" + ) + # Parse known args for the script + args = parser.parse_args() + + return args + + +def main(): + """Main entry point.""" + args = parse_args() + + with open(args.config, "r") as f: + config = yaml.safe_load(f) + + model_name = config["policy"]["model_name"] + tokenizer_name = config["policy"]["tokenizer"]["name"] + + export_model_from_megatron( + hf_model_name=model_name, + input_path=args.megatron_ckpt_path, + output_path=args.hf_ckpt_path, + hf_tokenizer_path=tokenizer_name, + ) + + +if __name__ == "__main__": + main() diff --git a/nemo_rl/models/megatron/community_import.py b/nemo_rl/models/megatron/community_import.py index e83922e659..5ad061c54a 100644 --- a/nemo_rl/models/megatron/community_import.py +++ b/nemo_rl/models/megatron/community_import.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os + def import_model_from_hf_name(hf_model_name: str, output_path: str): if "llama" in hf_model_name.lower(): @@ -31,9 +33,50 @@ def import_model_from_hf_name(hf_model_name: str, output_path: str): output_path=output_path, ) else: - raise ValueError(f"Unknown model: {hf_model_name}") + raise ValueError( + f"Unknown model: {hf_model_name}. Currently, only Qwen2 and Llama are supported. 
" + "If you'd like to run with a different model, please raise an issue or consider adding your own converter." + ) importer.apply() # resetting mcore state import megatron.core.rerun_state_machine megatron.core.rerun_state_machine.destroy_rerun_state_machine() + + +def export_model_from_megatron( + hf_model_name: str, + input_path: str, + output_path: str, + hf_tokenizer_path: str, + overwrite: bool = False, +): + if os.path.exists(output_path) and not overwrite: + raise FileExistsError( + f"HF checkpoint already exists at {output_path}. Delete it to run or set overwrite=True." + ) + + if "llama" in hf_model_name.lower(): + from nemo.tron.converter.llama import HFLlamaExporter + + exporter_cls = HFLlamaExporter + elif "qwen" in hf_model_name.lower(): + from nemo.tron.converter.qwen import HFQwen2Exporter + + exporter_cls = HFQwen2Exporter + else: + raise ValueError( + f"Unknown model: {hf_model_name}. Currently, only Qwen2 and Llama are supported. " + "If you'd like to run with a different model, please raise an issue or consider adding your own converter." + ) + print(f"Exporting model {hf_model_name} to {output_path}...") + exporter = exporter_cls( + input_path=input_path, + output_path=output_path, + hf_tokenizer_path=hf_tokenizer_path, + ) + exporter.apply() + # resetting mcore state + import megatron.core.rerun_state_machine + + megatron.core.rerun_state_machine.destroy_rerun_state_machine() diff --git a/tests/functional/test_converter_roundtrip.py b/tests/functional/test_converter_roundtrip.py new file mode 100644 index 0000000000..e551d0e6b5 --- /dev/null +++ b/tests/functional/test_converter_roundtrip.py @@ -0,0 +1,369 @@ +#!/usr/bin/env python3 +""" +Functional test for converter roundtrip functionality. + +This test: +1. Starts with a HuggingFace Qwen/Qwen2-0.5B checkpoint +2. Converts the model to torch DCP format +3. Converts the model to Megatron format (using community import) +4. Converts both the DCP and Megatron checkpoints back to HF format +5. 
Asserts that the converted DCP and Megatron checkpoints are identical and match the original HF checkpoint +""" + +import os +import tempfile +from typing import Any, Dict + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +from nemo_rl.algorithms.utils import get_tokenizer +from nemo_rl.distributed.virtual_cluster import RayVirtualCluster +from nemo_rl.models.megatron.community_import import ( + export_model_from_megatron, + import_model_from_hf_name, +) +from nemo_rl.models.policy.lm_policy import Policy +from nemo_rl.utils.native_checkpoint import convert_dcp_to_hf + + +def create_test_config() -> Dict[str, Any]: + """Create a test configuration for SFT training.""" + return { + "sft": { + "max_num_epochs": 1, ## unused, no training is actually done + "max_num_steps": 2, + "val_period": 2, + "val_batches": 1, + "val_global_batch_size": 4, + "val_micro_batch_size": 2, + "val_at_start": False, + "seed": 42, + }, + "checkpointing": { + "enabled": True, + "checkpoint_dir": "/tmp/test_converter_checkpoints", + "metric_name": "val_loss", + "higher_is_better": False, + "keep_top_k": 1, + "save_period": 2, + }, + "policy": { + "model_name": "Qwen/Qwen2-0.5B", + "tokenizer": {"name": "Qwen/Qwen2-0.5B"}, + "train_global_batch_size": 4, + "train_micro_batch_size": 2, + "max_total_sequence_length": 128, + "precision": "bfloat16", + "fsdp_offload_enabled": False, + "activation_checkpointing_enabled": False, + "dtensor_cfg": { + "enabled": True, + "cpu_offload": False, + "sequence_parallel": False, + "activation_checkpointing": False, + "tensor_parallel_size": 1, + "context_parallel_size": 1, + "custom_parallel_plan": None, + }, + "dynamic_batching": {"enabled": False}, + "make_sequence_length_divisible_by": 1, + "max_grad_norm": 1.0, + "optimizer": { + "name": "torch.optim.AdamW", + "kwargs": { + "lr": 5.0e-6, + "weight_decay": 0.1, + "betas": [0.9, 0.98], + "eps": 1e-5, + "foreach": False, + "fused": False, + }, + }, + "megatron_cfg": { + 
"enabled": False, # We'll use DCP for this test + }, + }, + "data": { + "max_input_seq_length": 128, + "dataset_name": "squad", + "add_bos": True, + "add_eos": True, + "add_generation_prompt": False, + }, + "logger": { + "log_dir": "/tmp/test_converter_logs", + "wandb_enabled": False, + "tensorboard_enabled": False, + "monitor_gpus": False, + }, + "cluster": { + "gpus_per_node": 1, + "num_nodes": 1, + }, + } + + +def load_model_and_tokenizer(model_name: str): + """Load the original HF model and tokenizer.""" + print(f"Loading original model: {model_name}") + model = AutoModelForCausalLM.from_pretrained( + model_name, torch_dtype=torch.bfloat16, trust_remote_code=True + ) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + return model, tokenizer + + +def get_model_state_dict(model): + """Get the state dict of a model, ensuring all tensors are on CPU.""" + state_dict = model.state_dict() + cpu_state_dict = {} + for key, value in state_dict.items(): + if isinstance(value, torch.Tensor): + cpu_state_dict[key] = value.detach().cpu() + else: + cpu_state_dict[key] = value + return cpu_state_dict + + +def assert_state_dicts_equal( + state_dict1: Dict[str, Any], state_dict2: Dict[str, Any], name1: str, name2: str +): + """Assert that two state dictionaries are equal.""" + print(f"Comparing {name1} vs {name2}") + + # Check that keys match + keys1 = set(state_dict1.keys()) + keys2 = set(state_dict2.keys()) + + if keys1 != keys2: + missing_in_2 = keys1 - keys2 + missing_in_1 = keys2 - keys1 + raise AssertionError( + f"State dict keys don't match between {name1} and {name2}.\n" + f"Keys in {name1} but not in {name2}: {missing_in_2}\n" + f"Keys in {name2} but not in {name1}: {missing_in_1}" + ) + + # Check that values match + for key in keys1: + val1 = state_dict1[key] + val2 = state_dict2[key] + + if isinstance(val1, torch.Tensor) and isinstance(val2, torch.Tensor): + if 
not torch.allclose(val1, val2, rtol=1e-5, atol=1e-5): + max_diff = torch.max(torch.abs(val1 - val2)).item() + raise AssertionError( + f"Tensors for key '{key}' don't match between {name1} and {name2}. " + f"Max difference: {max_diff}" + ) + elif val1 != val2: + raise AssertionError( + f"Non-tensor values for key '{key}' don't match between {name1} and {name2}. " + f"{name1}: {val1}, {name2}: {val2}" + ) + + print(f"✓ {name1} and {name2} are identical") + + +def create_dcp_checkpoint( + model_name: str, config: Dict[str, Any], temp_dir: str +) -> str: + """Create a DCP checkpoint without training.""" + print("Creating DCP checkpoint...") + + # Create cluster + cluster = RayVirtualCluster( + name="test-converter-cluster", + bundle_ct_per_node_list=[1], + use_gpus=True, + num_gpus_per_node=1, + max_colocated_worker_groups=1, + ) + + # Get tokenizer + tokenizer = get_tokenizer(config["policy"]["tokenizer"]) + + # Create policy + policy = Policy( + cluster=cluster, + config=config["policy"], + tokenizer=tokenizer, + init_reference_model=False, + ) + + # Save checkpoint without any training + dcp_checkpoint_path = os.path.join(temp_dir, "dcp_checkpoint") + policy.save_checkpoint(dcp_checkpoint_path) + + print(f"✓ DCP checkpoint saved to: {dcp_checkpoint_path}") + return dcp_checkpoint_path + + +def create_megatron_checkpoint(model_name: str, temp_dir: str) -> str: + """Create a Megatron checkpoint using community import.""" + print("Creating Megatron checkpoint...") + + megatron_checkpoint_path = os.path.join(temp_dir, "megatron_checkpoint") + import_model_from_hf_name(model_name, megatron_checkpoint_path) + + print(f"✓ Megatron checkpoint saved to: {megatron_checkpoint_path}") + return os.path.join(megatron_checkpoint_path, "iter_0000000") + + +def convert_dcp_to_hf_checkpoint(dcp_path: str, model_name: str, temp_dir: str) -> str: + """Convert DCP checkpoint to HF format.""" + print("Converting DCP to HF format...") + + hf_path = os.path.join(temp_dir, "dcp_to_hf") + 
convert_dcp_to_hf( + dcp_ckpt_path=dcp_path, + hf_ckpt_path=hf_path, + model_name_or_path=model_name, + tokenizer_name_or_path=model_name, + overwrite=True, + ) + + print(f"✓ DCP to HF conversion saved to: {hf_path}") + return hf_path + + +def convert_megatron_to_hf_checkpoint( + megatron_path: str, model_name: str, temp_dir: str +) -> str: + """Convert Megatron checkpoint to HF format.""" + print("Converting Megatron to HF format...") + + hf_path = os.path.join(temp_dir, "megatron_to_hf") + + # Get tokenizer for the export + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + tokenizer_path = os.path.join(temp_dir, "tokenizer") + tokenizer.save_pretrained(tokenizer_path) + + export_model_from_megatron( + hf_model_name=model_name, + input_path=megatron_path, + output_path=hf_path, + hf_tokenizer_path=tokenizer_path, + overwrite=True, + ) + + print(f"✓ Megatron to HF conversion saved to: {hf_path}") + return hf_path + + +def main(): + """Main test function.""" + print("=" * 80) + print("Starting Converter Roundtrip Functional Test") + print("=" * 80) + + # TODO(@ashors): test more models + model_name = "Qwen/Qwen2-0.5B" + + with tempfile.TemporaryDirectory() as temp_dir: + print(f"Using temporary directory: {temp_dir}") + + # Step 1: Load original HF model + print("\n" + "=" * 60) + print("STEP 1: Loading original HuggingFace model") + print("=" * 60) + original_model, original_tokenizer = load_model_and_tokenizer(model_name) + original_state_dict = get_model_state_dict(original_model) + + # Step 2: Create DCP checkpoint + print("\n" + "=" * 60) + print("STEP 2: Creating DCP checkpoint") + print("=" * 60) + config = create_test_config() + dcp_checkpoint_path = create_dcp_checkpoint(model_name, config, temp_dir) + + # Step 3: Create Megatron checkpoint + print("\n" + "=" * 60) + print("STEP 3: Creating Megatron checkpoint") + print("=" * 60) + megatron_checkpoint_path = create_megatron_checkpoint(model_name, temp_dir) + + # Step 4: Convert 
DCP to HF + print("\n" + "=" * 60) + print("STEP 4: Converting DCP to HF format") + print("=" * 60) + dcp_to_hf_path = convert_dcp_to_hf_checkpoint( + dcp_checkpoint_path, model_name, temp_dir + ) + + # Step 5: Convert Megatron to HF + print("\n" + "=" * 60) + print("STEP 5: Converting Megatron to HF format") + print("=" * 60) + megatron_to_hf_path = convert_megatron_to_hf_checkpoint( + megatron_checkpoint_path, model_name, temp_dir + ) + + # Step 6: Load converted models and compare + print("\n" + "=" * 60) + print("STEP 6: Loading converted models and comparing") + print("=" * 60) + + # Load DCP-converted model + dcp_converted_model = AutoModelForCausalLM.from_pretrained( + dcp_to_hf_path, torch_dtype=torch.bfloat16, trust_remote_code=True + ) + dcp_converted_state_dict = get_model_state_dict(dcp_converted_model) + + # Load Megatron-converted model + megatron_converted_model = AutoModelForCausalLM.from_pretrained( + megatron_to_hf_path, torch_dtype=torch.bfloat16, trust_remote_code=True + ) + megatron_converted_state_dict = get_model_state_dict(megatron_converted_model) + + # Step 7: Assertions + print("\n" + "=" * 60) + print("STEP 7: Running assertions") + print("=" * 60) + + # Compare DCP-converted vs Megatron-converted + print("Comparing DCP-converted HF model with Megatron-converted HF model...") + assert_state_dicts_equal( + dcp_converted_state_dict, + megatron_converted_state_dict, + "DCP-converted HF model", + "Megatron-converted HF model", + ) + + print("✓ DCP and Megatron roundtrip checkpoints are identical!") + + # Verify that both converted models have the expected structure + expected_keys = set(original_state_dict.keys()) + dcp_keys = set(dcp_converted_state_dict.keys()) + megatron_keys = set(megatron_converted_state_dict.keys()) + + assert dcp_keys == expected_keys, ( + f"DCP converted model missing keys: {expected_keys - dcp_keys}" + ) + assert megatron_keys == expected_keys, ( + f"Megatron converted model missing keys: {expected_keys - 
megatron_keys}" + ) + + print("✓ All converted models have the expected structure") + + # Test that we can do a forward pass with both converted models + print("Testing forward passes...") + test_input = torch.randint(0, 1000, (1, 10)) + + with torch.no_grad(): + dcp_output = dcp_converted_model(test_input) + megatron_output = megatron_converted_model(test_input) + + print("✓ Both converted models can perform forward passes") + + print("\n" + "=" * 80) + print("✓ ALL TESTS PASSED!") + print("=" * 80) + + +if __name__ == "__main__": + main() diff --git a/tests/functional/test_converters.sh b/tests/functional/test_converters.sh new file mode 100644 index 0000000000..ef789ecf90 --- /dev/null +++ b/tests/functional/test_converters.sh @@ -0,0 +1 @@ +uv run --extra mcore tests/functional/test_converter_roundtrip.py \ No newline at end of file From 4022bee33e31a3225f36f9a09ffbef28f3c25932 Mon Sep 17 00:00:00 2001 From: Anna Shors Date: Fri, 27 Jun 2025 20:54:13 -0700 Subject: [PATCH 24/44] docs: Add a note on supported backends (#553) Signed-off-by: ashors1 Signed-off-by: Xuehan --- README.md | 42 +++++++++++++++++++++++++++ docs/design-docs/training-backends.md | 35 ++++++++++++++++++++++ docs/index.md | 1 + 3 files changed, 78 insertions(+) create mode 100644 docs/design-docs/training-backends.md diff --git a/README.md b/README.md index e83d242734..8f5b3b54e5 100644 --- a/README.md +++ b/README.md @@ -16,10 +16,12 @@ - [DPO](#dpo) - [DPO Single Node](#dpo-single-node) - [DPO Multi-node](#dpo-multi-node) + - [Supported Training Backends](#training-backends) - [Evaluation](#evaluation) - [Convert Model Format (Optional)](#convert-model-format-optional) - [Run Evaluation](#run-evaluation) - [Set Up Clusters](#set-up-clusters) + - [Tips and Tricks](#tips-and-tricks) - [Citation](#citation) - [Contributing](#contributing) - [Licenses](#licenses) @@ -152,6 +154,18 @@ uv run python examples/run_grpo_math.py \ logger.num_val_samples_to_print=10 ``` +The default configuration 
uses the DTensor training backend. We also provide a config `examples/configs/grpo_math_1B_megatron.yaml` which is set up to use the Megatron backend out of the box. + +To train using this config on a single GPU: + +```sh +# Run a GRPO math example on 1 GPU using the Megatron backend +uv run python examples/run_grpo_math.py \ + --config examples/configs/grpo_math_1B_megatron.yaml +``` + +For additional details on supported backends and how to configure the training backend to suit your setup, refer to the [Training Backends documentation](docs/design-docs/training-backends.md). + ### GRPO Multi-node ```sh @@ -310,6 +324,15 @@ sbatch \ ray.sub ``` +## Training Backends + +NeMo RL supports multiple training backends to accommodate different model sizes and hardware configurations: + +- **DTensor (FSDP2)** - PyTorch's next-generation distributed training with improved memory efficiency +- **Megatron** - NVIDIA's high-performance training framework for scaling to large models (>100B parameters) + +The training backend is automatically determined based on your YAML configuration settings. For detailed information on backend selection, configuration, and examples, see the [Training Backends documentation](docs/design-docs/training-backends.md). + ## Evaluation We provide evaluation tools to assess model capabilities. @@ -360,6 +383,25 @@ Refer to `examples/configs/eval.yaml` for a full list of parameters that can be For detailed instructions on how to set up and launch NeMo RL on Slurm or Kubernetes clusters, please refer to the dedicated [Cluster Start](docs/cluster.md) documentation. +## Tips and Tricks +- If you forget to initialize the NeMo and Megatron submodules when cloning the NeMo-RL repository, you may run into an error like this: + + ```sh + ModuleNotFoundError: No module named 'megatron' + ``` + + If you see this error, there is likely an issue with your virtual environments. 
To fix this, first initialize the submodules:
+
+  ```sh
+  git submodule update --init --recursive
+  ```
+
+  and then force a rebuild of the virtual environments by setting `NRL_FORCE_REBUILD_VENVS=true` next time you launch a run:
+
+  ```sh
+  NRL_FORCE_REBUILD_VENVS=true uv run examples/run_grpo.py ...
+  ```
+
 ## Citation
 
 If you use NeMo RL in your research, please cite it using the following BibTeX entry:
diff --git a/docs/design-docs/training-backends.md b/docs/design-docs/training-backends.md
new file mode 100644
index 0000000000..0448284971
--- /dev/null
+++ b/docs/design-docs/training-backends.md
@@ -0,0 +1,35 @@
+# Training Backends
+
+NeMo RL supports multiple training backends to accommodate different model sizes and hardware configurations.
+
+## Available Backends
+
+- **DTensor (FSDP2)** - PyTorch's next-generation distributed training with improved memory efficiency
+- **Megatron** - NVIDIA's high-performance training framework for scaling to large models (>100B parameters)
+
+## Backend Selection
+
+The training backend is automatically determined based on your YAML configuration settings. Here's how to configure each backend.
+
+### Megatron Backend
+To enable Megatron-based training:
+
+1. Add the `megatron_cfg` key to your policy configuration.
+2. Set `policy.megatron_cfg.enabled=True`.
+3. Refer to [examples/configs/grpo_math_1B_megatron.yaml](../../examples/configs/grpo_math_1B_megatron.yaml) for a complete configuration example.
+
+_Note_: When using Megatron, the optimizer and learning rate schedule are configured through `policy.megatron_cfg.optimizer` and `policy.megatron_cfg.scheduler`, respectively.
+
+### DTensor Backend
+To enable DTensor (FSDP2) training:
+
+1. Set `policy.dtensor_config.enabled=True`.
+2. Refer to [examples/configs/grpo_math_1B.yaml](../../examples/configs/grpo_math_1B.yaml) for a configuration example. 
+ +## Backend Priority + +**Megatron takes precedence over DTensor.** If both backends are enabled simultaneously (`policy.megatron_cfg.enabled=True` and `policy.dtensor_config.enabled=True`), the Megatron backend will be used. + +## Configuration Examples + +For comprehensive examples of each algorithm and backend, see the [examples/configs/recipes/llm](https://github.com/NVIDIA-NeMo/RL/tree/main/examples/configs/recipes/llm) folder. This directory contains ready-to-use configurations for various supported combinations. diff --git a/docs/index.md b/docs/index.md index c7ad002631..33d507b6f4 100644 --- a/docs/index.md +++ b/docs/index.md @@ -63,4 +63,5 @@ design-docs/generation.md design-docs/checkpointing.md design-docs/loss-functions.md design-docs/fsdp2-parallel-plan.md +design-docs/training-backends.md ``` From f03e596fdac30e913a19067dc6f3258e7eeb5ee9 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Fri, 27 Jun 2025 20:55:39 -0700 Subject: [PATCH 25/44] feat: Support pass@k (#536) Signed-off-by: Dheeraj Peri Signed-off-by: Xuehan --- examples/configs/evals/eval.yaml | 3 +- nemo_rl/evals/eval.py | 62 +++++++++++++++++++++++++------- 2 files changed, 52 insertions(+), 13 deletions(-) diff --git a/examples/configs/evals/eval.yaml b/examples/configs/evals/eval.yaml index 1c21af99c4..439acff25e 100644 --- a/examples/configs/evals/eval.yaml +++ b/examples/configs/evals/eval.yaml @@ -1,8 +1,9 @@ # Evaluation Configuration eval: - metric: "pass@1" # only pass@1 is supported now + metric: "pass@k" num_tests_per_prompt: 1 # every prompt will be tested num_tests_per_prompt times and use the average score as the final score seed: 42 + pass_k_value: 1 generation: backend: "vllm" # only vllm is supported for evaluation diff --git a/nemo_rl/evals/eval.py b/nemo_rl/evals/eval.py index 9ca1e90762..5788e1971e 100644 --- a/nemo_rl/evals/eval.py +++ b/nemo_rl/evals/eval.py @@ -16,6 +16,7 @@ from typing import TypedDict import ray +import torch from torch.utils.data import 
DataLoader from transformers import AutoTokenizer @@ -38,6 +39,7 @@ class EvalConfig(TypedDict): metric: str num_tests_per_prompt: int seed: int + pass_k_value: int class MasterConfig(TypedDict): @@ -83,16 +85,26 @@ def setup( # Check settings metric = eval_config["metric"] + pass_k_value = eval_config["pass_k_value"] num_tests_per_prompt = eval_config["num_tests_per_prompt"] temperature = generation_config["temperature"] top_k = generation_config["top_k"] - # TODO @yukih: support pass@k and cons@k - assert metric in ["pass@1"], f"Invalid metric: {metric}" + + # TODO @yukih: support cons@k + # Validate metrics + assert metric in ["pass@k"], f"Invalid metric: {metric}" if num_tests_per_prompt > 1: assert temperature > 0 and top_k != 1, ( "temperature > 0 and top_k != 1 are required for multiple samples" ) + assert pass_k_value >= 1, ( + "pass_k_value must be greater than or equal to 1 for pass@k metric" + ) + assert num_tests_per_prompt >= pass_k_value, ( + "num_tests_per_prompt must be greater than or equal to pass_k_value for pass@k metric" + ) + # ========================== # Data # ========================== @@ -150,6 +162,34 @@ def setup( # =============================================================================== +def eval_pass_k(rewards: torch.Tensor, num_tests_per_prompt: int, k: int) -> float: + """Evaluate pass@k score using an unbiased estimator. 
+ + Reference: https://github.com/huggingface/evaluate/blob/32546aafec25cdc2a5d7dd9f941fc5be56ba122f/metrics/code_eval/code_eval.py#L198-L213 + Args: + rewards: Tensor of shape (batch_size * num_tests_per_prompt) + k: int (pass@k value) + + Returns: + pass_k_score: float + """ + + def eval_single_chunk(n: int, c: int, k: int) -> float: + """Calculates 1 - comb(n - c, k) / comb(n, k).""" + if n - c < k: + return 1.0 + return float(1.0 - torch.prod(1.0 - k / torch.arange(n - c + 1, n + 1)).item()) + + # rewards is a 1d tensor of size (batch_size * num_tests_per_prompt) + group_rewards = rewards.split(num_tests_per_prompt) + pass_k_score = 0.0 + for group_reward in group_rewards: + num_correct = group_reward.sum().item() + pass_k_score += eval_single_chunk(num_tests_per_prompt, num_correct, k) + + return pass_k_score + + def run_env_eval(vllm_generation, dataloader, env, master_config): """Main entry point for running evaluation using environment. @@ -166,13 +206,11 @@ def run_env_eval(vllm_generation, dataloader, env, master_config): eval_config = master_config["eval"] metric = eval_config["metric"] num_tests_per_prompt = eval_config["num_tests_per_prompt"] + pass_k_value = eval_config["pass_k_value"] # Run evaluation loop - score, count = 0.0, 0 + score = 0.0 for batch in dataloader: - # update stats - count += batch.size * num_tests_per_prompt - # measure multiple samples if num_tests_per_prompt > 1: batch = batch.repeat_interleave(num_tests_per_prompt) @@ -203,10 +241,10 @@ def run_env_eval(vllm_generation, dataloader, env, master_config): for i in range(len(batch["message_log"])) ] env_return = ray.get(env.step.remote(to_env, batch["extra_env_info"])) - + rewards = env_return.rewards # update stats - if metric == "pass@1": - score += env_return.rewards.sum().item() + if metric == "pass@k": + score += eval_pass_k(rewards, num_tests_per_prompt, pass_k_value) else: raise ValueError(f"Invalid metric: {metric}") @@ -221,11 +259,11 @@ def run_env_eval(vllm_generation, 
dataloader, env, master_config): temperature = generation_config["temperature"] top_p = generation_config["top_p"] top_k = generation_config["top_k"] - average_score = score / count + average_score = score / len(dataloader.dataset) print("\n" + "=" * 60) print(f"{model_name=} {dataset_name=}") print(f"{max_new_tokens=} {temperature=} {top_p=} {top_k=}\n") - print(f"{metric=} {num_tests_per_prompt=}\n") - print(f"score={average_score:.4f} ({score}/{count})") + print(f"{metric=} {pass_k_value=} {num_tests_per_prompt=}\n") + print(f"score={average_score:.4f} ({score}/{len(dataloader.dataset)})") print("=" * 60 + "\n") From 8f444925d6d9c501e98acbd2bda6321bbd4ea15d Mon Sep 17 00:00:00 2001 From: Sahil Jain <48468750+SahilJain314@users.noreply.github.com> Date: Fri, 27 Jun 2025 21:33:21 -0700 Subject: [PATCH 26/44] fix: Megatron config fixes (#576) Signed-off-by: Sahil Jain Signed-off-by: Xuehan --- examples/configs/grpo_math_1B_megatron.yaml | 4 ++-- examples/configs/grpo_math_70B_megatron.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/configs/grpo_math_1B_megatron.yaml b/examples/configs/grpo_math_1B_megatron.yaml index 6b07317ed6..237fbb0df1 100644 --- a/examples/configs/grpo_math_1B_megatron.yaml +++ b/examples/configs/grpo_math_1B_megatron.yaml @@ -14,8 +14,8 @@ grpo: loss_fn: reference_policy_kl_penalty: 0.01 - ratio_eps_min: 0.2 - ratio_eps_max: 0.2 + ratio_clip_min: 0.2 + ratio_clip_max: 0.2 # (default off) loss formulation improvements (docs/guides/grpo.md#loss) use_on_policy_kl_approximation: false use_importance_sampling_correction: false diff --git a/examples/configs/grpo_math_70B_megatron.yaml b/examples/configs/grpo_math_70B_megatron.yaml index 4d071b3110..15a65c5ce6 100644 --- a/examples/configs/grpo_math_70B_megatron.yaml +++ b/examples/configs/grpo_math_70B_megatron.yaml @@ -68,4 +68,4 @@ policy: cluster: gpus_per_node: 8 - num_nodes: 1 + num_nodes: 8 From 39b8f25536b5a1bef9db34e0d341ba7b986ca6d7 Mon Sep 17 00:00:00 
2001 From: Xuehan Date: Mon, 30 Jun 2025 05:14:08 +0000 Subject: [PATCH 27/44] update docs for the new eval. Signed-off-by: Xuehan Xiong Signed-off-by: Xuehan --- docs/guides/eval.md | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/docs/guides/eval.md b/docs/guides/eval.md index 0281bb21f7..8648940b2e 100644 --- a/docs/guides/eval.md +++ b/docs/guides/eval.md @@ -25,7 +25,7 @@ Once the conversion is complete, you can override the `generation.model_name` to ### Prepare the Evaluation Configuration **Override with Custom Settings** -To run the evaluation, you can use the [default configuration file](../../examples/configs/eval.yaml). Alternatively, you can specify a custom one or override some settings via the command line. +To run the evaluation, you can use the [default configuration file](../../examples/configs/evals/eval.yaml). Alternatively, you can specify a custom one or override some settings via the command line. The default configuration employs greedy sampling to evaluate Qwen2.5-Math-1.5B-Instruct on AIME-2024. 
@@ -51,16 +51,22 @@ uv run python examples/run_eval.py generation.model_name=$PWD/results/grpo/hf # Run evaluation script with custom config file uv run python examples/run_eval.py --config path/to/custom_config.yaml +# Run evaluation script on one of the supported benchmarks (e.g., GPQA) +uv run python examples/run_eval.py --config examples/configs/evals/gpqa_eval.yaml + +# Run evaluation script with a local dataset +uv run python examples/run_eval.py --config examples/configs/evals/local_eval.yaml + # Override specific config values via command line # Example: Evaluation of DeepScaleR-1.5B-Preview on MATH-500 using 8 GPUs # Pass@1 accuracy averaged over 16 samples for each problem uv run python examples/run_eval.py \ + --config examples/configs/evals/math_eval.yaml \ generation.model_name=agentica-org/DeepScaleR-1.5B-Preview \ generation.temperature=0.6 \ generation.top_p=0.95 \ - generation.vllm_cfg.max_model_len=32768 \ - data.dataset_name=HuggingFaceH4/MATH-500 \ - data.dataset_key=test \ + generation.vllm_cfg.max_model_len=32768 \ + data.dataset_name="math500" \ eval.num_tests_per_prompt=16 \ cluster.gpus_per_node=8 ``` @@ -80,3 +86,12 @@ metric='pass@1' num_tests_per_prompt=1 score=0.1000 (3.0/30) ============================================================ ``` + +## List of currently supported benchmarks + +- [AIME-2024](../../nemo_rl/data/eval_datasets/aime2024.py) +- [GPQA and GPQA-diamond](../../nemo_rl/data/eval_datasets/gpqa.py) +- [MATH and MATH-500](../../nemo_rl/data/eval_datasets/math.py) +- [MMLU](../../nemo_rl/data/eval_datasets/mmlu.py) +- [MMLU-Pro](../../nemo_rl/data/eval_datasets/mmlu_pro.py) + From 8f6ac977385d576e6aeb1198b702dbf90724b030 Mon Sep 17 00:00:00 2001 From: Anna Shors Date: Sun, 29 Jun 2025 18:24:48 -0700 Subject: [PATCH 28/44] docs: move training backends section (#580) Signed-off-by: ashors1 Signed-off-by: Xuehan --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md 
b/README.md index 8f5b3b54e5..9f605b1b5c 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ - [📣 News](#-news) - [Features](#features) - [Prerequisites](#prerequisites) + - [Supported Training Backends](#training-backends) - [GRPO](#grpo) - [GRPO Single Node](#grpo-single-node) - [GRPO Multi-node](#grpo-multi-node) @@ -16,7 +17,6 @@ - [DPO](#dpo) - [DPO Single Node](#dpo-single-node) - [DPO Multi-node](#dpo-multi-node) - - [Supported Training Backends](#training-backends) - [Evaluation](#evaluation) - [Convert Model Format (Optional)](#convert-model-format-optional) - [Run Evaluation](#run-evaluation) @@ -122,6 +122,15 @@ uv venv - Ensure you have the necessary CUDA drivers and PyTorch installed compatible with your hardware. - **Reminder**: Don't forget to set your `HF_HOME`, `WANDB_API_KEY`, and `HF_DATASETS_CACHE` (if needed). You'll need to do a `huggingface-cli login` as well for Llama models. +## Training Backends + +NeMo RL supports multiple training backends to accommodate different model sizes and hardware configurations: + +- **DTensor (FSDP2)** - PyTorch's next-generation distributed training with improved memory efficiency +- **Megatron** - NVIDIA's high-performance training framework for scaling to large models (>100B parameters) + +The training backend is automatically determined based on your YAML configuration settings. For detailed information on backend selection, configuration, and examples, see the [Training Backends documentation](docs/design-docs/training-backends.md). + ## GRPO We have a reference GRPO experiment config set up trained for math benchmarks using the [OpenInstructMath2](https://huggingface.co/datasets/nvidia/OpenMathInstruct-2) dataset. 
@@ -324,15 +333,6 @@ sbatch \ ray.sub ``` -## Training Backends - -NeMo RL supports multiple training backends to accommodate different model sizes and hardware configurations: - -- **DTensor (FSDP2)** - PyTorch's next-generation distributed training with improved memory efficiency -- **Megatron** - NVIDIA's high-performance training framework for scaling to large models (>100B parameters) - -The training backend is automatically determined based on your YAML configuration settings. For detailed information on backend selection, configuration, and examples, see the [Training Backends documentation](docs/design-docs/training-backends.md). - ## Evaluation We provide evaluation tools to assess model capabilities. From 29753155c730931f6ea6ef67f4406d05002d93b4 Mon Sep 17 00:00:00 2001 From: Anna Shors Date: Fri, 27 Jun 2025 20:54:13 -0700 Subject: [PATCH 29/44] docs: Add a note on supported backends (#553) Signed-off-by: ashors1 Signed-off-by: Xuehan --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index 9f605b1b5c..6e86a9835e 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,7 @@ - [DPO](#dpo) - [DPO Single Node](#dpo-single-node) - [DPO Multi-node](#dpo-multi-node) + - [Supported Training Backends](#training-backends) - [Evaluation](#evaluation) - [Convert Model Format (Optional)](#convert-model-format-optional) - [Run Evaluation](#run-evaluation) @@ -333,6 +334,15 @@ sbatch \ ray.sub ``` +## Training Backends + +NeMo RL supports multiple training backends to accommodate different model sizes and hardware configurations: + +- **DTensor (FSDP2)** - PyTorch's next-generation distributed training with improved memory efficiency +- **Megatron** - NVIDIA's high-performance training framework for scaling to large models (>100B parameters) + +The training backend is automatically determined based on your YAML configuration settings. 
For detailed information on backend selection, configuration, and examples, see the [Training Backends documentation](docs/design-docs/training-backends.md). + ## Evaluation We provide evaluation tools to assess model capabilities. From 26f8fb227817c672a2437f7763fa150775d3118d Mon Sep 17 00:00:00 2001 From: Anna Shors Date: Sun, 29 Jun 2025 18:24:48 -0700 Subject: [PATCH 30/44] docs: move training backends section (#580) Signed-off-by: ashors1 Signed-off-by: Xuehan --- README.md | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/README.md b/README.md index 6e86a9835e..9f605b1b5c 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,6 @@ - [DPO](#dpo) - [DPO Single Node](#dpo-single-node) - [DPO Multi-node](#dpo-multi-node) - - [Supported Training Backends](#training-backends) - [Evaluation](#evaluation) - [Convert Model Format (Optional)](#convert-model-format-optional) - [Run Evaluation](#run-evaluation) @@ -334,15 +333,6 @@ sbatch \ ray.sub ``` -## Training Backends - -NeMo RL supports multiple training backends to accommodate different model sizes and hardware configurations: - -- **DTensor (FSDP2)** - PyTorch's next-generation distributed training with improved memory efficiency -- **Megatron** - NVIDIA's high-performance training framework for scaling to large models (>100B parameters) - -The training backend is automatically determined based on your YAML configuration settings. For detailed information on backend selection, configuration, and examples, see the [Training Backends documentation](docs/design-docs/training-backends.md). - ## Evaluation We provide evaluation tools to assess model capabilities. From 1055f5ea99c8949d89681e51db9b0d8d17e58e17 Mon Sep 17 00:00:00 2001 From: Xuehan Date: Mon, 30 Jun 2025 06:04:37 +0000 Subject: [PATCH 31/44] Update more docs for the new eval. 
Signed-off-by: Xuehan Xiong Signed-off-by: Xuehan --- README.md | 2 +- docs/guides/eval.md | 2 +- docs/guides/sft-openmathinstruct2.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 9f605b1b5c..6beca2f50b 100644 --- a/README.md +++ b/README.md @@ -377,7 +377,7 @@ uv run python examples/run_eval.py \ ``` > **Note:** Evaluation results may vary slightly due to various factors, such as sampling parameters, random seed, inference engine version, and inference engine settings. -Refer to `examples/configs/eval.yaml` for a full list of parameters that can be overridden. For an in-depth explanation of evaluation, refer to the [Evaluation documentation](docs/guides/eval.md). +Refer to `examples/configs/evals/eval.yaml` for a full list of parameters that can be overridden. For an in-depth explanation of evaluation, refer to the [Evaluation documentation](docs/guides/eval.md). ## Set Up Clusters diff --git a/docs/guides/eval.md b/docs/guides/eval.md index 8648940b2e..fc8cb19baf 100644 --- a/docs/guides/eval.md +++ b/docs/guides/eval.md @@ -42,7 +42,7 @@ We will use the `run_eval.py` script to run an evaluation using a model directly Note that the evaluation script only supports the Hugging Face format model. If you haven't converted your DCP format model, you should back to [Convert DCP to HF](#convert-dcp-to-hf-optional) and follow the guide to convert your model. 
```sh -# Run evaluation script with default config (examples/configs/eval.yaml) +# Run evaluation script with default config (examples/configs/evals/eval.yaml) uv run python examples/run_eval.py # Run evaluation script with converted model diff --git a/docs/guides/sft-openmathinstruct2.md b/docs/guides/sft-openmathinstruct2.md index 6698c12bc0..1228d42a7d 100644 --- a/docs/guides/sft-openmathinstruct2.md +++ b/docs/guides/sft-openmathinstruct2.md @@ -38,7 +38,7 @@ To evaluate on the [MATH-500 benchmark](https://huggingface.co/datasets/HuggingF ``` uv run examples/run_eval.py \ - --config=examples/configs/eval.yaml \ + --config=examples/configs/evals/eval.yaml \ generation.model_name=results/sft_openmathinstruct2/step_1855/hf \ tokenizer.name=meta-llama/Llama-3.1-8B-Instruct \ data.dataset_name=HuggingFaceH4/MATH-500 \ From aaa3eebd5bd2208ba38527b225b6825b2ddbf2a5 Mon Sep 17 00:00:00 2001 From: Xuehan Date: Wed, 2 Jul 2025 16:44:59 +0000 Subject: [PATCH 32/44] fix lint errors. Signed-off-by: Xuehan Xiong --- docs/guides/eval.md | 2 +- nemo_rl/data/eval_datasets/gpqa.py | 8 +++--- nemo_rl/data/processors.py | 1 - nemo_rl/environments/math_environment.py | 15 ++++++++--- nemo_rl/evals/answer_parsing.py | 6 +---- tests/unit/data/eval_datasets/test_gpqa.py | 6 +++-- tests/unit/data/eval_datasets/test_math.py | 6 +++-- tests/unit/data/eval_datasets/test_mmlu.py | 6 +++-- .../environments/test_math_environment.py | 26 ++++++++++++++----- 9 files changed, 49 insertions(+), 27 deletions(-) diff --git a/docs/guides/eval.md b/docs/guides/eval.md index fc8cb19baf..b4f97b8c64 100644 --- a/docs/guides/eval.md +++ b/docs/guides/eval.md @@ -54,7 +54,7 @@ uv run python examples/run_eval.py --config path/to/custom_config.yaml # Run evaluation script on one of the supported benchmarks (e.g., GPQA) uv run python examples/run_eval.py --config examples/configs/evals/gpqa_eval.yaml -# Run evaluation script with a local dataset +# Run evaluation script with a local dataset that is 
prefetched as a csv file. uv run python examples/run_eval.py --config examples/configs/evals/local_eval.yaml # Override specific config values via command line diff --git a/nemo_rl/data/eval_datasets/gpqa.py b/nemo_rl/data/eval_datasets/gpqa.py index 4eb05014c6..9cadceb49e 100644 --- a/nemo_rl/data/eval_datasets/gpqa.py +++ b/nemo_rl/data/eval_datasets/gpqa.py @@ -20,7 +20,7 @@ def __init__( self._rng = random.Random() self.rekeyed_ds = ds.map(self._rekey, remove_columns=ds.column_names) self.task_spec = TaskDataSpec( - task_name=f'GPQA_{variant}', + task_name=f"GPQA_{variant}", prompt_file=prompt_file, system_prompt_file=system_prompt_file, ) @@ -38,12 +38,12 @@ def _rekey(self, data: dict[str, Any]): correct_index = choices.index(data["Correct Answer"]) correct_answer = "ABCD"[correct_index] return { - 'question': data['Question'], - 'options': dict( + "question": data["Question"], + "options": dict( A=choices[0], B=choices[1], C=choices[2], D=choices[3], ), - 'answer': correct_answer, + "answer": correct_answer, } diff --git a/nemo_rl/data/processors.py b/nemo_rl/data/processors.py index 5fd35d4078..4d207abad8 100644 --- a/nemo_rl/data/processors.py +++ b/nemo_rl/data/processors.py @@ -11,7 +11,6 @@ # Example of a generic math data processor -# TaskDataProcessFnCallable def math_data_processor( datum_dict: dict[str, Any], task_data_spec: TaskDataSpec, diff --git a/nemo_rl/environments/math_environment.py b/nemo_rl/environments/math_environment.py index 3f2c7cf7af..8dd5247f1c 100644 --- a/nemo_rl/environments/math_environment.py +++ b/nemo_rl/environments/math_environment.py @@ -102,7 +102,6 @@ def verify( @ray.remote class MultichoiceVerifyWorker: - def verify( self, pred_responses: list[str], ground_truths: list[str] ) -> list[float]: @@ -120,10 +119,14 @@ def verify( response = answer_parsing.normalize_response(response) extracted_answer = None for answer_regex in answer_parsing.MULTILINGUAL_ANSWER_REGEXES: - regex = 
answer_parsing.MULTILINGUAL_ANSWER_PATTERN_TEMPLATE.format(answer_regex) + regex = answer_parsing.MULTILINGUAL_ANSWER_PATTERN_TEMPLATE.format( + answer_regex + ) match = re.search(regex, response) if match: - extracted_answer = answer_parsing.normalize_extracted_answer(match.group(1)) + extracted_answer = answer_parsing.normalize_extracted_answer( + match.group(1) + ) break score = 1.0 if extracted_answer == ground_truth else 0.0 results.append(score) @@ -139,7 +142,11 @@ class MathEnvironment(EnvironmentInterface): def __init__(self, cfg: MathEnvConfig): self.cfg = cfg self.num_workers = cfg["num_workers"] - worker_cls = MultichoiceVerifyWorker if cfg.get("verifier_type", "math") == "multichoice" else HFVerifyWorker + worker_cls = ( + MultichoiceVerifyWorker + if cfg.get("verifier_type", "math") == "multichoice" + else HFVerifyWorker + ) self.workers = [ worker_cls.options( # type: ignore # (decorated with @ray.remote) runtime_env={"py_executable": PY_EXECUTABLES.SYSTEM} diff --git a/nemo_rl/evals/answer_parsing.py b/nemo_rl/evals/answer_parsing.py index d4e2fddd6f..8b62026360 100644 --- a/nemo_rl/evals/answer_parsing.py +++ b/nemo_rl/evals/answer_parsing.py @@ -1,6 +1,5 @@ """Contains utility functions for answer parsing.""" - MULTILINGUAL_ANSWER_PATTERN_TEMPLATE = ( "(?i){}[ \t]*([A-D]|[أ-د]|[অ]|[ব]|[ড]|[ঢ]|[A]|[B]|[C]|[D])" ) @@ -73,10 +72,7 @@ def normalize_extracted_answer(extracted_answer: str) -> str: def normalize_response(response: str) -> str: - """ - Normalize the response by removing markdown and LaTeX formatting that may prevent a match. 
- """ - + """Normalize the response by removing markdown and LaTeX formatting that may prevent a match.""" return ( response.replace("**", "") .replace("$\\boxed{", "") diff --git a/tests/unit/data/eval_datasets/test_gpqa.py b/tests/unit/data/eval_datasets/test_gpqa.py index 033a11b6ff..3441f11974 100644 --- a/tests/unit/data/eval_datasets/test_gpqa.py +++ b/tests/unit/data/eval_datasets/test_gpqa.py @@ -36,5 +36,7 @@ def test_gpqa_dataset(): add_special_tokens=False, ) - assert default_templated == f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n{example["question"]}<|im_end|>\n" - + assert ( + default_templated + == f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n{example['question']}<|im_end|>\n" + ) diff --git a/tests/unit/data/eval_datasets/test_math.py b/tests/unit/data/eval_datasets/test_math.py index 7a524654fa..3bab184f1a 100644 --- a/tests/unit/data/eval_datasets/test_math.py +++ b/tests/unit/data/eval_datasets/test_math.py @@ -35,5 +35,7 @@ def test_math_dataset(): add_special_tokens=False, ) - assert default_templated == f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n{example["problem"]}<|im_end|>\n" - + assert ( + default_templated + == f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n{example['problem']}<|im_end|>\n" + ) diff --git a/tests/unit/data/eval_datasets/test_mmlu.py b/tests/unit/data/eval_datasets/test_mmlu.py index df5dabaef9..02c1936003 100644 --- a/tests/unit/data/eval_datasets/test_mmlu.py +++ b/tests/unit/data/eval_datasets/test_mmlu.py @@ -37,5 +37,7 @@ def test_mmlu_dataset(): add_special_tokens=False, ) - assert default_templated == f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\n<|im_start|>user\n{example["question"]}<|im_end|>\n" - + assert ( + default_templated + == f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n{example['question']}<|im_end|>\n" + ) diff --git a/tests/unit/environments/test_math_environment.py b/tests/unit/environments/test_math_environment.py index ed599bcd5e..b254f2ef5f 100644 --- a/tests/unit/environments/test_math_environment.py +++ b/tests/unit/environments/test_math_environment.py @@ -93,15 +93,24 @@ def basic_multichoice_test_data(): return { "message_log_batch": [ [ - {"role": "user", "content": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD"}, + { + "role": "user", + "content": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD", + }, {"role": "assistant", "content": "\nAnswer: C"}, ], [ - {"role": "user", "content": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD"}, + { + "role": "user", + "content": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD", + }, {"role": "assistant", "content": "\nAnswer: B"}, ], [ - {"role": "user", "content": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD"}, + { + "role": "user", + "content": "Answer the following multiple choice question. 
The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD", + }, {"role": "assistant", "content": "\nAnswer: D"}, ], ], @@ -197,7 +206,8 @@ def test_multichoice_env_step_basic(multichoice_env, basic_multichoice_test_data """Test basic functionality of MathEnvironment step with multichoice verifier.""" result = ray.get( multichoice_env.step.remote( - basic_multichoice_test_data["message_log_batch"], basic_multichoice_test_data["metadata"] + basic_multichoice_test_data["message_log_batch"], + basic_multichoice_test_data["metadata"], ) ) @@ -211,7 +221,9 @@ def test_multichoice_env_step_basic(multichoice_env, basic_multichoice_test_data assert all( obs["content"] == "Environment: correct" for obs in result.observations[:2] ), "The first two responses should be correct" - assert result.observations[2]["content"] == "Environment: incorrect", "The third response should be incorrect" + assert result.observations[2]["content"] == "Environment: incorrect", ( + "The third response should be incorrect" + ) # Check metadata assert len(result.metadata) == 3, "Should return metadata for all 3 messages" @@ -221,7 +233,9 @@ def test_multichoice_env_step_basic(multichoice_env, basic_multichoice_test_data # Check rewards and done flags assert result.rewards.shape == (3,), "Rewards should be a tensor of shape (3,)" - assert all(result.rewards[:2] == 1.0), "The first two rewards should be 1.0 for correct answers" + assert all(result.rewards[:2] == 1.0), ( + "The first two rewards should be 1.0 for correct answers" + ) assert result.rewards[2] == 0.0, "The thrid reward should be 0.0 for wrong answer" assert result.terminateds.shape == (3,), ( "Terminated flags should be a tensor of shape (3,)" From 0d77a158df5f4151dcc9246dac5321a83249a542 Mon Sep 17 00:00:00 2001 From: Xuehan Date: Wed, 2 Jul 2025 16:49:46 +0000 Subject: [PATCH 33/44] add missing copyright statements. 
Signed-off-by: Xuehan Xiong --- nemo_rl/data/eval_datasets/aime2024.py | 14 ++++++++++++++ nemo_rl/data/eval_datasets/gpqa.py | 14 ++++++++++++++ nemo_rl/data/eval_datasets/local_math_dataset.py | 14 ++++++++++++++ nemo_rl/data/eval_datasets/math.py | 14 ++++++++++++++ nemo_rl/data/eval_datasets/mmlu.py | 14 ++++++++++++++ nemo_rl/data/eval_datasets/mmlu_pro.py | 14 ++++++++++++++ nemo_rl/data/processors.py | 14 ++++++++++++++ 7 files changed, 98 insertions(+) diff --git a/nemo_rl/data/eval_datasets/aime2024.py b/nemo_rl/data/eval_datasets/aime2024.py index b73bd34dbf..9e585bb511 100644 --- a/nemo_rl/data/eval_datasets/aime2024.py +++ b/nemo_rl/data/eval_datasets/aime2024.py @@ -1,3 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """AIME 2024 dataset.""" from typing import Any, Optional diff --git a/nemo_rl/data/eval_datasets/gpqa.py b/nemo_rl/data/eval_datasets/gpqa.py index 9cadceb49e..f41efa136a 100644 --- a/nemo_rl/data/eval_datasets/gpqa.py +++ b/nemo_rl/data/eval_datasets/gpqa.py @@ -1,3 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """GPQA dataset and its variants.""" import random diff --git a/nemo_rl/data/eval_datasets/local_math_dataset.py b/nemo_rl/data/eval_datasets/local_math_dataset.py index d78b99565f..2810899b4a 100644 --- a/nemo_rl/data/eval_datasets/local_math_dataset.py +++ b/nemo_rl/data/eval_datasets/local_math_dataset.py @@ -1,3 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Local math dataset.""" from typing import Any, Literal, Optional diff --git a/nemo_rl/data/eval_datasets/math.py b/nemo_rl/data/eval_datasets/math.py index a1c489a148..290902657e 100644 --- a/nemo_rl/data/eval_datasets/math.py +++ b/nemo_rl/data/eval_datasets/math.py @@ -1,3 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Math dataset and its variants.""" from typing import Any, Literal, Optional diff --git a/nemo_rl/data/eval_datasets/mmlu.py b/nemo_rl/data/eval_datasets/mmlu.py index 86acbcc9a6..f8b75d3b56 100644 --- a/nemo_rl/data/eval_datasets/mmlu.py +++ b/nemo_rl/data/eval_datasets/mmlu.py @@ -1,3 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """MMLU dataset and its variants.""" from typing import Any, Optional diff --git a/nemo_rl/data/eval_datasets/mmlu_pro.py b/nemo_rl/data/eval_datasets/mmlu_pro.py index 4dd094e322..159d4d1738 100644 --- a/nemo_rl/data/eval_datasets/mmlu_pro.py +++ b/nemo_rl/data/eval_datasets/mmlu_pro.py @@ -1,3 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """MMLU-Pro dataset.""" from typing import Any, Optional diff --git a/nemo_rl/data/processors.py b/nemo_rl/data/processors.py index 4d207abad8..67e3658882 100644 --- a/nemo_rl/data/processors.py +++ b/nemo_rl/data/processors.py @@ -1,3 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Contains data processors for evaluation.""" from typing import Any, cast From 17fe4055d83ed6e3e0226aab0053d9d8abcacbfd Mon Sep 17 00:00:00 2001 From: Xuehan Date: Wed, 2 Jul 2025 16:49:46 +0000 Subject: [PATCH 34/44] add missing copyright statements. Signed-off-by: Xuehan Xiong --- nemo_rl/evals/answer_parsing.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/nemo_rl/evals/answer_parsing.py b/nemo_rl/evals/answer_parsing.py index 8b62026360..dcf020774a 100644 --- a/nemo_rl/evals/answer_parsing.py +++ b/nemo_rl/evals/answer_parsing.py @@ -1,3 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Contains utility functions for answer parsing.""" MULTILINGUAL_ANSWER_PATTERN_TEMPLATE = ( From cf828d627f941b250b32a748c02ba10c5399838b Mon Sep 17 00:00:00 2001 From: Shun Kiyono Date: Mon, 30 Jun 2025 13:25:18 +0900 Subject: [PATCH 35/44] docs: Add missing arguments to DeepScaler evaluation (#502) Signed-off-by: Shun Kiyono --- docs/guides/grpo-deepscaler.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/guides/grpo-deepscaler.md b/docs/guides/grpo-deepscaler.md index 456b2f2d8b..4404b42949 100644 --- a/docs/guides/grpo-deepscaler.md +++ b/docs/guides/grpo-deepscaler.md @@ -33,7 +33,9 @@ Throughout training, the checkpoints of the model will be saved to the `results` ```sh uv run examples/run_eval.py \ - generation.model_name=results/grpo-deepscaler-1.5b-8K/step_240/hf + generation.model_name=results/grpo-deepscaler-1.5b-8K/step_240/hf \ + data.prompt_file=examples/prompts/cot.txt \ + generation.vllm_cfg.max_model_len=32768 ``` Use `generation.model_name` to specify the path to the Hugging Face checkpoint. In addition, we use AIME24 as the validation dataset and calculate pass@1 on it throughout training. 
From 01c384024b3897b1e90b5f6aa82c2c9c733c9437 Mon Sep 17 00:00:00 2001 From: Wei Du Date: Mon, 30 Jun 2025 16:40:16 -0500 Subject: [PATCH 36/44] fix: prevent divisible error by dropping last batch in loader (#583) Signed-off-by: Wei Du --- nemo_rl/algorithms/grpo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo_rl/algorithms/grpo.py b/nemo_rl/algorithms/grpo.py index ea99de1538..3bcad52849 100644 --- a/nemo_rl/algorithms/grpo.py +++ b/nemo_rl/algorithms/grpo.py @@ -191,6 +191,7 @@ def setup( batch_size=grpo_config["num_prompts_per_step"], shuffle=False, collate_fn=rl_collate_fn, + drop_last=True, ) if last_checkpoint_path is not None: dataloader_state_dict = torch.load( From 658437d80c34630a4b5f285ca94b0e8de3ad5fa7 Mon Sep 17 00:00:00 2001 From: yuki <48991475+yuki-666@users.noreply.github.com> Date: Tue, 1 Jul 2025 05:48:53 +0800 Subject: [PATCH 37/44] feat: improve worker group args/kwargs (#539) Signed-off-by: Yuki Huang --- nemo_rl/distributed/worker_groups.py | 104 ++++++++++++++++--- nemo_rl/models/generation/vllm.py | 42 ++++---- nemo_rl/models/policy/lm_policy.py | 8 +- tests/unit/distributed/test_worker_groups.py | 92 ++++++++++++++-- 4 files changed, 195 insertions(+), 51 deletions(-) diff --git a/nemo_rl/distributed/worker_groups.py b/nemo_rl/distributed/worker_groups.py index a283e6b18c..c2e849cbee 100644 --- a/nemo_rl/distributed/worker_groups.py +++ b/nemo_rl/distributed/worker_groups.py @@ -15,13 +15,12 @@ import os from copy import deepcopy from dataclasses import dataclass -from typing import Any, Iterable, Optional, Union +from typing import Any, Optional, Union import ray from ray.util.placement_group import PlacementGroup from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy -from nemo_rl.distributed.batched_data_dict import SlicedDataDict from nemo_rl.distributed.named_sharding import NamedSharding from nemo_rl.distributed.ray_actor_environment_registry import ( get_actor_python_env, @@ -583,6 +582,12 @@ def 
run_single_worker_single_data( Returns: ray.ObjectRef: A Ray future for the result. """ + assert len(args) == 0, ( + "run_single_worker_single_data will fail with args under certain circumstances. " + "Please use kwargs instead. " + "See https://github.com/NVIDIA-NeMo/RL/issues/582 for more details." + ) + worker = self.workers[worker_idx] method = getattr(worker, method_name) return method.remote(*args, **kwargs) @@ -590,25 +595,62 @@ def run_single_worker_single_data( def run_all_workers_multiple_data( self, method_name: str, - data: list[Any], + *args, run_rank_0_only_axes: list[str] | None = None, common_kwargs: Optional[dict[str, Any]] = None, + **kwargs, ) -> list[ray.ObjectRef]: """Run a method on all workers in parallel with different data. Args: method_name: Name of the method to call on each worker - data: List of data to pass to workers/groups + *args: List of arguments to pass to workers/groups + e.g. [[arg1_for_worker_1, arg1_for_worker_2], [arg2_for_worker_1, arg2_for_worker_2]] run_rank_0_only_axes: List of named axes for which only rank 0 should run the method. - common_kwargs: Additional keyword arguments to pass to all workers + common_kwargs: Keyword arguments to pass to all workers + **kwargs: Keyword arguments to pass to workers/groups + e.g. {"key1": [value_for_worker_1, value_for_worker_2], "key2": [value_for_worker_1, value_for_worker_2]} Returns: list[ray.ObjectRef]: A list of ray futures """ + assert len(args) == 0, ( + "run_all_workers_multiple_data will fail with args under certain circumstances. " + "Please use kwargs instead. " + "See https://github.com/NVIDIA-NeMo/RL/issues/582 for more details." + ) + + # Check at least one arg or kwarg is provided + assert len(args) > 0 or len(kwargs) > 0, ( + "At least one args (positional arguments) or kwargs (keyword arguments) must be provided in run_all_workers_multiple_data. " + "Otherwise, please use run_all_workers_single_data." 
+ ) + + # Check all args and kwargs have the same length + args_count = [len(arg) for arg in args] + assert all(count == args_count[0] for count in args_count), ( + "All args must have the same length" + ) + args_count = args_count[0] if len(args_count) > 0 else 0 + + kwargs_count = [len(value) for value in kwargs.values()] + assert all(count == kwargs_count[0] for count in kwargs_count), ( + "All kwargs must have the same length" + ) + kwargs_count = kwargs_count[0] if len(kwargs_count) > 0 else 0 + + if args_count > 0 and kwargs_count > 0: + assert args_count == kwargs_count, ( + "The number of args and kwargs must be the same in run_all_workers_multiple_data. " + f"args length = {args_count}, kwargs length = {kwargs_count}" + ) + data_count = max(args_count, kwargs_count) + + # Check the data length is equal to the number of workers if run_rank_0_only_axes is None: - assert len(data) == len(self.workers), ( + assert data_count == len(self.workers), ( "data length should be equal to the number of workers: " - f"data length = {len(data)}, number of workers = {len(self.workers)}" + f"data length = {data_count}, number of workers = {len(self.workers)}" ) futures = [] @@ -633,12 +675,16 @@ def run_all_workers_multiple_data( if should_run: method = getattr(worker, method_name) - futures.append(method.remote(data=data[data_idx], **common_kwargs)) + worker_args = [arg[data_idx] for arg in args] + worker_kwargs = {key: value[data_idx] for key, value in kwargs.items()} + futures.append( + method.remote(*worker_args, **worker_kwargs, **common_kwargs) + ) data_idx += 1 - assert data_idx == len(data), ( + assert data_idx == data_count, ( "data length should be equal to the number of workers started: " - f"data length = {len(data)}, number of workers started = {data_idx}" + f"data length = {data_count}, number of workers started = {data_idx}" ) return futures @@ -660,6 +706,12 @@ def run_all_workers_single_data( Returns: list[ray.ObjectRef]: A list of ray futures """ + assert 
len(args) == 0, ( + "run_all_workers_single_data will fail with args under certain circumstances. " + "Please use kwargs instead. " + "See https://github.com/NVIDIA-NeMo/RL/issues/582 for more details." + ) + futures = [] if run_rank_0_only_axes is None: @@ -686,12 +738,13 @@ def run_all_workers_single_data( def run_all_workers_sharded_data( self, method_name: str, - data: Iterable[SlicedDataDict], # arbitrary nested iterables of SlicedDataDicts + *args, in_sharded_axes: list[str] | None = None, replicate_on_axes: list[str] | None = None, output_is_replicated: list[str] | None = None, make_dummy_calls_to_free_axes: bool = False, common_kwargs: Optional[dict[str, Any]] = None, + **kwargs, ) -> MultiWorkerFuture: """Run a method on all workers in parallel with sharded data. @@ -701,17 +754,27 @@ def run_all_workers_sharded_data( Args: method_name: Name of the method to call on each worker - data: Iterable of SlicedDataDicts to pass to workers/groups + *args: List of arguments to pass to workers/groups + e.g. [[arg1_for_worker_1, arg1_for_worker_2], [arg2_for_worker_1, arg2_for_worker_2]] in_sharded_axes: List of axes that are sharded replicate_on_axes: List of axes that are to be replicated output_is_replicated: List of axes along which the output is replicated (and we should just return the first result). We also just return from rank 0 of free axes. make_dummy_calls_to_free_axes: Whether to make dummy calls (with None) to workers that aren't rank 0 on 'free axes' (axes not in in_sharded_axes or replicate_on_axes). - common_kwargs: Additional keyword arguments to pass to all workers + common_kwargs: Keyword arguments to pass to all workers + **kwargs: Keyword arguments to pass to workers/groups + e.g. 
{"key1": [value_for_worker_1, value_for_worker_2], "key2": [value_for_worker_1, value_for_worker_2]} + Returns: MultiWorkerFuture: Object containing futures and their associated worker information """ + assert len(args) == 0, ( + "run_all_workers_sharded_data will fail with args under certain circumstances. " + "Please use kwargs instead. " + "See https://github.com/NVIDIA-NeMo/RL/issues/582 for more details." + ) + if self.sharding_annotations is None: raise ValueError( "Sharding annotations must be provided to use sharded data distribution" @@ -771,15 +834,20 @@ def run_all_workers_sharded_data( if should_receive_data: # Find the appropriate data slice for this worker - worker_data = data + worker_args = args + worker_kwargs = kwargs for axis in in_sharded_axes: if axis in worker_coords: # Select the appropriate slice for this axis - worker_data = worker_data[worker_coords[axis]] + worker_args = [arg[worker_coords[axis]] for arg in worker_args] + worker_kwargs = { + key: value[worker_coords[axis]] + for key, value in worker_kwargs.items() + } # Call the method on the worker with its data slice future = getattr(worker, method_name).remote( - data=worker_data, **common_kwargs + *worker_args, **worker_kwargs, **common_kwargs ) futures.append(future) called_workers.append(worker_idx) @@ -787,8 +855,10 @@ def run_all_workers_sharded_data( # If this worker doesn't need data: if make_dummy_calls_to_free_axes: # If make_dummy_calls_to_free_axes is True, just call the method with None + worker_args = [None] * len(args) + worker_kwargs = {key: None for key in kwargs.keys()} future = getattr(worker, method_name).remote( - data=None, **common_kwargs + *worker_args, **worker_kwargs, **common_kwargs ) futures.append(future) called_workers.append(worker_idx) diff --git a/nemo_rl/models/generation/vllm.py b/nemo_rl/models/generation/vllm.py index f0cd5eb50b..7dbfbd3ea8 100644 --- a/nemo_rl/models/generation/vllm.py +++ b/nemo_rl/models/generation/vllm.py @@ -348,11 +348,13 @@ 
def _patch_vllm_init_workers_ray(): else: self.llm = vllm.LLM(**llm_kwargs) - def init_collective(self, data: int, ip: str, port: int, world_size: int) -> None: + def init_collective( + self, rank_prefix: int, ip: str, port: int, world_size: int + ) -> None: self.llm.collective_rpc( "init_collective", args=( - data, + rank_prefix, ip, port, world_size, @@ -360,12 +362,12 @@ def init_collective(self, data: int, ip: str, port: int, world_size: int) -> Non ) async def init_collective_async( - self, data: int, ip: str, port: int, world_size: int + self, rank_prefix: int, ip: str, port: int, world_size: int ) -> None: await self.llm.collective_rpc( "init_collective", args=( - data, + rank_prefix, ip, port, world_size, @@ -903,11 +905,11 @@ async def report_device_id_async(self) -> list[str]: return cast(list[str], list_of_worker_results) - def update_weights_from_ipc_handles(self, data: dict[str, Any]) -> bool: + def update_weights_from_ipc_handles(self, ipc_handles: dict[str, Any]) -> bool: """Update weights from IPC handles by delegating to the vLLM Worker implementation. Args: - data (dict): Dictionary mapping device UUIDs (str) to parameter IPC handles. + ipc_handles (dict): Dictionary mapping device UUIDs (str) to parameter IPC handles. Returns: bool: True if weights were successfully updated, False otherwise. @@ -923,7 +925,7 @@ def update_weights_from_ipc_handles(self, data: dict[str, Any]) -> bool: ) result_or_coro = self.llm.collective_rpc( - "update_weights_from_ipc_handles", args=(data,) + "update_weights_from_ipc_handles", args=(ipc_handles,) ) worker_result = result_or_coro[0] @@ -940,11 +942,13 @@ def update_weights_from_ipc_handles(self, data: dict[str, Any]) -> bool: traceback.print_exc() return False - async def update_weights_from_ipc_handles_async(self, data: dict[str, Any]) -> bool: + async def update_weights_from_ipc_handles_async( + self, ipc_handles: dict[str, Any] + ) -> bool: """Async version of update_weights_from_ipc_handles. 
Args: - data (dict): Dictionary mapping device UUIDs (str) to parameter IPC handles. + ipc_handles (dict): Dictionary mapping device UUIDs (str) to parameter IPC handles. Returns: bool: True if weights were successfully updated, False otherwise. @@ -960,7 +964,7 @@ async def update_weights_from_ipc_handles_async(self, data: dict[str, Any]) -> b ) result_or_coro = await self.llm.collective_rpc( - "update_weights_from_ipc_handles", args=(data,) + "update_weights_from_ipc_handles", args=(ipc_handles,) ) if asyncio.iscoroutine(result_or_coro): @@ -983,7 +987,7 @@ async def update_weights_from_ipc_handles_async(self, data: dict[str, Any]) -> b traceback.print_exc() return False - def update_weights_from_collective(self, data: dict[str, Any]) -> bool: + def update_weights_from_collective(self, info: dict[str, Any]) -> bool: """Update the model weights from collective communication.""" try: assert self.llm is not None, ( @@ -996,7 +1000,7 @@ def update_weights_from_collective(self, data: dict[str, Any]) -> bool: ) result_or_coro = self.llm.collective_rpc( - "update_weights_from_collective", args=(data,) + "update_weights_from_collective", args=(info,) ) worker_result = result_or_coro[0] @@ -1013,7 +1017,7 @@ def update_weights_from_collective(self, data: dict[str, Any]) -> bool: traceback.print_exc() return False - async def update_weights_from_collective_async(self, data: dict[str, Any]) -> bool: + async def update_weights_from_collective_async(self, info: dict[str, Any]) -> bool: """Async version of update_weights_from_collective.""" try: assert self.llm is not None, ( @@ -1026,7 +1030,7 @@ async def update_weights_from_collective_async(self, data: dict[str, Any]) -> bo ) result_or_coro = await self.llm.collective_rpc( - "update_weights_from_collective", args=(data,) + "update_weights_from_collective", args=(info,) ) if asyncio.iscoroutine(result_or_coro): @@ -1403,7 +1407,7 @@ def init_collective( # Send world_size and rank for init collective to all workers futures = 
self.worker_group.run_all_workers_multiple_data( method_name, - data=rank_prefix_list, + rank_prefix=rank_prefix_list, run_rank_0_only_axes=["tensor_parallel", "pipeline_parallel"], common_kwargs={"ip": ip, "port": port, "world_size": world_size}, ) @@ -1429,7 +1433,7 @@ def generate( ) future_bundle = self.worker_group.run_all_workers_sharded_data( "generate", - sharded_data, + data=sharded_data, in_sharded_axes=["data_parallel"], replicate_on_axes=None, # just run on tp rank 0 output_is_replicated=None, @@ -1474,7 +1478,7 @@ def generate_text( ) future_bundle = self.worker_group.run_all_workers_sharded_data( "generate_text", - sharded_data, + data=sharded_data, in_sharded_axes=["data_parallel"], replicate_on_axes=None, # just run on tp rank 0 output_is_replicated=None, @@ -1708,7 +1712,7 @@ def update_weights(self, ipc_handles: dict[str, Any]) -> bool: # Directly pass ipc_handles to the method futures = self.worker_group.run_all_workers_multiple_data( method_name, - ipc_handles_list, + ipc_handles=ipc_handles_list, run_rank_0_only_axes=["tensor_parallel", "pipeline_parallel"], ) # Wait for all futures to complete @@ -1735,7 +1739,7 @@ def update_weights_from_collective( # Use run_all_workers_single_data to send data to all workers futures = self.worker_group.run_all_workers_single_data( method_name, - data=info, + info=info, run_rank_0_only_axes=["tensor_parallel", "pipeline_parallel"], ) diff --git a/nemo_rl/models/policy/lm_policy.py b/nemo_rl/models/policy/lm_policy.py index 4d967a4cba..e469b32d16 100644 --- a/nemo_rl/models/policy/lm_policy.py +++ b/nemo_rl/models/policy/lm_policy.py @@ -207,7 +207,7 @@ def get_logprobs( futures = self.worker_group.run_all_workers_sharded_data( "get_logprobs", - sharded_data_2d, + data=sharded_data_2d, in_sharded_axes=["data_parallel", "context_parallel"], replicate_on_axes=["tensor_parallel", "pipeline_parallel"], output_is_replicated=["tensor_parallel", "pipeline_parallel"], @@ -263,7 +263,7 @@ def 
get_reference_policy_logprobs( futures = self.worker_group.run_all_workers_sharded_data( "get_reference_policy_logprobs", - sharded_data_2d, + data=sharded_data_2d, in_sharded_axes=["data_parallel", "context_parallel"], replicate_on_axes=["tensor_parallel", "pipeline_parallel"], output_is_replicated=["tensor_parallel", "pipeline_parallel"], @@ -313,7 +313,7 @@ def train( # Train each shard in parallel futures = self.worker_group.run_all_workers_sharded_data( "train", - sharded_data, + data=sharded_data, in_sharded_axes=["data_parallel"], replicate_on_axes=[ "context_parallel", @@ -365,7 +365,7 @@ def generate( sharded_data = data.shard_by_batch_size(dp_size, batch_size=None) futures = self.worker_group.run_all_workers_sharded_data( "generate", - sharded_data, + data=sharded_data, in_sharded_axes=["data_parallel"], replicate_on_axes=["tensor_parallel", "pipeline_parallel"], output_is_replicated=["tensor_parallel", "pipeline_parallel"], diff --git a/tests/unit/distributed/test_worker_groups.py b/tests/unit/distributed/test_worker_groups.py index 53b6133c69..12131fe4a4 100644 --- a/tests/unit/distributed/test_worker_groups.py +++ b/tests/unit/distributed/test_worker_groups.py @@ -328,6 +328,48 @@ def test_configure_worker_interaction(register_test_actor, virtual_cluster): worker_group.shutdown(force=True) +def test_run_single_worker_single_data(worker_group_1d_sharding): + worker_group = worker_group_1d_sharding + assert len(worker_group.workers) == 2 + ray.get([w.reset_call_records.remote() for w in worker_group.workers]) + + data_for_worker0 = SlicedDataDict({"id": 0, "val": "w0_val"}) + data_for_worker1 = SlicedDataDict({"id": 1, "val": "w1_val"}) + + # pass through args + # due to https://github.com/NVIDIA-NeMo/RL/issues/582, args are not supported. 
+ with pytest.raises(AssertionError): + future_0 = worker_group.run_single_worker_single_data( + "record_call", 0, data_for_worker0 + ) + future_1 = worker_group.run_single_worker_single_data( + "record_call", 1, data_for_worker1 + ) + ray.get([future_0, future_1]) + + # pass through kwargs + future_0 = worker_group.run_single_worker_single_data( + "record_call", 0, data=data_for_worker0 + ) + future_1 = worker_group.run_single_worker_single_data( + "record_call", 1, data=data_for_worker1 + ) + results = ray.get([future_0, future_1]) + assert len(results) == 2 + + # Check worker 0 + d, args, _, count = ray.get(worker_group.workers[0].get_recorded_data.remote()) + assert count == 1 + assert d == data_for_worker0 + assert args == () + + # Check worker 1 + d, args, _, count = ray.get(worker_group.workers[1].get_recorded_data.remote()) + assert count == 1 + assert d == data_for_worker1 + assert args == () + + def test_run_all_workers_single_data_1d_sharding(worker_group_1d_sharding): worker_group = worker_group_1d_sharding assert len(worker_group.workers) == 2 @@ -339,17 +381,26 @@ def test_run_all_workers_single_data_1d_sharding(worker_group_1d_sharding): test_arg1 = "arg_single" test_kwarg1 = "kwarg_single_val" + # pass through args + # due to https://github.com/NVIDIA-NeMo/RL/issues/582, args are not supported. 
+ with pytest.raises(AssertionError): + futures = worker_group.run_all_workers_single_data( + "record_call", test_data, test_arg1 + ) + ray.get(futures) + + # pass through kwargs futures = worker_group.run_all_workers_single_data( - "record_call", test_data, test_arg1, kwarg1=test_kwarg1 + "record_call", data=test_data, kwarg1=test_kwarg1 ) results = ray.get(futures) assert len(results) == 2 # Should run on all 2 workers - for i, worker in enumerate(worker_group.workers): + for worker in worker_group.workers: data, args, kwargs, count = ray.get(worker.get_recorded_data.remote()) assert count == 1 assert data == test_data - assert args == (test_arg1,) + assert args == () assert kwargs == {"kwarg1": test_kwarg1} @@ -359,7 +410,7 @@ def test_run_all_workers_single_data_2d_sharding_no_filter(worker_group_2d_shard ray.get([w.reset_call_records.remote() for w in worker_group.workers]) test_data = SlicedDataDict({"key": "value_2d_no_filter"}) - futures = worker_group.run_all_workers_single_data("record_call", test_data) + futures = worker_group.run_all_workers_single_data("record_call", data=test_data) results = ray.get(futures) assert len(results) == 4 # Runs on all 4 workers @@ -377,7 +428,7 @@ def test_run_all_workers_single_data_2d_sharding_filter_tp(worker_group_2d_shard test_data = SlicedDataDict({"key": "value_2d_filter_tp"}) # Only run on tp rank 0 for each dp rank futures = worker_group.run_all_workers_single_data( - "record_call", test_data, run_rank_0_only_axes=["tp"] + "record_call", data=test_data, run_rank_0_only_axes=["tp"] ) results = ray.get(futures) assert len(results) == 2 # Runs on 2 workers (dp0-tp0, dp1-tp0) @@ -403,7 +454,7 @@ def test_run_all_workers_single_data_2d_sharding_filter_dp_tp(worker_group_2d_sh test_data = SlicedDataDict({"key": "value_2d_filter_dp_tp"}) # Only run on dp rank 0 AND tp rank 0 futures = worker_group.run_all_workers_single_data( - "record_call", test_data, run_rank_0_only_axes=["dp", "tp"] + "record_call", data=test_data, 
run_rank_0_only_axes=["dp", "tp"] ) results = ray.get(futures) assert len(results) == 1 # Runs on 1 worker (dp0-tp0) @@ -430,8 +481,17 @@ def test_run_all_workers_multiple_data_1d_sharding(worker_group_1d_sharding): multi_data = [data_for_worker0, data_for_worker1] common_arg = "common_arg_multi" + # pass through args + # due to https://github.com/NVIDIA-NeMo/RL/issues/582, args are not supported. + with pytest.raises(AssertionError): + futures = worker_group.run_all_workers_multiple_data( + "record_call", multi_data, common_kwargs={"common": common_arg} + ) + ray.get(futures) + + # pass through kwargs futures = worker_group.run_all_workers_multiple_data( - "record_call", multi_data, common_kwargs={"common": common_arg} + "record_call", data=multi_data, common_kwargs={"common": common_arg} ) results = ray.get(futures) assert len(results) == 2 @@ -462,10 +522,11 @@ def test_run_all_workers_multiple_data_fewer_data_than_workers( data_for_worker1 = SlicedDataDict({"id": 1}) multi_data = [data_for_worker0, data_for_worker1] # Only 2 data items - with pytest.raises( - AssertionError, match="data length should be equal to the number of workers: " - ): - futures = worker_group.run_all_workers_multiple_data("record_call", multi_data) + with pytest.raises(AssertionError): + futures = worker_group.run_all_workers_multiple_data( + "record_call", data=multi_data + ) + ray.get(futures) def test_run_all_workers_sharded_data_1d(worker_group_1d_sharding): @@ -479,6 +540,15 @@ def test_run_all_workers_sharded_data_1d(worker_group_1d_sharding): SlicedDataDict({"shard": 1, "val": "val1"}), ] + # pass through args + # due to https://github.com/NVIDIA-NeMo/RL/issues/582, args are not supported. 
+ with pytest.raises(AssertionError): + future_bundle = worker_group.run_all_workers_sharded_data( + "record_call", sharded_data_input, in_sharded_axes=["data"] + ) + worker_group.get_all_worker_results(future_bundle) + + # pass through kwargs future_bundle = worker_group.run_all_workers_sharded_data( "record_call", data=sharded_data_input, in_sharded_axes=["data"] ) From 2eb0301dc95be011f0e37d65bc2ec96f96dd7e32 Mon Sep 17 00:00:00 2001 From: Anna Shors Date: Mon, 30 Jun 2025 16:18:14 -0700 Subject: [PATCH 38/44] fix: update gemma3 prefix (#585) Signed-off-by: ashors1 --- nemo_rl/models/dtensor/parallelize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_rl/models/dtensor/parallelize.py b/nemo_rl/models/dtensor/parallelize.py index 664bc1a253..fb9c720c20 100644 --- a/nemo_rl/models/dtensor/parallelize.py +++ b/nemo_rl/models/dtensor/parallelize.py @@ -92,7 +92,7 @@ def _parallelize_gemma3( Tensor parallelism is not supported for Gemma3 models because of tied word embeddings. 
""" if isinstance(model, Gemma3ForConditionalGeneration): - model_prefix = "language_model.model" + model_prefix = "language_model" else: model_prefix = "model" @@ -399,7 +399,7 @@ def _parallelize_model( """ model_cls = type(model) if model_cls == Gemma3ForConditionalGeneration: - layers: torch.nn.ModuleList = model.language_model.model.layers # type: ignore + layers: torch.nn.ModuleList = model.language_model.layers # type: ignore num_attention_heads = model.config.text_config.num_attention_heads num_key_value_heads = model.config.text_config.num_key_value_heads else: From bc234a3c605930b4e334006ccf88af360cedc976 Mon Sep 17 00:00:00 2001 From: Sahil Jain <48468750+SahilJain314@users.noreply.github.com> Date: Mon, 30 Jun 2025 20:45:02 -0700 Subject: [PATCH 39/44] fix: Added copyright to functest (#584) Signed-off-by: Sahil Jain --- tests/functional/test_converter_roundtrip.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/functional/test_converter_roundtrip.py b/tests/functional/test_converter_roundtrip.py index e551d0e6b5..90756a2f18 100644 --- a/tests/functional/test_converter_roundtrip.py +++ b/tests/functional/test_converter_roundtrip.py @@ -1,4 +1,19 @@ #!/usr/bin/env python3 + +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """ Functional test for converter roundtrip functionality. 
From 2d876dec7ca5342a67deaa20f67af873a7a6f3d1 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Tue, 1 Jul 2025 19:25:17 -0500 Subject: [PATCH 40/44] chore: Update github url after org transfer (#512) Signed-off-by: Charlie Truong Signed-off-by: Sahil Jain <48468750+SahilJain314@users.noreply.github.com> Co-authored-by: Sahil Jain <48468750+SahilJain314@users.noreply.github.com> --- .github/PULL_REQUEST_TEMPLATE.md | 8 ++--- CONTRIBUTING.md | 2 +- README.md | 10 +++--- docs/adding-new-models.md | 32 +++++++++---------- docs/model-quirks.md | 2 +- examples/configs/grpo-deepscaler-1.5b-8K.yaml | 2 +- examples/configs/grpo_math_1B.yaml | 2 +- examples/configs/grpo_sliding_puzzle.yaml | 4 +-- ...-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml | 2 +- examples/converters/convert_dcp_to_hf.py | 2 +- nemo_rl/algorithms/dpo.py | 2 +- .../ray_actor_environment_registry.py | 2 +- nemo_rl/models/generation/vllm.py | 2 +- .../models/policy/dtensor_policy_worker.py | 4 +-- nemo_rl/models/policy/fsdp1_policy_worker.py | 6 ++-- nemo_rl/package_info.py | 4 +-- nemo_rl/utils/native_checkpoint.py | 2 +- pyproject.toml | 2 +- tests/functional/dpo.sh | 2 +- tests/functional/test_converter_roundtrip.py | 13 ++++++++ ...ma3.1-8b-instruct-1n8g-fsdp2tp1-long.v2.sh | 2 +- ...llama3.1-8b-instruct-1n8g-fsdp2tp2sp.v2.sh | 2 +- ...-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v2.sh | 2 +- .../models/generation/test_vllm_generation.py | 2 +- tests/unit/utils/test_native_checkpoint.py | 2 +- uv.lock | 4 +-- 26 files changed, 66 insertions(+), 53 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 3e3e4fb3fe..b83ec70073 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -10,15 +10,15 @@ List issues that this PR closes ([syntax](https://docs.github.com/en/issues/trac * **You can potentially add a usage example below** ```python -# Add a code snippet demonstrating how to use this +# Add a code snippet demonstrating how to use 
this ``` # Before your PR is "Ready for review" **Pre checks**: -- [ ] Make sure you read and followed [Contributor guidelines](/NVIDIA/NeMo-RL/blob/main/CONTRIBUTING.md) +- [ ] Make sure you read and followed [Contributor guidelines](/NVIDIA-NeMo/RL/blob/main/CONTRIBUTING.md) - [ ] Did you write any new necessary tests? -- [ ] Did you run the unit tests and functional tests locally? Visit our [Testing Guide](/NVIDIA/NeMo-RL/blob/main/docs/testing.md) for how to run tests -- [ ] Did you add or update any necessary documentation? Visit our [Document Development Guide](/NVIDIA/NeMo-RL/blob/main/docs/documentation.md) for how to write, build and test the docs. +- [ ] Did you run the unit tests and functional tests locally? Visit our [Testing Guide](/NVIDIA-NeMo/RL/blob/main/docs/testing.md) for how to run tests +- [ ] Did you add or update any necessary documentation? Visit our [Document Development Guide](/NVIDIA-NeMo/RL/blob/main/docs/documentation.md) for how to write, build and test the docs. # Additional Information * ... diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2cc6a3051b..3dc065655a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -31,7 +31,7 @@ We follow a direct clone and branch workflow for now: 1. Clone the repository directly: ```bash - git clone https://github.com/NVIDIA/NeMo-RL + git clone https://github.com/NVIDIA-NeMo/RL cd nemo-rl ``` diff --git a/README.md b/README.md index 6beca2f50b..4dc2f7395f 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ cd nemo-rl # by running (This is not necessary if you are using the pure Pytorch/DTensor path): git submodule update --init --recursive -# Different branches of the repo can have different pinned versions of these third-party submodules. Ensure +# Different branches of the repo can have different pinned versions of these third-party submodules. 
Ensure # submodules are automatically updated after switching branches or pulling updates by configuring git with: # git config submodule.recurse true @@ -226,7 +226,7 @@ sbatch \ We also support multi-turn generation and training (tool use, games, etc.). Reference example for training to play a Sliding Puzzle Game: ```sh -uv run python examples/run_grpo_sliding_puzzle.py +uv run python examples/run_grpo_sliding_puzzle.py ``` ## Supervised Fine-Tuning (SFT) @@ -409,7 +409,7 @@ If you use NeMo RL in your research, please cite it using the following BibTeX e ```bibtex @misc{nemo-rl, title = {NeMo RL: A Scalable and Efficient Post-Training Library}, -howpublished = {\url{https://github.com/NVIDIA/NeMo-RL}}, +howpublished = {\url{https://github.com/NVIDIA-NeMo/RL}}, year = {2025}, note = {GitHub repository}, } @@ -417,8 +417,8 @@ note = {GitHub repository}, ## Contributing -We welcome contributions to NeMo RL\! Please see our [Contributing Guidelines](https://github.com/NVIDIA/NeMo-RL/blob/main/CONTRIBUTING.md) for more information on how to get involved. +We welcome contributions to NeMo RL\! Please see our [Contributing Guidelines](https://github.com/NVIDIA-NeMo/RL/blob/main/CONTRIBUTING.md) for more information on how to get involved. ## Licenses -NVIDIA NeMo RL is licensed under the [Apache License 2.0](https://github.com/NVIDIA/NeMo-RL/blob/main/LICENSE). +NVIDIA NeMo RL is licensed under the [Apache License 2.0](https://github.com/NVIDIA-NeMo/RL/blob/main/LICENSE). 
diff --git a/docs/adding-new-models.md b/docs/adding-new-models.md index 155a012f47..e0de97ae40 100644 --- a/docs/adding-new-models.md +++ b/docs/adding-new-models.md @@ -12,7 +12,7 @@ $$\text{KL} = E_{x \sim \pi}[\pi(x) - \pi_{\text{ref}}(x)]$$ When summed/integrated, replacing the $x \sim \pi$ with $x \sim \pi_{\text{wrong}}$ leads to an error of: -$$\sum_{x} \left( \pi(x) - \pi_{\text{ref}}(x) \right) \left( \pi_{\text{wrong}}(x) - \pi(x) \right)$$ +$$\sum_{x} \left( \pi(x) - \pi_{\text{ref}}(x) \right) \left( \pi_{\text{wrong}}(x) - \pi(x) \right)$$ So, to verify correctness, we calculate: @@ -65,28 +65,28 @@ When investigating discrepancies beyond the acceptable threshold, focus on these When validating Hugging Face-based models, perform the following checks: -- **Compare log probabilities** +- **Compare log probabilities** Ensure the generation log probabilities from inference backends like **vLLM** match those computed by Hugging Face. This comparison helps diagnose potential mismatches. -- **Test parallelism** +- **Test parallelism** Verify consistency with other parallelism settings. -- **Variance** +- **Variance** Repeat tests multiple times (e.g., 10 runs) to confirm that behavior is deterministic or within acceptable variance. -- **Check sequence lengths** - Perform inference on sequence lengths of 100, 1,000, and 10,000 tokens. +- **Check sequence lengths** + Perform inference on sequence lengths of 100, 1,000, and 10,000 tokens. Ensure the model behaves consistently at each length. -- **Use real and dummy data** - - **Real data:** Tokenize and generate from actual text samples. +- **Use real and dummy data** + - **Real data:** Tokenize and generate from actual text samples. - **Dummy data:** Simple numeric sequences to test basic generation. -- **Vary sampling parameters** - Test both greedy and sampling generation modes. +- **Vary sampling parameters** + Test both greedy and sampling generation modes. 
Adjust temperature and top-p to confirm output consistency across backends. -- **Test different batch sizes** +- **Test different batch sizes** Try with batch sizes of 1, 8, and 32 to ensure consistent behavior across different batch configurations. --- @@ -95,11 +95,11 @@ When validating Hugging Face-based models, perform the following checks: ### Additional Validation -- **Compare Megatron outputs** +- **Compare Megatron outputs** Ensure the Megatron forward pass aligns with Hugging Face and the generation log probabilities from inference backends like **vLLM**. -- **Parallel settings** - Match the same parallelism configurations used for the HuggingFace-based tests. +- **Parallel settings** + Match the same parallelism configurations used for the HuggingFace-based tests. Confirm outputs remain consistent across repeated runs. --- @@ -128,7 +128,7 @@ By following these validation steps and ensuring your model's outputs remain con We also maintain a set of standalone scripts that can be used to diagnose issues related to correctness that we have encountered before. -## [1.max_model_len_respected.py](https://github.com/NVIDIA/NeMo-RL/blob/main/tools/model_diagnostics/1.max_model_len_respected.py) +## [1.max_model_len_respected.py](https://github.com/NVIDIA-NeMo/RL/blob/main/tools/model_diagnostics/1.max_model_len_respected.py) Test if a new model respects the `max_model_len` passed to vllm: @@ -142,7 +142,7 @@ uv run --extra vllm tools/model_diagnostics/1.max_model_len_respected.py Qwen/Qw # [Qwen/Qwen2.5-1.5B] ALL GOOD! ``` -## [2.long_generation_decode_vs_prefill](https://github.com/NVIDIA/NeMo-RL/blob/main/tools/model_diagnostics/2.long_generation_decode_vs_prefill.py) +## [2.long_generation_decode_vs_prefill](https://github.com/NVIDIA-NeMo/RL/blob/main/tools/model_diagnostics/2.long_generation_decode_vs_prefill.py) Test that vLLM yields near-identical token log-probabilities when comparing decoding with a single prefill pass across multiple prompts. 
diff --git a/docs/model-quirks.md b/docs/model-quirks.md index fa2b181c7e..ca08b2741b 100644 --- a/docs/model-quirks.md +++ b/docs/model-quirks.md @@ -6,7 +6,7 @@ This document outlines special cases and model-specific behaviors that require c ### Tied Weights -Weight tying between the embedding layer (`model.embed_tokens`) and output layer (`lm_head`) is currently not respected when using the FSDP1 policy or the DTensor policy when TP > 1 (See [this issue](https://github.com/NVIDIA/NeMo-RL/issues/227)). To avoid errors when training these models, we only allow training models with tied weights using the DTensor policy with TP=1. For Llama-3 and Qwen2.5 models, weight-tying is only enabled for the smaller models (< 2B), which can typically be trained without tensor parallelism. For Gemma-3, all model sizes have weight-tying enabled, including the larger models which require tensor parallelism. To support training of these models, we specially handle the Gemma-3 models by allowing training using the DTensor policy with TP > 1. +Weight tying between the embedding layer (`model.embed_tokens`) and output layer (`lm_head`) is currently not respected when using the FSDP1 policy or the DTensor policy when TP > 1 (See [this issue](https://github.com/NVIDIA-NeMo/RL/issues/227)). To avoid errors when training these models, we only allow training models with tied weights using the DTensor policy with TP=1. For Llama-3 and Qwen2.5 models, weight-tying is only enabled for the smaller models (< 2B), which can typically be trained without tensor parallelism. For Gemma-3, all model sizes have weight-tying enabled, including the larger models which require tensor parallelism. To support training of these models, we specially handle the Gemma-3 models by allowing training using the DTensor policy with TP > 1. **Special Handling:** - We skip the tied weights check for all Gemma-3 models when using the DTensor policy, allowing training using TP > 1. 
diff --git a/examples/configs/grpo-deepscaler-1.5b-8K.yaml b/examples/configs/grpo-deepscaler-1.5b-8K.yaml index 96bc7f2e76..1013f3d4c2 100644 --- a/examples/configs/grpo-deepscaler-1.5b-8K.yaml +++ b/examples/configs/grpo-deepscaler-1.5b-8K.yaml @@ -30,7 +30,7 @@ checkpointing: save_period: 10 policy: - # Qwen/Qwen2.5-1.5B has tied weights which are only supported with dtensor policy with tp size 1 (https://github.com/NVIDIA/NeMo-RL/issues/227) + # Qwen/Qwen2.5-1.5B has tied weights which are only supported with dtensor policy with tp size 1 (https://github.com/NVIDIA-NeMo/RL/issues/227) model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" tokenizer: name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml index 85cc620b62..1842b01497 100644 --- a/examples/configs/grpo_math_1B.yaml +++ b/examples/configs/grpo_math_1B.yaml @@ -30,7 +30,7 @@ checkpointing: save_period: 10 policy: - # Qwen/Qwen2.5-1.5B has tied weights which are only supported with dtensor policy with tp size 1 (https://github.com/NVIDIA/NeMo-RL/issues/227) + # Qwen/Qwen2.5-1.5B has tied weights which are only supported with dtensor policy with tp size 1 (https://github.com/NVIDIA-NeMo/RL/issues/227) model_name: "Qwen/Qwen2.5-1.5B" tokenizer: name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default diff --git a/examples/configs/grpo_sliding_puzzle.yaml b/examples/configs/grpo_sliding_puzzle.yaml index 0b99e750e8..8493bfc40e 100644 --- a/examples/configs/grpo_sliding_puzzle.yaml +++ b/examples/configs/grpo_sliding_puzzle.yaml @@ -24,7 +24,7 @@ policy: max_new_tokens: ${policy.max_total_sequence_length} temperature: 1.0 # Setting top_p/top_k to 0.999/10000 to strip out Qwen's special/illegal tokens - # https://github.com/NVIDIA/NeMo-RL/issues/237 + # https://github.com/NVIDIA-NeMo/RL/issues/237 top_p: 0.999 top_k: 10000 
stop_token_ids: null @@ -38,7 +38,7 @@ policy: data: add_system_prompt: false - + env: sliding_puzzle_game: cfg: diff --git a/examples/configs/recipes/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml b/examples/configs/recipes/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml index 0ec0ef477a..2458739e2e 100644 --- a/examples/configs/recipes/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml +++ b/examples/configs/recipes/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml @@ -45,7 +45,7 @@ policy: context_parallel_size: 1 custom_parallel_plan: null dynamic_batching: - # TODO: OOMs if enabled https://github.com/NVIDIA/NeMo-RL/issues/383 + # TODO: OOMs if enabled https://github.com/NVIDIA-NeMo/RL/issues/383 enabled: False train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} diff --git a/examples/converters/convert_dcp_to_hf.py b/examples/converters/convert_dcp_to_hf.py index fc53418696..d87d97a64e 100644 --- a/examples/converters/convert_dcp_to_hf.py +++ b/examples/converters/convert_dcp_to_hf.py @@ -51,7 +51,7 @@ def main(): model_name_or_path = config["policy"]["model_name"] # TODO: After the following PR gets merged: - # https://github.com/NVIDIA/NeMo-RL/pull/148/files + # https://github.com/NVIDIA-NeMo/RL/pull/148/files # tokenizer should be copied from policy/tokenizer/* instead of relying on the model name # We can expose a arg at the top level --tokenizer_path to plumb that through. 
# This is more stable than relying on the current NeMo-RL get_tokenizer() which can diff --git a/nemo_rl/algorithms/dpo.py b/nemo_rl/algorithms/dpo.py index c7b3de9f5f..3883328216 100644 --- a/nemo_rl/algorithms/dpo.py +++ b/nemo_rl/algorithms/dpo.py @@ -71,7 +71,7 @@ class DPOConfig(TypedDict): preference_average_log_probs: bool sft_average_log_probs: bool ## TODO(@ashors) support other loss functions - ## https://github.com/NVIDIA/NeMo-RL/issues/193 + ## https://github.com/NVIDIA-NeMo/RL/issues/193 # preference_loss: str # gt_reward_scale: float preference_loss_weight: float diff --git a/nemo_rl/distributed/ray_actor_environment_registry.py b/nemo_rl/distributed/ray_actor_environment_registry.py index 4c7eebee13..1f1937729d 100644 --- a/nemo_rl/distributed/ray_actor_environment_registry.py +++ b/nemo_rl/distributed/ray_actor_environment_registry.py @@ -17,7 +17,7 @@ ACTOR_ENVIRONMENT_REGISTRY: dict[str, str] = { "nemo_rl.models.generation.vllm.VllmGenerationWorker": PY_EXECUTABLES.VLLM, # Temporary workaround for the coupled implementation of DTensorPolicyWorker and vLLM. - # This will be reverted to PY_EXECUTABLES.BASE once https://github.com/NVIDIA/NeMo-RL/issues/501 is resolved. + # This will be reverted to PY_EXECUTABLES.BASE once https://github.com/NVIDIA-NeMo/RL/issues/501 is resolved. 
"nemo_rl.models.policy.dtensor_policy_worker.DTensorPolicyWorker": PY_EXECUTABLES.VLLM, "nemo_rl.models.policy.fsdp1_policy_worker.FSDP1PolicyWorker": PY_EXECUTABLES.BASE, "nemo_rl.models.policy.megatron_policy_worker.MegatronPolicyWorker": PY_EXECUTABLES.MCORE, diff --git a/nemo_rl/models/generation/vllm.py b/nemo_rl/models/generation/vllm.py index 7dbfbd3ea8..cc8b44d5f3 100644 --- a/nemo_rl/models/generation/vllm.py +++ b/nemo_rl/models/generation/vllm.py @@ -330,7 +330,7 @@ def _patch_vllm_init_workers_ray(): enable_prefix_caching=torch.cuda.get_device_capability()[0] >= 8, dtype=self.cfg["vllm_cfg"]["precision"], seed=seed, - # Don't use cuda-graph by default as it leads to convergence issues (see https://github.com/NVIDIA/NeMo-RL/issues/186) + # Don't use cuda-graph by default as it leads to convergence issues (see https://github.com/NVIDIA-NeMo/RL/issues/186) enforce_eager=True, max_model_len=self.cfg["vllm_cfg"]["max_model_len"], trust_remote_code=True, diff --git a/nemo_rl/models/policy/dtensor_policy_worker.py b/nemo_rl/models/policy/dtensor_policy_worker.py index 61dcd9a127..a5e1d9259d 100644 --- a/nemo_rl/models/policy/dtensor_policy_worker.py +++ b/nemo_rl/models/policy/dtensor_policy_worker.py @@ -162,7 +162,7 @@ def __init__( device_map="cpu", # load weights onto CPU initially # Always load the model in float32 to keep master weights in float32. # Keeping the master weights in lower precision has shown to cause issues with convergence. - # https://github.com/NVIDIA/NeMo-RL/issues/279 will fix the issue of CPU OOM for larger models. + # https://github.com/NVIDIA-NeMo/RL/issues/279 will fix the issue of CPU OOM for larger models. 
torch_dtype=torch.float32, trust_remote_code=True, **sliding_window_overwrite( @@ -381,7 +381,7 @@ def train( and not self.skip_tie_check ): raise ValueError( - f"Using dtensor policy with tp size {self.cfg['dtensor_cfg']['tensor_parallel_size']} for model ({self.cfg['model_name']}) that has tied weights (num_tied_weights={self.num_tied_weights}) is not supported (https://github.com/NVIDIA/NeMo-RL/issues/227). Please use dtensor policy with tensor parallel == 1 instead." + f"Using dtensor policy with tp size {self.cfg['dtensor_cfg']['tensor_parallel_size']} for model ({self.cfg['model_name']}) that has tied weights (num_tied_weights={self.num_tied_weights}) is not supported (https://github.com/NVIDIA-NeMo/RL/issues/227). Please use dtensor policy with tensor parallel == 1 instead." ) if gbs is None: gbs = self.cfg["train_global_batch_size"] diff --git a/nemo_rl/models/policy/fsdp1_policy_worker.py b/nemo_rl/models/policy/fsdp1_policy_worker.py index ef0eb98720..f4ec53daa0 100644 --- a/nemo_rl/models/policy/fsdp1_policy_worker.py +++ b/nemo_rl/models/policy/fsdp1_policy_worker.py @@ -96,7 +96,7 @@ def __init__( device_map="cpu", # load weights onto CPU initially # Always load the model in float32 to keep master weights in float32. # Keeping the master weights in lower precision has shown to cause issues with convergence. - # https://github.com/NVIDIA/NeMo-RL/issues/279 will fix the issue of CPU OOM for larger models. + # https://github.com/NVIDIA-NeMo/RL/issues/279 will fix the issue of CPU OOM for larger models. 
torch_dtype=torch.float32, trust_remote_code=True, **sliding_window_overwrite( @@ -110,7 +110,7 @@ def __init__( self.reference_model = AutoModelForCausalLM.from_pretrained( model_name, device_map="cpu", # load weights onto CPU initially - torch_dtype=torch.float32, # use full precision in sft until https://github.com/NVIDIA/nemo-rl/issues/13 is fixed + torch_dtype=torch.float32, # use full precision in sft until https://github.com/NVIDIA-NeMo/RL/issues/13 is fixed trust_remote_code=True, **sliding_window_overwrite( model_name @@ -249,7 +249,7 @@ def train( skip_tie_check = os.environ.get("NRL_SKIP_TIED_WEIGHT_CHECK") if self.num_tied_weights != 0 and not skip_tie_check: raise ValueError( - f"Using FSP1 with a model ({self.cfg['model_name']}) that has tied weights (num_tied_weights={self.num_tied_weights}) is not supported (https://github.com/NVIDIA/NeMo-RL/issues/227). Please use dtensor policy with tensor parallel == 1 instead." + f"Using FSP1 with a model ({self.cfg['model_name']}) that has tied weights (num_tied_weights={self.num_tied_weights}) is not supported (https://github.com/NVIDIA-NeMo/RL/issues/227). Please use dtensor policy with tensor parallel == 1 instead." 
) if gbs is None: diff --git a/nemo_rl/package_info.py b/nemo_rl/package_info.py index 3fcefc1375..29883366db 100644 --- a/nemo_rl/package_info.py +++ b/nemo_rl/package_info.py @@ -28,8 +28,8 @@ __contact_names__ = "NVIDIA" __contact_emails__ = "nemo-tookit@nvidia.com" __homepage__ = "https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/" -__repository_url__ = "https://github.com/NVIDIA/NeMo-RL" -__download_url__ = "https://github.com/NVIDIA/NeMo-RL/releases" +__repository_url__ = "https://github.com/NVIDIA-NeMo/RL" +__download_url__ = "https://github.com/NVIDIA-NeMo/RL/releases" __description__ = "NeMo-RL - a toolkit for model alignment" __license__ = "Apache2" __keywords__ = "deep learning, machine learning, gpu, NLP, NeMo, nvidia, pytorch, torch, language, reinforcement learning, RLHF, preference modeling, SteerLM, DPO" diff --git a/nemo_rl/utils/native_checkpoint.py b/nemo_rl/utils/native_checkpoint.py index b857264d31..43d511bd74 100644 --- a/nemo_rl/utils/native_checkpoint.py +++ b/nemo_rl/utils/native_checkpoint.py @@ -248,7 +248,7 @@ def convert_dcp_to_hf( config.save_pretrained(hf_ckpt_path) # TODO: After the following PR gets merged: - # https://github.com/NVIDIA/NeMo-RL/pull/148/files + # https://github.com/NVIDIA-NeMo/RL/pull/148/files # tokenizer should be copied from policy/tokenizer/* instead of relying on the model name # We can expose a arg at the top level --tokenizer_path to plumb that through. 
# This is more stable than relying on the current NeMo-RL get_tokenizer() which can diff --git a/pyproject.toml b/pyproject.toml index 62095ae9fb..6b1371de83 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,7 +101,7 @@ test = [ megatron-core = { workspace = true } nemo-tron = { workspace = true } # The NeMo Run source to be used by nemo-tron -nemo_run = { git = "https://github.com/NVIDIA/NeMo-Run", rev = "414f0077c648fde2c71bb1186e97ccbf96d6844c" } +nemo_run = { git = "https://github.com/NVIDIA-NeMo/Run", rev = "414f0077c648fde2c71bb1186e97ccbf96d6844c" } # torch/torchvision/triton all come from the torch index in order to pick up aarch64 wheels torch = [ { index = "pytorch-cu128" }, diff --git a/tests/functional/dpo.sh b/tests/functional/dpo.sh index 562f62a0b8..b03b611b25 100755 --- a/tests/functional/dpo.sh +++ b/tests/functional/dpo.sh @@ -36,7 +36,7 @@ uv run $PROJECT_ROOT/examples/run_dpo.py \ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # TODO: threshold set higher since test is flaky -# https://github.com/NVIDIA/NeMo-RL/issues/370 +# https://github.com/NVIDIA-NeMo/RL/issues/370 uv run tests/check_metrics.py $JSON_METRICS \ 'data["train/loss"]["3"] < 0.8' diff --git a/tests/functional/test_converter_roundtrip.py b/tests/functional/test_converter_roundtrip.py index 90756a2f18..ea865be9b2 100644 --- a/tests/functional/test_converter_roundtrip.py +++ b/tests/functional/test_converter_roundtrip.py @@ -1,3 +1,16 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. #!/usr/bin/env python3 # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. diff --git a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.v2.sh b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.v2.sh index 9e3a004460..b22c00dec0 100755 --- a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.v2.sh +++ b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.v2.sh @@ -32,7 +32,7 @@ uv run examples/run_sft.py \ # Convert tensorboard logs to json uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS -# TODO: the memory check is known to OOM. see https://github.com/NVIDIA/NeMo-RL/issues/263 +# TODO: the memory check is known to OOM. see https://github.com/NVIDIA-NeMo/RL/issues/263 # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then # TODO: FIGURE OUT CORRECT METRICS diff --git a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.v2.sh b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.v2.sh index 26c78649c8..abed80e5ed 100755 --- a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.v2.sh +++ b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.v2.sh @@ -31,7 +31,7 @@ uv run examples/run_sft.py \ # Convert tensorboard logs to json uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS -# TODO: memory check will fail due to OOM tracked here https://github.com/NVIDIA/NeMo-RL/issues/263 +# TODO: memory check will fail due to OOM tracked here https://github.com/NVIDIA-NeMo/RL/issues/263 # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then diff --git 
a/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v2.sh b/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v2.sh index eeaa9c8025..257add6fc5 100755 --- a/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v2.sh +++ b/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v2.sh @@ -3,7 +3,7 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) source $SCRIPT_DIR/common.env # TODO: this config can crash on OOM -# https://github.com/NVIDIA/NeMo-RL/issues/263 +# https://github.com/NVIDIA-NeMo/RL/issues/263 # ===== BEGIN CONFIG ===== NUM_NODES=4 diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py index dc1de1b123..1404b02337 100644 --- a/tests/unit/models/generation/test_vllm_generation.py +++ b/tests/unit/models/generation/test_vllm_generation.py @@ -475,7 +475,7 @@ async def test_vllm_policy_generation_async( @pytest.mark.skip( - reason="Skipping for now, will be fixed in https://github.com/NVIDIA/NeMo-RL/issues/408" + reason="Skipping for now, will be fixed in https://github.com/NVIDIA-NeMo/RL/issues/408" ) def test_vllm_worker_seed_behavior(cluster, tokenizer): """ diff --git a/tests/unit/utils/test_native_checkpoint.py b/tests/unit/utils/test_native_checkpoint.py index 88356d2dba..7df7f8543b 100755 --- a/tests/unit/utils/test_native_checkpoint.py +++ b/tests/unit/utils/test_native_checkpoint.py @@ -330,7 +330,7 @@ def test_convert_dcp_to_hf(policy, num_gpus): os.path.join(tmp_dir, "test_hf_and_dcp-hf-offline"), simple_policy_config["model_name"], # TODO: After the following PR gets merged: - # https://github.com/NVIDIA/NeMo-RL/pull/148/files + # https://github.com/NVIDIA-NeMo/RL/pull/148/files # tokenizer should be copied from policy/tokenizer/* instead of relying on the model name # We can expose a arg at the top level --tokenizer_path to plumb that through. 
# This is more stable than relying on the current NeMo-RL get_tokenizer() which can diff --git a/uv.lock b/uv.lock index 9b50767fac..e2a02bdd23 100644 --- a/uv.lock +++ b/uv.lock @@ -2427,7 +2427,7 @@ test = [ [[package]] name = "nemo-run" version = "0.5.0rc0.dev0" -source = { git = "https://github.com/NVIDIA/NeMo-Run?rev=414f0077c648fde2c71bb1186e97ccbf96d6844c#414f0077c648fde2c71bb1186e97ccbf96d6844c" } +source = { git = "https://github.com/NVIDIA-NeMo/Run?rev=414f0077c648fde2c71bb1186e97ccbf96d6844c#414f0077c648fde2c71bb1186e97ccbf96d6844c" } dependencies = [ { name = "catalogue" }, { name = "cryptography" }, @@ -2473,7 +2473,7 @@ requires-dist = [ { name = "ijson" }, { name = "lightning" }, { name = "matplotlib" }, - { name = "nemo-run", git = "https://github.com/NVIDIA/NeMo-Run?rev=414f0077c648fde2c71bb1186e97ccbf96d6844c" }, + { name = "nemo-run", git = "https://github.com/NVIDIA-NeMo/Run?rev=414f0077c648fde2c71bb1186e97ccbf96d6844c" }, { name = "onnx" }, { name = "scikit-learn" }, { name = "webdataset" }, From ddac07c79125e18cae16d22cd181e71154503801 Mon Sep 17 00:00:00 2001 From: atfujita <40932835+AtsunoriFujita@users.noreply.github.com> Date: Wed, 2 Jul 2025 23:25:27 +0900 Subject: [PATCH 41/44] feat: add OpenAI format dataset for SFT (#485) Signed-off-by: Atsunori Fujita --- examples/run_sft.py | 8 ++ nemo_rl/data/hf_datasets/__init__.py | 2 + .../data/hf_datasets/oai_format_dataset.py | 78 ++++++++++++ .../hf_datasets/test_oai_format_dataset.py | 119 ++++++++++++++++++ 4 files changed, 207 insertions(+) create mode 100644 nemo_rl/data/hf_datasets/oai_format_dataset.py create mode 100644 tests/unit/data/hf_datasets/test_oai_format_dataset.py diff --git a/examples/run_sft.py b/examples/run_sft.py index 8eb93b5adc..ce5b258b0c 100644 --- a/examples/run_sft.py +++ b/examples/run_sft.py @@ -109,6 +109,14 @@ def setup_data(tokenizer: AutoTokenizer, data_config: DataConfig): output_key=data_config["output_key"], prompt_file=data_config["prompt_file"], ) + elif 
data_cls == "openai_format": + data = hf_datasets.OpenAIFormatDataset( + data_config["train_data_path"], + data_config["val_data_path"], + data_config["chat_key"], + data_config["system_key"], + data_config["system_prompt"], + ) else: raise ValueError(f"Unknown dataset class: {data_cls}") print( diff --git a/nemo_rl/data/hf_datasets/__init__.py b/nemo_rl/data/hf_datasets/__init__.py index 54d4fd9c34..aa5596397c 100644 --- a/nemo_rl/data/hf_datasets/__init__.py +++ b/nemo_rl/data/hf_datasets/__init__.py @@ -15,6 +15,7 @@ from nemo_rl.data.hf_datasets.chat_templates import COMMON_CHAT_TEMPLATES from nemo_rl.data.hf_datasets.dpo import DPODataset from nemo_rl.data.hf_datasets.helpsteer3 import HelpSteer3Dataset +from nemo_rl.data.hf_datasets.oai_format_dataset import OpenAIFormatDataset from nemo_rl.data.hf_datasets.oasst import OasstDataset from nemo_rl.data.hf_datasets.openmathinstruct2 import OpenMathInstruct2Dataset from nemo_rl.data.hf_datasets.prompt_response_dataset import ( @@ -26,6 +27,7 @@ "DPODataset", "HelpSteer3Dataset", "OasstDataset", + "OpenAIFormatDataset", "OpenMathInstruct2Dataset", "PromptResponseDataset", "SquadDataset", diff --git a/nemo_rl/data/hf_datasets/oai_format_dataset.py b/nemo_rl/data/hf_datasets/oai_format_dataset.py new file mode 100644 index 0000000000..22d01346bc --- /dev/null +++ b/nemo_rl/data/hf_datasets/oai_format_dataset.py @@ -0,0 +1,78 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any + +from datasets import load_dataset + +from nemo_rl.data.interfaces import TaskDataSpec + + +class OpenAIFormatDataset: + """This class is used to load an SFT dataset in the OpenAI format. + + The dataset should be in the following format: + { + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is the capital of France?"}, + {"role": "assistant", "content": "The capital of France is Paris."} + ] + } + system_key and system_prompt are optional. If provided, it will be added to the + beginning of the dataset. + chat_key should be the key of the messages list. Multi-turn conversations are + supported. + The last message in the conversation must be from the assistant. + """ + + def __init__( + self, + train_ds_path: str, + val_ds_path: str, + chat_key: str = "messages", + system_key: str = None, + system_prompt: str = None, + ): + self.chat_key = chat_key + self.system_key = system_key + self.system_prompt = system_prompt + train_original_dataset = load_dataset("json", data_files=train_ds_path)["train"] + val_original_dataset = load_dataset("json", data_files=val_ds_path)["train"] + + formatted_train_dataset = train_original_dataset.map(self.add_messages_key) + formatted_val_dataset = val_original_dataset.map(self.add_messages_key) + + self.formatted_ds = { + "train": formatted_train_dataset, + "validation": formatted_val_dataset, + } + + self.task_spec = TaskDataSpec( + "json_dataset", + ) + + def add_messages_key( + self, + example: dict[str, Any], + ) -> dict[str, list[dict[str, Any]]]: + messages = [message for message in example[self.chat_key]] + if self.system_key in example: + messages = [ + {"role": "system", "content": example[self.system_key]} + ] + messages + elif self.system_prompt: + messages = [{"role": "system", "content": self.system_prompt}] + messages + 
assert messages[-1]["role"] == "assistant" + return {"messages": messages} diff --git a/tests/unit/data/hf_datasets/test_oai_format_dataset.py b/tests/unit/data/hf_datasets/test_oai_format_dataset.py new file mode 100644 index 0000000000..4ba75a6a1d --- /dev/null +++ b/tests/unit/data/hf_datasets/test_oai_format_dataset.py @@ -0,0 +1,119 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import tempfile + +import pytest +from transformers import AutoTokenizer + +from nemo_rl.data.hf_datasets.chat_templates import COMMON_CHAT_TEMPLATES +from nemo_rl.data.hf_datasets.oai_format_dataset import ( + OpenAIFormatDataset, +) + + +@pytest.fixture +def sample_data(request): + chat_key = request.param[0] + system_key = request.param[1] + + train_data = { + chat_key: [ + {"role": "user", "content": "What is the capital of France?"}, + {"role": "assistant", "content": "The capital of France is Paris."}, + ], + } + val_data = { + chat_key: [ + {"role": "user", "content": "What is the capital of Germany?"}, + {"role": "assistant", "content": "The capital of Germany is Berlin."}, + ], + } + + if system_key is not None: + train_data[system_key] = "You are a helpful assistant." + if system_key is not None: + val_data[system_key] = "You are a helpful assistant." 
+ + # Create temporary files for train and validation data + with tempfile.NamedTemporaryFile( + mode="w", suffix=".json", delete=False + ) as train_file: + json.dump(train_data, train_file) + train_path = train_file.name + + with tempfile.NamedTemporaryFile( + mode="w", suffix=".json", delete=False + ) as val_file: + json.dump(val_data, val_file) + val_path = val_file.name + + return train_path, val_path + + +@pytest.mark.parametrize("sample_data", [("messages", None)], indirect=True) +def test_dataset_initialization(sample_data): + train_path, val_path = sample_data + dataset = OpenAIFormatDataset(train_path, val_path) + + assert dataset.chat_key == "messages" + assert "train" in dataset.formatted_ds + assert "validation" in dataset.formatted_ds + + +@pytest.mark.parametrize("sample_data", [("conversations", None)], indirect=True) +def test_custom_keys(sample_data): + train_path, val_path = sample_data + dataset = OpenAIFormatDataset( + train_path, + val_path, + chat_key="conversations", + system_prompt="You are a helpful assistant.", + ) + + assert dataset.chat_key == "conversations" + assert dataset.system_prompt == "You are a helpful assistant." + + +@pytest.mark.parametrize("sample_data", [("messages", "system_key")], indirect=True) +def test_message_formatting(sample_data): + train_path, val_path = sample_data + dataset = OpenAIFormatDataset( + train_path, val_path, chat_key="messages", system_key="system_key" + ) + + first_example = dataset.formatted_ds["train"][0] + + assert first_example["messages"][0]["role"] == "system" + assert first_example["messages"][0]["content"] == "You are a helpful assistant." + assert first_example["messages"][1]["role"] == "user" + assert first_example["messages"][1]["content"] == "What is the capital of France?" + assert first_example["messages"][2]["role"] == "assistant" + assert first_example["messages"][2]["content"] == "The capital of France is Paris." 
+ + chat_template = COMMON_CHAT_TEMPLATES.passthrough_prompt_response + tokenizer = AutoTokenizer.from_pretrained("Meta-Llama/Meta-Llama-3-8B-Instruct") + + combined_message = tokenizer.apply_chat_template( + first_example["messages"], + chat_template=chat_template, + tokenize=False, + add_generation_prompt=False, + add_special_tokens=False, + ) + + assert combined_message == "".join( + message["content"] for message in first_example["messages"] + ) From 283074abb71d82a267dd770e12e10a2e1e198a39 Mon Sep 17 00:00:00 2001 From: Parth Chadha Date: Wed, 2 Jul 2025 10:55:46 -0700 Subject: [PATCH 42/44] fix: load HF model only on rank 0 (#544) Signed-off-by: Parth Chadha --- examples/configs/evals/eval.yaml | 9 +++ nemo_rl/models/generation/vllm.py | 4 ++ .../models/policy/dtensor_policy_worker.py | 61 +++++++++++++++++-- .../unit/models/policy/test_dtensor_worker.py | 7 +++ 4 files changed, 75 insertions(+), 6 deletions(-) diff --git a/examples/configs/evals/eval.yaml b/examples/configs/evals/eval.yaml index 439acff25e..eab0f1db21 100644 --- a/examples/configs/evals/eval.yaml +++ b/examples/configs/evals/eval.yaml @@ -22,6 +22,15 @@ generation: pipeline_parallel_size: 1 gpu_memory_utilization: 0.9 max_model_len: 2048 + colocated: + # true: generation shares training GPUs + # false: uses dedicated generation resources + enabled: true + # only relevant when enabled is false + resources: + gpus_per_node: null # Decides num gpus to be dedicated to generation when there is one node in the cluster i.e cluster.num_nodes == 1 + num_nodes: null # Decides number of nodes to be dedicated to generation + tokenizer: name: ${generation.model_name} ## specify if you'd like to use a tokenizer different from the model's default diff --git a/nemo_rl/models/generation/vllm.py b/nemo_rl/models/generation/vllm.py index cc8b44d5f3..9506a063d3 100644 --- a/nemo_rl/models/generation/vllm.py +++ b/nemo_rl/models/generation/vllm.py @@ -316,6 +316,10 @@ def _patch_vllm_init_workers_ray(): 
os.environ["VLLM_USE_V1"] = os.environ.get("NRL_VLLM_USE_V1", "1") os.environ["VLLM_ALLOW_INSECURE_SERIALIZATION"] = "1" + if not self.cfg["colocated"]["enabled"]: + os.environ["NCCL_SHM_DISABLE"] = "1" + os.environ["NCCL_P2P_DISABLE"] = "1" + load_format = self.cfg["vllm_cfg"]["load_format"] if ModelFlag.VLLM_LOAD_FORMAT_AUTO.matches(self.model_name): load_format = "auto" diff --git a/nemo_rl/models/policy/dtensor_policy_worker.py b/nemo_rl/models/policy/dtensor_policy_worker.py index a5e1d9259d..46e1e8a52a 100644 --- a/nemo_rl/models/policy/dtensor_policy_worker.py +++ b/nemo_rl/models/policy/dtensor_policy_worker.py @@ -21,7 +21,12 @@ import ray import torch +from accelerate import init_empty_weights from torch import nn +from torch.distributed.checkpoint.state_dict import ( + StateDictOptions, + set_model_state_dict, +) from torch.distributed.fsdp import ( FSDPModule, ) @@ -30,7 +35,7 @@ from torch.distributed.tensor.experimental._attention import ( set_rotate_method, ) -from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer from transformers.integrations.accelerate import find_tied_parameters from transformers.models.gemma3.modeling_gemma3 import Gemma3ForCausalLM @@ -137,6 +142,15 @@ def __init__( init_reference_model: bool = True, **kwargs: Any, ): + # Disable NCCL SHM if training and generation are not co-located: https://github.com/NVIDIA-NeMo/RL/issues/564 + if ( + "generation" in config + and config["generation"] is not None + and not config["generation"]["colocated"]["enabled"] + ): + os.environ["NCCL_SHM_DISABLE"] = "1" + os.environ["NCCL_P2P_DISABLE"] = "1" + self.cfg = config # torch distributed init. 
Envars for rank, world_size, and master_addr and master_port are set from the ray remote call torch.distributed.init_process_group(backend="nccl") @@ -156,19 +170,38 @@ def __init__( else: raise ValueError(f"Unknown precision: {self.cfg['precision']}") - print(f"[Rank {self.rank}] Loading model {model_name} on CPU...") - self.model = AutoModelForCausalLM.from_pretrained( + model_config = AutoConfig.from_pretrained( model_name, - device_map="cpu", # load weights onto CPU initially # Always load the model in float32 to keep master weights in float32. # Keeping the master weights in lower precision has shown to cause issues with convergence. - # https://github.com/NVIDIA-NeMo/RL/issues/279 will fix the issue of CPU OOM for larger models. torch_dtype=torch.float32, trust_remote_code=True, **sliding_window_overwrite( model_name ), # due to https://github.com/huggingface/transformers/issues/38002 ) + + full_state_dict = None + if self.rank == 0: + print(f"[Rank {self.rank}] Loading model {model_name} on CPU...") + model = AutoModelForCausalLM.from_pretrained( + model_name, + device_map="cpu", # load weights onto CPU initially + trust_remote_code=True, + config=model_config, + ) + full_state_dict = model.state_dict() + del model + + print(f"[Rank {self.rank}] Initializing empty model for FSDP...") + # All ranks initialize model on meta device, so FSDP can shard it. + # The actual weights will be broadcast from rank 0. + + with init_empty_weights(): + self.model = AutoModelForCausalLM.from_config( + model_config, + ) + # caching since this property is not always preserved after FSDP self.num_tied_weights = len(find_tied_parameters(self.model)) self.skip_tie_check = os.environ.get( @@ -222,8 +255,24 @@ def __init__( custom_parallel_plan=self.cfg["dtensor_cfg"]["custom_parallel_plan"], ) + print(f"[Rank {self.rank}] Loading state dict from rank 0...") + # This will broadcast the state dict from rank 0 to all other ranks + # and load it into the FSDP model. 
+ set_model_state_dict( + self.model, + model_state_dict=full_state_dict, + options=StateDictOptions( + full_state_dict=True, + broadcast_from_rank0=True, + ), + ) + + # Manually broadcast buffers + for _, buf in self.model.named_buffers(): + torch.distributed.broadcast(buf, src=0) + if self.cpu_offload: - self.model = self.move_buffer_to_device(self.model, "cpu") + self.model = self.move_to_device(self.model, "cpu") # used for streaming update inference engine weights self._held_sharded_state_dict_reference: Optional[dict[str, torch.Tensor]] = ( diff --git a/tests/unit/models/policy/test_dtensor_worker.py b/tests/unit/models/policy/test_dtensor_worker.py index 0a42ea1e9f..91bf140641 100644 --- a/tests/unit/models/policy/test_dtensor_worker.py +++ b/tests/unit/models/policy/test_dtensor_worker.py @@ -61,6 +61,13 @@ def create_test_config( "top_k": None, "stop_token_ids": None, "stop_strings": None, + "colocated": { + "enabled": True, + "resources": { + "gpus_per_node": None, + "num_nodes": None, + }, + }, }, "dtensor_cfg": { "enabled": True, From e78af38cc4061dd63e5a621fad2787449edeb299 Mon Sep 17 00:00:00 2001 From: yuki <48991475+yuki-666@users.noreply.github.com> Date: Fri, 27 Jun 2025 11:24:51 +0800 Subject: [PATCH 43/44] feat: support async in non-colocated (#523) Signed-off-by: Yuki Huang Signed-off-by: Xuehan --- nemo_rl/models/generation/vllm.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/nemo_rl/models/generation/vllm.py b/nemo_rl/models/generation/vllm.py index 9506a063d3..c2e0e5c28b 100644 --- a/nemo_rl/models/generation/vllm.py +++ b/nemo_rl/models/generation/vllm.py @@ -378,6 +378,19 @@ async def init_collective_async( ), ) + async def init_collective_async( + self, data: int, ip: str, port: int, world_size: int + ) -> None: + await self.llm.collective_rpc( + "init_collective", + args=( + data, + ip, + port, + world_size, + ), + ) + def llm(self): return self.llm From 4cd4568e380e74263e5fcb9e928047651b2695a6 Mon Sep 17 00:00:00 
2001 From: Anna Shors Date: Fri, 27 Jun 2025 16:00:38 -0700 Subject: [PATCH 44/44] feat: Add megatron to hf converter (#555) Signed-off-by: Anna Shors Signed-off-by: ashors1 Signed-off-by: Xuehan --- tests/functional/test_converter_roundtrip.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/tests/functional/test_converter_roundtrip.py b/tests/functional/test_converter_roundtrip.py index ea865be9b2..9679fcc724 100644 --- a/tests/functional/test_converter_roundtrip.py +++ b/tests/functional/test_converter_roundtrip.py @@ -13,20 +13,6 @@ # limitations under the License. #!/usr/bin/env python3 -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - """ Functional test for converter roundtrip functionality.