From 405acc22bf36087001c05ffb508cb1b89208dedf Mon Sep 17 00:00:00 2001 From: Xuehan Xiong Date: Tue, 24 Jun 2025 06:15:18 +0000 Subject: [PATCH 01/44] Adds multiple choice eval datasets. Signed-off-by: Xuehan Xiong Signed-off-by: Xuehan --- nemo_rl/data/eval_datasets/__init__.py | 0 nemo_rl/data/eval_datasets/gpqa.py | 44 ++++++++++++++++++++++++++ nemo_rl/data/eval_datasets/math.py | 29 +++++++++++++++++ nemo_rl/data/eval_datasets/mmlu.py | 33 +++++++++++++++++++ nemo_rl/data/eval_datasets/mmlu_pro.py | 31 ++++++++++++++++++ 5 files changed, 137 insertions(+) create mode 100644 nemo_rl/data/eval_datasets/__init__.py create mode 100644 nemo_rl/data/eval_datasets/gpqa.py create mode 100644 nemo_rl/data/eval_datasets/math.py create mode 100644 nemo_rl/data/eval_datasets/mmlu.py create mode 100644 nemo_rl/data/eval_datasets/mmlu_pro.py diff --git a/nemo_rl/data/eval_datasets/__init__.py b/nemo_rl/data/eval_datasets/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/nemo_rl/data/eval_datasets/gpqa.py b/nemo_rl/data/eval_datasets/gpqa.py new file mode 100644 index 0000000000..1287e446a0 --- /dev/null +++ b/nemo_rl/data/eval_datasets/gpqa.py @@ -0,0 +1,44 @@ +"""GPQA dataset and its variants.""" + +import random +from typing import Any, Optional + +from datasets import load_dataset + +from nemo_rl.data.interfaces import TaskDataSpec + + + +class GPQADataset: + def __init__(self, variant: str = "diamond", prompt_file: Optional[str]=None, system_prompt_file: Optional[str]=None): + ds = load_dataset("csv", data_files=f"https://openaipublic.blob.core.windows.net/simple-evals/gpqa_{variant}.csv", split="train") + self._rng = random.Random() + self.rekeyed_ds = ds.map(self._rekey, remove_columns=ds.column_names) + self.task_spec = TaskDataSpec( + task_name=f"GPQA_{variant}", + prompt_file=prompt_file, + system_prompt_file=system_prompt_file, + ) + + def _rekey(self, data: dict[str, Any]): + choices = [ + data["Correct Answer"], + data["Incorrect Answer 
1"], + data["Incorrect Answer 2"], + data["Incorrect Answer 3"], + ] + permutation = self._rng.sample(range(4), 4) + choices = [choices[i] for i in permutation] + correct_index = choices.index(data["Correct Answer"]) + correct_answer = "ABCD"[correct_index] + return { + "question": data["Question"], + "options": dict( + A=choices[0], + B=choices[1], + C=choices[2], + D=choices[3], + ), + "answer": correct_answer, + } + diff --git a/nemo_rl/data/eval_datasets/math.py b/nemo_rl/data/eval_datasets/math.py new file mode 100644 index 0000000000..cbd7cb3577 --- /dev/null +++ b/nemo_rl/data/eval_datasets/math.py @@ -0,0 +1,29 @@ +"""Math dataset and its variants.""" + +from typing import Any, Literal, Optional + +from datasets import load_dataset + +from nemo_rl.data.interfaces import TaskDataSpec + + +class MathDataset: + def __init__(self, + variant: Literal["math_test", "math_500_test"] = "math_test", + prompt_file: Optional[str]=None, + system_prompt_file: Optional[str]=None, + ): + ds = load_dataset('csv', data_files=f"https://openaipublic.blob.core.windows.net/simple-evals/{variant}.csv", split='train') + self.rekeyed_ds = ds.map(self._rekey, remove_columns=ds.column_names) + self.task_spec = TaskDataSpec( + task_name=f'{variant}', + prompt_file=prompt_file, + system_prompt_file=system_prompt_file, + ) + + def _rekey(self, data: dict[str, Any]): + return { + 'problem': data['Question'], + 'expected_answer': data['Answer'], + } + diff --git a/nemo_rl/data/eval_datasets/mmlu.py b/nemo_rl/data/eval_datasets/mmlu.py new file mode 100644 index 0000000000..f0f126850a --- /dev/null +++ b/nemo_rl/data/eval_datasets/mmlu.py @@ -0,0 +1,33 @@ +"""MMLU dataset and its variants.""" + +from typing import Any, Optional + +from datasets import load_dataset + +from nemo_rl.data.interfaces import TaskDataSpec + + +class MMLUDataset: + def __init__(self, prompt_file: str, system_prompt_file: Optional[str] = None): + ds = load_dataset('csv', 
data_files=f"https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", split='train') + self.rekeyed_ds = ds.map(self._rekey, remove_columns=ds.column_names) + + self.task_spec = TaskDataSpec( + task_name='MMLU', + prompt_file=prompt_file, + system_prompt_file=system_prompt_file, + ) + + def _rekey(self, data: dict[str, Any]): + return { + 'question': data['Question'], + 'options': dict( + A=data['A'], + B=data['B'], + C=data['C'], + D=data['D'], + ), + 'answer': data['Answer'], + 'subject': data['Subject'], + } + diff --git a/nemo_rl/data/eval_datasets/mmlu_pro.py b/nemo_rl/data/eval_datasets/mmlu_pro.py new file mode 100644 index 0000000000..da990a90c5 --- /dev/null +++ b/nemo_rl/data/eval_datasets/mmlu_pro.py @@ -0,0 +1,31 @@ +"""MMLU-Pro dataset.""" + +from typing import Any, Optional + +from datasets import load_dataset + +from nemo_rl.data.interfaces import TaskDataSpec + + +class MMLUProDataset: + def __init__(self, prompt_file: str, system_prompt_file: Optional[str] = None): + ds = load_dataset('TIGER-Lab/MMLU-Pro', split='test') + self.rekeyed_ds = ds.map(self._rekey, remove_columns=ds.column_names) + + self.task_spec = TaskDataSpec( + task_name='MMLU-Pro', + prompt_file=prompt_file, + system_prompt_file=system_prompt_file, + ) + + def _rekey(self, data: dict[str, Any]): + options = { + chr(ord('A') + i) : op for i, op in enumerate(data['options']) + } + return { + 'question': data['question'], + 'options': options, + 'answer': data['answer'], + 'subject': data['category'], + } + From 67aae53a686dfcb2cdf4e8bad783b377508a597a Mon Sep 17 00:00:00 2001 From: Xuehan Xiong Date: Tue, 24 Jun 2025 18:28:37 +0000 Subject: [PATCH 02/44] Add a verify worker for multiple-choice problems. 
Signed-off-by: Xuehan Xiong Signed-off-by: Xuehan --- nemo_rl/environments/math_environment.py | 36 +++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/nemo_rl/environments/math_environment.py b/nemo_rl/environments/math_environment.py index e8a47db06f..3f2c7cf7af 100644 --- a/nemo_rl/environments/math_environment.py +++ b/nemo_rl/environments/math_environment.py @@ -14,6 +14,7 @@ import contextlib import io import logging +import re from typing import Any, Optional, TypedDict import ray @@ -32,11 +33,13 @@ calculate_pass_rate_per_prompt, ) from nemo_rl.environments.utils import chunk_list_to_workers +from nemo_rl.evals import answer_parsing class MathEnvConfig(TypedDict): num_workers: int stop_strings: Optional[list[str]] # Default stop strings for this env + verifier_type: Optional[str] @contextlib.contextmanager @@ -97,6 +100,36 @@ def verify( return results +@ray.remote +class MultichoiceVerifyWorker: + + def verify( + self, pred_responses: list[str], ground_truths: list[str] + ) -> list[float]: + """Verify the correctness of the predicted responses against the ground truth. + + Args: + pred_responses: list[str]. The predicted responses from the LLM. + ground_truths: list[str]. The ground truth responses. + + Returns: + list[float]. The rewards for each predicted response. 
+ """ + results = [] + for response, ground_truth in zip(pred_responses, ground_truths): + response = answer_parsing.normalize_response(response) + extracted_answer = None + for answer_regex in answer_parsing.MULTILINGUAL_ANSWER_REGEXES: + regex = answer_parsing.MULTILINGUAL_ANSWER_PATTERN_TEMPLATE.format(answer_regex) + match = re.search(regex, response) + if match: + extracted_answer = answer_parsing.normalize_extracted_answer(match.group(1)) + break + score = 1.0 if extracted_answer == ground_truth else 0.0 + results.append(score) + return results + + class MathEnvironmentMetadata(TypedDict): ground_truth: str @@ -106,8 +139,9 @@ class MathEnvironment(EnvironmentInterface): def __init__(self, cfg: MathEnvConfig): self.cfg = cfg self.num_workers = cfg["num_workers"] + worker_cls = MultichoiceVerifyWorker if cfg.get("verifier_type", "math") == "multichoice" else HFVerifyWorker self.workers = [ - HFVerifyWorker.options( # type: ignore # (decorated with @ray.remote) + worker_cls.options( # type: ignore # (decorated with @ray.remote) runtime_env={"py_executable": PY_EXECUTABLES.SYSTEM} ).remote() for _ in range(self.num_workers) From 4134fcb087b1bdba0c87905acc80d5e456634cf2 Mon Sep 17 00:00:00 2001 From: Xuehan Xiong Date: Tue, 24 Jun 2025 18:59:43 +0000 Subject: [PATCH 03/44] add prompts for MMLU and GPQA. Signed-off-by: Xuehan Xiong Signed-off-by: Xuehan --- examples/prompts/gpqa.txt | 1 + examples/prompts/mmlu.txt | 1 + 2 files changed, 2 insertions(+) create mode 100644 examples/prompts/gpqa.txt create mode 100644 examples/prompts/mmlu.txt diff --git a/examples/prompts/gpqa.txt b/examples/prompts/gpqa.txt new file mode 100644 index 0000000000..04ea20d553 --- /dev/null +++ b/examples/prompts/gpqa.txt @@ -0,0 +1 @@ +Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering. 
diff --git a/examples/prompts/mmlu.txt b/examples/prompts/mmlu.txt new file mode 100644 index 0000000000..04ea20d553 --- /dev/null +++ b/examples/prompts/mmlu.txt @@ -0,0 +1 @@ +Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD. Think step by step before answering. From 0ca559faeadb6375a2bc399bdfb6582b66b9f643 Mon Sep 17 00:00:00 2001 From: Xuehan Xiong Date: Tue, 24 Jun 2025 19:03:46 +0000 Subject: [PATCH 04/44] modifies eval script to support multiple-choice questions. Signed-off-by: Xuehan Xiong Signed-off-by: Xuehan --- examples/run_eval.py | 163 ++++++++++++++++++++++++++------ nemo_rl/evals/answer_parsing.py | 94 ++++++++++++++++++ 2 files changed, 226 insertions(+), 31 deletions(-) create mode 100644 nemo_rl/evals/answer_parsing.py diff --git a/examples/run_eval.py b/examples/run_eval.py index 6f7f60cc44..ae86046dbc 100644 --- a/examples/run_eval.py +++ b/examples/run_eval.py @@ -16,19 +16,25 @@ import os import pprint import sys +from typing import Any, cast + +import torch sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from datasets import load_dataset from omegaconf import OmegaConf -from transformers import AutoTokenizer +from transformers import AutoTokenizer, PreTrainedTokenizerBase from examples.run_grpo_math import math_data_processor from nemo_rl.algorithms.utils import get_tokenizer -from nemo_rl.data import MathDataConfig from nemo_rl.data.datasets import AllTaskProcessedDataset -from nemo_rl.data.interfaces import TaskDataSpec -from nemo_rl.data.llm_message_utils import remap_dataset_keys +from nemo_rl.data.eval_datasets import ( + gpqa, + math, + mmlu, + mmlu_pro, +) +from nemo_rl.data.interfaces import DatumSpec, TaskDataSpec from nemo_rl.distributed.ray_actor_environment_registry import ( get_actor_python_env, ) @@ -37,6 +43,8 @@ from nemo_rl.evals.eval import MasterConfig, run_env_eval, 
setup from nemo_rl.models.generation import configure_generation_config +TokenizerType = PreTrainedTokenizerBase + def parse_args(): """Parse command line arguments.""" @@ -54,28 +62,119 @@ def parse_args(): return args, overrides -def setup_data(tokenizer: AutoTokenizer, data_config: MathDataConfig, env_configs): - print("\n▶ Setting up data...") - math_task_spec = TaskDataSpec( - task_name="math", - prompt_file=data_config["prompt_file"], - system_prompt_file=data_config["system_prompt_file"], +def _construct_prompt(prompt: str, question: str, options: dict[str, str]) -> str: + """Construct prompt from question and options.""" + output = prompt + output += f"\n\nQuestion: {question}\nOptions:\n" + output += "\n".join( + [ + f"{letter}) {option}" + for letter, option in options.items() + if option is not None + ] ) - - # load dataset - base_dataset = load_dataset(data_config["dataset_name"]) - if data_config["dataset_key"] is not None: - base_dataset = base_dataset[data_config["dataset_key"]] - # remap problem and solution keys - remapped_dataset = remap_dataset_keys( - base_dataset, - mapping_dict={ - data_config["problem_key"]: "problem", - data_config["solution_key"]: "expected_answer", - }, + return output + + +def multichoice_qa_processor( + datum_dict: dict[str, Any], + task_data_spec: TaskDataSpec, + tokenizer: TokenizerType, + max_seq_length: int, + idx: int, +) -> DatumSpec: + """Process a datum dictionary (directly loaded from dataset) into a DatumSpec for multiple-choice problems.""" + question = datum_dict["question"] + answer = str(datum_dict["answer"]) + options = datum_dict["options"] + extra_env_info = {"ground_truth": answer} + if "subject" in datum_dict: + extra_env_info.update({"subject": datum_dict["subject"]}) + + message_log = [] + + # system prompt + if task_data_spec.system_prompt: + sys_prompt: dict[str, str | torch.Tensor] = { + "role": "system", + "content": task_data_spec.system_prompt, + } + sys = tokenizer.apply_chat_template( + 
[cast(dict[str, str], sys_prompt)], + tokenize=False, + add_generation_prompt=False, + add_special_tokens=False, + ) + sys_prompt["token_ids"] = tokenizer(sys, return_tensors="pt")["input_ids"][0] + message_log.append(sys_prompt) + + # user prompt + if task_data_spec.prompt: + problem = _construct_prompt(task_data_spec.prompt, question, options) + user_message = {"role": "user", "content": problem} + message = tokenizer.apply_chat_template( + [user_message], + tokenize=False, + add_generation_prompt=True, + add_special_tokens=False, ) + user_message["token_ids"] = tokenizer(message, return_tensors="pt")["input_ids"][0] + user_message["content"] = message + message_log.append(user_message) + + length = sum(len(m["token_ids"]) for m in message_log) + output: DatumSpec = { + "message_log": message_log, + "length": length, + "extra_env_info": extra_env_info, + "loss_multiplier": 1.0, + "idx": idx, + } + if "task_name" in datum_dict: + output["task_name"] = datum_dict["task_name"] + return output + + +def setup_data(tokenizer: AutoTokenizer, data_config, env_configs): + print("Setting up data...") - math_env = MathEnvironment.options( + # load dataset + dataset_name = data_config["dataset_name"] + data_processor_fn = multichoice_qa_processor + if dataset_name == "mmlu": + base_dataset = mmlu.MMLUDataset( + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + elif dataset_name == "gpqa": + base_dataset = gpqa.GPQADataset( + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + elif dataset_name == "mmlu_pro": + base_dataset = mmlu_pro.MMLUProDataset( + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + elif dataset_name == "math": + base_dataset = math.MathDataset( + variant="math_test", + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + data_processor_fn = 
math_data_processor + elif dataset_name == "math500": + base_dataset = math.MathDataset( + variant="math_500_test", + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + data_processor_fn = math_data_processor + else: + raise ValueError(f"Unknown dataset {dataset_name}.") + rekeyed_ds = base_dataset.rekeyed_ds + + env = MathEnvironment.options( runtime_env={ "py_executable": get_actor_python_env( "nemo_rl.environments.math_environment.MathEnvironment" @@ -84,14 +183,14 @@ def setup_data(tokenizer: AutoTokenizer, data_config: MathDataConfig, env_config ).remote(env_configs["math"]) dataset = AllTaskProcessedDataset( - dataset=remapped_dataset, + dataset=rekeyed_ds, tokenizer=tokenizer, - default_task_data_spec=math_task_spec, - task_data_processors=math_data_processor, + default_task_data_spec=base_dataset.task_spec, + task_data_processors=data_processor_fn, max_seq_length=data_config["max_input_seq_length"], ) - return dataset, math_env, tokenizer + return dataset, env, tokenizer def main(): @@ -100,7 +199,9 @@ def main(): args, overrides = parse_args() if not args.config: - args.config = os.path.join(os.path.dirname(__file__), "configs", "eval.yaml") + args.config = os.path.join( + os.path.dirname(__file__), "configs", "mmlu_eval.yaml" + ) config = OmegaConf.load(args.config) print(f"Loaded configuration from: {args.config}") @@ -129,7 +230,7 @@ def main(): # Setup data ( dataset, - math_env, + env, tokenizer, ) = setup_data(tokenizer, config["data"], config["env"]) @@ -144,7 +245,7 @@ def main(): run_env_eval( vllm_generation, dataloader, - math_env, + env, master_config, ) diff --git a/nemo_rl/evals/answer_parsing.py b/nemo_rl/evals/answer_parsing.py new file mode 100644 index 0000000000..d4e2fddd6f --- /dev/null +++ b/nemo_rl/evals/answer_parsing.py @@ -0,0 +1,94 @@ +"""Contains utility functions for answer parsing.""" + + +MULTILINGUAL_ANSWER_PATTERN_TEMPLATE = ( + "(?i){}[ 
\t]*([A-D]|[أ-د]|[অ]|[ব]|[ড]|[ঢ]|[A]|[B]|[C]|[D])" +) +# All the different ways "Answer" is written in different languages +MULTILINGUAL_ANSWER_REGEXES = [ + "Answer\s*:", + "Answer\s*:​​​​​​", # Korean invisible character + "উত্তর\s*:", + "उत्तर\s*:", + "উত্তরঃ", + "উত্তর\s*:", + "Antwort\s*:", + "답변\s*:", + "정답\s*:", + "답\s*:", + "答案\s*:", + "答案\s*:", + "答\s*:", + "答\s*:", + "答复\s*:", + "答曰\s*:", + "الإجابة:", + "الجواب:", + "إجابة:", + "الإجابة النهائية:", + "الإجابة الصحيحة:", + "الإجابة الصحيحة هي:", + "الإجابة هي:", + "الجواب النهائي:", + "Respuesta\s*:", + "Risposta\s*:", + "答え\s*:", + "答え\s*:", + "回答\s*:", + "回答\s*:", + "解答\s*:", + "Jawaban\s*:", + "Réponse\s*:", + "Resposta\s*:", + "Jibu\s*:", + "Idahun\s*:", + "Ìdáhùn\s*:", + "Idáhùn\s*:", + "Àmọ̀nà\s*:", + "Àdáhùn\s*:", + "Ànúgọ\s*:", + "Àṣàyàn\s*:", +] + + +def normalize_extracted_answer(extracted_answer: str) -> str: + return ( + # In arabic these are the letters used for A-D in multiple choice questions + extracted_answer.replace("أ", " A") + .replace("ب", " B") + .replace("ج", " C") + .replace("د", " D") + # In Bengali these are the letters used for A-D in multiple choice questions + .replace("অ", " A") + .replace("ব", " B") + .replace("ড", " C") + .replace("ঢ", " D") + # In Japanese these are the letters sometimes used for A-D in multiple choice questions + .replace("A", " A") + .replace("B", " B") + .replace("C", " C") + .replace("D", " D") + .strip() + ) + + +def normalize_response(response: str) -> str: + """ + Normalize the response by removing markdown and LaTeX formatting that may prevent a match. 
+ """ + + return ( + response.replace("**", "") + .replace("$\\boxed{", "") + .replace("}$", "") + .replace("\\$", "") + .replace("$\\text{", "") + .replace("$", "") + .replace("\\mathrm{", "") + .replace("\\{", "") + .replace("\\text", "") + .replace("\\(", "") + .replace("\\mathbf{", "") + .replace("{", "") + .replace("\\boxed", "") + ) From 2163cbf2047af52152524a9d2362046ac3b9c5c1 Mon Sep 17 00:00:00 2001 From: Xuehan Xiong Date: Tue, 24 Jun 2025 19:04:15 +0000 Subject: [PATCH 05/44] add eval config files. Signed-off-by: Xuehan Xiong Signed-off-by: Xuehan --- examples/configs/{ => evals}/eval.yaml | 0 examples/configs/evals/gpqa_eval.yaml | 42 ++++++++++++++++++++++++++ examples/configs/evals/math_eval.yaml | 41 +++++++++++++++++++++++++ 3 files changed, 83 insertions(+) rename examples/configs/{ => evals}/eval.yaml (100%) create mode 100644 examples/configs/evals/gpqa_eval.yaml create mode 100644 examples/configs/evals/math_eval.yaml diff --git a/examples/configs/eval.yaml b/examples/configs/evals/eval.yaml similarity index 100% rename from examples/configs/eval.yaml rename to examples/configs/evals/eval.yaml diff --git a/examples/configs/evals/gpqa_eval.yaml b/examples/configs/evals/gpqa_eval.yaml new file mode 100644 index 0000000000..93b991b185 --- /dev/null +++ b/examples/configs/evals/gpqa_eval.yaml @@ -0,0 +1,42 @@ +# Evaluation Configuration +eval: + metric: "pass@1" # only pass@1 is supported now + num_tests_per_prompt: 1 # every prompt will be tested num_tests_per_prompt times and use the average score as the final score + seed: 42 + +generation: + backend: "vllm" # only vllm is supported for evaluation + max_new_tokens: ${generation.vllm_cfg.max_model_len} + temperature: 0.0 + top_p: 1.0 + top_k: -1 # -1 means disable + num_prompts_per_step: 16 # -1 means pass all prompts at once + model_name: "Qwen/Qwen2.5-7B-Instruct" + stop_token_ids: null + stop_strings: null + vllm_cfg: + async_engine: false + precision: "bfloat16" + tensor_parallel_size: 1 + 
pipeline_parallel_size: 1 + gpu_memory_utilization: 0.9 + max_model_len: 3072 + +tokenizer: + name: ${generation.model_name} ## specify if you'd like to use a tokenizer different from the model's default + chat_template: "default" + +data: + max_input_seq_length: ${generation.vllm_cfg.max_model_len} # useless since we directly use prompts in evaluation + prompt_file: "examples/prompts/gpqa.txt" + system_prompt_file: null + dataset_name: "gpqa" + +env: + math: + num_workers: 8 + verifier_type: "multichoice" + +cluster: + gpus_per_node: 1 + num_nodes: 1 diff --git a/examples/configs/evals/math_eval.yaml b/examples/configs/evals/math_eval.yaml new file mode 100644 index 0000000000..32a4a3281c --- /dev/null +++ b/examples/configs/evals/math_eval.yaml @@ -0,0 +1,41 @@ +# Evaluation Configuration +eval: + metric: "pass@1" # only pass@1 is supported now + num_tests_per_prompt: 1 # every prompt will be tested num_tests_per_prompt times and use the average score as the final score + seed: 42 + +generation: + backend: "vllm" # only vllm is supported for evaluation + max_new_tokens: ${generation.vllm_cfg.max_model_len} + temperature: 0.0 + top_p: 1.0 + top_k: -1 # -1 means disable + num_prompts_per_step: 16 # -1 means pass all prompts at once + model_name: "Qwen/Qwen2.5-7B-Instruct" + stop_token_ids: null + stop_strings: null + vllm_cfg: + async_engine: false + precision: "bfloat16" + tensor_parallel_size: 1 + pipeline_parallel_size: 1 + gpu_memory_utilization: 0.9 + max_model_len: 2048 + +tokenizer: + name: ${generation.model_name} ## specify if you'd like to use a tokenizer different from the model's default + chat_template: "default" + +data: + max_input_seq_length: ${generation.vllm_cfg.max_model_len} # useless since we directly use prompts in evaluation + prompt_file: "examples/prompts/cot.txt" + system_prompt_file: null + dataset_name: "math" + +env: + math: + num_workers: 8 + +cluster: + gpus_per_node: 1 + num_nodes: 1 From d9dd544d1da60536dd3ae3787f09dea1355105d5 Mon 
Sep 17 00:00:00 2001 From: Xuehan Xiong Date: Tue, 24 Jun 2025 21:14:18 +0000 Subject: [PATCH 06/44] add unit tests. Signed-off-by: Xuehan Xiong xxman@google.com Signed-off-by: Xuehan Xiong Signed-off-by: Xuehan --- nemo_rl/data/eval_datasets/mmlu.py | 4 +- tests/unit/data/eval_datasets/test_gpqa.py | 40 +++++++++ tests/unit/data/eval_datasets/test_math.py | 39 +++++++++ tests/unit/data/eval_datasets/test_mmlu.py | 41 ++++++++++ .../environments/test_math_environment.py | 81 +++++++++++++++++++ 5 files changed, 203 insertions(+), 2 deletions(-) create mode 100644 tests/unit/data/eval_datasets/test_gpqa.py create mode 100644 tests/unit/data/eval_datasets/test_math.py create mode 100644 tests/unit/data/eval_datasets/test_mmlu.py diff --git a/nemo_rl/data/eval_datasets/mmlu.py b/nemo_rl/data/eval_datasets/mmlu.py index f0f126850a..f6ab075886 100644 --- a/nemo_rl/data/eval_datasets/mmlu.py +++ b/nemo_rl/data/eval_datasets/mmlu.py @@ -8,8 +8,8 @@ class MMLUDataset: - def __init__(self, prompt_file: str, system_prompt_file: Optional[str] = None): - ds = load_dataset('csv', data_files=f"https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", split='train') + def __init__(self, prompt_file: Optional[str] = None, system_prompt_file: Optional[str] = None): + ds = load_dataset('csv', data_files="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", split='train') self.rekeyed_ds = ds.map(self._rekey, remove_columns=ds.column_names) self.task_spec = TaskDataSpec( diff --git a/tests/unit/data/eval_datasets/test_gpqa.py b/tests/unit/data/eval_datasets/test_gpqa.py new file mode 100644 index 0000000000..033a11b6ff --- /dev/null +++ b/tests/unit/data/eval_datasets/test_gpqa.py @@ -0,0 +1,40 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pytest +from transformers import AutoTokenizer + +from nemo_rl.data.eval_datasets.gpqa import GPQADataset + + +@pytest.mark.skip(reason="dataset download is flaky") +def test_gpqa_dataset(): + tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct") + gpqa_dataset = GPQADataset() + + # check that the dataset is formatted correctly + for example in gpqa_dataset.rekeyed_ds.take(5): + assert "question" in example + assert "options" in example + assert "answer" in example + + ## check that applying chat template works as expected + default_templated = tokenizer.apply_chat_template( + [{"role": "user", "content": example["question"]}], + tokenize=False, + add_generation_prompt=False, + add_special_tokens=False, + ) + + assert default_templated == f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n{example["question"]}<|im_end|>\n" + diff --git a/tests/unit/data/eval_datasets/test_math.py b/tests/unit/data/eval_datasets/test_math.py new file mode 100644 index 0000000000..7a524654fa --- /dev/null +++ b/tests/unit/data/eval_datasets/test_math.py @@ -0,0 +1,39 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pytest +from transformers import AutoTokenizer + +from nemo_rl.data.eval_datasets.math import MathDataset + + +@pytest.mark.skip(reason="dataset download is flaky") +def test_math_dataset(): + tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct") + math_dataset = MathDataset() + + # check that the dataset is formatted correctly + for example in math_dataset.rekeyed_ds.take(5): + assert "problem" in example + assert "expected_answer" in example + + ## check that applying chat template works as expected + default_templated = tokenizer.apply_chat_template( + [{"role": "user", "content": example["problem"]}], + tokenize=False, + add_generation_prompt=False, + add_special_tokens=False, + ) + + assert default_templated == f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n{example["problem"]}<|im_end|>\n" + diff --git a/tests/unit/data/eval_datasets/test_mmlu.py b/tests/unit/data/eval_datasets/test_mmlu.py new file mode 100644 index 0000000000..df5dabaef9 --- /dev/null +++ b/tests/unit/data/eval_datasets/test_mmlu.py @@ -0,0 +1,41 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pytest +from transformers import AutoTokenizer + +from nemo_rl.data.eval_datasets.mmlu import MMLUDataset + + +@pytest.mark.skip(reason="dataset download is flaky") +def test_mmlu_dataset(): + tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct") + mmlu_dataset = MMLUDataset() + + # check that the dataset is formatted correctly + for example in mmlu_dataset.rekeyed_ds.take(5): + assert "question" in example + assert "options" in example + assert "answer" in example + assert "subject" in example + + ## check that applying chat template works as expected + default_templated = tokenizer.apply_chat_template( + [{"role": "user", "content": example["question"]}], + tokenize=False, + add_generation_prompt=False, + add_special_tokens=False, + ) + + assert default_templated == f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\n<|im_start|>user\n{example["question"]}<|im_end|>\n" + diff --git a/tests/unit/environments/test_math_environment.py b/tests/unit/environments/test_math_environment.py index 386a21e2f8..ed599bcd5e 100644 --- a/tests/unit/environments/test_math_environment.py +++ b/tests/unit/environments/test_math_environment.py @@ -42,6 +42,25 @@ def math_env(): time.sleep(0.1) +@pytest.fixture(scope="module") +def multichoice_env(): + """Create a MathEnvironment actor for testing.""" + env = MathEnvironment.options( + runtime_env={ + "py_executable": get_actor_python_env( + "nemo_rl.environments.math_environment.MathEnvironment" + ), + "env_vars": dict(os.environ), + } + ).remote({"num_workers": 2, "verifier_type": "multichoice"}) + yield env + # Clean up the actor and wait for it to be killed + env.shutdown.remote() + ray.kill(env) + # Give some time for cleanup + time.sleep(0.1) + + @pytest.fixture def basic_test_data(): """Common test data for basic math problems.""" @@ -68,6 +87,32 @@ def basic_test_data(): } +@pytest.fixture +def basic_multichoice_test_data(): + """Common test data for basic multichoice problems.""" + return { + "message_log_batch": [ + [ + {"role": "user", "content": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD"}, + {"role": "assistant", "content": "\nAnswer: C"}, + ], + [ + {"role": "user", "content": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD"}, + {"role": "assistant", "content": "\nAnswer: B"}, + ], + [ + {"role": "user", "content": "Answer the following multiple choice question. 
 The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD"}, + {"role": "assistant", "content": "\nAnswer: D"}, + ], + ], + "metadata": [ + {"ground_truth": "C"}, + {"ground_truth": "B"}, + {"ground_truth": "B"}, + ], + } + + @pytest.fixture def mixed_test_data(): """Test data with mix of correct and incorrect responses.""" @@ -148,6 +193,42 @@ def test_math_env_step_basic(math_env, basic_test_data): assert all(result.terminateds == 1.0), "All terminated flags should be 1.0" +def test_multichoice_env_step_basic(multichoice_env, basic_multichoice_test_data): + """Test basic functionality of MathEnvironment step with multichoice verifier.""" + result = ray.get( + multichoice_env.step.remote( + basic_multichoice_test_data["message_log_batch"], basic_multichoice_test_data["metadata"] + ) + ) + + # Check observations using field access + assert len(result.observations) == 3, ( + "Should return observations for all 3 messages" + ) + assert all(obs["role"] == "environment" for obs in result.observations), ( + "All observations should be from environment" + ) + assert all( + obs["content"] == "Environment: correct" for obs in result.observations[:2] + ), "The first two responses should be correct" + assert result.observations[2]["content"] == "Environment: incorrect", "The third response should be incorrect" + + # Check metadata + assert len(result.metadata) == 3, "Should return metadata for all 3 messages" + assert result.metadata == basic_multichoice_test_data["metadata"], ( + "Metadata should be unchanged" + ) + + # Check rewards and done flags + assert result.rewards.shape == (3,), "Rewards should be a tensor of shape (3,)" + assert all(result.rewards[:2] == 1.0), "The first two rewards should be 1.0 for correct answers" + assert result.rewards[2] == 0.0, "The third reward should be 0.0 for wrong answer" + assert result.terminateds.shape == (3,), ( + "Terminated flags should be a tensor of shape 
(3,)" + ) + assert all(result.terminateds == 1.0), "All terminated flags should be 1.0" + + def test_math_env_step_mixed(math_env, mixed_test_data): """Test MathEnvironment step with a mix of correct and incorrect responses.""" result = ray.get( From 11a1de5a9d12d3e6afd0c19578cece2cb17d1a0e Mon Sep 17 00:00:00 2001 From: Xuehan Xiong Date: Tue, 24 Jun 2025 21:39:26 +0000 Subject: [PATCH 07/44] add AIME 2024 dataset. Signed-off-by: Xuehan Xiong Signed-off-by: Xuehan --- examples/configs/evals/eval.yaml | 5 +---- examples/run_eval.py | 9 ++++++++- nemo_rl/data/eval_datasets/aime2024.py | 27 ++++++++++++++++++++++++++ 3 files changed, 36 insertions(+), 5 deletions(-) create mode 100644 nemo_rl/data/eval_datasets/aime2024.py diff --git a/examples/configs/evals/eval.yaml b/examples/configs/evals/eval.yaml index 263342306f..1c21af99c4 100644 --- a/examples/configs/evals/eval.yaml +++ b/examples/configs/evals/eval.yaml @@ -30,10 +30,7 @@ data: max_input_seq_length: ${generation.vllm_cfg.max_model_len} # useless since we directly use prompts in evaluation prompt_file: null system_prompt_file: null - dataset_name: "HuggingFaceH4/aime_2024" - dataset_key: "train" - problem_key: "problem" - solution_key: "answer" + dataset_name: "aime2024" env: math: diff --git a/examples/run_eval.py b/examples/run_eval.py index ae86046dbc..b9fb7e89bb 100644 --- a/examples/run_eval.py +++ b/examples/run_eval.py @@ -29,6 +29,7 @@ from nemo_rl.algorithms.utils import get_tokenizer from nemo_rl.data.datasets import AllTaskProcessedDataset from nemo_rl.data.eval_datasets import ( + aime2024, gpqa, math, mmlu, @@ -146,6 +147,12 @@ def setup_data(tokenizer: AutoTokenizer, data_config, env_configs): prompt_file=data_config["prompt_file"], system_prompt_file=data_config["system_prompt_file"], ) + elif dataset_name == "aime2024": + base_dataset = aime2024.AIME2024Dataset( + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + data_processor_fn = 
math_data_processor elif dataset_name == "gpqa": base_dataset = gpqa.GPQADataset( prompt_file=data_config["prompt_file"], @@ -200,7 +207,7 @@ def main(): if not args.config: args.config = os.path.join( - os.path.dirname(__file__), "configs", "mmlu_eval.yaml" + os.path.dirname(__file__), "configs", "evals", "eval.yaml" ) config = OmegaConf.load(args.config) diff --git a/nemo_rl/data/eval_datasets/aime2024.py b/nemo_rl/data/eval_datasets/aime2024.py new file mode 100644 index 0000000000..1eff661718 --- /dev/null +++ b/nemo_rl/data/eval_datasets/aime2024.py @@ -0,0 +1,27 @@ +"""AIME 2024 dataset.""" + +from typing import Any, Literal, Optional + +from datasets import load_dataset + +from nemo_rl.data.interfaces import TaskDataSpec + + +class AIME2024Dataset: + def __init__(self, + prompt_file: Optional[str]=None, + system_prompt_file: Optional[str]=None, + ): + ds = load_dataset("HuggingFaceH4/aime_2024", split="train") + self.rekeyed_ds = ds.map(self._rekey, remove_columns=ds.column_names) + self.task_spec = TaskDataSpec( + task_name="aime2024", + prompt_file=prompt_file, + system_prompt_file=system_prompt_file, + ) + + def _rekey(self, data: dict[str, Any]): + return { + 'problem': data['problem'], + 'expected_answer': data['answer'], + } From 4da0c4315108f1fbe0e68e3a9e959bd384ce42c0 Mon Sep 17 00:00:00 2001 From: Xuehan Xiong Date: Wed, 25 Jun 2025 17:16:14 +0000 Subject: [PATCH 08/44] add GPQA main version. 
Signed-off-by: Xuehan Xiong xxman@google.com Signed-off-by: Xuehan Xiong Signed-off-by: Xuehan --- examples/configs/evals/gpqa_eval.yaml | 2 +- examples/run_eval.py | 7 +++++++ nemo_rl/data/eval_datasets/gpqa.py | 22 ++++++++++++---------- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/examples/configs/evals/gpqa_eval.yaml b/examples/configs/evals/gpqa_eval.yaml index 93b991b185..b882c1acd8 100644 --- a/examples/configs/evals/gpqa_eval.yaml +++ b/examples/configs/evals/gpqa_eval.yaml @@ -10,7 +10,7 @@ generation: temperature: 0.0 top_p: 1.0 top_k: -1 # -1 means disable - num_prompts_per_step: 16 # -1 means pass all prompts at once + num_prompts_per_step: -1 # -1 means pass all prompts at once model_name: "Qwen/Qwen2.5-7B-Instruct" stop_token_ids: null stop_strings: null diff --git a/examples/run_eval.py b/examples/run_eval.py index b9fb7e89bb..117db1deab 100644 --- a/examples/run_eval.py +++ b/examples/run_eval.py @@ -155,6 +155,13 @@ def setup_data(tokenizer: AutoTokenizer, data_config, env_configs): data_processor_fn = math_data_processor elif dataset_name == "gpqa": base_dataset = gpqa.GPQADataset( + variant="main", + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + elif dataset_name == "gpqa_diamond": + base_dataset = gpqa.GPQADataset( + variant="diamond", prompt_file=data_config["prompt_file"], system_prompt_file=data_config["system_prompt_file"], ) diff --git a/nemo_rl/data/eval_datasets/gpqa.py b/nemo_rl/data/eval_datasets/gpqa.py index 1287e446a0..0662a1a5e2 100644 --- a/nemo_rl/data/eval_datasets/gpqa.py +++ b/nemo_rl/data/eval_datasets/gpqa.py @@ -1,25 +1,28 @@ """GPQA dataset and its variants.""" import random -from typing import Any, Optional +from typing import Any, Literal, Optional from datasets import load_dataset from nemo_rl.data.interfaces import TaskDataSpec - class GPQADataset: - def __init__(self, variant: str = "diamond", prompt_file: Optional[str]=None, system_prompt_file: 
Optional[str]=None): - ds = load_dataset("csv", data_files=f"https://openaipublic.blob.core.windows.net/simple-evals/gpqa_{variant}.csv", split="train") + def __init__(self, + variant: Literal["diamond", "main"] = "diamond", + prompt_file: Optional[str] = None, + system_prompt_file: Optional[str]=None, + ): + ds = load_dataset("Idavidrein/gpqa", f"gpqa_{variant}", split="train") self._rng = random.Random() self.rekeyed_ds = ds.map(self._rekey, remove_columns=ds.column_names) self.task_spec = TaskDataSpec( - task_name=f"GPQA_{variant}", + task_name=f'GPQA_{variant}', prompt_file=prompt_file, system_prompt_file=system_prompt_file, ) - + def _rekey(self, data: dict[str, Any]): choices = [ data["Correct Answer"], @@ -32,13 +35,12 @@ def _rekey(self, data: dict[str, Any]): correct_index = choices.index(data["Correct Answer"]) correct_answer = "ABCD"[correct_index] return { - "question": data["Question"], - "options": dict( + 'question': data['Question'], + 'options': dict( A=choices[0], B=choices[1], C=choices[2], D=choices[3], ), - "answer": correct_answer, + 'answer': correct_answer, } - From 5870e46466f901540f0bf60fa35aacef86b1a504 Mon Sep 17 00:00:00 2001 From: yuki <48991475+yuki-666@users.noreply.github.com> Date: Fri, 27 Jun 2025 01:20:58 +0800 Subject: [PATCH 09/44] fix: remove reference_model_buffers in fsdp2 (#558) Signed-off-by: Yuki Huang Signed-off-by: Xuehan --- .../models/policy/dtensor_policy_worker.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/nemo_rl/models/policy/dtensor_policy_worker.py b/nemo_rl/models/policy/dtensor_policy_worker.py index 70e5617040..61dcd9a127 100644 --- a/nemo_rl/models/policy/dtensor_policy_worker.py +++ b/nemo_rl/models/policy/dtensor_policy_worker.py @@ -235,9 +235,6 @@ def __init__( self.reference_model_state_dict = get_cpu_state_dict( self.model.state_dict().items(), pin_memory=True ) - self.reference_model_buffers = get_cpu_state_dict( - self.model.named_buffers(), pin_memory=True 
- ) if init_optimizer: optimizer_cls = import_class_from_path(self.cfg["optimizer"]["name"]) @@ -768,32 +765,26 @@ def use_reference_model(self) -> Generator[None, None, None]: """ with torch.no_grad(): try: + # Save train model state_dict curr_state_dict = get_cpu_state_dict( self.model.state_dict().items(), pin_memory=True ) - curr_buffers = get_cpu_state_dict( - self.model.named_buffers(), pin_memory=True - ) + # Swap reference model state_dict to self.model for k, v in self.model.state_dict().items(): val = to_local_if_dtensor(v) val.copy_(self.reference_model_state_dict[k]) - for k, v in self.model.named_buffers(): - val = to_local_if_dtensor(v) - val.copy_(self.reference_model_buffers[k]) - + # - self.model is the original reference_model, now on CUDA + # - curr_state_dict is the train model, now on CPU yield finally: + # Restore train model state_dict for k, v in self.model.state_dict().items(): val = to_local_if_dtensor(v) val.copy_(curr_state_dict[k]) - for k, v in self.model.named_buffers(): - val = to_local_if_dtensor(v) - val.copy_(curr_buffers[k]) - def get_reference_policy_logprobs( self, data: BatchedDataDict[Any], micro_batch_size: Optional[int] = None ) -> BatchedDataDict[ReferenceLogprobOutputSpec]: From 79690a11de285f990fea4e1e4d5b40a0167f9b1b Mon Sep 17 00:00:00 2001 From: Parth Chadha Date: Thu, 26 Jun 2025 13:00:18 -0700 Subject: [PATCH 10/44] fix: Add assertion if async is disabled when using pp with vllm (#565) Signed-off-by: Parth Chadha Signed-off-by: Xuehan --- examples/configs/grpo_math_1B.yaml | 2 +- nemo_rl/models/generation/vllm.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml index 283a3d9c31..85cc620b62 100644 --- a/examples/configs/grpo_math_1B.yaml +++ b/examples/configs/grpo_math_1B.yaml @@ -101,7 +101,7 @@ policy: stop_token_ids: null stop_strings: null vllm_cfg: - async_engine: false # Only for internal testing, will be enabled by 
https://github.com/NVIDIA/NeMo-RL/issues/447. + async_engine: false precision: ${policy.precision} tensor_parallel_size: 1 pipeline_parallel_size: 1 diff --git a/nemo_rl/models/generation/vllm.py b/nemo_rl/models/generation/vllm.py index 3bf64b2652..0b0bb00ad6 100644 --- a/nemo_rl/models/generation/vllm.py +++ b/nemo_rl/models/generation/vllm.py @@ -1100,6 +1100,12 @@ def __init__( """Initialize a vLLM policy with distributed workers.""" # Store config self.cfg = config + if self.cfg["vllm_cfg"]["pipeline_parallel_size"] > 1: + assert self.cfg["vllm_cfg"]["async_engine"], ( + "When pipeline_parallel_size > 1, async_engine must be set to True in the vLLM configuration. " + "You can enable it by adding `policy.generation.vllm_cfg.async_engine=true` to your command." + ) + # Ensure all required VllmConfig fields are present missing_keys = [ key for key in VllmConfig.__required_keys__ if key not in self.cfg From 940049f1e1c1661b312df00a02cebf3a19b02dfc Mon Sep 17 00:00:00 2001 From: Parth Chadha Date: Thu, 26 Jun 2025 13:14:17 -0700 Subject: [PATCH 11/44] fix: remove visualization code (#566) Signed-off-by: Yuki Huang Signed-off-by: Parth Chadha Co-authored-by: yuki <48991475+yuki-666@users.noreply.github.com> Signed-off-by: Xuehan --- nemo_rl/distributed/virtual_cluster.py | 241 +----------------- nemo_rl/distributed/worker_groups.py | 7 - .../distributed/test_cluster_visualization.py | 132 ---------- 3 files changed, 1 insertion(+), 379 deletions(-) delete mode 100644 tests/unit/distributed/test_cluster_visualization.py diff --git a/nemo_rl/distributed/virtual_cluster.py b/nemo_rl/distributed/virtual_cluster.py index 3c7a557c24..22fe0bd670 100644 --- a/nemo_rl/distributed/virtual_cluster.py +++ b/nemo_rl/distributed/virtual_cluster.py @@ -15,7 +15,7 @@ import os import sys import time -from typing import Any, Optional, TypedDict +from typing import Optional, TypedDict import ray from ray.util.placement_group import ( @@ -395,245 +395,6 @@ def shutdown(self) -> bool: 
return True - def _create_visualization_grid( - self, worker_groups: Optional[Any] = None, is_global_view: bool = False - ) -> dict[str, Any]: - """Create a visualization grid for the cluster with optional worker groups. - - Args: - worker_groups: Single worker group, list of worker groups, or None - is_global_view: Whether this is a global view (multiple worker groups) or single view - - Returns: - dict: A dictionary containing the grid data for display - """ - # Convert single worker group to list for uniform processing - if worker_groups is not None and not isinstance(worker_groups, list): - worker_groups = [worker_groups] - elif worker_groups is None: - worker_groups = [] - - # Find the maximum number of GPUs per node for grid layout - max_gpus_per_node = ( - max(self._bundle_ct_per_node_list) if self._bundle_ct_per_node_list else 0 - ) - if max_gpus_per_node == 0: - return {"empty": True} - - # Number of nodes with GPUs - active_nodes = sum(1 for count in self._bundle_ct_per_node_list if count > 0) - - # Determine cell width based on view type - cell_width = 12 if is_global_view else 7 - - # Create horizontal divider based on max GPUs per node - h_divider = "+" + "+".join(["-" * cell_width] * max_gpus_per_node) + "+" - - # Build the grid data - grid_data = { - "active_nodes": active_nodes, - "total_gpus": self.world_size(), - "worker_groups": worker_groups, - "max_gpus_per_node": max_gpus_per_node, - "cell_width": cell_width, - "h_divider": h_divider, - "is_global_view": is_global_view, - "rows": [], - } - - # For each node, create its row in the grid - for node_idx, bundle_count in enumerate(self._bundle_ct_per_node_list): - if bundle_count == 0: - continue - - # Initialize row data - node_row = { - "node_idx": node_idx, - "bundle_count": bundle_count, - "gpu_cells": [], - "worker_cells": [], - } - - # Initialize worker cells arrays (one per worker group) - for i in range(len(worker_groups)): - node_row["worker_cells"].append([]) # type: ignore - - # Process 
each GPU position in the row - for gpu_idx in range(max_gpus_per_node): - if gpu_idx < bundle_count: - # This is a real GPU - gpu_cell = f" {node_idx}.{gpu_idx} " - - # Process worker assignments for this GPU - worker_cells = self._get_worker_cells( - node_idx, gpu_idx, worker_groups, cell_width, is_global_view - ) - else: - # Empty cell (no GPU) - gpu_cell = " " * cell_width - worker_cells = [" " * cell_width] * len(worker_groups) - - # Add cells to the row - node_row["gpu_cells"].append(gpu_cell) # type: ignore - for i, cell in enumerate(worker_cells): - if i < len(node_row["worker_cells"]): # type: ignore - node_row["worker_cells"][i].append(cell) # type: ignore - - # Add the completed row to the grid - grid_data["rows"].append(node_row) - - return grid_data - - def _get_worker_cells( - self, - node_idx: int, - gpu_idx: int, - worker_groups: list[Any], - cell_width: int, - is_global_view: bool, - ) -> list[str]: - """Get the worker cell content for each worker group at a specific GPU location. 
- - Args: - node_idx: The node index - gpu_idx: The GPU index within the node - worker_groups: List of worker groups to check - cell_width: Width of each cell for formatting - is_global_view: Whether this is a global view with multiple worker groups - - Returns: - list: List of formatted worker cells, one per worker group - """ - worker_cells = [] - - for wg_idx, worker_group in enumerate(worker_groups): - # Default empty worker cell - worker_cell = " " * cell_width - - # Find workers from this group assigned to this GPU - for worker_id, metadata in enumerate(worker_group.worker_metadata): - if ( - metadata["node_idx"] == node_idx - and metadata["local_rank"] == gpu_idx - ): - if is_global_view: - # Use group numbering in global view - worker_cell = f" G{wg_idx}:W{worker_id:<2d} " - else: - # Use simple worker IDs in single group view - worker_cell = f" W {worker_id:<2d} " - break - - worker_cells.append(worker_cell) - - return worker_cells - - def _print_visualization(self, grid_data: dict[str, Any]) -> None: - """Print the visualization based on the grid data. 
- - Args: - grid_data: The grid data generated by _create_visualization_grid - """ - if grid_data.get("empty", False): - print("\nEmpty Ray Cluster (no GPUs)") - return - - # Print header - if grid_data["is_global_view"]: - # Global view header - wg_summary = "" - if grid_data["worker_groups"]: - wg_summary = f", Worker Groups: {len(grid_data['worker_groups'])}" - - print( - f"\nRay Cluster Global View: {grid_data['active_nodes']} nodes, {grid_data['total_gpus']} GPUs{wg_summary}" - ) - else: - # Single view header - wg_info = "" - if grid_data["worker_groups"]: - worker_group = grid_data["worker_groups"][0] - wg_name = getattr(worker_group, "name_prefix", "Default") or "Default" - wg_info = ( - f", Worker Group: {wg_name} ({worker_group.world_size} workers)" - ) - - print( - f"\nRay Cluster: {grid_data['active_nodes']} nodes, {grid_data['total_gpus']} GPUs{wg_info}" - ) - - # Print the top border - print(grid_data["h_divider"]) - - # Print each row of the grid - for row in grid_data["rows"]: - # Print GPU row - gpu_row = ["|"] - for cell in row["gpu_cells"]: - gpu_row.append(cell.ljust(grid_data["cell_width"])) - gpu_row.append("|") - print("".join(gpu_row)) - - # Print worker rows - for wg_idx, worker_cells in enumerate(row["worker_cells"]): - worker_row = ["|"] - for cell in worker_cells: - worker_row.append(cell.ljust(grid_data["cell_width"])) - worker_row.append("|") - print("".join(worker_row)) - - # Print divider between nodes - print(grid_data["h_divider"]) - - # Print legend - self._print_legend(grid_data) - - def _print_legend(self, grid_data: dict[str, Any]) -> None: - """Print the legend for the visualization.""" - if grid_data["is_global_view"]: - # Legend for global view - if grid_data["worker_groups"]: - print("Legend:") - for wg_idx, wg in enumerate(grid_data["worker_groups"]): - wg_name = getattr(wg, "name_prefix", "unnamed") or "unnamed" - wg_count = wg.world_size - print(f"G{wg_idx}: {wg_name} ({wg_count} workers)") - print("W##: Worker ID within 
its group") - else: - # Legend for single worker group view - if grid_data["worker_groups"]: - wg_name = ( - getattr(grid_data["worker_groups"][0], "name_prefix", "") or "" - ) - print(f"W## = Worker ID in '{wg_name}' worker group") - - print("#.#: Node.GPU identifier") - - def print_cluster_grid(self, worker_group: Optional[Any] = None) -> None: - """Prints a compact grid visualization of the virtual cluster, similar to JAX's visualize_array_sharding. - - If a worker_group is provided, it will also show worker assignments on each device. - - Args: - worker_group: Optional RayWorkerGroup instance to visualize worker assignments - """ - grid_data = self._create_visualization_grid(worker_group, is_global_view=False) - self._print_visualization(grid_data) - - def print_all_worker_groups( - self, worker_groups: Optional[list[Any]] = None - ) -> None: - """Prints a visualization showing all worker groups in the cluster. - - This provides a global view of all workers across all worker groups. - - Args: - worker_groups: List of RayWorkerGroup instances to visualize. If None, - no worker assignments will be shown. - """ - grid_data = self._create_visualization_grid(worker_groups, is_global_view=True) - self._print_visualization(grid_data) - def __del__(self) -> None: """Shutsdown the virtual cluster when the object is deleted or is garbage collected. diff --git a/nemo_rl/distributed/worker_groups.py b/nemo_rl/distributed/worker_groups.py index 71190d8774..a283e6b18c 100644 --- a/nemo_rl/distributed/worker_groups.py +++ b/nemo_rl/distributed/worker_groups.py @@ -896,10 +896,3 @@ def shutdown( self._worker_metadata = [] return success - - def print_worker_layout(self) -> None: - """Prints a visual representation of the worker layout across the virtual cluster. - - This shows which workers are assigned to which nodes and GPUs. 
- """ - self.cluster.print_cluster_grid(self) diff --git a/tests/unit/distributed/test_cluster_visualization.py b/tests/unit/distributed/test_cluster_visualization.py deleted file mode 100644 index d6dc31e1a5..0000000000 --- a/tests/unit/distributed/test_cluster_visualization.py +++ /dev/null @@ -1,132 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from unittest.mock import MagicMock, patch - -import pytest - -from nemo_rl.distributed.virtual_cluster import RayVirtualCluster - - -@pytest.fixture(autouse=True) -def mock_virtual_cluster_pg(): - # Mock the _init_placement_groups and get_placement_groups methods to avoid actually initializing placement groups - with ( - patch( - "nemo_rl.distributed.virtual_cluster.RayVirtualCluster.get_placement_groups" - ) as mock_get_pg, - patch( - "nemo_rl.distributed.virtual_cluster.RayVirtualCluster._init_placement_groups" - ) as mock_init_pg, - ): - mock_get_pg.return_value = [] - mock_init_pg.return_value = [] - yield - - -def test_empty_cluster_visualization(capsys): - """Test visualization of an empty cluster.""" - # Create a empty cluster - cluster = RayVirtualCluster( - bundle_ct_per_node_list=[], - use_gpus=False, - name="test-empty", - ) - - # Test visualization - cluster.print_cluster_grid() - - # Capture the output - out, _ = capsys.readouterr() - assert "Empty Ray Cluster" in out - - -def test_cluster_grid(capsys): - """Test visualization of a 
cluster grid.""" - # Create a cluster with a configuration but don't actually allocate resources - cluster = RayVirtualCluster( - bundle_ct_per_node_list=[2, 3], - use_gpus=False, - name="test-visual", - max_colocated_worker_groups=1, - ) - - cluster.print_cluster_grid() - - # Capture the output - out, _ = capsys.readouterr() - print(out) - assert "Ray Cluster: 2 nodes, 5 GPUs" in out - assert "0.0" in out # First node, first GPU - assert "0.1" in out # First node, second GPU - assert "1.0" in out # Second node, first GPU - assert "1.2" in out # Second node, third GPU - - -def test_global_visualization_formatting(capsys): - """Test global visualization formatting without actual worker groups.""" - cluster = RayVirtualCluster( - bundle_ct_per_node_list=[2, 2], - use_gpus=False, - name="test-global", - max_colocated_worker_groups=1, - ) - - cluster.print_all_worker_groups([]) - - # Capture the output - out, _ = capsys.readouterr() - print(out) - assert "Ray Cluster Global View: 2 nodes, 4 GPUs" in out - - -def test_with_mock_worker_groups(capsys): - """Test visualization with mock worker groups.""" - # Create a cluster with a configuration - cluster = RayVirtualCluster( - bundle_ct_per_node_list=[2, 3], - use_gpus=False, - name="test-workers", - max_colocated_worker_groups=1, - ) - - worker_group1 = MagicMock() - worker_group1.name_prefix = "policy" - worker_group1.world_size = 2 - worker_group1.worker_metadata = [ - {"node_idx": 0, "local_rank": 0}, # First worker on node 0, GPU 0 - {"node_idx": 1, "local_rank": 0}, # Second worker on node 1, GPU 0 - ] - - worker_group2 = MagicMock() - worker_group2.name_prefix = "policy_generate" - worker_group2.world_size = 3 - worker_group2.worker_metadata = [ - {"node_idx": 0, "local_rank": 1}, # First worker on node 0, GPU 1 - {"node_idx": 1, "local_rank": 1}, # Second worker on node 1, GPU 1 - {"node_idx": 1, "local_rank": 2}, # Third worker on node 1, GPU 2 - ] - - cluster.print_all_worker_groups([worker_group1, 
worker_group2]) - - # Capture the output - out, _ = capsys.readouterr() - print(out) - - # Check for key elements in the output - assert "Ray Cluster Global View: 2 nodes, 5 GPUs" in out - assert "G0" in out # First worker group - assert "G1" in out # Second worker group - assert "policy" in out # First worker group name - assert "policy_generate" in out # Second worker group name From 745790ce8befa88f3610ec79854f4fecb546b8b8 Mon Sep 17 00:00:00 2001 From: Zhaocheng Zhu Date: Thu, 26 Jun 2025 15:09:12 -0700 Subject: [PATCH 12/44] Allow uneven shards for multi-GPU inference in vllm backend (#494) Signed-off-by: KiddoZhu Signed-off-by: Sahil Jain <48468750+SahilJain314@users.noreply.github.com> Co-authored-by: Sahil Jain <48468750+SahilJain314@users.noreply.github.com> Signed-off-by: Xuehan --- nemo_rl/models/generation/vllm.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/nemo_rl/models/generation/vllm.py b/nemo_rl/models/generation/vllm.py index 0b0bb00ad6..58795e5031 100644 --- a/nemo_rl/models/generation/vllm.py +++ b/nemo_rl/models/generation/vllm.py @@ -1387,12 +1387,11 @@ def generate_text( f"data must be a BatchedDataDict, got type: {type(data)}" ) - # Get total batch size - batch_size = len(data["prompts"]) - # Shard the data across the tied worker groups dp_size = self.sharding_annotations.get_axis_size("data_parallel") - sharded_data = data.shard_by_batch_size(dp_size, batch_size=batch_size) + sharded_data: list[SlicedDataDict] = data.shard_by_batch_size( + dp_size, allow_uneven_shards=True + ) future_bundle = self.worker_group.run_all_workers_sharded_data( "generate_text", sharded_data, From 9c59083d4124456a5ac941581ce237cb3dabaaef Mon Sep 17 00:00:00 2001 From: Xuehan Xiong Date: Wed, 25 Jun 2025 17:16:14 +0000 Subject: [PATCH 13/44] add GPQA main version. 
Signed-off-by: Xuehan Xiong xxman@google.com Signed-off-by: Xuehan --- examples/configs/evals/gpqa_eval.yaml | 31 +--- examples/configs/evals/local_eval.yaml | 14 ++ examples/configs/evals/math_eval.yaml | 36 +--- examples/run_eval.py | 138 +--------------- examples/run_grpo_math.py | 71 +------- nemo_rl/data/eval_datasets/__init__.py | 88 ++++++++++ nemo_rl/data/eval_datasets/aime2024.py | 17 +- nemo_rl/data/eval_datasets/gpqa.py | 7 +- .../data/eval_datasets/local_math_dataset.py | 40 +++++ nemo_rl/data/eval_datasets/math.py | 28 ++-- nemo_rl/data/eval_datasets/mmlu.py | 33 ++-- nemo_rl/data/eval_datasets/mmlu_pro.py | 19 +-- nemo_rl/data/processors.py | 155 ++++++++++++++++++ 13 files changed, 369 insertions(+), 308 deletions(-) create mode 100644 examples/configs/evals/local_eval.yaml create mode 100644 nemo_rl/data/eval_datasets/local_math_dataset.py create mode 100644 nemo_rl/data/processors.py diff --git a/examples/configs/evals/gpqa_eval.yaml b/examples/configs/evals/gpqa_eval.yaml index b882c1acd8..463702d3a4 100644 --- a/examples/configs/evals/gpqa_eval.yaml +++ b/examples/configs/evals/gpqa_eval.yaml @@ -1,42 +1,15 @@ -# Evaluation Configuration -eval: - metric: "pass@1" # only pass@1 is supported now - num_tests_per_prompt: 1 # every prompt will be tested num_tests_per_prompt times and use the average score as the final score - seed: 42 +# GPQA evaluation Configuration +defaults: "eval.yaml" generation: - backend: "vllm" # only vllm is supported for evaluation - max_new_tokens: ${generation.vllm_cfg.max_model_len} - temperature: 0.0 - top_p: 1.0 - top_k: -1 # -1 means disable - num_prompts_per_step: -1 # -1 means pass all prompts at once model_name: "Qwen/Qwen2.5-7B-Instruct" - stop_token_ids: null - stop_strings: null vllm_cfg: - async_engine: false - precision: "bfloat16" - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - gpu_memory_utilization: 0.9 max_model_len: 3072 -tokenizer: - name: ${generation.model_name} ## specify if you'd like to use a 
tokenizer different from the model's default - chat_template: "default" - data: - max_input_seq_length: ${generation.vllm_cfg.max_model_len} # useless since we directly use prompts in evaluation prompt_file: "examples/prompts/gpqa.txt" - system_prompt_file: null dataset_name: "gpqa" env: math: - num_workers: 8 verifier_type: "multichoice" - -cluster: - gpus_per_node: 1 - num_nodes: 1 diff --git a/examples/configs/evals/local_eval.yaml b/examples/configs/evals/local_eval.yaml new file mode 100644 index 0000000000..ad9def2112 --- /dev/null +++ b/examples/configs/evals/local_eval.yaml @@ -0,0 +1,14 @@ +# Evaluation Configuration from local files. +defaults: "eval.yaml" + +generation: + model_name: "Qwen/Qwen2.5-7B-Instruct" + +data: + prompt_file: "examples/prompts/cot.txt" + dataset_name: "local" + problem_key: "Question" + solution_key: "Answer" + split: "train" + data_paths: "https:\/\/openaipublic.blob.core.windows.net\/simple-evals\/math_500_test.csv" + file_format: "csv" diff --git a/examples/configs/evals/math_eval.yaml b/examples/configs/evals/math_eval.yaml index 32a4a3281c..b42956866d 100644 --- a/examples/configs/evals/math_eval.yaml +++ b/examples/configs/evals/math_eval.yaml @@ -1,41 +1,9 @@ -# Evaluation Configuration -eval: - metric: "pass@1" # only pass@1 is supported now - num_tests_per_prompt: 1 # every prompt will be tested num_tests_per_prompt times and use the average score as the final score - seed: 42 +# Math evaluation Configuration +defaults: "eval.yaml" generation: - backend: "vllm" # only vllm is supported for evaluation - max_new_tokens: ${generation.vllm_cfg.max_model_len} - temperature: 0.0 - top_p: 1.0 - top_k: -1 # -1 means disable - num_prompts_per_step: 16 # -1 means pass all prompts at once model_name: "Qwen/Qwen2.5-7B-Instruct" - stop_token_ids: null - stop_strings: null - vllm_cfg: - async_engine: false - precision: "bfloat16" - tensor_parallel_size: 1 - pipeline_parallel_size: 1 - gpu_memory_utilization: 0.9 - max_model_len: 2048 
- -tokenizer: - name: ${generation.model_name} ## specify if you'd like to use a tokenizer different from the model's default - chat_template: "default" data: - max_input_seq_length: ${generation.vllm_cfg.max_model_len} # useless since we directly use prompts in evaluation prompt_file: "examples/prompts/cot.txt" - system_prompt_file: null dataset_name: "math" - -env: - math: - num_workers: 8 - -cluster: - gpus_per_node: 1 - num_nodes: 1 diff --git a/examples/run_eval.py b/examples/run_eval.py index 117db1deab..89e2ede395 100644 --- a/examples/run_eval.py +++ b/examples/run_eval.py @@ -16,26 +16,15 @@ import os import pprint import sys -from typing import Any, cast - -import torch sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from omegaconf import OmegaConf from transformers import AutoTokenizer, PreTrainedTokenizerBase -from examples.run_grpo_math import math_data_processor from nemo_rl.algorithms.utils import get_tokenizer from nemo_rl.data.datasets import AllTaskProcessedDataset -from nemo_rl.data.eval_datasets import ( - aime2024, - gpqa, - math, - mmlu, - mmlu_pro, -) -from nemo_rl.data.interfaces import DatumSpec, TaskDataSpec +from nemo_rl.data.eval_datasets import load_eval_dataset from nemo_rl.distributed.ray_actor_environment_registry import ( get_actor_python_env, ) @@ -43,6 +32,7 @@ from nemo_rl.environments.math_environment import MathEnvironment from nemo_rl.evals.eval import MasterConfig, run_env_eval, setup from nemo_rl.models.generation import configure_generation_config +from nemo_rl.utils.config import load_config TokenizerType = PreTrainedTokenizerBase @@ -63,129 +53,11 @@ def parse_args(): return args, overrides -def _construct_prompt(prompt: str, question: str, options: dict[str, str]) -> str: - """Construct prompt from question and options.""" - output = prompt - output += f"\n\nQuestion: {question}\nOptions:\n" - output += "\n".join( - [ - f"{letter}) {option}" - for letter, option in options.items() - if option 
is not None - ] - ) - return output - - -def multichoice_qa_processor( - datum_dict: dict[str, Any], - task_data_spec: TaskDataSpec, - tokenizer: TokenizerType, - max_seq_length: int, - idx: int, -) -> DatumSpec: - """Process a datum dictionary (directly loaded from dataset) into a DatumSpec for multiple-choice problems.""" - question = datum_dict["question"] - answer = str(datum_dict["answer"]) - options = datum_dict["options"] - extra_env_info = {"ground_truth": answer} - if "subject" in datum_dict: - extra_env_info.update({"subject": datum_dict["subject"]}) - - message_log = [] - - # system prompt - if task_data_spec.system_prompt: - sys_prompt: dict[str, str | torch.Tensor] = { - "role": "system", - "content": task_data_spec.system_prompt, - } - sys = tokenizer.apply_chat_template( - [cast(dict[str, str], sys_prompt)], - tokenize=False, - add_generation_prompt=False, - add_special_tokens=False, - ) - sys_prompt["token_ids"] = tokenizer(sys, return_tensors="pt")["input_ids"][0] - message_log.append(sys_prompt) - - # user prompt - if task_data_spec.prompt: - problem = _construct_prompt(task_data_spec.prompt, question, options) - user_message = {"role": "user", "content": problem} - message = tokenizer.apply_chat_template( - [user_message], - tokenize=False, - add_generation_prompt=True, - add_special_tokens=False, - ) - user_message["token_ids"] = tokenizer(message, return_tensors="pt")["input_ids"][0] - user_message["content"] = message - message_log.append(user_message) - - length = sum(len(m["token_ids"]) for m in message_log) - output: DatumSpec = { - "message_log": message_log, - "length": length, - "extra_env_info": extra_env_info, - "loss_multiplier": 1.0, - "idx": idx, - } - if "task_name" in datum_dict: - output["task_name"] = datum_dict["task_name"] - return output - - def setup_data(tokenizer: AutoTokenizer, data_config, env_configs): print("Setting up data...") # load dataset - dataset_name = data_config["dataset_name"] - data_processor_fn = 
multichoice_qa_processor - if dataset_name == "mmlu": - base_dataset = mmlu.MMLUDataset( - prompt_file=data_config["prompt_file"], - system_prompt_file=data_config["system_prompt_file"], - ) - elif dataset_name == "aime2024": - base_dataset = aime2024.AIME2024Dataset( - prompt_file=data_config["prompt_file"], - system_prompt_file=data_config["system_prompt_file"], - ) - data_processor_fn = math_data_processor - elif dataset_name == "gpqa": - base_dataset = gpqa.GPQADataset( - variant="main", - prompt_file=data_config["prompt_file"], - system_prompt_file=data_config["system_prompt_file"], - ) - elif dataset_name == "gpqa_diamond": - base_dataset = gpqa.GPQADataset( - variant="diamond", - prompt_file=data_config["prompt_file"], - system_prompt_file=data_config["system_prompt_file"], - ) - elif dataset_name == "mmlu_pro": - base_dataset = mmlu_pro.MMLUProDataset( - prompt_file=data_config["prompt_file"], - system_prompt_file=data_config["system_prompt_file"], - ) - elif dataset_name == "math": - base_dataset = math.MathDataset( - variant="math_test", - prompt_file=data_config["prompt_file"], - system_prompt_file=data_config["system_prompt_file"], - ) - data_processor_fn = math_data_processor - elif dataset_name == "math500": - base_dataset = math.MathDataset( - variant="math_500_test", - prompt_file=data_config["prompt_file"], - system_prompt_file=data_config["system_prompt_file"], - ) - data_processor_fn = math_data_processor - else: - raise ValueError(f"Unknown dataset {dataset_name}.") + base_dataset = load_eval_dataset(data_config) rekeyed_ds = base_dataset.rekeyed_ds env = MathEnvironment.options( @@ -200,7 +72,7 @@ def setup_data(tokenizer: AutoTokenizer, data_config, env_configs): dataset=rekeyed_ds, tokenizer=tokenizer, default_task_data_spec=base_dataset.task_spec, - task_data_processors=data_processor_fn, + task_data_processors=base_dataset.processor, max_seq_length=data_config["max_input_seq_length"], ) @@ -217,7 +89,7 @@ def main(): 
os.path.dirname(__file__), "configs", "evals", "eval.yaml" ) - config = OmegaConf.load(args.config) + config = load_config(args.config) print(f"Loaded configuration from: {args.config}") if overrides: diff --git a/examples/run_grpo_math.py b/examples/run_grpo_math.py index 4a64d3c13b..673322eb61 100644 --- a/examples/run_grpo_math.py +++ b/examples/run_grpo_math.py @@ -16,9 +16,8 @@ import os import pprint from collections import defaultdict -from typing import Any, Optional, cast +from typing import Any, Optional -import torch from omegaconf import OmegaConf from transformers import PreTrainedTokenizerBase @@ -116,74 +115,6 @@ def hf_data_processor( return output -# Example of a generic math data processor -# TaskDataProcessFnCallable -def math_data_processor( - datum_dict: dict[str, Any], - task_data_spec: TaskDataSpec, - tokenizer: TokenizerType, - max_seq_length: int, - idx: int, -) -> DatumSpec: - """Process a datum dictionary (directly loaded from dataset) into a DatumSpec for the Math Environment.""" - problem = datum_dict["problem"] - solution = str(datum_dict["expected_answer"]) - extra_env_info = {"ground_truth": solution} - - message_log: LLMMessageLogType = [] - - # system prompt - if task_data_spec.system_prompt: - sys_prompt: dict[str, str | torch.Tensor] = { - "role": "system", - "content": task_data_spec.system_prompt, - } - sys = tokenizer.apply_chat_template( - [cast(dict[str, str], sys_prompt)], - tokenize=False, - add_generation_prompt=False, - add_special_tokens=False, - ) - sys_prompt["token_ids"] = tokenizer(sys, return_tensors="pt")["input_ids"][0] - message_log.append(sys_prompt) - - # user prompt - if task_data_spec.prompt: - problem = task_data_spec.prompt.format(problem) - user_message = {"role": "user", "content": problem} - message = tokenizer.apply_chat_template( - [user_message], - tokenize=False, - add_generation_prompt=True, - add_special_tokens=False, - ) - user_message["token_ids"] = tokenizer(message, 
return_tensors="pt")["input_ids"][0] - user_message["content"] = message - message_log.append(user_message) - - length = sum(len(m["token_ids"]) for m in message_log) - - loss_multiplier = 1.0 - if length > max_seq_length: - # make smaller and mask out - for indiv_message in message_log: - indiv_message["token_ids"] = indiv_message["token_ids"][ - : min(4, max_seq_length // len(message_log)) - ] - loss_multiplier = 0.0 - - output: DatumSpec = { - "message_log": message_log, - "length": length, - "extra_env_info": extra_env_info, - "loss_multiplier": loss_multiplier, - "idx": idx, - } - if "task_name" in datum_dict: - output["task_name"] = datum_dict["task_name"] - return output - - def setup_data( tokenizer: TokenizerType, data_config: DataConfig, diff --git a/nemo_rl/data/eval_datasets/__init__.py b/nemo_rl/data/eval_datasets/__init__.py index e69de29bb2..2e5ba97974 100644 --- a/nemo_rl/data/eval_datasets/__init__.py +++ b/nemo_rl/data/eval_datasets/__init__.py @@ -0,0 +1,88 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from nemo_rl.data.eval_datasets.aime2024 import AIME2024Dataset +from nemo_rl.data.eval_datasets.gpqa import GPQADataset +from nemo_rl.data.eval_datasets.local_math_dataset import LocalMathDataset +from nemo_rl.data.eval_datasets.math import MathDataset +from nemo_rl.data.eval_datasets.mmlu import MMLUDataset +from nemo_rl.data.eval_datasets.mmlu_pro import MMLUProDataset + + +def load_eval_dataset(data_config): + """Loads evaluation dataset.""" + dataset_name = data_config["dataset_name"] + if dataset_name == "mmlu": + base_dataset = MMLUDataset( + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + elif dataset_name == "aime2024": + base_dataset = AIME2024Dataset( + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + elif dataset_name == "gpqa": + base_dataset = GPQADataset( + variant="main", + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + elif dataset_name == "gpqa_diamond": + base_dataset = GPQADataset( + variant="diamond", + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + elif dataset_name == "mmlu_pro": + base_dataset = MMLUProDataset( + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + elif dataset_name == "math": + base_dataset = MathDataset( + variant="math_test", + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + elif dataset_name == "math500": + base_dataset = MathDataset( + variant="math_500_test", + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + elif dataset_name == "local": + base_dataset = LocalMathDataset( + name=dataset_name, + data_paths=data_config["data_paths"], + problem_key=data_config["problem_key"], + solution_key=data_config["solution_key"], + 
file_format=data_config["file_format"], + split=data_config["split"], + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + else: + raise ValueError(f"Unknown dataset {dataset_name}.") + return base_dataset + + +__all__ = [ + "AIME2024Dataset", + "GPQADataset", + "LocalMathDataset", + "MathDataset", + "MMLUDataset", + "MMLUProDataset", +] diff --git a/nemo_rl/data/eval_datasets/aime2024.py b/nemo_rl/data/eval_datasets/aime2024.py index 1eff661718..b73bd34dbf 100644 --- a/nemo_rl/data/eval_datasets/aime2024.py +++ b/nemo_rl/data/eval_datasets/aime2024.py @@ -1,17 +1,19 @@ """AIME 2024 dataset.""" -from typing import Any, Literal, Optional +from typing import Any, Optional from datasets import load_dataset +from nemo_rl.data import processors from nemo_rl.data.interfaces import TaskDataSpec class AIME2024Dataset: - def __init__(self, - prompt_file: Optional[str]=None, - system_prompt_file: Optional[str]=None, - ): + def __init__( + self, + prompt_file: Optional[str] = None, + system_prompt_file: Optional[str] = None, + ): ds = load_dataset("HuggingFaceH4/aime_2024", split="train") self.rekeyed_ds = ds.map(self._rekey, remove_columns=ds.column_names) self.task_spec = TaskDataSpec( @@ -19,9 +21,10 @@ def __init__(self, prompt_file=prompt_file, system_prompt_file=system_prompt_file, ) + self.processor = processors.math_data_processor def _rekey(self, data: dict[str, Any]): return { - 'problem': data['problem'], - 'expected_answer': data['answer'], + "problem": data["problem"], + "expected_answer": data["answer"], } diff --git a/nemo_rl/data/eval_datasets/gpqa.py b/nemo_rl/data/eval_datasets/gpqa.py index 0662a1a5e2..4eb05014c6 100644 --- a/nemo_rl/data/eval_datasets/gpqa.py +++ b/nemo_rl/data/eval_datasets/gpqa.py @@ -5,14 +5,16 @@ from datasets import load_dataset +from nemo_rl.data import processors from nemo_rl.data.interfaces import TaskDataSpec class GPQADataset: - def __init__(self, + def __init__( + self, variant: 
Literal["diamond", "main"] = "diamond", prompt_file: Optional[str] = None, - system_prompt_file: Optional[str]=None, + system_prompt_file: Optional[str] = None, ): ds = load_dataset("Idavidrein/gpqa", f"gpqa_{variant}", split="train") self._rng = random.Random() @@ -22,6 +24,7 @@ def __init__(self, prompt_file=prompt_file, system_prompt_file=system_prompt_file, ) + self.processor = processors.multichoice_qa_processor def _rekey(self, data: dict[str, Any]): choices = [ diff --git a/nemo_rl/data/eval_datasets/local_math_dataset.py b/nemo_rl/data/eval_datasets/local_math_dataset.py new file mode 100644 index 0000000000..d78b99565f --- /dev/null +++ b/nemo_rl/data/eval_datasets/local_math_dataset.py @@ -0,0 +1,40 @@ +"""Local math dataset.""" + +from typing import Any, Literal, Optional + +from datasets import load_dataset + +from nemo_rl.data import processors +from nemo_rl.data.interfaces import TaskDataSpec + + +class LocalMathDataset: + def __init__( + self, + data_paths: str | list[str], + problem_key: str, + solution_key: str, + name: str, + split: Optional[str] = None, + file_format: Literal["csv", "json"] = "csv", + prompt_file: Optional[str] = None, + system_prompt_file: Optional[str] = None, + ): + ds = load_dataset(file_format, data_files=data_paths) + if split is not None: + ds = ds[split] + self._problem_key = problem_key + self._solution_key = solution_key + self.rekeyed_ds = ds.map(self._rekey, remove_columns=ds.column_names) + self.task_spec = TaskDataSpec( + task_name=name, + prompt_file=prompt_file, + system_prompt_file=system_prompt_file, + ) + self.processor = processors.math_data_processor + + def _rekey(self, data: dict[str, Any]): + return { + "problem": data[self._problem_key], + "expected_answer": data[self._solution_key], + } diff --git a/nemo_rl/data/eval_datasets/math.py b/nemo_rl/data/eval_datasets/math.py index cbd7cb3577..a1c489a148 100644 --- a/nemo_rl/data/eval_datasets/math.py +++ b/nemo_rl/data/eval_datasets/math.py @@ -4,26 +4,32 @@ 
from datasets import load_dataset +from nemo_rl.data import processors from nemo_rl.data.interfaces import TaskDataSpec class MathDataset: - def __init__(self, - variant: Literal["math_test", "math_500_test"] = "math_test", - prompt_file: Optional[str]=None, - system_prompt_file: Optional[str]=None, - ): - ds = load_dataset('csv', data_files=f"https://openaipublic.blob.core.windows.net/simple-evals/{variant}.csv", split='train') + def __init__( + self, + variant: Literal["math_test", "math_500_test"] = "math_test", + prompt_file: Optional[str] = None, + system_prompt_file: Optional[str] = None, + ): + ds = load_dataset( + "csv", + data_files=f"https://openaipublic.blob.core.windows.net/simple-evals/{variant}.csv", + split="train", + ) self.rekeyed_ds = ds.map(self._rekey, remove_columns=ds.column_names) self.task_spec = TaskDataSpec( - task_name=f'{variant}', + task_name=f"{variant}", prompt_file=prompt_file, system_prompt_file=system_prompt_file, ) - + self.processor = processors.math_data_processor + def _rekey(self, data: dict[str, Any]): return { - 'problem': data['Question'], - 'expected_answer': data['Answer'], + "problem": data["Question"], + "expected_answer": data["Answer"], } - diff --git a/nemo_rl/data/eval_datasets/mmlu.py b/nemo_rl/data/eval_datasets/mmlu.py index f6ab075886..86acbcc9a6 100644 --- a/nemo_rl/data/eval_datasets/mmlu.py +++ b/nemo_rl/data/eval_datasets/mmlu.py @@ -4,30 +4,39 @@ from datasets import load_dataset +from nemo_rl.data import processors from nemo_rl.data.interfaces import TaskDataSpec class MMLUDataset: - def __init__(self, prompt_file: Optional[str] = None, system_prompt_file: Optional[str] = None): - ds = load_dataset('csv', data_files="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", split='train') + def __init__( + self, + prompt_file: Optional[str] = None, + system_prompt_file: Optional[str] = None, + ): + ds = load_dataset( + "csv", + 
data_files="https://openaipublic.blob.core.windows.net/simple-evals/mmlu.csv", + split="train", + ) self.rekeyed_ds = ds.map(self._rekey, remove_columns=ds.column_names) self.task_spec = TaskDataSpec( - task_name='MMLU', + task_name="MMLU", prompt_file=prompt_file, system_prompt_file=system_prompt_file, ) + self.processor = processors.multichoice_qa_processor def _rekey(self, data: dict[str, Any]): return { - 'question': data['Question'], - 'options': dict( - A=data['A'], - B=data['B'], - C=data['C'], - D=data['D'], + "question": data["Question"], + "options": dict( + A=data["A"], + B=data["B"], + C=data["C"], + D=data["D"], ), - 'answer': data['Answer'], - 'subject': data['Subject'], + "answer": data["Answer"], + "subject": data["Subject"], } - diff --git a/nemo_rl/data/eval_datasets/mmlu_pro.py b/nemo_rl/data/eval_datasets/mmlu_pro.py index da990a90c5..4dd094e322 100644 --- a/nemo_rl/data/eval_datasets/mmlu_pro.py +++ b/nemo_rl/data/eval_datasets/mmlu_pro.py @@ -4,28 +4,27 @@ from datasets import load_dataset +from nemo_rl.data import processors from nemo_rl.data.interfaces import TaskDataSpec class MMLUProDataset: def __init__(self, prompt_file: str, system_prompt_file: Optional[str] = None): - ds = load_dataset('TIGER-Lab/MMLU-Pro', split='test') + ds = load_dataset("TIGER-Lab/MMLU-Pro", split="test") self.rekeyed_ds = ds.map(self._rekey, remove_columns=ds.column_names) self.task_spec = TaskDataSpec( - task_name='MMLU-Pro', + task_name="MMLU-Pro", prompt_file=prompt_file, system_prompt_file=system_prompt_file, ) + self.processor = processors.multichoice_qa_processor def _rekey(self, data: dict[str, Any]): - options = { - chr(ord('A') + i) : op for i, op in enumerate(data['options']) - } + options = {chr(ord("A") + i): op for i, op in enumerate(data["options"])} return { - 'question': data['question'], - 'options': options, - 'answer': data['answer'], - 'subject': data['category'], + "question": data["question"], + "options": options, + "answer": data["answer"], 
+ "subject": data["category"], } - diff --git a/nemo_rl/data/processors.py b/nemo_rl/data/processors.py new file mode 100644 index 0000000000..5fd35d4078 --- /dev/null +++ b/nemo_rl/data/processors.py @@ -0,0 +1,155 @@ +"""Contains data processors for evaluation.""" + +from typing import Any, cast + +import torch +from transformers import PreTrainedTokenizerBase + +from nemo_rl.data.interfaces import DatumSpec, LLMMessageLogType, TaskDataSpec + +TokenizerType = PreTrainedTokenizerBase + + +# Example of a generic math data processor +# TaskDataProcessFnCallable +def math_data_processor( + datum_dict: dict[str, Any], + task_data_spec: TaskDataSpec, + tokenizer: TokenizerType, + max_seq_length: int, + idx: int, +) -> DatumSpec: + """Process a datum dictionary (directly loaded from dataset) into a DatumSpec for the Math Environment.""" + problem = datum_dict["problem"] + solution = str(datum_dict["expected_answer"]) + extra_env_info = {"ground_truth": solution} + + message_log: LLMMessageLogType = [] + + # system prompt + if task_data_spec.system_prompt: + sys_prompt: dict[str, str | torch.Tensor] = { + "role": "system", + "content": task_data_spec.system_prompt, + } + sys = tokenizer.apply_chat_template( + [cast(dict[str, str], sys_prompt)], + tokenize=False, + add_generation_prompt=False, + add_special_tokens=False, + ) + sys_prompt["token_ids"] = tokenizer(sys, return_tensors="pt")["input_ids"][0] + message_log.append(sys_prompt) + + # user prompt + if task_data_spec.prompt: + problem = task_data_spec.prompt.format(problem) + user_message = {"role": "user", "content": problem} + message = tokenizer.apply_chat_template( + [user_message], + tokenize=False, + add_generation_prompt=True, + add_special_tokens=False, + ) + user_message["token_ids"] = tokenizer(message, return_tensors="pt")["input_ids"][0] + user_message["content"] = message + message_log.append(user_message) + + length = sum(len(m["token_ids"]) for m in message_log) + + loss_multiplier = 1.0 + if length > 
max_seq_length: + # make smaller and mask out + for indiv_message in message_log: + indiv_message["token_ids"] = indiv_message["token_ids"][ + : min(4, max_seq_length // len(message_log)) + ] + loss_multiplier = 0.0 + + output: DatumSpec = { + "message_log": message_log, + "length": length, + "extra_env_info": extra_env_info, + "loss_multiplier": loss_multiplier, + "idx": idx, + } + if "task_name" in datum_dict: + output["task_name"] = datum_dict["task_name"] + return output + + +def _construct_multichoice_prompt( + prompt: str, question: str, options: dict[str, str] +) -> str: + """Construct prompt from question and options.""" + output = prompt + output += f"\n\nQuestion: {question}\nOptions:\n" + output += "\n".join( + [ + f"{letter}) {option}" + for letter, option in options.items() + if option is not None + ] + ) + return output + + +def multichoice_qa_processor( + datum_dict: dict[str, Any], + task_data_spec: TaskDataSpec, + tokenizer: TokenizerType, + max_seq_length: int, + idx: int, +) -> DatumSpec: + """Process a datum dictionary (directly loaded from dataset) into a DatumSpec for multiple-choice problems.""" + question = datum_dict["question"] + answer = str(datum_dict["answer"]) + options = datum_dict["options"] + extra_env_info = {"ground_truth": answer} + if "subject" in datum_dict: + extra_env_info.update({"subject": datum_dict["subject"]}) + + message_log = [] + + # system prompt + if task_data_spec.system_prompt: + sys_prompt: dict[str, str | torch.Tensor] = { + "role": "system", + "content": task_data_spec.system_prompt, + } + sys = tokenizer.apply_chat_template( + [cast(dict[str, str], sys_prompt)], + tokenize=False, + add_generation_prompt=False, + add_special_tokens=False, + ) + sys_prompt["token_ids"] = tokenizer(sys, return_tensors="pt")["input_ids"][0] + message_log.append(sys_prompt) + + # user prompt + if task_data_spec.prompt: + question = _construct_multichoice_prompt( + task_data_spec.prompt, question, options + ) + user_message = 
{"role": "user", "content": question} + message = tokenizer.apply_chat_template( + [user_message], + tokenize=False, + add_generation_prompt=True, + add_special_tokens=False, + ) + user_message["token_ids"] = tokenizer(message, return_tensors="pt")["input_ids"][0] + user_message["content"] = message + message_log.append(user_message) + + length = sum(len(m["token_ids"]) for m in message_log) + output: DatumSpec = { + "message_log": message_log, + "length": length, + "extra_env_info": extra_env_info, + "loss_multiplier": 1.0, + "idx": idx, + } + if "task_name" in datum_dict: + output["task_name"] = datum_dict["task_name"] + return output From 628ef2d98e7e1f34bd28d24b03df2357373cbb7c Mon Sep 17 00:00:00 2001 From: Xuehan Date: Fri, 27 Jun 2025 21:33:08 +0000 Subject: [PATCH 14/44] updates doc. Signed-off-by: Xuehan Xiong Signed-off-by: Xuehan --- docs/guides/grpo.md | 2 +- tests/unit/data/test_data_processor.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/guides/grpo.md b/docs/guides/grpo.md index 1f63df559d..f577820a21 100644 --- a/docs/guides/grpo.md +++ b/docs/guides/grpo.md @@ -67,7 +67,7 @@ def my_data_processor( ) -> DatumSpec: ``` -We have an example of this as `math_data_processor` in [run_grpo_math.py](../../examples/run_grpo_math.py) +We have an example of this as `math_data_processor` in [processors.py](../../nemo_rl/data/processors.py) #### Putting it all together diff --git a/tests/unit/data/test_data_processor.py b/tests/unit/data/test_data_processor.py index 302dfece77..dc88bebee3 100644 --- a/tests/unit/data/test_data_processor.py +++ b/tests/unit/data/test_data_processor.py @@ -20,10 +20,10 @@ abspath = os.path.abspath(__file__) sys.path.append("/".join(abspath.split("/")[:-4])) -from examples.run_grpo_math import math_data_processor from nemo_rl.algorithms.utils import get_tokenizer from nemo_rl.data.datasets import AllTaskProcessedDataset from nemo_rl.data.interfaces import TaskDataSpec +from nemo_rl.data.processors 
import math_data_processor from nemo_rl.models.policy import TokenizerConfig basic_tokenizer_test_config: TokenizerConfig = { From f431d48f56b3b49ef402fb3b368e41e598b71bcd Mon Sep 17 00:00:00 2001 From: Luis Vega Date: Thu, 26 Jun 2025 15:49:08 -0700 Subject: [PATCH 15/44] feat: vllm Model diagnostic test checking long generation quality (#516) Signed-off-by: Luis Vega Signed-off-by: Terry Kong Signed-off-by: Luis Vega <2478335+vegaluisjose@users.noreply.github.com> Co-authored-by: Terry Kong Co-authored-by: Luis Vega <2478335+vegaluisjose@users.noreply.github.com> Signed-off-by: Xuehan --- docs/adding-new-models.md | 13 ++- .../2.long_generation_decode_vs_prefill.py | 102 ++++++++++++++++++ 2 files changed, 114 insertions(+), 1 deletion(-) create mode 100644 tools/model_diagnostics/2.long_generation_decode_vs_prefill.py diff --git a/docs/adding-new-models.md b/docs/adding-new-models.md index c73d494907..155a012f47 100644 --- a/docs/adding-new-models.md +++ b/docs/adding-new-models.md @@ -140,4 +140,15 @@ uv run --extra vllm tools/model_diagnostics/1.max_model_len_respected.py Qwen/Qw # Generated tokens: 12 # Total tokens: 20 # [Qwen/Qwen2.5-1.5B] ALL GOOD! -``` \ No newline at end of file +``` + +## [2.long_generation_decode_vs_prefill](https://github.com/NVIDIA/NeMo-RL/blob/main/tools/model_diagnostics/2.long_generation_decode_vs_prefill.py) + +Test that vLLM yields near-identical token log-probabilities when comparing decoding with a single prefill pass across multiple prompts. + +```sh +# Run that is expected to pass +uv run --extra vllm tools/model_diagnostics/2.long_generation_decode_vs_prefill.py Qwen/Qwen2.5-1.5B +# ... +# [Qwen/Qwen2.5-1.5B] ALL GOOD! 
+``` diff --git a/tools/model_diagnostics/2.long_generation_decode_vs_prefill.py b/tools/model_diagnostics/2.long_generation_decode_vs_prefill.py new file mode 100644 index 0000000000..69c153fd53 --- /dev/null +++ b/tools/model_diagnostics/2.long_generation_decode_vs_prefill.py @@ -0,0 +1,102 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse + +import torch +from vllm import LLM, SamplingParams + + +def extract_logprobs(logprobs): + output = [] + for lp in logprobs: + if lp is not None: + output.append(list(lp.values())[0].logprob) + return output + + +def calculate_error(a, b): + return torch.exp(torch.abs(a - b)).mean().item() + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--model", type=str, nargs="?", default="nvidia/Nemotron-H-8B-Base-8K" + ) + args = parser.parse_args() + + seed = 0 + + sampling_params = SamplingParams( + temperature=1.0, + top_p=1.0, + max_tokens=8192, + prompt_logprobs=0, + logprobs=0, + seed=seed, + ) + + # Examples as of 0.9.1 + # model="meta-llama/Meta-Llama-3-8B", # pass + # model="nvidia/Nemotron-H-8B-Base-8K", # fail + # model="ibm-ai-platform/Bamba-9B-v1", # pass + llm = LLM( + model=args.model, + enforce_eager=True, + trust_remote_code=True, + enable_prefix_caching=False, + enable_chunked_prefill=False, + tensor_parallel_size=2, + gpu_memory_utilization=0.8, + seed=seed, + ) + + num_batches = 2 + + prompts = [ + "Hello, my 
name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", + ] + + outputs = llm.generate(prompts * num_batches, sampling_params) + + for i, output in enumerate(outputs): + sequence = output.prompt_token_ids + list(output.outputs[0].token_ids) + prompt_logprobs = extract_logprobs(output.prompt_logprobs) + logprobs = extract_logprobs(output.outputs[0].logprobs) + decode_lp = prompt_logprobs + logprobs + decode_lp = torch.tensor(decode_lp) + + sampling_params = SamplingParams( + temperature=0.0, max_tokens=1, prompt_logprobs=0 + ) + score = llm.generate({"prompt_token_ids": sequence}, sampling_params) + + prefill_lp = extract_logprobs(score[0].prompt_logprobs) + prefill_lp = torch.tensor(prefill_lp) + + lp_error = calculate_error(decode_lp, prefill_lp) + max_abs_error = torch.abs(decode_lp - prefill_lp).max().item() + print( + f"Processed sequence length {len(sequence)} with lp error {lp_error} and max abs error {max_abs_error}" + ) + assert lp_error < 1.05, f"lp error is higher than expected (1.0636): {lp_error}" + + print(f"[{args.model}] ALL GOOD!") + + +if __name__ == "__main__": + main() From f6b948dc180b245a63a66f2d28f569ee96de4d3a Mon Sep 17 00:00:00 2001 From: Yi-Fu Wu Date: Thu, 26 Jun 2025 16:25:04 -0700 Subject: [PATCH 16/44] feat: Log code in wandb (#175) Signed-off-by: Yi-Fu Wu Signed-off-by: Parth Chadha Co-authored-by: Parth Chadha Signed-off-by: Xuehan --- nemo_rl/utils/logger.py | 111 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) diff --git a/nemo_rl/utils/logger.py b/nemo_rl/utils/logger.py index a15e9bebd7..b99ebcc858 100644 --- a/nemo_rl/utils/logger.py +++ b/nemo_rl/utils/logger.py @@ -18,6 +18,7 @@ import logging import os import re +import subprocess import threading import time from abc import ABC, abstractmethod @@ -138,10 +139,120 @@ class WandbLogger(LoggerInterface): def __init__(self, cfg: WandbConfig, log_dir: Optional[str] = None): self.run = wandb.init(**cfg, 
dir=log_dir) + self._log_code() + self._log_diffs() print( f"Initialized WandbLogger for project {cfg.get('project')}, run {cfg.get('name')} at {log_dir}" ) + def _log_diffs(self): + """Log git diffs to wandb. + + This function captures and logs two types of diffs: + 1. Uncommitted changes (working tree diff against HEAD) + 2. All changes (including uncommitted) against the main branch + + Each diff is saved as a text file in a wandb artifact. + """ + try: + branch_result = subprocess.run( + ["git", "rev-parse", "--abbrev-ref", "HEAD"], + capture_output=True, + text=True, + check=True, + ) + current_branch = branch_result.stdout.strip() + + diff_artifact = wandb.Artifact( + name=f"git-diffs-{self.run.project}-{self.run.id}", type="git-diffs" + ) + + # 1. Log uncommitted changes (working tree diff) + uncommitted_result = subprocess.run( + ["git", "diff", "HEAD"], capture_output=True, text=True, check=True + ) + uncommitted_diff = uncommitted_result.stdout + + if uncommitted_diff: + diff_path = os.path.join( + wandb.run.dir if wandb.run else ".", "uncommitted_changes_diff.txt" + ) + with open(diff_path, "w") as f: + f.write(uncommitted_diff) + + # Add file to artifact + diff_artifact.add_file(diff_path, name="uncommitted_changes_diff.txt") + print("Logged uncommitted changes diff to wandb") + else: + print("No uncommitted changes found") + + # 2. 
Log diff against main branch (if current branch is not main) + if current_branch != "main": + # Log diff between main and working tree (includes uncommitted changes) + working_diff_result = subprocess.run( + ["git", "diff", "main"], capture_output=True, text=True, check=True + ) + working_diff = working_diff_result.stdout + + if working_diff: + # Save diff to a temporary file + diff_path = os.path.join( + wandb.run.dir if wandb.run else ".", "main_diff.txt" + ) + with open(diff_path, "w") as f: + f.write(working_diff) + + # Add file to artifact + diff_artifact.add_file(diff_path, name="main_diff.txt") + print("Logged diff against main branch") + else: + print("No differences found between main and working tree") + + self.run.log_artifact(diff_artifact) + + except subprocess.CalledProcessError as e: + print(f"Error during git operations: {e}") + except Exception as e: + print(f"Unexpected error during git diff logging: {e}") + + def _log_code(self): + """Log code that is tracked by git to wandb. + + This function gets a list of all files tracked by git in the project root + and manually uploads them to the current wandb run as an artifact. + """ + try: + result = subprocess.run( + ["git", "ls-files"], capture_output=True, text=True, check=True + ) + + tracked_files = result.stdout.strip().split("\n") + + if not tracked_files: + print( + "Warning: No git repository found. Wandb logs will not track code changes for reproducibility." 
+ ) + return + + code_artifact = wandb.Artifact( + name=f"source-code-{self.run.project}", type="code" + ) + + for file_path in tracked_files: + if os.path.isfile(file_path): + try: + code_artifact.add_file(file_path, name=file_path) + except Exception as e: + print(f"Error adding file {file_path}: {e}") + + self.run.log_artifact(code_artifact) + print(f"Logged {len(tracked_files)} git-tracked files to wandb") + + except subprocess.CalledProcessError as e: + print(f"Error getting git-tracked files: {e}") + except Exception as e: + print(f"Unexpected error during git code logging: {e}") + def define_metric( self, name: str, From 4265fedd37e617a10e7f2e83ab9a218c2660c6ec Mon Sep 17 00:00:00 2001 From: Anna Shors Date: Thu, 26 Jun 2025 18:35:36 -0700 Subject: [PATCH 17/44] fix: add dynamic_batching key to SFT OpenMathInstruct config (#570) Signed-off-by: ashors1 Signed-off-by: Xuehan --- examples/configs/sft_openmathinstruct2.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/configs/sft_openmathinstruct2.yaml b/examples/configs/sft_openmathinstruct2.yaml index e934f7aa29..2040bdd5ff 100644 --- a/examples/configs/sft_openmathinstruct2.yaml +++ b/examples/configs/sft_openmathinstruct2.yaml @@ -37,6 +37,9 @@ policy: context_parallel_size: 1 custom_parallel_plan: null + dynamic_batching: + enabled: false + # makes the training sequence length divisible by the tensor parallel size # this is useful for sequence parallel training make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size} From 7c8367d141fef1c6e3e51c541d5591bca4347cab Mon Sep 17 00:00:00 2001 From: yuki <48991475+yuki-666@users.noreply.github.com> Date: Fri, 27 Jun 2025 11:24:51 +0800 Subject: [PATCH 18/44] feat: support async in non-colocated (#523) Signed-off-by: Yuki Huang Signed-off-by: Xuehan --- nemo_rl/models/generation/vllm.py | 103 +++++++++++++++++- .../models/generation/test_vllm_generation.py | 19 +++- 2 files changed, 111 insertions(+), 11 deletions(-) diff --git 
a/nemo_rl/models/generation/vllm.py b/nemo_rl/models/generation/vllm.py index 58795e5031..f0cd5eb50b 100644 --- a/nemo_rl/models/generation/vllm.py +++ b/nemo_rl/models/generation/vllm.py @@ -359,6 +359,19 @@ def init_collective(self, data: int, ip: str, port: int, world_size: int) -> Non ), ) + async def init_collective_async( + self, data: int, ip: str, port: int, world_size: int + ) -> None: + await self.llm.collective_rpc( + "init_collective", + args=( + data, + ip, + port, + world_size, + ), + ) + def llm(self): return self.llm @@ -979,7 +992,7 @@ def update_weights_from_collective(self, data: dict[str, Any]) -> bool: if self.cfg["vllm_cfg"]["async_engine"]: raise RuntimeError( - "update_weights_from_collective cannot be used with async_engine=True. Use update_weights_from_ipc_handles_async instead." + "update_weights_from_collective can only be used with async_engine=False. Use update_weights_from_collective_async instead." ) result_or_coro = self.llm.collective_rpc( @@ -1000,12 +1013,72 @@ def update_weights_from_collective(self, data: dict[str, Any]) -> bool: traceback.print_exc() return False + async def update_weights_from_collective_async(self, data: dict[str, Any]) -> bool: + """Async version of update_weights_from_collective.""" + try: + assert self.llm is not None, ( + "Attempting to update weights with either an uninitialized vLLM or non-model-owner" + ) + + if not self.cfg["vllm_cfg"]["async_engine"]: + raise RuntimeError( + "update_weights_from_collective_async can only be used with async_engine=True. Use update_weights_from_collective instead." + ) + + result_or_coro = await self.llm.collective_rpc( + "update_weights_from_collective", args=(data,) + ) + + if asyncio.iscoroutine(result_or_coro): + worker_results = await result_or_coro + else: + worker_results = result_or_coro + + worker_result = worker_results[0] + + if not worker_result: + print( + f"Error: Worker failed to update weights. 
Result: {worker_result}" + ) + return False + return True + except Exception as e: + print(f"Exception during collective_rpc for weight update: {e}") + import traceback + + traceback.print_exc() + return False + def reset_prefix_cache(self): """Reset the prefix cache of vLLM engine.""" + assert self.llm is not None, ( + "Attempting to reset prefix cache with either an uninitialized vLLM or non-model-owner" + ) + + if self.cfg["vllm_cfg"]["async_engine"]: + raise RuntimeError( + "reset_prefix_cache can only be used with async_engine=False. Use reset_prefix_cache_async instead." + ) + self.llm.llm_engine.reset_prefix_cache() gc.collect() torch.cuda.empty_cache() + async def reset_prefix_cache_async(self): + """Async version of reset_prefix_cache.""" + assert self.llm is not None, ( + "Attempting to reset prefix cache with either an uninitialized vLLM or non-model-owner" + ) + + if not self.cfg["vllm_cfg"]["async_engine"]: + raise RuntimeError( + "reset_prefix_cache_async can only be used with async_engine=True. Use reset_prefix_cache instead." 
+ ) + + await self.llm.reset_prefix_cache() + gc.collect() + torch.cuda.empty_cache() + def sleep(self): """Put the vLLM engine to sleep.""" assert self.llm is not None, ( @@ -1311,6 +1384,13 @@ def init_collective( if not self.worker_group or not self.worker_group.workers: raise RuntimeError("Worker group is not initialized") + # Choose the appropriate method based on async_engine setting + method_name = ( + "init_collective_async" + if self.cfg["vllm_cfg"]["async_engine"] + else "init_collective" + ) + # Prepare rank total_workers = len(self.worker_group.workers) if self.dp_size == 0: @@ -1322,7 +1402,7 @@ def init_collective( # Send world_size and rank for init collective to all workers futures = self.worker_group.run_all_workers_multiple_data( - "init_collective", + method_name, data=rank_prefix_list, run_rank_0_only_axes=["tensor_parallel", "pipeline_parallel"], common_kwargs={"ip": ip, "port": port, "world_size": world_size}, @@ -1563,12 +1643,16 @@ def finish_generation(self, *args: Any, **kwargs: Any) -> bool: try: # Choose the appropriate method based on setting # non-colocated only needs reset prefix cache, no need to sleep. 
- if not self.cfg["colocated"]["enabled"]: - method_name = "reset_prefix_cache" - else: + if self.cfg["colocated"]["enabled"]: method_name = ( "sleep_async" if self.cfg["vllm_cfg"]["async_engine"] else "sleep" ) + else: + method_name = ( + "reset_prefix_cache_async" + if self.cfg["vllm_cfg"]["async_engine"] + else "reset_prefix_cache" + ) # Use run_all_workers_single_data for methods that don't need data futures = self.worker_group.run_all_workers_single_data( method_name, @@ -1641,9 +1725,16 @@ def update_weights_from_collective( if not self.worker_group or not self.worker_group.workers: raise RuntimeError("Worker group is not initialized") + # Choose the appropriate method based on async_engine setting + method_name = ( + "update_weights_from_collective_async" + if self.cfg["vllm_cfg"]["async_engine"] + else "update_weights_from_collective" + ) + # Use run_all_workers_single_data to send data to all workers futures = self.worker_group.run_all_workers_single_data( - "update_weights_from_collective", + method_name, data=info, run_rank_0_only_axes=["tensor_parallel", "pipeline_parallel"], ) diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py index 266be74264..dc1de1b123 100644 --- a/tests/unit/models/generation/test_vllm_generation.py +++ b/tests/unit/models/generation/test_vllm_generation.py @@ -379,13 +379,13 @@ def test_vllm_policy_generation(policy, test_input_data, tokenizer): ) -async def _generate_async(vllm_policy, tokenizer, test_input_data): +async def _generate_async(vllm_policy, tokenizer, test_input_data, greedy=False): collected_indexed_outputs = [] # generate_async is restricted to handle only single samples input_generator = test_input_data.make_microbatch_iterator(microbatch_size=1) for single_item_input in input_generator: async for original_idx, single_item_output in vllm_policy.generate_async( - single_item_input + single_item_input, greedy=greedy ): 
collected_indexed_outputs.append((original_idx, single_item_output)) @@ -691,7 +691,7 @@ async def test_vllm_generation_with_hf_training( print("Using vLLM policy for fast generation...") if async_engine: generation_results = await _generate_async( - vllm_policy, tokenizer, test_input_data + vllm_policy, tokenizer, test_input_data, greedy=True ) else: generation_results = vllm_policy.generate(test_input_data, greedy=True) @@ -1174,11 +1174,14 @@ def test_vllm_non_divisible_batch_handling(policy): ) -def test_vllm_refit_non_collocated_handles_update( +@pytest.mark.asyncio +@pytest.mark.parametrize("async_engine", [True, False]) +async def test_vllm_refit_non_collocated_update_weights( policy_cluster_separate, generation_cluster_separate, tokenizer, test_input_data, + async_engine, ): if ( policy_cluster_separate.num_gpus_per_node < 1 @@ -1197,6 +1200,7 @@ def test_vllm_refit_non_collocated_handles_update( # Create VllmGeneration policy on its own cluster vllm_config = deepcopy(basic_vllm_test_config) vllm_config = configure_generation_config(vllm_config, tokenizer, is_eval=True) + vllm_config["vllm_cfg"]["async_engine"] = async_engine vllm_config["vllm_cfg"]["tensor_parallel_size"] = 1 vllm_config["colocated"]["enabled"] = False vllm_generation = VllmGeneration(generation_cluster_separate, vllm_config) @@ -1213,7 +1217,12 @@ def test_vllm_refit_non_collocated_handles_update( ) # test generate - outputs = vllm_generation.generate(test_input_data, greedy=True) + if async_engine: + outputs = await _generate_async( + vllm_generation, tokenizer, test_input_data, greedy=True + ) + else: + outputs = vllm_generation.generate(test_input_data, greedy=True) output_ids = outputs["output_ids"] generated_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True) assert generated_texts == [ From d0dca5b9004ebf211917f65db7ef9b463b5c6cd5 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 27 Jun 2025 10:09:17 -0700 Subject: [PATCH 19/44] fix: correct mcore dtype + 
assertion on activation_func (#572) Signed-off-by: Terry Kong Signed-off-by: Xuehan --- .../models/policy/megatron_policy_worker.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/nemo_rl/models/policy/megatron_policy_worker.py b/nemo_rl/models/policy/megatron_policy_worker.py index e0bd4373be..3b6ce13e30 100644 --- a/nemo_rl/models/policy/megatron_policy_worker.py +++ b/nemo_rl/models/policy/megatron_policy_worker.py @@ -459,15 +459,27 @@ def __init__( ) model_cfg.bf16 = self.dtype == torch.bfloat16 model_cfg.fp16 = self.dtype == torch.float16 - model_cfg.params_dtype = dtype_map[ - self.cfg["megatron_cfg"]["optimizer"]["params_dtype"] - ] # FP32 for amp + if model_cfg.fp16: + assert not model_cfg.bf16, "fp16 and bf16 cannot be used together" + model_cfg.params_dtype = torch.float16 + elif model_cfg.bf16: + assert not model_cfg.fp16, "fp16 and bf16 cannot be used together" + model_cfg.params_dtype = torch.bfloat16 + else: + model_cfg.params_dtype = torch.float32 model_cfg.pipeline_dtype = dtype_map[self.cfg["megatron_cfg"]["pipeline_dtype"]] model_cfg.parallel_output = True if self.cfg["megatron_cfg"]["activation_checkpointing"]: model_cfg.activations_checkpoint_granularity = "full" model_cfg.activations_checkpoint_method = "uniform" model_cfg.activations_checkpoint_num_layers = 1 + if not model_cfg.gated_linear_unit: + assert model_cfg.activation_func is not None, ( + "activation_func must be set if not using gated_linear_unit. This likely " + "indicates an issue in configuration conversion (e.g. activation func was " + "a lambda and couldn't be serialized). This is based on this check " + "https://github.com/NVIDIA/Megatron-LM/blob/1ab876ddc4c1893c76f26d775226a8d1dcdfb3d2/megatron/core/transformer/mlp.py#L174." 
+ ) checkpoint_config = CheckpointConfig( save_interval=100, From e257d881bd5f333f02928abd94f095531f842538 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 27 Jun 2025 10:27:08 -0700 Subject: [PATCH 20/44] fix: move core ray port from 6379 -> 54258 to reduce port collision freq (#574) Signed-off-by: Terry Kong Signed-off-by: Xuehan --- ray.sub | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray.sub b/ray.sub index 7544d3b7b7..8c0cba32b1 100644 --- a/ray.sub +++ b/ray.sub @@ -40,7 +40,7 @@ DASHBOARD_AGENT_GRPC_PORT=${DASHBOARD_AGENT_GRPC_PORT:-53007} METRICS_EXPORT_PORT=${METRICS_EXPORT_PORT:-53009} # Ports for the head node -PORT=${PORT:-6379} +PORT=${PORT:-54258} RAY_CLIENT_SERVER_PORT=${RAY_CLIENT_SERVER_PORT:-10001} #REDIT_SHARD_PORTS=${REDIT_SHARD_PORTS:-"random"} ?? DASHBOARD_GRPC_PORT=${DASHBOARD_GRPC_PORT:-52367} From c27ff44276c3b947d5674778ae66f174d96870d7 Mon Sep 17 00:00:00 2001 From: Anna Shors Date: Fri, 27 Jun 2025 13:52:45 -0700 Subject: [PATCH 21/44] fix: fix overlap param gather (#561) Signed-off-by: ashors1 Signed-off-by: Xuehan --- examples/configs/dpo.yaml | 2 +- examples/configs/grpo_math_1B_megatron.yaml | 2 +- ...po-llama3.1-8b-instruct-4n8g-megatron.yaml | 2 +- ...8b-instruct-4n8g-megatrontp2pp2-quick.yaml | 2 +- ...ft-llama3.1-8b-instruct-1n8g-megatron.yaml | 2 +- examples/configs/sft.yaml | 2 +- .../models/policy/megatron_policy_worker.py | 32 +++++++++++++------ 7 files changed, 29 insertions(+), 15 deletions(-) diff --git a/examples/configs/dpo.yaml b/examples/configs/dpo.yaml index ccddde43b0..db6fb7fa6d 100755 --- a/examples/configs/dpo.yaml +++ b/examples/configs/dpo.yaml @@ -134,7 +134,7 @@ policy: distributed_data_parallel_config: grad_reduce_in_fp32: false overlap_grad_reduce: true - overlap_param_gather: false + overlap_param_gather: true average_in_collective: true data_parallel_sharding_strategy: "optim_grads_params" diff --git a/examples/configs/grpo_math_1B_megatron.yaml 
b/examples/configs/grpo_math_1B_megatron.yaml index 5b14a7ff56..6b07317ed6 100644 --- a/examples/configs/grpo_math_1B_megatron.yaml +++ b/examples/configs/grpo_math_1B_megatron.yaml @@ -115,7 +115,7 @@ policy: distributed_data_parallel_config: grad_reduce_in_fp32: false overlap_grad_reduce: true - overlap_param_gather: false + overlap_param_gather: true average_in_collective: true use_custom_fsdp: false data_parallel_sharding_strategy: "optim_grads_params" diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.yaml index 03bd0d7077..1fd336d0b4 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatron.yaml @@ -91,7 +91,7 @@ policy: distributed_data_parallel_config: grad_reduce_in_fp32: false overlap_grad_reduce: true - overlap_param_gather: false + overlap_param_gather: true average_in_collective: true data_parallel_sharding_strategy: "optim_grads_params" diff --git a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml index 74c93bbae0..73008f3154 100644 --- a/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml +++ b/examples/configs/recipes/llm/dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml @@ -91,7 +91,7 @@ policy: distributed_data_parallel_config: grad_reduce_in_fp32: false overlap_grad_reduce: true - overlap_param_gather: false + overlap_param_gather: true average_in_collective: true data_parallel_sharding_strategy: "optim_grads_params" diff --git a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-megatron.yaml b/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-megatron.yaml index f6ab46c997..ddd53920e6 100644 --- 
a/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-megatron.yaml +++ b/examples/configs/recipes/llm/sft-llama3.1-8b-instruct-1n8g-megatron.yaml @@ -79,7 +79,7 @@ policy: distributed_data_parallel_config: grad_reduce_in_fp32: false overlap_grad_reduce: true - overlap_param_gather: false + overlap_param_gather: true average_in_collective: true data_parallel_sharding_strategy: "optim_grads_params" diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml index 5be4451d3b..e3c614e2a7 100644 --- a/examples/configs/sft.yaml +++ b/examples/configs/sft.yaml @@ -109,7 +109,7 @@ policy: distributed_data_parallel_config: grad_reduce_in_fp32: false overlap_grad_reduce: true - overlap_param_gather: false + overlap_param_gather: true average_in_collective: true data_parallel_sharding_strategy: "optim_grads_params" diff --git a/nemo_rl/models/policy/megatron_policy_worker.py b/nemo_rl/models/policy/megatron_policy_worker.py index 3b6ce13e30..89eb263674 100644 --- a/nemo_rl/models/policy/megatron_policy_worker.py +++ b/nemo_rl/models/policy/megatron_policy_worker.py @@ -421,15 +421,6 @@ def __init__( pretrained_path, "iter_0000000/run_config.yaml" ) - assert not ( - self.cfg["megatron_cfg"]["distributed_data_parallel_config"][ - "overlap_param_gather" - ] - and self.cfg["megatron_cfg"]["optimizer"]["use_distributed_optimizer"] - ), ( - "Using overlap param gather together with distributed optimizer has known convergence issues. Please disable overlap param gather." 
- ) - self.tokenizer = tokenizer if self.tokenizer.pad_token is None: self.tokenizer.pad_token = self.tokenizer.eos_token @@ -645,6 +636,13 @@ def __init__( self._held_gather_buffer = None self.megatron_to_hf_converter = MegatronToHFConverter(hf_model_name, self.model) + self.should_disable_forward_pre_hook = ( + self.cfg["megatron_cfg"]["optimizer"]["use_distributed_optimizer"] + and self.cfg["megatron_cfg"]["distributed_data_parallel_config"][ + "overlap_param_gather" + ] + ) + def configure_worker(self, num_gpus: int, bundle_indices: Optional[tuple] = None): USE_EXPANDABLE_SEGMENTS = False # Disabling this right now as it seems to cause vLLM refit issues with Ampere if USE_EXPANDABLE_SEGMENTS: @@ -662,6 +660,14 @@ def get_gpu_info(self): """Return information about the GPU being used by this worker.""" return get_gpu_info(self.model) + def enable_forward_pre_hook(self): + assert isinstance(self.model, DistributedDataParallel) + self.model.enable_forward_pre_hook() + + def disable_forward_pre_hook(self, param_sync=True): + assert isinstance(self.model, DistributedDataParallel) + self.model.disable_forward_pre_hook(param_sync=param_sync) + def train( self, data: BatchedDataDict, @@ -1001,6 +1007,10 @@ def use_reference_model(self): On entry: Moves model to CPU, moves reference_model to CUDA. 
Swaps the references On exit: Restores original references and re-flips cuda/cpu """ + ## disable overlap param gather when swapping weights + if self.should_disable_forward_pre_hook: + self.disable_forward_pre_hook() + with torch.no_grad(): try: # Save original references @@ -1035,6 +1045,10 @@ def use_reference_model(self): gc.collect() torch.cuda.empty_cache() + ## re-enable overlap param gather after weight swap + if self.should_disable_forward_pre_hook: + self.enable_forward_pre_hook() + # Temporary fix, 'data' is a kwarg due to some sort of ray bug def get_reference_policy_logprobs( self, *, data: BatchedDataDict[Any], micro_batch_size: Optional[int] = None From 16ac698dcb2a6fc88f6f0f6497069a9c202c93fe Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 27 Jun 2025 14:15:25 -0700 Subject: [PATCH 22/44] docs: fix some typos on nsys/model-quirk pages (#560) Signed-off-by: Terry Kong Signed-off-by: Xuehan --- docs/model-quirks.md | 2 +- docs/nsys-profiling.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/model-quirks.md b/docs/model-quirks.md index f0f1c961f9..fa2b181c7e 100644 --- a/docs/model-quirks.md +++ b/docs/model-quirks.md @@ -32,7 +32,7 @@ NeMo-RL uses the vLLM V1 runtime for both synchronous and asynchronous inference ### Context Parallel with FSDP2 NeMo-RL implemented this feature based on torch CP [implementation](https://github.com/pytorch/pytorch/blob/main/torch/distributed/tensor/experimental/_attention.py). And we inherit its limitations. -Whether model level support CP only depends on arguments passed to `torch.nn.functional.scaled_dot_product_attention`. Current NeMo-RL passed all ones attention mask to `model.forward`. For Gemma-3, it won't ignore attention mask as result `attn_base` is not None which is not supported by torch CP. Please see [assertion](https://github.com/pytorch/pytorch/blob/134179474539648ba7dee1317959529fbd0e7f89/torch/distributed/tensor/experimental/_attention.py#L262) . 
+Whether model level support CP only depends on arguments passed to `torch.nn.functional.scaled_dot_product_attention`. Current NeMo-RL passed all ones attention mask to `model.forward`. For Gemma-3, it won't ignore attention mask as result `attn_bias` is not None which is not supported by torch CP. Please see [assertion](https://github.com/pytorch/pytorch/blob/134179474539648ba7dee1317959529fbd0e7f89/torch/distributed/tensor/experimental/_attention.py#L262) . ## vLLM Async Rollout Timeout diff --git a/docs/nsys-profiling.md b/docs/nsys-profiling.md index 1ee914c842..3c5ccd0c3a 100644 --- a/docs/nsys-profiling.md +++ b/docs/nsys-profiling.md @@ -69,7 +69,7 @@ To profile a Megatron worker, you should set `LD_LIBRARY_PATH` as follows, other ```bash LD_LIBRARY_PATH="/usr/local/cuda/targets/x86_64-linux/lib:/usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/nvidia/lib64:/usr/local/nvidia/lib:/usr/lib/x86_64-linux-gnu" \ -NRL_NSYS_PROFILE_STEP_RANGE=2:3 NRL_NSYS_WORKER_PATTERNS="dtensor_policy_worker,vllm_generation_worker" uv run --config examples/configs/grpo_math_1B_megatron.yaml examples/run_grpo_math.py grpo.max_num_steps=5 +NRL_NSYS_PROFILE_STEP_RANGE=2:3 NRL_NSYS_WORKER_PATTERNS="megatron_policy_worker,vllm_generation_worker" uv run examples/run_grpo_math.py --config examples/configs/grpo_math_1B_megatron.yaml grpo.max_num_steps=5 ``` ## Profile Output From 9b79e1e791ba5af3c78c7f80d3e060b5657ba8d8 Mon Sep 17 00:00:00 2001 From: Anna Shors Date: Fri, 27 Jun 2025 16:00:38 -0700 Subject: [PATCH 23/44] feat: Add megatron to hf converter (#555) Signed-off-by: Anna Shors Signed-off-by: ashors1 Signed-off-by: Xuehan --- 3rdparty/NeMo-workspace/NeMo | 2 +- docs/design-docs/checkpointing.md | 4 +- docs/guides/eval.md | 4 +- docs/guides/grpo-deepscaler.md | 2 +- docs/guides/sft-openmathinstruct2.md | 2 +- .../{ => converters}/convert_dcp_to_hf.py | 0 examples/converters/convert_megatron_to_hf.py | 67 ++++ nemo_rl/models/megatron/community_import.py | 45 ++- 
tests/functional/test_converter_roundtrip.py | 369 ++++++++++++++++++ tests/functional/test_converters.sh | 1 + 10 files changed, 488 insertions(+), 8 deletions(-) rename examples/{ => converters}/convert_dcp_to_hf.py (100%) create mode 100644 examples/converters/convert_megatron_to_hf.py create mode 100644 tests/functional/test_converter_roundtrip.py create mode 100644 tests/functional/test_converters.sh diff --git a/3rdparty/NeMo-workspace/NeMo b/3rdparty/NeMo-workspace/NeMo index bab66472d2..4b7ded58d8 160000 --- a/3rdparty/NeMo-workspace/NeMo +++ b/3rdparty/NeMo-workspace/NeMo @@ -1 +1 @@ -Subproject commit bab66472d2f2eb05ab621dbad66ad6031e4ee19e +Subproject commit 4b7ded58d804bf3470499c6cfa385c6fa915879d diff --git a/docs/design-docs/checkpointing.md b/docs/design-docs/checkpointing.md index de7fb64fbe..5d3feae680 100644 --- a/docs/design-docs/checkpointing.md +++ b/docs/design-docs/checkpointing.md @@ -5,7 +5,7 @@ NeMo RL provides two checkpoint formats for Hugging Face models: Torch distribut A checkpoint converter is provided to convert a Torch distributed checkpoint checkpoint to Hugging Face format after training: ```sh -uv run examples/convert_dcp_to_hf.py --config= --dcp-ckpt-path= --hf-ckpt-path= +uv run examples/converters/convert_dcp_to_hf.py --config= --dcp-ckpt-path= --hf-ckpt-path= ``` Usually Hugging Face checkpoints keep the weights and tokenizer together (which we also recommend for provenance). You can copy it afterwards. 
Here's an end-to-end example: @@ -14,6 +14,6 @@ Usually Hugging Face checkpoints keep the weights and tokenizer together (which # Change to your appropriate checkpoint directory CKPT_DIR=results/sft/step_10 -uv run examples/convert_dcp_to_hf.py --config=$CKPT_DIR/config.yaml --dcp-ckpt-path=$CKPT_DIR/policy/weights --hf-ckpt-path=${CKPT_DIR}-hf +uv run examples/converters/convert_dcp_to_hf.py --config=$CKPT_DIR/config.yaml --dcp-ckpt-path=$CKPT_DIR/policy/weights --hf-ckpt-path=${CKPT_DIR}-hf rsync -ahP $CKPT_DIR/policy/tokenizer ${CKPT_DIR}-hf/ ``` diff --git a/docs/guides/eval.md b/docs/guides/eval.md index b6e312f574..0281bb21f7 100644 --- a/docs/guides/eval.md +++ b/docs/guides/eval.md @@ -9,11 +9,11 @@ To prepare for evaluation, first ensure your model is in the correct format, whi ### Convert DCP to HF (Optional) If you have trained a model and saved the checkpoint in the Pytorch DCP format, you first need to convert it to the Hugging Face format before running evaluation. -Use the `examples/convert_dcp_to_hf.py` script. You'll need the path to the training configuration file (`config.yaml`), the DCP checkpoint directory, and specify an output path for the HF format model. +Use the `examples/converters/convert_dcp_to_hf.py` script. You'll need the path to the training configuration file (`config.yaml`), the DCP checkpoint directory, and specify an output path for the HF format model. 
```sh # Example for a GRPO checkpoint at step 170 -uv run python examples/convert_dcp_to_hf.py \ +uv run python examples/converters/convert_dcp_to_hf.py \ --config results/grpo/step_170/config.yaml \ --dcp-ckpt-path results/grpo/step_170/policy/weights/ \ --hf-ckpt-path results/grpo/hf diff --git a/docs/guides/grpo-deepscaler.md b/docs/guides/grpo-deepscaler.md index 5beddf1689..456b2f2d8b 100644 --- a/docs/guides/grpo-deepscaler.md +++ b/docs/guides/grpo-deepscaler.md @@ -16,7 +16,7 @@ uv run examples/run_grpo_math.py --config=examples/configs/grpo-deepscaler-1.5b- At the end of each stage, you need to specify the Hugging Face checkpoint to continue training with. To get this checkpoint, we convert a model checkpoint to a Hugging Face checkpoint with the following command: ```sh -uv run examples/convert_dcp_to_hf.py --config=results/grpo-deepscaler-1.5b-8K/step_240/config.yaml --dcp-ckpt-path=results/grpo-deepscaler-1.5b-8K/step_240/policy/weights --hf-ckpt-path=results/grpo-deepscaler-1.5b-8K/step_240/hf +uv run examples/converters/convert_dcp_to_hf.py --config=results/grpo-deepscaler-1.5b-8K/step_240/config.yaml --dcp-ckpt-path=results/grpo-deepscaler-1.5b-8K/step_240/policy/weights --hf-ckpt-path=results/grpo-deepscaler-1.5b-8K/step_240/hf ``` When running the next command, we use the Hugging Face checkpoint as the initial checkpoint. We train with an 8K context window for 240 steps, a 16K context window for 290 steps, and a 24K context window for 50 steps. We run all experiments on a single 8XH100 80GB node or on a single 8XA100 80GB node. 
diff --git a/docs/guides/sft-openmathinstruct2.md b/docs/guides/sft-openmathinstruct2.md index dae8e8846d..6698c12bc0 100644 --- a/docs/guides/sft-openmathinstruct2.md +++ b/docs/guides/sft-openmathinstruct2.md @@ -26,7 +26,7 @@ The default config uses 8 GPUs (`cluster.gpus_per_node`) on 1 node (`cluster.num Throughout training, the checkpoints of the model will be saved to the `results/sft_openmathinstruct2` folder (specified by `checkpointing.checkpoint_dir`). To evaluate the model, we first need to convert the PyTorch distributed checkpoint to Hugging Face format: ``` -uv run examples/convert_dcp_to_hf.py \ +uv run examples/converters/convert_dcp_to_hf.py \ --config=results/sft_openmathinstruct2/step_1855/config.yaml \ --dcp-ckpt-path=results/sft_openmathinstruct2/step_1855/policy/weights \ --hf-ckpt-path=results/sft_openmathinstruct2/step_1855/hf diff --git a/examples/convert_dcp_to_hf.py b/examples/converters/convert_dcp_to_hf.py similarity index 100% rename from examples/convert_dcp_to_hf.py rename to examples/converters/convert_dcp_to_hf.py diff --git a/examples/converters/convert_megatron_to_hf.py b/examples/converters/convert_megatron_to_hf.py new file mode 100644 index 0000000000..ea4501286e --- /dev/null +++ b/examples/converters/convert_megatron_to_hf.py @@ -0,0 +1,67 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse + +import yaml + +from nemo_rl.models.megatron.community_import import export_model_from_megatron + + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + description="Convert Torch DCP checkpoint to HF checkpoint" + ) + parser.add_argument( + "--config", + type=str, + default=None, + help="Path to config.yaml file in the checkpoint directory", + ) + parser.add_argument( + "--megatron-ckpt-path", + type=str, + default=None, + help="Path to Megatron checkpoint", + ) + parser.add_argument( + "--hf-ckpt-path", type=str, default=None, help="Path to save HF checkpoint" + ) + # Parse known args for the script + args = parser.parse_args() + + return args + + +def main(): + """Main entry point.""" + args = parse_args() + + with open(args.config, "r") as f: + config = yaml.safe_load(f) + + model_name = config["policy"]["model_name"] + tokenizer_name = config["policy"]["tokenizer"]["name"] + + export_model_from_megatron( + hf_model_name=model_name, + input_path=args.megatron_ckpt_path, + output_path=args.hf_ckpt_path, + hf_tokenizer_path=tokenizer_name, + ) + + +if __name__ == "__main__": + main() diff --git a/nemo_rl/models/megatron/community_import.py b/nemo_rl/models/megatron/community_import.py index e83922e659..5ad061c54a 100644 --- a/nemo_rl/models/megatron/community_import.py +++ b/nemo_rl/models/megatron/community_import.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os + def import_model_from_hf_name(hf_model_name: str, output_path: str): if "llama" in hf_model_name.lower(): @@ -31,9 +33,50 @@ def import_model_from_hf_name(hf_model_name: str, output_path: str): output_path=output_path, ) else: - raise ValueError(f"Unknown model: {hf_model_name}") + raise ValueError( + f"Unknown model: {hf_model_name}. Currently, only Qwen2 and Llama are supported. 
" + "If you'd like to run with a different model, please raise an issue or consider adding your own converter." + ) importer.apply() # resetting mcore state import megatron.core.rerun_state_machine megatron.core.rerun_state_machine.destroy_rerun_state_machine() + + +def export_model_from_megatron( + hf_model_name: str, + input_path: str, + output_path: str, + hf_tokenizer_path: str, + overwrite: bool = False, +): + if os.path.exists(output_path) and not overwrite: + raise FileExistsError( + f"HF checkpoint already exists at {output_path}. Delete it to run or set overwrite=True." + ) + + if "llama" in hf_model_name.lower(): + from nemo.tron.converter.llama import HFLlamaExporter + + exporter_cls = HFLlamaExporter + elif "qwen" in hf_model_name.lower(): + from nemo.tron.converter.qwen import HFQwen2Exporter + + exporter_cls = HFQwen2Exporter + else: + raise ValueError( + f"Unknown model: {hf_model_name}. Currently, only Qwen2 and Llama are supported. " + "If you'd like to run with a different model, please raise an issue or consider adding your own converter." + ) + print(f"Exporting model {hf_model_name} to {output_path}...") + exporter = exporter_cls( + input_path=input_path, + output_path=output_path, + hf_tokenizer_path=hf_tokenizer_path, + ) + exporter.apply() + # resetting mcore state + import megatron.core.rerun_state_machine + + megatron.core.rerun_state_machine.destroy_rerun_state_machine() diff --git a/tests/functional/test_converter_roundtrip.py b/tests/functional/test_converter_roundtrip.py new file mode 100644 index 0000000000..e551d0e6b5 --- /dev/null +++ b/tests/functional/test_converter_roundtrip.py @@ -0,0 +1,369 @@ +#!/usr/bin/env python3 +""" +Functional test for converter roundtrip functionality. + +This test: +1. Starts with a HuggingFace Qwen/Qwen2-0.5B checkpoint +2. Converts the model to torch DCP format +3. Converts the model to Megatron format (using community import) +4. Converts both the DCP and Megatron checkpoints back to HF format +5. 
Asserts that the converted DCP and Megatron checkpoints are identical and match the original HF checkpoint +""" + +import os +import tempfile +from typing import Any, Dict + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +from nemo_rl.algorithms.utils import get_tokenizer +from nemo_rl.distributed.virtual_cluster import RayVirtualCluster +from nemo_rl.models.megatron.community_import import ( + export_model_from_megatron, + import_model_from_hf_name, +) +from nemo_rl.models.policy.lm_policy import Policy +from nemo_rl.utils.native_checkpoint import convert_dcp_to_hf + + +def create_test_config() -> Dict[str, Any]: + """Create a test configuration for SFT training.""" + return { + "sft": { + "max_num_epochs": 1, ## unused, no training is actually done + "max_num_steps": 2, + "val_period": 2, + "val_batches": 1, + "val_global_batch_size": 4, + "val_micro_batch_size": 2, + "val_at_start": False, + "seed": 42, + }, + "checkpointing": { + "enabled": True, + "checkpoint_dir": "/tmp/test_converter_checkpoints", + "metric_name": "val_loss", + "higher_is_better": False, + "keep_top_k": 1, + "save_period": 2, + }, + "policy": { + "model_name": "Qwen/Qwen2-0.5B", + "tokenizer": {"name": "Qwen/Qwen2-0.5B"}, + "train_global_batch_size": 4, + "train_micro_batch_size": 2, + "max_total_sequence_length": 128, + "precision": "bfloat16", + "fsdp_offload_enabled": False, + "activation_checkpointing_enabled": False, + "dtensor_cfg": { + "enabled": True, + "cpu_offload": False, + "sequence_parallel": False, + "activation_checkpointing": False, + "tensor_parallel_size": 1, + "context_parallel_size": 1, + "custom_parallel_plan": None, + }, + "dynamic_batching": {"enabled": False}, + "make_sequence_length_divisible_by": 1, + "max_grad_norm": 1.0, + "optimizer": { + "name": "torch.optim.AdamW", + "kwargs": { + "lr": 5.0e-6, + "weight_decay": 0.1, + "betas": [0.9, 0.98], + "eps": 1e-5, + "foreach": False, + "fused": False, + }, + }, + "megatron_cfg": { + 
"enabled": False, # We'll use DCP for this test + }, + }, + "data": { + "max_input_seq_length": 128, + "dataset_name": "squad", + "add_bos": True, + "add_eos": True, + "add_generation_prompt": False, + }, + "logger": { + "log_dir": "/tmp/test_converter_logs", + "wandb_enabled": False, + "tensorboard_enabled": False, + "monitor_gpus": False, + }, + "cluster": { + "gpus_per_node": 1, + "num_nodes": 1, + }, + } + + +def load_model_and_tokenizer(model_name: str): + """Load the original HF model and tokenizer.""" + print(f"Loading original model: {model_name}") + model = AutoModelForCausalLM.from_pretrained( + model_name, torch_dtype=torch.bfloat16, trust_remote_code=True + ) + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + return model, tokenizer + + +def get_model_state_dict(model): + """Get the state dict of a model, ensuring all tensors are on CPU.""" + state_dict = model.state_dict() + cpu_state_dict = {} + for key, value in state_dict.items(): + if isinstance(value, torch.Tensor): + cpu_state_dict[key] = value.detach().cpu() + else: + cpu_state_dict[key] = value + return cpu_state_dict + + +def assert_state_dicts_equal( + state_dict1: Dict[str, Any], state_dict2: Dict[str, Any], name1: str, name2: str +): + """Assert that two state dictionaries are equal.""" + print(f"Comparing {name1} vs {name2}") + + # Check that keys match + keys1 = set(state_dict1.keys()) + keys2 = set(state_dict2.keys()) + + if keys1 != keys2: + missing_in_2 = keys1 - keys2 + missing_in_1 = keys2 - keys1 + raise AssertionError( + f"State dict keys don't match between {name1} and {name2}.\n" + f"Keys in {name1} but not in {name2}: {missing_in_2}\n" + f"Keys in {name2} but not in {name1}: {missing_in_1}" + ) + + # Check that values match + for key in keys1: + val1 = state_dict1[key] + val2 = state_dict2[key] + + if isinstance(val1, torch.Tensor) and isinstance(val2, torch.Tensor): + if 
not torch.allclose(val1, val2, rtol=1e-5, atol=1e-5): + max_diff = torch.max(torch.abs(val1 - val2)).item() + raise AssertionError( + f"Tensors for key '{key}' don't match between {name1} and {name2}. " + f"Max difference: {max_diff}" + ) + elif val1 != val2: + raise AssertionError( + f"Non-tensor values for key '{key}' don't match between {name1} and {name2}. " + f"{name1}: {val1}, {name2}: {val2}" + ) + + print(f"✓ {name1} and {name2} are identical") + + +def create_dcp_checkpoint( + model_name: str, config: Dict[str, Any], temp_dir: str +) -> str: + """Create a DCP checkpoint without training.""" + print("Creating DCP checkpoint...") + + # Create cluster + cluster = RayVirtualCluster( + name="test-converter-cluster", + bundle_ct_per_node_list=[1], + use_gpus=True, + num_gpus_per_node=1, + max_colocated_worker_groups=1, + ) + + # Get tokenizer + tokenizer = get_tokenizer(config["policy"]["tokenizer"]) + + # Create policy + policy = Policy( + cluster=cluster, + config=config["policy"], + tokenizer=tokenizer, + init_reference_model=False, + ) + + # Save checkpoint without any training + dcp_checkpoint_path = os.path.join(temp_dir, "dcp_checkpoint") + policy.save_checkpoint(dcp_checkpoint_path) + + print(f"✓ DCP checkpoint saved to: {dcp_checkpoint_path}") + return dcp_checkpoint_path + + +def create_megatron_checkpoint(model_name: str, temp_dir: str) -> str: + """Create a Megatron checkpoint using community import.""" + print("Creating Megatron checkpoint...") + + megatron_checkpoint_path = os.path.join(temp_dir, "megatron_checkpoint") + import_model_from_hf_name(model_name, megatron_checkpoint_path) + + print(f"✓ Megatron checkpoint saved to: {megatron_checkpoint_path}") + return os.path.join(megatron_checkpoint_path, "iter_0000000") + + +def convert_dcp_to_hf_checkpoint(dcp_path: str, model_name: str, temp_dir: str) -> str: + """Convert DCP checkpoint to HF format.""" + print("Converting DCP to HF format...") + + hf_path = os.path.join(temp_dir, "dcp_to_hf") + 
convert_dcp_to_hf( + dcp_ckpt_path=dcp_path, + hf_ckpt_path=hf_path, + model_name_or_path=model_name, + tokenizer_name_or_path=model_name, + overwrite=True, + ) + + print(f"✓ DCP to HF conversion saved to: {hf_path}") + return hf_path + + +def convert_megatron_to_hf_checkpoint( + megatron_path: str, model_name: str, temp_dir: str +) -> str: + """Convert Megatron checkpoint to HF format.""" + print("Converting Megatron to HF format...") + + hf_path = os.path.join(temp_dir, "megatron_to_hf") + + # Get tokenizer for the export + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + tokenizer_path = os.path.join(temp_dir, "tokenizer") + tokenizer.save_pretrained(tokenizer_path) + + export_model_from_megatron( + hf_model_name=model_name, + input_path=megatron_path, + output_path=hf_path, + hf_tokenizer_path=tokenizer_path, + overwrite=True, + ) + + print(f"✓ Megatron to HF conversion saved to: {hf_path}") + return hf_path + + +def main(): + """Main test function.""" + print("=" * 80) + print("Starting Converter Roundtrip Functional Test") + print("=" * 80) + + # TODO(@ashors): test more models + model_name = "Qwen/Qwen2-0.5B" + + with tempfile.TemporaryDirectory() as temp_dir: + print(f"Using temporary directory: {temp_dir}") + + # Step 1: Load original HF model + print("\n" + "=" * 60) + print("STEP 1: Loading original HuggingFace model") + print("=" * 60) + original_model, original_tokenizer = load_model_and_tokenizer(model_name) + original_state_dict = get_model_state_dict(original_model) + + # Step 2: Create DCP checkpoint + print("\n" + "=" * 60) + print("STEP 2: Creating DCP checkpoint") + print("=" * 60) + config = create_test_config() + dcp_checkpoint_path = create_dcp_checkpoint(model_name, config, temp_dir) + + # Step 3: Create Megatron checkpoint + print("\n" + "=" * 60) + print("STEP 3: Creating Megatron checkpoint") + print("=" * 60) + megatron_checkpoint_path = create_megatron_checkpoint(model_name, temp_dir) + + # Step 4: Convert 
DCP to HF + print("\n" + "=" * 60) + print("STEP 4: Converting DCP to HF format") + print("=" * 60) + dcp_to_hf_path = convert_dcp_to_hf_checkpoint( + dcp_checkpoint_path, model_name, temp_dir + ) + + # Step 5: Convert Megatron to HF + print("\n" + "=" * 60) + print("STEP 5: Converting Megatron to HF format") + print("=" * 60) + megatron_to_hf_path = convert_megatron_to_hf_checkpoint( + megatron_checkpoint_path, model_name, temp_dir + ) + + # Step 6: Load converted models and compare + print("\n" + "=" * 60) + print("STEP 6: Loading converted models and comparing") + print("=" * 60) + + # Load DCP-converted model + dcp_converted_model = AutoModelForCausalLM.from_pretrained( + dcp_to_hf_path, torch_dtype=torch.bfloat16, trust_remote_code=True + ) + dcp_converted_state_dict = get_model_state_dict(dcp_converted_model) + + # Load Megatron-converted model + megatron_converted_model = AutoModelForCausalLM.from_pretrained( + megatron_to_hf_path, torch_dtype=torch.bfloat16, trust_remote_code=True + ) + megatron_converted_state_dict = get_model_state_dict(megatron_converted_model) + + # Step 7: Assertions + print("\n" + "=" * 60) + print("STEP 7: Running assertions") + print("=" * 60) + + # Compare DCP-converted vs Megatron-converted + print("Comparing DCP-converted HF model with Megatron-converted HF model...") + assert_state_dicts_equal( + dcp_converted_state_dict, + megatron_converted_state_dict, + "DCP-converted HF model", + "Megatron-converted HF model", + ) + + print("✓ DCP and Megatron roundtrip checkpoints are identical!") + + # Verify that both converted models have the expected structure + expected_keys = set(original_state_dict.keys()) + dcp_keys = set(dcp_converted_state_dict.keys()) + megatron_keys = set(megatron_converted_state_dict.keys()) + + assert dcp_keys == expected_keys, ( + f"DCP converted model missing keys: {expected_keys - dcp_keys}" + ) + assert megatron_keys == expected_keys, ( + f"Megatron converted model missing keys: {expected_keys - 
megatron_keys}" + ) + + print("✓ All converted models have the expected structure") + + # Test that we can do a forward pass with both converted models + print("Testing forward passes...") + test_input = torch.randint(0, 1000, (1, 10)) + + with torch.no_grad(): + dcp_output = dcp_converted_model(test_input) + megatron_output = megatron_converted_model(test_input) + + print("✓ Both converted models can perform forward passes") + + print("\n" + "=" * 80) + print("✓ ALL TESTS PASSED!") + print("=" * 80) + + +if __name__ == "__main__": + main() diff --git a/tests/functional/test_converters.sh b/tests/functional/test_converters.sh new file mode 100644 index 0000000000..ef789ecf90 --- /dev/null +++ b/tests/functional/test_converters.sh @@ -0,0 +1 @@ +uv run --extra mcore tests/functional/test_converter_roundtrip.py \ No newline at end of file From 4022bee33e31a3225f36f9a09ffbef28f3c25932 Mon Sep 17 00:00:00 2001 From: Anna Shors Date: Fri, 27 Jun 2025 20:54:13 -0700 Subject: [PATCH 24/44] docs: Add a note on supported backends (#553) Signed-off-by: ashors1 Signed-off-by: Xuehan --- README.md | 42 +++++++++++++++++++++++++++ docs/design-docs/training-backends.md | 35 ++++++++++++++++++++++ docs/index.md | 1 + 3 files changed, 78 insertions(+) create mode 100644 docs/design-docs/training-backends.md diff --git a/README.md b/README.md index e83d242734..8f5b3b54e5 100644 --- a/README.md +++ b/README.md @@ -16,10 +16,12 @@ - [DPO](#dpo) - [DPO Single Node](#dpo-single-node) - [DPO Multi-node](#dpo-multi-node) + - [Supported Training Backends](#training-backends) - [Evaluation](#evaluation) - [Convert Model Format (Optional)](#convert-model-format-optional) - [Run Evaluation](#run-evaluation) - [Set Up Clusters](#set-up-clusters) + - [Tips and Tricks](#tips-and-tricks) - [Citation](#citation) - [Contributing](#contributing) - [Licenses](#licenses) @@ -152,6 +154,18 @@ uv run python examples/run_grpo_math.py \ logger.num_val_samples_to_print=10 ``` +The default configuration 
uses the DTensor training backend. We also provide a config `examples/configs/grpo_math_1B_megatron.yaml` which is set up to use the Megatron backend out of the box. + +To train using this config on a single GPU: + +```sh +# Run a GRPO math example on 1 GPU using the Megatron backend +uv run python examples/run_grpo_math.py \ + --config examples/configs/grpo_math_1B_megatron.yaml +``` + +For additional details on supported backends and how to configure the training backend to suit your setup, refer to the [Training Backends documentation](docs/design-docs/training-backends.md). + ### GRPO Multi-node ```sh @@ -310,6 +324,15 @@ sbatch \ ray.sub ``` +## Training Backends + +NeMo RL supports multiple training backends to accommodate different model sizes and hardware configurations: + +- **DTensor (FSDP2)** - PyTorch's next-generation distributed training with improved memory efficiency +- **Megatron** - NVIDIA's high-performance training framework for scaling to large models (>100B parameters) + +The training backend is automatically determined based on your YAML configuration settings. For detailed information on backend selection, configuration, and examples, see the [Training Backends documentation](docs/design-docs/training-backends.md). + ## Evaluation We provide evaluation tools to assess model capabilities. @@ -360,6 +383,25 @@ Refer to `examples/configs/eval.yaml` for a full list of parameters that can be For detailed instructions on how to set up and launch NeMo RL on Slurm or Kubernetes clusters, please refer to the dedicated [Cluster Start](docs/cluster.md) documentation. +## Tips and Tricks +- If you forget to initialize the NeMo and Megatron submodules when cloning the NeMo-RL repository, you may run into an error like this: + + ```sh + ModuleNotFoundError: No module named 'megatron' + ``` + + If you see this error, there is likely an issue with your virtual environments. 
To fix this, first initialize the submodules:
+
+  ```sh
+  git submodule update --init --recursive
+  ```
+
+  and then force a rebuild of the virtual environments by setting `NRL_FORCE_REBUILD_VENVS=true` next time you launch a run:
+
+  ```sh
+  NRL_FORCE_REBUILD_VENVS=true uv run examples/run_grpo.py ...
+  ```
+
 ## Citation
 
 If you use NeMo RL in your research, please cite it using the following BibTeX entry:
diff --git a/docs/design-docs/training-backends.md b/docs/design-docs/training-backends.md
new file mode 100644
index 0000000000..0448284971
--- /dev/null
+++ b/docs/design-docs/training-backends.md
@@ -0,0 +1,35 @@
+# Training Backends
+
+NeMo RL supports multiple training backends to accommodate different model sizes and hardware configurations.
+
+## Available Backends
+
+- **DTensor (FSDP2)** - PyTorch's next-generation distributed training with improved memory efficiency
+- **Megatron** - NVIDIA's high-performance training framework for scaling to large models (>100B parameters)
+
+## Backend Selection
+
+The training backend is automatically determined based on your YAML configuration settings. Here's how to configure each backend.
+
+### Megatron Backend
+To enable Megatron-based training:
+
+1. Add the `megatron_cfg` key to your policy configuration.
+2. Set `policy.megatron_cfg.enabled=True`.
+3. Refer to [examples/configs/grpo_math_1B_megatron.yaml](../../examples/configs/grpo_math_1B_megatron.yaml) for a complete configuration example.
+
+_Note_: When using Megatron, the optimizer and learning rate schedule are configured through `policy.megatron_cfg.optimizer` and `policy.megatron_cfg.scheduler`, respectively.
+
+### DTensor Backend
+To enable DTensor (FSDP2) training:
+
+1. Set `policy.dtensor_config.enabled=True`.
+2. Refer to [examples/configs/grpo_math_1B.yaml](../../examples/configs/grpo_math_1B.yaml) for a configuration example. 
+ +## Backend Priority + +**Megatron takes precedence over DTensor.** If both backends are enabled simultaneously (`policy.megatron_cfg.enabled=True` and `policy.dtensor_config.enabled=True`), the Megatron backend will be used. + +## Configuration Examples + +For comprehensive examples of each algorithm and backend, see the [examples/configs/recipes/llm](https://github.com/NVIDIA-NeMo/RL/tree/main/examples/configs/recipes/llm) folder. This directory contains ready-to-use configurations for various supported combinations. diff --git a/docs/index.md b/docs/index.md index c7ad002631..33d507b6f4 100644 --- a/docs/index.md +++ b/docs/index.md @@ -63,4 +63,5 @@ design-docs/generation.md design-docs/checkpointing.md design-docs/loss-functions.md design-docs/fsdp2-parallel-plan.md +design-docs/training-backends.md ``` From f03e596fdac30e913a19067dc6f3258e7eeb5ee9 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Fri, 27 Jun 2025 20:55:39 -0700 Subject: [PATCH 25/44] feat: Support pass@k (#536) Signed-off-by: Dheeraj Peri Signed-off-by: Xuehan --- examples/configs/evals/eval.yaml | 3 +- nemo_rl/evals/eval.py | 62 +++++++++++++++++++++++++------- 2 files changed, 52 insertions(+), 13 deletions(-) diff --git a/examples/configs/evals/eval.yaml b/examples/configs/evals/eval.yaml index 1c21af99c4..439acff25e 100644 --- a/examples/configs/evals/eval.yaml +++ b/examples/configs/evals/eval.yaml @@ -1,8 +1,9 @@ # Evaluation Configuration eval: - metric: "pass@1" # only pass@1 is supported now + metric: "pass@k" num_tests_per_prompt: 1 # every prompt will be tested num_tests_per_prompt times and use the average score as the final score seed: 42 + pass_k_value: 1 generation: backend: "vllm" # only vllm is supported for evaluation diff --git a/nemo_rl/evals/eval.py b/nemo_rl/evals/eval.py index 9ca1e90762..5788e1971e 100644 --- a/nemo_rl/evals/eval.py +++ b/nemo_rl/evals/eval.py @@ -16,6 +16,7 @@ from typing import TypedDict import ray +import torch from torch.utils.data import 
DataLoader from transformers import AutoTokenizer @@ -38,6 +39,7 @@ class EvalConfig(TypedDict): metric: str num_tests_per_prompt: int seed: int + pass_k_value: int class MasterConfig(TypedDict): @@ -83,16 +85,26 @@ def setup( # Check settings metric = eval_config["metric"] + pass_k_value = eval_config["pass_k_value"] num_tests_per_prompt = eval_config["num_tests_per_prompt"] temperature = generation_config["temperature"] top_k = generation_config["top_k"] - # TODO @yukih: support pass@k and cons@k - assert metric in ["pass@1"], f"Invalid metric: {metric}" + + # TODO @yukih: support cons@k + # Validate metrics + assert metric in ["pass@k"], f"Invalid metric: {metric}" if num_tests_per_prompt > 1: assert temperature > 0 and top_k != 1, ( "temperature > 0 and top_k != 1 are required for multiple samples" ) + assert pass_k_value >= 1, ( + "pass_k_value must be greater than or equal to 1 for pass@k metric" + ) + assert num_tests_per_prompt >= pass_k_value, ( + "num_tests_per_prompt must be greater than or equal to pass_k_value for pass@k metric" + ) + # ========================== # Data # ========================== @@ -150,6 +162,34 @@ def setup( # =============================================================================== +def eval_pass_k(rewards: torch.Tensor, num_tests_per_prompt: int, k: int) -> float: + """Evaluate pass@k score using an unbiased estimator. 
+ + Reference: https://github.com/huggingface/evaluate/blob/32546aafec25cdc2a5d7dd9f941fc5be56ba122f/metrics/code_eval/code_eval.py#L198-L213 + Args: + rewards: Tensor of shape (batch_size * num_tests_per_prompt) + k: int (pass@k value) + + Returns: + pass_k_score: float + """ + + def eval_single_chunk(n: int, c: int, k: int) -> float: + """Calculates 1 - comb(n - c, k) / comb(n, k).""" + if n - c < k: + return 1.0 + return float(1.0 - torch.prod(1.0 - k / torch.arange(n - c + 1, n + 1)).item()) + + # rewards is a 1d tensor of size (batch_size * num_tests_per_prompt) + group_rewards = rewards.split(num_tests_per_prompt) + pass_k_score = 0.0 + for group_reward in group_rewards: + num_correct = group_reward.sum().item() + pass_k_score += eval_single_chunk(num_tests_per_prompt, num_correct, k) + + return pass_k_score + + def run_env_eval(vllm_generation, dataloader, env, master_config): """Main entry point for running evaluation using environment. @@ -166,13 +206,11 @@ def run_env_eval(vllm_generation, dataloader, env, master_config): eval_config = master_config["eval"] metric = eval_config["metric"] num_tests_per_prompt = eval_config["num_tests_per_prompt"] + pass_k_value = eval_config["pass_k_value"] # Run evaluation loop - score, count = 0.0, 0 + score = 0.0 for batch in dataloader: - # update stats - count += batch.size * num_tests_per_prompt - # measure multiple samples if num_tests_per_prompt > 1: batch = batch.repeat_interleave(num_tests_per_prompt) @@ -203,10 +241,10 @@ def run_env_eval(vllm_generation, dataloader, env, master_config): for i in range(len(batch["message_log"])) ] env_return = ray.get(env.step.remote(to_env, batch["extra_env_info"])) - + rewards = env_return.rewards # update stats - if metric == "pass@1": - score += env_return.rewards.sum().item() + if metric == "pass@k": + score += eval_pass_k(rewards, num_tests_per_prompt, pass_k_value) else: raise ValueError(f"Invalid metric: {metric}") @@ -221,11 +259,11 @@ def run_env_eval(vllm_generation, 
dataloader, env, master_config): temperature = generation_config["temperature"] top_p = generation_config["top_p"] top_k = generation_config["top_k"] - average_score = score / count + average_score = score / len(dataloader.dataset) print("\n" + "=" * 60) print(f"{model_name=} {dataset_name=}") print(f"{max_new_tokens=} {temperature=} {top_p=} {top_k=}\n") - print(f"{metric=} {num_tests_per_prompt=}\n") - print(f"score={average_score:.4f} ({score}/{count})") + print(f"{metric=} {pass_k_value=} {num_tests_per_prompt=}\n") + print(f"score={average_score:.4f} ({score}/{len(dataloader.dataset)})") print("=" * 60 + "\n") From 8f444925d6d9c501e98acbd2bda6321bbd4ea15d Mon Sep 17 00:00:00 2001 From: Sahil Jain <48468750+SahilJain314@users.noreply.github.com> Date: Fri, 27 Jun 2025 21:33:21 -0700 Subject: [PATCH 26/44] fix: Megatron config fixes (#576) Signed-off-by: Sahil Jain Signed-off-by: Xuehan --- examples/configs/grpo_math_1B_megatron.yaml | 4 ++-- examples/configs/grpo_math_70B_megatron.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/configs/grpo_math_1B_megatron.yaml b/examples/configs/grpo_math_1B_megatron.yaml index 6b07317ed6..237fbb0df1 100644 --- a/examples/configs/grpo_math_1B_megatron.yaml +++ b/examples/configs/grpo_math_1B_megatron.yaml @@ -14,8 +14,8 @@ grpo: loss_fn: reference_policy_kl_penalty: 0.01 - ratio_eps_min: 0.2 - ratio_eps_max: 0.2 + ratio_clip_min: 0.2 + ratio_clip_max: 0.2 # (default off) loss formulation improvements (docs/guides/grpo.md#loss) use_on_policy_kl_approximation: false use_importance_sampling_correction: false diff --git a/examples/configs/grpo_math_70B_megatron.yaml b/examples/configs/grpo_math_70B_megatron.yaml index 4d071b3110..15a65c5ce6 100644 --- a/examples/configs/grpo_math_70B_megatron.yaml +++ b/examples/configs/grpo_math_70B_megatron.yaml @@ -68,4 +68,4 @@ policy: cluster: gpus_per_node: 8 - num_nodes: 1 + num_nodes: 8 From 39b8f25536b5a1bef9db34e0d341ba7b986ca6d7 Mon Sep 17 00:00:00 
2001 From: Xuehan Date: Mon, 30 Jun 2025 05:14:08 +0000 Subject: [PATCH 27/44] update docs for the new eval. Signed-off-by: Xuehan Xiong Signed-off-by: Xuehan --- docs/guides/eval.md | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/docs/guides/eval.md b/docs/guides/eval.md index 0281bb21f7..8648940b2e 100644 --- a/docs/guides/eval.md +++ b/docs/guides/eval.md @@ -25,7 +25,7 @@ Once the conversion is complete, you can override the `generation.model_name` to ### Prepare the Evaluation Configuration **Override with Custom Settings** -To run the evaluation, you can use the [default configuration file](../../examples/configs/eval.yaml). Alternatively, you can specify a custom one or override some settings via the command line. +To run the evaluation, you can use the [default configuration file](../../examples/configs/evals/eval.yaml). Alternatively, you can specify a custom one or override some settings via the command line. The default configuration employs greedy sampling to evaluate Qwen2.5-Math-1.5B-Instruct on AIME-2024. 
@@ -51,16 +51,22 @@ uv run python examples/run_eval.py generation.model_name=$PWD/results/grpo/hf # Run evaluation script with custom config file uv run python examples/run_eval.py --config path/to/custom_config.yaml +# Run evaluation script on one of the supported benchmarks (e.g., GPQA) +uv run python examples/run_eval.py --config examples/configs/evals/gpqa_eval.yaml + +# Run evaluation script with a local dataset +uv run python examples/run_eval.py --config examples/configs/evals/local_eval.yaml + # Override specific config values via command line # Example: Evaluation of DeepScaleR-1.5B-Preview on MATH-500 using 8 GPUs # Pass@1 accuracy averaged over 16 samples for each problem uv run python examples/run_eval.py \ + --config examples/configs/evals/math_eval.yaml \ generation.model_name=agentica-org/DeepScaleR-1.5B-Preview \ generation.temperature=0.6 \ generation.top_p=0.95 \ - generation.vllm_cfg.max_model_len=32768 \ - data.dataset_name=HuggingFaceH4/MATH-500 \ - data.dataset_key=test \ + generation.vllm_cfg.max_model_len=32768 \ + data.dataset_name="math500" \ eval.num_tests_per_prompt=16 \ cluster.gpus_per_node=8 ``` @@ -80,3 +86,12 @@ metric='pass@1' num_tests_per_prompt=1 score=0.1000 (3.0/30) ============================================================ ``` + +## List of currently supported benchmarks + +- [AIME-2024](../../nemo_rl/data/eval_datasets/aime2024.py) +- [GPQA and GPQA-diamond](../../nemo_rl/data/eval_datasets/gpqa.py) +- [MATH and MATH-500](../../nemo_rl/data/eval_datasets/math.py) +- [MMLU](../../nemo_rl/data/eval_datasets/mmlu.py) +- [MMLU-Pro](../../nemo_rl/data/eval_datasets/mmlu_pro.py) + From 8f6ac977385d576e6aeb1198b702dbf90724b030 Mon Sep 17 00:00:00 2001 From: Anna Shors Date: Sun, 29 Jun 2025 18:24:48 -0700 Subject: [PATCH 28/44] docs: move training backends section (#580) Signed-off-by: ashors1 Signed-off-by: Xuehan --- README.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md 
b/README.md index 8f5b3b54e5..9f605b1b5c 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ - [📣 News](#-news) - [Features](#features) - [Prerequisites](#prerequisites) + - [Supported Training Backends](#training-backends) - [GRPO](#grpo) - [GRPO Single Node](#grpo-single-node) - [GRPO Multi-node](#grpo-multi-node) @@ -16,7 +17,6 @@ - [DPO](#dpo) - [DPO Single Node](#dpo-single-node) - [DPO Multi-node](#dpo-multi-node) - - [Supported Training Backends](#training-backends) - [Evaluation](#evaluation) - [Convert Model Format (Optional)](#convert-model-format-optional) - [Run Evaluation](#run-evaluation) @@ -122,6 +122,15 @@ uv venv - Ensure you have the necessary CUDA drivers and PyTorch installed compatible with your hardware. - **Reminder**: Don't forget to set your `HF_HOME`, `WANDB_API_KEY`, and `HF_DATASETS_CACHE` (if needed). You'll need to do a `huggingface-cli login` as well for Llama models. +## Training Backends + +NeMo RL supports multiple training backends to accommodate different model sizes and hardware configurations: + +- **DTensor (FSDP2)** - PyTorch's next-generation distributed training with improved memory efficiency +- **Megatron** - NVIDIA's high-performance training framework for scaling to large models (>100B parameters) + +The training backend is automatically determined based on your YAML configuration settings. For detailed information on backend selection, configuration, and examples, see the [Training Backends documentation](docs/design-docs/training-backends.md). + ## GRPO We have a reference GRPO experiment config set up trained for math benchmarks using the [OpenInstructMath2](https://huggingface.co/datasets/nvidia/OpenMathInstruct-2) dataset. 
@@ -324,15 +333,6 @@ sbatch \ ray.sub ``` -## Training Backends - -NeMo RL supports multiple training backends to accommodate different model sizes and hardware configurations: - -- **DTensor (FSDP2)** - PyTorch's next-generation distributed training with improved memory efficiency -- **Megatron** - NVIDIA's high-performance training framework for scaling to large models (>100B parameters) - -The training backend is automatically determined based on your YAML configuration settings. For detailed information on backend selection, configuration, and examples, see the [Training Backends documentation](docs/design-docs/training-backends.md). - ## Evaluation We provide evaluation tools to assess model capabilities. From 29753155c730931f6ea6ef67f4406d05002d93b4 Mon Sep 17 00:00:00 2001 From: Anna Shors Date: Fri, 27 Jun 2025 20:54:13 -0700 Subject: [PATCH 29/44] docs: Add a note on supported backends (#553) Signed-off-by: ashors1 Signed-off-by: Xuehan --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index 9f605b1b5c..6e86a9835e 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,7 @@ - [DPO](#dpo) - [DPO Single Node](#dpo-single-node) - [DPO Multi-node](#dpo-multi-node) + - [Supported Training Backends](#training-backends) - [Evaluation](#evaluation) - [Convert Model Format (Optional)](#convert-model-format-optional) - [Run Evaluation](#run-evaluation) @@ -333,6 +334,15 @@ sbatch \ ray.sub ``` +## Training Backends + +NeMo RL supports multiple training backends to accommodate different model sizes and hardware configurations: + +- **DTensor (FSDP2)** - PyTorch's next-generation distributed training with improved memory efficiency +- **Megatron** - NVIDIA's high-performance training framework for scaling to large models (>100B parameters) + +The training backend is automatically determined based on your YAML configuration settings. 
For detailed information on backend selection, configuration, and examples, see the [Training Backends documentation](docs/design-docs/training-backends.md). + ## Evaluation We provide evaluation tools to assess model capabilities. From 26f8fb227817c672a2437f7763fa150775d3118d Mon Sep 17 00:00:00 2001 From: Anna Shors Date: Sun, 29 Jun 2025 18:24:48 -0700 Subject: [PATCH 30/44] docs: move training backends section (#580) Signed-off-by: ashors1 Signed-off-by: Xuehan --- README.md | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/README.md b/README.md index 6e86a9835e..9f605b1b5c 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,6 @@ - [DPO](#dpo) - [DPO Single Node](#dpo-single-node) - [DPO Multi-node](#dpo-multi-node) - - [Supported Training Backends](#training-backends) - [Evaluation](#evaluation) - [Convert Model Format (Optional)](#convert-model-format-optional) - [Run Evaluation](#run-evaluation) @@ -334,15 +333,6 @@ sbatch \ ray.sub ``` -## Training Backends - -NeMo RL supports multiple training backends to accommodate different model sizes and hardware configurations: - -- **DTensor (FSDP2)** - PyTorch's next-generation distributed training with improved memory efficiency -- **Megatron** - NVIDIA's high-performance training framework for scaling to large models (>100B parameters) - -The training backend is automatically determined based on your YAML configuration settings. For detailed information on backend selection, configuration, and examples, see the [Training Backends documentation](docs/design-docs/training-backends.md). - ## Evaluation We provide evaluation tools to assess model capabilities. From 1055f5ea99c8949d89681e51db9b0d8d17e58e17 Mon Sep 17 00:00:00 2001 From: Xuehan Date: Mon, 30 Jun 2025 06:04:37 +0000 Subject: [PATCH 31/44] Update more docs for the new eval. 
Signed-off-by: Xuehan Xiong Signed-off-by: Xuehan --- README.md | 2 +- docs/guides/eval.md | 2 +- docs/guides/sft-openmathinstruct2.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 9f605b1b5c..6beca2f50b 100644 --- a/README.md +++ b/README.md @@ -377,7 +377,7 @@ uv run python examples/run_eval.py \ ``` > **Note:** Evaluation results may vary slightly due to various factors, such as sampling parameters, random seed, inference engine version, and inference engine settings. -Refer to `examples/configs/eval.yaml` for a full list of parameters that can be overridden. For an in-depth explanation of evaluation, refer to the [Evaluation documentation](docs/guides/eval.md). +Refer to `examples/configs/evals/eval.yaml` for a full list of parameters that can be overridden. For an in-depth explanation of evaluation, refer to the [Evaluation documentation](docs/guides/eval.md). ## Set Up Clusters diff --git a/docs/guides/eval.md b/docs/guides/eval.md index 8648940b2e..fc8cb19baf 100644 --- a/docs/guides/eval.md +++ b/docs/guides/eval.md @@ -42,7 +42,7 @@ We will use the `run_eval.py` script to run an evaluation using a model directly Note that the evaluation script only supports the Hugging Face format model. If you haven't converted your DCP format model, you should back to [Convert DCP to HF](#convert-dcp-to-hf-optional) and follow the guide to convert your model. 
```sh -# Run evaluation script with default config (examples/configs/eval.yaml) +# Run evaluation script with default config (examples/configs/evals/eval.yaml) uv run python examples/run_eval.py # Run evaluation script with converted model diff --git a/docs/guides/sft-openmathinstruct2.md b/docs/guides/sft-openmathinstruct2.md index 6698c12bc0..1228d42a7d 100644 --- a/docs/guides/sft-openmathinstruct2.md +++ b/docs/guides/sft-openmathinstruct2.md @@ -38,7 +38,7 @@ To evaluate on the [MATH-500 benchmark](https://huggingface.co/datasets/HuggingF ``` uv run examples/run_eval.py \ - --config=examples/configs/eval.yaml \ + --config=examples/configs/evals/eval.yaml \ generation.model_name=results/sft_openmathinstruct2/step_1855/hf \ tokenizer.name=meta-llama/Llama-3.1-8B-Instruct \ data.dataset_name=HuggingFaceH4/MATH-500 \ From aaa3eebd5bd2208ba38527b225b6825b2ddbf2a5 Mon Sep 17 00:00:00 2001 From: Xuehan Date: Wed, 2 Jul 2025 16:44:59 +0000 Subject: [PATCH 32/44] fix lint errors. Signed-off-by: Xuehan Xiong --- docs/guides/eval.md | 2 +- nemo_rl/data/eval_datasets/gpqa.py | 8 +++--- nemo_rl/data/processors.py | 1 - nemo_rl/environments/math_environment.py | 15 ++++++++--- nemo_rl/evals/answer_parsing.py | 6 +---- tests/unit/data/eval_datasets/test_gpqa.py | 6 +++-- tests/unit/data/eval_datasets/test_math.py | 6 +++-- tests/unit/data/eval_datasets/test_mmlu.py | 6 +++-- .../environments/test_math_environment.py | 26 ++++++++++++++----- 9 files changed, 49 insertions(+), 27 deletions(-) diff --git a/docs/guides/eval.md b/docs/guides/eval.md index fc8cb19baf..b4f97b8c64 100644 --- a/docs/guides/eval.md +++ b/docs/guides/eval.md @@ -54,7 +54,7 @@ uv run python examples/run_eval.py --config path/to/custom_config.yaml # Run evaluation script on one of the supported benchmarks (e.g., GPQA) uv run python examples/run_eval.py --config examples/configs/evals/gpqa_eval.yaml -# Run evaluation script with a local dataset +# Run evaluation script with a local dataset that is 
prefetched as a csv file. uv run python examples/run_eval.py --config examples/configs/evals/local_eval.yaml # Override specific config values via command line diff --git a/nemo_rl/data/eval_datasets/gpqa.py b/nemo_rl/data/eval_datasets/gpqa.py index 4eb05014c6..9cadceb49e 100644 --- a/nemo_rl/data/eval_datasets/gpqa.py +++ b/nemo_rl/data/eval_datasets/gpqa.py @@ -20,7 +20,7 @@ def __init__( self._rng = random.Random() self.rekeyed_ds = ds.map(self._rekey, remove_columns=ds.column_names) self.task_spec = TaskDataSpec( - task_name=f'GPQA_{variant}', + task_name=f"GPQA_{variant}", prompt_file=prompt_file, system_prompt_file=system_prompt_file, ) @@ -38,12 +38,12 @@ def _rekey(self, data: dict[str, Any]): correct_index = choices.index(data["Correct Answer"]) correct_answer = "ABCD"[correct_index] return { - 'question': data['Question'], - 'options': dict( + "question": data["Question"], + "options": dict( A=choices[0], B=choices[1], C=choices[2], D=choices[3], ), - 'answer': correct_answer, + "answer": correct_answer, } diff --git a/nemo_rl/data/processors.py b/nemo_rl/data/processors.py index 5fd35d4078..4d207abad8 100644 --- a/nemo_rl/data/processors.py +++ b/nemo_rl/data/processors.py @@ -11,7 +11,6 @@ # Example of a generic math data processor -# TaskDataProcessFnCallable def math_data_processor( datum_dict: dict[str, Any], task_data_spec: TaskDataSpec, diff --git a/nemo_rl/environments/math_environment.py b/nemo_rl/environments/math_environment.py index 3f2c7cf7af..8dd5247f1c 100644 --- a/nemo_rl/environments/math_environment.py +++ b/nemo_rl/environments/math_environment.py @@ -102,7 +102,6 @@ def verify( @ray.remote class MultichoiceVerifyWorker: - def verify( self, pred_responses: list[str], ground_truths: list[str] ) -> list[float]: @@ -120,10 +119,14 @@ def verify( response = answer_parsing.normalize_response(response) extracted_answer = None for answer_regex in answer_parsing.MULTILINGUAL_ANSWER_REGEXES: - regex = 
answer_parsing.MULTILINGUAL_ANSWER_PATTERN_TEMPLATE.format(answer_regex) + regex = answer_parsing.MULTILINGUAL_ANSWER_PATTERN_TEMPLATE.format( + answer_regex + ) match = re.search(regex, response) if match: - extracted_answer = answer_parsing.normalize_extracted_answer(match.group(1)) + extracted_answer = answer_parsing.normalize_extracted_answer( + match.group(1) + ) break score = 1.0 if extracted_answer == ground_truth else 0.0 results.append(score) @@ -139,7 +142,11 @@ class MathEnvironment(EnvironmentInterface): def __init__(self, cfg: MathEnvConfig): self.cfg = cfg self.num_workers = cfg["num_workers"] - worker_cls = MultichoiceVerifyWorker if cfg.get("verifier_type", "math") == "multichoice" else HFVerifyWorker + worker_cls = ( + MultichoiceVerifyWorker + if cfg.get("verifier_type", "math") == "multichoice" + else HFVerifyWorker + ) self.workers = [ worker_cls.options( # type: ignore # (decorated with @ray.remote) runtime_env={"py_executable": PY_EXECUTABLES.SYSTEM} diff --git a/nemo_rl/evals/answer_parsing.py b/nemo_rl/evals/answer_parsing.py index d4e2fddd6f..8b62026360 100644 --- a/nemo_rl/evals/answer_parsing.py +++ b/nemo_rl/evals/answer_parsing.py @@ -1,6 +1,5 @@ """Contains utility functions for answer parsing.""" - MULTILINGUAL_ANSWER_PATTERN_TEMPLATE = ( "(?i){}[ \t]*([A-D]|[أ-د]|[অ]|[ব]|[ড]|[ঢ]|[A]|[B]|[C]|[D])" ) @@ -73,10 +72,7 @@ def normalize_extracted_answer(extracted_answer: str) -> str: def normalize_response(response: str) -> str: - """ - Normalize the response by removing markdown and LaTeX formatting that may prevent a match. 
- """ - + """Normalize the response by removing markdown and LaTeX formatting that may prevent a match.""" return ( response.replace("**", "") .replace("$\\boxed{", "") diff --git a/tests/unit/data/eval_datasets/test_gpqa.py b/tests/unit/data/eval_datasets/test_gpqa.py index 033a11b6ff..3441f11974 100644 --- a/tests/unit/data/eval_datasets/test_gpqa.py +++ b/tests/unit/data/eval_datasets/test_gpqa.py @@ -36,5 +36,7 @@ def test_gpqa_dataset(): add_special_tokens=False, ) - assert default_templated == f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n{example["question"]}<|im_end|>\n" - + assert ( + default_templated + == f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n{example['question']}<|im_end|>\n" + ) diff --git a/tests/unit/data/eval_datasets/test_math.py b/tests/unit/data/eval_datasets/test_math.py index 7a524654fa..3bab184f1a 100644 --- a/tests/unit/data/eval_datasets/test_math.py +++ b/tests/unit/data/eval_datasets/test_math.py @@ -35,5 +35,7 @@ def test_math_dataset(): add_special_tokens=False, ) - assert default_templated == f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n{example["problem"]}<|im_end|>\n" - + assert ( + default_templated + == f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n{example['problem']}<|im_end|>\n" + ) diff --git a/tests/unit/data/eval_datasets/test_mmlu.py b/tests/unit/data/eval_datasets/test_mmlu.py index df5dabaef9..02c1936003 100644 --- a/tests/unit/data/eval_datasets/test_mmlu.py +++ b/tests/unit/data/eval_datasets/test_mmlu.py @@ -37,5 +37,7 @@ def test_mmlu_dataset(): add_special_tokens=False, ) - assert default_templated == f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\n<|im_start|>user\n{example["question"]}<|im_end|>\n" - + assert ( + default_templated + == f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n{example['question']}<|im_end|>\n" + ) diff --git a/tests/unit/environments/test_math_environment.py b/tests/unit/environments/test_math_environment.py index ed599bcd5e..b254f2ef5f 100644 --- a/tests/unit/environments/test_math_environment.py +++ b/tests/unit/environments/test_math_environment.py @@ -93,15 +93,24 @@ def basic_multichoice_test_data(): return { "message_log_batch": [ [ - {"role": "user", "content": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD"}, + { + "role": "user", + "content": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD", + }, {"role": "assistant", "content": "\nAnswer: C"}, ], [ - {"role": "user", "content": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD"}, + { + "role": "user", + "content": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD", + }, {"role": "assistant", "content": "\nAnswer: B"}, ], [ - {"role": "user", "content": "Answer the following multiple choice question. The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD"}, + { + "role": "user", + "content": "Answer the following multiple choice question. 
The last line of your response should be of the following format: 'Answer: $LETTER' (without quotes) where LETTER is one of ABCD", + }, {"role": "assistant", "content": "\nAnswer: D"}, ], ], @@ -197,7 +206,8 @@ def test_multichoice_env_step_basic(multichoice_env, basic_multichoice_test_data """Test basic functionality of MathEnvironment step with multichoice verifier.""" result = ray.get( multichoice_env.step.remote( - basic_multichoice_test_data["message_log_batch"], basic_multichoice_test_data["metadata"] + basic_multichoice_test_data["message_log_batch"], + basic_multichoice_test_data["metadata"], ) ) @@ -211,7 +221,9 @@ def test_multichoice_env_step_basic(multichoice_env, basic_multichoice_test_data assert all( obs["content"] == "Environment: correct" for obs in result.observations[:2] ), "The first two responses should be correct" - assert result.observations[2]["content"] == "Environment: incorrect", "The third response should be incorrect" + assert result.observations[2]["content"] == "Environment: incorrect", ( + "The third response should be incorrect" + ) # Check metadata assert len(result.metadata) == 3, "Should return metadata for all 3 messages" @@ -221,7 +233,9 @@ def test_multichoice_env_step_basic(multichoice_env, basic_multichoice_test_data # Check rewards and done flags assert result.rewards.shape == (3,), "Rewards should be a tensor of shape (3,)" - assert all(result.rewards[:2] == 1.0), "The first two rewards should be 1.0 for correct answers" + assert all(result.rewards[:2] == 1.0), ( + "The first two rewards should be 1.0 for correct answers" + ) assert result.rewards[2] == 0.0, "The thrid reward should be 0.0 for wrong answer" assert result.terminateds.shape == (3,), ( "Terminated flags should be a tensor of shape (3,)" From 0d77a158df5f4151dcc9246dac5321a83249a542 Mon Sep 17 00:00:00 2001 From: Xuehan Date: Wed, 2 Jul 2025 16:49:46 +0000 Subject: [PATCH 33/44] add missing copyright statements. 
Signed-off-by: Xuehan Xiong --- nemo_rl/data/eval_datasets/aime2024.py | 14 ++++++++++++++ nemo_rl/data/eval_datasets/gpqa.py | 14 ++++++++++++++ nemo_rl/data/eval_datasets/local_math_dataset.py | 14 ++++++++++++++ nemo_rl/data/eval_datasets/math.py | 14 ++++++++++++++ nemo_rl/data/eval_datasets/mmlu.py | 14 ++++++++++++++ nemo_rl/data/eval_datasets/mmlu_pro.py | 14 ++++++++++++++ nemo_rl/data/processors.py | 14 ++++++++++++++ 7 files changed, 98 insertions(+) diff --git a/nemo_rl/data/eval_datasets/aime2024.py b/nemo_rl/data/eval_datasets/aime2024.py index b73bd34dbf..9e585bb511 100644 --- a/nemo_rl/data/eval_datasets/aime2024.py +++ b/nemo_rl/data/eval_datasets/aime2024.py @@ -1,3 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """AIME 2024 dataset.""" from typing import Any, Optional diff --git a/nemo_rl/data/eval_datasets/gpqa.py b/nemo_rl/data/eval_datasets/gpqa.py index 9cadceb49e..f41efa136a 100644 --- a/nemo_rl/data/eval_datasets/gpqa.py +++ b/nemo_rl/data/eval_datasets/gpqa.py @@ -1,3 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """GPQA dataset and its variants.""" import random diff --git a/nemo_rl/data/eval_datasets/local_math_dataset.py b/nemo_rl/data/eval_datasets/local_math_dataset.py index d78b99565f..2810899b4a 100644 --- a/nemo_rl/data/eval_datasets/local_math_dataset.py +++ b/nemo_rl/data/eval_datasets/local_math_dataset.py @@ -1,3 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Local math dataset.""" from typing import Any, Literal, Optional diff --git a/nemo_rl/data/eval_datasets/math.py b/nemo_rl/data/eval_datasets/math.py index a1c489a148..290902657e 100644 --- a/nemo_rl/data/eval_datasets/math.py +++ b/nemo_rl/data/eval_datasets/math.py @@ -1,3 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Math dataset and its variants.""" from typing import Any, Literal, Optional diff --git a/nemo_rl/data/eval_datasets/mmlu.py b/nemo_rl/data/eval_datasets/mmlu.py index 86acbcc9a6..f8b75d3b56 100644 --- a/nemo_rl/data/eval_datasets/mmlu.py +++ b/nemo_rl/data/eval_datasets/mmlu.py @@ -1,3 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """MMLU dataset and its variants.""" from typing import Any, Optional diff --git a/nemo_rl/data/eval_datasets/mmlu_pro.py b/nemo_rl/data/eval_datasets/mmlu_pro.py index 4dd094e322..159d4d1738 100644 --- a/nemo_rl/data/eval_datasets/mmlu_pro.py +++ b/nemo_rl/data/eval_datasets/mmlu_pro.py @@ -1,3 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """MMLU-Pro dataset.""" from typing import Any, Optional diff --git a/nemo_rl/data/processors.py b/nemo_rl/data/processors.py index 4d207abad8..67e3658882 100644 --- a/nemo_rl/data/processors.py +++ b/nemo_rl/data/processors.py @@ -1,3 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Contains data processors for evaluation.""" from typing import Any, cast From 17fe4055d83ed6e3e0226aab0053d9d8abcacbfd Mon Sep 17 00:00:00 2001 From: Xuehan Date: Wed, 2 Jul 2025 16:49:46 +0000 Subject: [PATCH 34/44] add missing copyright statements. Signed-off-by: Xuehan Xiong --- nemo_rl/evals/answer_parsing.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/nemo_rl/evals/answer_parsing.py b/nemo_rl/evals/answer_parsing.py index 8b62026360..dcf020774a 100644 --- a/nemo_rl/evals/answer_parsing.py +++ b/nemo_rl/evals/answer_parsing.py @@ -1,3 +1,17 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Contains utility functions for answer parsing.""" MULTILINGUAL_ANSWER_PATTERN_TEMPLATE = ( From cf828d627f941b250b32a748c02ba10c5399838b Mon Sep 17 00:00:00 2001 From: Shun Kiyono Date: Mon, 30 Jun 2025 13:25:18 +0900 Subject: [PATCH 35/44] docs: Add missing arguments to DeepScaler evaluation (#502) Signed-off-by: Shun Kiyono --- docs/guides/grpo-deepscaler.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/guides/grpo-deepscaler.md b/docs/guides/grpo-deepscaler.md index 456b2f2d8b..4404b42949 100644 --- a/docs/guides/grpo-deepscaler.md +++ b/docs/guides/grpo-deepscaler.md @@ -33,7 +33,9 @@ Throughout training, the checkpoints of the model will be saved to the `results` ```sh uv run examples/run_eval.py \ - generation.model_name=results/grpo-deepscaler-1.5b-8K/step_240/hf + generation.model_name=results/grpo-deepscaler-1.5b-8K/step_240/hf \ + data.prompt_file=examples/prompts/cot.txt \ + generation.vllm_cfg.max_model_len=32768 ``` Use `generation.model_name` to specify the path to the Hugging Face checkpoint. In addition, we use AIME24 as the validation dataset and calculate pass@1 on it throughout training. 
From 01c384024b3897b1e90b5f6aa82c2c9c733c9437 Mon Sep 17 00:00:00 2001 From: Wei Du Date: Mon, 30 Jun 2025 16:40:16 -0500 Subject: [PATCH 36/44] fix: prevent divisible error by dropping last batch in loader (#583) Signed-off-by: Wei Du --- nemo_rl/algorithms/grpo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nemo_rl/algorithms/grpo.py b/nemo_rl/algorithms/grpo.py index ea99de1538..3bcad52849 100644 --- a/nemo_rl/algorithms/grpo.py +++ b/nemo_rl/algorithms/grpo.py @@ -191,6 +191,7 @@ def setup( batch_size=grpo_config["num_prompts_per_step"], shuffle=False, collate_fn=rl_collate_fn, + drop_last=True, ) if last_checkpoint_path is not None: dataloader_state_dict = torch.load( From 658437d80c34630a4b5f285ca94b0e8de3ad5fa7 Mon Sep 17 00:00:00 2001 From: yuki <48991475+yuki-666@users.noreply.github.com> Date: Tue, 1 Jul 2025 05:48:53 +0800 Subject: [PATCH 37/44] feat: improve worker group args/kwargs (#539) Signed-off-by: Yuki Huang --- nemo_rl/distributed/worker_groups.py | 104 ++++++++++++++++--- nemo_rl/models/generation/vllm.py | 42 ++++---- nemo_rl/models/policy/lm_policy.py | 8 +- tests/unit/distributed/test_worker_groups.py | 92 ++++++++++++++-- 4 files changed, 195 insertions(+), 51 deletions(-) diff --git a/nemo_rl/distributed/worker_groups.py b/nemo_rl/distributed/worker_groups.py index a283e6b18c..c2e849cbee 100644 --- a/nemo_rl/distributed/worker_groups.py +++ b/nemo_rl/distributed/worker_groups.py @@ -15,13 +15,12 @@ import os from copy import deepcopy from dataclasses import dataclass -from typing import Any, Iterable, Optional, Union +from typing import Any, Optional, Union import ray from ray.util.placement_group import PlacementGroup from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy -from nemo_rl.distributed.batched_data_dict import SlicedDataDict from nemo_rl.distributed.named_sharding import NamedSharding from nemo_rl.distributed.ray_actor_environment_registry import ( get_actor_python_env, @@ -583,6 +582,12 @@ def 
run_single_worker_single_data( Returns: ray.ObjectRef: A Ray future for the result. """ + assert len(args) == 0, ( + "run_single_worker_single_data will fail with args under certain circumstances. " + "Please use kwargs instead. " + "See https://github.com/NVIDIA-NeMo/RL/issues/582 for more details." + ) + worker = self.workers[worker_idx] method = getattr(worker, method_name) return method.remote(*args, **kwargs) @@ -590,25 +595,62 @@ def run_single_worker_single_data( def run_all_workers_multiple_data( self, method_name: str, - data: list[Any], + *args, run_rank_0_only_axes: list[str] | None = None, common_kwargs: Optional[dict[str, Any]] = None, + **kwargs, ) -> list[ray.ObjectRef]: """Run a method on all workers in parallel with different data. Args: method_name: Name of the method to call on each worker - data: List of data to pass to workers/groups + *args: List of arguments to pass to workers/groups + e.g. [[arg1_for_worker_1, arg1_for_worker_2], [arg2_for_worker_1, arg2_for_worker_2]] run_rank_0_only_axes: List of named axes for which only rank 0 should run the method. - common_kwargs: Additional keyword arguments to pass to all workers + common_kwargs: Keyword arguments to pass to all workers + **kwargs: Keyword arguments to pass to workers/groups + e.g. {"key1": [value_for_worker_1, value_for_worker_2], "key2": [value_for_worker_1, value_for_worker_2]} Returns: list[ray.ObjectRef]: A list of ray futures """ + assert len(args) == 0, ( + "run_all_workers_multiple_data will fail with args under certain circumstances. " + "Please use kwargs instead. " + "See https://github.com/NVIDIA-NeMo/RL/issues/582 for more details." + ) + + # Check at least one arg or kwarg is provided + assert len(args) > 0 or len(kwargs) > 0, ( + "At least one args (positional arguments) or kwargs (keyword arguments) must be provided in run_all_workers_multiple_data. " + "Otherwise, please use run_all_workers_single_data." 
+ ) + + # Check all args and kwargs have the same length + args_count = [len(arg) for arg in args] + assert all(count == args_count[0] for count in args_count), ( + "All args must have the same length" + ) + args_count = args_count[0] if len(args_count) > 0 else 0 + + kwargs_count = [len(value) for value in kwargs.values()] + assert all(count == kwargs_count[0] for count in kwargs_count), ( + "All kwargs must have the same length" + ) + kwargs_count = kwargs_count[0] if len(kwargs_count) > 0 else 0 + + if args_count > 0 and kwargs_count > 0: + assert args_count == kwargs_count, ( + "The number of args and kwargs must be the same in run_all_workers_multiple_data. " + f"args length = {args_count}, kwargs length = {kwargs_count}" + ) + data_count = max(args_count, kwargs_count) + + # Check the data length is equal to the number of workers if run_rank_0_only_axes is None: - assert len(data) == len(self.workers), ( + assert data_count == len(self.workers), ( "data length should be equal to the number of workers: " - f"data length = {len(data)}, number of workers = {len(self.workers)}" + f"data length = {data_count}, number of workers = {len(self.workers)}" ) futures = [] @@ -633,12 +675,16 @@ def run_all_workers_multiple_data( if should_run: method = getattr(worker, method_name) - futures.append(method.remote(data=data[data_idx], **common_kwargs)) + worker_args = [arg[data_idx] for arg in args] + worker_kwargs = {key: value[data_idx] for key, value in kwargs.items()} + futures.append( + method.remote(*worker_args, **worker_kwargs, **common_kwargs) + ) data_idx += 1 - assert data_idx == len(data), ( + assert data_idx == data_count, ( "data length should be equal to the number of workers started: " - f"data length = {len(data)}, number of workers started = {data_idx}" + f"data length = {data_count}, number of workers started = {data_idx}" ) return futures @@ -660,6 +706,12 @@ def run_all_workers_single_data( Returns: list[ray.ObjectRef]: A list of ray futures """ + assert 
len(args) == 0, ( + "run_all_workers_single_data will fail with args under certain circumstances. " + "Please use kwargs instead. " + "See https://github.com/NVIDIA-NeMo/RL/issues/582 for more details." + ) + futures = [] if run_rank_0_only_axes is None: @@ -686,12 +738,13 @@ def run_all_workers_single_data( def run_all_workers_sharded_data( self, method_name: str, - data: Iterable[SlicedDataDict], # arbitrary nested iterables of SlicedDataDicts + *args, in_sharded_axes: list[str] | None = None, replicate_on_axes: list[str] | None = None, output_is_replicated: list[str] | None = None, make_dummy_calls_to_free_axes: bool = False, common_kwargs: Optional[dict[str, Any]] = None, + **kwargs, ) -> MultiWorkerFuture: """Run a method on all workers in parallel with sharded data. @@ -701,17 +754,27 @@ def run_all_workers_sharded_data( Args: method_name: Name of the method to call on each worker - data: Iterable of SlicedDataDicts to pass to workers/groups + *args: List of arguments to pass to workers/groups + e.g. [[arg1_for_worker_1, arg1_for_worker_2], [arg2_for_worker_1, arg2_for_worker_2]] in_sharded_axes: List of axes that are sharded replicate_on_axes: List of axes that are to be replicated output_is_replicated: List of axes along which the output is replicated (and we should just return the first result). We also just return from rank 0 of free axes. make_dummy_calls_to_free_axes: Whether to make dummy calls (with None) to workers that aren't rank 0 on 'free axes' (axes not in in_sharded_axes or replicate_on_axes). - common_kwargs: Additional keyword arguments to pass to all workers + common_kwargs: Keyword arguments to pass to all workers + **kwargs: Keyword arguments to pass to workers/groups + e.g. 
{"key1": [value_for_worker_1, value_for_worker_2], "key2": [value_for_worker_1, value_for_worker_2]} + Returns: MultiWorkerFuture: Object containing futures and their associated worker information """ + assert len(args) == 0, ( + "run_all_workers_sharded_data will fail with args under certain circumstances. " + "Please use kwargs instead. " + "See https://github.com/NVIDIA-NeMo/RL/issues/582 for more details." + ) + if self.sharding_annotations is None: raise ValueError( "Sharding annotations must be provided to use sharded data distribution" @@ -771,15 +834,20 @@ def run_all_workers_sharded_data( if should_receive_data: # Find the appropriate data slice for this worker - worker_data = data + worker_args = args + worker_kwargs = kwargs for axis in in_sharded_axes: if axis in worker_coords: # Select the appropriate slice for this axis - worker_data = worker_data[worker_coords[axis]] + worker_args = [arg[worker_coords[axis]] for arg in worker_args] + worker_kwargs = { + key: value[worker_coords[axis]] + for key, value in worker_kwargs.items() + } # Call the method on the worker with its data slice future = getattr(worker, method_name).remote( - data=worker_data, **common_kwargs + *worker_args, **worker_kwargs, **common_kwargs ) futures.append(future) called_workers.append(worker_idx) @@ -787,8 +855,10 @@ def run_all_workers_sharded_data( # If this worker doesn't need data: if make_dummy_calls_to_free_axes: # If make_dummy_calls_to_free_axes is True, just call the method with None + worker_args = [None] * len(args) + worker_kwargs = {key: None for key in kwargs.keys()} future = getattr(worker, method_name).remote( - data=None, **common_kwargs + *worker_args, **worker_kwargs, **common_kwargs ) futures.append(future) called_workers.append(worker_idx) diff --git a/nemo_rl/models/generation/vllm.py b/nemo_rl/models/generation/vllm.py index f0cd5eb50b..7dbfbd3ea8 100644 --- a/nemo_rl/models/generation/vllm.py +++ b/nemo_rl/models/generation/vllm.py @@ -348,11 +348,13 @@ 
def _patch_vllm_init_workers_ray(): else: self.llm = vllm.LLM(**llm_kwargs) - def init_collective(self, data: int, ip: str, port: int, world_size: int) -> None: + def init_collective( + self, rank_prefix: int, ip: str, port: int, world_size: int + ) -> None: self.llm.collective_rpc( "init_collective", args=( - data, + rank_prefix, ip, port, world_size, @@ -360,12 +362,12 @@ def init_collective(self, data: int, ip: str, port: int, world_size: int) -> Non ) async def init_collective_async( - self, data: int, ip: str, port: int, world_size: int + self, rank_prefix: int, ip: str, port: int, world_size: int ) -> None: await self.llm.collective_rpc( "init_collective", args=( - data, + rank_prefix, ip, port, world_size, @@ -903,11 +905,11 @@ async def report_device_id_async(self) -> list[str]: return cast(list[str], list_of_worker_results) - def update_weights_from_ipc_handles(self, data: dict[str, Any]) -> bool: + def update_weights_from_ipc_handles(self, ipc_handles: dict[str, Any]) -> bool: """Update weights from IPC handles by delegating to the vLLM Worker implementation. Args: - data (dict): Dictionary mapping device UUIDs (str) to parameter IPC handles. + ipc_handles (dict): Dictionary mapping device UUIDs (str) to parameter IPC handles. Returns: bool: True if weights were successfully updated, False otherwise. @@ -923,7 +925,7 @@ def update_weights_from_ipc_handles(self, data: dict[str, Any]) -> bool: ) result_or_coro = self.llm.collective_rpc( - "update_weights_from_ipc_handles", args=(data,) + "update_weights_from_ipc_handles", args=(ipc_handles,) ) worker_result = result_or_coro[0] @@ -940,11 +942,13 @@ def update_weights_from_ipc_handles(self, data: dict[str, Any]) -> bool: traceback.print_exc() return False - async def update_weights_from_ipc_handles_async(self, data: dict[str, Any]) -> bool: + async def update_weights_from_ipc_handles_async( + self, ipc_handles: dict[str, Any] + ) -> bool: """Async version of update_weights_from_ipc_handles. 
Args: - data (dict): Dictionary mapping device UUIDs (str) to parameter IPC handles. + ipc_handles (dict): Dictionary mapping device UUIDs (str) to parameter IPC handles. Returns: bool: True if weights were successfully updated, False otherwise. @@ -960,7 +964,7 @@ async def update_weights_from_ipc_handles_async(self, data: dict[str, Any]) -> b ) result_or_coro = await self.llm.collective_rpc( - "update_weights_from_ipc_handles", args=(data,) + "update_weights_from_ipc_handles", args=(ipc_handles,) ) if asyncio.iscoroutine(result_or_coro): @@ -983,7 +987,7 @@ async def update_weights_from_ipc_handles_async(self, data: dict[str, Any]) -> b traceback.print_exc() return False - def update_weights_from_collective(self, data: dict[str, Any]) -> bool: + def update_weights_from_collective(self, info: dict[str, Any]) -> bool: """Update the model weights from collective communication.""" try: assert self.llm is not None, ( @@ -996,7 +1000,7 @@ def update_weights_from_collective(self, data: dict[str, Any]) -> bool: ) result_or_coro = self.llm.collective_rpc( - "update_weights_from_collective", args=(data,) + "update_weights_from_collective", args=(info,) ) worker_result = result_or_coro[0] @@ -1013,7 +1017,7 @@ def update_weights_from_collective(self, data: dict[str, Any]) -> bool: traceback.print_exc() return False - async def update_weights_from_collective_async(self, data: dict[str, Any]) -> bool: + async def update_weights_from_collective_async(self, info: dict[str, Any]) -> bool: """Async version of update_weights_from_collective.""" try: assert self.llm is not None, ( @@ -1026,7 +1030,7 @@ async def update_weights_from_collective_async(self, data: dict[str, Any]) -> bo ) result_or_coro = await self.llm.collective_rpc( - "update_weights_from_collective", args=(data,) + "update_weights_from_collective", args=(info,) ) if asyncio.iscoroutine(result_or_coro): @@ -1403,7 +1407,7 @@ def init_collective( # Send world_size and rank for init collective to all workers futures = 
self.worker_group.run_all_workers_multiple_data( method_name, - data=rank_prefix_list, + rank_prefix=rank_prefix_list, run_rank_0_only_axes=["tensor_parallel", "pipeline_parallel"], common_kwargs={"ip": ip, "port": port, "world_size": world_size}, ) @@ -1429,7 +1433,7 @@ def generate( ) future_bundle = self.worker_group.run_all_workers_sharded_data( "generate", - sharded_data, + data=sharded_data, in_sharded_axes=["data_parallel"], replicate_on_axes=None, # just run on tp rank 0 output_is_replicated=None, @@ -1474,7 +1478,7 @@ def generate_text( ) future_bundle = self.worker_group.run_all_workers_sharded_data( "generate_text", - sharded_data, + data=sharded_data, in_sharded_axes=["data_parallel"], replicate_on_axes=None, # just run on tp rank 0 output_is_replicated=None, @@ -1708,7 +1712,7 @@ def update_weights(self, ipc_handles: dict[str, Any]) -> bool: # Directly pass ipc_handles to the method futures = self.worker_group.run_all_workers_multiple_data( method_name, - ipc_handles_list, + ipc_handles=ipc_handles_list, run_rank_0_only_axes=["tensor_parallel", "pipeline_parallel"], ) # Wait for all futures to complete @@ -1735,7 +1739,7 @@ def update_weights_from_collective( # Use run_all_workers_single_data to send data to all workers futures = self.worker_group.run_all_workers_single_data( method_name, - data=info, + info=info, run_rank_0_only_axes=["tensor_parallel", "pipeline_parallel"], ) diff --git a/nemo_rl/models/policy/lm_policy.py b/nemo_rl/models/policy/lm_policy.py index 4d967a4cba..e469b32d16 100644 --- a/nemo_rl/models/policy/lm_policy.py +++ b/nemo_rl/models/policy/lm_policy.py @@ -207,7 +207,7 @@ def get_logprobs( futures = self.worker_group.run_all_workers_sharded_data( "get_logprobs", - sharded_data_2d, + data=sharded_data_2d, in_sharded_axes=["data_parallel", "context_parallel"], replicate_on_axes=["tensor_parallel", "pipeline_parallel"], output_is_replicated=["tensor_parallel", "pipeline_parallel"], @@ -263,7 +263,7 @@ def 
get_reference_policy_logprobs( futures = self.worker_group.run_all_workers_sharded_data( "get_reference_policy_logprobs", - sharded_data_2d, + data=sharded_data_2d, in_sharded_axes=["data_parallel", "context_parallel"], replicate_on_axes=["tensor_parallel", "pipeline_parallel"], output_is_replicated=["tensor_parallel", "pipeline_parallel"], @@ -313,7 +313,7 @@ def train( # Train each shard in parallel futures = self.worker_group.run_all_workers_sharded_data( "train", - sharded_data, + data=sharded_data, in_sharded_axes=["data_parallel"], replicate_on_axes=[ "context_parallel", @@ -365,7 +365,7 @@ def generate( sharded_data = data.shard_by_batch_size(dp_size, batch_size=None) futures = self.worker_group.run_all_workers_sharded_data( "generate", - sharded_data, + data=sharded_data, in_sharded_axes=["data_parallel"], replicate_on_axes=["tensor_parallel", "pipeline_parallel"], output_is_replicated=["tensor_parallel", "pipeline_parallel"], diff --git a/tests/unit/distributed/test_worker_groups.py b/tests/unit/distributed/test_worker_groups.py index 53b6133c69..12131fe4a4 100644 --- a/tests/unit/distributed/test_worker_groups.py +++ b/tests/unit/distributed/test_worker_groups.py @@ -328,6 +328,48 @@ def test_configure_worker_interaction(register_test_actor, virtual_cluster): worker_group.shutdown(force=True) +def test_run_single_worker_single_data(worker_group_1d_sharding): + worker_group = worker_group_1d_sharding + assert len(worker_group.workers) == 2 + ray.get([w.reset_call_records.remote() for w in worker_group.workers]) + + data_for_worker0 = SlicedDataDict({"id": 0, "val": "w0_val"}) + data_for_worker1 = SlicedDataDict({"id": 1, "val": "w1_val"}) + + # pass through args + # due to https://github.com/NVIDIA-NeMo/RL/issues/582, args are not supported. 
+ with pytest.raises(AssertionError): + future_0 = worker_group.run_single_worker_single_data( + "record_call", 0, data_for_worker0 + ) + future_1 = worker_group.run_single_worker_single_data( + "record_call", 1, data_for_worker1 + ) + ray.get([future_0, future_1]) + + # pass through kwargs + future_0 = worker_group.run_single_worker_single_data( + "record_call", 0, data=data_for_worker0 + ) + future_1 = worker_group.run_single_worker_single_data( + "record_call", 1, data=data_for_worker1 + ) + results = ray.get([future_0, future_1]) + assert len(results) == 2 + + # Check worker 0 + d, args, _, count = ray.get(worker_group.workers[0].get_recorded_data.remote()) + assert count == 1 + assert d == data_for_worker0 + assert args == () + + # Check worker 1 + d, args, _, count = ray.get(worker_group.workers[1].get_recorded_data.remote()) + assert count == 1 + assert d == data_for_worker1 + assert args == () + + def test_run_all_workers_single_data_1d_sharding(worker_group_1d_sharding): worker_group = worker_group_1d_sharding assert len(worker_group.workers) == 2 @@ -339,17 +381,26 @@ def test_run_all_workers_single_data_1d_sharding(worker_group_1d_sharding): test_arg1 = "arg_single" test_kwarg1 = "kwarg_single_val" + # pass through args + # due to https://github.com/NVIDIA-NeMo/RL/issues/582, args are not supported. 
+ with pytest.raises(AssertionError): + futures = worker_group.run_all_workers_single_data( + "record_call", test_data, test_arg1 + ) + ray.get(futures) + + # pass through kwargs futures = worker_group.run_all_workers_single_data( - "record_call", test_data, test_arg1, kwarg1=test_kwarg1 + "record_call", data=test_data, kwarg1=test_kwarg1 ) results = ray.get(futures) assert len(results) == 2 # Should run on all 2 workers - for i, worker in enumerate(worker_group.workers): + for worker in worker_group.workers: data, args, kwargs, count = ray.get(worker.get_recorded_data.remote()) assert count == 1 assert data == test_data - assert args == (test_arg1,) + assert args == () assert kwargs == {"kwarg1": test_kwarg1} @@ -359,7 +410,7 @@ def test_run_all_workers_single_data_2d_sharding_no_filter(worker_group_2d_shard ray.get([w.reset_call_records.remote() for w in worker_group.workers]) test_data = SlicedDataDict({"key": "value_2d_no_filter"}) - futures = worker_group.run_all_workers_single_data("record_call", test_data) + futures = worker_group.run_all_workers_single_data("record_call", data=test_data) results = ray.get(futures) assert len(results) == 4 # Runs on all 4 workers @@ -377,7 +428,7 @@ def test_run_all_workers_single_data_2d_sharding_filter_tp(worker_group_2d_shard test_data = SlicedDataDict({"key": "value_2d_filter_tp"}) # Only run on tp rank 0 for each dp rank futures = worker_group.run_all_workers_single_data( - "record_call", test_data, run_rank_0_only_axes=["tp"] + "record_call", data=test_data, run_rank_0_only_axes=["tp"] ) results = ray.get(futures) assert len(results) == 2 # Runs on 2 workers (dp0-tp0, dp1-tp0) @@ -403,7 +454,7 @@ def test_run_all_workers_single_data_2d_sharding_filter_dp_tp(worker_group_2d_sh test_data = SlicedDataDict({"key": "value_2d_filter_dp_tp"}) # Only run on dp rank 0 AND tp rank 0 futures = worker_group.run_all_workers_single_data( - "record_call", test_data, run_rank_0_only_axes=["dp", "tp"] + "record_call", data=test_data, 
run_rank_0_only_axes=["dp", "tp"] ) results = ray.get(futures) assert len(results) == 1 # Runs on 1 worker (dp0-tp0) @@ -430,8 +481,17 @@ def test_run_all_workers_multiple_data_1d_sharding(worker_group_1d_sharding): multi_data = [data_for_worker0, data_for_worker1] common_arg = "common_arg_multi" + # pass through args + # due to https://github.com/NVIDIA-NeMo/RL/issues/582, args are not supported. + with pytest.raises(AssertionError): + futures = worker_group.run_all_workers_multiple_data( + "record_call", multi_data, common_kwargs={"common": common_arg} + ) + ray.get(futures) + + # pass through kwargs futures = worker_group.run_all_workers_multiple_data( - "record_call", multi_data, common_kwargs={"common": common_arg} + "record_call", data=multi_data, common_kwargs={"common": common_arg} ) results = ray.get(futures) assert len(results) == 2 @@ -462,10 +522,11 @@ def test_run_all_workers_multiple_data_fewer_data_than_workers( data_for_worker1 = SlicedDataDict({"id": 1}) multi_data = [data_for_worker0, data_for_worker1] # Only 2 data items - with pytest.raises( - AssertionError, match="data length should be equal to the number of workers: " - ): - futures = worker_group.run_all_workers_multiple_data("record_call", multi_data) + with pytest.raises(AssertionError): + futures = worker_group.run_all_workers_multiple_data( + "record_call", data=multi_data + ) + ray.get(futures) def test_run_all_workers_sharded_data_1d(worker_group_1d_sharding): @@ -479,6 +540,15 @@ def test_run_all_workers_sharded_data_1d(worker_group_1d_sharding): SlicedDataDict({"shard": 1, "val": "val1"}), ] + # pass through args + # due to https://github.com/NVIDIA-NeMo/RL/issues/582, args are not supported. 
+ with pytest.raises(AssertionError): + future_bundle = worker_group.run_all_workers_sharded_data( + "record_call", sharded_data_input, in_sharded_axes=["data"] + ) + worker_group.get_all_worker_results(future_bundle) + + # pass through kwargs future_bundle = worker_group.run_all_workers_sharded_data( "record_call", data=sharded_data_input, in_sharded_axes=["data"] ) From 2eb0301dc95be011f0e37d65bc2ec96f96dd7e32 Mon Sep 17 00:00:00 2001 From: Anna Shors Date: Mon, 30 Jun 2025 16:18:14 -0700 Subject: [PATCH 38/44] fix: update gemma3 prefix (#585) Signed-off-by: ashors1 --- nemo_rl/models/dtensor/parallelize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_rl/models/dtensor/parallelize.py b/nemo_rl/models/dtensor/parallelize.py index 664bc1a253..fb9c720c20 100644 --- a/nemo_rl/models/dtensor/parallelize.py +++ b/nemo_rl/models/dtensor/parallelize.py @@ -92,7 +92,7 @@ def _parallelize_gemma3( Tensor parallelism is not supported for Gemma3 models because of tied word embeddings. 
""" if isinstance(model, Gemma3ForConditionalGeneration): - model_prefix = "language_model.model" + model_prefix = "language_model" else: model_prefix = "model" @@ -399,7 +399,7 @@ def _parallelize_model( """ model_cls = type(model) if model_cls == Gemma3ForConditionalGeneration: - layers: torch.nn.ModuleList = model.language_model.model.layers # type: ignore + layers: torch.nn.ModuleList = model.language_model.layers # type: ignore num_attention_heads = model.config.text_config.num_attention_heads num_key_value_heads = model.config.text_config.num_key_value_heads else: From bc234a3c605930b4e334006ccf88af360cedc976 Mon Sep 17 00:00:00 2001 From: Sahil Jain <48468750+SahilJain314@users.noreply.github.com> Date: Mon, 30 Jun 2025 20:45:02 -0700 Subject: [PATCH 39/44] fix: Added copyright to functest (#584) Signed-off-by: Sahil Jain --- tests/functional/test_converter_roundtrip.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tests/functional/test_converter_roundtrip.py b/tests/functional/test_converter_roundtrip.py index e551d0e6b5..90756a2f18 100644 --- a/tests/functional/test_converter_roundtrip.py +++ b/tests/functional/test_converter_roundtrip.py @@ -1,4 +1,19 @@ #!/usr/bin/env python3 + +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """ Functional test for converter roundtrip functionality. 
From 2d876dec7ca5342a67deaa20f67af873a7a6f3d1 Mon Sep 17 00:00:00 2001 From: Charlie Truong Date: Tue, 1 Jul 2025 19:25:17 -0500 Subject: [PATCH 40/44] chore: Update github url after org transfer (#512) Signed-off-by: Charlie Truong Signed-off-by: Sahil Jain <48468750+SahilJain314@users.noreply.github.com> Co-authored-by: Sahil Jain <48468750+SahilJain314@users.noreply.github.com> --- .github/PULL_REQUEST_TEMPLATE.md | 8 ++--- CONTRIBUTING.md | 2 +- README.md | 10 +++--- docs/adding-new-models.md | 32 +++++++++---------- docs/model-quirks.md | 2 +- examples/configs/grpo-deepscaler-1.5b-8K.yaml | 2 +- examples/configs/grpo_math_1B.yaml | 2 +- examples/configs/grpo_sliding_puzzle.yaml | 4 +-- ...-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml | 2 +- examples/converters/convert_dcp_to_hf.py | 2 +- nemo_rl/algorithms/dpo.py | 2 +- .../ray_actor_environment_registry.py | 2 +- nemo_rl/models/generation/vllm.py | 2 +- .../models/policy/dtensor_policy_worker.py | 4 +-- nemo_rl/models/policy/fsdp1_policy_worker.py | 6 ++-- nemo_rl/package_info.py | 4 +-- nemo_rl/utils/native_checkpoint.py | 2 +- pyproject.toml | 2 +- tests/functional/dpo.sh | 2 +- tests/functional/test_converter_roundtrip.py | 13 ++++++++ ...ma3.1-8b-instruct-1n8g-fsdp2tp1-long.v2.sh | 2 +- ...llama3.1-8b-instruct-1n8g-fsdp2tp2sp.v2.sh | 2 +- ...-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v2.sh | 2 +- .../models/generation/test_vllm_generation.py | 2 +- tests/unit/utils/test_native_checkpoint.py | 2 +- uv.lock | 4 +-- 26 files changed, 66 insertions(+), 53 deletions(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 3e3e4fb3fe..b83ec70073 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -10,15 +10,15 @@ List issues that this PR closes ([syntax](https://docs.github.com/en/issues/trac * **You can potentially add a usage example below** ```python -# Add a code snippet demonstrating how to use this +# Add a code snippet demonstrating how to use 
this ``` # Before your PR is "Ready for review" **Pre checks**: -- [ ] Make sure you read and followed [Contributor guidelines](/NVIDIA/NeMo-RL/blob/main/CONTRIBUTING.md) +- [ ] Make sure you read and followed [Contributor guidelines](/NVIDIA-NeMo/RL/blob/main/CONTRIBUTING.md) - [ ] Did you write any new necessary tests? -- [ ] Did you run the unit tests and functional tests locally? Visit our [Testing Guide](/NVIDIA/NeMo-RL/blob/main/docs/testing.md) for how to run tests -- [ ] Did you add or update any necessary documentation? Visit our [Document Development Guide](/NVIDIA/NeMo-RL/blob/main/docs/documentation.md) for how to write, build and test the docs. +- [ ] Did you run the unit tests and functional tests locally? Visit our [Testing Guide](/NVIDIA-NeMo/RL/blob/main/docs/testing.md) for how to run tests +- [ ] Did you add or update any necessary documentation? Visit our [Document Development Guide](/NVIDIA-NeMo/RL/blob/main/docs/documentation.md) for how to write, build and test the docs. # Additional Information * ... diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2cc6a3051b..3dc065655a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -31,7 +31,7 @@ We follow a direct clone and branch workflow for now: 1. Clone the repository directly: ```bash - git clone https://github.com/NVIDIA/NeMo-RL + git clone https://github.com/NVIDIA-NeMo/RL cd nemo-rl ``` diff --git a/README.md b/README.md index 6beca2f50b..4dc2f7395f 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ cd nemo-rl # by running (This is not necessary if you are using the pure Pytorch/DTensor path): git submodule update --init --recursive -# Different branches of the repo can have different pinned versions of these third-party submodules. Ensure +# Different branches of the repo can have different pinned versions of these third-party submodules. 
Ensure # submodules are automatically updated after switching branches or pulling updates by configuring git with: # git config submodule.recurse true @@ -226,7 +226,7 @@ sbatch \ We also support multi-turn generation and training (tool use, games, etc.). Reference example for training to play a Sliding Puzzle Game: ```sh -uv run python examples/run_grpo_sliding_puzzle.py +uv run python examples/run_grpo_sliding_puzzle.py ``` ## Supervised Fine-Tuning (SFT) @@ -409,7 +409,7 @@ If you use NeMo RL in your research, please cite it using the following BibTeX e ```bibtex @misc{nemo-rl, title = {NeMo RL: A Scalable and Efficient Post-Training Library}, -howpublished = {\url{https://github.com/NVIDIA/NeMo-RL}}, +howpublished = {\url{https://github.com/NVIDIA-NeMo/RL}}, year = {2025}, note = {GitHub repository}, } @@ -417,8 +417,8 @@ note = {GitHub repository}, ## Contributing -We welcome contributions to NeMo RL\! Please see our [Contributing Guidelines](https://github.com/NVIDIA/NeMo-RL/blob/main/CONTRIBUTING.md) for more information on how to get involved. +We welcome contributions to NeMo RL\! Please see our [Contributing Guidelines](https://github.com/NVIDIA-NeMo/RL/blob/main/CONTRIBUTING.md) for more information on how to get involved. ## Licenses -NVIDIA NeMo RL is licensed under the [Apache License 2.0](https://github.com/NVIDIA/NeMo-RL/blob/main/LICENSE). +NVIDIA NeMo RL is licensed under the [Apache License 2.0](https://github.com/NVIDIA-NeMo/RL/blob/main/LICENSE). 
diff --git a/docs/adding-new-models.md b/docs/adding-new-models.md index 155a012f47..e0de97ae40 100644 --- a/docs/adding-new-models.md +++ b/docs/adding-new-models.md @@ -12,7 +12,7 @@ $$\text{KL} = E_{x \sim \pi}[\pi(x) - \pi_{\text{ref}}(x)]$$ When summed/integrated, replacing the $x \sim \pi$ with $x \sim \pi_{\text{wrong}}$ leads to an error of: -$$\sum_{x} \left( \pi(x) - \pi_{\text{ref}}(x) \right) \left( \pi_{\text{wrong}}(x) - \pi(x) \right)$$ +$$\sum_{x} \left( \pi(x) - \pi_{\text{ref}}(x) \right) \left( \pi_{\text{wrong}}(x) - \pi(x) \right)$$ So, to verify correctness, we calculate: @@ -65,28 +65,28 @@ When investigating discrepancies beyond the acceptable threshold, focus on these When validating Hugging Face-based models, perform the following checks: -- **Compare log probabilities** +- **Compare log probabilities** Ensure the generation log probabilities from inference backends like **vLLM** match those computed by Hugging Face. This comparison helps diagnose potential mismatches. -- **Test parallelism** +- **Test parallelism** Verify consistency with other parallelism settings. -- **Variance** +- **Variance** Repeat tests multiple times (e.g., 10 runs) to confirm that behavior is deterministic or within acceptable variance. -- **Check sequence lengths** - Perform inference on sequence lengths of 100, 1,000, and 10,000 tokens. +- **Check sequence lengths** + Perform inference on sequence lengths of 100, 1,000, and 10,000 tokens. Ensure the model behaves consistently at each length. -- **Use real and dummy data** - - **Real data:** Tokenize and generate from actual text samples. +- **Use real and dummy data** + - **Real data:** Tokenize and generate from actual text samples. - **Dummy data:** Simple numeric sequences to test basic generation. -- **Vary sampling parameters** - Test both greedy and sampling generation modes. +- **Vary sampling parameters** + Test both greedy and sampling generation modes. 
Adjust temperature and top-p to confirm output consistency across backends. -- **Test different batch sizes** +- **Test different batch sizes** Try with batch sizes of 1, 8, and 32 to ensure consistent behavior across different batch configurations. --- @@ -95,11 +95,11 @@ When validating Hugging Face-based models, perform the following checks: ### Additional Validation -- **Compare Megatron outputs** +- **Compare Megatron outputs** Ensure the Megatron forward pass aligns with Hugging Face and the generation log probabilities from inference backends like **vLLM**. -- **Parallel settings** - Match the same parallelism configurations used for the HuggingFace-based tests. +- **Parallel settings** + Match the same parallelism configurations used for the HuggingFace-based tests. Confirm outputs remain consistent across repeated runs. --- @@ -128,7 +128,7 @@ By following these validation steps and ensuring your model's outputs remain con We also maintain a set of standalone scripts that can be used to diagnose issues related to correctness that we have encountered before. -## [1.max_model_len_respected.py](https://github.com/NVIDIA/NeMo-RL/blob/main/tools/model_diagnostics/1.max_model_len_respected.py) +## [1.max_model_len_respected.py](https://github.com/NVIDIA-NeMo/RL/blob/main/tools/model_diagnostics/1.max_model_len_respected.py) Test if a new model respects the `max_model_len` passed to vllm: @@ -142,7 +142,7 @@ uv run --extra vllm tools/model_diagnostics/1.max_model_len_respected.py Qwen/Qw # [Qwen/Qwen2.5-1.5B] ALL GOOD! ``` -## [2.long_generation_decode_vs_prefill](https://github.com/NVIDIA/NeMo-RL/blob/main/tools/model_diagnostics/2.long_generation_decode_vs_prefill.py) +## [2.long_generation_decode_vs_prefill](https://github.com/NVIDIA-NeMo/RL/blob/main/tools/model_diagnostics/2.long_generation_decode_vs_prefill.py) Test that vLLM yields near-identical token log-probabilities when comparing decoding with a single prefill pass across multiple prompts. 
diff --git a/docs/model-quirks.md b/docs/model-quirks.md index fa2b181c7e..ca08b2741b 100644 --- a/docs/model-quirks.md +++ b/docs/model-quirks.md @@ -6,7 +6,7 @@ This document outlines special cases and model-specific behaviors that require c ### Tied Weights -Weight tying between the embedding layer (`model.embed_tokens`) and output layer (`lm_head`) is currently not respected when using the FSDP1 policy or the DTensor policy when TP > 1 (See [this issue](https://github.com/NVIDIA/NeMo-RL/issues/227)). To avoid errors when training these models, we only allow training models with tied weights using the DTensor policy with TP=1. For Llama-3 and Qwen2.5 models, weight-tying is only enabled for the smaller models (< 2B), which can typically be trained without tensor parallelism. For Gemma-3, all model sizes have weight-tying enabled, including the larger models which require tensor parallelism. To support training of these models, we specially handle the Gemma-3 models by allowing training using the DTensor policy with TP > 1. +Weight tying between the embedding layer (`model.embed_tokens`) and output layer (`lm_head`) is currently not respected when using the FSDP1 policy or the DTensor policy when TP > 1 (See [this issue](https://github.com/NVIDIA-NeMo/RL/issues/227)). To avoid errors when training these models, we only allow training models with tied weights using the DTensor policy with TP=1. For Llama-3 and Qwen2.5 models, weight-tying is only enabled for the smaller models (< 2B), which can typically be trained without tensor parallelism. For Gemma-3, all model sizes have weight-tying enabled, including the larger models which require tensor parallelism. To support training of these models, we specially handle the Gemma-3 models by allowing training using the DTensor policy with TP > 1. **Special Handling:** - We skip the tied weights check for all Gemma-3 models when using the DTensor policy, allowing training using TP > 1. 
diff --git a/examples/configs/grpo-deepscaler-1.5b-8K.yaml b/examples/configs/grpo-deepscaler-1.5b-8K.yaml index 96bc7f2e76..1013f3d4c2 100644 --- a/examples/configs/grpo-deepscaler-1.5b-8K.yaml +++ b/examples/configs/grpo-deepscaler-1.5b-8K.yaml @@ -30,7 +30,7 @@ checkpointing: save_period: 10 policy: - # Qwen/Qwen2.5-1.5B has tied weights which are only supported with dtensor policy with tp size 1 (https://github.com/NVIDIA/NeMo-RL/issues/227) + # Qwen/Qwen2.5-1.5B has tied weights which are only supported with dtensor policy with tp size 1 (https://github.com/NVIDIA-NeMo/RL/issues/227) model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" tokenizer: name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml index 85cc620b62..1842b01497 100644 --- a/examples/configs/grpo_math_1B.yaml +++ b/examples/configs/grpo_math_1B.yaml @@ -30,7 +30,7 @@ checkpointing: save_period: 10 policy: - # Qwen/Qwen2.5-1.5B has tied weights which are only supported with dtensor policy with tp size 1 (https://github.com/NVIDIA/NeMo-RL/issues/227) + # Qwen/Qwen2.5-1.5B has tied weights which are only supported with dtensor policy with tp size 1 (https://github.com/NVIDIA-NeMo/RL/issues/227) model_name: "Qwen/Qwen2.5-1.5B" tokenizer: name: ${policy.model_name} ## specify if you'd like to use a tokenizer different from the model's default diff --git a/examples/configs/grpo_sliding_puzzle.yaml b/examples/configs/grpo_sliding_puzzle.yaml index 0b99e750e8..8493bfc40e 100644 --- a/examples/configs/grpo_sliding_puzzle.yaml +++ b/examples/configs/grpo_sliding_puzzle.yaml @@ -24,7 +24,7 @@ policy: max_new_tokens: ${policy.max_total_sequence_length} temperature: 1.0 # Setting top_p/top_k to 0.999/10000 to strip out Qwen's special/illegal tokens - # https://github.com/NVIDIA/NeMo-RL/issues/237 + # https://github.com/NVIDIA-NeMo/RL/issues/237 top_p: 0.999 top_k: 10000 
stop_token_ids: null @@ -38,7 +38,7 @@ policy: data: add_system_prompt: false - + env: sliding_puzzle_game: cfg: diff --git a/examples/configs/recipes/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml b/examples/configs/recipes/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml index 0ec0ef477a..2458739e2e 100644 --- a/examples/configs/recipes/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml +++ b/examples/configs/recipes/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml @@ -45,7 +45,7 @@ policy: context_parallel_size: 1 custom_parallel_plan: null dynamic_batching: - # TODO: OOMs if enabled https://github.com/NVIDIA/NeMo-RL/issues/383 + # TODO: OOMs if enabled https://github.com/NVIDIA-NeMo/RL/issues/383 enabled: False train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}} logprob_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.logprob_batch_size}} diff --git a/examples/converters/convert_dcp_to_hf.py b/examples/converters/convert_dcp_to_hf.py index fc53418696..d87d97a64e 100644 --- a/examples/converters/convert_dcp_to_hf.py +++ b/examples/converters/convert_dcp_to_hf.py @@ -51,7 +51,7 @@ def main(): model_name_or_path = config["policy"]["model_name"] # TODO: After the following PR gets merged: - # https://github.com/NVIDIA/NeMo-RL/pull/148/files + # https://github.com/NVIDIA-NeMo/RL/pull/148/files # tokenizer should be copied from policy/tokenizer/* instead of relying on the model name # We can expose a arg at the top level --tokenizer_path to plumb that through. 
# This is more stable than relying on the current NeMo-RL get_tokenizer() which can diff --git a/nemo_rl/algorithms/dpo.py b/nemo_rl/algorithms/dpo.py index c7b3de9f5f..3883328216 100644 --- a/nemo_rl/algorithms/dpo.py +++ b/nemo_rl/algorithms/dpo.py @@ -71,7 +71,7 @@ class DPOConfig(TypedDict): preference_average_log_probs: bool sft_average_log_probs: bool ## TODO(@ashors) support other loss functions - ## https://github.com/NVIDIA/NeMo-RL/issues/193 + ## https://github.com/NVIDIA-NeMo/RL/issues/193 # preference_loss: str # gt_reward_scale: float preference_loss_weight: float diff --git a/nemo_rl/distributed/ray_actor_environment_registry.py b/nemo_rl/distributed/ray_actor_environment_registry.py index 4c7eebee13..1f1937729d 100644 --- a/nemo_rl/distributed/ray_actor_environment_registry.py +++ b/nemo_rl/distributed/ray_actor_environment_registry.py @@ -17,7 +17,7 @@ ACTOR_ENVIRONMENT_REGISTRY: dict[str, str] = { "nemo_rl.models.generation.vllm.VllmGenerationWorker": PY_EXECUTABLES.VLLM, # Temporary workaround for the coupled implementation of DTensorPolicyWorker and vLLM. - # This will be reverted to PY_EXECUTABLES.BASE once https://github.com/NVIDIA/NeMo-RL/issues/501 is resolved. + # This will be reverted to PY_EXECUTABLES.BASE once https://github.com/NVIDIA-NeMo/RL/issues/501 is resolved. 
"nemo_rl.models.policy.dtensor_policy_worker.DTensorPolicyWorker": PY_EXECUTABLES.VLLM, "nemo_rl.models.policy.fsdp1_policy_worker.FSDP1PolicyWorker": PY_EXECUTABLES.BASE, "nemo_rl.models.policy.megatron_policy_worker.MegatronPolicyWorker": PY_EXECUTABLES.MCORE, diff --git a/nemo_rl/models/generation/vllm.py b/nemo_rl/models/generation/vllm.py index 7dbfbd3ea8..cc8b44d5f3 100644 --- a/nemo_rl/models/generation/vllm.py +++ b/nemo_rl/models/generation/vllm.py @@ -330,7 +330,7 @@ def _patch_vllm_init_workers_ray(): enable_prefix_caching=torch.cuda.get_device_capability()[0] >= 8, dtype=self.cfg["vllm_cfg"]["precision"], seed=seed, - # Don't use cuda-graph by default as it leads to convergence issues (see https://github.com/NVIDIA/NeMo-RL/issues/186) + # Don't use cuda-graph by default as it leads to convergence issues (see https://github.com/NVIDIA-NeMo/RL/issues/186) enforce_eager=True, max_model_len=self.cfg["vllm_cfg"]["max_model_len"], trust_remote_code=True, diff --git a/nemo_rl/models/policy/dtensor_policy_worker.py b/nemo_rl/models/policy/dtensor_policy_worker.py index 61dcd9a127..a5e1d9259d 100644 --- a/nemo_rl/models/policy/dtensor_policy_worker.py +++ b/nemo_rl/models/policy/dtensor_policy_worker.py @@ -162,7 +162,7 @@ def __init__( device_map="cpu", # load weights onto CPU initially # Always load the model in float32 to keep master weights in float32. # Keeping the master weights in lower precision has shown to cause issues with convergence. - # https://github.com/NVIDIA/NeMo-RL/issues/279 will fix the issue of CPU OOM for larger models. + # https://github.com/NVIDIA-NeMo/RL/issues/279 will fix the issue of CPU OOM for larger models. 
torch_dtype=torch.float32, trust_remote_code=True, **sliding_window_overwrite( @@ -381,7 +381,7 @@ def train( and not self.skip_tie_check ): raise ValueError( - f"Using dtensor policy with tp size {self.cfg['dtensor_cfg']['tensor_parallel_size']} for model ({self.cfg['model_name']}) that has tied weights (num_tied_weights={self.num_tied_weights}) is not supported (https://github.com/NVIDIA/NeMo-RL/issues/227). Please use dtensor policy with tensor parallel == 1 instead." + f"Using dtensor policy with tp size {self.cfg['dtensor_cfg']['tensor_parallel_size']} for model ({self.cfg['model_name']}) that has tied weights (num_tied_weights={self.num_tied_weights}) is not supported (https://github.com/NVIDIA-NeMo/RL/issues/227). Please use dtensor policy with tensor parallel == 1 instead." ) if gbs is None: gbs = self.cfg["train_global_batch_size"] diff --git a/nemo_rl/models/policy/fsdp1_policy_worker.py b/nemo_rl/models/policy/fsdp1_policy_worker.py index ef0eb98720..f4ec53daa0 100644 --- a/nemo_rl/models/policy/fsdp1_policy_worker.py +++ b/nemo_rl/models/policy/fsdp1_policy_worker.py @@ -96,7 +96,7 @@ def __init__( device_map="cpu", # load weights onto CPU initially # Always load the model in float32 to keep master weights in float32. # Keeping the master weights in lower precision has shown to cause issues with convergence. - # https://github.com/NVIDIA/NeMo-RL/issues/279 will fix the issue of CPU OOM for larger models. + # https://github.com/NVIDIA-NeMo/RL/issues/279 will fix the issue of CPU OOM for larger models. 
torch_dtype=torch.float32, trust_remote_code=True, **sliding_window_overwrite( @@ -110,7 +110,7 @@ def __init__( self.reference_model = AutoModelForCausalLM.from_pretrained( model_name, device_map="cpu", # load weights onto CPU initially - torch_dtype=torch.float32, # use full precision in sft until https://github.com/NVIDIA/nemo-rl/issues/13 is fixed + torch_dtype=torch.float32, # use full precision in sft until https://github.com/NVIDIA-NeMo/RL/issues/13 is fixed trust_remote_code=True, **sliding_window_overwrite( model_name @@ -249,7 +249,7 @@ def train( skip_tie_check = os.environ.get("NRL_SKIP_TIED_WEIGHT_CHECK") if self.num_tied_weights != 0 and not skip_tie_check: raise ValueError( - f"Using FSP1 with a model ({self.cfg['model_name']}) that has tied weights (num_tied_weights={self.num_tied_weights}) is not supported (https://github.com/NVIDIA/NeMo-RL/issues/227). Please use dtensor policy with tensor parallel == 1 instead." + f"Using FSP1 with a model ({self.cfg['model_name']}) that has tied weights (num_tied_weights={self.num_tied_weights}) is not supported (https://github.com/NVIDIA-NeMo/RL/issues/227). Please use dtensor policy with tensor parallel == 1 instead." 
) if gbs is None: diff --git a/nemo_rl/package_info.py b/nemo_rl/package_info.py index 3fcefc1375..29883366db 100644 --- a/nemo_rl/package_info.py +++ b/nemo_rl/package_info.py @@ -28,8 +28,8 @@ __contact_names__ = "NVIDIA" __contact_emails__ = "nemo-tookit@nvidia.com" __homepage__ = "https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/" -__repository_url__ = "https://github.com/NVIDIA/NeMo-RL" -__download_url__ = "https://github.com/NVIDIA/NeMo-RL/releases" +__repository_url__ = "https://github.com/NVIDIA-NeMo/RL" +__download_url__ = "https://github.com/NVIDIA-NeMo/RL/releases" __description__ = "NeMo-RL - a toolkit for model alignment" __license__ = "Apache2" __keywords__ = "deep learning, machine learning, gpu, NLP, NeMo, nvidia, pytorch, torch, language, reinforcement learning, RLHF, preference modeling, SteerLM, DPO" diff --git a/nemo_rl/utils/native_checkpoint.py b/nemo_rl/utils/native_checkpoint.py index b857264d31..43d511bd74 100644 --- a/nemo_rl/utils/native_checkpoint.py +++ b/nemo_rl/utils/native_checkpoint.py @@ -248,7 +248,7 @@ def convert_dcp_to_hf( config.save_pretrained(hf_ckpt_path) # TODO: After the following PR gets merged: - # https://github.com/NVIDIA/NeMo-RL/pull/148/files + # https://github.com/NVIDIA-NeMo/RL/pull/148/files # tokenizer should be copied from policy/tokenizer/* instead of relying on the model name # We can expose a arg at the top level --tokenizer_path to plumb that through. 
# This is more stable than relying on the current NeMo-RL get_tokenizer() which can diff --git a/pyproject.toml b/pyproject.toml index 62095ae9fb..6b1371de83 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,7 +101,7 @@ test = [ megatron-core = { workspace = true } nemo-tron = { workspace = true } # The NeMo Run source to be used by nemo-tron -nemo_run = { git = "https://github.com/NVIDIA/NeMo-Run", rev = "414f0077c648fde2c71bb1186e97ccbf96d6844c" } +nemo_run = { git = "https://github.com/NVIDIA-NeMo/Run", rev = "414f0077c648fde2c71bb1186e97ccbf96d6844c" } # torch/torchvision/triton all come from the torch index in order to pick up aarch64 wheels torch = [ { index = "pytorch-cu128" }, diff --git a/tests/functional/dpo.sh b/tests/functional/dpo.sh index 562f62a0b8..b03b611b25 100755 --- a/tests/functional/dpo.sh +++ b/tests/functional/dpo.sh @@ -36,7 +36,7 @@ uv run $PROJECT_ROOT/examples/run_dpo.py \ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS # TODO: threshold set higher since test is flaky -# https://github.com/NVIDIA/NeMo-RL/issues/370 +# https://github.com/NVIDIA-NeMo/RL/issues/370 uv run tests/check_metrics.py $JSON_METRICS \ 'data["train/loss"]["3"] < 0.8' diff --git a/tests/functional/test_converter_roundtrip.py b/tests/functional/test_converter_roundtrip.py index 90756a2f18..ea865be9b2 100644 --- a/tests/functional/test_converter_roundtrip.py +++ b/tests/functional/test_converter_roundtrip.py @@ -1,3 +1,16 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. #!/usr/bin/env python3 # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. diff --git a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.v2.sh b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.v2.sh index 9e3a004460..b22c00dec0 100755 --- a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.v2.sh +++ b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp1-long.v2.sh @@ -32,7 +32,7 @@ uv run examples/run_sft.py \ # Convert tensorboard logs to json uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS -# TODO: the memory check is known to OOM. see https://github.com/NVIDIA/NeMo-RL/issues/263 +# TODO: the memory check is known to OOM. see https://github.com/NVIDIA-NeMo/RL/issues/263 # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then # TODO: FIGURE OUT CORRECT METRICS diff --git a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.v2.sh b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.v2.sh index 26c78649c8..abed80e5ed 100755 --- a/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.v2.sh +++ b/tests/test_suites/llm/sft-llama3.1-8b-instruct-1n8g-fsdp2tp2sp.v2.sh @@ -31,7 +31,7 @@ uv run examples/run_sft.py \ # Convert tensorboard logs to json uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS -# TODO: memory check will fail due to OOM tracked here https://github.com/NVIDIA/NeMo-RL/issues/263 +# TODO: memory check will fail due to OOM tracked here https://github.com/NVIDIA-NeMo/RL/issues/263 # Only run metrics if the target step is reached if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then diff --git 
a/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v2.sh b/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v2.sh index eeaa9c8025..257add6fc5 100755 --- a/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v2.sh +++ b/tests/test_suites/llm/sft-qwen2.5-32b-4n8g-fsdp2tp8sp-actckpt.v2.sh @@ -3,7 +3,7 @@ SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd) source $SCRIPT_DIR/common.env # TODO: this config can crash on OOM -# https://github.com/NVIDIA/NeMo-RL/issues/263 +# https://github.com/NVIDIA-NeMo/RL/issues/263 # ===== BEGIN CONFIG ===== NUM_NODES=4 diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py index dc1de1b123..1404b02337 100644 --- a/tests/unit/models/generation/test_vllm_generation.py +++ b/tests/unit/models/generation/test_vllm_generation.py @@ -475,7 +475,7 @@ async def test_vllm_policy_generation_async( @pytest.mark.skip( - reason="Skipping for now, will be fixed in https://github.com/NVIDIA/NeMo-RL/issues/408" + reason="Skipping for now, will be fixed in https://github.com/NVIDIA-NeMo/RL/issues/408" ) def test_vllm_worker_seed_behavior(cluster, tokenizer): """ diff --git a/tests/unit/utils/test_native_checkpoint.py b/tests/unit/utils/test_native_checkpoint.py index 88356d2dba..7df7f8543b 100755 --- a/tests/unit/utils/test_native_checkpoint.py +++ b/tests/unit/utils/test_native_checkpoint.py @@ -330,7 +330,7 @@ def test_convert_dcp_to_hf(policy, num_gpus): os.path.join(tmp_dir, "test_hf_and_dcp-hf-offline"), simple_policy_config["model_name"], # TODO: After the following PR gets merged: - # https://github.com/NVIDIA/NeMo-RL/pull/148/files + # https://github.com/NVIDIA-NeMo/RL/pull/148/files # tokenizer should be copied from policy/tokenizer/* instead of relying on the model name # We can expose a arg at the top level --tokenizer_path to plumb that through. 
# This is more stable than relying on the current NeMo-RL get_tokenizer() which can diff --git a/uv.lock b/uv.lock index 9b50767fac..e2a02bdd23 100644 --- a/uv.lock +++ b/uv.lock @@ -2427,7 +2427,7 @@ test = [ [[package]] name = "nemo-run" version = "0.5.0rc0.dev0" -source = { git = "https://github.com/NVIDIA/NeMo-Run?rev=414f0077c648fde2c71bb1186e97ccbf96d6844c#414f0077c648fde2c71bb1186e97ccbf96d6844c" } +source = { git = "https://github.com/NVIDIA-NeMo/Run?rev=414f0077c648fde2c71bb1186e97ccbf96d6844c#414f0077c648fde2c71bb1186e97ccbf96d6844c" } dependencies = [ { name = "catalogue" }, { name = "cryptography" }, @@ -2473,7 +2473,7 @@ requires-dist = [ { name = "ijson" }, { name = "lightning" }, { name = "matplotlib" }, - { name = "nemo-run", git = "https://github.com/NVIDIA/NeMo-Run?rev=414f0077c648fde2c71bb1186e97ccbf96d6844c" }, + { name = "nemo-run", git = "https://github.com/NVIDIA-NeMo/Run?rev=414f0077c648fde2c71bb1186e97ccbf96d6844c" }, { name = "onnx" }, { name = "scikit-learn" }, { name = "webdataset" }, From ddac07c79125e18cae16d22cd181e71154503801 Mon Sep 17 00:00:00 2001 From: atfujita <40932835+AtsunoriFujita@users.noreply.github.com> Date: Wed, 2 Jul 2025 23:25:27 +0900 Subject: [PATCH 41/44] feat: add OpenAI format dataset for SFT (#485) Signed-off-by: Atsunori Fujita --- examples/run_sft.py | 8 ++ nemo_rl/data/hf_datasets/__init__.py | 2 + .../data/hf_datasets/oai_format_dataset.py | 78 ++++++++++++ .../hf_datasets/test_oai_format_dataset.py | 119 ++++++++++++++++++ 4 files changed, 207 insertions(+) create mode 100644 nemo_rl/data/hf_datasets/oai_format_dataset.py create mode 100644 tests/unit/data/hf_datasets/test_oai_format_dataset.py diff --git a/examples/run_sft.py b/examples/run_sft.py index 8eb93b5adc..ce5b258b0c 100644 --- a/examples/run_sft.py +++ b/examples/run_sft.py @@ -109,6 +109,14 @@ def setup_data(tokenizer: AutoTokenizer, data_config: DataConfig): output_key=data_config["output_key"], prompt_file=data_config["prompt_file"], ) + elif 
data_cls == "openai_format": + data = hf_datasets.OpenAIFormatDataset( + data_config["train_data_path"], + data_config["val_data_path"], + data_config["chat_key"], + data_config["system_key"], + data_config["system_prompt"], + ) else: raise ValueError(f"Unknown dataset class: {data_cls}") print( diff --git a/nemo_rl/data/hf_datasets/__init__.py b/nemo_rl/data/hf_datasets/__init__.py index 54d4fd9c34..aa5596397c 100644 --- a/nemo_rl/data/hf_datasets/__init__.py +++ b/nemo_rl/data/hf_datasets/__init__.py @@ -15,6 +15,7 @@ from nemo_rl.data.hf_datasets.chat_templates import COMMON_CHAT_TEMPLATES from nemo_rl.data.hf_datasets.dpo import DPODataset from nemo_rl.data.hf_datasets.helpsteer3 import HelpSteer3Dataset +from nemo_rl.data.hf_datasets.oai_format_dataset import OpenAIFormatDataset from nemo_rl.data.hf_datasets.oasst import OasstDataset from nemo_rl.data.hf_datasets.openmathinstruct2 import OpenMathInstruct2Dataset from nemo_rl.data.hf_datasets.prompt_response_dataset import ( @@ -26,6 +27,7 @@ "DPODataset", "HelpSteer3Dataset", "OasstDataset", + "OpenAIFormatDataset", "OpenMathInstruct2Dataset", "PromptResponseDataset", "SquadDataset", diff --git a/nemo_rl/data/hf_datasets/oai_format_dataset.py b/nemo_rl/data/hf_datasets/oai_format_dataset.py new file mode 100644 index 0000000000..22d01346bc --- /dev/null +++ b/nemo_rl/data/hf_datasets/oai_format_dataset.py @@ -0,0 +1,78 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any + +from datasets import load_dataset + +from nemo_rl.data.interfaces import TaskDataSpec + + +class OpenAIFormatDataset: + """This class is used to load an SFT dataset in the OpenAI format. + + The dataset should be in the following format: + { + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is the capital of France?"}, + {"role": "assistant", "content": "The capital of France is Paris."} + ] + } + system_key and system_prompt are optional. If provided, it will be added to the + beginning of the dataset. + chat_key should be the key of the messages list. Multi-turn conversations are + supported. + The last message in the conversation must be from the assistant. + """ + + def __init__( + self, + train_ds_path: str, + val_ds_path: str, + chat_key: str = "messages", + system_key: str = None, + system_prompt: str = None, + ): + self.chat_key = chat_key + self.system_key = system_key + self.system_prompt = system_prompt + train_original_dataset = load_dataset("json", data_files=train_ds_path)["train"] + val_original_dataset = load_dataset("json", data_files=val_ds_path)["train"] + + formatted_train_dataset = train_original_dataset.map(self.add_messages_key) + formatted_val_dataset = val_original_dataset.map(self.add_messages_key) + + self.formatted_ds = { + "train": formatted_train_dataset, + "validation": formatted_val_dataset, + } + + self.task_spec = TaskDataSpec( + "json_dataset", + ) + + def add_messages_key( + self, + example: dict[str, Any], + ) -> dict[str, list[dict[str, Any]]]: + messages = [message for message in example[self.chat_key]] + if self.system_key in example: + messages = [ + {"role": "system", "content": example[self.system_key]} + ] + messages + elif self.system_prompt: + messages = [{"role": "system", "content": self.system_prompt}] + messages + 
assert messages[-1]["role"] == "assistant" + return {"messages": messages} diff --git a/tests/unit/data/hf_datasets/test_oai_format_dataset.py b/tests/unit/data/hf_datasets/test_oai_format_dataset.py new file mode 100644 index 0000000000..4ba75a6a1d --- /dev/null +++ b/tests/unit/data/hf_datasets/test_oai_format_dataset.py @@ -0,0 +1,119 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import tempfile + +import pytest +from transformers import AutoTokenizer + +from nemo_rl.data.hf_datasets.chat_templates import COMMON_CHAT_TEMPLATES +from nemo_rl.data.hf_datasets.oai_format_dataset import ( + OpenAIFormatDataset, +) + + +@pytest.fixture +def sample_data(request): + chat_key = request.param[0] + system_key = request.param[1] + + train_data = { + chat_key: [ + {"role": "user", "content": "What is the capital of France?"}, + {"role": "assistant", "content": "The capital of France is Paris."}, + ], + } + val_data = { + chat_key: [ + {"role": "user", "content": "What is the capital of Germany?"}, + {"role": "assistant", "content": "The capital of Germany is Berlin."}, + ], + } + + if system_key is not None: + train_data[system_key] = "You are a helpful assistant." + if system_key is not None: + val_data[system_key] = "You are a helpful assistant." 
+ + # Create temporary files for train and validation data + with tempfile.NamedTemporaryFile( + mode="w", suffix=".json", delete=False + ) as train_file: + json.dump(train_data, train_file) + train_path = train_file.name + + with tempfile.NamedTemporaryFile( + mode="w", suffix=".json", delete=False + ) as val_file: + json.dump(val_data, val_file) + val_path = val_file.name + + return train_path, val_path + + +@pytest.mark.parametrize("sample_data", [("messages", None)], indirect=True) +def test_dataset_initialization(sample_data): + train_path, val_path = sample_data + dataset = OpenAIFormatDataset(train_path, val_path) + + assert dataset.chat_key == "messages" + assert "train" in dataset.formatted_ds + assert "validation" in dataset.formatted_ds + + +@pytest.mark.parametrize("sample_data", [("conversations", None)], indirect=True) +def test_custom_keys(sample_data): + train_path, val_path = sample_data + dataset = OpenAIFormatDataset( + train_path, + val_path, + chat_key="conversations", + system_prompt="You are a helpful assistant.", + ) + + assert dataset.chat_key == "conversations" + assert dataset.system_prompt == "You are a helpful assistant." + + +@pytest.mark.parametrize("sample_data", [("messages", "system_key")], indirect=True) +def test_message_formatting(sample_data): + train_path, val_path = sample_data + dataset = OpenAIFormatDataset( + train_path, val_path, chat_key="messages", system_key="system_key" + ) + + first_example = dataset.formatted_ds["train"][0] + + assert first_example["messages"][0]["role"] == "system" + assert first_example["messages"][0]["content"] == "You are a helpful assistant." + assert first_example["messages"][1]["role"] == "user" + assert first_example["messages"][1]["content"] == "What is the capital of France?" + assert first_example["messages"][2]["role"] == "assistant" + assert first_example["messages"][2]["content"] == "The capital of France is Paris." 
+ + chat_template = COMMON_CHAT_TEMPLATES.passthrough_prompt_response + tokenizer = AutoTokenizer.from_pretrained("Meta-Llama/Meta-Llama-3-8B-Instruct") + + combined_message = tokenizer.apply_chat_template( + first_example["messages"], + chat_template=chat_template, + tokenize=False, + add_generation_prompt=False, + add_special_tokens=False, + ) + + assert combined_message == "".join( + message["content"] for message in first_example["messages"] + ) From 283074abb71d82a267dd770e12e10a2e1e198a39 Mon Sep 17 00:00:00 2001 From: Parth Chadha Date: Wed, 2 Jul 2025 10:55:46 -0700 Subject: [PATCH 42/44] fix: load HF model only on rank 0 (#544) Signed-off-by: Parth Chadha --- examples/configs/evals/eval.yaml | 9 +++ nemo_rl/models/generation/vllm.py | 4 ++ .../models/policy/dtensor_policy_worker.py | 61 +++++++++++++++++-- .../unit/models/policy/test_dtensor_worker.py | 7 +++ 4 files changed, 75 insertions(+), 6 deletions(-) diff --git a/examples/configs/evals/eval.yaml b/examples/configs/evals/eval.yaml index 439acff25e..eab0f1db21 100644 --- a/examples/configs/evals/eval.yaml +++ b/examples/configs/evals/eval.yaml @@ -22,6 +22,15 @@ generation: pipeline_parallel_size: 1 gpu_memory_utilization: 0.9 max_model_len: 2048 + colocated: + # true: generation shares training GPUs + # false: uses dedicated generation resources + enabled: true + # only relevant when enabled is false + resources: + gpus_per_node: null # Decides num gpus to be dedicated to generation when there is one node in the cluster i.e cluster.num_nodes == 1 + num_nodes: null # Decides number of nodes to be dedicated to generation + tokenizer: name: ${generation.model_name} ## specify if you'd like to use a tokenizer different from the model's default diff --git a/nemo_rl/models/generation/vllm.py b/nemo_rl/models/generation/vllm.py index cc8b44d5f3..9506a063d3 100644 --- a/nemo_rl/models/generation/vllm.py +++ b/nemo_rl/models/generation/vllm.py @@ -316,6 +316,10 @@ def _patch_vllm_init_workers_ray(): 
os.environ["VLLM_USE_V1"] = os.environ.get("NRL_VLLM_USE_V1", "1") os.environ["VLLM_ALLOW_INSECURE_SERIALIZATION"] = "1" + if not self.cfg["colocated"]["enabled"]: + os.environ["NCCL_SHM_DISABLE"] = "1" + os.environ["NCCL_P2P_DISABLE"] = "1" + load_format = self.cfg["vllm_cfg"]["load_format"] if ModelFlag.VLLM_LOAD_FORMAT_AUTO.matches(self.model_name): load_format = "auto" diff --git a/nemo_rl/models/policy/dtensor_policy_worker.py b/nemo_rl/models/policy/dtensor_policy_worker.py index a5e1d9259d..46e1e8a52a 100644 --- a/nemo_rl/models/policy/dtensor_policy_worker.py +++ b/nemo_rl/models/policy/dtensor_policy_worker.py @@ -21,7 +21,12 @@ import ray import torch +from accelerate import init_empty_weights from torch import nn +from torch.distributed.checkpoint.state_dict import ( + StateDictOptions, + set_model_state_dict, +) from torch.distributed.fsdp import ( FSDPModule, ) @@ -30,7 +35,7 @@ from torch.distributed.tensor.experimental._attention import ( set_rotate_method, ) -from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer from transformers.integrations.accelerate import find_tied_parameters from transformers.models.gemma3.modeling_gemma3 import Gemma3ForCausalLM @@ -137,6 +142,15 @@ def __init__( init_reference_model: bool = True, **kwargs: Any, ): + # Disable NCCL SHM if training and generation are not co-located: https://github.com/NVIDIA-NeMo/RL/issues/564 + if ( + "generation" in config + and config["generation"] is not None + and not config["generation"]["colocated"]["enabled"] + ): + os.environ["NCCL_SHM_DISABLE"] = "1" + os.environ["NCCL_P2P_DISABLE"] = "1" + self.cfg = config # torch distributed init. 
Envars for rank, world_size, and master_addr and master_port are set from the ray remote call torch.distributed.init_process_group(backend="nccl") @@ -156,19 +170,38 @@ def __init__( else: raise ValueError(f"Unknown precision: {self.cfg['precision']}") - print(f"[Rank {self.rank}] Loading model {model_name} on CPU...") - self.model = AutoModelForCausalLM.from_pretrained( + model_config = AutoConfig.from_pretrained( model_name, - device_map="cpu", # load weights onto CPU initially # Always load the model in float32 to keep master weights in float32. # Keeping the master weights in lower precision has shown to cause issues with convergence. - # https://github.com/NVIDIA-NeMo/RL/issues/279 will fix the issue of CPU OOM for larger models. torch_dtype=torch.float32, trust_remote_code=True, **sliding_window_overwrite( model_name ), # due to https://github.com/huggingface/transformers/issues/38002 ) + + full_state_dict = None + if self.rank == 0: + print(f"[Rank {self.rank}] Loading model {model_name} on CPU...") + model = AutoModelForCausalLM.from_pretrained( + model_name, + device_map="cpu", # load weights onto CPU initially + trust_remote_code=True, + config=model_config, + ) + full_state_dict = model.state_dict() + del model + + print(f"[Rank {self.rank}] Initializing empty model for FSDP...") + # All ranks initialize model on meta device, so FSDP can shard it. + # The actual weights will be broadcast from rank 0. + + with init_empty_weights(): + self.model = AutoModelForCausalLM.from_config( + model_config, + ) + # caching since this property is not always preserved after FSDP self.num_tied_weights = len(find_tied_parameters(self.model)) self.skip_tie_check = os.environ.get( @@ -222,8 +255,24 @@ def __init__( custom_parallel_plan=self.cfg["dtensor_cfg"]["custom_parallel_plan"], ) + print(f"[Rank {self.rank}] Loading state dict from rank 0...") + # This will broadcast the state dict from rank 0 to all other ranks + # and load it into the FSDP model. 
+ set_model_state_dict( + self.model, + model_state_dict=full_state_dict, + options=StateDictOptions( + full_state_dict=True, + broadcast_from_rank0=True, + ), + ) + + # Manually broadcast buffers + for _, buf in self.model.named_buffers(): + torch.distributed.broadcast(buf, src=0) + if self.cpu_offload: - self.model = self.move_buffer_to_device(self.model, "cpu") + self.model = self.move_to_device(self.model, "cpu") # used for streaming update inference engine weights self._held_sharded_state_dict_reference: Optional[dict[str, torch.Tensor]] = ( diff --git a/tests/unit/models/policy/test_dtensor_worker.py b/tests/unit/models/policy/test_dtensor_worker.py index 0a42ea1e9f..91bf140641 100644 --- a/tests/unit/models/policy/test_dtensor_worker.py +++ b/tests/unit/models/policy/test_dtensor_worker.py @@ -61,6 +61,13 @@ def create_test_config( "top_k": None, "stop_token_ids": None, "stop_strings": None, + "colocated": { + "enabled": True, + "resources": { + "gpus_per_node": None, + "num_nodes": None, + }, + }, }, "dtensor_cfg": { "enabled": True, From e78af38cc4061dd63e5a621fad2787449edeb299 Mon Sep 17 00:00:00 2001 From: yuki <48991475+yuki-666@users.noreply.github.com> Date: Fri, 27 Jun 2025 11:24:51 +0800 Subject: [PATCH 43/44] feat: support async in non-colocated (#523) Signed-off-by: Yuki Huang Signed-off-by: Xuehan --- nemo_rl/models/generation/vllm.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/nemo_rl/models/generation/vllm.py b/nemo_rl/models/generation/vllm.py index 9506a063d3..c2e0e5c28b 100644 --- a/nemo_rl/models/generation/vllm.py +++ b/nemo_rl/models/generation/vllm.py @@ -378,6 +378,19 @@ async def init_collective_async( ), ) + async def init_collective_async( + self, data: int, ip: str, port: int, world_size: int + ) -> None: + await self.llm.collective_rpc( + "init_collective", + args=( + data, + ip, + port, + world_size, + ), + ) + def llm(self): return self.llm From 4cd4568e380e74263e5fcb9e928047651b2695a6 Mon Sep 17 00:00:00 
2001 From: Anna Shors Date: Fri, 27 Jun 2025 16:00:38 -0700 Subject: [PATCH 44/44] feat: Add megatron to hf converter (#555) Signed-off-by: Anna Shors Signed-off-by: ashors1 Signed-off-by: Xuehan --- tests/functional/test_converter_roundtrip.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/tests/functional/test_converter_roundtrip.py b/tests/functional/test_converter_roundtrip.py index ea865be9b2..9679fcc724 100644 --- a/tests/functional/test_converter_roundtrip.py +++ b/tests/functional/test_converter_roundtrip.py @@ -13,20 +13,6 @@ # limitations under the License. #!/usr/bin/env python3 -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - """ Functional test for converter roundtrip functionality.