diff --git a/docs/guides/eval.md b/docs/guides/eval.md new file mode 100644 index 0000000000..8ac5ab5675 --- /dev/null +++ b/docs/guides/eval.md @@ -0,0 +1,33 @@ +# Evaluation + +## Start Evaluation + +### Start Script +```sh +# To run the evaluation with default config (examples/configs/eval.yaml) +uv run python examples/run_eval.py + +# Specify a custom config file +uv run python examples/run_eval.py --config path/to/custom_config.yaml + +# Override specific config values via command line +uv run python examples/run_eval.py generation.model_name="Qwen/Qwen2.5-Math-7B-Instruct" +``` + +### Example Output + +``` +============================================================ +model_name='Qwen2.5-Math-1.5B-Instruct' dataset_name='aime_2024' +score=0.10 (3.0/30) +============================================================ +``` + +## Configuration + +An example Evaluation configuration file can be found [here](../../examples/configs/eval.yaml). + +### Prompt Template Configuration +Always remember to use the same `prompt_file` and `system_prompt_file` that were used during training. + +For open-source models, we recommend setting `prompt_file=null` and `system_prompt_file=null` to allow them to use their native chat templates. diff --git a/docs/index.md b/docs/index.md index 56cd64ac1b..0628f19953 100644 --- a/docs/index.md +++ b/docs/index.md @@ -18,6 +18,7 @@ cluster.md adding_new_models.md guides/sft.md guides/grpo.md +guides/eval.md ``` ```{toctree} diff --git a/examples/configs/eval.yaml b/examples/configs/eval.yaml new file mode 100644 index 0000000000..a867e9617f --- /dev/null +++ b/examples/configs/eval.yaml @@ -0,0 +1,30 @@ +# Evaluation Configuration +generation: + backend: "vllm" # only vllm is supported for evaluation + max_new_tokens: ${generation.vllm_cfg.max_model_len} + temperature: 0.0 + top_p: 1.0 + top_k: -1 # disable + num_prompts_per_step: -1 # -1 means pass all prompts at once + model_name: "Qwen/Qwen2.5-Math-1.5B-Instruct" + vllm_cfg: + tensor_parallel_size: 1 + gpu_memory_utilization: 0.9 + max_model_len: 2048 + +data: + max_input_seq_length: ${generation.vllm_cfg.max_model_len} # useless since we directly use prompts in evaluation + prompt_file: null + system_prompt_file: null + dataset_name: "HuggingFaceH4/aime_2024" + dataset_key: "train" + problem_key: "problem" + solution_key: "answer" + +env: + math: + num_workers: 8 + +cluster: + gpus_per_node: 1 + num_nodes: 1 diff --git a/examples/run_eval.py b/examples/run_eval.py new file mode 100644 index 0000000000..54358ad260 --- /dev/null +++ b/examples/run_eval.py @@ -0,0 +1,145 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
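+
+# Example invocations (mirroring docs/guides/eval.md):
+#   uv run python examples/run_eval.py
+#   uv run python examples/run_eval.py --config path/to/custom_config.yaml
+#   uv run python examples/run_eval.py generation.model_name="Qwen/Qwen2.5-Math-7B-Instruct"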
+ +import argparse +import os +import pprint +import sys + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from datasets import load_dataset +from omegaconf import OmegaConf +from transformers import AutoTokenizer + +from examples.run_grpo_math import math_data_processor +from nemo_reinforcer.data import MathDataConfig +from nemo_reinforcer.data.datasets import AllTaskProcessedDataset +from nemo_reinforcer.data.interfaces import TaskDataSpec +from nemo_reinforcer.data.llm_message_utils import remap_dataset_keys +from nemo_reinforcer.distributed.virtual_cluster import init_ray +from nemo_reinforcer.environments.math_environment import MathEnvironment +from nemo_reinforcer.evals.eval import MasterConfig, run_env_eval, setup +from nemo_reinforcer.models.generation.interfaces import GenerationConfig + + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser(description="Run Evaluation with configuration") + parser.add_argument( + "--config", type=str, default=None, help="Path to YAML config file" + ) + + # Parse known args for the script + args, remaining = parser.parse_known_args() + + # Convert remaining args to OmegaConf format + overrides = OmegaConf.from_dotlist(remaining) + + return args, overrides + + +def setup_data( + data_config: MathDataConfig, generation_config: GenerationConfig, env_configs +): + print("\n▶ Setting up data...") + math_task_spec = TaskDataSpec( + task_name="math", + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + + # load dataset + base_dataset = load_dataset(data_config["dataset_name"]) + if data_config["dataset_key"] is not None: + base_dataset = base_dataset[data_config["dataset_key"]] + # remap problem and solution keys + remapped_dataset = remap_dataset_keys( + base_dataset, + mapping_dict={ + data_config["problem_key"]: "problem", + data_config["solution_key"]: "expected_answer", + }, + ) + + tokenizer = AutoTokenizer.from_pretrained(generation_config["model_name"]) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + math_env = MathEnvironment.options( + runtime_env={"py_executable": MathEnvironment.DEFAULT_PY_EXECUTABLE} + ).remote(env_configs["math"]) + + dataset = AllTaskProcessedDataset( + dataset=remapped_dataset, + tokenizer=tokenizer, + default_task_data_spec=math_task_spec, + task_data_processors=math_data_processor, + max_seq_length=data_config["max_input_seq_length"], + ) + + return dataset, math_env, tokenizer + + +def main(): + """Main entry point.""" + # Parse arguments + args, overrides = parse_args() + + if not args.config: + args.config = os.path.join(os.path.dirname(__file__), "configs", "eval.yaml") + + config = OmegaConf.load(args.config) + print(f"Loaded configuration from: {args.config}") + + if overrides: + override_conf = OmegaConf.from_cli() + print(f"Overrides: {override_conf}") + config = OmegaConf.merge(config, override_conf) + + config: MasterConfig = OmegaConf.to_container(config, resolve=True) + print("Applied CLI overrides") + + # Print config + print("Final config:") + pprint.pprint(config) + + # Init ray + init_ray() + + # Setup data + ( + dataset, + math_env, + tokenizer, + ) = setup_data(config["data"], config["generation"], config["env"]) + + # Setup + ( + vllm_generation, + dataloader, + master_config, + ) = setup(config, tokenizer, dataset) + + # Run evaluation + run_env_eval( + vllm_generation, + dataloader, + math_env, + master_config, + ) + + +if __name__ == 
"__main__": + main() diff --git a/examples/run_grpo_math.py b/examples/run_grpo_math.py index 7e2f3e693d..b87c7037b3 100644 --- a/examples/run_grpo_math.py +++ b/examples/run_grpo_math.py @@ -122,6 +122,8 @@ def math_data_processor( template = task_data_spec.custom_template message_log: LLMMessageLogType = [] + + # system prompt if task_data_spec.system_prompt: sys_message = {"role": "system", "content": task_data_spec.system_prompt} message = tokenizer.apply_chat_template( @@ -135,10 +137,11 @@ def math_data_processor( 0 ] message_log.append(sys_message) - user_message = { - "role": "user", - "content": task_data_spec.prompt.format(problem), - } + + # user prompt + if task_data_spec.prompt: + problem = task_data_spec.prompt.format(problem) + user_message = {"role": "user", "content": problem} message = tokenizer.apply_chat_template( [user_message], chat_template=template, @@ -167,8 +170,9 @@ def math_data_processor( "extra_env_info": extra_env_info, "loss_multiplier": loss_multiplier, "idx": idx, - "task_name": datum_dict["task_name"], } + if "task_name" in datum_dict: + output["task_name"] = datum_dict["task_name"] return output diff --git a/nemo_reinforcer/data/__init__.py b/nemo_reinforcer/data/__init__.py index 63aad516b2..09eaf35fb5 100644 --- a/nemo_reinforcer/data/__init__.py +++ b/nemo_reinforcer/data/__init__.py @@ -21,3 +21,8 @@ class DataConfig(TypedDict): system_prompt_file: Optional[str] dataset_name: str val_dataset_name: Optional[str] + + +class MathDataConfig(DataConfig): + problem_key: str + solution_key: str diff --git a/nemo_reinforcer/data/datasets.py b/nemo_reinforcer/data/datasets.py index 033cee05d7..8a81c85fb2 100644 --- a/nemo_reinforcer/data/datasets.py +++ b/nemo_reinforcer/data/datasets.py @@ -130,3 +130,54 @@ def rl_collate_fn(data_batch: List[DatumSpec]) -> BatchedDataDict: batch_max_length=batch_max_length, ) return output + + +def eval_collate_fn(data_batch: List[DatumSpec]) -> BatchedDataDict: + """Collate function for evaluation. + + Takes a list of data samples and combines them into a single batched dictionary + for model evaluation. + + Args: + data_batch: List of data samples with message_log, extra_env_info, and idx fields. + + Returns: + BatchedDataDict with message_log, extra_env_info, and idx fields. + + Examples: + ```{doctest} + >>> import torch + >>> from nemo_reinforcer.data.datasets import eval_collate_fn + >>> from nemo_reinforcer.data.interfaces import DatumSpec + >>> data_batch = [ + ... DatumSpec( + ... message_log=[{"role": "user", "content": "Hello", "token_ids": torch.tensor([1, 2, 3])}], + ... extra_env_info={'ground_truth': '1'}, + ... idx=0, + ... ), + ... DatumSpec( + ... message_log=[{"role": "assistant", "content": "Hi there", "token_ids": torch.tensor([4, 5, 6, 7])}], + ... extra_env_info={'ground_truth': '2'}, + ... idx=1, + ... ), + ... 
] + >>> output = eval_collate_fn(data_batch) + >>> output['message_log'][0] + [{'role': 'user', 'content': 'Hello', 'token_ids': tensor([1, 2, 3])}] + >>> output['message_log'][1] + [{'role': 'assistant', 'content': 'Hi there', 'token_ids': tensor([4, 5, 6, 7])}] + >>> output['extra_env_info'] + [{'ground_truth': '1'}, {'ground_truth': '2'}] + >>> output['idx'] + [0, 1] + ``` + """ + message_log = [datum_spec["message_log"] for datum_spec in data_batch] + extra_env_info = [datum_spec["extra_env_info"] for datum_spec in data_batch] + idx = [datum_spec["idx"] for datum_spec in data_batch] + + output = BatchedDataDict( + message_log=message_log, + extra_env_info=extra_env_info, + idx=idx, + ) + return output diff --git a/nemo_reinforcer/data/llm_message_utils.py b/nemo_reinforcer/data/llm_message_utils.py index 43e24fc1ce..5ae8bee9a8 100644 --- a/nemo_reinforcer/data/llm_message_utils.py +++ b/nemo_reinforcer/data/llm_message_utils.py @@ -11,10 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Union - +from typing import Dict, List import torch +from datasets import Dataset from nemo_reinforcer.data.interfaces import ( LLMMessageLogType, @@ -390,3 +390,27 @@ def get_formatted_message_log( prev_formatted_message = formatted_message return message_log + + +def remap_dataset_keys( + dataset: Dataset, + mapping_dict: Dict[str, str], +) -> Dataset: + """Remap dataset keys as per mapping. + + Args: + dataset: The input dataset to remap keys in + mapping_dict: A dictionary mapping input keys to output keys + + Returns: + Dataset: A new dataset with remapped keys + """ + # no need to remap if the keys are already correct + if all(k == v for k, v in mapping_dict.items()): + return dataset + + # return the remapped dataset + return dataset.map( + lambda x: {v: x[k] for k, v in mapping_dict.items()}, + remove_columns=list(mapping_dict.keys()), + ) diff --git a/nemo_reinforcer/evals/eval.py b/nemo_reinforcer/evals/eval.py new file mode 100644 index 0000000000..33d486a4d5 --- /dev/null +++ b/nemo_reinforcer/evals/eval.py @@ -0,0 +1,191 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
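+
+# Evaluation driver for the vLLM backend: `setup` builds the Ray cluster, the
+# vLLM generation workers, and a DataLoader over the processed eval dataset;
+# `run_env_eval` generates a response for every prompt, scores the responses
+# with the environment, and prints the average reward over the dataset.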
+ +import os +from typing import Tuple, TypedDict + +import ray +from torch.utils.data import DataLoader +from transformers import AutoTokenizer + +from nemo_reinforcer.data import MathDataConfig +from nemo_reinforcer.data.datasets import AllTaskProcessedDataset, eval_collate_fn +from nemo_reinforcer.data.llm_message_utils import get_keys_from_message_log +from nemo_reinforcer.distributed.batched_data_dict import BatchedDataDict +from nemo_reinforcer.distributed.virtual_cluster import ClusterConfig, RayVirtualCluster +from nemo_reinforcer.environments.math_environment import MathEnvConfig +from nemo_reinforcer.models.generation.interfaces import GenerationConfig +from nemo_reinforcer.models.generation.vllm import VllmGeneration + + +# =============================================================================== +# Configuration +# =============================================================================== + + +class MasterConfig(TypedDict): + generation: GenerationConfig + data: MathDataConfig + env: MathEnvConfig + cluster: ClusterConfig + + +# =============================================================================== +# Setup & Initialization +# =============================================================================== + + +def setup( + master_config: MasterConfig, + tokenizer: AutoTokenizer, + dataset: AllTaskProcessedDataset, +) -> Tuple[ + VllmGeneration, + DataLoader, + MasterConfig, +]: + """Set up components for model evaluation. + + Initializes the vLLM generation backend and data loader. + + Args: + master_config: Configuration settings. + tokenizer: Tokenizer providing the pad and eos token ids used for generation. + dataset: Dataset to evaluate on. + + Returns: + vLLM generation interface, data loader, and config. + """ + # Extract individual configs for easier access + generation_config = master_config["generation"] + cluster_config = master_config["cluster"] + + # ========================== + # Data + # ========================== + if generation_config["num_prompts_per_step"] == -1: + generation_config["num_prompts_per_step"] = len(dataset) + dataloader = DataLoader( + dataset, + batch_size=generation_config["num_prompts_per_step"], + shuffle=False, + collate_fn=eval_collate_fn, + ) + print(f" ✓ Evaluation dataset loaded with {len(dataset)} samples") + + # ========================== + # Cluster + # ========================== + print("\n▶ Setting up compute cluster...") + cluster = RayVirtualCluster( + name="eval_cluster", + bundle_ct_per_node_list=[cluster_config["gpus_per_node"]] + * cluster_config["num_nodes"], + use_gpus=True, + num_gpus_per_node=cluster_config["gpus_per_node"], + max_colocated_worker_groups=1, + ) + print(f" ✓ Ray cluster initialized with {cluster_config['num_nodes']} nodes") + + # ========================== + # Model + # ========================== + print("\n▶ Setting up model...") + # check backend + backend = generation_config["backend"] + assert backend == "vllm", "Only vLLM backend is supported for evaluation" + + # set vllm config + generation_config["vllm_cfg"]["load_format"] = "auto" + generation_config["vllm_cfg"]["skip_tokenizer_init"] = False + generation_config["stop_token_ids"] = [tokenizer.eos_token_id] + generation_config["pad_token"] = tokenizer.pad_token_id + + # initialize vllm generation + vllm_generation = VllmGeneration(cluster=cluster, config=generation_config) + print( + f" ✓ Using vLLM backend for generation with {generation_config['model_name']}" + ) + + print("\n" + "=" * 60) + print(" " * 18 + "SETUP COMPLETE") + print("=" * 60 + "\n") + + return ( + vllm_generation, + dataloader, + master_config, + ) + + +# 
=============================================================================== +# Evaluation +# =============================================================================== + + +def run_env_eval(vllm_generation, dataloader, env, master_config): + """Main entry point for running evaluation using environment. + + Generates model responses and evaluates them by env. + + Args: + vllm_generation: Model for generating responses. + dataloader: Data loader with evaluation samples. + env: Environment that scores responses. + master_config: Configuration settings. + """ + # Run evaluation loop + score, count = 0.0, 0 + for batch in dataloader: + # get input prompt from message_log + prompts = [] + for message_log in batch["message_log"]: + content = [message["content"] for message in message_log] + content = "\n".join(content) + prompts.append(content) + # generate by vllm + inputs = BatchedDataDict({"prompts": prompts}) + outputs = vllm_generation.generate_text(inputs)["texts"] + + # append to message_log + for idx, output in enumerate(outputs): + batch["message_log"][idx].append( + { + "role": "assistant", + "content": output, + } + ) + + # evaluate generations with the environment + to_env = [ + get_keys_from_message_log(batch["message_log"][i], ["role", "content"]) + for i in range(len(batch["message_log"])) + ] + _, _, rewards, _ = ray.get(env.step.remote(to_env, batch["extra_env_info"])) + + score += rewards.sum().item() + count += len(rewards) + + # Cleanup before printing results + ray.get(env.shutdown.remote()) + vllm_generation.shutdown() + + # Print results + dataset_name = os.path.basename(master_config["data"]["dataset_name"]) + model_name = os.path.basename(master_config["generation"]["model_name"]) + average_score = score / count + + print("\n" + "=" * 60) + print(f"{model_name=} {dataset_name=}") + print(f"score={average_score:.2f} ({score}/{count})") + print("=" * 60 + "\n") diff --git a/nemo_reinforcer/evals/run_env_eval.py b/nemo_reinforcer/evals/run_env_eval.py deleted file mode 100644 index 341a77c5bc..0000000000 --- a/nemo_reinforcer/evals/run_env_eval.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
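The scoring loop in `run_env_eval` above is easiest to follow on toy data: each sample's message contents are joined into one prompt string, the prompts are batched for `generate_text`, and the per-sample rewards returned by the environment are averaged into the reported score. A minimal sketch of that bookkeeping (illustrative only; the hard-coded `rewards` stand in for the output of `env.step`, and the binary values are an assumption consistent with the example output in docs/guides/eval.md):

```python
# Illustrative sketch of run_env_eval's bookkeeping; the real loop calls
# vllm_generation.generate_text(BatchedDataDict({"prompts": prompts})) and
# ray.get(env.step.remote(...)) on a Ray cluster.
message_logs = [
    [{"role": "user", "content": "What is 1 + 1?"}],
    [{"role": "user", "content": "Compute 2 * 3."}],
]

# Prompt construction mirrors run_env_eval: join every message's content.
prompts = ["\n".join(m["content"] for m in log) for log in message_logs]
print(prompts)  # ['What is 1 + 1?', 'Compute 2 * 3.']

# Hypothetical per-sample rewards (binary correctness).
rewards = [1.0, 0.0]

score, count = sum(rewards), len(rewards)
print(f"score={score / count:.2f} ({score}/{count})")  # score=0.50 (1.0/2)
```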
diff --git a/nemo_reinforcer/models/generation/vllm.py b/nemo_reinforcer/models/generation/vllm.py index cb60b7fe8c..2395b65e34 100644 --- a/nemo_reinforcer/models/generation/vllm.py +++ b/nemo_reinforcer/models/generation/vllm.py @@ -200,6 +200,7 @@ def generate( Args: data: BatchedDataDict containing input_ids and input_lengths tensors + greedy: Whether to use greedy decoding instead of sampling Returns: BatchedDataDict conforming to GenerationOutputSpec: @@ -246,7 +247,7 @@ def generate( # Read generation parameters from config top_k = self.cfg["top_k"] if self.cfg["top_k"] is not None else -1 sampling_params = self.SamplingParams( - temperature=self.cfg["temperature"], + temperature=self.cfg["temperature"] if not greedy else 0, top_p=self.cfg["top_p"], top_k=top_k if not greedy @@ -330,6 +331,37 @@ def generate( return return_data + def generate_text( + self, data: BatchedDataDict[GenerationDatumSpec], greedy: bool = False + ) -> BatchedDataDict[GenerationOutputSpec]: + """Generate text responses using vLLM generation. + + Args: + data: BatchedDataDict containing prompts with text strings + greedy: Whether to use greedy decoding instead of sampling + + Returns: + BatchedDataDict containing: + - texts: List of generated text responses + """ + # Read generation parameters from config + top_k = self.cfg["top_k"] if self.cfg["top_k"] is not None else -1 + sampling_params = self.SamplingParams( + temperature=self.cfg["temperature"] if not greedy else 0, + top_p=self.cfg["top_p"], + top_k=top_k if not greedy else 1, + max_tokens=self.cfg["max_new_tokens"], + stop=self.cfg.get("stop_sequences", None), + ) + + # Generate outputs + outputs = self.llm.generate(data["prompts"], sampling_params) + texts = [output.outputs[0].text for output in outputs] + + # Convert to BatchedDataDict + return_data = BatchedDataDict({"texts": texts}) + return return_data + def shutdown(self): """Clean up vLLM resources.""" try: @@ -537,6 +569,42 @@ def generate( return combined + def generate_text( + self, data: BatchedDataDict[GenerationDatumSpec], greedy: bool = False + ) -> BatchedDataDict[GenerationOutputSpec]: + """Generate text responses using vLLM.""" + assert isinstance(data, BatchedDataDict), ( + f"data must be a BatchedDataDict, got type: {type(data)}" + ) + + # Get total batch size + batch_size = len(data["prompts"]) + + # Shard the data across the tied worker groups + sharded_data = data.shard_by_batch_size(self.dp_size, batch_size=batch_size) + future_bundle = self.worker_group.run_all_workers_multiple_data( + "generate_text", + sharded_data, + common_kwargs={"greedy": greedy}, + respect_tied_workers=True, + ) + + # Get results from the workers, respecting tied worker groups (only one result per tied worker group) + results = self.worker_group.get_all_worker_results(future_bundle) + + # Combine results from all tied worker groups + combined = BatchedDataDict.from_batches(results) + + # Verify the output has all required fields + required_keys = ["texts"] + missing_keys = [key for key in required_keys if key not in combined] + if missing_keys: + raise ValueError( + f"Missing required keys for GenerationOutputSpec: {missing_keys}" + ) + + return combined + def prepare_for_generation(self, *args, **kwargs): """Abstract method that must be implemented by subclasses.""" try: diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py index af8b5698e3..8c810e31dc 100644 --- a/tests/unit/models/generation/test_vllm_generation.py +++ 
b/tests/unit/models/generation/test_vllm_generation.py @@ -40,10 +40,14 @@ } -def configure_vllm_with_tokenizer(vllm_config, tokenizer): +def configure_vllm_with_tokenizer(vllm_config, tokenizer, is_eval=False): """Apply tokenizer-specific configurations to vLLM config.""" - vllm_config["vllm_cfg"]["skip_tokenizer_init"] = True - vllm_config["vllm_cfg"]["load_format"] = "dummy" + if is_eval: + vllm_config["vllm_cfg"]["skip_tokenizer_init"] = False + vllm_config["vllm_cfg"]["load_format"] = "auto" + else: + vllm_config["vllm_cfg"]["skip_tokenizer_init"] = True + vllm_config["vllm_cfg"]["load_format"] = "dummy" vllm_config["pad_token"] = tokenizer.pad_token_id vllm_config["stop_token_ids"] = [tokenizer.eos_token_id] return vllm_config @@ -532,3 +536,38 @@ def test_vllm_policy_weight_update(cluster, tokenizer, tensor_parallel_size): # Clean up vllm_policy.shutdown() + + +def test_vllm_generate_text(cluster, tokenizer): + """Test that vLLM can generate text.""" + # Prepare test data + test_prompts = [ + "Hello, my name is", + "The capital of France is", + ] + test_prompts = BatchedDataDict({"prompts": test_prompts}) + + # Create separate configs for each policy + vllm_config = basic_vllm_test_config.copy() + vllm_config = configure_vllm_with_tokenizer(vllm_config, tokenizer, is_eval=True) + + # Ensure we can get same output + assert vllm_config["model_name"] == "meta-llama/Llama-3.2-1B", ( + "Model name should be meta-llama/Llama-3.2-1B to get expected output" + ) + assert vllm_config["vllm_cfg"]["tensor_parallel_size"] == 1, ( + "Tensor parallel size should be 1 to get expected output" + ) + + # Create vLLM generation + vllm_generation = VllmGeneration(cluster, vllm_config) + + # Generate and check result + output = vllm_generation.generate_text(test_prompts, greedy=True) + assert output["texts"] == [ + " Kelsey and I am a 2018 graduate", + " Paris. The city is located in the north of", + ], "Output should be the same as the expected output" + + # Clean up + vllm_generation.shutdown()
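Taken together, the new `generate_text` path is the piece evaluation relies on: it accepts a `BatchedDataDict` with raw prompt strings under `"prompts"` and returns the generated strings under `"texts"`, with `greedy=True` forcing temperature 0 and top_k 1. A minimal usage sketch, assuming a Ray cluster and an eval-style vLLM config (`load_format="auto"`, `skip_tokenizer_init=False`, pad/stop tokens set) have already been prepared as in the test above:

```python
from nemo_reinforcer.distributed.batched_data_dict import BatchedDataDict
from nemo_reinforcer.models.generation.vllm import VllmGeneration

# `cluster` and `vllm_config` are assumed to exist and to be configured for
# evaluation, as in examples/run_eval.py and the test above.
vllm_generation = VllmGeneration(cluster, vllm_config)

inputs = BatchedDataDict({"prompts": ["The capital of France is"]})
outputs = vllm_generation.generate_text(inputs, greedy=True)

for prompt, text in zip(inputs["prompts"], outputs["texts"]):
    print(f"{prompt!r} -> {text!r}")

vllm_generation.shutdown()
```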