From 21d752b9621486a13db240aa708b87f620b1366e Mon Sep 17 00:00:00 2001 From: Yuki Huang Date: Mon, 31 Mar 2025 06:44:13 +0000 Subject: [PATCH 1/5] fix error padding in vllm and hf Signed-off-by: Yuki Huang --- examples/run_grpo_math.py | 5 ++-- examples/run_sft.py | 4 +-- nemo_reinforcer/algorithms/utils.py | 9 ++++++ nemo_reinforcer/models/generation/vllm.py | 8 ++++-- nemo_reinforcer/models/policy/hf_policy.py | 15 +++++++--- .../models/generation/test_vllm_generation.py | 7 ++--- .../unit/models/policy/test_hf_ray_policy.py | 28 +++++++++++++------ 7 files changed, 51 insertions(+), 25 deletions(-) diff --git a/examples/run_grpo_math.py b/examples/run_grpo_math.py index c67829e80e..ea13bcffb4 100644 --- a/examples/run_grpo_math.py +++ b/examples/run_grpo_math.py @@ -23,6 +23,7 @@ from transformers import AutoTokenizer from nemo_reinforcer.algorithms.grpo import MasterConfig, grpo_train, setup +from nemo_reinforcer.algorithms.utils import get_tokenizer from nemo_reinforcer.data import DataConfig from nemo_reinforcer.data.datasets import AllTaskProcessedDataset, rl_collate_fn from nemo_reinforcer.data.hf_datasets.openmathinstruct2 import OpenMathInstruct2Dataset @@ -187,9 +188,7 @@ def setup_data(data_config: DataConfig, policy_config: PolicyConfig, env_configs else: raise ValueError(f"No processor for dataset {data_config['dataset_name']}.") - tokenizer = AutoTokenizer.from_pretrained(policy_config["model_name"]) - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token + tokenizer = get_tokenizer(policy_config["model_name"]) task_data_processors = defaultdict( lambda: (math_task_spec, openinstructmath2_data_processor) diff --git a/examples/run_sft.py b/examples/run_sft.py index 950938b4da..d09d949ce3 100644 --- a/examples/run_sft.py +++ b/examples/run_sft.py @@ -20,6 +20,7 @@ from omegaconf import OmegaConf from nemo_reinforcer.algorithms.sft import MasterConfig, sft_train, setup +from nemo_reinforcer.algorithms.utils import get_tokenizer from nemo_reinforcer.distributed.virtual_cluster import init_ray from nemo_reinforcer.utils.config import load_config from nemo_reinforcer.utils.logger import get_next_experiment_dir @@ -27,7 +28,6 @@ from nemo_reinforcer.data.datasets import AllTaskProcessedDataset from nemo_reinforcer.data.interfaces import TaskDataSpec, DatumSpec from nemo_reinforcer.data.llm_message_utils import get_formatted_message_log -from transformers import AutoTokenizer from nemo_reinforcer.models.policy import PolicyConfig @@ -100,7 +100,7 @@ def setup_data(data_config: DataConfig, policy_config: PolicyConfig): val_dataset = data.formatted_ds["validation"] sft_task_spec = data.task_spec - tokenizer = AutoTokenizer.from_pretrained(policy_config["model_name"]) + tokenizer = get_tokenizer(policy_config["model_name"]) train_dataset = AllTaskProcessedDataset( train_dataset, diff --git a/nemo_reinforcer/algorithms/utils.py b/nemo_reinforcer/algorithms/utils.py index 138c3802d1..a3c42e2a19 100644 --- a/nemo_reinforcer/algorithms/utils.py +++ b/nemo_reinforcer/algorithms/utils.py @@ -18,6 +18,7 @@ import numpy as np import torch from torch.masked import as_masked_tensor +from transformers import AutoTokenizer def calculate_kl_penalty_joschu2020( @@ -130,3 +131,11 @@ def set_seed(seed: int): np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) + + +def get_tokenizer(model_name: str) -> AutoTokenizer: + """Get the tokenizer and set pad token to eos token if it is not already set.""" + tokenizer = AutoTokenizer.from_pretrained(model_name) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + return tokenizer diff --git a/nemo_reinforcer/models/generation/vllm.py b/nemo_reinforcer/models/generation/vllm.py index 4ffbb3e2ff..7cfd3b0f7f 100644 --- a/nemo_reinforcer/models/generation/vllm.py +++ b/nemo_reinforcer/models/generation/vllm.py @@ -516,7 +516,9 @@ def generate( results = self.worker_group.get_all_worker_results(future_bundle) # Combine results from all tied worker groups - combined = BatchedDataDict.from_batches(results) + combined = BatchedDataDict.from_batches( + results, pad_value_dict={"output_ids": self.cfg["pad_token"]} + ) # Verify the output has all required fields required_keys = [ @@ -557,7 +559,9 @@ def generate_text( results = self.worker_group.get_all_worker_results(future_bundle) # Combine results from all tied worker groups - combined = BatchedDataDict.from_batches(results) + combined = BatchedDataDict.from_batches( + results, pad_value_dict={"output_ids": self.cfg["pad_token"]} + ) # Verify the output has all required fields required_keys = ["texts"] diff --git a/nemo_reinforcer/models/policy/hf_policy.py b/nemo_reinforcer/models/policy/hf_policy.py index 3a316ba3ae..f1234e7fa2 100644 --- a/nemo_reinforcer/models/policy/hf_policy.py +++ b/nemo_reinforcer/models/policy/hf_policy.py @@ -519,7 +519,7 @@ def generate( batch_size, seq_len = input_ids.shape # Convert right padding to left padding - left_padded_input_ids = torch.zeros_like(input_ids) + left_padded_input_ids = torch.full_like(input_ids, gen_cfg["pad_token"]) left_padded_attention_mask = torch.zeros( (batch_size, seq_len), dtype=torch.long, device=input_ids.device ) @@ -569,7 +569,12 @@ def generate( micro_batches.append(mb) # Get lengths, pad, and concatenate all batches - return_data = BatchedDataDict.from_batches(micro_batches) + return_data = BatchedDataDict.from_batches( + micro_batches, + pad_value_dict={ + "left_padded_output_ids": self.cfg["generation"]["pad_token"] + }, + ) # Calculate the lengths of generations for each sequence by finding stop tokens generation_lengths = [] @@ -581,8 +586,9 @@ def generate( max_seq_len = max( [seq.size(0) for seq in return_data["left_padded_output_ids"]] ) - right_padded_output_ids = torch.zeros( + right_padded_output_ids = torch.full( (batch_size, max_seq_len), + self.cfg["generation"]["pad_token"], dtype=return_data["left_padded_output_ids"][0].dtype, device=return_data["left_padded_output_ids"][0].device, ) @@ -1017,7 +1023,8 @@ def generate( "generate", sharded_data, common_kwargs={"greedy": greedy} ) result = BatchedDataDict.from_batches( - self.worker_group.get_all_worker_results(futures) + self.worker_group.get_all_worker_results(futures), + pad_value_dict={"output_ids": self.cfg["generation"]["pad_token"]}, ) # Verify the output has all required fields diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py index a5bcda1ff6..11e12d8ee9 100644 --- a/tests/unit/models/generation/test_vllm_generation.py +++ b/tests/unit/models/generation/test_vllm_generation.py @@ -17,8 +17,7 @@ import ray import numpy as np -from transformers import AutoTokenizer - +from nemo_reinforcer.algorithms.utils import get_tokenizer from nemo_reinforcer.distributed.virtual_cluster import RayVirtualCluster from nemo_reinforcer.distributed.batched_data_dict import BatchedDataDict from nemo_reinforcer.models.generation.vllm import VllmGeneration, VllmConfig @@ -82,9 +81,7 @@ def cluster(): def tokenizer(): """Initialize tokenizer for the test model.""" model_name = basic_vllm_test_config["model_name"] - tokenizer = AutoTokenizer.from_pretrained(model_name) - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token + tokenizer = get_tokenizer(model_name) return tokenizer diff --git a/tests/unit/models/policy/test_hf_ray_policy.py b/tests/unit/models/policy/test_hf_ray_policy.py index ded244feac..dc570ba66a 100644 --- a/tests/unit/models/policy/test_hf_ray_policy.py +++ b/tests/unit/models/policy/test_hf_ray_policy.py @@ -16,13 +16,13 @@ import pprint import torch +from nemo_reinforcer.algorithms.utils import get_tokenizer from nemo_reinforcer.models.policy import PolicyConfig from nemo_reinforcer.models.policy.hf_policy import HfPolicy from nemo_reinforcer.distributed.virtual_cluster import RayVirtualCluster from nemo_reinforcer.distributed.batched_data_dict import BatchedDataDict from nemo_reinforcer.algorithms.interfaces import LossFunction from tests.unit.test_utils import simple_loss, nll_loss -from transformers import AutoTokenizer basic_llama_test_config: PolicyConfig = { @@ -66,8 +66,16 @@ def gc_collect(): gc.collect() +@pytest.fixture(scope="function") +def tokenizer(): + """Initialize tokenizer for the test model.""" + model_name = basic_llama_test_config["model_name"] + tokenizer = get_tokenizer(model_name) + return tokenizer + + @pytest.fixture -def policy_setup(): +def policy_setup(tokenizer): """Setup and teardown for policy tests - creates a virtual cluster and policy.""" policy = None cluster = None @@ -84,6 +92,8 @@ def policy_setup(): ) config = basic_llama_test_config + config["generation"]["pad_token"] = tokenizer.pad_token_id + config["generation"]["stop_token_ids"] = [tokenizer.eos_token_id] print("Creating HfPolicy...") policy = HfPolicy(cluster=cluster, config=config) @@ -278,7 +288,7 @@ def verify_loss_tensor(loss_tensor): @pytest.fixture -def generation_setup(request): +def generation_setup(request, tokenizer): """Setup and teardown specifically for generation tests.""" policy = None cluster = None @@ -298,6 +308,8 @@ def generation_setup(request): ) config = basic_llama_test_config + config["generation"]["pad_token"] = tokenizer.pad_token_id + config["generation"]["stop_token_ids"] = [tokenizer.eos_token_id] print("Creating generation HfPolicy...") policy = HfPolicy( @@ -331,8 +343,6 @@ def generation_setup(request): ] # Tokenize the prompts - tokenizer = AutoTokenizer.from_pretrained(config["model_name"]) - tokenizer.pad_token = tokenizer.eos_token tokenized = tokenizer( prompts, padding=True, @@ -353,7 +363,7 @@ def generation_setup(request): ) # Provide the resources to the test - yield policy, cluster, data, tokenizer, prompts, expected_generations + yield policy, cluster, data, prompts, expected_generations except Exception as e: print(f"Error during generation setup: {e}") @@ -367,8 +377,8 @@ def generation_setup(request): @pytest.mark.timeout(180) @pytest.mark.parametrize("generation_setup", [False], indirect=True) -def test_hf_policy_generation(generation_setup, tracker): - policy, cluster, data, tokenizer, prompts, expected_generations = generation_setup +def test_hf_policy_generation(generation_setup, tokenizer, tracker): + policy, cluster, data, prompts, expected_generations = generation_setup # Verify resources were created properly assert policy is not None, "Generation policy was not created properly" @@ -455,7 +465,7 @@ def test_hf_policy_generation(generation_setup, tracker): @pytest.mark.timeout(180) @pytest.mark.parametrize("generation_setup", [True], indirect=True) def test_all_hf_policy_generation_lps_ref_training(generation_setup): - policy, cluster, data, tokenizer, prompts, expected_generations = generation_setup + policy, cluster, data, prompts, expected_generations = generation_setup # Verify resources were created properly assert policy is not None, "Generation policy was not created properly" From 25817824f7f46bea83478f863a146ad236debf27 Mon Sep 17 00:00:00 2001 From: Yuki Huang Date: Tue, 1 Apr 2025 14:39:46 +0000 Subject: [PATCH 2/5] add configure_generation_config Signed-off-by: Yuki Huang --- examples/run_eval.py | 19 +++++++------ examples/run_grpo_math.py | 28 ++++++++++++------- examples/run_sft.py | 27 ++++++++++-------- nemo_reinforcer/algorithms/grpo.py | 5 ---- nemo_reinforcer/evals/eval.py | 6 ---- .../models/generation/interfaces.py | 25 ++++++++++++++++- nemo_reinforcer/models/generation/vllm.py | 8 +++--- nemo_reinforcer/models/policy/hf_policy.py | 10 ++++--- .../models/generation/test_vllm_generation.py | 26 +++++------------ .../unit/models/policy/test_hf_ray_policy.py | 15 +++++----- 10 files changed, 93 insertions(+), 76 deletions(-) diff --git a/examples/run_eval.py b/examples/run_eval.py index 54358ad260..90a7c23235 100644 --- a/examples/run_eval.py +++ b/examples/run_eval.py @@ -24,6 +24,7 @@ from transformers import AutoTokenizer from examples.run_grpo_math import math_data_processor +from nemo_reinforcer.algorithms.utils import get_tokenizer from nemo_reinforcer.data import MathDataConfig from nemo_reinforcer.data.datasets import AllTaskProcessedDataset from nemo_reinforcer.data.interfaces import TaskDataSpec @@ -31,7 +32,7 @@ from nemo_reinforcer.distributed.virtual_cluster import init_ray from nemo_reinforcer.environments.math_environment import MathEnvironment from nemo_reinforcer.evals.eval import MasterConfig, run_env_eval, setup -from nemo_reinforcer.models.generation.interfaces import GenerationConfig +from nemo_reinforcer.models.generation.interfaces import configure_generation_config def parse_args(): @@ -50,9 +51,7 @@ def parse_args(): return args, overrides -def setup_data( - data_config: MathDataConfig, generation_config: GenerationConfig, env_configs -): +def setup_data(tokenizer: AutoTokenizer, data_config: MathDataConfig, env_configs): print("\n▶ Setting up data...") math_task_spec = TaskDataSpec( task_name="math", @@ -73,10 +72,6 @@ def setup_data( }, ) - tokenizer = AutoTokenizer.from_pretrained(generation_config["model_name"]) - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - math_env = MathEnvironment.options( runtime_env={"py_executable": MathEnvironment.DEFAULT_PY_EXECUTABLE} ).remote(env_configs["math"]) @@ -118,12 +113,18 @@ def main(): # Init ray init_ray() + # Setup tokenizer + tokenizer = get_tokenizer(config["generation"]["model_name"]) + config["generation"] = configure_generation_config( + config["generation"], tokenizer, is_eval=True + ) + # Setup data ( dataset, math_env, tokenizer, - ) = setup_data(config["data"], config["generation"], config["env"]) + ) = setup_data(tokenizer, config["data"], config["env"]) # Setup ( diff --git a/examples/run_grpo_math.py b/examples/run_grpo_math.py index ea13bcffb4..c6dea5609c 100644 --- a/examples/run_grpo_math.py +++ b/examples/run_grpo_math.py @@ -18,19 +18,18 @@ from collections import defaultdict from typing import Any, Dict -from datasets import load_dataset from omegaconf import OmegaConf from transformers import AutoTokenizer from nemo_reinforcer.algorithms.grpo import MasterConfig, grpo_train, setup from nemo_reinforcer.algorithms.utils import get_tokenizer from nemo_reinforcer.data import DataConfig -from nemo_reinforcer.data.datasets import AllTaskProcessedDataset, rl_collate_fn +from nemo_reinforcer.data.datasets import AllTaskProcessedDataset from nemo_reinforcer.data.hf_datasets.openmathinstruct2 import OpenMathInstruct2Dataset from nemo_reinforcer.data.interfaces import DatumSpec, LLMMessageLogType, TaskDataSpec from nemo_reinforcer.distributed.virtual_cluster import init_ray from nemo_reinforcer.environments.math_environment import MathEnvironment -from nemo_reinforcer.models.policy import PolicyConfig +from nemo_reinforcer.models.generation.interfaces import configure_generation_config from nemo_reinforcer.utils.config import load_config, parse_hydra_overrides from nemo_reinforcer.utils.logger import get_next_experiment_dir @@ -173,7 +172,7 @@ def math_data_processor( return output -def setup_data(data_config: DataConfig, policy_config: PolicyConfig, env_configs): +def setup_data(tokenizer: AutoTokenizer, data_config: DataConfig, env_configs): print("\n▶ Setting up data...") math_task_spec = TaskDataSpec( task_name="math", @@ -188,8 +187,6 @@ def setup_data(data_config: DataConfig, policy_config: PolicyConfig, env_configs else: raise ValueError(f"No processor for dataset {data_config['dataset_name']}.") - tokenizer = get_tokenizer(policy_config["model_name"]) - task_data_processors = defaultdict( lambda: (math_task_spec, openinstructmath2_data_processor) ) @@ -219,7 +216,7 @@ def setup_data(data_config: DataConfig, policy_config: PolicyConfig, env_configs task_to_env = defaultdict(lambda: math_env) task_to_env["math"] = math_env - return dataset, val_dataset, task_to_env, task_to_env, tokenizer + return dataset, val_dataset, task_to_env, task_to_env def main(): @@ -256,10 +253,20 @@ def main(): init_ray() - # setup data - dataset, val_dataset, task_to_env, val_task_to_env, tokenizer = setup_data( - config["data"], config["policy"], config["env"] + # setup tokenizer + tokenizer = get_tokenizer(config["policy"]["model_name"]) + config["policy"]["generation"] = configure_generation_config( + config["policy"]["generation"], tokenizer ) + + # setup data + ( + dataset, + val_dataset, + task_to_env, + val_task_to_env, + ) = setup_data(tokenizer, config["data"], config["env"]) + ( policy, policy_generation, @@ -272,6 +279,7 @@ def main(): grpo_state, master_config, ) = setup(config, tokenizer, dataset, val_dataset) + grpo_train( policy, policy_generation, diff --git a/examples/run_sft.py b/examples/run_sft.py index d09d949ce3..00d2a93900 100644 --- a/examples/run_sft.py +++ b/examples/run_sft.py @@ -18,17 +18,17 @@ from typing import Dict, Any from omegaconf import OmegaConf +from transformers import AutoTokenizer from nemo_reinforcer.algorithms.sft import MasterConfig, sft_train, setup from nemo_reinforcer.algorithms.utils import get_tokenizer -from nemo_reinforcer.distributed.virtual_cluster import init_ray -from nemo_reinforcer.utils.config import load_config -from nemo_reinforcer.utils.logger import get_next_experiment_dir from nemo_reinforcer.data import DataConfig, hf_datasets from nemo_reinforcer.data.datasets import AllTaskProcessedDataset from nemo_reinforcer.data.interfaces import TaskDataSpec, DatumSpec from nemo_reinforcer.data.llm_message_utils import get_formatted_message_log -from nemo_reinforcer.models.policy import PolicyConfig +from nemo_reinforcer.distributed.virtual_cluster import init_ray +from nemo_reinforcer.utils.config import load_config +from nemo_reinforcer.utils.logger import get_next_experiment_dir def parse_args(): @@ -83,7 +83,7 @@ def sft_preprocessor( return output -def setup_data(data_config: DataConfig, policy_config: PolicyConfig): +def setup_data(tokenizer: AutoTokenizer, data_config: DataConfig): print("\n▶ Setting up data...") data_cls = data_config["dataset_name"] if data_cls == "open_assistant": @@ -100,8 +100,6 @@ def setup_data(data_config: DataConfig, policy_config: PolicyConfig): val_dataset = data.formatted_ds["validation"] sft_task_spec = data.task_spec - tokenizer = get_tokenizer(policy_config["model_name"]) - train_dataset = AllTaskProcessedDataset( train_dataset, tokenizer, @@ -118,7 +116,7 @@ def setup_data(data_config: DataConfig, policy_config: PolicyConfig): max_seq_length=data_config["max_input_seq_length"], ) - return train_dataset, val_dataset, tokenizer, sft_task_spec + return train_dataset, val_dataset, sft_task_spec def main(): @@ -152,10 +150,16 @@ def main(): init_ray() + # setup tokenizer + tokenizer = get_tokenizer(config["policy"]["model_name"]) + # setup data - dataset, val_dataset, tokenizer, sft_task_spec = setup_data( - config["data"], config["policy"] - ) + ( + dataset, + val_dataset, + sft_task_spec, + ) = setup_data(tokenizer, config["data"]) + ( policy, cluster, @@ -167,6 +171,7 @@ def main(): sft_save_state, master_config, ) = setup(config, dataset, val_dataset) + sft_train( policy, train_dataloader, diff --git a/nemo_reinforcer/algorithms/grpo.py b/nemo_reinforcer/algorithms/grpo.py index d49dc32418..6ea5a56dd6 100644 --- a/nemo_reinforcer/algorithms/grpo.py +++ b/nemo_reinforcer/algorithms/grpo.py @@ -220,11 +220,6 @@ def setup( # vllm model loading prefers clean environment, initialize policy_generation before policy (#52 will fix this) backend = generation_config["backend"] generation_config["model_name"] = policy_config["model_name"] # Needed for vLLM - generation_config["vllm_cfg"]["skip_tokenizer_init"] = True - # When https://github.com/NVIDIA/reinforcer/issues/57 is fixed, we should update stop_token_ids below. - generation_config["stop_token_ids"] = [tokenizer.eos_token_id] - generation_config["pad_token"] = tokenizer.pad_token_id - generation_config["vllm_cfg"]["load_format"] = "dummy" if backend == "hf": policy_generation = None diff --git a/nemo_reinforcer/evals/eval.py b/nemo_reinforcer/evals/eval.py index 33d486a4d5..a1a4cad74b 100644 --- a/nemo_reinforcer/evals/eval.py +++ b/nemo_reinforcer/evals/eval.py @@ -105,12 +105,6 @@ def setup( backend = generation_config["backend"] assert backend == "vllm", "Only vLLM backend is supported for evaluation" - # set vllm config - generation_config["vllm_cfg"]["load_format"] = "auto" - generation_config["vllm_cfg"]["skip_tokenizer_init"] = False - generation_config["stop_token_ids"] = [tokenizer.eos_token_id] - generation_config["pad_token"] = tokenizer.pad_token_id - # initialize vllm generation vllm_generation = VllmGeneration(cluster=cluster, config=generation_config) print( diff --git a/nemo_reinforcer/models/generation/interfaces.py b/nemo_reinforcer/models/generation/interfaces.py index da7e737784..6aeaa06d7c 100644 --- a/nemo_reinforcer/models/generation/interfaces.py +++ b/nemo_reinforcer/models/generation/interfaces.py @@ -15,6 +15,8 @@ from typing import Any, TypedDict, Union, Tuple, List import torch +from transformers import AutoTokenizer + from nemo_reinforcer.distributed.batched_data_dict import BatchedDataDict @@ -107,7 +109,28 @@ class GenerationConfig(TypedDict): top_k: int model_name: str stop_token_ids: List[int] - pad_token: int + pad_token_id: int + + +def configure_generation_config( + config: GenerationConfig, tokenizer: AutoTokenizer, is_eval=False +): + """Apply specific configurations to generation config.""" + # tokenizer setting + config["pad_token_id"] = tokenizer.pad_token_id + # When https://github.com/NVIDIA/reinforcer/issues/57 is fixed, we should update stop_token_ids below. + config["stop_token_ids"] = [tokenizer.eos_token_id] + + # vllm setting + if config["backend"] == "vllm": + if is_eval: + config["vllm_cfg"]["skip_tokenizer_init"] = False + config["vllm_cfg"]["load_format"] = "auto" + else: + config["vllm_cfg"]["skip_tokenizer_init"] = True + config["vllm_cfg"]["load_format"] = "dummy" + + return config class GenerationDatumSpec(TypedDict): diff --git a/nemo_reinforcer/models/generation/vllm.py b/nemo_reinforcer/models/generation/vllm.py index 7cfd3b0f7f..3f8528f549 100644 --- a/nemo_reinforcer/models/generation/vllm.py +++ b/nemo_reinforcer/models/generation/vllm.py @@ -218,7 +218,7 @@ def generate( f"input_ids and input_lengths must be present in the BatchedDataDict, got keys: {data.keys()}" ) is_right_padded, error_msg = verify_right_padding( - data, pad_value=self.cfg["pad_token"] + data, pad_value=self.cfg["pad_token_id"] ) if not is_right_padded: warnings.warn( @@ -282,7 +282,7 @@ def generate( # Create a new tensor with the right size and fill with padding token full_output = torch.full( - (total_length,), self.cfg["pad_token"], dtype=input_ids.dtype + (total_length,), self.cfg["pad_token_id"], dtype=input_ids.dtype ) # Copy original input (with padding) into the beginning @@ -517,7 +517,7 @@ def generate( # Combine results from all tied worker groups combined = BatchedDataDict.from_batches( - results, pad_value_dict={"output_ids": self.cfg["pad_token"]} + results, pad_value_dict={"output_ids": self.cfg["pad_token_id"]} ) # Verify the output has all required fields @@ -560,7 +560,7 @@ def generate_text( # Combine results from all tied worker groups combined = BatchedDataDict.from_batches( - results, pad_value_dict={"output_ids": self.cfg["pad_token"]} + results, pad_value_dict={"output_ids": self.cfg["pad_token_id"]} ) # Verify the output has all required fields diff --git a/nemo_reinforcer/models/policy/hf_policy.py b/nemo_reinforcer/models/policy/hf_policy.py index f1234e7fa2..3f3f16660d 100644 --- a/nemo_reinforcer/models/policy/hf_policy.py +++ b/nemo_reinforcer/models/policy/hf_policy.py @@ -519,7 +519,9 @@ def generate( batch_size, seq_len = input_ids.shape # Convert right padding to left padding - left_padded_input_ids = torch.full_like(input_ids, gen_cfg["pad_token"]) + left_padded_input_ids = torch.full_like( + input_ids, gen_cfg["pad_token_id"] + ) left_padded_attention_mask = torch.zeros( (batch_size, seq_len), dtype=torch.long, device=input_ids.device ) @@ -572,7 +574,7 @@ def generate( return_data = BatchedDataDict.from_batches( micro_batches, pad_value_dict={ - "left_padded_output_ids": self.cfg["generation"]["pad_token"] + "left_padded_output_ids": self.cfg["generation"]["pad_token_id"] }, ) @@ -588,7 +590,7 @@ def generate( ) right_padded_output_ids = torch.full( (batch_size, max_seq_len), - self.cfg["generation"]["pad_token"], + self.cfg["generation"]["pad_token_id"], dtype=return_data["left_padded_output_ids"][0].dtype, device=return_data["left_padded_output_ids"][0].device, ) @@ -1024,7 +1026,7 @@ def generate( ) result = BatchedDataDict.from_batches( self.worker_group.get_all_worker_results(futures), - pad_value_dict={"output_ids": self.cfg["generation"]["pad_token"]}, + pad_value_dict={"output_ids": self.cfg["generation"]["pad_token_id"]}, ) # Verify the output has all required fields diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py index 11e12d8ee9..259732cc33 100644 --- a/tests/unit/models/generation/test_vllm_generation.py +++ b/tests/unit/models/generation/test_vllm_generation.py @@ -20,6 +20,7 @@ from nemo_reinforcer.algorithms.utils import get_tokenizer from nemo_reinforcer.distributed.virtual_cluster import RayVirtualCluster from nemo_reinforcer.distributed.batched_data_dict import BatchedDataDict +from nemo_reinforcer.models.generation.interfaces import configure_generation_config from nemo_reinforcer.models.generation.vllm import VllmGeneration, VllmConfig @@ -40,19 +41,6 @@ } -def configure_vllm_with_tokenizer(vllm_config, tokenizer, is_eval=False): - """Apply tokenizer-specific configurations to vLLM config.""" - if is_eval: - vllm_config["vllm_cfg"]["skip_tokenizer_init"] = False - vllm_config["vllm_cfg"]["load_format"] = "auto" - else: - vllm_config["vllm_cfg"]["skip_tokenizer_init"] = True - vllm_config["vllm_cfg"]["load_format"] = "dummy" - vllm_config["pad_token"] = tokenizer.pad_token_id - vllm_config["stop_token_ids"] = [tokenizer.eos_token_id] - return vllm_config - - @pytest.fixture(scope="module") def check_vllm_available(): """Skip tests if vLLM is not installed.""" @@ -90,7 +78,7 @@ def policy(cluster, tokenizer, check_vllm_available): """Initialize the vLLM policy.""" # Create separate configs for each policy vllm_config = basic_vllm_test_config.copy() - vllm_config = configure_vllm_with_tokenizer(vllm_config, tokenizer) + vllm_config = configure_generation_config(vllm_config, tokenizer) policy = VllmGeneration(cluster, vllm_config) yield policy @@ -210,7 +198,7 @@ def test_vllm_generation_with_hf_training(cluster, tokenizer): # Create separate configs for each policy vllm_config = basic_vllm_test_config.copy() - vllm_config = configure_vllm_with_tokenizer(vllm_config, tokenizer) + vllm_config = configure_generation_config(vllm_config, tokenizer) # Create HF-specific config with required parameters hf_config = { @@ -399,8 +387,8 @@ def test_vllm_policy_tensor_parallel(cluster, tokenizer): """Test vLLM policy with tensor parallelism > 1.""" # Configure with tensor_parallel_size=2 tp_config = basic_vllm_test_config.copy() - tp_config = configure_vllm_with_tokenizer(tp_config, tokenizer) - tp_config["tensor_parallel_size"] = 2 + tp_config = configure_generation_config(tp_config, tokenizer) + tp_config["vllm_cfg"]["tensor_parallel_size"] = 2 # Ensure we specify the distributed executor backend tp_config["vllm_kwargs"] = {"distributed_executor_backend": "ray"} @@ -463,7 +451,7 @@ def test_vllm_generate_text(cluster, tokenizer): # Create separate configs for each policy vllm_config = basic_vllm_test_config.copy() - vllm_config = configure_vllm_with_tokenizer(vllm_config, tokenizer, is_eval=True) + vllm_config = configure_generation_config(vllm_config, tokenizer, is_eval=True) # Ensure we can get same output assert vllm_config["model_name"] == "meta-llama/Llama-3.2-1B", ( @@ -497,7 +485,7 @@ def test_vllm_weight_update_and_prefix_cache_reset( # Create configs vllm_config = basic_vllm_test_config.copy() - vllm_config = configure_vllm_with_tokenizer(vllm_config, tokenizer, is_eval=True) + vllm_config = configure_generation_config(vllm_config, tokenizer, is_eval=True) vllm_config["vllm_cfg"]["tensor_parallel_size"] = tensor_parallel_size if tensor_parallel_size > 1: vllm_config["vllm_kwargs"] = {"distributed_executor_backend": "ray"} diff --git a/tests/unit/models/policy/test_hf_ray_policy.py b/tests/unit/models/policy/test_hf_ray_policy.py index dc570ba66a..c010450c22 100644 --- a/tests/unit/models/policy/test_hf_ray_policy.py +++ b/tests/unit/models/policy/test_hf_ray_policy.py @@ -16,12 +16,13 @@ import pprint import torch +from nemo_reinforcer.algorithms.interfaces import LossFunction from nemo_reinforcer.algorithms.utils import get_tokenizer -from nemo_reinforcer.models.policy import PolicyConfig -from nemo_reinforcer.models.policy.hf_policy import HfPolicy from nemo_reinforcer.distributed.virtual_cluster import RayVirtualCluster from nemo_reinforcer.distributed.batched_data_dict import BatchedDataDict -from nemo_reinforcer.algorithms.interfaces import LossFunction +from nemo_reinforcer.models.generation.interfaces import configure_generation_config +from nemo_reinforcer.models.policy import PolicyConfig +from nemo_reinforcer.models.policy.hf_policy import HfPolicy from tests.unit.test_utils import simple_loss, nll_loss @@ -92,8 +93,7 @@ def policy_setup(tokenizer): ) config = basic_llama_test_config - config["generation"]["pad_token"] = tokenizer.pad_token_id - config["generation"]["stop_token_ids"] = [tokenizer.eos_token_id] + config["generation"] = configure_generation_config(config["generation"], tokenizer) print("Creating HfPolicy...") policy = HfPolicy(cluster=cluster, config=config) @@ -308,8 +308,9 @@ def generation_setup(request, tokenizer): ) config = basic_llama_test_config - config["generation"]["pad_token"] = tokenizer.pad_token_id - config["generation"]["stop_token_ids"] = [tokenizer.eos_token_id] + config["generation"] = configure_generation_config( + config["generation"], tokenizer + ) print("Creating generation HfPolicy...") policy = HfPolicy( From 051a5bfd377c6284213353d2dc3a0d794b80a7e1 Mon Sep 17 00:00:00 2001 From: Yuki Huang Date: Thu, 3 Apr 2025 04:04:36 +0000 Subject: [PATCH 3/5] add test Signed-off-by: Yuki Huang --- .../models/generation/test_vllm_generation.py | 16 +++++++++++++++- tests/unit/models/policy/test_hf_ray_policy.py | 4 ++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py index 259732cc33..e0074e5c19 100644 --- a/tests/unit/models/generation/test_vllm_generation.py +++ b/tests/unit/models/generation/test_vllm_generation.py @@ -237,6 +237,17 @@ def test_vllm_generation_with_hf_training(cluster, tokenizer): "Where is the sun?", ] + expected_generations = [ + "Write a story about a magical forest. The forest is magical because it is full of", + "Explain how photosynthesis works\nExplain how photosynthesis works\nPhotosynthesis", + "What are the benefits of exercise? The benefits of exercise are many and varied. It", + "Describe the water cycle in your own words.\nDescribe the water cycle in", + "What is the capital of France? A. Paris B. New York C. Washington", + "Who is the president of the USA? Who is the president of the USA? Who is", + "What is the capital of the moon? A. Houston, Texas B. New York City", + "Where is the sun? Where is the moon? Where is the earth?", + ] + # Tokenize the prompts the same way as in test_hf_ray_policy tokenized = tokenizer( prompts, @@ -271,7 +282,7 @@ def test_vllm_generation_with_hf_training(cluster, tokenizer): # Step 1: Use vLLM for generation print("Using vLLM policy for fast generation...") - generation_results = vllm_policy.generate(test_input_data) + generation_results = vllm_policy.generate(test_input_data, greedy=True) vllm_policy.finish_generation() # Validate generation outputs assert "output_ids" in generation_results, ( @@ -286,6 +297,9 @@ def test_vllm_generation_with_hf_training(cluster, tokenizer): generation_results["output_ids"], skip_special_tokens=True ) print(f"vLLM generated texts: {generated_texts}") + assert generated_texts == expected_generations, ( + "Output should be the same as the expected output" + ) # Run logprob calculation with HF policy to verify diff --git a/tests/unit/models/policy/test_hf_ray_policy.py b/tests/unit/models/policy/test_hf_ray_policy.py index c010450c22..76926960cf 100644 --- a/tests/unit/models/policy/test_hf_ray_policy.py +++ b/tests/unit/models/policy/test_hf_ray_policy.py @@ -397,6 +397,10 @@ def test_hf_policy_generation(generation_setup, tokenizer, tracker): # Verify results assert "output_ids" in results, "Generation results should contain 'output_ids'" output_ids = results["output_ids"] + generated_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + assert generated_texts == expected_generations, ( + "Output should be the same as the expected output" + ) # run logprob calculation manually to verify fprop_logprob_data = BatchedDataDict( From 56c88b8b0bdea9370216586671dc63ade8f48f34 Mon Sep 17 00:00:00 2001 From: Yuki Huang Date: Fri, 4 Apr 2025 02:51:54 +0000 Subject: [PATCH 4/5] minor update of get_tokenizer Signed-off-by: Yuki Huang --- docs/design_docs/generation.md | 18 ++++++++---------- .../models/generation/interfaces.py | 4 ++-- nemo_reinforcer/models/policy/hf_policy.py | 8 +++----- 3 files changed, 13 insertions(+), 17 deletions(-) diff --git a/docs/design_docs/generation.md b/docs/design_docs/generation.md index 8dda8a028a..84f450c7cc 100644 --- a/docs/design_docs/generation.md +++ b/docs/design_docs/generation.md @@ -95,26 +95,20 @@ The {py:class}`UpdatableVllmInternalWorker Date: Fri, 4 Apr 2025 02:52:22 +0000 Subject: [PATCH 5/5] fix copy in test Signed-off-by: Yuki Huang --- tests/unit/models/generation/test_vllm_generation.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py index e0074e5c19..ed90267d10 100644 --- a/tests/unit/models/generation/test_vllm_generation.py +++ b/tests/unit/models/generation/test_vllm_generation.py @@ -12,10 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +from copy import deepcopy + import pytest import torch import ray -import numpy as np from nemo_reinforcer.algorithms.utils import get_tokenizer from nemo_reinforcer.distributed.virtual_cluster import RayVirtualCluster @@ -400,7 +401,7 @@ def test_vllm_generation_with_hf_training(cluster, tokenizer): def test_vllm_policy_tensor_parallel(cluster, tokenizer): """Test vLLM policy with tensor parallelism > 1.""" # Configure with tensor_parallel_size=2 - tp_config = basic_vllm_test_config.copy() + tp_config = deepcopy(basic_vllm_test_config) tp_config = configure_generation_config(tp_config, tokenizer) tp_config["vllm_cfg"]["tensor_parallel_size"] = 2 @@ -498,7 +499,7 @@ def test_vllm_weight_update_and_prefix_cache_reset( from nemo_reinforcer.models.policy.hf_policy import HfPolicy # Create configs - vllm_config = basic_vllm_test_config.copy() + vllm_config = deepcopy(basic_vllm_test_config) vllm_config = configure_generation_config(vllm_config, tokenizer, is_eval=True) vllm_config["vllm_cfg"]["tensor_parallel_size"] = tensor_parallel_size if tensor_parallel_size > 1: