From 4bc5bfd71ef7b6e2cd179526fecd04b898a86e71 Mon Sep 17 00:00:00 2001 From: KiddoZhu Date: Tue, 6 May 2025 12:01:34 -0700 Subject: [PATCH 1/7] code execution + tool use + basic blockers for filesystems & modules Signed-off-by: KiddoZhu --- .gitignore | 8 +- nemo_rl/algorithms/dpo.py | 12 +- nemo_rl/algorithms/grpo.py | 7 + nemo_rl/algorithms/sft.py | 10 + nemo_rl/data/llm_message_utils.py | 6 - nemo_rl/environments/math_environment.py | 23 +- nemo_rl/experience/rollouts.py | 4 +- nemo_rl/models/dtensor/parallelize.py | 27 +- nemo_rl/models/generation/vllm.py | 2 + nemo_rl/models/policy/__init__.py | 2 +- .../models/policy/dtensor_policy_worker.py | 15 +- nemo_rl/models/policy/fsdp1_policy_worker.py | 46 ++- nemo_rl/models/policy/hf_policy.py | 4 + nemo_rl/tools/__init__.py | 0 nemo_rl/tools/generation.py | 236 ++++++++++++ nemo_rl/tools/interfaces.py | 20 + nemo_rl/tools/tools.py | 199 ++++++++++ nemo_rl/utils/checkpoint.py | 7 +- nemo_rl/utils/native_checkpoint.py | 44 ++- tests/functional/dpo.sh | 2 +- tests/test_suites/README.md | 7 +- tests/test_suites/nightly.txt | 10 +- tests/test_suites/release.txt | 4 +- tests/unit/conftest.py | 31 -- tests/unit/data/test_llm_message_utils.py | 35 -- tests/unit/experience/test_rollouts.py | 40 -- .../models/generation/test_vllm_generation.py | 5 +- .../unit/models/policy/test_dtensor_worker.py | 6 - tests/unit/test_recipes_and_test_suites.py | 96 +---- tests/unit/tools/test_tools.py | 351 ++++++++++++++++++ tests/unit/utils/test_checkpoint.py | 5 +- tests/unit/utils/test_native_checkpoint.py | 104 +++++- 32 files changed, 1036 insertions(+), 332 deletions(-) create mode 100644 nemo_rl/tools/__init__.py create mode 100644 nemo_rl/tools/generation.py create mode 100644 nemo_rl/tools/interfaces.py create mode 100644 nemo_rl/tools/tools.py create mode 100644 tests/unit/tools/test_tools.py diff --git a/.gitignore b/.gitignore index 12121a4155..46efa31b70 100644 --- a/.gitignore +++ b/.gitignore @@ -15,12 +15,14 @@ apidocs/ dist/ *.egg-info/ *.vscode/ -release_run* -ckpts/ # Test coverage.json .coverage* +unit_results.json +unit_results/ +release_run* +ckpts/ test_assets/ # Cache @@ -33,4 +35,4 @@ docker/ wandb/ checkpoints/ results/ -code_snapshots/ +code_snapshots/ \ No newline at end of file diff --git a/nemo_rl/algorithms/dpo.py b/nemo_rl/algorithms/dpo.py index 0647f0cd5a..dd6607ef9d 100644 --- a/nemo_rl/algorithms/dpo.py +++ b/nemo_rl/algorithms/dpo.py @@ -446,6 +446,14 @@ def dpo_train( % master_config["checkpointing"]["save_period"] == 0 ): # +1 because step is 0-indexed + is_last_checkpoint = ( + min( + len(train_dataloader) * max_num_epochs, + master_config["dpo"]["max_num_steps"], + ) + - (total_steps + 1) + < master_config["checkpointing"]["save_period"] + ) dpo_save_state["step"] = (current_step + 1) % len(train_dataloader) dpo_save_state["total_steps"] = total_steps + 1 dpo_save_state["epoch"] = current_epoch @@ -462,9 +470,7 @@ def dpo_train( optimizer_path=os.path.join( checkpoint_path, "policy", "optimizer" ), - tokenizer_path=os.path.join( - checkpoint_path, "policy", "tokenizer" - ), + save_hf=is_last_checkpoint, ) torch.save( train_dataloader.state_dict(), diff --git a/nemo_rl/algorithms/grpo.py b/nemo_rl/algorithms/grpo.py index 5a007451d0..952a6c172a 100644 --- a/nemo_rl/algorithms/grpo.py +++ b/nemo_rl/algorithms/grpo.py @@ -524,6 +524,12 @@ def grpo_train( ): # +1 because step is 0-indexed policy.prepare_for_training() + is_last_checkpoint = ( + min(len(dataloader), master_config["grpo"]["max_num_steps"]) + - (step + 1) + < 
master_config["checkpointing"]["save_period"] + ) + grpo_save_state["step"] = step + 1 grpo_save_state["val_reward"] = val_metrics["accuracy"] grpo_save_state["consumed_samples"] = consumed_samples @@ -540,6 +546,7 @@ def grpo_train( tokenizer_path=os.path.join( checkpoint_path, "policy", "tokenizer" ), + save_hf=is_last_checkpoint, ) torch.save( dataloader.state_dict(), diff --git a/nemo_rl/algorithms/sft.py b/nemo_rl/algorithms/sft.py index d10c3df483..8b5ffcddfd 100644 --- a/nemo_rl/algorithms/sft.py +++ b/nemo_rl/algorithms/sft.py @@ -447,6 +447,15 @@ def sft_train( % master_config["checkpointing"]["save_period"] == 0 ): # +1 because step is 0-indexed + is_last_checkpoint = ( + min( + len(train_dataloader) * max_num_epochs, + master_config["sft"]["max_num_steps"], + ) + - (total_steps + 1) + < master_config["checkpointing"]["save_period"] + ) + sft_save_state["step"] = (current_step + 1) % len(train_dataloader) sft_save_state["total_steps"] = total_steps + 1 sft_save_state["epoch"] = current_epoch @@ -467,6 +476,7 @@ def sft_train( tokenizer_path=os.path.join( checkpoint_path, "policy", "tokenizer" ), + save_hf=is_last_checkpoint, ) torch.save( train_dataloader.state_dict(), diff --git a/nemo_rl/data/llm_message_utils.py b/nemo_rl/data/llm_message_utils.py index 51cd5a279d..f2d24fc421 100644 --- a/nemo_rl/data/llm_message_utils.py +++ b/nemo_rl/data/llm_message_utils.py @@ -421,12 +421,6 @@ def get_formatted_message_log( new_message["token_ids"] = tokenizer( message_chunk, return_tensors="pt", add_special_tokens=False )["input_ids"][0] - if len(new_message["token_ids"]) == 0: - # if there is an empty message, the empty `token_ids` tensor ends up being in fp32, - # which causes `_validate_tensor_consistency` to fail. To fix this, we convert the - # empty tensor to int64. 
- new_message["token_ids"] = new_message["token_ids"].to(torch.int64) - new_message["content"] = message_chunk new_message_log.append(new_message) diff --git a/nemo_rl/environments/math_environment.py b/nemo_rl/environments/math_environment.py index fd968298b0..8da0528652 100644 --- a/nemo_rl/environments/math_environment.py +++ b/nemo_rl/environments/math_environment.py @@ -15,8 +15,7 @@ import ray import torch -from math_verify.metric import math_metric -from math_verify.parser import ExprExtractionConfig, LatexExtractionConfig +from math_verify import parse, verify from nemo_rl.distributed.batched_data_dict import BatchedDataDict from nemo_rl.distributed.virtual_cluster import PY_EXECUTABLES @@ -54,23 +53,9 @@ def verify( results = [] for response, ground_truth in zip(pred_responses, ground_truths): try: - # Use Latex and plain math extraction from predictions - # https://github.com/huggingface/Math-Verify?tab=readme-ov-file#extraction-targets - verify_func = math_metric( - gold_extraction_target=(LatexExtractionConfig(),), - pred_extraction_target=( - ExprExtractionConfig(), - LatexExtractionConfig(), - ), - ) - - ground_truth_parsable = "\\boxed{" + ground_truth + "}" - try: - ret_score, _ = verify_func([ground_truth_parsable], [response]) - except Exception: - ret_score = 0.0 - - results.append(float(ret_score)) + gold = parse(ground_truth) + pred = parse(response[-100:]) # avoid looking at the whole string + results.append(float(verify(gold, pred))) except Exception: results.append(0) return results diff --git a/nemo_rl/experience/rollouts.py b/nemo_rl/experience/rollouts.py index 567add0dfc..a556a32a42 100644 --- a/nemo_rl/experience/rollouts.py +++ b/nemo_rl/experience/rollouts.py @@ -311,9 +311,7 @@ def run_multi_turn_rollout( >= max_seq_len ): # truncate - tokenized_obs = tokenized_obs[ - : max_seq_len - (len(generated_ids[i]) + active_input_lengths[i]) - ] + tokenized_obs = tokenized_obs[: max_seq_len - active_input_lengths[i]] truncation_mask[i] = True # Record truncation sample_truncated[active_indices[i]] = True diff --git a/nemo_rl/models/dtensor/parallelize.py b/nemo_rl/models/dtensor/parallelize.py index 5998937cc9..3ae86d70cc 100644 --- a/nemo_rl/models/dtensor/parallelize.py +++ b/nemo_rl/models/dtensor/parallelize.py @@ -30,7 +30,6 @@ from torch.distributed.tensor.placement_types import Replicate, Shard from transformers.models.llama.modeling_llama import LlamaForCausalLM from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM -from transformers.models.qwen3.modeling_qwen3 import Qwen3ForCausalLM from nemo_rl.distributed.model_utils import from_parallel_logits_to_logprobs @@ -99,7 +98,7 @@ def _parallelize_llama( def _parallelize_qwen( - model: Union[Qwen2ForCausalLM, Qwen3ForCausalLM], + model: Qwen2ForCausalLM, dp_mesh: DeviceMesh, tp_mesh: DeviceMesh, mp_policy: MixedPrecisionPolicy, @@ -109,7 +108,7 @@ def _parallelize_qwen( ): """Parallelizes a Qwen2ForCausalLM model across data and tensor parallel dimensions.""" - class QwenRotaryEmbedParallel(SequenceParallel): + class Qwen2RotaryEmbedParallel(SequenceParallel): """Custom SequenceParallel class for Qwen2 rotary embeddings because the input is a tuple.""" @staticmethod @@ -142,23 +141,6 @@ def _prepare_input_fn(sequence_sharding, mod, inputs, device_mesh): return type(inputs)(new_inputs) - class Qwen3QKNorm(SequenceParallel): - @staticmethod - def _prepare_input_fn(sequence_sharding, mod, inputs, device_mesh): - input_tensor = inputs[0] - - if isinstance(input_tensor, DTensor): - assert 
input_tensor.placements == (Shard(dim=2),) - elif isinstance(input_tensor, torch.Tensor): - # assume the input passed in already sharded on the sequence dim and create the DTensor - return DTensor.from_local( - input_tensor, device_mesh, sequence_sharding, run_check=False - ) - else: - raise ValueError( - f"expecting input of {mod} to be a torch.Tensor or DTensor, but got {input_tensor}" - ) - if tp_mesh.size() > 1: assert not model.config.tie_word_embeddings, ( "Tie word embeddings not supported when TP is enabled" @@ -174,7 +156,7 @@ def _prepare_input_fn(sequence_sharding, mod, inputs, device_mesh): input_layouts=Replicate(), output_layouts=Shard(1), ), - "model.rotary_emb": QwenRotaryEmbedParallel(), + "model.rotary_emb": Qwen2RotaryEmbedParallel(), "model.norm": SequenceParallel(), "model.layers.*.input_layernorm": SequenceParallel(), "model.layers.*.self_attn.q_proj": ColwiseParallel( @@ -189,8 +171,6 @@ def _prepare_input_fn(sequence_sharding, mod, inputs, device_mesh): "model.layers.*.self_attn.o_proj": RowwiseParallel( output_layouts=Shard(1) ), - "model.layers.*.self_attn.q_norm": Qwen3QKNorm(), - "model.layers.*.self_attn.k_norm": Qwen3QKNorm(), "model.layers.*.post_attention_layernorm": SequenceParallel(), "model.layers.*.mlp.up_proj": ColwiseParallel(), "model.layers.*.mlp.gate_proj": ColwiseParallel(), @@ -234,7 +214,6 @@ def _prepare_input_fn(sequence_sharding, mod, inputs, device_mesh): PARALLIZE_FUNCTIONS = { Qwen2ForCausalLM: _parallelize_qwen, - Qwen3ForCausalLM: _parallelize_qwen, LlamaForCausalLM: _parallelize_llama, } diff --git a/nemo_rl/models/generation/vllm.py b/nemo_rl/models/generation/vllm.py index 59fcc26320..4128f6a9cc 100644 --- a/nemo_rl/models/generation/vllm.py +++ b/nemo_rl/models/generation/vllm.py @@ -273,6 +273,7 @@ def generate( # Read generation parameters from config top_k = self.cfg["top_k"] if self.cfg["top_k"] is not None else -1 + sampling_params = self.SamplingParams( temperature=self.cfg["temperature"] if not greedy else 0, top_p=self.cfg["top_p"], @@ -390,6 +391,7 @@ def generate_text( # Read generation parameters from config top_k = self.cfg["top_k"] if self.cfg["top_k"] is not None else -1 + sampling_params = self.SamplingParams( temperature=self.cfg["temperature"] if not greedy else 0, top_p=self.cfg["top_p"], diff --git a/nemo_rl/models/policy/__init__.py b/nemo_rl/models/policy/__init__.py index fbe728a840..47714fb0f5 100644 --- a/nemo_rl/models/policy/__init__.py +++ b/nemo_rl/models/policy/__init__.py @@ -37,7 +37,7 @@ class PolicyConfig(TypedDict): train_micro_batch_size: int learning_rate: float logprob_batch_size: int - generation: Optional[GenerationConfig] + generation: GenerationConfig precision: str dtensor_cfg: DTensorConfig make_sequence_length_divisible_by: int diff --git a/nemo_rl/models/policy/dtensor_policy_worker.py b/nemo_rl/models/policy/dtensor_policy_worker.py index c99110d7e7..29ecd46452 100644 --- a/nemo_rl/models/policy/dtensor_policy_worker.py +++ b/nemo_rl/models/policy/dtensor_policy_worker.py @@ -335,10 +335,6 @@ def train( else: logits = outputs.logits - # Divide logits by temperature - if "generation" in self.cfg and self.cfg["generation"] is not None: - logits.div_(self.cfg["generation"]["temperature"]) - loss, loss_metrics = loss_fn(logits, mb) num_valid_samples = loss_metrics["num_valid_samples"] loss_metrics["lr"] = self.optimizer.param_groups[0]["lr"] @@ -375,12 +371,10 @@ def train( # Update parameters self.optimizer.step() + self.scheduler.step() losses.append(torch.tensor(mb_losses).sum().item()) - # 
increment scheduler after all batches in rollout are processed - self.scheduler.step() - # Compute global loss across all ranks with torch.no_grad(): local_loss = torch.tensor(losses, device="cuda") @@ -720,10 +714,13 @@ def save_checkpoint( weights_path: str, optimizer_path: Optional[str] = None, tokenizer_path: Optional[str] = None, + save_torch_dist: bool = True, + save_hf: bool = False, ): """Save a checkpoint of the model. - the optimizer states are saved only if `optimizer` and `optimizer_path` are provided. + the HuggingFace checkpoint is saved only if `save_hf` is True, + and the optimizer states are saved only if `optimizer` and `optimizer_path` are provided. """ save_checkpoint( model=self.model, @@ -733,6 +730,8 @@ def save_checkpoint( optimizer_path=optimizer_path, tokenizer=self.tokenizer if tokenizer_path else None, tokenizer_path=tokenizer_path, + save_torch_dist=save_torch_dist, + save_hf=save_hf, ) def load_checkpoint(self, weights_path: str, optimizer_path: Optional[str] = None): diff --git a/nemo_rl/models/policy/fsdp1_policy_worker.py b/nemo_rl/models/policy/fsdp1_policy_worker.py index 53ec5944f9..bd3951f3a2 100644 --- a/nemo_rl/models/policy/fsdp1_policy_worker.py +++ b/nemo_rl/models/policy/fsdp1_policy_worker.py @@ -290,10 +290,6 @@ def train( else: logits = outputs.logits - # Divide logits by temperature - if "generation" in self.cfg and self.cfg["generation"] is not None: - logits.div_(self.cfg["generation"]["temperature"]) - loss, loss_metrics = loss_fn(logits, mb) num_valid_samples = loss_metrics["num_valid_samples"] loss_metrics["lr"] = self.optimizer.param_groups[0]["lr"] @@ -329,11 +325,9 @@ def train( # Update parameters self.optimizer.step() + self.scheduler.step() losses.append(torch.tensor(mb_losses).sum().item()) - # increment scheduler after all batches in rollout are processed - self.scheduler.step() - # Compute global loss across all ranks with torch.no_grad(): local_loss = torch.tensor(losses, device="cuda") @@ -634,26 +628,17 @@ def generate( device=return_data["left_padded_output_ids"][0].device, ) - for idx, seq in enumerate(return_data["left_padded_output_ids"]): + for idx, (seq, generated_logprob) in enumerate( + zip( + return_data["left_padded_output_ids"], + return_data["generation_logprobs"], + ) + ): # Get only the generated part (excluding input) original_length = return_data["orig_input_lengths"][idx].item() seq_len = seq.size(0) - # The generated content starts after the left-padded input - generated_part = seq[-(seq_len - input_length) :] - - eos_positions = (generated_part == self.tokenizer.eos_token_id).nonzero( - as_tuple=True - )[0] - # TODO @sahilj: handle different stopping criteria - # Calculate generation length - if len(eos_positions) > 0: - gen_length = ( - eos_positions[0].item() + 1 - ) # +1 to include the EOS token - else: - gen_length = len(generated_part) - + gen_length = (generated_logprob != 0).sum().item() generation_lengths.append(gen_length) valid_length = original_length + gen_length @@ -668,7 +653,7 @@ def generate( ) # Combine with generated part - valid_generated_part = generated_part[:gen_length] + valid_generated_part = seq[input_length : input_length + gen_length] valid_tokens = torch.cat([valid_input_part, valid_generated_part]) # Place at the beginning of the right-padded sequence @@ -916,6 +901,8 @@ def save_checkpoint( weights_path: str, optimizer_path: Optional[str] = None, tokenizer_path: Optional[str] = None, + save_torch_dist: bool = True, + save_hf: bool = False, ): """Save a checkpoint of the model. 
@@ -925,12 +912,19 @@ def save_checkpoint( __0_1.distcp __1_0.distcp ... + weights_path-hf/ + config.json + generation_config.json + model-00001-of-.safetensors + ... + model.safetensors.index.json optimizer_path/ __0_0.distcp __1_0.distcp ... - the optimizer states are saved only if `optimizer` and `optimizer_path` are provided. + the HuggingFace checkpoint is saved only if `save_hf` is True, + and the optimizer states are saved only if `optimizer` and `optimizer_path` are provided. """ save_checkpoint( model=self.model, @@ -940,6 +934,8 @@ def save_checkpoint( optimizer_path=optimizer_path, tokenizer=self.tokenizer if tokenizer_path else None, tokenizer_path=tokenizer_path, + save_torch_dist=save_torch_dist, + save_hf=save_hf, ) def load_checkpoint(self, weights_path: str, optimizer_path: Optional[str] = None): diff --git a/nemo_rl/models/policy/hf_policy.py b/nemo_rl/models/policy/hf_policy.py index 2d2dbf3d4c..2a579e3bcd 100644 --- a/nemo_rl/models/policy/hf_policy.py +++ b/nemo_rl/models/policy/hf_policy.py @@ -307,6 +307,8 @@ def save_checkpoint( weights_path: str, optimizer_path: Optional[str] = None, tokenizer_path: Optional[str] = None, + save_torch_dist: bool = True, + save_hf: bool = False, ): """Save a checkpoint of the model.""" futures = self.worker_group.run_all_workers_single_data( @@ -314,6 +316,8 @@ def save_checkpoint( weights_path, optimizer_path, tokenizer_path, + save_torch_dist, + save_hf, only_on="all_tied_workers", ) ray.get(futures) diff --git a/nemo_rl/tools/__init__.py b/nemo_rl/tools/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/nemo_rl/tools/generation.py b/nemo_rl/tools/generation.py new file mode 100644 index 0000000000..f50dbfe3ae --- /dev/null +++ b/nemo_rl/tools/generation.py @@ -0,0 +1,236 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import re +import warnings +from pprint import pformat +from typing import Dict + +import ray +import torch +from torch.nn.utils.rnn import pad_sequence +from transformers import AutoTokenizer + +from nemo_rl.distributed.batched_data_dict import BatchedDataDict +from nemo_rl.models.generation.interfaces import ( + GenerationDatumSpec, + GenerationInterface, + GenerationOutputSpec, +) +from nemo_rl.tools.interfaces import ToolInterface +from nemo_rl.tools.tools import StatefulCodeExecutor + +LOGIT_INFINITY = 1000 + + +def generate_with_code_and_tools( + policy: GenerationInterface, + input_batch: BatchedDataDict[GenerationDatumSpec], + tokenizer: AutoTokenizer, + execute_code: bool = True, + tool_map: Dict[str, ToolInterface] = {}, + tag: str = "", + result_tag: str = "", + *args, + **kwargs, +) -> BatchedDataDict[GenerationOutputSpec]: + """Generate a batch of data with code execution and tool use. + + All code execution and tool calls in the generation will be executed on-the-fly, + of which the results will be appended to the output. Multiple code execution and tool calls + is supported. 
+
+    This function can be used as a drop-in replacement for `policy.generate()`.
+
+    Args:
+        policy: policy to generate from. Can be either the vLLM or HuggingFace backend
+        input_batch: BatchedDataDict containing input_ids and input_lengths tensors
+        tokenizer: tokenizer from the pretrained model
+        execute_code: whether to execute code
+        tool_map: tools that the model can use
+        tag: XML tag used to detect code snippets
+        result_tag: XML tag used to wrap execution results
+        *args, **kwargs: arguments and keyword arguments accepted by `policy.generate()`
+    """
+    if tool_map and not execute_code:
+        warnings.warn(
+            "Tool use requires code execution, but code execution is disabled. All the tools will be ignored."
+        )
+
+    batch = input_batch.copy()
+    start_tag = tag
+    end_tag = tag.replace("<", "</")
+    result_start = result_tag
+    result_end = result_tag.replace("<", "</")
+
+    # completed generations are stored by their original sample index
+    completed_output_ids = [None] * len(batch["input_ids"])
+    completed_logprobs = [None] * len(batch["input_ids"])
+    # pre-allocate one stateful executor per sample in the batch
+    executors = [
+        StatefulCodeExecutor.remote(tool_map) for _ in range(len(batch["input_ids"]))
+    ]
+    active_batch = batch
+    active_indices = torch.arange(len(batch["input_ids"]))
+    old_logprobs = None
+
+    while len(active_indices) > 0:
+        generation_outputs = policy.generate(active_batch, *args, **kwargs)
+
+        output_ids = generation_outputs["output_ids"]
+        # only contains logprobs for newly generated tokens
+        logprobs = generation_outputs["logprobs"]
+        input_lengths = active_batch["input_lengths"]
+        total_lengths = generation_outputs["unpadded_sequence_lengths"]
+        if old_logprobs is not None:
+            # restore logprobs for tokens generated in previous iterations
+            for i, input_length in enumerate(input_lengths):
+                logprobs[i, :input_length] = old_logprobs[i, :input_length]
+
+        # extract newly generated tokens
+        generated_ids = []
+        for output_id, input_length, total_length in zip(
+            output_ids, input_lengths, total_lengths
+        ):
+            generated_ids.append(output_id[input_length:total_length])
+
+        generated_texts = tokenizer.batch_decode(
+            generated_ids, skip_special_tokens=True
+        )
+
+        is_code = []
+        exprs = []
+        lookaheads = []
+        # parse newly generated texts
+        for i, (generated_text, active_index, total_length) in enumerate(
+            zip(generated_texts, active_indices, total_lengths)
+        ):
+            match = re.search(
+                rf"{start_tag}(.*){end_tag}(.*)", generated_text, re.DOTALL
+            )
+            if match:
+                # stop is caused by code execution
+                # expr takes everything between the start and end tags, including new lines
+                # lookahead takes everything after the end tag
+                is_code.append(i)
+                expr, lookahead = match.groups()
+                exprs.append(expr)
+                lookaheads.append(lookahead)
+            else:
+                # stop is not caused by code execution
+                # e.g. eos token, max length or other stop strings
+                completed_output_ids[active_index] = output_ids[i, :total_length]
+                completed_logprobs[active_index] = logprobs[i, :total_length]
+        if len(is_code) == 0:
+            break
+
+        # execute all code in this batch
+        futures = []
+        for i, expr, lookahead in zip(is_code, exprs, lookaheads):
+            active_index = active_indices[i]
+            # dispatch code to a pre-allocated executor for that sample
+            # so that functions and variables will be carried over
+            future = executors[active_index].__call__.remote(expr)
+            futures.append(future)
+        results = ray.get(futures)
+
+        new_results = []
+        # format each result with the expr and lookahead captured for the same sample
+        for result, expr, lookahead in zip(results, exprs, lookaheads):
+            if result is None:
+                # no return value
+                result = ""
+                new_results.append(result)
+                continue
+            result = pformat(result)
+            if "\n" in expr or "\n" in result:
+                # multi-line format
+                result = f"\n\n{result_start}\n{result}\n{result_end}"
+            else:
+                # inline format
+                result = f"{result_start}{result}{result_end}"
+            if lookahead:
+                if result.startswith(lookahead):
+                    # The generation may include a trailing "\n" after the end tag if ">\n" is a single token.
+                    # We trim \n from the result if the model has already generated it.
+                    result = result[len(lookahead) :]
+                else:
+                    warnings.warn(
+                        f"Expect the generation to stop at {repr(end_tag)}, but got {repr(end_tag + lookahead)}. 
" + "This is because some characters are merged into a single token by the tokenizer. " + "These extra characters will be kept in the generation." + ) + new_results.append(result) + + encodings = tokenizer( + new_results, + add_special_tokens=False, + padding=True, + padding_side="right", + return_tensors="pt", + ) + result_ids = encodings["input_ids"] + result_lengths = encodings["attention_mask"].sum(dim=1).to(torch.int32) + + is_code = torch.tensor(is_code) + # reduce active batch to those containing code + active_batch = active_batch.select_indices(is_code) + active_indices = active_indices[is_code] + output_ids = output_ids[is_code] + logprobs = logprobs[is_code] + total_lengths = total_lengths[is_code] + # max length before appending results + old_max_length = total_lengths.max() + # max length after appending results + new_max_length = (total_lengths + result_lengths).max() + new_output_ids = torch.full( + (len(active_indices), new_max_length), + tokenizer.pad_token_id, + dtype=output_ids.dtype, + ) + new_logprobs = torch.full( + (len(active_indices), new_max_length), 0, dtype=logprobs.dtype + ) + new_output_ids[:, :old_max_length] = output_ids[:, :old_max_length] + new_logprobs[:, :old_max_length] = logprobs[:, :old_max_length] + + # append results to generation + for i, (old_length, result_length) in enumerate( + zip(total_lengths, result_lengths) + ): + new_length = old_length + result_length + new_output_ids[i, old_length:new_length] = result_ids[i, :result_length] + new_logprobs[i, old_length:new_length] = LOGIT_INFINITY + + active_batch["input_ids"] = new_output_ids + active_batch["input_lengths"] = total_lengths + result_lengths + old_logprobs = new_logprobs + + output_ids = pad_sequence( + completed_output_ids, + batch_first=True, + padding_value=tokenizer.pad_token_id, + padding_side="right", + ) + logprobs = pad_sequence( + completed_logprobs, batch_first=True, padding_value=0.0, padding_side="right" + ) + total_lengths = torch.tensor([len(output_id) for output_id in completed_output_ids]) + generation_lengths = total_lengths - input_batch["input_lengths"] + + return { + "output_ids": output_ids, + "logprobs": logprobs, + "generation_lengths": generation_lengths, + "unpadded_sequence_lengths": total_lengths, + } diff --git a/nemo_rl/tools/interfaces.py b/nemo_rl/tools/interfaces.py new file mode 100644 index 0000000000..a37a3b6f10 --- /dev/null +++ b/nemo_rl/tools/interfaces.py @@ -0,0 +1,20 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from abc import ABC, abstractmethod + + +class ToolInterface(ABC): + @abstractmethod + def __call__(self, *args, **kwargs): + pass diff --git a/nemo_rl/tools/tools.py b/nemo_rl/tools/tools.py new file mode 100644 index 0000000000..1af1977926 --- /dev/null +++ b/nemo_rl/tools/tools.py @@ -0,0 +1,199 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import ast +import builtins +import math +import os +import tempfile +from collections import Counter +from contextlib import contextmanager +from typing import Any, Dict, List, Optional + +import ray +import torch +from datasets import load_dataset +from tqdm import tqdm +from transformers import AutoTokenizer + +from nemo_rl.tools.interfaces import ToolInterface + + +@ray.remote +class StatefulCodeExecutor(ToolInterface): + """Stateful code executor. + + Args: + context: classes, functions and variables accessible to the code executor. + By passing tools in context, the code executor also serves tool use. + """ + + def __init__(self, context: Dict[str, Any] = {}): + self.context = context.copy() + self.tmp_dir = tempfile.TemporaryDirectory() + + builtin_dict = {k: getattr(builtins, k) for k in dir(builtins)} + builtin_dict["open"] = self.safe_open + builtin_dict["__import__"] = self.safe_import + self.sandbox = {"__builtins__": builtin_dict} + + def __call__(self, code: str) -> Optional[str]: + tree = ast.parse(code) + + if tree.body and isinstance(tree.body[-1], ast.Expr): + # interactive mode + code = ast.unparse(tree.body[:-1]) + expr = ast.unparse(tree.body[-1]) + else: + # silent mode + expr = None + + try: + # isolate the code in a sandbox with globals={} + # capture local variables in self.context + with self.change_temporary_dir(): + exec(code, self.sandbox, self.context) + if expr: + return eval(expr, self.sandbox, self.context) + except Exception as err: + return err + + @contextmanager + def change_temporary_dir(self): + current_dir = os.getcwd() + os.chdir(self.tmp_dir.name) + try: + yield + finally: + os.chdir(current_dir) + + def safe_open(self, file, *args, **kwargs): + real_file = os.path.realpath(file) + tmp_dir = os.path.realpath(self.tmp_dir.name) + if os.path.commonpath([real_file, tmp_dir]) != tmp_dir: + # real_file is not inside tmp_dir + raise PermissionError( + "Access beyond the temporary working directory is blocked" + ) + return open(file, *args, **kwargs) + + def safe_import(self, name, *args, **kwargs): + risky_modules = { + "os", + "shutil", # erase filesystem + "sys", + "signal", # exit the current program + "socket", # network communication + "subprocess", + "threading", + "multiprocessing", # spawn threads or processes + "builtins", + "importlib", # bypass current blockers + } + if name in risky_modules: + raise PermissionError("Importing system and network modules is blocked") + return builtins.__import__(name, *args, **kwargs) + + +class BM25Retriever(ToolInterface): + """Sparse BM25 retriever. + + Args: + documents: list of documents to retrieve from + num_result: retrieve top-k documents + k1: parameter of BM25. Values in [1.2, 2.0] are recommended. + b: parameter of BM25. 0.75 is recommended. 
+ device: device to compute BM25 + """ + + def __init__( + self, + documents: List[str] = None, + num_result: int = 10, + k1: float = 1.5, + b: float = 0.75, + device: str = "cpu", + ): + if documents is None: + dataset = load_dataset("wikimedia/wikipedia", "20231101.en") + self.documents = [sample["text"] for sample in dataset["train"]] + else: + self.documents = documents + self.tokenizer = AutoTokenizer.from_pretrained( + "bert-base-uncased", use_fast=True + ) + self.num_result = num_result + self.k1 = k1 + self.b = b + self.device = device + self.corpus_size = len(self.documents) + self.vocab_size = self.tokenizer.vocab_size + + self.build_index() + + def build_index(self): + doc_ids = [] + token_ids = [] + tfs = [] + lengths = [] + + for i, document in enumerate( + tqdm(self.documents, "Build index for BM25Retriever") + ): + input_ids = self.tokenizer.encode(document, add_special_tokens=False) + token2cnt = Counter(input_ids) + token_ids += token2cnt.keys() + tfs += token2cnt.values() + doc_ids += [i] * len(token2cnt) + lengths.append(len(input_ids)) + + avg_dl = sum(lengths) / self.corpus_size + for i, doc_id in enumerate(doc_ids): + tfs[i] = ( + tfs[i] + * (self.k1 + 1) + / (tfs[i] + self.k1 * (1 - self.b + self.b * lengths[doc_id] / avg_dl)) + ) + + indices = torch.tensor([doc_ids, token_ids], device=self.device) + values = torch.tensor(tfs, device=self.device) + self.doc_tfs = torch.sparse_coo_tensor( + indices, values, (self.corpus_size, self.vocab_size) + ) + + idfs = [0] * self.vocab_size + token2df = Counter(token_ids) + for token_id, df in token2df.items(): + idfs[token_id] = math.log((self.corpus_size - df + 0.5) / (df + 0.5) + 1) + self.idfs = idfs + + def __call__(self, query: str) -> List[str]: + input_ids = self.tokenizer.encode(query, add_special_tokens=False) + token2cnt = Counter(input_ids) + token_ids = [] + query_idfs = [] + for token_id, query_tf in token2cnt.items(): + token_ids.append(token_id) + query_idfs.append(query_tf * self.idfs[token_id]) + + indices = torch.tensor([token_ids, [0] * len(token_ids)], device=self.device) + values = torch.tensor(query_idfs, device=self.device) + query_idfs = torch.sparse_coo_tensor(indices, values, (self.vocab_size, 1)) + + scores = torch.sparse.mm(self.doc_tfs, query_idfs) + scores = scores.to_dense().squeeze(-1) + results = [] + for i in scores.topk(k=self.num_result).indices.tolist(): + results.append(self.documents[i]) + + return results diff --git a/nemo_rl/utils/checkpoint.py b/nemo_rl/utils/checkpoint.py index 5f23a0bd68..bc916d3d7e 100644 --- a/nemo_rl/utils/checkpoint.py +++ b/nemo_rl/utils/checkpoint.py @@ -26,7 +26,6 @@ import numpy as np import torch -import yaml class CheckpointingConfig(TypedDict): @@ -57,7 +56,7 @@ class CheckpointManager: checkpoint_dir/ step_0/ training_info.json - config.yaml + config.json policy.py (up to the algorithm loop to save here) policy_optimizer.py (up to the algorithm loop to save here) ... 
@@ -115,8 +114,8 @@ def init_tmp_checkpoint( # save config if run_config is not None: - with open(save_dir / "config.yaml", "w") as f: - yaml.safe_dump(run_config, f) + with open(save_dir / "config.json", "w") as f: + json.dump(run_config, f) return Path(os.path.abspath(save_dir)) diff --git a/nemo_rl/utils/native_checkpoint.py b/nemo_rl/utils/native_checkpoint.py index fc8f9ba44d..3573d2d86d 100644 --- a/nemo_rl/utils/native_checkpoint.py +++ b/nemo_rl/utils/native_checkpoint.py @@ -15,6 +15,7 @@ """Checkpoint management utilities for HF models.""" import os +from pathlib import Path from typing import Any, Optional import torch @@ -138,6 +139,8 @@ def save_checkpoint( optimizer_path: Optional[str] = None, tokenizer: Optional[Any] = None, tokenizer_path: Optional[str] = None, + save_torch_dist: bool = True, + save_hf: bool = False, ) -> None: """Save a checkpoint of the model and optionally optimizer state. @@ -147,17 +150,40 @@ def save_checkpoint( optimizer: Optional optimizer to save scheduler: Optional scheduler to save optimizer_path: Path to save optimizer state (required if optimizer provided) + save_torch_dist: Whether to save in PyTorch distributed format + save_hf: Whether to save in HuggingFace format """ - model_state = {"model": ModelState(model)} - dcp.save(model_state, checkpoint_id=weights_path) - - if optimizer is not None: - if optimizer_path is None: - raise ValueError( - "optimizer_path must be provided when saving optimizer state" + if save_hf: + if hasattr(model, "_fsdp_wrapped_module"): + model_state_dict = model._fsdp_wrapped_module.state_dict() + else: + model_state_dict = { + k: v.full_tensor() + if isinstance(v, torch.distributed.tensor.DTensor) + else v + for k, v in model.state_dict().items() + } + + if torch.distributed.get_rank() == 0: + # Create a new path by appending "-hf" to the weights path + hf_weights_path = f"{Path(weights_path)}-hf" + + model.save_pretrained( + hf_weights_path, + state_dict=model_state_dict, ) - optimizer_state = {"optim": OptimizerState(model, optimizer, scheduler)} - dcp.save(optimizer_state, checkpoint_id=optimizer_path) + + if save_torch_dist: + model_state = {"model": ModelState(model)} + dcp.save(model_state, checkpoint_id=weights_path) + + if optimizer is not None: + if optimizer_path is None: + raise ValueError( + "optimizer_path must be provided when saving optimizer state" + ) + optimizer_state = {"optim": OptimizerState(model, optimizer, scheduler)} + dcp.save(optimizer_state, checkpoint_id=optimizer_path) if tokenizer is not None: if tokenizer_path is None: diff --git a/tests/functional/dpo.sh b/tests/functional/dpo.sh index fb976d6701..200a08cdd7 100755 --- a/tests/functional/dpo.sh +++ b/tests/functional/dpo.sh @@ -35,5 +35,5 @@ uv run $PROJECT_ROOT/examples/run_dpo.py \ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS uv run tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["2"] < 0.715' + 'data["train/loss"]["2"] < 0.694' \ diff --git a/tests/test_suites/README.md b/tests/test_suites/README.md index 0759f06f25..3ccf0d75c9 100644 --- a/tests/test_suites/README.md +++ b/tests/test_suites/README.md @@ -4,18 +4,13 @@ Each test is named: ``` ---#n#g--.sh +--#n#g--.sh ``` Examples: * sft-llama3.2-1b-1n8g-fsdp2tp1.sh * grpo-qwen2-1.5B-instruct-4n8g-fsdp2tp2.sh * grpo-qwen2-1.5B-instruct-4n8g-fsdp2tp2-long.sh -* grpo-qwen2-1.5B-instruct-4n8g-fsdp2tp2-long.v2.sh - * The final verison suffix (starts with `.v2`, `.v3`, ...), is reserved for cases contributors believe the recipe's - convergence 
has changed due to their commit. Versioning signals that this recipe should not be compared to its - predecessor due to a change in convergence behavior. Examples of this change include: changing dataset, changing loss, - convergence bug fix. Changes affecting performance do not need a version change. ## Running manually diff --git a/tests/test_suites/nightly.txt b/tests/test_suites/nightly.txt index b80a7ad545..4c609d5bff 100644 --- a/tests/test_suites/nightly.txt +++ b/tests/test_suites/nightly.txt @@ -3,15 +3,15 @@ ######## # Short 1N/1B runs (go past 200 steps - usually divergence happens by now) -- going to 4 nodes doesn't help that much -tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v2.sh -tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.sh +tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh +tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh # FSDP1 vs Dtensor (Qwen/Qwen2.5-7B-Instruct) -tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.v2.sh -tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v2.sh +tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh +tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh # Functional 32b run -tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.v2.sh +tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh ####### # SFT # diff --git a/tests/test_suites/release.txt b/tests/test_suites/release.txt index 42e9c49d00..69735cb0cb 100644 --- a/tests/test_suites/release.txt +++ b/tests/test_suites/release.txt @@ -3,10 +3,10 @@ ######## # Long 8b run -tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v2.sh +tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh # Long 32b run -tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.v2.sh +tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh ####### # SFT # diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 7c6a9e21bf..2a3ec3a7c9 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -48,9 +48,6 @@ class TEST_ASSETS: TINY_QWEN2_MODEL_PATH = os.path.join( _TEST_ASSETS_DIR, "tiny_qwen2_with_qwen2_tokenizer" ) - TINY_QWEN3_MODEL_PATH = os.path.join( - _TEST_ASSETS_DIR, "tiny_qwen3_with_qwen3_tokenizer" - ) class UnitTestData(TypedDict): @@ -465,31 +462,3 @@ def tiny_qwen2_model_path(): tokenizer.save_pretrained(model_path) del model, tokenizer yield model_path - - -@pytest.fixture(scope="session", autouse=True) -def tiny_qwen3_model_path(): - """Fixture that returns a path to a tiny llama model with a dummy tokenizer.""" - import shutil - - from transformers import AutoTokenizer, Qwen3Config, Qwen3ForCausalLM - - model_path = TEST_ASSETS.TINY_QWEN3_MODEL_PATH - # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention) - # vocab_size=151936 (so we can re-use qwen2 1.5b tokenizer) - config = Qwen3Config( - num_hidden_layers=2, - hidden_size=64, - intermediate_size=32, - num_attention_heads=2, - vocab_size=151936, - tie_word_embeddings=False, - num_key_value_heads=None, - ) - model = Qwen3ForCausalLM(config=config) - tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B") - shutil.rmtree(model_path, ignore_errors=True) - model.save_pretrained(model_path) - tokenizer.save_pretrained(model_path) - del model, tokenizer - yield model_path diff --git a/tests/unit/data/test_llm_message_utils.py b/tests/unit/data/test_llm_message_utils.py index 
fc4c6c6b8d..0a5cb3ef4b 100644 --- a/tests/unit/data/test_llm_message_utils.py +++ b/tests/unit/data/test_llm_message_utils.py @@ -18,10 +18,8 @@ import torch from transformers import AutoTokenizer -from nemo_rl.data.hf_datasets import COMMON_CHAT_TEMPLATES from nemo_rl.data.interfaces import LLMMessageLogType, TaskDataSpec from nemo_rl.data.llm_message_utils import ( - _validate_tensor_consistency, add_loss_mask_to_message_log, batched_message_log_to_flat_message, get_first_index_that_differs, @@ -408,39 +406,6 @@ def test_get_formatted_message_log_qwen( assert actual_text == expected_text -def test_formatted_message_log_empty_message(): - message_logs = [ - [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": ""}, - ], - [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "Hello!"}, - ], - ] - tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct") - tokenizer.chat_template = COMMON_CHAT_TEMPLATES.passthrough_prompt_response - task_data_spec = TaskDataSpec(task_name="test") - result = [ - get_formatted_message_log( - message_log, - tokenizer, - task_data_spec, - add_bos_token=False, - add_eos_token=False, - ) - for message_log in message_logs - ] - flat_result = [message_log_to_flat_messages(m) for m in result] - for k in flat_result[0].keys(): - if isinstance(flat_result[0][k], torch.Tensor): - # make sure validate_tensor_consistency does not raise an error when one of the messages is empty - _validate_tensor_consistency( - [flat_result[i][k] for i in range(len(flat_result))] - ) - - def test_add_loss_mask_to_chat_message_log( tokenized_chat_message_log: LLMMessageLogType, ): diff --git a/tests/unit/experience/test_rollouts.py b/tests/unit/experience/test_rollouts.py index bcfa1b84d2..b45811d4f8 100644 --- a/tests/unit/experience/test_rollouts.py +++ b/tests/unit/experience/test_rollouts.py @@ -20,7 +20,6 @@ import torch from transformers import AutoTokenizer -from nemo_rl.data.llm_message_utils import batched_message_log_to_flat_message from nemo_rl.distributed.batched_data_dict import BatchedDataDict from nemo_rl.distributed.virtual_cluster import RayVirtualCluster from nemo_rl.environments.games.sliding_puzzle import ( @@ -441,45 +440,6 @@ def test_run_multi_step_calculator_vllm(multi_step_setup_vllm): print("\nMulti-Step Calculator VLLM Test assertions passed.") -@pytest.mark.skipif( - not torch.cuda.is_available() or torch.cuda.device_count() < 1, - reason="VLLM test requires at least 1 GPU", -) -def test_max_seqlen_respected(multi_step_setup_vllm): - """Tests multi-step calculator rollout with VllmGeneration.""" - vllm_generation, rollout_tokenizer, task_to_env, initial_batch, rollout_cluster = ( - multi_step_setup_vllm - ) - max_rollout_turns = initial_batch["extra_env_info"][0]["max_steps"] + 1 - max_seq_len = 290 - - print("\nRunning multi-step calculator rollout (VLLM)...") - vllm_generation.prepare_for_generation() - final_batch, rollout_metrics = run_multi_turn_rollout( - policy_generation=vllm_generation, - input_batch=initial_batch, - tokenizer=rollout_tokenizer, - task_to_env=task_to_env, - max_seq_len=max_seq_len, - max_rollout_turns=max_rollout_turns, - ) - vllm_generation.finish_generation() - print("Multi-step calculator rollout complete (VLLM).") - - # --- Assertions --- - assert isinstance(final_batch, BatchedDataDict) - assert "message_log" in final_batch - assert "total_reward" in final_batch - assert len(final_batch["message_log"]) == 
len(initial_batch["message_log"]) - flattened_message_log, _ = batched_message_log_to_flat_message( - final_batch["message_log"] - ) - # Check that the sequence length is respected by flattening the message log and checking the length - assert len(flattened_message_log["token_ids"][0]) == max_seq_len, ( - f"Sequence length {len(flattened_message_log['token_ids'][0])} is not equal to max_seq_len {max_seq_len}" - ) - - # --- Fixture for Sliding Puzzle Environment --- @pytest.fixture(scope="function") def sliding_puzzle_environment(rollout_cluster): diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py index 08f34defe2..552ea3dae2 100644 --- a/tests/unit/models/generation/test_vllm_generation.py +++ b/tests/unit/models/generation/test_vllm_generation.py @@ -36,7 +36,7 @@ }, "dtype": "bfloat16", "max_new_tokens": 10, - "temperature": 0.8, + "temperature": 1.0, "top_p": 1.0, "top_k": None, "stop_token_ids": None, @@ -85,9 +85,6 @@ def get_basic_hf_test_config(enable_dtensor: bool = False) -> PolicyConfig: }, "max_grad_norm": 1.0, "make_sequence_length_divisible_by": 1, - "generation": { - "temperature": 0.8, - }, } diff --git a/tests/unit/models/policy/test_dtensor_worker.py b/tests/unit/models/policy/test_dtensor_worker.py index 8ff416059e..7f175b3f15 100644 --- a/tests/unit/models/policy/test_dtensor_worker.py +++ b/tests/unit/models/policy/test_dtensor_worker.py @@ -294,10 +294,6 @@ def training_setup(request, two_gpu_virtual_cluster): (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, True, False, True), (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, False, True, True), (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, True, True, True), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, True, True, False), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, True, False, True), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, False, True, True), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, True, True, True), ], indirect=True, ) @@ -425,8 +421,6 @@ def logprob_setup(request, two_gpu_virtual_cluster): (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 2, False, False, False), (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 2, False, True, False), (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 2, False, True, True), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 2, False, True, False), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 2, False, False, False), ], indirect=True, ) diff --git a/tests/unit/test_recipes_and_test_suites.py b/tests/unit/test_recipes_and_test_suites.py index 47d1d2f45b..9bac39188e 100644 --- a/tests/unit/test_recipes_and_test_suites.py +++ b/tests/unit/test_recipes_and_test_suites.py @@ -19,8 +19,6 @@ dir_path = os.path.dirname(os.path.abspath(__file__)) project_root = os.path.abspath(os.path.join(dir_path, "..", "..")) -configs_dir = os.path.join(project_root, "examples", "configs") -recipes_dir = os.path.join(project_root, "examples", "configs", "recipes") test_suites_dir = os.path.join(project_root, "tests", "test_suites") nightly_test_suite_path = os.path.join(test_suites_dir, "nightly.txt") @@ -32,13 +30,6 @@ test_suites_dir, "release_performance.txt" ) -# Relative to project root -ALGO_MAPPING_TO_BASE_YAML = { - "sft": "examples/configs/sft.yaml", - "dpo": "examples/configs/dpo.yaml", - "grpo": "examples/configs/grpo_math_1B.yaml", -} - @pytest.fixture def nightly_test_suite(): @@ -99,16 +90,6 @@ def all_test_suites( ) -@pytest.fixture -def all_recipe_yaml_rel_paths(): - all_recipes = [] - for recipe_path in glob.glob( - os.path.join(recipes_dir, "**", "*.yaml"), recursive=True - ): - 
all_recipes.append(recipe_path[len(recipes_dir) + 1 :]) - return all_recipes - - @pytest.mark.parametrize( "test_suite_path", [ @@ -131,14 +112,12 @@ def test_test_suites_exist(test_suite_path): def test_no_overlap_across_test_suites(all_test_suites): - all_tests = set(all_test_suites) - assert len(all_tests) == len(all_test_suites), ( - f"Test suites have repeats {all_tests}" - ) + recipes = set(all_test_suites) + assert len(recipes) == len(all_test_suites), f"Test suites have repeats {recipes}" -def test_all_test_scripts_accounted_for_in_test_suites(all_test_suites): - all_test_scripts_in_test_suites = set(all_test_suites) +def test_all_recipes_accounted_for_in_test_suites(all_test_suites): + all_recipes_in_test_suites = set(all_test_suites) all_tests_in_test_suites_dir = set() for recipe_path in glob.glob( @@ -148,37 +127,8 @@ def test_all_test_scripts_accounted_for_in_test_suites(all_test_suites): recipe_name = recipe_path[len(project_root) + 1 :] all_tests_in_test_suites_dir.add(recipe_name) - assert all_test_scripts_in_test_suites == all_tests_in_test_suites_dir, ( - "All test scripts are not accounted for in the test suites" - ) - - -def test_all_recipe_yamls_accounted_for_in_test_suites( - all_recipe_yaml_rel_paths, all_test_suites -): - """This test along with test_all_test_scripts_accounted_for_in_test_suites() ensures that all recipe yaml/test scripts/test_suite(txts) are in sync.""" - assert len(set(all_recipe_yaml_rel_paths)) == len(set(all_test_suites)), ( - "Recipe YAMLs should be accounted for in the test suites" - ) - - all_test_script_paths_in_test_suites = set() - for test_script in all_test_suites: - # Each test suite is relative from project root - test_script_rel_to_test_suites_dir = test_script[ - len(os.path.join("tests", "test_suites")) + 1 : - ] - all_test_script_paths_in_test_suites.add(test_script_rel_to_test_suites_dir) - - # Since we're comparing yaml to sh, chop off the .sh/.yaml extensions for comparison - all_test_script_paths_in_test_suites = { - os.path.splitext(path)[0] for path in all_test_script_paths_in_test_suites - } - all_recipe_yaml_rel_paths = { - os.path.splitext(path)[0] for path in all_recipe_yaml_rel_paths - } - - assert all_test_script_paths_in_test_suites == set(all_recipe_yaml_rel_paths), ( - "All recipe YAMLs are not accounted for in the test suites" + assert all_recipes_in_test_suites == all_tests_in_test_suites_dir, ( + "All recipes are not accounted for in the test suites" ) @@ -265,37 +215,3 @@ def test_all_tests_can_find_config_if_dryrun(all_test_suites): assert result.returncode == 0, ( f"Command failed with exit code {result.returncode}" ) - - -def test_all_recipes_start_with_algo_hyphen(all_recipe_yaml_rel_paths): - expected_algos = set(ALGO_MAPPING_TO_BASE_YAML.keys()) - for recipe_yaml in all_recipe_yaml_rel_paths: - basename = os.path.basename(recipe_yaml) - algo = basename.split("-")[0] - assert algo in expected_algos, ( - f"Recipe {recipe_yaml} has unexpected algo {algo}" - ) - - -@pytest.mark.parametrize("algo, algo_base_yaml", ALGO_MAPPING_TO_BASE_YAML.items()) -def test_all_recipes_can_merge_configs_with_base_config( - all_recipe_yaml_rel_paths, all_test_suites, algo, algo_base_yaml -): - from omegaconf import OmegaConf - - base_yaml = os.path.join(project_root, algo_base_yaml) - base_config = OmegaConf.load(base_yaml) - # Would result in an error if we couldn't merge our config with the recipe's config - OmegaConf.set_struct(base_config, True) - for recipe_yaml in all_recipe_yaml_rel_paths: - if not 
os.path.basename(recipe_yaml).startswith(algo): - # Skipping here b/c we test that all recipes start with the algo-hyphen in - # test_all_recipes_start_with_algo_hyphen() - continue - recipe_yaml_path = os.path.join(recipes_dir, recipe_yaml) - recipe_config = OmegaConf.load(recipe_yaml_path) - OmegaConf.set_struct(recipe_config, True) - # This will raise a error if the config can't be merged - print(f"Merging {recipe_yaml} with {base_yaml}") - merged_config = OmegaConf.merge(base_config, recipe_config) - print(merged_config) diff --git a/tests/unit/tools/test_tools.py b/tests/unit/tools/test_tools.py new file mode 100644 index 0000000000..a22ca03c3c --- /dev/null +++ b/tests/unit/tools/test_tools.py @@ -0,0 +1,351 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from copy import deepcopy + +import pytest +import ray +import torch +from datasets import load_dataset +from transformers import AutoTokenizer + +from nemo_rl.distributed.batched_data_dict import BatchedDataDict +from nemo_rl.distributed.virtual_cluster import RayVirtualCluster +from nemo_rl.models.generation.interfaces import configure_generation_config +from nemo_rl.models.generation.vllm import VllmConfig, VllmGeneration +from nemo_rl.models.policy.hf_policy import HfPolicy, PolicyConfig +from nemo_rl.tools.generation import generate_with_code_and_tools +from nemo_rl.tools.tools import BM25Retriever, StatefulCodeExecutor + +MODEL_NAME = "meta-llama/Llama-3.2-1B" + + +# Define basic vLLM test config +basic_vllm_test_config: VllmConfig = { + "backend": "vllm", + "model_name": MODEL_NAME, + "tokenizer_name": None, + "dtype": "bfloat16", + "max_new_tokens": 100, + "temperature": 1.0, + "top_p": 1.0, + "top_k": None, + "stop_token_ids": None, + "stop_strings": None, + "vllm_cfg": { + "tensor_parallel_size": 1, + "gpu_memory_utilization": 0.3, + "max_model_len": 1024, + }, +} + +basic_hf_test_config: PolicyConfig = { + "model_name": MODEL_NAME, + "tokenizer_name": None, + "generation_batch_size": 1, + "generation": { + "backend": "hf", + "max_new_tokens": 100, + "temperature": 1.0, + "top_p": 1.0, + "top_k": None, + "stop_token_ids": None, + "stop_strings": None, + }, + # Required training parameters + "train_global_batch_size": 1, + "train_micro_batch_size": 1, + "learning_rate": 5e-6, + "logprob_batch_size": 1, + "max_new_tokens": 16, + "do_sample": False, + "precision": "float32", + "activation_checkpointing_enabled": False, + "fsdp_offload_enabled": False, + "optimizer": { + "name": "torch.optim.AdamW", + "kwargs": { + "lr": 5e-6, + "weight_decay": 0.01, + "betas": [0.9, 0.999], + "eps": 1e-8, + }, + }, + "dtensor_cfg": {"enabled": False}, +} + + +@pytest.fixture(scope="module") +def cluster(): + """Create a virtual cluster for testing.""" + # Create a cluster with 1 node that has 1 GPU bundles + virtual_cluster = RayVirtualCluster( + bundle_ct_per_node_list=[1], # 1 node with 1 GPU bundle + use_gpus=True, + max_colocated_worker_groups=2, + num_gpus_per_node=1, # Use 
available GPUs + name="vllm-test-cluster", + ) + yield virtual_cluster + virtual_cluster.shutdown() + + +@pytest.fixture(scope="function") +def tokenizer(): + """Loads the tokenizer for the tests.""" + print(f"Loading tokenizer: {MODEL_NAME}") + tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + print( + f"Tokenizer loaded. Pad token: {tokenizer.pad_token} (ID: {tokenizer.pad_token_id}), EOS token: {tokenizer.eos_token} (ID: {tokenizer.eos_token_id})" + ) + return tokenizer + + +def test_vllm_execute_code(cluster, tokenizer): + """Test that vLLM can call the code executor.""" + # Prepare test data + codes = [ + "x = 3; y = 4\nThis is some regular text.\nx + y\n", + "\ndef f(x):\n return x * x\n\nf(2)\n\n", + ] + results = ["7", "\n\n4\n"] + results = [code + result for code, result in zip(codes, results)] + + test_prompts = [code * 4 for code in codes] + encodings = tokenizer( + test_prompts, + padding="max_length", + max_length=1024, + return_tensors="pt", + padding_side="right", + ) + input_lengths = encodings["attention_mask"].sum(dim=1).to(torch.int32) + batch = BatchedDataDict( + { + "input_ids": encodings["input_ids"], + "input_lengths": input_lengths, + } + ) + + # Create separate configs for each policy + vllm_config = basic_vllm_test_config.copy() + vllm_config = configure_generation_config(vllm_config, tokenizer, is_eval=True) + + # Create vLLM generation + vllm_generation = VllmGeneration(cluster, vllm_config) + + # Generate and check result + outputs = generate_with_code_and_tools( + vllm_generation, batch, tokenizer, greedy=True + ) + + all_output_ids = outputs["output_ids"] + logprobs = outputs["logprobs"] + input_lengths = outputs["unpadded_sequence_lengths"] - outputs["generation_lengths"] + output_lengths = outputs["unpadded_sequence_lengths"] + input_ids = [] + output_ids = [] + for all_output_id, input_length, output_length in zip( + all_output_ids, input_lengths, output_lengths + ): + input_ids.append(all_output_id[:input_length]) + output_ids.append(all_output_id[input_length:output_length]) + indices = torch.arange(all_output_ids.shape[-1]) + input_lengths = input_lengths.unsqueeze(-1) + output_lengths = output_lengths.unsqueeze(-1) + is_generated = (indices >= input_lengths) & (indices < output_lengths) + + input_texts = tokenizer.batch_decode(input_ids, skip_special_tokens=True) + output_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + + assert input_texts == test_prompts, "Unexpected modification to input texts" + assert output_texts == results, f"Expect {results}, got wrong output {output_texts}" + assert (logprobs[~is_generated] == 0.0).all(), ( + "Unexpected log probabilities on input tokens or paddings" + ) + assert (logprobs[is_generated] != 0.0).all(), ( + "Generated tokens must have non-trivial log probabilities" + ) + + # Clean up + vllm_generation.shutdown() + + +def test_hf_execute_code(cluster, tokenizer): + """Test that Huggingface models can call the code executor.""" + # Prepare test data + codes = [ + "x = 3; y = 4\nThis is some regular text.\nx + y\n", + "\ndef f(x):\n return x * x\n\nf(2)\n\n", + ] + results = ["7", "\n\n4\n"] + results = [code + result for code, result in zip(codes, results)] + + test_prompts = [code * 4 for code in codes] + encodings = tokenizer( + test_prompts, + padding="max_length", + max_length=1024, + return_tensors="pt", + padding_side="right", + ) + input_lengths = 
encodings["attention_mask"].sum(dim=1).to(torch.int32) + batch = BatchedDataDict( + { + "input_ids": encodings["input_ids"], + "input_lengths": input_lengths, + } + ) + + # Create separate configs for each policy + hf_config = deepcopy(basic_hf_test_config) + hf_config["generation"] = configure_generation_config( + hf_config["generation"], + tokenizer, # is_eval=True + ) + + # Create vLLM generation + hf_policy = HfPolicy( + cluster, hf_config, tokenizer, init_reference_model=False, init_optimizer=False + ) + + # Generate and check result + outputs = generate_with_code_and_tools(hf_policy, batch, tokenizer, greedy=True) + + all_output_ids = outputs["output_ids"] + logprobs = outputs["logprobs"] + input_lengths = outputs["unpadded_sequence_lengths"] - outputs["generation_lengths"] + output_lengths = outputs["unpadded_sequence_lengths"] + input_ids = [] + output_ids = [] + for all_output_id, input_length, output_length in zip( + all_output_ids, input_lengths, output_lengths + ): + input_ids.append(all_output_id[:input_length]) + output_ids.append(all_output_id[input_length:output_length]) + indices = torch.arange(all_output_ids.shape[-1]) + input_lengths = input_lengths.unsqueeze(-1) + output_lengths = output_lengths.unsqueeze(-1) + is_generated = (indices >= input_lengths) & (indices < output_lengths) + + input_texts = tokenizer.batch_decode(input_ids, skip_special_tokens=True) + output_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + + assert input_texts == test_prompts, "Unexpected modification to input texts" + assert output_texts == results, f"Expect {results}, got wrong output {output_texts}" + assert (logprobs[~is_generated] == 0.0).all(), ( + "Unexpected log probabilities on input tokens or paddings" + ) + assert (logprobs[is_generated] != 0.0).all(), ( + "Generated tokens must have non-trivial log probabilities" + ) + + # Clean up + hf_policy.shutdown() + + +def test_untrusted_code(cluster): + """Test whether the code executor can block untrusted code.""" + executor = StatefulCodeExecutor.remote() + + # accessing temporary files shouldn't be blocked + code = ( + "with open('allowed_file.txt', 'w') as fout:\n" + " fout.write('some content')\n" + "with open('allowed_file.txt') as fin:\n" + " content = fin.read()\n" + "content" + ) + result = ray.get(executor.__call__.remote(code)) + assert result == "some content" + + # accessing other files should be blocked + code = "with open('/etc/passwd', 'r') as fin:\n fin.read()" + result = ray.get(executor.__call__.remote(code)) + assert isinstance(result, PermissionError) + + # importing non-sensitive modules shouldn't be blocked + code = "import math\nround(math.sqrt(8))" + result = ray.get(executor.__call__.remote(code)) + assert result == 3 + + # importing sensitive modules should be blocked + code = "import os" + result = ray.get(executor.__call__.remote(code)) + assert isinstance(result, PermissionError) + + +@pytest.mark.timeout(150) +def test_vllm_use_tool(cluster, tokenizer): + """Test that vLLM can use tool in the code executor.""" + # Prepare test data + codes = ["retrieve('Jen-Hsun Huang')\n"] + results = [ + "\n\n" + "['Nvidia was established in 1993 by Jen-Hsun Huang, Curtis Priem, and Chris '\n" + " 'Malachowsky. 
In 2000 Nvidia took intellectual possession of 3dfx, one of the '\n" + " 'biggest GPU producers in 1990s.']\n" + "" + ] + results = [code + result for code, result in zip(codes, results)] + + test_prompts = [code * 4 for code in codes] + encodings = tokenizer( + test_prompts, + padding="max_length", + max_length=1024, + return_tensors="pt", + padding_side="right", + ) + input_lengths = encodings["attention_mask"].sum(dim=1).to(torch.int32) + batch = BatchedDataDict( + { + "input_ids": encodings["input_ids"], + "input_lengths": input_lengths, + } + ) + + # Construct retriever + dataset = load_dataset("rahular/simple-wikipedia") + documents = [sample["text"] for sample in dataset["train"]] + tool_map = {"retrieve": BM25Retriever(documents, num_result=1)} + + # Create separate configs for each policy + vllm_config = basic_vllm_test_config.copy() + vllm_config = configure_generation_config(vllm_config, tokenizer, is_eval=True) + + # Create vLLM generation + vllm_generation = VllmGeneration(cluster, vllm_config) + + # Generate and check result + outputs = generate_with_code_and_tools( + vllm_generation, batch, tokenizer, tool_map=tool_map, greedy=True + ) + + all_output_ids = outputs["output_ids"] + input_lengths = outputs["unpadded_sequence_lengths"] - outputs["generation_lengths"] + output_lengths = outputs["unpadded_sequence_lengths"] + output_ids = [] + for all_output_id, input_length, output_length in zip( + all_output_ids, input_lengths, output_lengths + ): + output_ids.append(all_output_id[input_length:output_length]) + + output_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + + assert output_texts == results, f"Expect {results}, got wrong output {output_texts}" + + # Clean up + vllm_generation.shutdown() diff --git a/tests/unit/utils/test_checkpoint.py b/tests/unit/utils/test_checkpoint.py index c5a90c7932..2a912e94b2 100644 --- a/tests/unit/utils/test_checkpoint.py +++ b/tests/unit/utils/test_checkpoint.py @@ -17,7 +17,6 @@ import numpy as np import pytest import torch -import yaml from nemo_rl.utils.checkpoint import CheckpointManager @@ -63,8 +62,8 @@ def test_init_tmp_checkpoint(checkpoint_manager, checkpoint_dir): assert isinstance(saved_metadata["numpy"], (int, float)) # Check if config was saved - with open(save_dir / "config.yaml", "r") as f: - saved_config = yaml.safe_load(f) + with open(save_dir / "config.json", "r") as f: + saved_config = json.load(f) assert saved_config == run_config diff --git a/tests/unit/utils/test_native_checkpoint.py b/tests/unit/utils/test_native_checkpoint.py index f751c5c47e..7cebeade90 100755 --- a/tests/unit/utils/test_native_checkpoint.py +++ b/tests/unit/utils/test_native_checkpoint.py @@ -61,9 +61,6 @@ "tensor_parallel_size": 1, }, "max_grad_norm": 1.0, - "generation": { - "temperature": 1.0, - }, } @@ -286,6 +283,77 @@ def test_save_and_load_model_and_optimizer(mock_experiment): check_dict_equality(new_optimizer.state_dict(), optimizer.state_dict()) +@pytest.mark.parametrize("num_gpus", [1, 2], ids=["1gpu", "2gpu"]) +def test_save_and_load_hf_checkpoint(policy, num_gpus): + ## warm up with a forward pass + ## this is needed before saving a checkpoint because FSDP does some lazy initialization + input_ids = torch.randint(0, 16000, (4, 128)) # 4 sequences, each of length 128 + attention_mask = torch.ones(4, 128) + input_lengths = attention_mask.sum(dim=1).to(torch.int32) + dummy_fwd_dict = BatchedDataDict( + { + "input_ids": input_ids, + "input_lengths": input_lengths, + "attention_mask": attention_mask, + "labels": 
torch.randint(0, 16000, (4, 128)), + } + ) + policy.get_logprobs(dummy_fwd_dict) + + with TemporaryDirectory() as tmp_dir: + policy.save_checkpoint( + os.path.join(tmp_dir, "test_hf_and_dcp"), + save_hf=True, + save_torch_dist=True, + tokenizer_path=os.path.join(tmp_dir, "test_hf_and_dcp_tokenizer"), + ) + + ## make sure we save both HF and DCP checkpoints + # Dynamically create the expected set of distcp files based on num_gpus + expected_distcp_files = {f"__{rank}_0.distcp" for rank in range(num_gpus)} + expected_files = expected_distcp_files.union({".metadata"}) + + assert ( + set(os.listdir(os.path.join(tmp_dir, "test_hf_and_dcp"))) == expected_files + ) + assert set(os.listdir(os.path.join(tmp_dir, "test_hf_and_dcp_tokenizer"))) == { + "tokenizer_config.json", + "tokenizer.json", + "special_tokens_map.json", + } + + converted_model = AutoModelForCausalLM.from_pretrained( + os.path.join(tmp_dir, "test_hf_and_dcp-hf") + ) + + hf_save_dir = os.path.join(tmp_dir, "test_hf_and_dcp-hf") + hf_files = set(os.listdir(hf_save_dir)) + + # Check the HF saved files structure: could be single or sharded + expected_common_hf_files = {"config.json", "generation_config.json"} + if "model.safetensors" in hf_files: + # Single file format (1 GPU or smaller model) + expected_hf_files = expected_common_hf_files.union({"model.safetensors"}) + else: + # Sharded format (>=2 GPUs or larger model) + expected_hf_files = expected_common_hf_files.union( + { + "model-00001-of-00002.safetensors", + "model-00002-of-00002.safetensors", + "model.safetensors.index.json", + } + ) + assert hf_files == expected_hf_files + + coverted_model = AutoModelForCausalLM.from_pretrained(hf_save_dir) + original_model = AutoModelForCausalLM.from_pretrained( + simple_policy_config["model_name"] + ) + + ## make sure converted model matches the original + check_dict_equality(converted_model.state_dict(), original_model.state_dict()) + + @pytest.mark.parametrize("num_gpus", [1, 2], ids=["1gpu", "2gpu"]) def test_convert_dcp_to_hf(policy, num_gpus): ## warm up with a forward pass @@ -306,6 +374,8 @@ def test_convert_dcp_to_hf(policy, num_gpus): with TemporaryDirectory() as tmp_dir: policy.save_checkpoint( os.path.join(tmp_dir, "test_hf_and_dcp"), + save_hf=True, + save_torch_dist=True, ) # Dynamically create the expected set of distcp files based on num_gpus @@ -317,6 +387,25 @@ def test_convert_dcp_to_hf(policy, num_gpus): set(os.listdir(os.path.join(tmp_dir, "test_hf_and_dcp"))) == expected_files ) + # Check the HF saved files structure: could be single or sharded + hf_save_dir = os.path.join(tmp_dir, "test_hf_and_dcp-hf") + hf_files = set(os.listdir(hf_save_dir)) + expected_common_hf_files = {"config.json", "generation_config.json"} + + if "model.safetensors" in hf_files: + # Single file format (1 GPU or smaller model) + expected_hf_files = expected_common_hf_files.union({"model.safetensors"}) + else: + # Sharded format (>=2 GPUs or larger model) + expected_hf_files = expected_common_hf_files.union( + { + "model-00001-of-00002.safetensors", + "model-00002-of-00002.safetensors", + "model.safetensors.index.json", + } + ) + assert hf_files == expected_hf_files + offline_converted_model_path = convert_dcp_to_hf( os.path.join(tmp_dir, "test_hf_and_dcp"), os.path.join(tmp_dir, "test_hf_and_dcp-hf-offline"), @@ -334,11 +423,18 @@ def test_convert_dcp_to_hf(policy, num_gpus): offline_converted_model_path ) + online_converted_model = AutoModelForCausalLM.from_pretrained( + os.path.join(tmp_dir, "test_hf_and_dcp-hf") + ) original_model = 
AutoModelForCausalLM.from_pretrained( simple_policy_config["model_name"] ) - # Ensure the offline checkpoint is different from the original + ## make sure both conversions results in the same state dict + check_dict_equality( + online_converted_model.state_dict(), offline_converted_model.state_dict() + ) + # Ensure the offline one is different from the original assert_recursive_dict_different( offline_converted_model.state_dict(), original_model.state_dict() ) From 09e7a805da8006320667cb14b2b97d0d0ce85a27 Mon Sep 17 00:00:00 2001 From: KiddoZhu Date: Tue, 6 May 2025 14:01:20 -0700 Subject: [PATCH 2/7] revert from main branch Signed-off-by: KiddoZhu --- .gitignore | 6 +- nemo_rl/algorithms/dpo.py | 12 +- nemo_rl/algorithms/grpo.py | 7 -- nemo_rl/algorithms/sft.py | 10 -- nemo_rl/data/llm_message_utils.py | 6 + nemo_rl/environments/math_environment.py | 23 +++- nemo_rl/experience/rollouts.py | 4 +- nemo_rl/models/dtensor/parallelize.py | 27 ++++- nemo_rl/models/generation/vllm.py | 2 - nemo_rl/models/policy/__init__.py | 2 +- .../models/policy/dtensor_policy_worker.py | 15 +-- nemo_rl/models/policy/fsdp1_policy_worker.py | 21 ++-- nemo_rl/models/policy/hf_policy.py | 4 - nemo_rl/utils/checkpoint.py | 7 +- nemo_rl/utils/native_checkpoint.py | 44 ++------ tests/functional/dpo.sh | 2 +- tests/test_suites/README.md | 7 +- tests/test_suites/nightly.txt | 10 +- tests/test_suites/release.txt | 4 +- tests/unit/conftest.py | 31 ++++++ tests/unit/data/test_llm_message_utils.py | 35 ++++++ tests/unit/experience/test_rollouts.py | 40 +++++++ .../models/generation/test_vllm_generation.py | 5 +- .../unit/models/policy/test_dtensor_worker.py | 6 + tests/unit/test_recipes_and_test_suites.py | 96 +++++++++++++++- tests/unit/utils/test_checkpoint.py | 5 +- tests/unit/utils/test_native_checkpoint.py | 104 +----------------- 27 files changed, 315 insertions(+), 220 deletions(-) diff --git a/.gitignore b/.gitignore index 46efa31b70..27a0fca478 100644 --- a/.gitignore +++ b/.gitignore @@ -15,14 +15,14 @@ apidocs/ dist/ *.egg-info/ *.vscode/ +release_run* +ckpts/ # Test coverage.json .coverage* unit_results.json unit_results/ -release_run* -ckpts/ test_assets/ # Cache @@ -35,4 +35,4 @@ docker/ wandb/ checkpoints/ results/ -code_snapshots/ \ No newline at end of file +code_snapshots/ diff --git a/nemo_rl/algorithms/dpo.py b/nemo_rl/algorithms/dpo.py index dd6607ef9d..0647f0cd5a 100644 --- a/nemo_rl/algorithms/dpo.py +++ b/nemo_rl/algorithms/dpo.py @@ -446,14 +446,6 @@ def dpo_train( % master_config["checkpointing"]["save_period"] == 0 ): # +1 because step is 0-indexed - is_last_checkpoint = ( - min( - len(train_dataloader) * max_num_epochs, - master_config["dpo"]["max_num_steps"], - ) - - (total_steps + 1) - < master_config["checkpointing"]["save_period"] - ) dpo_save_state["step"] = (current_step + 1) % len(train_dataloader) dpo_save_state["total_steps"] = total_steps + 1 dpo_save_state["epoch"] = current_epoch @@ -470,7 +462,9 @@ def dpo_train( optimizer_path=os.path.join( checkpoint_path, "policy", "optimizer" ), - save_hf=is_last_checkpoint, + tokenizer_path=os.path.join( + checkpoint_path, "policy", "tokenizer" + ), ) torch.save( train_dataloader.state_dict(), diff --git a/nemo_rl/algorithms/grpo.py b/nemo_rl/algorithms/grpo.py index 952a6c172a..5a007451d0 100644 --- a/nemo_rl/algorithms/grpo.py +++ b/nemo_rl/algorithms/grpo.py @@ -524,12 +524,6 @@ def grpo_train( ): # +1 because step is 0-indexed policy.prepare_for_training() - is_last_checkpoint = ( - min(len(dataloader), 
master_config["grpo"]["max_num_steps"]) - - (step + 1) - < master_config["checkpointing"]["save_period"] - ) - grpo_save_state["step"] = step + 1 grpo_save_state["val_reward"] = val_metrics["accuracy"] grpo_save_state["consumed_samples"] = consumed_samples @@ -546,7 +540,6 @@ def grpo_train( tokenizer_path=os.path.join( checkpoint_path, "policy", "tokenizer" ), - save_hf=is_last_checkpoint, ) torch.save( dataloader.state_dict(), diff --git a/nemo_rl/algorithms/sft.py b/nemo_rl/algorithms/sft.py index 8b5ffcddfd..d10c3df483 100644 --- a/nemo_rl/algorithms/sft.py +++ b/nemo_rl/algorithms/sft.py @@ -447,15 +447,6 @@ def sft_train( % master_config["checkpointing"]["save_period"] == 0 ): # +1 because step is 0-indexed - is_last_checkpoint = ( - min( - len(train_dataloader) * max_num_epochs, - master_config["sft"]["max_num_steps"], - ) - - (total_steps + 1) - < master_config["checkpointing"]["save_period"] - ) - sft_save_state["step"] = (current_step + 1) % len(train_dataloader) sft_save_state["total_steps"] = total_steps + 1 sft_save_state["epoch"] = current_epoch @@ -476,7 +467,6 @@ def sft_train( tokenizer_path=os.path.join( checkpoint_path, "policy", "tokenizer" ), - save_hf=is_last_checkpoint, ) torch.save( train_dataloader.state_dict(), diff --git a/nemo_rl/data/llm_message_utils.py b/nemo_rl/data/llm_message_utils.py index f2d24fc421..51cd5a279d 100644 --- a/nemo_rl/data/llm_message_utils.py +++ b/nemo_rl/data/llm_message_utils.py @@ -421,6 +421,12 @@ def get_formatted_message_log( new_message["token_ids"] = tokenizer( message_chunk, return_tensors="pt", add_special_tokens=False )["input_ids"][0] + if len(new_message["token_ids"]) == 0: + # if there is an empty message, the empty `token_ids` tensor ends up being in fp32, + # which causes `_validate_tensor_consistency` to fail. To fix this, we convert the + # empty tensor to int64. 
+ new_message["token_ids"] = new_message["token_ids"].to(torch.int64) + new_message["content"] = message_chunk new_message_log.append(new_message) diff --git a/nemo_rl/environments/math_environment.py b/nemo_rl/environments/math_environment.py index 8da0528652..fd968298b0 100644 --- a/nemo_rl/environments/math_environment.py +++ b/nemo_rl/environments/math_environment.py @@ -15,7 +15,8 @@ import ray import torch -from math_verify import parse, verify +from math_verify.metric import math_metric +from math_verify.parser import ExprExtractionConfig, LatexExtractionConfig from nemo_rl.distributed.batched_data_dict import BatchedDataDict from nemo_rl.distributed.virtual_cluster import PY_EXECUTABLES @@ -53,9 +54,23 @@ def verify( results = [] for response, ground_truth in zip(pred_responses, ground_truths): try: - gold = parse(ground_truth) - pred = parse(response[-100:]) # avoid looking at the whole string - results.append(float(verify(gold, pred))) + # Use Latex and plain math extraction from predictions + # https://github.com/huggingface/Math-Verify?tab=readme-ov-file#extraction-targets + verify_func = math_metric( + gold_extraction_target=(LatexExtractionConfig(),), + pred_extraction_target=( + ExprExtractionConfig(), + LatexExtractionConfig(), + ), + ) + + ground_truth_parsable = "\\boxed{" + ground_truth + "}" + try: + ret_score, _ = verify_func([ground_truth_parsable], [response]) + except Exception: + ret_score = 0.0 + + results.append(float(ret_score)) except Exception: results.append(0) return results diff --git a/nemo_rl/experience/rollouts.py b/nemo_rl/experience/rollouts.py index a556a32a42..567add0dfc 100644 --- a/nemo_rl/experience/rollouts.py +++ b/nemo_rl/experience/rollouts.py @@ -311,7 +311,9 @@ def run_multi_turn_rollout( >= max_seq_len ): # truncate - tokenized_obs = tokenized_obs[: max_seq_len - active_input_lengths[i]] + tokenized_obs = tokenized_obs[ + : max_seq_len - (len(generated_ids[i]) + active_input_lengths[i]) + ] truncation_mask[i] = True # Record truncation sample_truncated[active_indices[i]] = True diff --git a/nemo_rl/models/dtensor/parallelize.py b/nemo_rl/models/dtensor/parallelize.py index 3ae86d70cc..5998937cc9 100644 --- a/nemo_rl/models/dtensor/parallelize.py +++ b/nemo_rl/models/dtensor/parallelize.py @@ -30,6 +30,7 @@ from torch.distributed.tensor.placement_types import Replicate, Shard from transformers.models.llama.modeling_llama import LlamaForCausalLM from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM +from transformers.models.qwen3.modeling_qwen3 import Qwen3ForCausalLM from nemo_rl.distributed.model_utils import from_parallel_logits_to_logprobs @@ -98,7 +99,7 @@ def _parallelize_llama( def _parallelize_qwen( - model: Qwen2ForCausalLM, + model: Union[Qwen2ForCausalLM, Qwen3ForCausalLM], dp_mesh: DeviceMesh, tp_mesh: DeviceMesh, mp_policy: MixedPrecisionPolicy, @@ -108,7 +109,7 @@ def _parallelize_qwen( ): """Parallelizes a Qwen2ForCausalLM model across data and tensor parallel dimensions.""" - class Qwen2RotaryEmbedParallel(SequenceParallel): + class QwenRotaryEmbedParallel(SequenceParallel): """Custom SequenceParallel class for Qwen2 rotary embeddings because the input is a tuple.""" @staticmethod @@ -141,6 +142,23 @@ def _prepare_input_fn(sequence_sharding, mod, inputs, device_mesh): return type(inputs)(new_inputs) + class Qwen3QKNorm(SequenceParallel): + @staticmethod + def _prepare_input_fn(sequence_sharding, mod, inputs, device_mesh): + input_tensor = inputs[0] + + if isinstance(input_tensor, DTensor): + assert 
input_tensor.placements == (Shard(dim=2),) + elif isinstance(input_tensor, torch.Tensor): + # assume the input passed in already sharded on the sequence dim and create the DTensor + return DTensor.from_local( + input_tensor, device_mesh, sequence_sharding, run_check=False + ) + else: + raise ValueError( + f"expecting input of {mod} to be a torch.Tensor or DTensor, but got {input_tensor}" + ) + if tp_mesh.size() > 1: assert not model.config.tie_word_embeddings, ( "Tie word embeddings not supported when TP is enabled" @@ -156,7 +174,7 @@ def _prepare_input_fn(sequence_sharding, mod, inputs, device_mesh): input_layouts=Replicate(), output_layouts=Shard(1), ), - "model.rotary_emb": Qwen2RotaryEmbedParallel(), + "model.rotary_emb": QwenRotaryEmbedParallel(), "model.norm": SequenceParallel(), "model.layers.*.input_layernorm": SequenceParallel(), "model.layers.*.self_attn.q_proj": ColwiseParallel( @@ -171,6 +189,8 @@ def _prepare_input_fn(sequence_sharding, mod, inputs, device_mesh): "model.layers.*.self_attn.o_proj": RowwiseParallel( output_layouts=Shard(1) ), + "model.layers.*.self_attn.q_norm": Qwen3QKNorm(), + "model.layers.*.self_attn.k_norm": Qwen3QKNorm(), "model.layers.*.post_attention_layernorm": SequenceParallel(), "model.layers.*.mlp.up_proj": ColwiseParallel(), "model.layers.*.mlp.gate_proj": ColwiseParallel(), @@ -214,6 +234,7 @@ def _prepare_input_fn(sequence_sharding, mod, inputs, device_mesh): PARALLIZE_FUNCTIONS = { Qwen2ForCausalLM: _parallelize_qwen, + Qwen3ForCausalLM: _parallelize_qwen, LlamaForCausalLM: _parallelize_llama, } diff --git a/nemo_rl/models/generation/vllm.py b/nemo_rl/models/generation/vllm.py index 4128f6a9cc..59fcc26320 100644 --- a/nemo_rl/models/generation/vllm.py +++ b/nemo_rl/models/generation/vllm.py @@ -273,7 +273,6 @@ def generate( # Read generation parameters from config top_k = self.cfg["top_k"] if self.cfg["top_k"] is not None else -1 - sampling_params = self.SamplingParams( temperature=self.cfg["temperature"] if not greedy else 0, top_p=self.cfg["top_p"], @@ -391,7 +390,6 @@ def generate_text( # Read generation parameters from config top_k = self.cfg["top_k"] if self.cfg["top_k"] is not None else -1 - sampling_params = self.SamplingParams( temperature=self.cfg["temperature"] if not greedy else 0, top_p=self.cfg["top_p"], diff --git a/nemo_rl/models/policy/__init__.py b/nemo_rl/models/policy/__init__.py index 47714fb0f5..fbe728a840 100644 --- a/nemo_rl/models/policy/__init__.py +++ b/nemo_rl/models/policy/__init__.py @@ -37,7 +37,7 @@ class PolicyConfig(TypedDict): train_micro_batch_size: int learning_rate: float logprob_batch_size: int - generation: GenerationConfig + generation: Optional[GenerationConfig] precision: str dtensor_cfg: DTensorConfig make_sequence_length_divisible_by: int diff --git a/nemo_rl/models/policy/dtensor_policy_worker.py b/nemo_rl/models/policy/dtensor_policy_worker.py index 29ecd46452..c99110d7e7 100644 --- a/nemo_rl/models/policy/dtensor_policy_worker.py +++ b/nemo_rl/models/policy/dtensor_policy_worker.py @@ -335,6 +335,10 @@ def train( else: logits = outputs.logits + # Divide logits by temperature + if "generation" in self.cfg and self.cfg["generation"] is not None: + logits.div_(self.cfg["generation"]["temperature"]) + loss, loss_metrics = loss_fn(logits, mb) num_valid_samples = loss_metrics["num_valid_samples"] loss_metrics["lr"] = self.optimizer.param_groups[0]["lr"] @@ -371,10 +375,12 @@ def train( # Update parameters self.optimizer.step() - self.scheduler.step() losses.append(torch.tensor(mb_losses).sum().item()) + # 
increment scheduler after all batches in rollout are processed + self.scheduler.step() + # Compute global loss across all ranks with torch.no_grad(): local_loss = torch.tensor(losses, device="cuda") @@ -714,13 +720,10 @@ def save_checkpoint( weights_path: str, optimizer_path: Optional[str] = None, tokenizer_path: Optional[str] = None, - save_torch_dist: bool = True, - save_hf: bool = False, ): """Save a checkpoint of the model. - the HuggingFace checkpoint is saved only if `save_hf` is True, - and the optimizer states are saved only if `optimizer` and `optimizer_path` are provided. + the optimizer states are saved only if `optimizer` and `optimizer_path` are provided. """ save_checkpoint( model=self.model, @@ -730,8 +733,6 @@ def save_checkpoint( optimizer_path=optimizer_path, tokenizer=self.tokenizer if tokenizer_path else None, tokenizer_path=tokenizer_path, - save_torch_dist=save_torch_dist, - save_hf=save_hf, ) def load_checkpoint(self, weights_path: str, optimizer_path: Optional[str] = None): diff --git a/nemo_rl/models/policy/fsdp1_policy_worker.py b/nemo_rl/models/policy/fsdp1_policy_worker.py index bd3951f3a2..19523394ad 100644 --- a/nemo_rl/models/policy/fsdp1_policy_worker.py +++ b/nemo_rl/models/policy/fsdp1_policy_worker.py @@ -289,6 +289,10 @@ def train( logits = self.model.lm_head(outputs.last_hidden_state) else: logits = outputs.logits + + # Divide logits by temperature + if "generation" in self.cfg and self.cfg["generation"] is not None: + logits.div_(self.cfg["generation"]["temperature"]) loss, loss_metrics = loss_fn(logits, mb) num_valid_samples = loss_metrics["num_valid_samples"] @@ -325,9 +329,11 @@ def train( # Update parameters self.optimizer.step() - self.scheduler.step() losses.append(torch.tensor(mb_losses).sum().item()) + # increment scheduler after all batches in rollout are processed + self.scheduler.step() + # Compute global loss across all ranks with torch.no_grad(): local_loss = torch.tensor(losses, device="cuda") @@ -901,8 +907,6 @@ def save_checkpoint( weights_path: str, optimizer_path: Optional[str] = None, tokenizer_path: Optional[str] = None, - save_torch_dist: bool = True, - save_hf: bool = False, ): """Save a checkpoint of the model. @@ -912,19 +916,12 @@ def save_checkpoint( __0_1.distcp __1_0.distcp ... - weights_path-hf/ - config.json - generation_config.json - model-00001-of-.safetensors - ... - model.safetensors.index.json optimizer_path/ __0_0.distcp __1_0.distcp ... - the HuggingFace checkpoint is saved only if `save_hf` is True, - and the optimizer states are saved only if `optimizer` and `optimizer_path` are provided. + the optimizer states are saved only if `optimizer` and `optimizer_path` are provided. 
""" save_checkpoint( model=self.model, @@ -934,8 +931,6 @@ def save_checkpoint( optimizer_path=optimizer_path, tokenizer=self.tokenizer if tokenizer_path else None, tokenizer_path=tokenizer_path, - save_torch_dist=save_torch_dist, - save_hf=save_hf, ) def load_checkpoint(self, weights_path: str, optimizer_path: Optional[str] = None): diff --git a/nemo_rl/models/policy/hf_policy.py b/nemo_rl/models/policy/hf_policy.py index 2a579e3bcd..2d2dbf3d4c 100644 --- a/nemo_rl/models/policy/hf_policy.py +++ b/nemo_rl/models/policy/hf_policy.py @@ -307,8 +307,6 @@ def save_checkpoint( weights_path: str, optimizer_path: Optional[str] = None, tokenizer_path: Optional[str] = None, - save_torch_dist: bool = True, - save_hf: bool = False, ): """Save a checkpoint of the model.""" futures = self.worker_group.run_all_workers_single_data( @@ -316,8 +314,6 @@ def save_checkpoint( weights_path, optimizer_path, tokenizer_path, - save_torch_dist, - save_hf, only_on="all_tied_workers", ) ray.get(futures) diff --git a/nemo_rl/utils/checkpoint.py b/nemo_rl/utils/checkpoint.py index bc916d3d7e..5f23a0bd68 100644 --- a/nemo_rl/utils/checkpoint.py +++ b/nemo_rl/utils/checkpoint.py @@ -26,6 +26,7 @@ import numpy as np import torch +import yaml class CheckpointingConfig(TypedDict): @@ -56,7 +57,7 @@ class CheckpointManager: checkpoint_dir/ step_0/ training_info.json - config.json + config.yaml policy.py (up to the algorithm loop to save here) policy_optimizer.py (up to the algorithm loop to save here) ... @@ -114,8 +115,8 @@ def init_tmp_checkpoint( # save config if run_config is not None: - with open(save_dir / "config.json", "w") as f: - json.dump(run_config, f) + with open(save_dir / "config.yaml", "w") as f: + yaml.safe_dump(run_config, f) return Path(os.path.abspath(save_dir)) diff --git a/nemo_rl/utils/native_checkpoint.py b/nemo_rl/utils/native_checkpoint.py index 3573d2d86d..fc8f9ba44d 100644 --- a/nemo_rl/utils/native_checkpoint.py +++ b/nemo_rl/utils/native_checkpoint.py @@ -15,7 +15,6 @@ """Checkpoint management utilities for HF models.""" import os -from pathlib import Path from typing import Any, Optional import torch @@ -139,8 +138,6 @@ def save_checkpoint( optimizer_path: Optional[str] = None, tokenizer: Optional[Any] = None, tokenizer_path: Optional[str] = None, - save_torch_dist: bool = True, - save_hf: bool = False, ) -> None: """Save a checkpoint of the model and optionally optimizer state. 
@@ -150,40 +147,17 @@ def save_checkpoint( optimizer: Optional optimizer to save scheduler: Optional scheduler to save optimizer_path: Path to save optimizer state (required if optimizer provided) - save_torch_dist: Whether to save in PyTorch distributed format - save_hf: Whether to save in HuggingFace format """ - if save_hf: - if hasattr(model, "_fsdp_wrapped_module"): - model_state_dict = model._fsdp_wrapped_module.state_dict() - else: - model_state_dict = { - k: v.full_tensor() - if isinstance(v, torch.distributed.tensor.DTensor) - else v - for k, v in model.state_dict().items() - } - - if torch.distributed.get_rank() == 0: - # Create a new path by appending "-hf" to the weights path - hf_weights_path = f"{Path(weights_path)}-hf" - - model.save_pretrained( - hf_weights_path, - state_dict=model_state_dict, - ) + model_state = {"model": ModelState(model)} + dcp.save(model_state, checkpoint_id=weights_path) - if save_torch_dist: - model_state = {"model": ModelState(model)} - dcp.save(model_state, checkpoint_id=weights_path) - - if optimizer is not None: - if optimizer_path is None: - raise ValueError( - "optimizer_path must be provided when saving optimizer state" - ) - optimizer_state = {"optim": OptimizerState(model, optimizer, scheduler)} - dcp.save(optimizer_state, checkpoint_id=optimizer_path) + if optimizer is not None: + if optimizer_path is None: + raise ValueError( + "optimizer_path must be provided when saving optimizer state" + ) + optimizer_state = {"optim": OptimizerState(model, optimizer, scheduler)} + dcp.save(optimizer_state, checkpoint_id=optimizer_path) if tokenizer is not None: if tokenizer_path is None: diff --git a/tests/functional/dpo.sh b/tests/functional/dpo.sh index 200a08cdd7..fb976d6701 100755 --- a/tests/functional/dpo.sh +++ b/tests/functional/dpo.sh @@ -35,5 +35,5 @@ uv run $PROJECT_ROOT/examples/run_dpo.py \ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS uv run tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["2"] < 0.694' \ + 'data["train/loss"]["2"] < 0.715' diff --git a/tests/test_suites/README.md b/tests/test_suites/README.md index 3ccf0d75c9..0759f06f25 100644 --- a/tests/test_suites/README.md +++ b/tests/test_suites/README.md @@ -4,13 +4,18 @@ Each test is named: ``` ---#n#g--.sh +--#n#g--.sh ``` Examples: * sft-llama3.2-1b-1n8g-fsdp2tp1.sh * grpo-qwen2-1.5B-instruct-4n8g-fsdp2tp2.sh * grpo-qwen2-1.5B-instruct-4n8g-fsdp2tp2-long.sh +* grpo-qwen2-1.5B-instruct-4n8g-fsdp2tp2-long.v2.sh + * The final verison suffix (starts with `.v2`, `.v3`, ...), is reserved for cases contributors believe the recipe's + convergence has changed due to their commit. Versioning signals that this recipe should not be compared to its + predecessor due to a change in convergence behavior. Examples of this change include: changing dataset, changing loss, + convergence bug fix. Changes affecting performance do not need a version change. 
## Running manually diff --git a/tests/test_suites/nightly.txt b/tests/test_suites/nightly.txt index 4c609d5bff..b80a7ad545 100644 --- a/tests/test_suites/nightly.txt +++ b/tests/test_suites/nightly.txt @@ -3,15 +3,15 @@ ######## # Short 1N/1B runs (go past 200 steps - usually divergence happens by now) -- going to 4 nodes doesn't help that much -tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh -tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh +tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v2.sh +tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.sh # FSDP1 vs Dtensor (Qwen/Qwen2.5-7B-Instruct) -tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh -tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh +tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.v2.sh +tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v2.sh # Functional 32b run -tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh +tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.v2.sh ####### # SFT # diff --git a/tests/test_suites/release.txt b/tests/test_suites/release.txt index 69735cb0cb..42e9c49d00 100644 --- a/tests/test_suites/release.txt +++ b/tests/test_suites/release.txt @@ -3,10 +3,10 @@ ######## # Long 8b run -tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh +tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v2.sh # Long 32b run -tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh +tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.v2.sh ####### # SFT # diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 2a3ec3a7c9..7c6a9e21bf 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -48,6 +48,9 @@ class TEST_ASSETS: TINY_QWEN2_MODEL_PATH = os.path.join( _TEST_ASSETS_DIR, "tiny_qwen2_with_qwen2_tokenizer" ) + TINY_QWEN3_MODEL_PATH = os.path.join( + _TEST_ASSETS_DIR, "tiny_qwen3_with_qwen3_tokenizer" + ) class UnitTestData(TypedDict): @@ -462,3 +465,31 @@ def tiny_qwen2_model_path(): tokenizer.save_pretrained(model_path) del model, tokenizer yield model_path + + +@pytest.fixture(scope="session", autouse=True) +def tiny_qwen3_model_path(): + """Fixture that returns a path to a tiny llama model with a dummy tokenizer.""" + import shutil + + from transformers import AutoTokenizer, Qwen3Config, Qwen3ForCausalLM + + model_path = TEST_ASSETS.TINY_QWEN3_MODEL_PATH + # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention) + # vocab_size=151936 (so we can re-use qwen2 1.5b tokenizer) + config = Qwen3Config( + num_hidden_layers=2, + hidden_size=64, + intermediate_size=32, + num_attention_heads=2, + vocab_size=151936, + tie_word_embeddings=False, + num_key_value_heads=None, + ) + model = Qwen3ForCausalLM(config=config) + tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B") + shutil.rmtree(model_path, ignore_errors=True) + model.save_pretrained(model_path) + tokenizer.save_pretrained(model_path) + del model, tokenizer + yield model_path diff --git a/tests/unit/data/test_llm_message_utils.py b/tests/unit/data/test_llm_message_utils.py index 0a5cb3ef4b..fc4c6c6b8d 100644 --- a/tests/unit/data/test_llm_message_utils.py +++ b/tests/unit/data/test_llm_message_utils.py @@ -18,8 +18,10 @@ import torch from transformers import AutoTokenizer +from nemo_rl.data.hf_datasets import COMMON_CHAT_TEMPLATES from nemo_rl.data.interfaces import LLMMessageLogType, 
TaskDataSpec from nemo_rl.data.llm_message_utils import ( + _validate_tensor_consistency, add_loss_mask_to_message_log, batched_message_log_to_flat_message, get_first_index_that_differs, @@ -406,6 +408,39 @@ def test_get_formatted_message_log_qwen( assert actual_text == expected_text +def test_formatted_message_log_empty_message(): + message_logs = [ + [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": ""}, + ], + [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hello!"}, + ], + ] + tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct") + tokenizer.chat_template = COMMON_CHAT_TEMPLATES.passthrough_prompt_response + task_data_spec = TaskDataSpec(task_name="test") + result = [ + get_formatted_message_log( + message_log, + tokenizer, + task_data_spec, + add_bos_token=False, + add_eos_token=False, + ) + for message_log in message_logs + ] + flat_result = [message_log_to_flat_messages(m) for m in result] + for k in flat_result[0].keys(): + if isinstance(flat_result[0][k], torch.Tensor): + # make sure validate_tensor_consistency does not raise an error when one of the messages is empty + _validate_tensor_consistency( + [flat_result[i][k] for i in range(len(flat_result))] + ) + + def test_add_loss_mask_to_chat_message_log( tokenized_chat_message_log: LLMMessageLogType, ): diff --git a/tests/unit/experience/test_rollouts.py b/tests/unit/experience/test_rollouts.py index b45811d4f8..bcfa1b84d2 100644 --- a/tests/unit/experience/test_rollouts.py +++ b/tests/unit/experience/test_rollouts.py @@ -20,6 +20,7 @@ import torch from transformers import AutoTokenizer +from nemo_rl.data.llm_message_utils import batched_message_log_to_flat_message from nemo_rl.distributed.batched_data_dict import BatchedDataDict from nemo_rl.distributed.virtual_cluster import RayVirtualCluster from nemo_rl.environments.games.sliding_puzzle import ( @@ -440,6 +441,45 @@ def test_run_multi_step_calculator_vllm(multi_step_setup_vllm): print("\nMulti-Step Calculator VLLM Test assertions passed.") +@pytest.mark.skipif( + not torch.cuda.is_available() or torch.cuda.device_count() < 1, + reason="VLLM test requires at least 1 GPU", +) +def test_max_seqlen_respected(multi_step_setup_vllm): + """Tests multi-step calculator rollout with VllmGeneration.""" + vllm_generation, rollout_tokenizer, task_to_env, initial_batch, rollout_cluster = ( + multi_step_setup_vllm + ) + max_rollout_turns = initial_batch["extra_env_info"][0]["max_steps"] + 1 + max_seq_len = 290 + + print("\nRunning multi-step calculator rollout (VLLM)...") + vllm_generation.prepare_for_generation() + final_batch, rollout_metrics = run_multi_turn_rollout( + policy_generation=vllm_generation, + input_batch=initial_batch, + tokenizer=rollout_tokenizer, + task_to_env=task_to_env, + max_seq_len=max_seq_len, + max_rollout_turns=max_rollout_turns, + ) + vllm_generation.finish_generation() + print("Multi-step calculator rollout complete (VLLM).") + + # --- Assertions --- + assert isinstance(final_batch, BatchedDataDict) + assert "message_log" in final_batch + assert "total_reward" in final_batch + assert len(final_batch["message_log"]) == len(initial_batch["message_log"]) + flattened_message_log, _ = batched_message_log_to_flat_message( + final_batch["message_log"] + ) + # Check that the sequence length is respected by flattening the message log and checking the length + assert len(flattened_message_log["token_ids"][0]) == max_seq_len, ( + f"Sequence length 
{len(flattened_message_log['token_ids'][0])} is not equal to max_seq_len {max_seq_len}" + ) + + # --- Fixture for Sliding Puzzle Environment --- @pytest.fixture(scope="function") def sliding_puzzle_environment(rollout_cluster): diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py index 552ea3dae2..08f34defe2 100644 --- a/tests/unit/models/generation/test_vllm_generation.py +++ b/tests/unit/models/generation/test_vllm_generation.py @@ -36,7 +36,7 @@ }, "dtype": "bfloat16", "max_new_tokens": 10, - "temperature": 1.0, + "temperature": 0.8, "top_p": 1.0, "top_k": None, "stop_token_ids": None, @@ -85,6 +85,9 @@ def get_basic_hf_test_config(enable_dtensor: bool = False) -> PolicyConfig: }, "max_grad_norm": 1.0, "make_sequence_length_divisible_by": 1, + "generation": { + "temperature": 0.8, + }, } diff --git a/tests/unit/models/policy/test_dtensor_worker.py b/tests/unit/models/policy/test_dtensor_worker.py index 7f175b3f15..8ff416059e 100644 --- a/tests/unit/models/policy/test_dtensor_worker.py +++ b/tests/unit/models/policy/test_dtensor_worker.py @@ -294,6 +294,10 @@ def training_setup(request, two_gpu_virtual_cluster): (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, True, False, True), (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, False, True, True), (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, True, True, True), + (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, True, True, False), + (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, True, False, True), + (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, False, True, True), + (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, True, True, True), ], indirect=True, ) @@ -421,6 +425,8 @@ def logprob_setup(request, two_gpu_virtual_cluster): (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 2, False, False, False), (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 2, False, True, False), (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 2, False, True, True), + (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 2, False, True, False), + (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 2, False, False, False), ], indirect=True, ) diff --git a/tests/unit/test_recipes_and_test_suites.py b/tests/unit/test_recipes_and_test_suites.py index 9bac39188e..47d1d2f45b 100644 --- a/tests/unit/test_recipes_and_test_suites.py +++ b/tests/unit/test_recipes_and_test_suites.py @@ -19,6 +19,8 @@ dir_path = os.path.dirname(os.path.abspath(__file__)) project_root = os.path.abspath(os.path.join(dir_path, "..", "..")) +configs_dir = os.path.join(project_root, "examples", "configs") +recipes_dir = os.path.join(project_root, "examples", "configs", "recipes") test_suites_dir = os.path.join(project_root, "tests", "test_suites") nightly_test_suite_path = os.path.join(test_suites_dir, "nightly.txt") @@ -30,6 +32,13 @@ test_suites_dir, "release_performance.txt" ) +# Relative to project root +ALGO_MAPPING_TO_BASE_YAML = { + "sft": "examples/configs/sft.yaml", + "dpo": "examples/configs/dpo.yaml", + "grpo": "examples/configs/grpo_math_1B.yaml", +} + @pytest.fixture def nightly_test_suite(): @@ -90,6 +99,16 @@ def all_test_suites( ) +@pytest.fixture +def all_recipe_yaml_rel_paths(): + all_recipes = [] + for recipe_path in glob.glob( + os.path.join(recipes_dir, "**", "*.yaml"), recursive=True + ): + all_recipes.append(recipe_path[len(recipes_dir) + 1 :]) + return all_recipes + + @pytest.mark.parametrize( "test_suite_path", [ @@ -112,12 +131,14 @@ def test_test_suites_exist(test_suite_path): def test_no_overlap_across_test_suites(all_test_suites): - recipes = set(all_test_suites) - assert len(recipes) == len(all_test_suites), f"Test suites have 
repeats {recipes}" + all_tests = set(all_test_suites) + assert len(all_tests) == len(all_test_suites), ( + f"Test suites have repeats {all_tests}" + ) -def test_all_recipes_accounted_for_in_test_suites(all_test_suites): - all_recipes_in_test_suites = set(all_test_suites) +def test_all_test_scripts_accounted_for_in_test_suites(all_test_suites): + all_test_scripts_in_test_suites = set(all_test_suites) all_tests_in_test_suites_dir = set() for recipe_path in glob.glob( @@ -127,8 +148,37 @@ def test_all_recipes_accounted_for_in_test_suites(all_test_suites): recipe_name = recipe_path[len(project_root) + 1 :] all_tests_in_test_suites_dir.add(recipe_name) - assert all_recipes_in_test_suites == all_tests_in_test_suites_dir, ( - "All recipes are not accounted for in the test suites" + assert all_test_scripts_in_test_suites == all_tests_in_test_suites_dir, ( + "All test scripts are not accounted for in the test suites" + ) + + +def test_all_recipe_yamls_accounted_for_in_test_suites( + all_recipe_yaml_rel_paths, all_test_suites +): + """This test along with test_all_test_scripts_accounted_for_in_test_suites() ensures that all recipe yaml/test scripts/test_suite(txts) are in sync.""" + assert len(set(all_recipe_yaml_rel_paths)) == len(set(all_test_suites)), ( + "Recipe YAMLs should be accounted for in the test suites" + ) + + all_test_script_paths_in_test_suites = set() + for test_script in all_test_suites: + # Each test suite is relative from project root + test_script_rel_to_test_suites_dir = test_script[ + len(os.path.join("tests", "test_suites")) + 1 : + ] + all_test_script_paths_in_test_suites.add(test_script_rel_to_test_suites_dir) + + # Since we're comparing yaml to sh, chop off the .sh/.yaml extensions for comparison + all_test_script_paths_in_test_suites = { + os.path.splitext(path)[0] for path in all_test_script_paths_in_test_suites + } + all_recipe_yaml_rel_paths = { + os.path.splitext(path)[0] for path in all_recipe_yaml_rel_paths + } + + assert all_test_script_paths_in_test_suites == set(all_recipe_yaml_rel_paths), ( + "All recipe YAMLs are not accounted for in the test suites" ) @@ -215,3 +265,37 @@ def test_all_tests_can_find_config_if_dryrun(all_test_suites): assert result.returncode == 0, ( f"Command failed with exit code {result.returncode}" ) + + +def test_all_recipes_start_with_algo_hyphen(all_recipe_yaml_rel_paths): + expected_algos = set(ALGO_MAPPING_TO_BASE_YAML.keys()) + for recipe_yaml in all_recipe_yaml_rel_paths: + basename = os.path.basename(recipe_yaml) + algo = basename.split("-")[0] + assert algo in expected_algos, ( + f"Recipe {recipe_yaml} has unexpected algo {algo}" + ) + + +@pytest.mark.parametrize("algo, algo_base_yaml", ALGO_MAPPING_TO_BASE_YAML.items()) +def test_all_recipes_can_merge_configs_with_base_config( + all_recipe_yaml_rel_paths, all_test_suites, algo, algo_base_yaml +): + from omegaconf import OmegaConf + + base_yaml = os.path.join(project_root, algo_base_yaml) + base_config = OmegaConf.load(base_yaml) + # Would result in an error if we couldn't merge our config with the recipe's config + OmegaConf.set_struct(base_config, True) + for recipe_yaml in all_recipe_yaml_rel_paths: + if not os.path.basename(recipe_yaml).startswith(algo): + # Skipping here b/c we test that all recipes start with the algo-hyphen in + # test_all_recipes_start_with_algo_hyphen() + continue + recipe_yaml_path = os.path.join(recipes_dir, recipe_yaml) + recipe_config = OmegaConf.load(recipe_yaml_path) + OmegaConf.set_struct(recipe_config, True) + # This will raise a error if the config 
can't be merged + print(f"Merging {recipe_yaml} with {base_yaml}") + merged_config = OmegaConf.merge(base_config, recipe_config) + print(merged_config) diff --git a/tests/unit/utils/test_checkpoint.py b/tests/unit/utils/test_checkpoint.py index 2a912e94b2..c5a90c7932 100644 --- a/tests/unit/utils/test_checkpoint.py +++ b/tests/unit/utils/test_checkpoint.py @@ -17,6 +17,7 @@ import numpy as np import pytest import torch +import yaml from nemo_rl.utils.checkpoint import CheckpointManager @@ -62,8 +63,8 @@ def test_init_tmp_checkpoint(checkpoint_manager, checkpoint_dir): assert isinstance(saved_metadata["numpy"], (int, float)) # Check if config was saved - with open(save_dir / "config.json", "r") as f: - saved_config = json.load(f) + with open(save_dir / "config.yaml", "r") as f: + saved_config = yaml.safe_load(f) assert saved_config == run_config diff --git a/tests/unit/utils/test_native_checkpoint.py b/tests/unit/utils/test_native_checkpoint.py index 7cebeade90..f751c5c47e 100755 --- a/tests/unit/utils/test_native_checkpoint.py +++ b/tests/unit/utils/test_native_checkpoint.py @@ -61,6 +61,9 @@ "tensor_parallel_size": 1, }, "max_grad_norm": 1.0, + "generation": { + "temperature": 1.0, + }, } @@ -283,77 +286,6 @@ def test_save_and_load_model_and_optimizer(mock_experiment): check_dict_equality(new_optimizer.state_dict(), optimizer.state_dict()) -@pytest.mark.parametrize("num_gpus", [1, 2], ids=["1gpu", "2gpu"]) -def test_save_and_load_hf_checkpoint(policy, num_gpus): - ## warm up with a forward pass - ## this is needed before saving a checkpoint because FSDP does some lazy initialization - input_ids = torch.randint(0, 16000, (4, 128)) # 4 sequences, each of length 128 - attention_mask = torch.ones(4, 128) - input_lengths = attention_mask.sum(dim=1).to(torch.int32) - dummy_fwd_dict = BatchedDataDict( - { - "input_ids": input_ids, - "input_lengths": input_lengths, - "attention_mask": attention_mask, - "labels": torch.randint(0, 16000, (4, 128)), - } - ) - policy.get_logprobs(dummy_fwd_dict) - - with TemporaryDirectory() as tmp_dir: - policy.save_checkpoint( - os.path.join(tmp_dir, "test_hf_and_dcp"), - save_hf=True, - save_torch_dist=True, - tokenizer_path=os.path.join(tmp_dir, "test_hf_and_dcp_tokenizer"), - ) - - ## make sure we save both HF and DCP checkpoints - # Dynamically create the expected set of distcp files based on num_gpus - expected_distcp_files = {f"__{rank}_0.distcp" for rank in range(num_gpus)} - expected_files = expected_distcp_files.union({".metadata"}) - - assert ( - set(os.listdir(os.path.join(tmp_dir, "test_hf_and_dcp"))) == expected_files - ) - assert set(os.listdir(os.path.join(tmp_dir, "test_hf_and_dcp_tokenizer"))) == { - "tokenizer_config.json", - "tokenizer.json", - "special_tokens_map.json", - } - - converted_model = AutoModelForCausalLM.from_pretrained( - os.path.join(tmp_dir, "test_hf_and_dcp-hf") - ) - - hf_save_dir = os.path.join(tmp_dir, "test_hf_and_dcp-hf") - hf_files = set(os.listdir(hf_save_dir)) - - # Check the HF saved files structure: could be single or sharded - expected_common_hf_files = {"config.json", "generation_config.json"} - if "model.safetensors" in hf_files: - # Single file format (1 GPU or smaller model) - expected_hf_files = expected_common_hf_files.union({"model.safetensors"}) - else: - # Sharded format (>=2 GPUs or larger model) - expected_hf_files = expected_common_hf_files.union( - { - "model-00001-of-00002.safetensors", - "model-00002-of-00002.safetensors", - "model.safetensors.index.json", - } - ) - assert hf_files == expected_hf_files - 
- coverted_model = AutoModelForCausalLM.from_pretrained(hf_save_dir) - original_model = AutoModelForCausalLM.from_pretrained( - simple_policy_config["model_name"] - ) - - ## make sure converted model matches the original - check_dict_equality(converted_model.state_dict(), original_model.state_dict()) - - @pytest.mark.parametrize("num_gpus", [1, 2], ids=["1gpu", "2gpu"]) def test_convert_dcp_to_hf(policy, num_gpus): ## warm up with a forward pass @@ -374,8 +306,6 @@ def test_convert_dcp_to_hf(policy, num_gpus): with TemporaryDirectory() as tmp_dir: policy.save_checkpoint( os.path.join(tmp_dir, "test_hf_and_dcp"), - save_hf=True, - save_torch_dist=True, ) # Dynamically create the expected set of distcp files based on num_gpus @@ -387,25 +317,6 @@ def test_convert_dcp_to_hf(policy, num_gpus): set(os.listdir(os.path.join(tmp_dir, "test_hf_and_dcp"))) == expected_files ) - # Check the HF saved files structure: could be single or sharded - hf_save_dir = os.path.join(tmp_dir, "test_hf_and_dcp-hf") - hf_files = set(os.listdir(hf_save_dir)) - expected_common_hf_files = {"config.json", "generation_config.json"} - - if "model.safetensors" in hf_files: - # Single file format (1 GPU or smaller model) - expected_hf_files = expected_common_hf_files.union({"model.safetensors"}) - else: - # Sharded format (>=2 GPUs or larger model) - expected_hf_files = expected_common_hf_files.union( - { - "model-00001-of-00002.safetensors", - "model-00002-of-00002.safetensors", - "model.safetensors.index.json", - } - ) - assert hf_files == expected_hf_files - offline_converted_model_path = convert_dcp_to_hf( os.path.join(tmp_dir, "test_hf_and_dcp"), os.path.join(tmp_dir, "test_hf_and_dcp-hf-offline"), @@ -423,18 +334,11 @@ def test_convert_dcp_to_hf(policy, num_gpus): offline_converted_model_path ) - online_converted_model = AutoModelForCausalLM.from_pretrained( - os.path.join(tmp_dir, "test_hf_and_dcp-hf") - ) original_model = AutoModelForCausalLM.from_pretrained( simple_policy_config["model_name"] ) - ## make sure both conversions results in the same state dict - check_dict_equality( - online_converted_model.state_dict(), offline_converted_model.state_dict() - ) - # Ensure the offline one is different from the original + # Ensure the offline checkpoint is different from the original assert_recursive_dict_different( offline_converted_model.state_dict(), original_model.state_dict() ) From e882334b5c68498e302b4f5a0992d23bc3253396 Mon Sep 17 00:00:00 2001 From: KiddoZhu Date: Wed, 7 May 2025 21:07:26 -0700 Subject: [PATCH 3/7] rewrite code & tool use as environments Signed-off-by: KiddoZhu --- nemo_rl/environments/code_environment.py | 252 +++++++++++++++++++ nemo_rl/environments/tools/retriever.py | 107 ++++++++ nemo_rl/experience/rollouts.py | 2 + nemo_rl/tools/generation.py | 2 +- tests/unit/experience/test_code.py | 298 +++++++++++++++++++++++ tests/unit/experience/test_retriever.py | 159 ++++++++++++ 6 files changed, 819 insertions(+), 1 deletion(-) create mode 100644 nemo_rl/environments/code_environment.py create mode 100644 nemo_rl/environments/tools/retriever.py create mode 100644 tests/unit/experience/test_code.py create mode 100644 tests/unit/experience/test_retriever.py diff --git a/nemo_rl/environments/code_environment.py b/nemo_rl/environments/code_environment.py new file mode 100644 index 0000000000..325733fa37 --- /dev/null +++ b/nemo_rl/environments/code_environment.py @@ -0,0 +1,252 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import ast +import builtins +import os +import re +from io import IOBase +from types import ModuleType +from copy import copy +from collections.abc import Mapping, Sequence, Set +from tempfile import TemporaryDirectory +from contextlib import contextmanager +from typing import Any, Dict, List, Optional, Tuple, TypedDict + +import ray +import torch +from pprint import pformat + +from nemo_rl.data.interfaces import LLMMessageLogType +from nemo_rl.distributed.batched_data_dict import BatchedDataDict +from nemo_rl.distributed.virtual_cluster import PY_EXECUTABLES +from nemo_rl.environments.utils import chunk_list_to_workers +from nemo_rl.environments.interfaces import EnvironmentInterface, EnvironmentReturn +from nemo_rl.tools.interfaces import ToolInterface + + +class CodeEnvConfig(TypedDict): + num_workers: int + # whether to terminate the execution after expression evaluation + # if you want to execute multiple rounds of code, set this to False + # and wrap CodeEnvironment in another environment that terminates the generation + terminate_on_evaluation: bool + + +class CodeEnvMetadata(TypedDict): + context: Dict[str, Any] # Hold functions and variables defined in the code + working_dir: str # Working directory for file operations + + +@ray.remote +class CodeExecutionWorker: + DEFAULT_PY_EXECUTABLE = PY_EXECUTABLES.SYSTEM + """Helper class to process individual code execution steps.""" + + def __init__(self): + # Create sandbox with safe builtins + builtin_dict = {k: getattr(builtins, k) for k in dir(builtins)} + builtin_dict["open"] = self.safe_open + builtin_dict["__import__"] = self.safe_import + self.sandbox = {"__builtins__": builtin_dict} + + def sanitize(self, obj: Any) -> Any: + # TODO: better handling of unpicklable objects: custom __getstate__ and __setstate__ + # recursively remove all file objects as they are not picklable by ray + if isinstance(obj, (IOBase, ModuleType)): + # replace unpickable objects with a string representation + return repr(obj) + if isinstance(obj, Mapping): + return obj.__class__({self.sanitize(k): self.sanitize(v) for k, v in obj.items()}) + if isinstance(obj, Sequence) and not isinstance(obj, str): + return obj.__class__(self.sanitize(v) for v in obj) + if hasattr(obj, "__dict__"): + new_obj = copy(obj) + new_obj.__dict__ = {self.sanitize(k): self.sanitize(v) for k, v in obj.__dict__.items()} + return new_obj + return obj + + def format_result(self, result: Any, code: Optional[str] = None, lookahead: Optional[str] = None) -> str: + if result is None: + # no return value + return "" + result = pformat(result) + multiline = (code and "\n" in code) or "\n" in result + if multiline: + # multi-line format + result = f"\n\n\n{result}\n" + else: + # inline format + result = f"{result}" + if lookahead: + if result.startswith(lookahead): + # The generation may look like "\n" if ">\n" is a single token. + # We trim \n from the result if the model has already generated it. 
+ result = result[len(lookahead):] + return result + + def execute(self, message_batch: str, metadata_batch: List[CodeEnvMetadata]) -> str: + """Execute code in a sandboxed environment.""" + results = [] + terminateds = [] + + for message, metadata in zip(message_batch, metadata_batch): + match = re.search(rf"(.*)(.*)", message, re.DOTALL) + if not match: + results.append("") + terminateds.append(False) + continue + + code, lookahead = match.groups() + tree = ast.parse(code) + + if tree.body and isinstance(tree.body[-1], ast.Expr): + # Interactive mode + exec_code = ast.unparse(tree.body[:-1]) + eval_code = ast.unparse(tree.body[-1]) + else: + # Silent mode + exec_code = code + eval_code = None + + result = None + terminated = False + with self.chdir(metadata["working_dir"]): + try: + # isolate the code in a sandbox + # capture local variables in metadata["context"] + exec(exec_code, self.sandbox, metadata["context"]) + if eval_code: + result = eval(eval_code, self.sandbox, metadata["context"]) + terminated = True + except Exception as err: + result = err + + result = self.format_result(result, code, lookahead) + results.append(result) + terminateds.append(terminated) + + observations = [{"role": "environment", "content": result} for result in results] + metadata_batch = self.sanitize(metadata_batch) + + return observations, terminateds, metadata_batch + + @contextmanager + def chdir(self, dir: str): + """Change to temporary directory for file operations.""" + current_dir = os.getcwd() + os.chdir(dir) + try: + yield + finally: + os.chdir(current_dir) + + def safe_open(self, file: str, *args, **kwargs): + """Safe version of open() that only allows access to temporary directory.""" + real_file = os.path.realpath(file) + working_dir = os.path.realpath(os.getcwd()) + if os.path.commonpath([real_file, working_dir]) != working_dir: + raise PermissionError("Access beyond the temporary working directory is blocked") + return open(file, *args, **kwargs) + + def safe_import(self, name: str, *args, **kwargs): + """Safe version of import that blocks risky modules.""" + risky_modules = { + "os", "shutil", # erase filesystem + "sys", "signal", # exit the current program + "socket", # network communication + "subprocess", "threading", "multiprocessing", # spawn threads or processes + "builtins", "importlib", # bypass current blockers + } + if name in risky_modules: + raise PermissionError("Importing system and network modules is blocked") + return builtins.__import__(name, *args, **kwargs) + + +@ray.remote +class CodeEnvironment(EnvironmentInterface): + DEFAULT_PY_EXECUTABLE = PY_EXECUTABLES.SYSTEM + """Code execution environment that maintains state between steps.""" + + def __init__(self, cfg: CodeEnvConfig): + self.cfg = cfg + self.num_workers = cfg["num_workers"] + self.terminate_on_evaluation = cfg["terminate_on_evaluation"] + self.workers = [ + CodeExecutionWorker.options( + runtime_env={"py_executable": CodeExecutionWorker.DEFAULT_PY_EXECUTABLE} + ).remote() + for _ in range(self.num_workers) + ] + + def step( + self, + message_log_batch: List[LLMMessageLogType], + metadata_batch: List[CodeEnvMetadata], + ) -> EnvironmentReturn: + """Process a batch of code execution steps.""" + message_batch = [ml[-1]["content"] for ml in message_log_batch] + chunked_message_batch = chunk_list_to_workers( + message_batch, self.num_workers + ) + chunked_metadata_batch = chunk_list_to_workers( + metadata_batch, self.num_workers + ) + + # Process each chunk in parallel + futures = [ + 
self.workers[i].execute.remote(message_chunk, metadata_chunk) + for i, (message_chunk, metadata_chunk) in enumerate( + zip(chunked_message_batch, chunked_metadata_batch) + ) + ] + + results = ray.get(futures) + + # Unpack results + observations = [] + terminateds = [] + new_metadata_batch = [] + + for obs, term, meta in results: + observations += obs + terminateds += term + new_metadata_batch += meta + + if self.terminate_on_evaluation: + terminated_tensor = torch.tensor(terminateds, dtype=torch.bool) + else: + terminated_tensor = torch.zeros(len(terminateds), dtype=torch.bool) + rewards_tensor = torch.zeros_like(terminated_tensor, dtype=torch.float32) + + next_stop_strings = [[""]] * len(message_log_batch) + + return EnvironmentReturn( + observations=observations, + metadata=new_metadata_batch, + next_stop_strings=next_stop_strings, + rewards=rewards_tensor, + terminateds=terminated_tensor, + ) + + def shutdown(self): + # shutdown all workers + for worker in self.workers: + ray.kill(worker) + + def global_post_process_and_metrics( + self, batch: BatchedDataDict + ) -> Tuple[BatchedDataDict, dict]: + """Compute metrics for the batch.""" + # No specific metrics for code execution + return batch, {} \ No newline at end of file diff --git a/nemo_rl/environments/tools/retriever.py b/nemo_rl/environments/tools/retriever.py new file mode 100644 index 0000000000..40da06f092 --- /dev/null +++ b/nemo_rl/environments/tools/retriever.py @@ -0,0 +1,107 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
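For reference, the CodeEnvironment above is driven batch-wise: step() reads only the last message of every log, hands each chunk to a worker, and carries per-sample state in the metadata's context dict, so variables defined in one turn are visible in the next. A minimal sketch of that flow; the <code>...</code> delimiter is an assumption for illustration (the actual tag string is not visible in this patch text), while the metadata shape and attribute access match the unit tests below.

    from tempfile import TemporaryDirectory

    import ray

    from nemo_rl.environments.code_environment import CodeEnvironment, CodeEnvMetadata

    env = CodeEnvironment.remote({"num_workers": 1, "terminate_on_evaluation": True})

    scratch = TemporaryDirectory()  # keep the handle alive for the whole episode
    metadata = [CodeEnvMetadata(context={}, working_dir=scratch.name)]

    # Turn 1: an assignment only -> "silent mode"; nothing is reported,
    # but `x` is stored in the returned metadata's context.
    turn1 = [[{"role": "assistant", "content": "<code>x = 21</code>"}]]  # tag assumed
    out1 = ray.get(env.step.remote(turn1, metadata))

    # Turn 2: a trailing expression -> "interactive mode"; it is evaluated
    # against the carried-over context and returned as the observation.
    turn2 = [[{"role": "assistant", "content": "<code>x * 2</code>"}]]
    out2 = ray.get(env.step.remote(turn2, out1.metadata))
    print(out2.observations[0]["content"])  # expected to contain 42

    ray.kill(env)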
+from typing import Any, Dict, List, TypedDict + +import re +import ray +import torch +from datasets import load_dataset + +from nemo_rl.data.interfaces import LLMMessageLogType +from nemo_rl.distributed.batched_data_dict import BatchedDataDict +from nemo_rl.environments.interfaces import EnvironmentInterface, EnvironmentReturn +from nemo_rl.tools.tools import BM25Retriever + + +class RAGEnvConfig(TypedDict): + dataset_name: str # Name of the dataset to load + dataset_split: str # Split of the dataset to use + text_column: str # Column name containing the text to retrieve + num_results: int # Number of documents to retrieve + k1: float # BM25 parameter + b: float # BM25 parameter + device: str # Device to compute BM25 + + +@ray.remote +class RAGEnvironment(EnvironmentInterface): + """RAG environment that uses BM25 for document retrieval.""" + + def __init__(self, cfg: RAGEnvConfig): + self.cfg = cfg + + # Load dataset + dataset = load_dataset(cfg["dataset_name"], split=cfg["dataset_split"]) + documents = [sample[cfg["text_column"]] for sample in dataset] + + # Initialize BM25 retriever + self.retriever = BM25Retriever( + documents=documents, + num_result=cfg["num_results"], + k1=cfg["k1"], + b=cfg["b"], + device=cfg["device"], + ) + + def format_result(self, retrieved_docs: List[str]) -> str: + result = "\n" + for i, doc in enumerate(retrieved_docs): + result += f"<{i+1}>\n{doc}\n\n" + result += "\n" + return result + + def step( + self, + message_log_batch: List[LLMMessageLogType], + metadata_batch: List[Dict[str, Any]], + ) -> EnvironmentReturn: + """Process a batch of retrieval steps.""" + # Extract queries from the last message in each log + messages = [ml[-1]["content"] for ml in message_log_batch] + + # Retrieve documents for each query + results = [] + for message in messages: + match = re.search(rf"(.*)", message, re.DOTALL) + if not match: + results.append({"role": "environment", "content": "No retrieval query found!"}) + continue + query = match.group(1) + retrieved_docs = self.retriever(query) + result = self.format_result(retrieved_docs) + results.append({"role": "environment", "content": result}) + + batch_size = len(message_log_batch) + rewards_tensor = torch.zeros(batch_size, dtype=torch.float32) + terminated_tensor = torch.ones(batch_size, dtype=torch.bool) + next_stop_strings = [[""]] * batch_size + + return EnvironmentReturn( + observations=results, + metadata=metadata_batch, + next_stop_strings=next_stop_strings, + rewards=rewards_tensor, + terminateds=terminated_tensor, + ) + + def shutdown(self): + """Clean up resources.""" + pass + + def global_post_process_and_metrics( + self, batch: BatchedDataDict + ) -> tuple[BatchedDataDict, dict]: + """Compute metrics for the batch.""" + # No specific metrics for RAG + return batch, {} diff --git a/nemo_rl/experience/rollouts.py b/nemo_rl/experience/rollouts.py index 567add0dfc..5304e44cb9 100644 --- a/nemo_rl/experience/rollouts.py +++ b/nemo_rl/experience/rollouts.py @@ -304,6 +304,8 @@ def run_multi_turn_rollout( tokenized_obs = tokenizer( env_obs_content, return_tensors="pt", add_special_tokens=False )["input_ids"][0] + # tokenizer returns torch.float32 when env_obs_content is empty + tokenized_obs = tokenized_obs.to(dtype=torch.int64) # check if new message overflows max_seq_len if ( diff --git a/nemo_rl/tools/generation.py b/nemo_rl/tools/generation.py index f50dbfe3ae..06f2f966b3 100644 --- a/nemo_rl/tools/generation.py +++ b/nemo_rl/tools/generation.py @@ -150,7 +150,7 @@ def generate_with_code_and_tools( if result is 
None: # no return value result = "" - new_results.append(result) + new_results.extend(result) continue result = pformat(result) if "\n" in expr or "\n" in result: diff --git a/tests/unit/experience/test_code.py b/tests/unit/experience/test_code.py new file mode 100644 index 0000000000..1fb37464af --- /dev/null +++ b/tests/unit/experience/test_code.py @@ -0,0 +1,298 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +import ray +import torch +from tempfile import TemporaryDirectory +from typing import List, Dict, Any +from transformers import AutoTokenizer + +from nemo_rl.data.interfaces import LLMMessageLogType +from nemo_rl.distributed.batched_data_dict import BatchedDataDict +from nemo_rl.distributed.virtual_cluster import RayVirtualCluster +from nemo_rl.environments.code_environment import CodeEnvironment, CodeEnvConfig, CodeEnvMetadata +from nemo_rl.experience.rollouts import run_multi_turn_rollout +from nemo_rl.models.generation.interfaces import configure_generation_config +from nemo_rl.models.generation.vllm import VllmConfig, VllmGeneration +from nemo_rl.models.policy.hf_policy import HfPolicy, PolicyConfig + +MODEL_NAME = "meta-llama/Llama-3.2-1B" + +cfg: CodeEnvConfig = { + "num_workers": 2, + "terminate_on_evaluation": True, +} + +# Define basic vLLM test config +basic_vllm_test_config: VllmConfig = { + "backend": "vllm", + "model_name": MODEL_NAME, + "tokenizer_name": None, + "dtype": "bfloat16", + "max_new_tokens": 100, + "temperature": 1.0, + "top_p": 1.0, + "top_k": None, + "stop_token_ids": None, + "stop_strings": None, + "vllm_cfg": { + "tensor_parallel_size": 1, + "gpu_memory_utilization": 0.3, + "max_model_len": 1024, + }, +} + +basic_hf_test_config: PolicyConfig = { + "model_name": MODEL_NAME, + "tokenizer_name": None, + "generation_batch_size": 1, + "generation": { + "backend": "hf", + "max_new_tokens": 100, + "temperature": 1.0, + "top_p": 1.0, + "top_k": None, + "stop_token_ids": None, + "stop_strings": None, + }, + # Required training parameters + "train_global_batch_size": 1, + "train_micro_batch_size": 1, + "learning_rate": 5e-6, + "logprob_batch_size": 1, + "max_new_tokens": 16, + "do_sample": False, + "precision": "float32", + "activation_checkpointing_enabled": False, + "fsdp_offload_enabled": False, + "optimizer": { + "name": "torch.optim.AdamW", + "kwargs": { + "lr": 5e-6, + "weight_decay": 0.01, + "betas": [0.9, 0.999], + "eps": 1e-8, + }, + }, + "dtensor_cfg": {"enabled": False}, +} + + +@pytest.fixture(scope="function") +def code_env(): + """Create a code environment for testing.""" + try: + env_actor = CodeEnvironment.remote(cfg) + yield env_actor + finally: + if env_actor: + ray.kill(env_actor) + + +@pytest.fixture(scope="function") +def tokenizer(): + """Loads the tokenizer for the tests.""" + print(f"Loading tokenizer: {MODEL_NAME}") + tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True) + if tokenizer.pad_token is None: + tokenizer.pad_token = 
tokenizer.eos_token + print( + f"Tokenizer loaded. Pad token: {tokenizer.pad_token} (ID: {tokenizer.pad_token_id}), EOS token: {tokenizer.eos_token} (ID: {tokenizer.eos_token_id})" + ) + return tokenizer + + +@pytest.fixture(scope="function") +def cluster(): + """Create a virtual cluster for testing.""" + cluster_instance = None + cluster_name = f"test-code-cluster-{id(cluster_instance)}" + print(f"\nCreating virtual cluster '{cluster_name}'...") + try: + cluster_instance = RayVirtualCluster( + name=cluster_name, + bundle_ct_per_node_list=[1], + use_gpus=True, + num_gpus_per_node=1, + max_colocated_worker_groups=2, + ) + yield cluster_instance + finally: + print(f"\nCleaning up cluster '{cluster_name}'...") + if cluster_instance: + cluster_instance.shutdown() + + +def test_untrusted_code(code_env): + """Test whether the code environment can block untrusted code.""" + codes = [ + "with open('allowed_file.txt', 'w') as fout:\n" + " fout.write('some content')\n" + "with open('allowed_file.txt') as fin:\n" + " content = fin.read()\n" + "content", + "with open('/etc/passwd', 'r') as fin:\n" + " fin.read()", + "import math\n" + "round(math.sqrt(8))", + "import os", + ] + results = [ + "\n\n\n'some content'\n", + "\n\n\nPermissionError('Access beyond the temporary working directory is blocked')\n", + "\n\n\n3\n", + "PermissionError('Importing system and network modules is blocked')", + ] + + message_log_batch = [ + [{"role": "user", "content": f"{code}"}] for code in codes + ] + temp_dirs = [TemporaryDirectory() for _ in codes] + metadata_batch = [ + CodeEnvMetadata( + context={}, working_dir=temp_dir.name, + ) for temp_dir in temp_dirs + ] + + # Execute the code + output = ray.get(code_env.step.remote(message_log_batch, metadata_batch)) + responses = [obs["content"] for obs in output.observations] + + assert responses == results, f"Got wrong output {responses}" + + +def test_vllm_execute_code(cluster, tokenizer, code_env): + """Test that vLLM can call the code executor.""" + # Prepare test data + codes = [ + "x = 3; y = 4\nThis is some regular text.\nx + y\n", + "\ndef f(x):\n return x * x\n\nf(2)\n\n", + ] + results = ["7", "\n\n4\n"] + + # Create message logs + message_logs = [] + metadata_batch = [] + temp_dirs = [] + for code in codes: + # Tokenize the message content + prompt = code * 4 + token_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)["input_ids"][0] + temp_dir = TemporaryDirectory() + message_logs.append([{"role": "user", "content": prompt, "token_ids": token_ids}]) + metadata_batch.append(CodeEnvMetadata(context={}, working_dir=temp_dir.name)) + temp_dirs.append(temp_dir) + + # Create initial batch + initial_batch = BatchedDataDict({ + "message_log": message_logs, + "extra_env_info": metadata_batch, + "task_name": ["code_execution"] * len(codes), + "stop_strings": [[""]] * len(codes), + }) + + # Create vLLM generation + vllm_config = basic_vllm_test_config.copy() + vllm_config = configure_generation_config(vllm_config, tokenizer, is_eval=True) + vllm_generation = VllmGeneration(cluster, vllm_config) + + # Create code environment + task_to_env = {"code_execution": code_env} + + # Run rollout + vllm_generation.prepare_for_generation() + final_batch, _ = run_multi_turn_rollout( + policy_generation=vllm_generation, + input_batch=initial_batch, + tokenizer=tokenizer, + task_to_env=task_to_env, + max_seq_len=256, + max_rollout_turns=2, + greedy=True, + ) + vllm_generation.finish_generation() + + # Check results + for i, msg_log in 
enumerate(final_batch["message_log"]): + # Get the last message which should contain the result + last_msg = msg_log[-1] + assert last_msg["role"] == "environment" + assert last_msg["content"] == results[i], f"Expected {results[i]}, got {last_msg['content']}" + + +def test_hf_execute_code(cluster, tokenizer, code_env): + """Test that Huggingface models can call the code executor.""" + # Prepare test data + codes = [ + "x = 3; y = 4\nThis is some regular text.\nx + y\n", + "\ndef f(x):\n return x * x\n\nf(2)\n\n", + ] + results = ["7", "\n\n4\n"] + + # Create message logs + message_logs = [] + metadata_batch = [] + temp_dirs = [] + for code in codes: + # Tokenize the message content + prompt = code * 4 + token_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)["input_ids"][0] + temp_dir = TemporaryDirectory() + message_logs.append([{"role": "user", "content": prompt, "token_ids": token_ids}]) + metadata_batch.append(CodeEnvMetadata(context={}, working_dir=temp_dir.name)) + temp_dirs.append(temp_dir) + + # Create initial batch + initial_batch = BatchedDataDict({ + "message_log": message_logs, + "extra_env_info": metadata_batch, + "task_name": ["code_execution"] * len(codes), + "stop_strings": [[""]] * len(codes), + }) + + # Create HF policy + hf_config = basic_hf_test_config.copy() + hf_config["generation"] = configure_generation_config( + hf_config["generation"], + tokenizer, + ) + hf_policy = HfPolicy( + cluster, hf_config, tokenizer, init_reference_model=False, init_optimizer=False + ) + + # Create code environment + task_to_env = {"code_execution": code_env} + + # Run rollout + hf_policy.prepare_for_generation() + final_batch, _ = run_multi_turn_rollout( + policy_generation=hf_policy, + input_batch=initial_batch, + tokenizer=tokenizer, + task_to_env=task_to_env, + max_seq_len=256, + max_rollout_turns=2, + greedy=True, + ) + hf_policy.finish_generation() + + # Check results + for i, msg_log in enumerate(final_batch["message_log"]): + # Get the last message which should contain the result + last_msg = msg_log[-1] + assert last_msg["role"] == "environment" + assert last_msg["content"] == results[i], f"Expected {results[i]}, got {last_msg['content']}" + + diff --git a/tests/unit/experience/test_retriever.py b/tests/unit/experience/test_retriever.py new file mode 100644 index 0000000000..359b7898c5 --- /dev/null +++ b/tests/unit/experience/test_retriever.py @@ -0,0 +1,159 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
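One detail worth calling out in the tests above: each sample gets its own scratch directory, and the TemporaryDirectory objects are deliberately kept in a list. CodeEnvMetadata stores only the path, so if the handle were dropped, its finalizer would delete the directory while the worker still chdir()s into it. A short sketch of the intended pattern (names and the code tag are illustrative assumptions):

    from tempfile import TemporaryDirectory

    prompts = ["<code>1 + 1</code>"]  # code tag assumed, as above

    temp_dirs = [TemporaryDirectory() for _ in prompts]  # keep references alive
    metadata_batch = [
        {"context": {}, "working_dir": d.name} for d in temp_dirs
    ]

    # ... run env.step / run_multi_turn_rollout with metadata_batch ...

    for d in temp_dirs:
        d.cleanup()  # release the scratch directories once the rollout is done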
+ +import pytest +import ray +import torch +from transformers import AutoTokenizer + +from nemo_rl.data.interfaces import LLMMessageLogType +from nemo_rl.distributed.batched_data_dict import BatchedDataDict +from nemo_rl.distributed.virtual_cluster import RayVirtualCluster +from nemo_rl.environments.tools.retriever import RAGEnvironment, RAGEnvConfig +from nemo_rl.experience.rollouts import run_multi_turn_rollout +from nemo_rl.models.generation.interfaces import configure_generation_config +from nemo_rl.models.generation.vllm import VllmConfig, VllmGeneration + +MODEL_NAME = "meta-llama/Llama-3.2-1B" + +cfg: RAGEnvConfig = { + "dataset_name": "rahular/simple-wikipedia", + "dataset_split": "train", + "text_column": "text", + "num_results": 1, + "k1": 1.5, + "b": 0.75, + "device": "cpu", +} + +# Define basic vLLM test config +basic_vllm_test_config: VllmConfig = { + "backend": "vllm", + "model_name": MODEL_NAME, + "tokenizer_name": None, + "dtype": "bfloat16", + "max_new_tokens": 100, + "temperature": 1.0, + "top_p": 1.0, + "top_k": None, + "stop_token_ids": None, + "stop_strings": None, + "vllm_cfg": { + "tensor_parallel_size": 1, + "gpu_memory_utilization": 0.3, + "max_model_len": 1024, + }, +} + + +@pytest.fixture(scope="function") +def rag_env(): + """Create a RAG environment for testing.""" + try: + env_actor = RAGEnvironment.remote(cfg) + yield env_actor + finally: + if env_actor: + ray.kill(env_actor) + + +@pytest.fixture(scope="function") +def tokenizer(): + """Loads the tokenizer for the tests.""" + print(f"Loading tokenizer: {MODEL_NAME}") + tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + print( + f"Tokenizer loaded. Pad token: {tokenizer.pad_token} (ID: {tokenizer.pad_token_id}), EOS token: {tokenizer.eos_token} (ID: {tokenizer.eos_token_id})" + ) + return tokenizer + + +@pytest.fixture(scope="function") +def cluster(): + """Create a virtual cluster for testing.""" + cluster_instance = None + cluster_name = f"test-rag-cluster-{id(cluster_instance)}" + print(f"\nCreating virtual cluster '{cluster_name}'...") + try: + cluster_instance = RayVirtualCluster( + name=cluster_name, + bundle_ct_per_node_list=[1], + use_gpus=True, + num_gpus_per_node=1, + max_colocated_worker_groups=2, + ) + yield cluster_instance + finally: + print(f"\nCleaning up cluster '{cluster_name}'...") + if cluster_instance: + cluster_instance.shutdown() + + +def test_vllm_retrieve(cluster, tokenizer, rag_env): + """Test that vLLM can use the RAG environment for document retrieval.""" + # Prepare test data + queries = [ + "Jen-Hsun Huang\n", + ] + expected_results = [ + "\n<1>\n" + "Nvidia was established in 1993 by Jen-Hsun Huang, Curtis Priem, and Chris Malachowsky. 
In 2000 Nvidia took intellectual possession of 3dfx, one of the biggest GPU producers in 1990s.\n" + "\n\n", + ] + + # Create message logs + message_logs = [] + for query in queries: + # Tokenize the message content + prompt = query * 4 + token_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)["input_ids"][0] + message_logs.append([{"role": "user", "content": prompt, "token_ids": token_ids}]) + + # Create initial batch + initial_batch = BatchedDataDict({ + "message_log": message_logs, + "extra_env_info": [{}] * len(queries), # No metadata needed for RAG + "task_name": ["document_retrieval"] * len(queries), + "stop_strings": [[""]] * len(queries), + }) + + # Create vLLM generation + vllm_config = basic_vllm_test_config.copy() + vllm_config = configure_generation_config(vllm_config, tokenizer, is_eval=True) + vllm_generation = VllmGeneration(cluster, vllm_config) + + # Create RAG environment + task_to_env = {"document_retrieval": rag_env} + + # Run rollout + vllm_generation.prepare_for_generation() + final_batch, _ = run_multi_turn_rollout( + policy_generation=vllm_generation, + input_batch=initial_batch, + tokenizer=tokenizer, + task_to_env=task_to_env, + max_seq_len=256, + max_rollout_turns=1, + greedy=True, + ) + vllm_generation.finish_generation() + + # Check results + for i, msg_log in enumerate(final_batch["message_log"]): + # Get the last message which should contain the result + last_msg = msg_log[-1] + assert last_msg["role"] == "environment" + assert last_msg["content"] == expected_results[i], f"Expected {expected_results[i]}, got {last_msg['content']}" From 487cd94a29946b97d6acbcd9ccfaae1d25673e2c Mon Sep 17 00:00:00 2001 From: KiddoZhu Date: Thu, 8 May 2025 15:50:51 -0700 Subject: [PATCH 4/7] fix lint check Signed-off-by: KiddoZhu --- nemo_rl/environments/code_environment.py | 75 ++++++++++--------- nemo_rl/environments/tools/retriever.py | 24 +++--- nemo_rl/models/policy/fsdp1_policy_worker.py | 2 +- tests/unit/experience/test_code.py | 78 ++++++++++++-------- tests/unit/experience/test_retriever.py | 30 +++++--- 5 files changed, 121 insertions(+), 88 deletions(-) diff --git a/nemo_rl/environments/code_environment.py b/nemo_rl/environments/code_environment.py index 325733fa37..cb72c1532e 100644 --- a/nemo_rl/environments/code_environment.py +++ b/nemo_rl/environments/code_environment.py @@ -15,24 +15,22 @@ import builtins import os import re +from collections.abc import Mapping, Sequence +from contextlib import contextmanager +from copy import copy from io import IOBase +from pprint import pformat from types import ModuleType -from copy import copy -from collections.abc import Mapping, Sequence, Set -from tempfile import TemporaryDirectory -from contextlib import contextmanager from typing import Any, Dict, List, Optional, Tuple, TypedDict import ray import torch -from pprint import pformat from nemo_rl.data.interfaces import LLMMessageLogType from nemo_rl.distributed.batched_data_dict import BatchedDataDict from nemo_rl.distributed.virtual_cluster import PY_EXECUTABLES -from nemo_rl.environments.utils import chunk_list_to_workers from nemo_rl.environments.interfaces import EnvironmentInterface, EnvironmentReturn -from nemo_rl.tools.interfaces import ToolInterface +from nemo_rl.environments.utils import chunk_list_to_workers class CodeEnvConfig(TypedDict): @@ -67,16 +65,22 @@ def sanitize(self, obj: Any) -> Any: # replace unpickable objects with a string representation return repr(obj) if isinstance(obj, Mapping): - return obj.__class__({self.sanitize(k): 
self.sanitize(v) for k, v in obj.items()}) + return obj.__class__( + {self.sanitize(k): self.sanitize(v) for k, v in obj.items()} + ) if isinstance(obj, Sequence) and not isinstance(obj, str): return obj.__class__(self.sanitize(v) for v in obj) if hasattr(obj, "__dict__"): new_obj = copy(obj) - new_obj.__dict__ = {self.sanitize(k): self.sanitize(v) for k, v in obj.__dict__.items()} + new_obj.__dict__ = { + self.sanitize(k): self.sanitize(v) for k, v in obj.__dict__.items() + } return new_obj return obj - def format_result(self, result: Any, code: Optional[str] = None, lookahead: Optional[str] = None) -> str: + def format_result( + self, result: Any, code: Optional[str] = None, lookahead: Optional[str] = None + ) -> str: if result is None: # no return value return "" @@ -92,16 +96,16 @@ def format_result(self, result: Any, code: Optional[str] = None, lookahead: Opti if result.startswith(lookahead): # The generation may look like "\n" if ">\n" is a single token. # We trim \n from the result if the model has already generated it. - result = result[len(lookahead):] + result = result[len(lookahead) :] return result def execute(self, message_batch: str, metadata_batch: List[CodeEnvMetadata]) -> str: """Execute code in a sandboxed environment.""" results = [] terminateds = [] - + for message, metadata in zip(message_batch, metadata_batch): - match = re.search(rf"(.*)(.*)", message, re.DOTALL) + match = re.search(r"(.*)(.*)", message, re.DOTALL) if not match: results.append("") terminateds.append(False) @@ -118,7 +122,7 @@ def execute(self, message_batch: str, metadata_batch: List[CodeEnvMetadata]) -> # Silent mode exec_code = code eval_code = None - + result = None terminated = False with self.chdir(metadata["working_dir"]): @@ -131,16 +135,18 @@ def execute(self, message_batch: str, metadata_batch: List[CodeEnvMetadata]) -> terminated = True except Exception as err: result = err - + result = self.format_result(result, code, lookahead) results.append(result) terminateds.append(terminated) - - observations = [{"role": "environment", "content": result} for result in results] + + observations = [ + {"role": "environment", "content": result} for result in results + ] metadata_batch = self.sanitize(metadata_batch) - + return observations, terminateds, metadata_batch - + @contextmanager def chdir(self, dir: str): """Change to temporary directory for file operations.""" @@ -156,17 +162,24 @@ def safe_open(self, file: str, *args, **kwargs): real_file = os.path.realpath(file) working_dir = os.path.realpath(os.getcwd()) if os.path.commonpath([real_file, working_dir]) != working_dir: - raise PermissionError("Access beyond the temporary working directory is blocked") + raise PermissionError( + "Access beyond the temporary working directory is blocked" + ) return open(file, *args, **kwargs) def safe_import(self, name: str, *args, **kwargs): """Safe version of import that blocks risky modules.""" risky_modules = { - "os", "shutil", # erase filesystem - "sys", "signal", # exit the current program + "os", + "shutil", # erase filesystem + "sys", + "signal", # exit the current program "socket", # network communication - "subprocess", "threading", "multiprocessing", # spawn threads or processes - "builtins", "importlib", # bypass current blockers + "subprocess", + "threading", + "multiprocessing", # spawn threads or processes + "builtins", + "importlib", # bypass current blockers } if name in risky_modules: raise PermissionError("Importing system and network modules is blocked") @@ -196,12 +209,8 @@ def step( ) -> 
EnvironmentReturn: """Process a batch of code execution steps.""" message_batch = [ml[-1]["content"] for ml in message_log_batch] - chunked_message_batch = chunk_list_to_workers( - message_batch, self.num_workers - ) - chunked_metadata_batch = chunk_list_to_workers( - metadata_batch, self.num_workers - ) + chunked_message_batch = chunk_list_to_workers(message_batch, self.num_workers) + chunked_metadata_batch = chunk_list_to_workers(metadata_batch, self.num_workers) # Process each chunk in parallel futures = [ @@ -222,7 +231,7 @@ def step( observations += obs terminateds += term new_metadata_batch += meta - + if self.terminate_on_evaluation: terminated_tensor = torch.tensor(terminateds, dtype=torch.bool) else: @@ -240,7 +249,7 @@ def step( ) def shutdown(self): - # shutdown all workers + # shutdown all workers for worker in self.workers: ray.kill(worker) @@ -249,4 +258,4 @@ def global_post_process_and_metrics( ) -> Tuple[BatchedDataDict, dict]: """Compute metrics for the batch.""" # No specific metrics for code execution - return batch, {} \ No newline at end of file + return batch, {} diff --git a/nemo_rl/environments/tools/retriever.py b/nemo_rl/environments/tools/retriever.py index 40da06f092..655f5a801f 100644 --- a/nemo_rl/environments/tools/retriever.py +++ b/nemo_rl/environments/tools/retriever.py @@ -11,9 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import re from typing import Any, Dict, List, TypedDict -import re import ray import torch from datasets import load_dataset @@ -40,11 +40,11 @@ class RAGEnvironment(EnvironmentInterface): def __init__(self, cfg: RAGEnvConfig): self.cfg = cfg - + # Load dataset dataset = load_dataset(cfg["dataset_name"], split=cfg["dataset_split"]) documents = [sample[cfg["text_column"]] for sample in dataset] - + # Initialize BM25 retriever self.retriever = BM25Retriever( documents=documents, @@ -53,14 +53,14 @@ def __init__(self, cfg: RAGEnvConfig): b=cfg["b"], device=cfg["device"], ) - + def format_result(self, retrieved_docs: List[str]) -> str: result = "\n" for i, doc in enumerate(retrieved_docs): - result += f"<{i+1}>\n{doc}\n\n" + result += f"<{i + 1}>\n{doc}\n\n" result += "\n" return result - + def step( self, message_log_batch: List[LLMMessageLogType], @@ -69,24 +69,26 @@ def step( """Process a batch of retrieval steps.""" # Extract queries from the last message in each log messages = [ml[-1]["content"] for ml in message_log_batch] - + # Retrieve documents for each query results = [] for message in messages: - match = re.search(rf"(.*)", message, re.DOTALL) + match = re.search(r"(.*)", message, re.DOTALL) if not match: - results.append({"role": "environment", "content": "No retrieval query found!"}) + results.append( + {"role": "environment", "content": "No retrieval query found!"} + ) continue query = match.group(1) retrieved_docs = self.retriever(query) result = self.format_result(retrieved_docs) results.append({"role": "environment", "content": result}) - + batch_size = len(message_log_batch) rewards_tensor = torch.zeros(batch_size, dtype=torch.float32) terminated_tensor = torch.ones(batch_size, dtype=torch.bool) next_stop_strings = [[""]] * batch_size - + return EnvironmentReturn( observations=results, metadata=metadata_batch, diff --git a/nemo_rl/models/policy/fsdp1_policy_worker.py b/nemo_rl/models/policy/fsdp1_policy_worker.py index 19523394ad..1d057271a4 100644 --- 
a/nemo_rl/models/policy/fsdp1_policy_worker.py +++ b/nemo_rl/models/policy/fsdp1_policy_worker.py @@ -289,7 +289,7 @@ def train( logits = self.model.lm_head(outputs.last_hidden_state) else: logits = outputs.logits - + # Divide logits by temperature if "generation" in self.cfg and self.cfg["generation"] is not None: logits.div_(self.cfg["generation"]["temperature"]) diff --git a/tests/unit/experience/test_code.py b/tests/unit/experience/test_code.py index 1fb37464af..7e06ffc45a 100644 --- a/tests/unit/experience/test_code.py +++ b/tests/unit/experience/test_code.py @@ -12,17 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. +from tempfile import TemporaryDirectory + import pytest import ray -import torch -from tempfile import TemporaryDirectory -from typing import List, Dict, Any from transformers import AutoTokenizer -from nemo_rl.data.interfaces import LLMMessageLogType from nemo_rl.distributed.batched_data_dict import BatchedDataDict from nemo_rl.distributed.virtual_cluster import RayVirtualCluster -from nemo_rl.environments.code_environment import CodeEnvironment, CodeEnvConfig, CodeEnvMetadata +from nemo_rl.environments.code_environment import ( + CodeEnvConfig, + CodeEnvironment, + CodeEnvMetadata, +) from nemo_rl.experience.rollouts import run_multi_turn_rollout from nemo_rl.models.generation.interfaces import configure_generation_config from nemo_rl.models.generation.vllm import VllmConfig, VllmGeneration @@ -143,10 +145,8 @@ def test_untrusted_code(code_env): "with open('allowed_file.txt') as fin:\n" " content = fin.read()\n" "content", - "with open('/etc/passwd', 'r') as fin:\n" - " fin.read()", - "import math\n" - "round(math.sqrt(8))", + "with open('/etc/passwd', 'r') as fin:\n fin.read()", + "import math\nround(math.sqrt(8))", "import os", ] results = [ @@ -162,8 +162,10 @@ def test_untrusted_code(code_env): temp_dirs = [TemporaryDirectory() for _ in codes] metadata_batch = [ CodeEnvMetadata( - context={}, working_dir=temp_dir.name, - ) for temp_dir in temp_dirs + context={}, + working_dir=temp_dir.name, + ) + for temp_dir in temp_dirs ] # Execute the code @@ -189,19 +191,25 @@ def test_vllm_execute_code(cluster, tokenizer, code_env): for code in codes: # Tokenize the message content prompt = code * 4 - token_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)["input_ids"][0] + token_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)[ + "input_ids" + ][0] temp_dir = TemporaryDirectory() - message_logs.append([{"role": "user", "content": prompt, "token_ids": token_ids}]) + message_logs.append( + [{"role": "user", "content": prompt, "token_ids": token_ids}] + ) metadata_batch.append(CodeEnvMetadata(context={}, working_dir=temp_dir.name)) temp_dirs.append(temp_dir) # Create initial batch - initial_batch = BatchedDataDict({ - "message_log": message_logs, - "extra_env_info": metadata_batch, - "task_name": ["code_execution"] * len(codes), - "stop_strings": [[""]] * len(codes), - }) + initial_batch = BatchedDataDict( + { + "message_log": message_logs, + "extra_env_info": metadata_batch, + "task_name": ["code_execution"] * len(codes), + "stop_strings": [[""]] * len(codes), + } + ) # Create vLLM generation vllm_config = basic_vllm_test_config.copy() @@ -229,7 +237,9 @@ def test_vllm_execute_code(cluster, tokenizer, code_env): # Get the last message which should contain the result last_msg = msg_log[-1] assert last_msg["role"] == "environment" - assert last_msg["content"] == results[i], 
f"Expected {results[i]}, got {last_msg['content']}" + assert last_msg["content"] == results[i], ( + f"Expected {results[i]}, got {last_msg['content']}" + ) def test_hf_execute_code(cluster, tokenizer, code_env): @@ -248,19 +258,25 @@ def test_hf_execute_code(cluster, tokenizer, code_env): for code in codes: # Tokenize the message content prompt = code * 4 - token_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)["input_ids"][0] + token_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)[ + "input_ids" + ][0] temp_dir = TemporaryDirectory() - message_logs.append([{"role": "user", "content": prompt, "token_ids": token_ids}]) + message_logs.append( + [{"role": "user", "content": prompt, "token_ids": token_ids}] + ) metadata_batch.append(CodeEnvMetadata(context={}, working_dir=temp_dir.name)) temp_dirs.append(temp_dir) # Create initial batch - initial_batch = BatchedDataDict({ - "message_log": message_logs, - "extra_env_info": metadata_batch, - "task_name": ["code_execution"] * len(codes), - "stop_strings": [[""]] * len(codes), - }) + initial_batch = BatchedDataDict( + { + "message_log": message_logs, + "extra_env_info": metadata_batch, + "task_name": ["code_execution"] * len(codes), + "stop_strings": [[""]] * len(codes), + } + ) # Create HF policy hf_config = basic_hf_test_config.copy() @@ -293,6 +309,6 @@ def test_hf_execute_code(cluster, tokenizer, code_env): # Get the last message which should contain the result last_msg = msg_log[-1] assert last_msg["role"] == "environment" - assert last_msg["content"] == results[i], f"Expected {results[i]}, got {last_msg['content']}" - - + assert last_msg["content"] == results[i], ( + f"Expected {results[i]}, got {last_msg['content']}" + ) diff --git a/tests/unit/experience/test_retriever.py b/tests/unit/experience/test_retriever.py index 359b7898c5..0c059c1453 100644 --- a/tests/unit/experience/test_retriever.py +++ b/tests/unit/experience/test_retriever.py @@ -14,13 +14,11 @@ import pytest import ray -import torch from transformers import AutoTokenizer -from nemo_rl.data.interfaces import LLMMessageLogType from nemo_rl.distributed.batched_data_dict import BatchedDataDict from nemo_rl.distributed.virtual_cluster import RayVirtualCluster -from nemo_rl.environments.tools.retriever import RAGEnvironment, RAGEnvConfig +from nemo_rl.environments.tools.retriever import RAGEnvConfig, RAGEnvironment from nemo_rl.experience.rollouts import run_multi_turn_rollout from nemo_rl.models.generation.interfaces import configure_generation_config from nemo_rl.models.generation.vllm import VllmConfig, VllmGeneration @@ -119,16 +117,22 @@ def test_vllm_retrieve(cluster, tokenizer, rag_env): for query in queries: # Tokenize the message content prompt = query * 4 - token_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)["input_ids"][0] - message_logs.append([{"role": "user", "content": prompt, "token_ids": token_ids}]) + token_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)[ + "input_ids" + ][0] + message_logs.append( + [{"role": "user", "content": prompt, "token_ids": token_ids}] + ) # Create initial batch - initial_batch = BatchedDataDict({ - "message_log": message_logs, - "extra_env_info": [{}] * len(queries), # No metadata needed for RAG - "task_name": ["document_retrieval"] * len(queries), - "stop_strings": [[""]] * len(queries), - }) + initial_batch = BatchedDataDict( + { + "message_log": message_logs, + "extra_env_info": [{}] * len(queries), # No metadata needed for RAG + "task_name": 
["document_retrieval"] * len(queries), + "stop_strings": [[""]] * len(queries), + } + ) # Create vLLM generation vllm_config = basic_vllm_test_config.copy() @@ -156,4 +160,6 @@ def test_vllm_retrieve(cluster, tokenizer, rag_env): # Get the last message which should contain the result last_msg = msg_log[-1] assert last_msg["role"] == "environment" - assert last_msg["content"] == expected_results[i], f"Expected {expected_results[i]}, got {last_msg['content']}" + assert last_msg["content"] == expected_results[i], ( + f"Expected {expected_results[i]}, got {last_msg['content']}" + ) From 9563db26575989fe5b4f0c7dd85a29036a84ccbc Mon Sep 17 00:00:00 2001 From: KiddoZhu Date: Mon, 9 Jun 2025 14:17:43 -0700 Subject: [PATCH 5/7] clean up old impleementation & test passed Signed-off-by: KiddoZhu --- nemo_rl/environments/tools/retriever.py | 99 ++++- nemo_rl/tools/__init__.py | 0 nemo_rl/tools/generation.py | 236 ------------ nemo_rl/tools/interfaces.py | 20 - nemo_rl/tools/tools.py | 199 ---------- .../test_code_environment.py} | 10 +- .../test_retriever.py | 9 +- tests/unit/tools/test_tools.py | 351 ------------------ 8 files changed, 113 insertions(+), 811 deletions(-) delete mode 100644 nemo_rl/tools/__init__.py delete mode 100644 nemo_rl/tools/generation.py delete mode 100644 nemo_rl/tools/interfaces.py delete mode 100644 nemo_rl/tools/tools.py rename tests/unit/{experience/test_code.py => environments/test_code_environment.py} (96%) rename tests/unit/{experience => environments}/test_retriever.py (95%) delete mode 100644 tests/unit/tools/test_tools.py diff --git a/nemo_rl/environments/tools/retriever.py b/nemo_rl/environments/tools/retriever.py index 655f5a801f..cc62d8a2af 100644 --- a/nemo_rl/environments/tools/retriever.py +++ b/nemo_rl/environments/tools/retriever.py @@ -12,16 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. import re +import math from typing import Any, Dict, List, TypedDict +from collections import Counter +from tqdm import tqdm import ray import torch from datasets import load_dataset +from transformers import AutoTokenizer from nemo_rl.data.interfaces import LLMMessageLogType from nemo_rl.distributed.batched_data_dict import BatchedDataDict from nemo_rl.environments.interfaces import EnvironmentInterface, EnvironmentReturn -from nemo_rl.tools.tools import BM25Retriever class RAGEnvConfig(TypedDict): @@ -34,6 +37,100 @@ class RAGEnvConfig(TypedDict): device: str # Device to compute BM25 +class BM25Retriever: + """Sparse BM25 retriever. + + Args: + documents: list of documents to retrieve from + num_result: retrieve top-k documents + k1: parameter of BM25. Values in [1.2, 2.0] are recommended. + b: parameter of BM25. 0.75 is recommended. 
+ device: device to compute BM25 + """ + + def __init__( + self, + documents: List[str] = None, + num_result: int = 10, + k1: float = 1.5, + b: float = 0.75, + device: str = "cpu", + ): + if documents is None: + dataset = load_dataset("wikimedia/wikipedia", "20231101.en") + self.documents = [sample["text"] for sample in dataset["train"]] + else: + self.documents = documents + self.tokenizer = AutoTokenizer.from_pretrained( + "bert-base-uncased", use_fast=True + ) + self.num_result = num_result + self.k1 = k1 + self.b = b + self.device = device + self.corpus_size = len(self.documents) + self.vocab_size = self.tokenizer.vocab_size + + self.build_index() + + def build_index(self): + doc_ids = [] + token_ids = [] + tfs = [] + lengths = [] + + for i, document in enumerate( + tqdm(self.documents, "Build index for BM25Retriever") + ): + input_ids = self.tokenizer.encode(document, add_special_tokens=False) + token2cnt = Counter(input_ids) + token_ids += token2cnt.keys() + tfs += token2cnt.values() + doc_ids += [i] * len(token2cnt) + lengths.append(len(input_ids)) + + avg_dl = sum(lengths) / self.corpus_size + for i, doc_id in enumerate(doc_ids): + tfs[i] = ( + tfs[i] + * (self.k1 + 1) + / (tfs[i] + self.k1 * (1 - self.b + self.b * lengths[doc_id] / avg_dl)) + ) + + indices = torch.tensor([doc_ids, token_ids], device=self.device) + values = torch.tensor(tfs, device=self.device) + self.doc_tfs = torch.sparse_coo_tensor( + indices, values, (self.corpus_size, self.vocab_size) + ) + + idfs = [0] * self.vocab_size + token2df = Counter(token_ids) + for token_id, df in token2df.items(): + idfs[token_id] = math.log((self.corpus_size - df + 0.5) / (df + 0.5) + 1) + self.idfs = idfs + + def __call__(self, query: str) -> List[str]: + input_ids = self.tokenizer.encode(query, add_special_tokens=False) + token2cnt = Counter(input_ids) + token_ids = [] + query_idfs = [] + for token_id, query_tf in token2cnt.items(): + token_ids.append(token_id) + query_idfs.append(query_tf * self.idfs[token_id]) + + indices = torch.tensor([token_ids, [0] * len(token_ids)], device=self.device) + values = torch.tensor(query_idfs, device=self.device) + query_idfs = torch.sparse_coo_tensor(indices, values, (self.vocab_size, 1)) + + scores = torch.sparse.mm(self.doc_tfs, query_idfs) + scores = scores.to_dense().squeeze(-1) + results = [] + for i in scores.topk(k=self.num_result).indices.tolist(): + results.append(self.documents[i]) + + return results + + @ray.remote class RAGEnvironment(EnvironmentInterface): """RAG environment that uses BM25 for document retrieval.""" diff --git a/nemo_rl/tools/__init__.py b/nemo_rl/tools/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/nemo_rl/tools/generation.py b/nemo_rl/tools/generation.py deleted file mode 100644 index 06f2f966b3..0000000000 --- a/nemo_rl/tools/generation.py +++ /dev/null @@ -1,236 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
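The BM25Retriever above precomputes the document side of BM25 as a sparse corpus-by-vocabulary matrix of saturated term frequencies tf*(k1+1)/(tf + k1*(1 - b + b*|d|/avgdl)), with idf(t) = ln((N - df(t) + 0.5)/(df(t) + 0.5) + 1); a query is then scored by multiplying that matrix with its idf- and count-weighted term vector. A minimal usage sketch with a toy corpus (the documents and query are illustrative only):

    from nemo_rl.environments.tools.retriever import BM25Retriever

    docs = [
        "Nvidia designs graphics processing units.",
        "BM25 is a bag-of-words ranking function.",
        "The Amazon is the largest rainforest on Earth.",
    ]
    retriever = BM25Retriever(documents=docs, num_result=2, k1=1.5, b=0.75, device="cpu")
    top_docs = retriever("Which company designs GPUs?")  # list of the 2 best-scoring documents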
-import re -import warnings -from pprint import pformat -from typing import Dict - -import ray -import torch -from torch.nn.utils.rnn import pad_sequence -from transformers import AutoTokenizer - -from nemo_rl.distributed.batched_data_dict import BatchedDataDict -from nemo_rl.models.generation.interfaces import ( - GenerationDatumSpec, - GenerationInterface, - GenerationOutputSpec, -) -from nemo_rl.tools.interfaces import ToolInterface -from nemo_rl.tools.tools import StatefulCodeExecutor - -LOGIT_INFINITY = 1000 - - -def generate_with_code_and_tools( - policy: GenerationInterface, - input_batch: BatchedDataDict[GenerationDatumSpec], - tokenizer: AutoTokenizer, - execute_code: bool = True, - tool_map: Dict[str, ToolInterface] = {}, - tag: str = "", - result_tag: str = "", - *args, - **kwargs, -) -> BatchedDataDict[GenerationOutputSpec]: - """Generate a batch of data with code execution and tool use. - - All code execution and tool calls in the generation will be executed on-the-fly, - of which the results will be appended to the output. Multiple code execution and tool calls - is supported. - - This function can be used as a drop-in replacement of `policy.generate()`. - - Args: - policy: policy to generate from. Can be either vllm or HuggingFace backend - input_batch: BatchedDataDict containing input_ids and input_lengths tensors - tokenizer: tokenizer from the pretrained model - execute_code: whether to execute code - tool_map: tools that the model can use - tag: xml tag to detect code snippet - result_tag: xml tag to output the result - *args, **kwargs: arguments and keyword arguments accepted by `policy.generate()` - """ - if tool_map and not execute_code: - warnings.warn( - "Tool use requires code execution, but code execution is disabled. All the tools will be ignored." - ) - - batch = input_batch.copy() - start_tag = tag - end_tag = tag.replace("<", " 0: - generation_outputs = policy.generate(active_batch, *args, **kwargs) - - output_ids = generation_outputs["output_ids"] - # only contains logprobs for newly generated tokens - logprobs = generation_outputs["logprobs"] - input_lengths = active_batch["input_lengths"] - total_lengths = generation_outputs["unpadded_sequence_lengths"] - if old_logprobs is not None: - # restore logprobs for tokens generated in previous iterations - for i, input_length in enumerate(input_lengths): - logprobs[i, :input_length] = old_logprobs[i, :input_length] - - # extract newly generated tokens - generated_ids = [] - for output_id, input_length, total_length in zip( - output_ids, input_lengths, total_lengths - ): - generated_ids.append(output_id[input_length:total_length]) - - generated_texts = tokenizer.batch_decode( - generated_ids, skip_special_tokens=True - ) - - is_code = [] - exprs = [] - lookaheads = [] - # parse newly generated texts - for i, (generated_text, active_index, total_length) in enumerate( - zip(generated_texts, active_indices, total_lengths) - ): - match = re.search( - rf"{start_tag}(.*){end_tag}(.*)", generated_text, re.DOTALL - ) - if match: - # stop is caused by code execution - # expr takes everything between and , including new lines - # lookahead takes everything after - is_code.append(i) - expr, lookahead = match.groups() - exprs.append(expr) - lookaheads.append(lookahead) - else: - # stop is not caused by code execution - # e.g. 
eos token, max length or other stop strings - completed_output_ids[active_index] = output_ids[i, :total_length] - completed_logprobs[active_index] = logprobs[i, :total_length] - if len(is_code) == 0: - break - - # execute all code in this batch - futures = [] - for i, expr, lookahead in zip(is_code, exprs, lookaheads): - active_index = active_indices[i] - # dispatch code to a pre-allocated executor for that sample - # so that functions and variables will be carried over - future = executors[active_index].__call__.remote(expr) - futures.append(future) - results = ray.get(futures) - - new_results = [] - for result in results: - if result is None: - # no return value - result = "" - new_results.extend(result) - continue - result = pformat(result) - if "\n" in expr or "\n" in result: - # multi-line format - result = f"\n\n{result_start}\n{result}\n{result_end}" - else: - # inline format - result = f"{result_start}{result}{result_end}" - if lookahead: - if result.startswith(lookahead): - # The generation may look like "\n" if ">\n" is a single token. - # We trim \n from the result if the model has already generated it. - result = result[len(lookahead) :] - else: - warnings.warn( - f"Expect the generation to stop at {repr(end_tag)}, but got {repr(end_tag + lookahead)}. " - "This is because some characters are merged into a single token by the tokenizer. " - "These extra characters will be kept in the generation." - ) - new_results.append(result) - - encodings = tokenizer( - new_results, - add_special_tokens=False, - padding=True, - padding_side="right", - return_tensors="pt", - ) - result_ids = encodings["input_ids"] - result_lengths = encodings["attention_mask"].sum(dim=1).to(torch.int32) - - is_code = torch.tensor(is_code) - # reduce active batch to those containing code - active_batch = active_batch.select_indices(is_code) - active_indices = active_indices[is_code] - output_ids = output_ids[is_code] - logprobs = logprobs[is_code] - total_lengths = total_lengths[is_code] - # max length before appending results - old_max_length = total_lengths.max() - # max length after appending results - new_max_length = (total_lengths + result_lengths).max() - new_output_ids = torch.full( - (len(active_indices), new_max_length), - tokenizer.pad_token_id, - dtype=output_ids.dtype, - ) - new_logprobs = torch.full( - (len(active_indices), new_max_length), 0, dtype=logprobs.dtype - ) - new_output_ids[:, :old_max_length] = output_ids[:, :old_max_length] - new_logprobs[:, :old_max_length] = logprobs[:, :old_max_length] - - # append results to generation - for i, (old_length, result_length) in enumerate( - zip(total_lengths, result_lengths) - ): - new_length = old_length + result_length - new_output_ids[i, old_length:new_length] = result_ids[i, :result_length] - new_logprobs[i, old_length:new_length] = LOGIT_INFINITY - - active_batch["input_ids"] = new_output_ids - active_batch["input_lengths"] = total_lengths + result_lengths - old_logprobs = new_logprobs - - output_ids = pad_sequence( - completed_output_ids, - batch_first=True, - padding_value=tokenizer.pad_token_id, - padding_side="right", - ) - logprobs = pad_sequence( - completed_logprobs, batch_first=True, padding_value=0.0, padding_side="right" - ) - total_lengths = torch.tensor([len(output_id) for output_id in completed_output_ids]) - generation_lengths = total_lengths - input_batch["input_lengths"] - - return { - "output_ids": output_ids, - "logprobs": logprobs, - "generation_lengths": generation_lengths, - "unpadded_sequence_lengths": total_lengths, - } 
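Both the CodeExecutionWorker.execute() added earlier and the StatefulCodeExecutor removed below decide whether a snippet has a value to report with the same ast check: if the last top-level node is an expression, everything before it is exec'd and the trailing expression is eval'd ("interactive mode"); otherwise the whole snippet is exec'd silently. A self-contained sketch of that split (function and variable names are illustrative):

    import ast

    def run_interactively(code: str, context: dict):
        """Exec the statements, but eval a trailing expression so its value can be returned."""
        tree = ast.parse(code)
        if tree.body and isinstance(tree.body[-1], ast.Expr):
            exec(ast.unparse(tree.body[:-1]), {}, context)
            return eval(ast.unparse(tree.body[-1]), {}, context)
        exec(code, {}, context)
        return None

    ctx = {}
    print(run_interactively("x = 3\ny = 4\nx + y", ctx))  # 7    (interactive mode)
    print(run_interactively("z = x * y", ctx))            # None (silent mode; z lands in ctx)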
diff --git a/nemo_rl/tools/interfaces.py b/nemo_rl/tools/interfaces.py deleted file mode 100644 index a37a3b6f10..0000000000 --- a/nemo_rl/tools/interfaces.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from abc import ABC, abstractmethod - - -class ToolInterface(ABC): - @abstractmethod - def __call__(self, *args, **kwargs): - pass diff --git a/nemo_rl/tools/tools.py b/nemo_rl/tools/tools.py deleted file mode 100644 index 1af1977926..0000000000 --- a/nemo_rl/tools/tools.py +++ /dev/null @@ -1,199 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import ast -import builtins -import math -import os -import tempfile -from collections import Counter -from contextlib import contextmanager -from typing import Any, Dict, List, Optional - -import ray -import torch -from datasets import load_dataset -from tqdm import tqdm -from transformers import AutoTokenizer - -from nemo_rl.tools.interfaces import ToolInterface - - -@ray.remote -class StatefulCodeExecutor(ToolInterface): - """Stateful code executor. - - Args: - context: classes, functions and variables accessible to the code executor. - By passing tools in context, the code executor also serves tool use. 
- """ - - def __init__(self, context: Dict[str, Any] = {}): - self.context = context.copy() - self.tmp_dir = tempfile.TemporaryDirectory() - - builtin_dict = {k: getattr(builtins, k) for k in dir(builtins)} - builtin_dict["open"] = self.safe_open - builtin_dict["__import__"] = self.safe_import - self.sandbox = {"__builtins__": builtin_dict} - - def __call__(self, code: str) -> Optional[str]: - tree = ast.parse(code) - - if tree.body and isinstance(tree.body[-1], ast.Expr): - # interactive mode - code = ast.unparse(tree.body[:-1]) - expr = ast.unparse(tree.body[-1]) - else: - # silent mode - expr = None - - try: - # isolate the code in a sandbox with globals={} - # capture local variables in self.context - with self.change_temporary_dir(): - exec(code, self.sandbox, self.context) - if expr: - return eval(expr, self.sandbox, self.context) - except Exception as err: - return err - - @contextmanager - def change_temporary_dir(self): - current_dir = os.getcwd() - os.chdir(self.tmp_dir.name) - try: - yield - finally: - os.chdir(current_dir) - - def safe_open(self, file, *args, **kwargs): - real_file = os.path.realpath(file) - tmp_dir = os.path.realpath(self.tmp_dir.name) - if os.path.commonpath([real_file, tmp_dir]) != tmp_dir: - # real_file is not inside tmp_dir - raise PermissionError( - "Access beyond the temporary working directory is blocked" - ) - return open(file, *args, **kwargs) - - def safe_import(self, name, *args, **kwargs): - risky_modules = { - "os", - "shutil", # erase filesystem - "sys", - "signal", # exit the current program - "socket", # network communication - "subprocess", - "threading", - "multiprocessing", # spawn threads or processes - "builtins", - "importlib", # bypass current blockers - } - if name in risky_modules: - raise PermissionError("Importing system and network modules is blocked") - return builtins.__import__(name, *args, **kwargs) - - -class BM25Retriever(ToolInterface): - """Sparse BM25 retriever. - - Args: - documents: list of documents to retrieve from - num_result: retrieve top-k documents - k1: parameter of BM25. Values in [1.2, 2.0] are recommended. - b: parameter of BM25. 0.75 is recommended. 
- device: device to compute BM25 - """ - - def __init__( - self, - documents: List[str] = None, - num_result: int = 10, - k1: float = 1.5, - b: float = 0.75, - device: str = "cpu", - ): - if documents is None: - dataset = load_dataset("wikimedia/wikipedia", "20231101.en") - self.documents = [sample["text"] for sample in dataset["train"]] - else: - self.documents = documents - self.tokenizer = AutoTokenizer.from_pretrained( - "bert-base-uncased", use_fast=True - ) - self.num_result = num_result - self.k1 = k1 - self.b = b - self.device = device - self.corpus_size = len(self.documents) - self.vocab_size = self.tokenizer.vocab_size - - self.build_index() - - def build_index(self): - doc_ids = [] - token_ids = [] - tfs = [] - lengths = [] - - for i, document in enumerate( - tqdm(self.documents, "Build index for BM25Retriever") - ): - input_ids = self.tokenizer.encode(document, add_special_tokens=False) - token2cnt = Counter(input_ids) - token_ids += token2cnt.keys() - tfs += token2cnt.values() - doc_ids += [i] * len(token2cnt) - lengths.append(len(input_ids)) - - avg_dl = sum(lengths) / self.corpus_size - for i, doc_id in enumerate(doc_ids): - tfs[i] = ( - tfs[i] - * (self.k1 + 1) - / (tfs[i] + self.k1 * (1 - self.b + self.b * lengths[doc_id] / avg_dl)) - ) - - indices = torch.tensor([doc_ids, token_ids], device=self.device) - values = torch.tensor(tfs, device=self.device) - self.doc_tfs = torch.sparse_coo_tensor( - indices, values, (self.corpus_size, self.vocab_size) - ) - - idfs = [0] * self.vocab_size - token2df = Counter(token_ids) - for token_id, df in token2df.items(): - idfs[token_id] = math.log((self.corpus_size - df + 0.5) / (df + 0.5) + 1) - self.idfs = idfs - - def __call__(self, query: str) -> List[str]: - input_ids = self.tokenizer.encode(query, add_special_tokens=False) - token2cnt = Counter(input_ids) - token_ids = [] - query_idfs = [] - for token_id, query_tf in token2cnt.items(): - token_ids.append(token_id) - query_idfs.append(query_tf * self.idfs[token_id]) - - indices = torch.tensor([token_ids, [0] * len(token_ids)], device=self.device) - values = torch.tensor(query_idfs, device=self.device) - query_idfs = torch.sparse_coo_tensor(indices, values, (self.vocab_size, 1)) - - scores = torch.sparse.mm(self.doc_tfs, query_idfs) - scores = scores.to_dense().squeeze(-1) - results = [] - for i in scores.topk(k=self.num_result).indices.tolist(): - results.append(self.documents[i]) - - return results diff --git a/tests/unit/experience/test_code.py b/tests/unit/environments/test_code_environment.py similarity index 96% rename from tests/unit/experience/test_code.py rename to tests/unit/environments/test_code_environment.py index 7e06ffc45a..732edf682b 100644 --- a/tests/unit/experience/test_code.py +++ b/tests/unit/environments/test_code_environment.py @@ -26,7 +26,7 @@ CodeEnvMetadata, ) from nemo_rl.experience.rollouts import run_multi_turn_rollout -from nemo_rl.models.generation.interfaces import configure_generation_config +from nemo_rl.models.generation import configure_generation_config from nemo_rl.models.generation.vllm import VllmConfig, VllmGeneration from nemo_rl.models.policy.hf_policy import HfPolicy, PolicyConfig @@ -50,9 +50,14 @@ "stop_token_ids": None, "stop_strings": None, "vllm_cfg": { + "async_engine": False, + "precision": "bfloat16", "tensor_parallel_size": 1, - "gpu_memory_utilization": 0.3, + "pipeline_parallel_size": 1, "max_model_len": 1024, + "disable_log_stats": True, + "disable_log_requests": True, + "gpu_memory_utilization": 0.6, }, } @@ -89,6 +94,7 @@ }, 
}, "dtensor_cfg": {"enabled": False}, + "dynamic_batching": {"enabled": False}, } diff --git a/tests/unit/experience/test_retriever.py b/tests/unit/environments/test_retriever.py similarity index 95% rename from tests/unit/experience/test_retriever.py rename to tests/unit/environments/test_retriever.py index 0c059c1453..457dc5bc4b 100644 --- a/tests/unit/experience/test_retriever.py +++ b/tests/unit/environments/test_retriever.py @@ -20,7 +20,7 @@ from nemo_rl.distributed.virtual_cluster import RayVirtualCluster from nemo_rl.environments.tools.retriever import RAGEnvConfig, RAGEnvironment from nemo_rl.experience.rollouts import run_multi_turn_rollout -from nemo_rl.models.generation.interfaces import configure_generation_config +from nemo_rl.models.generation import configure_generation_config from nemo_rl.models.generation.vllm import VllmConfig, VllmGeneration MODEL_NAME = "meta-llama/Llama-3.2-1B" @@ -48,9 +48,14 @@ "stop_token_ids": None, "stop_strings": None, "vllm_cfg": { + "async_engine": False, + "precision": "bfloat16", "tensor_parallel_size": 1, - "gpu_memory_utilization": 0.3, + "pipeline_parallel_size": 1, "max_model_len": 1024, + "disable_log_stats": True, + "disable_log_requests": True, + "gpu_memory_utilization": 0.6, }, } diff --git a/tests/unit/tools/test_tools.py b/tests/unit/tools/test_tools.py deleted file mode 100644 index a22ca03c3c..0000000000 --- a/tests/unit/tools/test_tools.py +++ /dev/null @@ -1,351 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from copy import deepcopy - -import pytest -import ray -import torch -from datasets import load_dataset -from transformers import AutoTokenizer - -from nemo_rl.distributed.batched_data_dict import BatchedDataDict -from nemo_rl.distributed.virtual_cluster import RayVirtualCluster -from nemo_rl.models.generation.interfaces import configure_generation_config -from nemo_rl.models.generation.vllm import VllmConfig, VllmGeneration -from nemo_rl.models.policy.hf_policy import HfPolicy, PolicyConfig -from nemo_rl.tools.generation import generate_with_code_and_tools -from nemo_rl.tools.tools import BM25Retriever, StatefulCodeExecutor - -MODEL_NAME = "meta-llama/Llama-3.2-1B" - - -# Define basic vLLM test config -basic_vllm_test_config: VllmConfig = { - "backend": "vllm", - "model_name": MODEL_NAME, - "tokenizer_name": None, - "dtype": "bfloat16", - "max_new_tokens": 100, - "temperature": 1.0, - "top_p": 1.0, - "top_k": None, - "stop_token_ids": None, - "stop_strings": None, - "vllm_cfg": { - "tensor_parallel_size": 1, - "gpu_memory_utilization": 0.3, - "max_model_len": 1024, - }, -} - -basic_hf_test_config: PolicyConfig = { - "model_name": MODEL_NAME, - "tokenizer_name": None, - "generation_batch_size": 1, - "generation": { - "backend": "hf", - "max_new_tokens": 100, - "temperature": 1.0, - "top_p": 1.0, - "top_k": None, - "stop_token_ids": None, - "stop_strings": None, - }, - # Required training parameters - "train_global_batch_size": 1, - "train_micro_batch_size": 1, - "learning_rate": 5e-6, - "logprob_batch_size": 1, - "max_new_tokens": 16, - "do_sample": False, - "precision": "float32", - "activation_checkpointing_enabled": False, - "fsdp_offload_enabled": False, - "optimizer": { - "name": "torch.optim.AdamW", - "kwargs": { - "lr": 5e-6, - "weight_decay": 0.01, - "betas": [0.9, 0.999], - "eps": 1e-8, - }, - }, - "dtensor_cfg": {"enabled": False}, -} - - -@pytest.fixture(scope="module") -def cluster(): - """Create a virtual cluster for testing.""" - # Create a cluster with 1 node that has 1 GPU bundles - virtual_cluster = RayVirtualCluster( - bundle_ct_per_node_list=[1], # 1 node with 1 GPU bundle - use_gpus=True, - max_colocated_worker_groups=2, - num_gpus_per_node=1, # Use available GPUs - name="vllm-test-cluster", - ) - yield virtual_cluster - virtual_cluster.shutdown() - - -@pytest.fixture(scope="function") -def tokenizer(): - """Loads the tokenizer for the tests.""" - print(f"Loading tokenizer: {MODEL_NAME}") - tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True) - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - print( - f"Tokenizer loaded. 
Pad token: {tokenizer.pad_token} (ID: {tokenizer.pad_token_id}), EOS token: {tokenizer.eos_token} (ID: {tokenizer.eos_token_id})" - ) - return tokenizer - - -def test_vllm_execute_code(cluster, tokenizer): - """Test that vLLM can call the code executor.""" - # Prepare test data - codes = [ - "x = 3; y = 4\nThis is some regular text.\nx + y\n", - "\ndef f(x):\n return x * x\n\nf(2)\n\n", - ] - results = ["7", "\n\n4\n"] - results = [code + result for code, result in zip(codes, results)] - - test_prompts = [code * 4 for code in codes] - encodings = tokenizer( - test_prompts, - padding="max_length", - max_length=1024, - return_tensors="pt", - padding_side="right", - ) - input_lengths = encodings["attention_mask"].sum(dim=1).to(torch.int32) - batch = BatchedDataDict( - { - "input_ids": encodings["input_ids"], - "input_lengths": input_lengths, - } - ) - - # Create separate configs for each policy - vllm_config = basic_vllm_test_config.copy() - vllm_config = configure_generation_config(vllm_config, tokenizer, is_eval=True) - - # Create vLLM generation - vllm_generation = VllmGeneration(cluster, vllm_config) - - # Generate and check result - outputs = generate_with_code_and_tools( - vllm_generation, batch, tokenizer, greedy=True - ) - - all_output_ids = outputs["output_ids"] - logprobs = outputs["logprobs"] - input_lengths = outputs["unpadded_sequence_lengths"] - outputs["generation_lengths"] - output_lengths = outputs["unpadded_sequence_lengths"] - input_ids = [] - output_ids = [] - for all_output_id, input_length, output_length in zip( - all_output_ids, input_lengths, output_lengths - ): - input_ids.append(all_output_id[:input_length]) - output_ids.append(all_output_id[input_length:output_length]) - indices = torch.arange(all_output_ids.shape[-1]) - input_lengths = input_lengths.unsqueeze(-1) - output_lengths = output_lengths.unsqueeze(-1) - is_generated = (indices >= input_lengths) & (indices < output_lengths) - - input_texts = tokenizer.batch_decode(input_ids, skip_special_tokens=True) - output_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True) - - assert input_texts == test_prompts, "Unexpected modification to input texts" - assert output_texts == results, f"Expect {results}, got wrong output {output_texts}" - assert (logprobs[~is_generated] == 0.0).all(), ( - "Unexpected log probabilities on input tokens or paddings" - ) - assert (logprobs[is_generated] != 0.0).all(), ( - "Generated tokens must have non-trivial log probabilities" - ) - - # Clean up - vllm_generation.shutdown() - - -def test_hf_execute_code(cluster, tokenizer): - """Test that Huggingface models can call the code executor.""" - # Prepare test data - codes = [ - "x = 3; y = 4\nThis is some regular text.\nx + y\n", - "\ndef f(x):\n return x * x\n\nf(2)\n\n", - ] - results = ["7", "\n\n4\n"] - results = [code + result for code, result in zip(codes, results)] - - test_prompts = [code * 4 for code in codes] - encodings = tokenizer( - test_prompts, - padding="max_length", - max_length=1024, - return_tensors="pt", - padding_side="right", - ) - input_lengths = encodings["attention_mask"].sum(dim=1).to(torch.int32) - batch = BatchedDataDict( - { - "input_ids": encodings["input_ids"], - "input_lengths": input_lengths, - } - ) - - # Create separate configs for each policy - hf_config = deepcopy(basic_hf_test_config) - hf_config["generation"] = configure_generation_config( - hf_config["generation"], - tokenizer, # is_eval=True - ) - - # Create vLLM generation - hf_policy = HfPolicy( - cluster, hf_config, tokenizer, 
init_reference_model=False, init_optimizer=False - ) - - # Generate and check result - outputs = generate_with_code_and_tools(hf_policy, batch, tokenizer, greedy=True) - - all_output_ids = outputs["output_ids"] - logprobs = outputs["logprobs"] - input_lengths = outputs["unpadded_sequence_lengths"] - outputs["generation_lengths"] - output_lengths = outputs["unpadded_sequence_lengths"] - input_ids = [] - output_ids = [] - for all_output_id, input_length, output_length in zip( - all_output_ids, input_lengths, output_lengths - ): - input_ids.append(all_output_id[:input_length]) - output_ids.append(all_output_id[input_length:output_length]) - indices = torch.arange(all_output_ids.shape[-1]) - input_lengths = input_lengths.unsqueeze(-1) - output_lengths = output_lengths.unsqueeze(-1) - is_generated = (indices >= input_lengths) & (indices < output_lengths) - - input_texts = tokenizer.batch_decode(input_ids, skip_special_tokens=True) - output_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True) - - assert input_texts == test_prompts, "Unexpected modification to input texts" - assert output_texts == results, f"Expect {results}, got wrong output {output_texts}" - assert (logprobs[~is_generated] == 0.0).all(), ( - "Unexpected log probabilities on input tokens or paddings" - ) - assert (logprobs[is_generated] != 0.0).all(), ( - "Generated tokens must have non-trivial log probabilities" - ) - - # Clean up - hf_policy.shutdown() - - -def test_untrusted_code(cluster): - """Test whether the code executor can block untrusted code.""" - executor = StatefulCodeExecutor.remote() - - # accessing temporary files shouldn't be blocked - code = ( - "with open('allowed_file.txt', 'w') as fout:\n" - " fout.write('some content')\n" - "with open('allowed_file.txt') as fin:\n" - " content = fin.read()\n" - "content" - ) - result = ray.get(executor.__call__.remote(code)) - assert result == "some content" - - # accessing other files should be blocked - code = "with open('/etc/passwd', 'r') as fin:\n fin.read()" - result = ray.get(executor.__call__.remote(code)) - assert isinstance(result, PermissionError) - - # importing non-sensitive modules shouldn't be blocked - code = "import math\nround(math.sqrt(8))" - result = ray.get(executor.__call__.remote(code)) - assert result == 3 - - # importing sensitive modules should be blocked - code = "import os" - result = ray.get(executor.__call__.remote(code)) - assert isinstance(result, PermissionError) - - -@pytest.mark.timeout(150) -def test_vllm_use_tool(cluster, tokenizer): - """Test that vLLM can use tool in the code executor.""" - # Prepare test data - codes = ["retrieve('Jen-Hsun Huang')\n"] - results = [ - "\n\n" - "['Nvidia was established in 1993 by Jen-Hsun Huang, Curtis Priem, and Chris '\n" - " 'Malachowsky. 
In 2000 Nvidia took intellectual possession of 3dfx, one of the '\n" - " 'biggest GPU producers in 1990s.']\n" - "" - ] - results = [code + result for code, result in zip(codes, results)] - - test_prompts = [code * 4 for code in codes] - encodings = tokenizer( - test_prompts, - padding="max_length", - max_length=1024, - return_tensors="pt", - padding_side="right", - ) - input_lengths = encodings["attention_mask"].sum(dim=1).to(torch.int32) - batch = BatchedDataDict( - { - "input_ids": encodings["input_ids"], - "input_lengths": input_lengths, - } - ) - - # Construct retriever - dataset = load_dataset("rahular/simple-wikipedia") - documents = [sample["text"] for sample in dataset["train"]] - tool_map = {"retrieve": BM25Retriever(documents, num_result=1)} - - # Create separate configs for each policy - vllm_config = basic_vllm_test_config.copy() - vllm_config = configure_generation_config(vllm_config, tokenizer, is_eval=True) - - # Create vLLM generation - vllm_generation = VllmGeneration(cluster, vllm_config) - - # Generate and check result - outputs = generate_with_code_and_tools( - vllm_generation, batch, tokenizer, tool_map=tool_map, greedy=True - ) - - all_output_ids = outputs["output_ids"] - input_lengths = outputs["unpadded_sequence_lengths"] - outputs["generation_lengths"] - output_lengths = outputs["unpadded_sequence_lengths"] - output_ids = [] - for all_output_id, input_length, output_length in zip( - all_output_ids, input_lengths, output_lengths - ): - output_ids.append(all_output_id[input_length:output_length]) - - output_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True) - - assert output_texts == results, f"Expect {results}, got wrong output {output_texts}" - - # Clean up - vllm_generation.shutdown() From 5bed0757c119b64344cbe1f60d6133dba3a6c5cf Mon Sep 17 00:00:00 2001 From: KiddoZhu Date: Mon, 28 Jul 2025 16:16:27 -0700 Subject: [PATCH 6/7] remove hf path Signed-off-by: KiddoZhu --- .../ray_actor_environment_registry.py | 2 + nemo_rl/environments/code_environment.py | 4 +- nemo_rl/environments/tools/retriever.py | 6 +- .../environments/test_code_environment.py | 113 +----------------- tests/unit/environments/test_retriever.py | 8 ++ 5 files changed, 20 insertions(+), 113 deletions(-) diff --git a/nemo_rl/distributed/ray_actor_environment_registry.py b/nemo_rl/distributed/ray_actor_environment_registry.py index 277619bb92..e300aec54b 100644 --- a/nemo_rl/distributed/ray_actor_environment_registry.py +++ b/nemo_rl/distributed/ray_actor_environment_registry.py @@ -21,7 +21,9 @@ "nemo_rl.models.policy.dtensor_policy_worker.DTensorPolicyWorker": PY_EXECUTABLES.VLLM, "nemo_rl.models.policy.megatron_policy_worker.MegatronPolicyWorker": PY_EXECUTABLES.MCORE, "nemo_rl.environments.math_environment.MathEnvironment": PY_EXECUTABLES.SYSTEM, + "nemo_rl.environments.code_environment.CodeEnvironment": PY_EXECUTABLES.SYSTEM, "nemo_rl.environments.games.sliding_puzzle.SlidingPuzzleEnv": PY_EXECUTABLES.SYSTEM, + "nemo_rl.environments.tools.retriever.RAGEnvironment": PY_EXECUTABLES.SYSTEM, } diff --git a/nemo_rl/environments/code_environment.py b/nemo_rl/environments/code_environment.py index cb72c1532e..c340d1980b 100644 --- a/nemo_rl/environments/code_environment.py +++ b/nemo_rl/environments/code_environment.py @@ -48,7 +48,6 @@ class CodeEnvMetadata(TypedDict): @ray.remote class CodeExecutionWorker: - DEFAULT_PY_EXECUTABLE = PY_EXECUTABLES.SYSTEM """Helper class to process individual code execution steps.""" def __init__(self): @@ -188,7 +187,6 @@ def safe_import(self, 
name: str, *args, **kwargs): @ray.remote class CodeEnvironment(EnvironmentInterface): - DEFAULT_PY_EXECUTABLE = PY_EXECUTABLES.SYSTEM """Code execution environment that maintains state between steps.""" def __init__(self, cfg: CodeEnvConfig): @@ -197,7 +195,7 @@ def __init__(self, cfg: CodeEnvConfig): self.terminate_on_evaluation = cfg["terminate_on_evaluation"] self.workers = [ CodeExecutionWorker.options( - runtime_env={"py_executable": CodeExecutionWorker.DEFAULT_PY_EXECUTABLE} + runtime_env={"py_executable": PY_EXECUTABLES.SYSTEM} ).remote() for _ in range(self.num_workers) ] diff --git a/nemo_rl/environments/tools/retriever.py b/nemo_rl/environments/tools/retriever.py index cc62d8a2af..4109a21a05 100644 --- a/nemo_rl/environments/tools/retriever.py +++ b/nemo_rl/environments/tools/retriever.py @@ -11,15 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import re import math -from typing import Any, Dict, List, TypedDict +import re from collections import Counter -from tqdm import tqdm +from typing import Any, Dict, List, TypedDict import ray import torch from datasets import load_dataset +from tqdm import tqdm from transformers import AutoTokenizer from nemo_rl.data.interfaces import LLMMessageLogType diff --git a/tests/unit/environments/test_code_environment.py b/tests/unit/environments/test_code_environment.py index 732edf682b..dd5b8de7a6 100644 --- a/tests/unit/environments/test_code_environment.py +++ b/tests/unit/environments/test_code_environment.py @@ -28,7 +28,6 @@ from nemo_rl.experience.rollouts import run_multi_turn_rollout from nemo_rl.models.generation import configure_generation_config from nemo_rl.models.generation.vllm import VllmConfig, VllmGeneration -from nemo_rl.models.policy.hf_policy import HfPolicy, PolicyConfig MODEL_NAME = "meta-llama/Llama-3.2-1B" @@ -58,43 +57,15 @@ "disable_log_stats": True, "disable_log_requests": True, "gpu_memory_utilization": 0.6, + "enforce_eager": "False", }, -} - -basic_hf_test_config: PolicyConfig = { - "model_name": MODEL_NAME, - "tokenizer_name": None, - "generation_batch_size": 1, - "generation": { - "backend": "hf", - "max_new_tokens": 100, - "temperature": 1.0, - "top_p": 1.0, - "top_k": None, - "stop_token_ids": None, - "stop_strings": None, - }, - # Required training parameters - "train_global_batch_size": 1, - "train_micro_batch_size": 1, - "learning_rate": 5e-6, - "logprob_batch_size": 1, - "max_new_tokens": 16, - "do_sample": False, - "precision": "float32", - "activation_checkpointing_enabled": False, - "fsdp_offload_enabled": False, - "optimizer": { - "name": "torch.optim.AdamW", - "kwargs": { - "lr": 5e-6, - "weight_decay": 0.01, - "betas": [0.9, 0.999], - "eps": 1e-8, + "colocated": { + "enabled": True, + "resources": { + "gpus_per_node": None, + "num_nodes": None, }, }, - "dtensor_cfg": {"enabled": False}, - "dynamic_batching": {"enabled": False}, } @@ -246,75 +217,3 @@ def test_vllm_execute_code(cluster, tokenizer, code_env): assert last_msg["content"] == results[i], ( f"Expected {results[i]}, got {last_msg['content']}" ) - - -def test_hf_execute_code(cluster, tokenizer, code_env): - """Test that Huggingface models can call the code executor.""" - # Prepare test data - codes = [ - "x = 3; y = 4\nThis is some regular text.\nx + y\n", - "\ndef f(x):\n return x * x\n\nf(2)\n\n", - ] - results = ["7", "\n\n4\n"] - - # Create message logs - message_logs = [] - metadata_batch = [] - 
temp_dirs = [] - for code in codes: - # Tokenize the message content - prompt = code * 4 - token_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)[ - "input_ids" - ][0] - temp_dir = TemporaryDirectory() - message_logs.append( - [{"role": "user", "content": prompt, "token_ids": token_ids}] - ) - metadata_batch.append(CodeEnvMetadata(context={}, working_dir=temp_dir.name)) - temp_dirs.append(temp_dir) - - # Create initial batch - initial_batch = BatchedDataDict( - { - "message_log": message_logs, - "extra_env_info": metadata_batch, - "task_name": ["code_execution"] * len(codes), - "stop_strings": [[""]] * len(codes), - } - ) - - # Create HF policy - hf_config = basic_hf_test_config.copy() - hf_config["generation"] = configure_generation_config( - hf_config["generation"], - tokenizer, - ) - hf_policy = HfPolicy( - cluster, hf_config, tokenizer, init_reference_model=False, init_optimizer=False - ) - - # Create code environment - task_to_env = {"code_execution": code_env} - - # Run rollout - hf_policy.prepare_for_generation() - final_batch, _ = run_multi_turn_rollout( - policy_generation=hf_policy, - input_batch=initial_batch, - tokenizer=tokenizer, - task_to_env=task_to_env, - max_seq_len=256, - max_rollout_turns=2, - greedy=True, - ) - hf_policy.finish_generation() - - # Check results - for i, msg_log in enumerate(final_batch["message_log"]): - # Get the last message which should contain the result - last_msg = msg_log[-1] - assert last_msg["role"] == "environment" - assert last_msg["content"] == results[i], ( - f"Expected {results[i]}, got {last_msg['content']}" - ) diff --git a/tests/unit/environments/test_retriever.py b/tests/unit/environments/test_retriever.py index 457dc5bc4b..a773d5dac0 100644 --- a/tests/unit/environments/test_retriever.py +++ b/tests/unit/environments/test_retriever.py @@ -56,6 +56,14 @@ "disable_log_stats": True, "disable_log_requests": True, "gpu_memory_utilization": 0.6, + "enforce_eager": "False", + }, + "colocated": { + "enabled": True, + "resources": { + "gpus_per_node": None, + "num_nodes": None, + }, }, } From 013fb299113500ba14fbaf992774f11ed52c0116 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Tue, 29 Jul 2025 21:13:17 +0000 Subject: [PATCH 7/7] terry nits Signed-off-by: Terry Kong --- nemo_rl/environments/code_environment.py | 4 ++-- nemo_rl/environments/tools/retriever.py | 2 +- tests/unit/environments/test_code_environment.py | 1 + tests/unit/environments/test_retriever.py | 1 + 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/nemo_rl/environments/code_environment.py b/nemo_rl/environments/code_environment.py index c340d1980b..029b9cd1c0 100644 --- a/nemo_rl/environments/code_environment.py +++ b/nemo_rl/environments/code_environment.py @@ -46,7 +46,7 @@ class CodeEnvMetadata(TypedDict): working_dir: str # Working directory for file operations -@ray.remote +@ray.remote # pragma: no cover class CodeExecutionWorker: """Helper class to process individual code execution steps.""" @@ -185,7 +185,7 @@ def safe_import(self, name: str, *args, **kwargs): return builtins.__import__(name, *args, **kwargs) -@ray.remote +@ray.remote # pragma: no cover class CodeEnvironment(EnvironmentInterface): """Code execution environment that maintains state between steps.""" diff --git a/nemo_rl/environments/tools/retriever.py b/nemo_rl/environments/tools/retriever.py index 4109a21a05..8f408fc92b 100644 --- a/nemo_rl/environments/tools/retriever.py +++ b/nemo_rl/environments/tools/retriever.py @@ -131,7 +131,7 @@ def __call__(self, query: str) 
-> List[str]: return results -@ray.remote +@ray.remote # pragma: no cover class RAGEnvironment(EnvironmentInterface): """RAG environment that uses BM25 for document retrieval.""" diff --git a/tests/unit/environments/test_code_environment.py b/tests/unit/environments/test_code_environment.py index dd5b8de7a6..27ada6d9bf 100644 --- a/tests/unit/environments/test_code_environment.py +++ b/tests/unit/environments/test_code_environment.py @@ -152,6 +152,7 @@ def test_untrusted_code(code_env): assert responses == results, f"Got wrong output {responses}" +@pytest.mark.hf_gated def test_vllm_execute_code(cluster, tokenizer, code_env): """Test that vLLM can call the code executor.""" # Prepare test data diff --git a/tests/unit/environments/test_retriever.py b/tests/unit/environments/test_retriever.py index a773d5dac0..824c09b041 100644 --- a/tests/unit/environments/test_retriever.py +++ b/tests/unit/environments/test_retriever.py @@ -113,6 +113,7 @@ def cluster(): cluster_instance.shutdown() +@pytest.mark.hf_gated def test_vllm_retrieve(cluster, tokenizer, rag_env): """Test that vLLM can use the RAG environment for document retrieval.""" # Prepare test data