From 4bc5bfd71ef7b6e2cd179526fecd04b898a86e71 Mon Sep 17 00:00:00 2001 From: KiddoZhu Date: Tue, 6 May 2025 12:01:34 -0700 Subject: [PATCH 1/7] code execution + tool use + basic blockers for filesystems & modules Signed-off-by: KiddoZhu --- .gitignore | 8 +- nemo_rl/algorithms/dpo.py | 12 +- nemo_rl/algorithms/grpo.py | 7 + nemo_rl/algorithms/sft.py | 10 + nemo_rl/data/llm_message_utils.py | 6 - nemo_rl/environments/math_environment.py | 23 +- nemo_rl/experience/rollouts.py | 4 +- nemo_rl/models/dtensor/parallelize.py | 27 +- nemo_rl/models/generation/vllm.py | 2 + nemo_rl/models/policy/__init__.py | 2 +- .../models/policy/dtensor_policy_worker.py | 15 +- nemo_rl/models/policy/fsdp1_policy_worker.py | 46 ++- nemo_rl/models/policy/hf_policy.py | 4 + nemo_rl/tools/__init__.py | 0 nemo_rl/tools/generation.py | 236 ++++++++++++ nemo_rl/tools/interfaces.py | 20 + nemo_rl/tools/tools.py | 199 ++++++++++ nemo_rl/utils/checkpoint.py | 7 +- nemo_rl/utils/native_checkpoint.py | 44 ++- tests/functional/dpo.sh | 2 +- tests/test_suites/README.md | 7 +- tests/test_suites/nightly.txt | 10 +- tests/test_suites/release.txt | 4 +- tests/unit/conftest.py | 31 -- tests/unit/data/test_llm_message_utils.py | 35 -- tests/unit/experience/test_rollouts.py | 40 -- .../models/generation/test_vllm_generation.py | 5 +- .../unit/models/policy/test_dtensor_worker.py | 6 - tests/unit/test_recipes_and_test_suites.py | 96 +---- tests/unit/tools/test_tools.py | 351 ++++++++++++++++++ tests/unit/utils/test_checkpoint.py | 5 +- tests/unit/utils/test_native_checkpoint.py | 104 +++++- 32 files changed, 1036 insertions(+), 332 deletions(-) create mode 100644 nemo_rl/tools/__init__.py create mode 100644 nemo_rl/tools/generation.py create mode 100644 nemo_rl/tools/interfaces.py create mode 100644 nemo_rl/tools/tools.py create mode 100644 tests/unit/tools/test_tools.py diff --git a/.gitignore b/.gitignore index 12121a4155..46efa31b70 100644 --- a/.gitignore +++ b/.gitignore @@ -15,12 +15,14 @@ apidocs/ dist/ *.egg-info/ *.vscode/ -release_run* -ckpts/ # Test coverage.json .coverage* +unit_results.json +unit_results/ +release_run* +ckpts/ test_assets/ # Cache @@ -33,4 +35,4 @@ docker/ wandb/ checkpoints/ results/ -code_snapshots/ +code_snapshots/ \ No newline at end of file diff --git a/nemo_rl/algorithms/dpo.py b/nemo_rl/algorithms/dpo.py index 0647f0cd5a..dd6607ef9d 100644 --- a/nemo_rl/algorithms/dpo.py +++ b/nemo_rl/algorithms/dpo.py @@ -446,6 +446,14 @@ def dpo_train( % master_config["checkpointing"]["save_period"] == 0 ): # +1 because step is 0-indexed + is_last_checkpoint = ( + min( + len(train_dataloader) * max_num_epochs, + master_config["dpo"]["max_num_steps"], + ) + - (total_steps + 1) + < master_config["checkpointing"]["save_period"] + ) dpo_save_state["step"] = (current_step + 1) % len(train_dataloader) dpo_save_state["total_steps"] = total_steps + 1 dpo_save_state["epoch"] = current_epoch @@ -462,9 +470,7 @@ def dpo_train( optimizer_path=os.path.join( checkpoint_path, "policy", "optimizer" ), - tokenizer_path=os.path.join( - checkpoint_path, "policy", "tokenizer" - ), + save_hf=is_last_checkpoint, ) torch.save( train_dataloader.state_dict(), diff --git a/nemo_rl/algorithms/grpo.py b/nemo_rl/algorithms/grpo.py index 5a007451d0..952a6c172a 100644 --- a/nemo_rl/algorithms/grpo.py +++ b/nemo_rl/algorithms/grpo.py @@ -524,6 +524,12 @@ def grpo_train( ): # +1 because step is 0-indexed policy.prepare_for_training() + is_last_checkpoint = ( + min(len(dataloader), master_config["grpo"]["max_num_steps"]) + - (step + 1) + < 
master_config["checkpointing"]["save_period"] + ) + grpo_save_state["step"] = step + 1 grpo_save_state["val_reward"] = val_metrics["accuracy"] grpo_save_state["consumed_samples"] = consumed_samples @@ -540,6 +546,7 @@ def grpo_train( tokenizer_path=os.path.join( checkpoint_path, "policy", "tokenizer" ), + save_hf=is_last_checkpoint, ) torch.save( dataloader.state_dict(), diff --git a/nemo_rl/algorithms/sft.py b/nemo_rl/algorithms/sft.py index d10c3df483..8b5ffcddfd 100644 --- a/nemo_rl/algorithms/sft.py +++ b/nemo_rl/algorithms/sft.py @@ -447,6 +447,15 @@ def sft_train( % master_config["checkpointing"]["save_period"] == 0 ): # +1 because step is 0-indexed + is_last_checkpoint = ( + min( + len(train_dataloader) * max_num_epochs, + master_config["sft"]["max_num_steps"], + ) + - (total_steps + 1) + < master_config["checkpointing"]["save_period"] + ) + sft_save_state["step"] = (current_step + 1) % len(train_dataloader) sft_save_state["total_steps"] = total_steps + 1 sft_save_state["epoch"] = current_epoch @@ -467,6 +476,7 @@ def sft_train( tokenizer_path=os.path.join( checkpoint_path, "policy", "tokenizer" ), + save_hf=is_last_checkpoint, ) torch.save( train_dataloader.state_dict(), diff --git a/nemo_rl/data/llm_message_utils.py b/nemo_rl/data/llm_message_utils.py index 51cd5a279d..f2d24fc421 100644 --- a/nemo_rl/data/llm_message_utils.py +++ b/nemo_rl/data/llm_message_utils.py @@ -421,12 +421,6 @@ def get_formatted_message_log( new_message["token_ids"] = tokenizer( message_chunk, return_tensors="pt", add_special_tokens=False )["input_ids"][0] - if len(new_message["token_ids"]) == 0: - # if there is an empty message, the empty `token_ids` tensor ends up being in fp32, - # which causes `_validate_tensor_consistency` to fail. To fix this, we convert the - # empty tensor to int64. 
- new_message["token_ids"] = new_message["token_ids"].to(torch.int64) - new_message["content"] = message_chunk new_message_log.append(new_message) diff --git a/nemo_rl/environments/math_environment.py b/nemo_rl/environments/math_environment.py index fd968298b0..8da0528652 100644 --- a/nemo_rl/environments/math_environment.py +++ b/nemo_rl/environments/math_environment.py @@ -15,8 +15,7 @@ import ray import torch -from math_verify.metric import math_metric -from math_verify.parser import ExprExtractionConfig, LatexExtractionConfig +from math_verify import parse, verify from nemo_rl.distributed.batched_data_dict import BatchedDataDict from nemo_rl.distributed.virtual_cluster import PY_EXECUTABLES @@ -54,23 +53,9 @@ def verify( results = [] for response, ground_truth in zip(pred_responses, ground_truths): try: - # Use Latex and plain math extraction from predictions - # https://github.com/huggingface/Math-Verify?tab=readme-ov-file#extraction-targets - verify_func = math_metric( - gold_extraction_target=(LatexExtractionConfig(),), - pred_extraction_target=( - ExprExtractionConfig(), - LatexExtractionConfig(), - ), - ) - - ground_truth_parsable = "\\boxed{" + ground_truth + "}" - try: - ret_score, _ = verify_func([ground_truth_parsable], [response]) - except Exception: - ret_score = 0.0 - - results.append(float(ret_score)) + gold = parse(ground_truth) + pred = parse(response[-100:]) # avoid looking at the whole string + results.append(float(verify(gold, pred))) except Exception: results.append(0) return results diff --git a/nemo_rl/experience/rollouts.py b/nemo_rl/experience/rollouts.py index 567add0dfc..a556a32a42 100644 --- a/nemo_rl/experience/rollouts.py +++ b/nemo_rl/experience/rollouts.py @@ -311,9 +311,7 @@ def run_multi_turn_rollout( >= max_seq_len ): # truncate - tokenized_obs = tokenized_obs[ - : max_seq_len - (len(generated_ids[i]) + active_input_lengths[i]) - ] + tokenized_obs = tokenized_obs[: max_seq_len - active_input_lengths[i]] truncation_mask[i] = True # Record truncation sample_truncated[active_indices[i]] = True diff --git a/nemo_rl/models/dtensor/parallelize.py b/nemo_rl/models/dtensor/parallelize.py index 5998937cc9..3ae86d70cc 100644 --- a/nemo_rl/models/dtensor/parallelize.py +++ b/nemo_rl/models/dtensor/parallelize.py @@ -30,7 +30,6 @@ from torch.distributed.tensor.placement_types import Replicate, Shard from transformers.models.llama.modeling_llama import LlamaForCausalLM from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM -from transformers.models.qwen3.modeling_qwen3 import Qwen3ForCausalLM from nemo_rl.distributed.model_utils import from_parallel_logits_to_logprobs @@ -99,7 +98,7 @@ def _parallelize_llama( def _parallelize_qwen( - model: Union[Qwen2ForCausalLM, Qwen3ForCausalLM], + model: Qwen2ForCausalLM, dp_mesh: DeviceMesh, tp_mesh: DeviceMesh, mp_policy: MixedPrecisionPolicy, @@ -109,7 +108,7 @@ def _parallelize_qwen( ): """Parallelizes a Qwen2ForCausalLM model across data and tensor parallel dimensions.""" - class QwenRotaryEmbedParallel(SequenceParallel): + class Qwen2RotaryEmbedParallel(SequenceParallel): """Custom SequenceParallel class for Qwen2 rotary embeddings because the input is a tuple.""" @staticmethod @@ -142,23 +141,6 @@ def _prepare_input_fn(sequence_sharding, mod, inputs, device_mesh): return type(inputs)(new_inputs) - class Qwen3QKNorm(SequenceParallel): - @staticmethod - def _prepare_input_fn(sequence_sharding, mod, inputs, device_mesh): - input_tensor = inputs[0] - - if isinstance(input_tensor, DTensor): - assert 
input_tensor.placements == (Shard(dim=2),) - elif isinstance(input_tensor, torch.Tensor): - # assume the input passed in already sharded on the sequence dim and create the DTensor - return DTensor.from_local( - input_tensor, device_mesh, sequence_sharding, run_check=False - ) - else: - raise ValueError( - f"expecting input of {mod} to be a torch.Tensor or DTensor, but got {input_tensor}" - ) - if tp_mesh.size() > 1: assert not model.config.tie_word_embeddings, ( "Tie word embeddings not supported when TP is enabled" @@ -174,7 +156,7 @@ def _prepare_input_fn(sequence_sharding, mod, inputs, device_mesh): input_layouts=Replicate(), output_layouts=Shard(1), ), - "model.rotary_emb": QwenRotaryEmbedParallel(), + "model.rotary_emb": Qwen2RotaryEmbedParallel(), "model.norm": SequenceParallel(), "model.layers.*.input_layernorm": SequenceParallel(), "model.layers.*.self_attn.q_proj": ColwiseParallel( @@ -189,8 +171,6 @@ def _prepare_input_fn(sequence_sharding, mod, inputs, device_mesh): "model.layers.*.self_attn.o_proj": RowwiseParallel( output_layouts=Shard(1) ), - "model.layers.*.self_attn.q_norm": Qwen3QKNorm(), - "model.layers.*.self_attn.k_norm": Qwen3QKNorm(), "model.layers.*.post_attention_layernorm": SequenceParallel(), "model.layers.*.mlp.up_proj": ColwiseParallel(), "model.layers.*.mlp.gate_proj": ColwiseParallel(), @@ -234,7 +214,6 @@ def _prepare_input_fn(sequence_sharding, mod, inputs, device_mesh): PARALLIZE_FUNCTIONS = { Qwen2ForCausalLM: _parallelize_qwen, - Qwen3ForCausalLM: _parallelize_qwen, LlamaForCausalLM: _parallelize_llama, } diff --git a/nemo_rl/models/generation/vllm.py b/nemo_rl/models/generation/vllm.py index 59fcc26320..4128f6a9cc 100644 --- a/nemo_rl/models/generation/vllm.py +++ b/nemo_rl/models/generation/vllm.py @@ -273,6 +273,7 @@ def generate( # Read generation parameters from config top_k = self.cfg["top_k"] if self.cfg["top_k"] is not None else -1 + sampling_params = self.SamplingParams( temperature=self.cfg["temperature"] if not greedy else 0, top_p=self.cfg["top_p"], @@ -390,6 +391,7 @@ def generate_text( # Read generation parameters from config top_k = self.cfg["top_k"] if self.cfg["top_k"] is not None else -1 + sampling_params = self.SamplingParams( temperature=self.cfg["temperature"] if not greedy else 0, top_p=self.cfg["top_p"], diff --git a/nemo_rl/models/policy/__init__.py b/nemo_rl/models/policy/__init__.py index fbe728a840..47714fb0f5 100644 --- a/nemo_rl/models/policy/__init__.py +++ b/nemo_rl/models/policy/__init__.py @@ -37,7 +37,7 @@ class PolicyConfig(TypedDict): train_micro_batch_size: int learning_rate: float logprob_batch_size: int - generation: Optional[GenerationConfig] + generation: GenerationConfig precision: str dtensor_cfg: DTensorConfig make_sequence_length_divisible_by: int diff --git a/nemo_rl/models/policy/dtensor_policy_worker.py b/nemo_rl/models/policy/dtensor_policy_worker.py index c99110d7e7..29ecd46452 100644 --- a/nemo_rl/models/policy/dtensor_policy_worker.py +++ b/nemo_rl/models/policy/dtensor_policy_worker.py @@ -335,10 +335,6 @@ def train( else: logits = outputs.logits - # Divide logits by temperature - if "generation" in self.cfg and self.cfg["generation"] is not None: - logits.div_(self.cfg["generation"]["temperature"]) - loss, loss_metrics = loss_fn(logits, mb) num_valid_samples = loss_metrics["num_valid_samples"] loss_metrics["lr"] = self.optimizer.param_groups[0]["lr"] @@ -375,12 +371,10 @@ def train( # Update parameters self.optimizer.step() + self.scheduler.step() losses.append(torch.tensor(mb_losses).sum().item()) - # 
increment scheduler after all batches in rollout are processed - self.scheduler.step() - # Compute global loss across all ranks with torch.no_grad(): local_loss = torch.tensor(losses, device="cuda") @@ -720,10 +714,13 @@ def save_checkpoint( weights_path: str, optimizer_path: Optional[str] = None, tokenizer_path: Optional[str] = None, + save_torch_dist: bool = True, + save_hf: bool = False, ): """Save a checkpoint of the model. - the optimizer states are saved only if `optimizer` and `optimizer_path` are provided. + the HuggingFace checkpoint is saved only if `save_hf` is True, + and the optimizer states are saved only if `optimizer` and `optimizer_path` are provided. """ save_checkpoint( model=self.model, @@ -733,6 +730,8 @@ def save_checkpoint( optimizer_path=optimizer_path, tokenizer=self.tokenizer if tokenizer_path else None, tokenizer_path=tokenizer_path, + save_torch_dist=save_torch_dist, + save_hf=save_hf, ) def load_checkpoint(self, weights_path: str, optimizer_path: Optional[str] = None): diff --git a/nemo_rl/models/policy/fsdp1_policy_worker.py b/nemo_rl/models/policy/fsdp1_policy_worker.py index 53ec5944f9..bd3951f3a2 100644 --- a/nemo_rl/models/policy/fsdp1_policy_worker.py +++ b/nemo_rl/models/policy/fsdp1_policy_worker.py @@ -290,10 +290,6 @@ def train( else: logits = outputs.logits - # Divide logits by temperature - if "generation" in self.cfg and self.cfg["generation"] is not None: - logits.div_(self.cfg["generation"]["temperature"]) - loss, loss_metrics = loss_fn(logits, mb) num_valid_samples = loss_metrics["num_valid_samples"] loss_metrics["lr"] = self.optimizer.param_groups[0]["lr"] @@ -329,11 +325,9 @@ def train( # Update parameters self.optimizer.step() + self.scheduler.step() losses.append(torch.tensor(mb_losses).sum().item()) - # increment scheduler after all batches in rollout are processed - self.scheduler.step() - # Compute global loss across all ranks with torch.no_grad(): local_loss = torch.tensor(losses, device="cuda") @@ -634,26 +628,17 @@ def generate( device=return_data["left_padded_output_ids"][0].device, ) - for idx, seq in enumerate(return_data["left_padded_output_ids"]): + for idx, (seq, generated_logprob) in enumerate( + zip( + return_data["left_padded_output_ids"], + return_data["generation_logprobs"], + ) + ): # Get only the generated part (excluding input) original_length = return_data["orig_input_lengths"][idx].item() seq_len = seq.size(0) - # The generated content starts after the left-padded input - generated_part = seq[-(seq_len - input_length) :] - - eos_positions = (generated_part == self.tokenizer.eos_token_id).nonzero( - as_tuple=True - )[0] - # TODO @sahilj: handle different stopping criteria - # Calculate generation length - if len(eos_positions) > 0: - gen_length = ( - eos_positions[0].item() + 1 - ) # +1 to include the EOS token - else: - gen_length = len(generated_part) - + gen_length = (generated_logprob != 0).sum().item() generation_lengths.append(gen_length) valid_length = original_length + gen_length @@ -668,7 +653,7 @@ def generate( ) # Combine with generated part - valid_generated_part = generated_part[:gen_length] + valid_generated_part = seq[input_length : input_length + gen_length] valid_tokens = torch.cat([valid_input_part, valid_generated_part]) # Place at the beginning of the right-padded sequence @@ -916,6 +901,8 @@ def save_checkpoint( weights_path: str, optimizer_path: Optional[str] = None, tokenizer_path: Optional[str] = None, + save_torch_dist: bool = True, + save_hf: bool = False, ): """Save a checkpoint of the model. 
@@ -925,12 +912,19 @@ def save_checkpoint( __0_1.distcp __1_0.distcp ... + weights_path-hf/ + config.json + generation_config.json + model-00001-of-.safetensors + ... + model.safetensors.index.json optimizer_path/ __0_0.distcp __1_0.distcp ... - the optimizer states are saved only if `optimizer` and `optimizer_path` are provided. + the HuggingFace checkpoint is saved only if `save_hf` is True, + and the optimizer states are saved only if `optimizer` and `optimizer_path` are provided. """ save_checkpoint( model=self.model, @@ -940,6 +934,8 @@ def save_checkpoint( optimizer_path=optimizer_path, tokenizer=self.tokenizer if tokenizer_path else None, tokenizer_path=tokenizer_path, + save_torch_dist=save_torch_dist, + save_hf=save_hf, ) def load_checkpoint(self, weights_path: str, optimizer_path: Optional[str] = None): diff --git a/nemo_rl/models/policy/hf_policy.py b/nemo_rl/models/policy/hf_policy.py index 2d2dbf3d4c..2a579e3bcd 100644 --- a/nemo_rl/models/policy/hf_policy.py +++ b/nemo_rl/models/policy/hf_policy.py @@ -307,6 +307,8 @@ def save_checkpoint( weights_path: str, optimizer_path: Optional[str] = None, tokenizer_path: Optional[str] = None, + save_torch_dist: bool = True, + save_hf: bool = False, ): """Save a checkpoint of the model.""" futures = self.worker_group.run_all_workers_single_data( @@ -314,6 +316,8 @@ def save_checkpoint( weights_path, optimizer_path, tokenizer_path, + save_torch_dist, + save_hf, only_on="all_tied_workers", ) ray.get(futures) diff --git a/nemo_rl/tools/__init__.py b/nemo_rl/tools/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/nemo_rl/tools/generation.py b/nemo_rl/tools/generation.py new file mode 100644 index 0000000000..f50dbfe3ae --- /dev/null +++ b/nemo_rl/tools/generation.py @@ -0,0 +1,236 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import re +import warnings +from pprint import pformat +from typing import Dict + +import ray +import torch +from torch.nn.utils.rnn import pad_sequence +from transformers import AutoTokenizer + +from nemo_rl.distributed.batched_data_dict import BatchedDataDict +from nemo_rl.models.generation.interfaces import ( + GenerationDatumSpec, + GenerationInterface, + GenerationOutputSpec, +) +from nemo_rl.tools.interfaces import ToolInterface +from nemo_rl.tools.tools import StatefulCodeExecutor + +LOGIT_INFINITY = 1000 + + +def generate_with_code_and_tools( + policy: GenerationInterface, + input_batch: BatchedDataDict[GenerationDatumSpec], + tokenizer: AutoTokenizer, + execute_code: bool = True, + tool_map: Dict[str, ToolInterface] = {}, + tag: str = "", + result_tag: str = "", + *args, + **kwargs, +) -> BatchedDataDict[GenerationOutputSpec]: + """Generate a batch of data with code execution and tool use. + + All code execution and tool calls in the generation will be executed on-the-fly, + of which the results will be appended to the output. Multiple code execution and tool calls + is supported. 
+
+    This function can be used as a drop-in replacement for `policy.generate()`.
+
+    Args:
+        policy: policy to generate from. Can be either the vLLM or HuggingFace backend
+        input_batch: BatchedDataDict containing input_ids and input_lengths tensors
+        tokenizer: tokenizer from the pretrained model
+        execute_code: whether to execute code
+        tool_map: tools that the model can use
+        tag: XML tag used to detect code snippets
+        result_tag: XML tag used to wrap execution results
+        *args, **kwargs: arguments and keyword arguments accepted by `policy.generate()`
+    """
+    if tool_map and not execute_code:
+        warnings.warn(
+            "Tool use requires code execution, but code execution is disabled. All the tools will be ignored."
+        )
+
+    batch = input_batch.copy()
+    start_tag = tag
+    end_tag = tag.replace("<", "</")
+    result_start = result_tag
+    result_end = result_tag.replace("<", "</")
+
+    # completed generations are stored by their original sample index
+    completed_output_ids = [None] * len(batch["input_ids"])
+    completed_logprobs = [None] * len(batch["input_ids"])
+    # pre-allocate one stateful executor per sample in the batch
+    executors = [
+        StatefulCodeExecutor.remote(tool_map) for _ in range(len(batch["input_ids"]))
+    ]
+    active_batch = batch
+    active_indices = torch.arange(len(batch["input_ids"]))
+    old_logprobs = None
+
+    while len(active_indices) > 0:
+        generation_outputs = policy.generate(active_batch, *args, **kwargs)
+
+        output_ids = generation_outputs["output_ids"]
+        # only contains logprobs for newly generated tokens
+        logprobs = generation_outputs["logprobs"]
+        input_lengths = active_batch["input_lengths"]
+        total_lengths = generation_outputs["unpadded_sequence_lengths"]
+        if old_logprobs is not None:
+            # restore logprobs for tokens generated in previous iterations
+            for i, input_length in enumerate(input_lengths):
+                logprobs[i, :input_length] = old_logprobs[i, :input_length]
+
+        # extract newly generated tokens
+        generated_ids = []
+        for output_id, input_length, total_length in zip(
+            output_ids, input_lengths, total_lengths
+        ):
+            generated_ids.append(output_id[input_length:total_length])
+
+        generated_texts = tokenizer.batch_decode(
+            generated_ids, skip_special_tokens=True
+        )
+
+        is_code = []
+        exprs = []
+        lookaheads = []
+        # parse newly generated texts
+        for i, (generated_text, active_index, total_length) in enumerate(
+            zip(generated_texts, active_indices, total_lengths)
+        ):
+            match = re.search(
+                rf"{start_tag}(.*){end_tag}(.*)", generated_text, re.DOTALL
+            )
+            if match:
+                # stop is caused by code execution
+                # expr takes everything between the start and end tags, including new lines
+                # lookahead takes everything after the end tag
+                is_code.append(i)
+                expr, lookahead = match.groups()
+                exprs.append(expr)
+                lookaheads.append(lookahead)
+            else:
+                # stop is not caused by code execution
+                # e.g. eos token, max length or other stop strings
+                completed_output_ids[active_index] = output_ids[i, :total_length]
+                completed_logprobs[active_index] = logprobs[i, :total_length]
+        if len(is_code) == 0:
+            break
+
+        # execute all code in this batch
+        futures = []
+        for i, expr, lookahead in zip(is_code, exprs, lookaheads):
+            active_index = active_indices[i]
+            # dispatch code to a pre-allocated executor for that sample
+            # so that functions and variables will be carried over
+            future = executors[active_index].__call__.remote(expr)
+            futures.append(future)
+        results = ray.get(futures)
+
+        new_results = []
+        # format each result with the expr and lookahead captured for the same sample
+        for result, expr, lookahead in zip(results, exprs, lookaheads):
+            if result is None:
+                # no return value
+                result = ""
+                new_results.append(result)
+                continue
+            result = pformat(result)
+            if "\n" in expr or "\n" in result:
+                # multi-line format
+                result = f"\n\n{result_start}\n{result}\n{result_end}"
+            else:
+                # inline format
+                result = f"{result_start}{result}{result_end}"
+            if lookahead:
+                if result.startswith(lookahead):
+                    # The generation may include a trailing "\n" after the end tag if ">\n" is a single token.
+                    # We trim \n from the result if the model has already generated it.
+                    result = result[len(lookahead) :]
+                else:
+                    warnings.warn(
+                        f"Expect the generation to stop at {repr(end_tag)}, but got {repr(end_tag + lookahead)}. 
" + "This is because some characters are merged into a single token by the tokenizer. " + "These extra characters will be kept in the generation." + ) + new_results.append(result) + + encodings = tokenizer( + new_results, + add_special_tokens=False, + padding=True, + padding_side="right", + return_tensors="pt", + ) + result_ids = encodings["input_ids"] + result_lengths = encodings["attention_mask"].sum(dim=1).to(torch.int32) + + is_code = torch.tensor(is_code) + # reduce active batch to those containing code + active_batch = active_batch.select_indices(is_code) + active_indices = active_indices[is_code] + output_ids = output_ids[is_code] + logprobs = logprobs[is_code] + total_lengths = total_lengths[is_code] + # max length before appending results + old_max_length = total_lengths.max() + # max length after appending results + new_max_length = (total_lengths + result_lengths).max() + new_output_ids = torch.full( + (len(active_indices), new_max_length), + tokenizer.pad_token_id, + dtype=output_ids.dtype, + ) + new_logprobs = torch.full( + (len(active_indices), new_max_length), 0, dtype=logprobs.dtype + ) + new_output_ids[:, :old_max_length] = output_ids[:, :old_max_length] + new_logprobs[:, :old_max_length] = logprobs[:, :old_max_length] + + # append results to generation + for i, (old_length, result_length) in enumerate( + zip(total_lengths, result_lengths) + ): + new_length = old_length + result_length + new_output_ids[i, old_length:new_length] = result_ids[i, :result_length] + new_logprobs[i, old_length:new_length] = LOGIT_INFINITY + + active_batch["input_ids"] = new_output_ids + active_batch["input_lengths"] = total_lengths + result_lengths + old_logprobs = new_logprobs + + output_ids = pad_sequence( + completed_output_ids, + batch_first=True, + padding_value=tokenizer.pad_token_id, + padding_side="right", + ) + logprobs = pad_sequence( + completed_logprobs, batch_first=True, padding_value=0.0, padding_side="right" + ) + total_lengths = torch.tensor([len(output_id) for output_id in completed_output_ids]) + generation_lengths = total_lengths - input_batch["input_lengths"] + + return { + "output_ids": output_ids, + "logprobs": logprobs, + "generation_lengths": generation_lengths, + "unpadded_sequence_lengths": total_lengths, + } diff --git a/nemo_rl/tools/interfaces.py b/nemo_rl/tools/interfaces.py new file mode 100644 index 0000000000..a37a3b6f10 --- /dev/null +++ b/nemo_rl/tools/interfaces.py @@ -0,0 +1,20 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from abc import ABC, abstractmethod + + +class ToolInterface(ABC): + @abstractmethod + def __call__(self, *args, **kwargs): + pass diff --git a/nemo_rl/tools/tools.py b/nemo_rl/tools/tools.py new file mode 100644 index 0000000000..1af1977926 --- /dev/null +++ b/nemo_rl/tools/tools.py @@ -0,0 +1,199 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import ast +import builtins +import math +import os +import tempfile +from collections import Counter +from contextlib import contextmanager +from typing import Any, Dict, List, Optional + +import ray +import torch +from datasets import load_dataset +from tqdm import tqdm +from transformers import AutoTokenizer + +from nemo_rl.tools.interfaces import ToolInterface + + +@ray.remote +class StatefulCodeExecutor(ToolInterface): + """Stateful code executor. + + Args: + context: classes, functions and variables accessible to the code executor. + By passing tools in context, the code executor also serves tool use. + """ + + def __init__(self, context: Dict[str, Any] = {}): + self.context = context.copy() + self.tmp_dir = tempfile.TemporaryDirectory() + + builtin_dict = {k: getattr(builtins, k) for k in dir(builtins)} + builtin_dict["open"] = self.safe_open + builtin_dict["__import__"] = self.safe_import + self.sandbox = {"__builtins__": builtin_dict} + + def __call__(self, code: str) -> Optional[str]: + tree = ast.parse(code) + + if tree.body and isinstance(tree.body[-1], ast.Expr): + # interactive mode + code = ast.unparse(tree.body[:-1]) + expr = ast.unparse(tree.body[-1]) + else: + # silent mode + expr = None + + try: + # isolate the code in a sandbox with globals={} + # capture local variables in self.context + with self.change_temporary_dir(): + exec(code, self.sandbox, self.context) + if expr: + return eval(expr, self.sandbox, self.context) + except Exception as err: + return err + + @contextmanager + def change_temporary_dir(self): + current_dir = os.getcwd() + os.chdir(self.tmp_dir.name) + try: + yield + finally: + os.chdir(current_dir) + + def safe_open(self, file, *args, **kwargs): + real_file = os.path.realpath(file) + tmp_dir = os.path.realpath(self.tmp_dir.name) + if os.path.commonpath([real_file, tmp_dir]) != tmp_dir: + # real_file is not inside tmp_dir + raise PermissionError( + "Access beyond the temporary working directory is blocked" + ) + return open(file, *args, **kwargs) + + def safe_import(self, name, *args, **kwargs): + risky_modules = { + "os", + "shutil", # erase filesystem + "sys", + "signal", # exit the current program + "socket", # network communication + "subprocess", + "threading", + "multiprocessing", # spawn threads or processes + "builtins", + "importlib", # bypass current blockers + } + if name in risky_modules: + raise PermissionError("Importing system and network modules is blocked") + return builtins.__import__(name, *args, **kwargs) + + +class BM25Retriever(ToolInterface): + """Sparse BM25 retriever. + + Args: + documents: list of documents to retrieve from + num_result: retrieve top-k documents + k1: parameter of BM25. Values in [1.2, 2.0] are recommended. + b: parameter of BM25. 0.75 is recommended. 
+ device: device to compute BM25 + """ + + def __init__( + self, + documents: List[str] = None, + num_result: int = 10, + k1: float = 1.5, + b: float = 0.75, + device: str = "cpu", + ): + if documents is None: + dataset = load_dataset("wikimedia/wikipedia", "20231101.en") + self.documents = [sample["text"] for sample in dataset["train"]] + else: + self.documents = documents + self.tokenizer = AutoTokenizer.from_pretrained( + "bert-base-uncased", use_fast=True + ) + self.num_result = num_result + self.k1 = k1 + self.b = b + self.device = device + self.corpus_size = len(self.documents) + self.vocab_size = self.tokenizer.vocab_size + + self.build_index() + + def build_index(self): + doc_ids = [] + token_ids = [] + tfs = [] + lengths = [] + + for i, document in enumerate( + tqdm(self.documents, "Build index for BM25Retriever") + ): + input_ids = self.tokenizer.encode(document, add_special_tokens=False) + token2cnt = Counter(input_ids) + token_ids += token2cnt.keys() + tfs += token2cnt.values() + doc_ids += [i] * len(token2cnt) + lengths.append(len(input_ids)) + + avg_dl = sum(lengths) / self.corpus_size + for i, doc_id in enumerate(doc_ids): + tfs[i] = ( + tfs[i] + * (self.k1 + 1) + / (tfs[i] + self.k1 * (1 - self.b + self.b * lengths[doc_id] / avg_dl)) + ) + + indices = torch.tensor([doc_ids, token_ids], device=self.device) + values = torch.tensor(tfs, device=self.device) + self.doc_tfs = torch.sparse_coo_tensor( + indices, values, (self.corpus_size, self.vocab_size) + ) + + idfs = [0] * self.vocab_size + token2df = Counter(token_ids) + for token_id, df in token2df.items(): + idfs[token_id] = math.log((self.corpus_size - df + 0.5) / (df + 0.5) + 1) + self.idfs = idfs + + def __call__(self, query: str) -> List[str]: + input_ids = self.tokenizer.encode(query, add_special_tokens=False) + token2cnt = Counter(input_ids) + token_ids = [] + query_idfs = [] + for token_id, query_tf in token2cnt.items(): + token_ids.append(token_id) + query_idfs.append(query_tf * self.idfs[token_id]) + + indices = torch.tensor([token_ids, [0] * len(token_ids)], device=self.device) + values = torch.tensor(query_idfs, device=self.device) + query_idfs = torch.sparse_coo_tensor(indices, values, (self.vocab_size, 1)) + + scores = torch.sparse.mm(self.doc_tfs, query_idfs) + scores = scores.to_dense().squeeze(-1) + results = [] + for i in scores.topk(k=self.num_result).indices.tolist(): + results.append(self.documents[i]) + + return results diff --git a/nemo_rl/utils/checkpoint.py b/nemo_rl/utils/checkpoint.py index 5f23a0bd68..bc916d3d7e 100644 --- a/nemo_rl/utils/checkpoint.py +++ b/nemo_rl/utils/checkpoint.py @@ -26,7 +26,6 @@ import numpy as np import torch -import yaml class CheckpointingConfig(TypedDict): @@ -57,7 +56,7 @@ class CheckpointManager: checkpoint_dir/ step_0/ training_info.json - config.yaml + config.json policy.py (up to the algorithm loop to save here) policy_optimizer.py (up to the algorithm loop to save here) ... 
@@ -115,8 +114,8 @@ def init_tmp_checkpoint( # save config if run_config is not None: - with open(save_dir / "config.yaml", "w") as f: - yaml.safe_dump(run_config, f) + with open(save_dir / "config.json", "w") as f: + json.dump(run_config, f) return Path(os.path.abspath(save_dir)) diff --git a/nemo_rl/utils/native_checkpoint.py b/nemo_rl/utils/native_checkpoint.py index fc8f9ba44d..3573d2d86d 100644 --- a/nemo_rl/utils/native_checkpoint.py +++ b/nemo_rl/utils/native_checkpoint.py @@ -15,6 +15,7 @@ """Checkpoint management utilities for HF models.""" import os +from pathlib import Path from typing import Any, Optional import torch @@ -138,6 +139,8 @@ def save_checkpoint( optimizer_path: Optional[str] = None, tokenizer: Optional[Any] = None, tokenizer_path: Optional[str] = None, + save_torch_dist: bool = True, + save_hf: bool = False, ) -> None: """Save a checkpoint of the model and optionally optimizer state. @@ -147,17 +150,40 @@ def save_checkpoint( optimizer: Optional optimizer to save scheduler: Optional scheduler to save optimizer_path: Path to save optimizer state (required if optimizer provided) + save_torch_dist: Whether to save in PyTorch distributed format + save_hf: Whether to save in HuggingFace format """ - model_state = {"model": ModelState(model)} - dcp.save(model_state, checkpoint_id=weights_path) - - if optimizer is not None: - if optimizer_path is None: - raise ValueError( - "optimizer_path must be provided when saving optimizer state" + if save_hf: + if hasattr(model, "_fsdp_wrapped_module"): + model_state_dict = model._fsdp_wrapped_module.state_dict() + else: + model_state_dict = { + k: v.full_tensor() + if isinstance(v, torch.distributed.tensor.DTensor) + else v + for k, v in model.state_dict().items() + } + + if torch.distributed.get_rank() == 0: + # Create a new path by appending "-hf" to the weights path + hf_weights_path = f"{Path(weights_path)}-hf" + + model.save_pretrained( + hf_weights_path, + state_dict=model_state_dict, ) - optimizer_state = {"optim": OptimizerState(model, optimizer, scheduler)} - dcp.save(optimizer_state, checkpoint_id=optimizer_path) + + if save_torch_dist: + model_state = {"model": ModelState(model)} + dcp.save(model_state, checkpoint_id=weights_path) + + if optimizer is not None: + if optimizer_path is None: + raise ValueError( + "optimizer_path must be provided when saving optimizer state" + ) + optimizer_state = {"optim": OptimizerState(model, optimizer, scheduler)} + dcp.save(optimizer_state, checkpoint_id=optimizer_path) if tokenizer is not None: if tokenizer_path is None: diff --git a/tests/functional/dpo.sh b/tests/functional/dpo.sh index fb976d6701..200a08cdd7 100755 --- a/tests/functional/dpo.sh +++ b/tests/functional/dpo.sh @@ -35,5 +35,5 @@ uv run $PROJECT_ROOT/examples/run_dpo.py \ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS uv run tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["2"] < 0.715' + 'data["train/loss"]["2"] < 0.694' \ diff --git a/tests/test_suites/README.md b/tests/test_suites/README.md index 0759f06f25..3ccf0d75c9 100644 --- a/tests/test_suites/README.md +++ b/tests/test_suites/README.md @@ -4,18 +4,13 @@ Each test is named: ``` ---#n#g--.sh +--#n#g--.sh ``` Examples: * sft-llama3.2-1b-1n8g-fsdp2tp1.sh * grpo-qwen2-1.5B-instruct-4n8g-fsdp2tp2.sh * grpo-qwen2-1.5B-instruct-4n8g-fsdp2tp2-long.sh -* grpo-qwen2-1.5B-instruct-4n8g-fsdp2tp2-long.v2.sh - * The final verison suffix (starts with `.v2`, `.v3`, ...), is reserved for cases contributors believe the recipe's - convergence 
has changed due to their commit. Versioning signals that this recipe should not be compared to its - predecessor due to a change in convergence behavior. Examples of this change include: changing dataset, changing loss, - convergence bug fix. Changes affecting performance do not need a version change. ## Running manually diff --git a/tests/test_suites/nightly.txt b/tests/test_suites/nightly.txt index b80a7ad545..4c609d5bff 100644 --- a/tests/test_suites/nightly.txt +++ b/tests/test_suites/nightly.txt @@ -3,15 +3,15 @@ ######## # Short 1N/1B runs (go past 200 steps - usually divergence happens by now) -- going to 4 nodes doesn't help that much -tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v2.sh -tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.sh +tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh +tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh # FSDP1 vs Dtensor (Qwen/Qwen2.5-7B-Instruct) -tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.v2.sh -tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v2.sh +tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh +tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh # Functional 32b run -tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.v2.sh +tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh ####### # SFT # diff --git a/tests/test_suites/release.txt b/tests/test_suites/release.txt index 42e9c49d00..69735cb0cb 100644 --- a/tests/test_suites/release.txt +++ b/tests/test_suites/release.txt @@ -3,10 +3,10 @@ ######## # Long 8b run -tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v2.sh +tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh # Long 32b run -tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.v2.sh +tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh ####### # SFT # diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 7c6a9e21bf..2a3ec3a7c9 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -48,9 +48,6 @@ class TEST_ASSETS: TINY_QWEN2_MODEL_PATH = os.path.join( _TEST_ASSETS_DIR, "tiny_qwen2_with_qwen2_tokenizer" ) - TINY_QWEN3_MODEL_PATH = os.path.join( - _TEST_ASSETS_DIR, "tiny_qwen3_with_qwen3_tokenizer" - ) class UnitTestData(TypedDict): @@ -465,31 +462,3 @@ def tiny_qwen2_model_path(): tokenizer.save_pretrained(model_path) del model, tokenizer yield model_path - - -@pytest.fixture(scope="session", autouse=True) -def tiny_qwen3_model_path(): - """Fixture that returns a path to a tiny llama model with a dummy tokenizer.""" - import shutil - - from transformers import AutoTokenizer, Qwen3Config, Qwen3ForCausalLM - - model_path = TEST_ASSETS.TINY_QWEN3_MODEL_PATH - # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention) - # vocab_size=151936 (so we can re-use qwen2 1.5b tokenizer) - config = Qwen3Config( - num_hidden_layers=2, - hidden_size=64, - intermediate_size=32, - num_attention_heads=2, - vocab_size=151936, - tie_word_embeddings=False, - num_key_value_heads=None, - ) - model = Qwen3ForCausalLM(config=config) - tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B") - shutil.rmtree(model_path, ignore_errors=True) - model.save_pretrained(model_path) - tokenizer.save_pretrained(model_path) - del model, tokenizer - yield model_path diff --git a/tests/unit/data/test_llm_message_utils.py b/tests/unit/data/test_llm_message_utils.py index 
fc4c6c6b8d..0a5cb3ef4b 100644 --- a/tests/unit/data/test_llm_message_utils.py +++ b/tests/unit/data/test_llm_message_utils.py @@ -18,10 +18,8 @@ import torch from transformers import AutoTokenizer -from nemo_rl.data.hf_datasets import COMMON_CHAT_TEMPLATES from nemo_rl.data.interfaces import LLMMessageLogType, TaskDataSpec from nemo_rl.data.llm_message_utils import ( - _validate_tensor_consistency, add_loss_mask_to_message_log, batched_message_log_to_flat_message, get_first_index_that_differs, @@ -408,39 +406,6 @@ def test_get_formatted_message_log_qwen( assert actual_text == expected_text -def test_formatted_message_log_empty_message(): - message_logs = [ - [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": ""}, - ], - [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": "Hello!"}, - ], - ] - tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct") - tokenizer.chat_template = COMMON_CHAT_TEMPLATES.passthrough_prompt_response - task_data_spec = TaskDataSpec(task_name="test") - result = [ - get_formatted_message_log( - message_log, - tokenizer, - task_data_spec, - add_bos_token=False, - add_eos_token=False, - ) - for message_log in message_logs - ] - flat_result = [message_log_to_flat_messages(m) for m in result] - for k in flat_result[0].keys(): - if isinstance(flat_result[0][k], torch.Tensor): - # make sure validate_tensor_consistency does not raise an error when one of the messages is empty - _validate_tensor_consistency( - [flat_result[i][k] for i in range(len(flat_result))] - ) - - def test_add_loss_mask_to_chat_message_log( tokenized_chat_message_log: LLMMessageLogType, ): diff --git a/tests/unit/experience/test_rollouts.py b/tests/unit/experience/test_rollouts.py index bcfa1b84d2..b45811d4f8 100644 --- a/tests/unit/experience/test_rollouts.py +++ b/tests/unit/experience/test_rollouts.py @@ -20,7 +20,6 @@ import torch from transformers import AutoTokenizer -from nemo_rl.data.llm_message_utils import batched_message_log_to_flat_message from nemo_rl.distributed.batched_data_dict import BatchedDataDict from nemo_rl.distributed.virtual_cluster import RayVirtualCluster from nemo_rl.environments.games.sliding_puzzle import ( @@ -441,45 +440,6 @@ def test_run_multi_step_calculator_vllm(multi_step_setup_vllm): print("\nMulti-Step Calculator VLLM Test assertions passed.") -@pytest.mark.skipif( - not torch.cuda.is_available() or torch.cuda.device_count() < 1, - reason="VLLM test requires at least 1 GPU", -) -def test_max_seqlen_respected(multi_step_setup_vllm): - """Tests multi-step calculator rollout with VllmGeneration.""" - vllm_generation, rollout_tokenizer, task_to_env, initial_batch, rollout_cluster = ( - multi_step_setup_vllm - ) - max_rollout_turns = initial_batch["extra_env_info"][0]["max_steps"] + 1 - max_seq_len = 290 - - print("\nRunning multi-step calculator rollout (VLLM)...") - vllm_generation.prepare_for_generation() - final_batch, rollout_metrics = run_multi_turn_rollout( - policy_generation=vllm_generation, - input_batch=initial_batch, - tokenizer=rollout_tokenizer, - task_to_env=task_to_env, - max_seq_len=max_seq_len, - max_rollout_turns=max_rollout_turns, - ) - vllm_generation.finish_generation() - print("Multi-step calculator rollout complete (VLLM).") - - # --- Assertions --- - assert isinstance(final_batch, BatchedDataDict) - assert "message_log" in final_batch - assert "total_reward" in final_batch - assert len(final_batch["message_log"]) == 
len(initial_batch["message_log"]) - flattened_message_log, _ = batched_message_log_to_flat_message( - final_batch["message_log"] - ) - # Check that the sequence length is respected by flattening the message log and checking the length - assert len(flattened_message_log["token_ids"][0]) == max_seq_len, ( - f"Sequence length {len(flattened_message_log['token_ids'][0])} is not equal to max_seq_len {max_seq_len}" - ) - - # --- Fixture for Sliding Puzzle Environment --- @pytest.fixture(scope="function") def sliding_puzzle_environment(rollout_cluster): diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py index 08f34defe2..552ea3dae2 100644 --- a/tests/unit/models/generation/test_vllm_generation.py +++ b/tests/unit/models/generation/test_vllm_generation.py @@ -36,7 +36,7 @@ }, "dtype": "bfloat16", "max_new_tokens": 10, - "temperature": 0.8, + "temperature": 1.0, "top_p": 1.0, "top_k": None, "stop_token_ids": None, @@ -85,9 +85,6 @@ def get_basic_hf_test_config(enable_dtensor: bool = False) -> PolicyConfig: }, "max_grad_norm": 1.0, "make_sequence_length_divisible_by": 1, - "generation": { - "temperature": 0.8, - }, } diff --git a/tests/unit/models/policy/test_dtensor_worker.py b/tests/unit/models/policy/test_dtensor_worker.py index 8ff416059e..7f175b3f15 100644 --- a/tests/unit/models/policy/test_dtensor_worker.py +++ b/tests/unit/models/policy/test_dtensor_worker.py @@ -294,10 +294,6 @@ def training_setup(request, two_gpu_virtual_cluster): (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, True, False, True), (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, False, True, True), (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, True, True, True), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, True, True, False), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, True, False, True), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, False, True, True), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, True, True, True), ], indirect=True, ) @@ -425,8 +421,6 @@ def logprob_setup(request, two_gpu_virtual_cluster): (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 2, False, False, False), (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 2, False, True, False), (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 2, False, True, True), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 2, False, True, False), - (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 2, False, False, False), ], indirect=True, ) diff --git a/tests/unit/test_recipes_and_test_suites.py b/tests/unit/test_recipes_and_test_suites.py index 47d1d2f45b..9bac39188e 100644 --- a/tests/unit/test_recipes_and_test_suites.py +++ b/tests/unit/test_recipes_and_test_suites.py @@ -19,8 +19,6 @@ dir_path = os.path.dirname(os.path.abspath(__file__)) project_root = os.path.abspath(os.path.join(dir_path, "..", "..")) -configs_dir = os.path.join(project_root, "examples", "configs") -recipes_dir = os.path.join(project_root, "examples", "configs", "recipes") test_suites_dir = os.path.join(project_root, "tests", "test_suites") nightly_test_suite_path = os.path.join(test_suites_dir, "nightly.txt") @@ -32,13 +30,6 @@ test_suites_dir, "release_performance.txt" ) -# Relative to project root -ALGO_MAPPING_TO_BASE_YAML = { - "sft": "examples/configs/sft.yaml", - "dpo": "examples/configs/dpo.yaml", - "grpo": "examples/configs/grpo_math_1B.yaml", -} - @pytest.fixture def nightly_test_suite(): @@ -99,16 +90,6 @@ def all_test_suites( ) -@pytest.fixture -def all_recipe_yaml_rel_paths(): - all_recipes = [] - for recipe_path in glob.glob( - os.path.join(recipes_dir, "**", "*.yaml"), recursive=True - ): - 
all_recipes.append(recipe_path[len(recipes_dir) + 1 :]) - return all_recipes - - @pytest.mark.parametrize( "test_suite_path", [ @@ -131,14 +112,12 @@ def test_test_suites_exist(test_suite_path): def test_no_overlap_across_test_suites(all_test_suites): - all_tests = set(all_test_suites) - assert len(all_tests) == len(all_test_suites), ( - f"Test suites have repeats {all_tests}" - ) + recipes = set(all_test_suites) + assert len(recipes) == len(all_test_suites), f"Test suites have repeats {recipes}" -def test_all_test_scripts_accounted_for_in_test_suites(all_test_suites): - all_test_scripts_in_test_suites = set(all_test_suites) +def test_all_recipes_accounted_for_in_test_suites(all_test_suites): + all_recipes_in_test_suites = set(all_test_suites) all_tests_in_test_suites_dir = set() for recipe_path in glob.glob( @@ -148,37 +127,8 @@ def test_all_test_scripts_accounted_for_in_test_suites(all_test_suites): recipe_name = recipe_path[len(project_root) + 1 :] all_tests_in_test_suites_dir.add(recipe_name) - assert all_test_scripts_in_test_suites == all_tests_in_test_suites_dir, ( - "All test scripts are not accounted for in the test suites" - ) - - -def test_all_recipe_yamls_accounted_for_in_test_suites( - all_recipe_yaml_rel_paths, all_test_suites -): - """This test along with test_all_test_scripts_accounted_for_in_test_suites() ensures that all recipe yaml/test scripts/test_suite(txts) are in sync.""" - assert len(set(all_recipe_yaml_rel_paths)) == len(set(all_test_suites)), ( - "Recipe YAMLs should be accounted for in the test suites" - ) - - all_test_script_paths_in_test_suites = set() - for test_script in all_test_suites: - # Each test suite is relative from project root - test_script_rel_to_test_suites_dir = test_script[ - len(os.path.join("tests", "test_suites")) + 1 : - ] - all_test_script_paths_in_test_suites.add(test_script_rel_to_test_suites_dir) - - # Since we're comparing yaml to sh, chop off the .sh/.yaml extensions for comparison - all_test_script_paths_in_test_suites = { - os.path.splitext(path)[0] for path in all_test_script_paths_in_test_suites - } - all_recipe_yaml_rel_paths = { - os.path.splitext(path)[0] for path in all_recipe_yaml_rel_paths - } - - assert all_test_script_paths_in_test_suites == set(all_recipe_yaml_rel_paths), ( - "All recipe YAMLs are not accounted for in the test suites" + assert all_recipes_in_test_suites == all_tests_in_test_suites_dir, ( + "All recipes are not accounted for in the test suites" ) @@ -265,37 +215,3 @@ def test_all_tests_can_find_config_if_dryrun(all_test_suites): assert result.returncode == 0, ( f"Command failed with exit code {result.returncode}" ) - - -def test_all_recipes_start_with_algo_hyphen(all_recipe_yaml_rel_paths): - expected_algos = set(ALGO_MAPPING_TO_BASE_YAML.keys()) - for recipe_yaml in all_recipe_yaml_rel_paths: - basename = os.path.basename(recipe_yaml) - algo = basename.split("-")[0] - assert algo in expected_algos, ( - f"Recipe {recipe_yaml} has unexpected algo {algo}" - ) - - -@pytest.mark.parametrize("algo, algo_base_yaml", ALGO_MAPPING_TO_BASE_YAML.items()) -def test_all_recipes_can_merge_configs_with_base_config( - all_recipe_yaml_rel_paths, all_test_suites, algo, algo_base_yaml -): - from omegaconf import OmegaConf - - base_yaml = os.path.join(project_root, algo_base_yaml) - base_config = OmegaConf.load(base_yaml) - # Would result in an error if we couldn't merge our config with the recipe's config - OmegaConf.set_struct(base_config, True) - for recipe_yaml in all_recipe_yaml_rel_paths: - if not 
os.path.basename(recipe_yaml).startswith(algo): - # Skipping here b/c we test that all recipes start with the algo-hyphen in - # test_all_recipes_start_with_algo_hyphen() - continue - recipe_yaml_path = os.path.join(recipes_dir, recipe_yaml) - recipe_config = OmegaConf.load(recipe_yaml_path) - OmegaConf.set_struct(recipe_config, True) - # This will raise a error if the config can't be merged - print(f"Merging {recipe_yaml} with {base_yaml}") - merged_config = OmegaConf.merge(base_config, recipe_config) - print(merged_config) diff --git a/tests/unit/tools/test_tools.py b/tests/unit/tools/test_tools.py new file mode 100644 index 0000000000..a22ca03c3c --- /dev/null +++ b/tests/unit/tools/test_tools.py @@ -0,0 +1,351 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from copy import deepcopy + +import pytest +import ray +import torch +from datasets import load_dataset +from transformers import AutoTokenizer + +from nemo_rl.distributed.batched_data_dict import BatchedDataDict +from nemo_rl.distributed.virtual_cluster import RayVirtualCluster +from nemo_rl.models.generation.interfaces import configure_generation_config +from nemo_rl.models.generation.vllm import VllmConfig, VllmGeneration +from nemo_rl.models.policy.hf_policy import HfPolicy, PolicyConfig +from nemo_rl.tools.generation import generate_with_code_and_tools +from nemo_rl.tools.tools import BM25Retriever, StatefulCodeExecutor + +MODEL_NAME = "meta-llama/Llama-3.2-1B" + + +# Define basic vLLM test config +basic_vllm_test_config: VllmConfig = { + "backend": "vllm", + "model_name": MODEL_NAME, + "tokenizer_name": None, + "dtype": "bfloat16", + "max_new_tokens": 100, + "temperature": 1.0, + "top_p": 1.0, + "top_k": None, + "stop_token_ids": None, + "stop_strings": None, + "vllm_cfg": { + "tensor_parallel_size": 1, + "gpu_memory_utilization": 0.3, + "max_model_len": 1024, + }, +} + +basic_hf_test_config: PolicyConfig = { + "model_name": MODEL_NAME, + "tokenizer_name": None, + "generation_batch_size": 1, + "generation": { + "backend": "hf", + "max_new_tokens": 100, + "temperature": 1.0, + "top_p": 1.0, + "top_k": None, + "stop_token_ids": None, + "stop_strings": None, + }, + # Required training parameters + "train_global_batch_size": 1, + "train_micro_batch_size": 1, + "learning_rate": 5e-6, + "logprob_batch_size": 1, + "max_new_tokens": 16, + "do_sample": False, + "precision": "float32", + "activation_checkpointing_enabled": False, + "fsdp_offload_enabled": False, + "optimizer": { + "name": "torch.optim.AdamW", + "kwargs": { + "lr": 5e-6, + "weight_decay": 0.01, + "betas": [0.9, 0.999], + "eps": 1e-8, + }, + }, + "dtensor_cfg": {"enabled": False}, +} + + +@pytest.fixture(scope="module") +def cluster(): + """Create a virtual cluster for testing.""" + # Create a cluster with 1 node that has 1 GPU bundles + virtual_cluster = RayVirtualCluster( + bundle_ct_per_node_list=[1], # 1 node with 1 GPU bundle + use_gpus=True, + max_colocated_worker_groups=2, + num_gpus_per_node=1, # Use 
available GPUs + name="vllm-test-cluster", + ) + yield virtual_cluster + virtual_cluster.shutdown() + + +@pytest.fixture(scope="function") +def tokenizer(): + """Loads the tokenizer for the tests.""" + print(f"Loading tokenizer: {MODEL_NAME}") + tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + print( + f"Tokenizer loaded. Pad token: {tokenizer.pad_token} (ID: {tokenizer.pad_token_id}), EOS token: {tokenizer.eos_token} (ID: {tokenizer.eos_token_id})" + ) + return tokenizer + + +def test_vllm_execute_code(cluster, tokenizer): + """Test that vLLM can call the code executor.""" + # Prepare test data + codes = [ + "x = 3; y = 4\nThis is some regular text.\nx + y\n", + "\ndef f(x):\n return x * x\n\nf(2)\n\n", + ] + results = ["7", "\n\n4\n"] + results = [code + result for code, result in zip(codes, results)] + + test_prompts = [code * 4 for code in codes] + encodings = tokenizer( + test_prompts, + padding="max_length", + max_length=1024, + return_tensors="pt", + padding_side="right", + ) + input_lengths = encodings["attention_mask"].sum(dim=1).to(torch.int32) + batch = BatchedDataDict( + { + "input_ids": encodings["input_ids"], + "input_lengths": input_lengths, + } + ) + + # Create separate configs for each policy + vllm_config = basic_vllm_test_config.copy() + vllm_config = configure_generation_config(vllm_config, tokenizer, is_eval=True) + + # Create vLLM generation + vllm_generation = VllmGeneration(cluster, vllm_config) + + # Generate and check result + outputs = generate_with_code_and_tools( + vllm_generation, batch, tokenizer, greedy=True + ) + + all_output_ids = outputs["output_ids"] + logprobs = outputs["logprobs"] + input_lengths = outputs["unpadded_sequence_lengths"] - outputs["generation_lengths"] + output_lengths = outputs["unpadded_sequence_lengths"] + input_ids = [] + output_ids = [] + for all_output_id, input_length, output_length in zip( + all_output_ids, input_lengths, output_lengths + ): + input_ids.append(all_output_id[:input_length]) + output_ids.append(all_output_id[input_length:output_length]) + indices = torch.arange(all_output_ids.shape[-1]) + input_lengths = input_lengths.unsqueeze(-1) + output_lengths = output_lengths.unsqueeze(-1) + is_generated = (indices >= input_lengths) & (indices < output_lengths) + + input_texts = tokenizer.batch_decode(input_ids, skip_special_tokens=True) + output_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + + assert input_texts == test_prompts, "Unexpected modification to input texts" + assert output_texts == results, f"Expect {results}, got wrong output {output_texts}" + assert (logprobs[~is_generated] == 0.0).all(), ( + "Unexpected log probabilities on input tokens or paddings" + ) + assert (logprobs[is_generated] != 0.0).all(), ( + "Generated tokens must have non-trivial log probabilities" + ) + + # Clean up + vllm_generation.shutdown() + + +def test_hf_execute_code(cluster, tokenizer): + """Test that Huggingface models can call the code executor.""" + # Prepare test data + codes = [ + "x = 3; y = 4\nThis is some regular text.\nx + y\n", + "\ndef f(x):\n return x * x\n\nf(2)\n\n", + ] + results = ["7", "\n\n4\n"] + results = [code + result for code, result in zip(codes, results)] + + test_prompts = [code * 4 for code in codes] + encodings = tokenizer( + test_prompts, + padding="max_length", + max_length=1024, + return_tensors="pt", + padding_side="right", + ) + input_lengths = 
encodings["attention_mask"].sum(dim=1).to(torch.int32) + batch = BatchedDataDict( + { + "input_ids": encodings["input_ids"], + "input_lengths": input_lengths, + } + ) + + # Create separate configs for each policy + hf_config = deepcopy(basic_hf_test_config) + hf_config["generation"] = configure_generation_config( + hf_config["generation"], + tokenizer, # is_eval=True + ) + + # Create vLLM generation + hf_policy = HfPolicy( + cluster, hf_config, tokenizer, init_reference_model=False, init_optimizer=False + ) + + # Generate and check result + outputs = generate_with_code_and_tools(hf_policy, batch, tokenizer, greedy=True) + + all_output_ids = outputs["output_ids"] + logprobs = outputs["logprobs"] + input_lengths = outputs["unpadded_sequence_lengths"] - outputs["generation_lengths"] + output_lengths = outputs["unpadded_sequence_lengths"] + input_ids = [] + output_ids = [] + for all_output_id, input_length, output_length in zip( + all_output_ids, input_lengths, output_lengths + ): + input_ids.append(all_output_id[:input_length]) + output_ids.append(all_output_id[input_length:output_length]) + indices = torch.arange(all_output_ids.shape[-1]) + input_lengths = input_lengths.unsqueeze(-1) + output_lengths = output_lengths.unsqueeze(-1) + is_generated = (indices >= input_lengths) & (indices < output_lengths) + + input_texts = tokenizer.batch_decode(input_ids, skip_special_tokens=True) + output_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + + assert input_texts == test_prompts, "Unexpected modification to input texts" + assert output_texts == results, f"Expect {results}, got wrong output {output_texts}" + assert (logprobs[~is_generated] == 0.0).all(), ( + "Unexpected log probabilities on input tokens or paddings" + ) + assert (logprobs[is_generated] != 0.0).all(), ( + "Generated tokens must have non-trivial log probabilities" + ) + + # Clean up + hf_policy.shutdown() + + +def test_untrusted_code(cluster): + """Test whether the code executor can block untrusted code.""" + executor = StatefulCodeExecutor.remote() + + # accessing temporary files shouldn't be blocked + code = ( + "with open('allowed_file.txt', 'w') as fout:\n" + " fout.write('some content')\n" + "with open('allowed_file.txt') as fin:\n" + " content = fin.read()\n" + "content" + ) + result = ray.get(executor.__call__.remote(code)) + assert result == "some content" + + # accessing other files should be blocked + code = "with open('/etc/passwd', 'r') as fin:\n fin.read()" + result = ray.get(executor.__call__.remote(code)) + assert isinstance(result, PermissionError) + + # importing non-sensitive modules shouldn't be blocked + code = "import math\nround(math.sqrt(8))" + result = ray.get(executor.__call__.remote(code)) + assert result == 3 + + # importing sensitive modules should be blocked + code = "import os" + result = ray.get(executor.__call__.remote(code)) + assert isinstance(result, PermissionError) + + +@pytest.mark.timeout(150) +def test_vllm_use_tool(cluster, tokenizer): + """Test that vLLM can use tool in the code executor.""" + # Prepare test data + codes = ["retrieve('Jen-Hsun Huang')\n"] + results = [ + "\n\n" + "['Nvidia was established in 1993 by Jen-Hsun Huang, Curtis Priem, and Chris '\n" + " 'Malachowsky. 
In 2000 Nvidia took intellectual possession of 3dfx, one of the '\n" + " 'biggest GPU producers in 1990s.']\n" + "" + ] + results = [code + result for code, result in zip(codes, results)] + + test_prompts = [code * 4 for code in codes] + encodings = tokenizer( + test_prompts, + padding="max_length", + max_length=1024, + return_tensors="pt", + padding_side="right", + ) + input_lengths = encodings["attention_mask"].sum(dim=1).to(torch.int32) + batch = BatchedDataDict( + { + "input_ids": encodings["input_ids"], + "input_lengths": input_lengths, + } + ) + + # Construct retriever + dataset = load_dataset("rahular/simple-wikipedia") + documents = [sample["text"] for sample in dataset["train"]] + tool_map = {"retrieve": BM25Retriever(documents, num_result=1)} + + # Create separate configs for each policy + vllm_config = basic_vllm_test_config.copy() + vllm_config = configure_generation_config(vllm_config, tokenizer, is_eval=True) + + # Create vLLM generation + vllm_generation = VllmGeneration(cluster, vllm_config) + + # Generate and check result + outputs = generate_with_code_and_tools( + vllm_generation, batch, tokenizer, tool_map=tool_map, greedy=True + ) + + all_output_ids = outputs["output_ids"] + input_lengths = outputs["unpadded_sequence_lengths"] - outputs["generation_lengths"] + output_lengths = outputs["unpadded_sequence_lengths"] + output_ids = [] + for all_output_id, input_length, output_length in zip( + all_output_ids, input_lengths, output_lengths + ): + output_ids.append(all_output_id[input_length:output_length]) + + output_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + + assert output_texts == results, f"Expect {results}, got wrong output {output_texts}" + + # Clean up + vllm_generation.shutdown() diff --git a/tests/unit/utils/test_checkpoint.py b/tests/unit/utils/test_checkpoint.py index c5a90c7932..2a912e94b2 100644 --- a/tests/unit/utils/test_checkpoint.py +++ b/tests/unit/utils/test_checkpoint.py @@ -17,7 +17,6 @@ import numpy as np import pytest import torch -import yaml from nemo_rl.utils.checkpoint import CheckpointManager @@ -63,8 +62,8 @@ def test_init_tmp_checkpoint(checkpoint_manager, checkpoint_dir): assert isinstance(saved_metadata["numpy"], (int, float)) # Check if config was saved - with open(save_dir / "config.yaml", "r") as f: - saved_config = yaml.safe_load(f) + with open(save_dir / "config.json", "r") as f: + saved_config = json.load(f) assert saved_config == run_config diff --git a/tests/unit/utils/test_native_checkpoint.py b/tests/unit/utils/test_native_checkpoint.py index f751c5c47e..7cebeade90 100755 --- a/tests/unit/utils/test_native_checkpoint.py +++ b/tests/unit/utils/test_native_checkpoint.py @@ -61,9 +61,6 @@ "tensor_parallel_size": 1, }, "max_grad_norm": 1.0, - "generation": { - "temperature": 1.0, - }, } @@ -286,6 +283,77 @@ def test_save_and_load_model_and_optimizer(mock_experiment): check_dict_equality(new_optimizer.state_dict(), optimizer.state_dict()) +@pytest.mark.parametrize("num_gpus", [1, 2], ids=["1gpu", "2gpu"]) +def test_save_and_load_hf_checkpoint(policy, num_gpus): + ## warm up with a forward pass + ## this is needed before saving a checkpoint because FSDP does some lazy initialization + input_ids = torch.randint(0, 16000, (4, 128)) # 4 sequences, each of length 128 + attention_mask = torch.ones(4, 128) + input_lengths = attention_mask.sum(dim=1).to(torch.int32) + dummy_fwd_dict = BatchedDataDict( + { + "input_ids": input_ids, + "input_lengths": input_lengths, + "attention_mask": attention_mask, + "labels": 
torch.randint(0, 16000, (4, 128)), + } + ) + policy.get_logprobs(dummy_fwd_dict) + + with TemporaryDirectory() as tmp_dir: + policy.save_checkpoint( + os.path.join(tmp_dir, "test_hf_and_dcp"), + save_hf=True, + save_torch_dist=True, + tokenizer_path=os.path.join(tmp_dir, "test_hf_and_dcp_tokenizer"), + ) + + ## make sure we save both HF and DCP checkpoints + # Dynamically create the expected set of distcp files based on num_gpus + expected_distcp_files = {f"__{rank}_0.distcp" for rank in range(num_gpus)} + expected_files = expected_distcp_files.union({".metadata"}) + + assert ( + set(os.listdir(os.path.join(tmp_dir, "test_hf_and_dcp"))) == expected_files + ) + assert set(os.listdir(os.path.join(tmp_dir, "test_hf_and_dcp_tokenizer"))) == { + "tokenizer_config.json", + "tokenizer.json", + "special_tokens_map.json", + } + + converted_model = AutoModelForCausalLM.from_pretrained( + os.path.join(tmp_dir, "test_hf_and_dcp-hf") + ) + + hf_save_dir = os.path.join(tmp_dir, "test_hf_and_dcp-hf") + hf_files = set(os.listdir(hf_save_dir)) + + # Check the HF saved files structure: could be single or sharded + expected_common_hf_files = {"config.json", "generation_config.json"} + if "model.safetensors" in hf_files: + # Single file format (1 GPU or smaller model) + expected_hf_files = expected_common_hf_files.union({"model.safetensors"}) + else: + # Sharded format (>=2 GPUs or larger model) + expected_hf_files = expected_common_hf_files.union( + { + "model-00001-of-00002.safetensors", + "model-00002-of-00002.safetensors", + "model.safetensors.index.json", + } + ) + assert hf_files == expected_hf_files + + coverted_model = AutoModelForCausalLM.from_pretrained(hf_save_dir) + original_model = AutoModelForCausalLM.from_pretrained( + simple_policy_config["model_name"] + ) + + ## make sure converted model matches the original + check_dict_equality(converted_model.state_dict(), original_model.state_dict()) + + @pytest.mark.parametrize("num_gpus", [1, 2], ids=["1gpu", "2gpu"]) def test_convert_dcp_to_hf(policy, num_gpus): ## warm up with a forward pass @@ -306,6 +374,8 @@ def test_convert_dcp_to_hf(policy, num_gpus): with TemporaryDirectory() as tmp_dir: policy.save_checkpoint( os.path.join(tmp_dir, "test_hf_and_dcp"), + save_hf=True, + save_torch_dist=True, ) # Dynamically create the expected set of distcp files based on num_gpus @@ -317,6 +387,25 @@ def test_convert_dcp_to_hf(policy, num_gpus): set(os.listdir(os.path.join(tmp_dir, "test_hf_and_dcp"))) == expected_files ) + # Check the HF saved files structure: could be single or sharded + hf_save_dir = os.path.join(tmp_dir, "test_hf_and_dcp-hf") + hf_files = set(os.listdir(hf_save_dir)) + expected_common_hf_files = {"config.json", "generation_config.json"} + + if "model.safetensors" in hf_files: + # Single file format (1 GPU or smaller model) + expected_hf_files = expected_common_hf_files.union({"model.safetensors"}) + else: + # Sharded format (>=2 GPUs or larger model) + expected_hf_files = expected_common_hf_files.union( + { + "model-00001-of-00002.safetensors", + "model-00002-of-00002.safetensors", + "model.safetensors.index.json", + } + ) + assert hf_files == expected_hf_files + offline_converted_model_path = convert_dcp_to_hf( os.path.join(tmp_dir, "test_hf_and_dcp"), os.path.join(tmp_dir, "test_hf_and_dcp-hf-offline"), @@ -334,11 +423,18 @@ def test_convert_dcp_to_hf(policy, num_gpus): offline_converted_model_path ) + online_converted_model = AutoModelForCausalLM.from_pretrained( + os.path.join(tmp_dir, "test_hf_and_dcp-hf") + ) original_model = 
AutoModelForCausalLM.from_pretrained( simple_policy_config["model_name"] ) - # Ensure the offline checkpoint is different from the original + ## make sure both conversions results in the same state dict + check_dict_equality( + online_converted_model.state_dict(), offline_converted_model.state_dict() + ) + # Ensure the offline one is different from the original assert_recursive_dict_different( offline_converted_model.state_dict(), original_model.state_dict() ) From 09e7a805da8006320667cb14b2b97d0d0ce85a27 Mon Sep 17 00:00:00 2001 From: KiddoZhu Date: Tue, 6 May 2025 14:01:20 -0700 Subject: [PATCH 2/7] revert from main branch Signed-off-by: KiddoZhu --- .gitignore | 6 +- nemo_rl/algorithms/dpo.py | 12 +- nemo_rl/algorithms/grpo.py | 7 -- nemo_rl/algorithms/sft.py | 10 -- nemo_rl/data/llm_message_utils.py | 6 + nemo_rl/environments/math_environment.py | 23 +++- nemo_rl/experience/rollouts.py | 4 +- nemo_rl/models/dtensor/parallelize.py | 27 ++++- nemo_rl/models/generation/vllm.py | 2 - nemo_rl/models/policy/__init__.py | 2 +- .../models/policy/dtensor_policy_worker.py | 15 +-- nemo_rl/models/policy/fsdp1_policy_worker.py | 21 ++-- nemo_rl/models/policy/hf_policy.py | 4 - nemo_rl/utils/checkpoint.py | 7 +- nemo_rl/utils/native_checkpoint.py | 44 ++------ tests/functional/dpo.sh | 2 +- tests/test_suites/README.md | 7 +- tests/test_suites/nightly.txt | 10 +- tests/test_suites/release.txt | 4 +- tests/unit/conftest.py | 31 ++++++ tests/unit/data/test_llm_message_utils.py | 35 ++++++ tests/unit/experience/test_rollouts.py | 40 +++++++ .../models/generation/test_vllm_generation.py | 5 +- .../unit/models/policy/test_dtensor_worker.py | 6 + tests/unit/test_recipes_and_test_suites.py | 96 +++++++++++++++- tests/unit/utils/test_checkpoint.py | 5 +- tests/unit/utils/test_native_checkpoint.py | 104 +----------------- 27 files changed, 315 insertions(+), 220 deletions(-) diff --git a/.gitignore b/.gitignore index 46efa31b70..27a0fca478 100644 --- a/.gitignore +++ b/.gitignore @@ -15,14 +15,14 @@ apidocs/ dist/ *.egg-info/ *.vscode/ +release_run* +ckpts/ # Test coverage.json .coverage* unit_results.json unit_results/ -release_run* -ckpts/ test_assets/ # Cache @@ -35,4 +35,4 @@ docker/ wandb/ checkpoints/ results/ -code_snapshots/ \ No newline at end of file +code_snapshots/ diff --git a/nemo_rl/algorithms/dpo.py b/nemo_rl/algorithms/dpo.py index dd6607ef9d..0647f0cd5a 100644 --- a/nemo_rl/algorithms/dpo.py +++ b/nemo_rl/algorithms/dpo.py @@ -446,14 +446,6 @@ def dpo_train( % master_config["checkpointing"]["save_period"] == 0 ): # +1 because step is 0-indexed - is_last_checkpoint = ( - min( - len(train_dataloader) * max_num_epochs, - master_config["dpo"]["max_num_steps"], - ) - - (total_steps + 1) - < master_config["checkpointing"]["save_period"] - ) dpo_save_state["step"] = (current_step + 1) % len(train_dataloader) dpo_save_state["total_steps"] = total_steps + 1 dpo_save_state["epoch"] = current_epoch @@ -470,7 +462,9 @@ def dpo_train( optimizer_path=os.path.join( checkpoint_path, "policy", "optimizer" ), - save_hf=is_last_checkpoint, + tokenizer_path=os.path.join( + checkpoint_path, "policy", "tokenizer" + ), ) torch.save( train_dataloader.state_dict(), diff --git a/nemo_rl/algorithms/grpo.py b/nemo_rl/algorithms/grpo.py index 952a6c172a..5a007451d0 100644 --- a/nemo_rl/algorithms/grpo.py +++ b/nemo_rl/algorithms/grpo.py @@ -524,12 +524,6 @@ def grpo_train( ): # +1 because step is 0-indexed policy.prepare_for_training() - is_last_checkpoint = ( - min(len(dataloader), 
master_config["grpo"]["max_num_steps"]) - - (step + 1) - < master_config["checkpointing"]["save_period"] - ) - grpo_save_state["step"] = step + 1 grpo_save_state["val_reward"] = val_metrics["accuracy"] grpo_save_state["consumed_samples"] = consumed_samples @@ -546,7 +540,6 @@ def grpo_train( tokenizer_path=os.path.join( checkpoint_path, "policy", "tokenizer" ), - save_hf=is_last_checkpoint, ) torch.save( dataloader.state_dict(), diff --git a/nemo_rl/algorithms/sft.py b/nemo_rl/algorithms/sft.py index 8b5ffcddfd..d10c3df483 100644 --- a/nemo_rl/algorithms/sft.py +++ b/nemo_rl/algorithms/sft.py @@ -447,15 +447,6 @@ def sft_train( % master_config["checkpointing"]["save_period"] == 0 ): # +1 because step is 0-indexed - is_last_checkpoint = ( - min( - len(train_dataloader) * max_num_epochs, - master_config["sft"]["max_num_steps"], - ) - - (total_steps + 1) - < master_config["checkpointing"]["save_period"] - ) - sft_save_state["step"] = (current_step + 1) % len(train_dataloader) sft_save_state["total_steps"] = total_steps + 1 sft_save_state["epoch"] = current_epoch @@ -476,7 +467,6 @@ def sft_train( tokenizer_path=os.path.join( checkpoint_path, "policy", "tokenizer" ), - save_hf=is_last_checkpoint, ) torch.save( train_dataloader.state_dict(), diff --git a/nemo_rl/data/llm_message_utils.py b/nemo_rl/data/llm_message_utils.py index f2d24fc421..51cd5a279d 100644 --- a/nemo_rl/data/llm_message_utils.py +++ b/nemo_rl/data/llm_message_utils.py @@ -421,6 +421,12 @@ def get_formatted_message_log( new_message["token_ids"] = tokenizer( message_chunk, return_tensors="pt", add_special_tokens=False )["input_ids"][0] + if len(new_message["token_ids"]) == 0: + # if there is an empty message, the empty `token_ids` tensor ends up being in fp32, + # which causes `_validate_tensor_consistency` to fail. To fix this, we convert the + # empty tensor to int64. 
+ new_message["token_ids"] = new_message["token_ids"].to(torch.int64) + new_message["content"] = message_chunk new_message_log.append(new_message) diff --git a/nemo_rl/environments/math_environment.py b/nemo_rl/environments/math_environment.py index 8da0528652..fd968298b0 100644 --- a/nemo_rl/environments/math_environment.py +++ b/nemo_rl/environments/math_environment.py @@ -15,7 +15,8 @@ import ray import torch -from math_verify import parse, verify +from math_verify.metric import math_metric +from math_verify.parser import ExprExtractionConfig, LatexExtractionConfig from nemo_rl.distributed.batched_data_dict import BatchedDataDict from nemo_rl.distributed.virtual_cluster import PY_EXECUTABLES @@ -53,9 +54,23 @@ def verify( results = [] for response, ground_truth in zip(pred_responses, ground_truths): try: - gold = parse(ground_truth) - pred = parse(response[-100:]) # avoid looking at the whole string - results.append(float(verify(gold, pred))) + # Use Latex and plain math extraction from predictions + # https://github.com/huggingface/Math-Verify?tab=readme-ov-file#extraction-targets + verify_func = math_metric( + gold_extraction_target=(LatexExtractionConfig(),), + pred_extraction_target=( + ExprExtractionConfig(), + LatexExtractionConfig(), + ), + ) + + ground_truth_parsable = "\\boxed{" + ground_truth + "}" + try: + ret_score, _ = verify_func([ground_truth_parsable], [response]) + except Exception: + ret_score = 0.0 + + results.append(float(ret_score)) except Exception: results.append(0) return results diff --git a/nemo_rl/experience/rollouts.py b/nemo_rl/experience/rollouts.py index a556a32a42..567add0dfc 100644 --- a/nemo_rl/experience/rollouts.py +++ b/nemo_rl/experience/rollouts.py @@ -311,7 +311,9 @@ def run_multi_turn_rollout( >= max_seq_len ): # truncate - tokenized_obs = tokenized_obs[: max_seq_len - active_input_lengths[i]] + tokenized_obs = tokenized_obs[ + : max_seq_len - (len(generated_ids[i]) + active_input_lengths[i]) + ] truncation_mask[i] = True # Record truncation sample_truncated[active_indices[i]] = True diff --git a/nemo_rl/models/dtensor/parallelize.py b/nemo_rl/models/dtensor/parallelize.py index 3ae86d70cc..5998937cc9 100644 --- a/nemo_rl/models/dtensor/parallelize.py +++ b/nemo_rl/models/dtensor/parallelize.py @@ -30,6 +30,7 @@ from torch.distributed.tensor.placement_types import Replicate, Shard from transformers.models.llama.modeling_llama import LlamaForCausalLM from transformers.models.qwen2.modeling_qwen2 import Qwen2ForCausalLM +from transformers.models.qwen3.modeling_qwen3 import Qwen3ForCausalLM from nemo_rl.distributed.model_utils import from_parallel_logits_to_logprobs @@ -98,7 +99,7 @@ def _parallelize_llama( def _parallelize_qwen( - model: Qwen2ForCausalLM, + model: Union[Qwen2ForCausalLM, Qwen3ForCausalLM], dp_mesh: DeviceMesh, tp_mesh: DeviceMesh, mp_policy: MixedPrecisionPolicy, @@ -108,7 +109,7 @@ def _parallelize_qwen( ): """Parallelizes a Qwen2ForCausalLM model across data and tensor parallel dimensions.""" - class Qwen2RotaryEmbedParallel(SequenceParallel): + class QwenRotaryEmbedParallel(SequenceParallel): """Custom SequenceParallel class for Qwen2 rotary embeddings because the input is a tuple.""" @staticmethod @@ -141,6 +142,23 @@ def _prepare_input_fn(sequence_sharding, mod, inputs, device_mesh): return type(inputs)(new_inputs) + class Qwen3QKNorm(SequenceParallel): + @staticmethod + def _prepare_input_fn(sequence_sharding, mod, inputs, device_mesh): + input_tensor = inputs[0] + + if isinstance(input_tensor, DTensor): + assert 
input_tensor.placements == (Shard(dim=2),) + elif isinstance(input_tensor, torch.Tensor): + # assume the input passed in already sharded on the sequence dim and create the DTensor + return DTensor.from_local( + input_tensor, device_mesh, sequence_sharding, run_check=False + ) + else: + raise ValueError( + f"expecting input of {mod} to be a torch.Tensor or DTensor, but got {input_tensor}" + ) + if tp_mesh.size() > 1: assert not model.config.tie_word_embeddings, ( "Tie word embeddings not supported when TP is enabled" @@ -156,7 +174,7 @@ def _prepare_input_fn(sequence_sharding, mod, inputs, device_mesh): input_layouts=Replicate(), output_layouts=Shard(1), ), - "model.rotary_emb": Qwen2RotaryEmbedParallel(), + "model.rotary_emb": QwenRotaryEmbedParallel(), "model.norm": SequenceParallel(), "model.layers.*.input_layernorm": SequenceParallel(), "model.layers.*.self_attn.q_proj": ColwiseParallel( @@ -171,6 +189,8 @@ def _prepare_input_fn(sequence_sharding, mod, inputs, device_mesh): "model.layers.*.self_attn.o_proj": RowwiseParallel( output_layouts=Shard(1) ), + "model.layers.*.self_attn.q_norm": Qwen3QKNorm(), + "model.layers.*.self_attn.k_norm": Qwen3QKNorm(), "model.layers.*.post_attention_layernorm": SequenceParallel(), "model.layers.*.mlp.up_proj": ColwiseParallel(), "model.layers.*.mlp.gate_proj": ColwiseParallel(), @@ -214,6 +234,7 @@ def _prepare_input_fn(sequence_sharding, mod, inputs, device_mesh): PARALLIZE_FUNCTIONS = { Qwen2ForCausalLM: _parallelize_qwen, + Qwen3ForCausalLM: _parallelize_qwen, LlamaForCausalLM: _parallelize_llama, } diff --git a/nemo_rl/models/generation/vllm.py b/nemo_rl/models/generation/vllm.py index 4128f6a9cc..59fcc26320 100644 --- a/nemo_rl/models/generation/vllm.py +++ b/nemo_rl/models/generation/vllm.py @@ -273,7 +273,6 @@ def generate( # Read generation parameters from config top_k = self.cfg["top_k"] if self.cfg["top_k"] is not None else -1 - sampling_params = self.SamplingParams( temperature=self.cfg["temperature"] if not greedy else 0, top_p=self.cfg["top_p"], @@ -391,7 +390,6 @@ def generate_text( # Read generation parameters from config top_k = self.cfg["top_k"] if self.cfg["top_k"] is not None else -1 - sampling_params = self.SamplingParams( temperature=self.cfg["temperature"] if not greedy else 0, top_p=self.cfg["top_p"], diff --git a/nemo_rl/models/policy/__init__.py b/nemo_rl/models/policy/__init__.py index 47714fb0f5..fbe728a840 100644 --- a/nemo_rl/models/policy/__init__.py +++ b/nemo_rl/models/policy/__init__.py @@ -37,7 +37,7 @@ class PolicyConfig(TypedDict): train_micro_batch_size: int learning_rate: float logprob_batch_size: int - generation: GenerationConfig + generation: Optional[GenerationConfig] precision: str dtensor_cfg: DTensorConfig make_sequence_length_divisible_by: int diff --git a/nemo_rl/models/policy/dtensor_policy_worker.py b/nemo_rl/models/policy/dtensor_policy_worker.py index 29ecd46452..c99110d7e7 100644 --- a/nemo_rl/models/policy/dtensor_policy_worker.py +++ b/nemo_rl/models/policy/dtensor_policy_worker.py @@ -335,6 +335,10 @@ def train( else: logits = outputs.logits + # Divide logits by temperature + if "generation" in self.cfg and self.cfg["generation"] is not None: + logits.div_(self.cfg["generation"]["temperature"]) + loss, loss_metrics = loss_fn(logits, mb) num_valid_samples = loss_metrics["num_valid_samples"] loss_metrics["lr"] = self.optimizer.param_groups[0]["lr"] @@ -371,10 +375,12 @@ def train( # Update parameters self.optimizer.step() - self.scheduler.step() losses.append(torch.tensor(mb_losses).sum().item()) + # 
increment scheduler after all batches in rollout are processed + self.scheduler.step() + # Compute global loss across all ranks with torch.no_grad(): local_loss = torch.tensor(losses, device="cuda") @@ -714,13 +720,10 @@ def save_checkpoint( weights_path: str, optimizer_path: Optional[str] = None, tokenizer_path: Optional[str] = None, - save_torch_dist: bool = True, - save_hf: bool = False, ): """Save a checkpoint of the model. - the HuggingFace checkpoint is saved only if `save_hf` is True, - and the optimizer states are saved only if `optimizer` and `optimizer_path` are provided. + the optimizer states are saved only if `optimizer` and `optimizer_path` are provided. """ save_checkpoint( model=self.model, @@ -730,8 +733,6 @@ def save_checkpoint( optimizer_path=optimizer_path, tokenizer=self.tokenizer if tokenizer_path else None, tokenizer_path=tokenizer_path, - save_torch_dist=save_torch_dist, - save_hf=save_hf, ) def load_checkpoint(self, weights_path: str, optimizer_path: Optional[str] = None): diff --git a/nemo_rl/models/policy/fsdp1_policy_worker.py b/nemo_rl/models/policy/fsdp1_policy_worker.py index bd3951f3a2..19523394ad 100644 --- a/nemo_rl/models/policy/fsdp1_policy_worker.py +++ b/nemo_rl/models/policy/fsdp1_policy_worker.py @@ -289,6 +289,10 @@ def train( logits = self.model.lm_head(outputs.last_hidden_state) else: logits = outputs.logits + + # Divide logits by temperature + if "generation" in self.cfg and self.cfg["generation"] is not None: + logits.div_(self.cfg["generation"]["temperature"]) loss, loss_metrics = loss_fn(logits, mb) num_valid_samples = loss_metrics["num_valid_samples"] @@ -325,9 +329,11 @@ def train( # Update parameters self.optimizer.step() - self.scheduler.step() losses.append(torch.tensor(mb_losses).sum().item()) + # increment scheduler after all batches in rollout are processed + self.scheduler.step() + # Compute global loss across all ranks with torch.no_grad(): local_loss = torch.tensor(losses, device="cuda") @@ -901,8 +907,6 @@ def save_checkpoint( weights_path: str, optimizer_path: Optional[str] = None, tokenizer_path: Optional[str] = None, - save_torch_dist: bool = True, - save_hf: bool = False, ): """Save a checkpoint of the model. @@ -912,19 +916,12 @@ def save_checkpoint( __0_1.distcp __1_0.distcp ... - weights_path-hf/ - config.json - generation_config.json - model-00001-of-.safetensors - ... - model.safetensors.index.json optimizer_path/ __0_0.distcp __1_0.distcp ... - the HuggingFace checkpoint is saved only if `save_hf` is True, - and the optimizer states are saved only if `optimizer` and `optimizer_path` are provided. + the optimizer states are saved only if `optimizer` and `optimizer_path` are provided. 
""" save_checkpoint( model=self.model, @@ -934,8 +931,6 @@ def save_checkpoint( optimizer_path=optimizer_path, tokenizer=self.tokenizer if tokenizer_path else None, tokenizer_path=tokenizer_path, - save_torch_dist=save_torch_dist, - save_hf=save_hf, ) def load_checkpoint(self, weights_path: str, optimizer_path: Optional[str] = None): diff --git a/nemo_rl/models/policy/hf_policy.py b/nemo_rl/models/policy/hf_policy.py index 2a579e3bcd..2d2dbf3d4c 100644 --- a/nemo_rl/models/policy/hf_policy.py +++ b/nemo_rl/models/policy/hf_policy.py @@ -307,8 +307,6 @@ def save_checkpoint( weights_path: str, optimizer_path: Optional[str] = None, tokenizer_path: Optional[str] = None, - save_torch_dist: bool = True, - save_hf: bool = False, ): """Save a checkpoint of the model.""" futures = self.worker_group.run_all_workers_single_data( @@ -316,8 +314,6 @@ def save_checkpoint( weights_path, optimizer_path, tokenizer_path, - save_torch_dist, - save_hf, only_on="all_tied_workers", ) ray.get(futures) diff --git a/nemo_rl/utils/checkpoint.py b/nemo_rl/utils/checkpoint.py index bc916d3d7e..5f23a0bd68 100644 --- a/nemo_rl/utils/checkpoint.py +++ b/nemo_rl/utils/checkpoint.py @@ -26,6 +26,7 @@ import numpy as np import torch +import yaml class CheckpointingConfig(TypedDict): @@ -56,7 +57,7 @@ class CheckpointManager: checkpoint_dir/ step_0/ training_info.json - config.json + config.yaml policy.py (up to the algorithm loop to save here) policy_optimizer.py (up to the algorithm loop to save here) ... @@ -114,8 +115,8 @@ def init_tmp_checkpoint( # save config if run_config is not None: - with open(save_dir / "config.json", "w") as f: - json.dump(run_config, f) + with open(save_dir / "config.yaml", "w") as f: + yaml.safe_dump(run_config, f) return Path(os.path.abspath(save_dir)) diff --git a/nemo_rl/utils/native_checkpoint.py b/nemo_rl/utils/native_checkpoint.py index 3573d2d86d..fc8f9ba44d 100644 --- a/nemo_rl/utils/native_checkpoint.py +++ b/nemo_rl/utils/native_checkpoint.py @@ -15,7 +15,6 @@ """Checkpoint management utilities for HF models.""" import os -from pathlib import Path from typing import Any, Optional import torch @@ -139,8 +138,6 @@ def save_checkpoint( optimizer_path: Optional[str] = None, tokenizer: Optional[Any] = None, tokenizer_path: Optional[str] = None, - save_torch_dist: bool = True, - save_hf: bool = False, ) -> None: """Save a checkpoint of the model and optionally optimizer state. 
@@ -150,40 +147,17 @@ def save_checkpoint( optimizer: Optional optimizer to save scheduler: Optional scheduler to save optimizer_path: Path to save optimizer state (required if optimizer provided) - save_torch_dist: Whether to save in PyTorch distributed format - save_hf: Whether to save in HuggingFace format """ - if save_hf: - if hasattr(model, "_fsdp_wrapped_module"): - model_state_dict = model._fsdp_wrapped_module.state_dict() - else: - model_state_dict = { - k: v.full_tensor() - if isinstance(v, torch.distributed.tensor.DTensor) - else v - for k, v in model.state_dict().items() - } - - if torch.distributed.get_rank() == 0: - # Create a new path by appending "-hf" to the weights path - hf_weights_path = f"{Path(weights_path)}-hf" - - model.save_pretrained( - hf_weights_path, - state_dict=model_state_dict, - ) + model_state = {"model": ModelState(model)} + dcp.save(model_state, checkpoint_id=weights_path) - if save_torch_dist: - model_state = {"model": ModelState(model)} - dcp.save(model_state, checkpoint_id=weights_path) - - if optimizer is not None: - if optimizer_path is None: - raise ValueError( - "optimizer_path must be provided when saving optimizer state" - ) - optimizer_state = {"optim": OptimizerState(model, optimizer, scheduler)} - dcp.save(optimizer_state, checkpoint_id=optimizer_path) + if optimizer is not None: + if optimizer_path is None: + raise ValueError( + "optimizer_path must be provided when saving optimizer state" + ) + optimizer_state = {"optim": OptimizerState(model, optimizer, scheduler)} + dcp.save(optimizer_state, checkpoint_id=optimizer_path) if tokenizer is not None: if tokenizer_path is None: diff --git a/tests/functional/dpo.sh b/tests/functional/dpo.sh index 200a08cdd7..fb976d6701 100755 --- a/tests/functional/dpo.sh +++ b/tests/functional/dpo.sh @@ -35,5 +35,5 @@ uv run $PROJECT_ROOT/examples/run_dpo.py \ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS uv run tests/check_metrics.py $JSON_METRICS \ - 'data["train/loss"]["2"] < 0.694' \ + 'data["train/loss"]["2"] < 0.715' diff --git a/tests/test_suites/README.md b/tests/test_suites/README.md index 3ccf0d75c9..0759f06f25 100644 --- a/tests/test_suites/README.md +++ b/tests/test_suites/README.md @@ -4,13 +4,18 @@ Each test is named: ``` ---#n#g--.sh +--#n#g--.sh ``` Examples: * sft-llama3.2-1b-1n8g-fsdp2tp1.sh * grpo-qwen2-1.5B-instruct-4n8g-fsdp2tp2.sh * grpo-qwen2-1.5B-instruct-4n8g-fsdp2tp2-long.sh +* grpo-qwen2-1.5B-instruct-4n8g-fsdp2tp2-long.v2.sh + * The final verison suffix (starts with `.v2`, `.v3`, ...), is reserved for cases contributors believe the recipe's + convergence has changed due to their commit. Versioning signals that this recipe should not be compared to its + predecessor due to a change in convergence behavior. Examples of this change include: changing dataset, changing loss, + convergence bug fix. Changes affecting performance do not need a version change. 
## Running manually diff --git a/tests/test_suites/nightly.txt b/tests/test_suites/nightly.txt index 4c609d5bff..b80a7ad545 100644 --- a/tests/test_suites/nightly.txt +++ b/tests/test_suites/nightly.txt @@ -3,15 +3,15 @@ ######## # Short 1N/1B runs (go past 200 steps - usually divergence happens by now) -- going to 4 nodes doesn't help that much -tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.sh -tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.sh +tests/test_suites/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v2.sh +tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.sh # FSDP1 vs Dtensor (Qwen/Qwen2.5-7B-Instruct) -tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.sh -tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.sh +tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.v2.sh +tests/test_suites/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v2.sh # Functional 32b run -tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.sh +tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.v2.sh ####### # SFT # diff --git a/tests/test_suites/release.txt b/tests/test_suites/release.txt index 69735cb0cb..42e9c49d00 100644 --- a/tests/test_suites/release.txt +++ b/tests/test_suites/release.txt @@ -3,10 +3,10 @@ ######## # Long 8b run -tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.sh +tests/test_suites/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v2.sh # Long 32b run -tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.sh +tests/test_suites/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.v2.sh ####### # SFT # diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 2a3ec3a7c9..7c6a9e21bf 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -48,6 +48,9 @@ class TEST_ASSETS: TINY_QWEN2_MODEL_PATH = os.path.join( _TEST_ASSETS_DIR, "tiny_qwen2_with_qwen2_tokenizer" ) + TINY_QWEN3_MODEL_PATH = os.path.join( + _TEST_ASSETS_DIR, "tiny_qwen3_with_qwen3_tokenizer" + ) class UnitTestData(TypedDict): @@ -462,3 +465,31 @@ def tiny_qwen2_model_path(): tokenizer.save_pretrained(model_path) del model, tokenizer yield model_path + + +@pytest.fixture(scope="session", autouse=True) +def tiny_qwen3_model_path(): + """Fixture that returns a path to a tiny llama model with a dummy tokenizer.""" + import shutil + + from transformers import AutoTokenizer, Qwen3Config, Qwen3ForCausalLM + + model_path = TEST_ASSETS.TINY_QWEN3_MODEL_PATH + # hidden_size//num_attention_heads = 32 (smallest value to not error due to vllm paged attention) + # vocab_size=151936 (so we can re-use qwen2 1.5b tokenizer) + config = Qwen3Config( + num_hidden_layers=2, + hidden_size=64, + intermediate_size=32, + num_attention_heads=2, + vocab_size=151936, + tie_word_embeddings=False, + num_key_value_heads=None, + ) + model = Qwen3ForCausalLM(config=config) + tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B") + shutil.rmtree(model_path, ignore_errors=True) + model.save_pretrained(model_path) + tokenizer.save_pretrained(model_path) + del model, tokenizer + yield model_path diff --git a/tests/unit/data/test_llm_message_utils.py b/tests/unit/data/test_llm_message_utils.py index 0a5cb3ef4b..fc4c6c6b8d 100644 --- a/tests/unit/data/test_llm_message_utils.py +++ b/tests/unit/data/test_llm_message_utils.py @@ -18,8 +18,10 @@ import torch from transformers import AutoTokenizer +from nemo_rl.data.hf_datasets import COMMON_CHAT_TEMPLATES from nemo_rl.data.interfaces import LLMMessageLogType, 
TaskDataSpec from nemo_rl.data.llm_message_utils import ( + _validate_tensor_consistency, add_loss_mask_to_message_log, batched_message_log_to_flat_message, get_first_index_that_differs, @@ -406,6 +408,39 @@ def test_get_formatted_message_log_qwen( assert actual_text == expected_text +def test_formatted_message_log_empty_message(): + message_logs = [ + [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": ""}, + ], + [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hello!"}, + ], + ] + tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct") + tokenizer.chat_template = COMMON_CHAT_TEMPLATES.passthrough_prompt_response + task_data_spec = TaskDataSpec(task_name="test") + result = [ + get_formatted_message_log( + message_log, + tokenizer, + task_data_spec, + add_bos_token=False, + add_eos_token=False, + ) + for message_log in message_logs + ] + flat_result = [message_log_to_flat_messages(m) for m in result] + for k in flat_result[0].keys(): + if isinstance(flat_result[0][k], torch.Tensor): + # make sure validate_tensor_consistency does not raise an error when one of the messages is empty + _validate_tensor_consistency( + [flat_result[i][k] for i in range(len(flat_result))] + ) + + def test_add_loss_mask_to_chat_message_log( tokenized_chat_message_log: LLMMessageLogType, ): diff --git a/tests/unit/experience/test_rollouts.py b/tests/unit/experience/test_rollouts.py index b45811d4f8..bcfa1b84d2 100644 --- a/tests/unit/experience/test_rollouts.py +++ b/tests/unit/experience/test_rollouts.py @@ -20,6 +20,7 @@ import torch from transformers import AutoTokenizer +from nemo_rl.data.llm_message_utils import batched_message_log_to_flat_message from nemo_rl.distributed.batched_data_dict import BatchedDataDict from nemo_rl.distributed.virtual_cluster import RayVirtualCluster from nemo_rl.environments.games.sliding_puzzle import ( @@ -440,6 +441,45 @@ def test_run_multi_step_calculator_vllm(multi_step_setup_vllm): print("\nMulti-Step Calculator VLLM Test assertions passed.") +@pytest.mark.skipif( + not torch.cuda.is_available() or torch.cuda.device_count() < 1, + reason="VLLM test requires at least 1 GPU", +) +def test_max_seqlen_respected(multi_step_setup_vllm): + """Tests multi-step calculator rollout with VllmGeneration.""" + vllm_generation, rollout_tokenizer, task_to_env, initial_batch, rollout_cluster = ( + multi_step_setup_vllm + ) + max_rollout_turns = initial_batch["extra_env_info"][0]["max_steps"] + 1 + max_seq_len = 290 + + print("\nRunning multi-step calculator rollout (VLLM)...") + vllm_generation.prepare_for_generation() + final_batch, rollout_metrics = run_multi_turn_rollout( + policy_generation=vllm_generation, + input_batch=initial_batch, + tokenizer=rollout_tokenizer, + task_to_env=task_to_env, + max_seq_len=max_seq_len, + max_rollout_turns=max_rollout_turns, + ) + vllm_generation.finish_generation() + print("Multi-step calculator rollout complete (VLLM).") + + # --- Assertions --- + assert isinstance(final_batch, BatchedDataDict) + assert "message_log" in final_batch + assert "total_reward" in final_batch + assert len(final_batch["message_log"]) == len(initial_batch["message_log"]) + flattened_message_log, _ = batched_message_log_to_flat_message( + final_batch["message_log"] + ) + # Check that the sequence length is respected by flattening the message log and checking the length + assert len(flattened_message_log["token_ids"][0]) == max_seq_len, ( + f"Sequence length 
{len(flattened_message_log['token_ids'][0])} is not equal to max_seq_len {max_seq_len}" + ) + + # --- Fixture for Sliding Puzzle Environment --- @pytest.fixture(scope="function") def sliding_puzzle_environment(rollout_cluster): diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py index 552ea3dae2..08f34defe2 100644 --- a/tests/unit/models/generation/test_vllm_generation.py +++ b/tests/unit/models/generation/test_vllm_generation.py @@ -36,7 +36,7 @@ }, "dtype": "bfloat16", "max_new_tokens": 10, - "temperature": 1.0, + "temperature": 0.8, "top_p": 1.0, "top_k": None, "stop_token_ids": None, @@ -85,6 +85,9 @@ def get_basic_hf_test_config(enable_dtensor: bool = False) -> PolicyConfig: }, "max_grad_norm": 1.0, "make_sequence_length_divisible_by": 1, + "generation": { + "temperature": 0.8, + }, } diff --git a/tests/unit/models/policy/test_dtensor_worker.py b/tests/unit/models/policy/test_dtensor_worker.py index 7f175b3f15..8ff416059e 100644 --- a/tests/unit/models/policy/test_dtensor_worker.py +++ b/tests/unit/models/policy/test_dtensor_worker.py @@ -294,6 +294,10 @@ def training_setup(request, two_gpu_virtual_cluster): (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, True, False, True), (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, False, True, True), (TEST_ASSETS.TINY_QWEN2_MODEL_PATH, 1, True, True, True), + (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, True, True, False), + (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, True, False, True), + (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, False, True, True), + (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 1, True, True, True), ], indirect=True, ) @@ -421,6 +425,8 @@ def logprob_setup(request, two_gpu_virtual_cluster): (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 2, False, False, False), (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 2, False, True, False), (TEST_ASSETS.TINY_LLAMA_MODEL_PATH, 2, False, True, True), + (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 2, False, True, False), + (TEST_ASSETS.TINY_QWEN3_MODEL_PATH, 2, False, False, False), ], indirect=True, ) diff --git a/tests/unit/test_recipes_and_test_suites.py b/tests/unit/test_recipes_and_test_suites.py index 9bac39188e..47d1d2f45b 100644 --- a/tests/unit/test_recipes_and_test_suites.py +++ b/tests/unit/test_recipes_and_test_suites.py @@ -19,6 +19,8 @@ dir_path = os.path.dirname(os.path.abspath(__file__)) project_root = os.path.abspath(os.path.join(dir_path, "..", "..")) +configs_dir = os.path.join(project_root, "examples", "configs") +recipes_dir = os.path.join(project_root, "examples", "configs", "recipes") test_suites_dir = os.path.join(project_root, "tests", "test_suites") nightly_test_suite_path = os.path.join(test_suites_dir, "nightly.txt") @@ -30,6 +32,13 @@ test_suites_dir, "release_performance.txt" ) +# Relative to project root +ALGO_MAPPING_TO_BASE_YAML = { + "sft": "examples/configs/sft.yaml", + "dpo": "examples/configs/dpo.yaml", + "grpo": "examples/configs/grpo_math_1B.yaml", +} + @pytest.fixture def nightly_test_suite(): @@ -90,6 +99,16 @@ def all_test_suites( ) +@pytest.fixture +def all_recipe_yaml_rel_paths(): + all_recipes = [] + for recipe_path in glob.glob( + os.path.join(recipes_dir, "**", "*.yaml"), recursive=True + ): + all_recipes.append(recipe_path[len(recipes_dir) + 1 :]) + return all_recipes + + @pytest.mark.parametrize( "test_suite_path", [ @@ -112,12 +131,14 @@ def test_test_suites_exist(test_suite_path): def test_no_overlap_across_test_suites(all_test_suites): - recipes = set(all_test_suites) - assert len(recipes) == len(all_test_suites), f"Test suites have 
repeats {recipes}" + all_tests = set(all_test_suites) + assert len(all_tests) == len(all_test_suites), ( + f"Test suites have repeats {all_tests}" + ) -def test_all_recipes_accounted_for_in_test_suites(all_test_suites): - all_recipes_in_test_suites = set(all_test_suites) +def test_all_test_scripts_accounted_for_in_test_suites(all_test_suites): + all_test_scripts_in_test_suites = set(all_test_suites) all_tests_in_test_suites_dir = set() for recipe_path in glob.glob( @@ -127,8 +148,37 @@ def test_all_recipes_accounted_for_in_test_suites(all_test_suites): recipe_name = recipe_path[len(project_root) + 1 :] all_tests_in_test_suites_dir.add(recipe_name) - assert all_recipes_in_test_suites == all_tests_in_test_suites_dir, ( - "All recipes are not accounted for in the test suites" + assert all_test_scripts_in_test_suites == all_tests_in_test_suites_dir, ( + "All test scripts are not accounted for in the test suites" + ) + + +def test_all_recipe_yamls_accounted_for_in_test_suites( + all_recipe_yaml_rel_paths, all_test_suites +): + """This test along with test_all_test_scripts_accounted_for_in_test_suites() ensures that all recipe yaml/test scripts/test_suite(txts) are in sync.""" + assert len(set(all_recipe_yaml_rel_paths)) == len(set(all_test_suites)), ( + "Recipe YAMLs should be accounted for in the test suites" + ) + + all_test_script_paths_in_test_suites = set() + for test_script in all_test_suites: + # Each test suite is relative from project root + test_script_rel_to_test_suites_dir = test_script[ + len(os.path.join("tests", "test_suites")) + 1 : + ] + all_test_script_paths_in_test_suites.add(test_script_rel_to_test_suites_dir) + + # Since we're comparing yaml to sh, chop off the .sh/.yaml extensions for comparison + all_test_script_paths_in_test_suites = { + os.path.splitext(path)[0] for path in all_test_script_paths_in_test_suites + } + all_recipe_yaml_rel_paths = { + os.path.splitext(path)[0] for path in all_recipe_yaml_rel_paths + } + + assert all_test_script_paths_in_test_suites == set(all_recipe_yaml_rel_paths), ( + "All recipe YAMLs are not accounted for in the test suites" ) @@ -215,3 +265,37 @@ def test_all_tests_can_find_config_if_dryrun(all_test_suites): assert result.returncode == 0, ( f"Command failed with exit code {result.returncode}" ) + + +def test_all_recipes_start_with_algo_hyphen(all_recipe_yaml_rel_paths): + expected_algos = set(ALGO_MAPPING_TO_BASE_YAML.keys()) + for recipe_yaml in all_recipe_yaml_rel_paths: + basename = os.path.basename(recipe_yaml) + algo = basename.split("-")[0] + assert algo in expected_algos, ( + f"Recipe {recipe_yaml} has unexpected algo {algo}" + ) + + +@pytest.mark.parametrize("algo, algo_base_yaml", ALGO_MAPPING_TO_BASE_YAML.items()) +def test_all_recipes_can_merge_configs_with_base_config( + all_recipe_yaml_rel_paths, all_test_suites, algo, algo_base_yaml +): + from omegaconf import OmegaConf + + base_yaml = os.path.join(project_root, algo_base_yaml) + base_config = OmegaConf.load(base_yaml) + # Would result in an error if we couldn't merge our config with the recipe's config + OmegaConf.set_struct(base_config, True) + for recipe_yaml in all_recipe_yaml_rel_paths: + if not os.path.basename(recipe_yaml).startswith(algo): + # Skipping here b/c we test that all recipes start with the algo-hyphen in + # test_all_recipes_start_with_algo_hyphen() + continue + recipe_yaml_path = os.path.join(recipes_dir, recipe_yaml) + recipe_config = OmegaConf.load(recipe_yaml_path) + OmegaConf.set_struct(recipe_config, True) + # This will raise a error if the config 
can't be merged + print(f"Merging {recipe_yaml} with {base_yaml}") + merged_config = OmegaConf.merge(base_config, recipe_config) + print(merged_config) diff --git a/tests/unit/utils/test_checkpoint.py b/tests/unit/utils/test_checkpoint.py index 2a912e94b2..c5a90c7932 100644 --- a/tests/unit/utils/test_checkpoint.py +++ b/tests/unit/utils/test_checkpoint.py @@ -17,6 +17,7 @@ import numpy as np import pytest import torch +import yaml from nemo_rl.utils.checkpoint import CheckpointManager @@ -62,8 +63,8 @@ def test_init_tmp_checkpoint(checkpoint_manager, checkpoint_dir): assert isinstance(saved_metadata["numpy"], (int, float)) # Check if config was saved - with open(save_dir / "config.json", "r") as f: - saved_config = json.load(f) + with open(save_dir / "config.yaml", "r") as f: + saved_config = yaml.safe_load(f) assert saved_config == run_config diff --git a/tests/unit/utils/test_native_checkpoint.py b/tests/unit/utils/test_native_checkpoint.py index 7cebeade90..f751c5c47e 100755 --- a/tests/unit/utils/test_native_checkpoint.py +++ b/tests/unit/utils/test_native_checkpoint.py @@ -61,6 +61,9 @@ "tensor_parallel_size": 1, }, "max_grad_norm": 1.0, + "generation": { + "temperature": 1.0, + }, } @@ -283,77 +286,6 @@ def test_save_and_load_model_and_optimizer(mock_experiment): check_dict_equality(new_optimizer.state_dict(), optimizer.state_dict()) -@pytest.mark.parametrize("num_gpus", [1, 2], ids=["1gpu", "2gpu"]) -def test_save_and_load_hf_checkpoint(policy, num_gpus): - ## warm up with a forward pass - ## this is needed before saving a checkpoint because FSDP does some lazy initialization - input_ids = torch.randint(0, 16000, (4, 128)) # 4 sequences, each of length 128 - attention_mask = torch.ones(4, 128) - input_lengths = attention_mask.sum(dim=1).to(torch.int32) - dummy_fwd_dict = BatchedDataDict( - { - "input_ids": input_ids, - "input_lengths": input_lengths, - "attention_mask": attention_mask, - "labels": torch.randint(0, 16000, (4, 128)), - } - ) - policy.get_logprobs(dummy_fwd_dict) - - with TemporaryDirectory() as tmp_dir: - policy.save_checkpoint( - os.path.join(tmp_dir, "test_hf_and_dcp"), - save_hf=True, - save_torch_dist=True, - tokenizer_path=os.path.join(tmp_dir, "test_hf_and_dcp_tokenizer"), - ) - - ## make sure we save both HF and DCP checkpoints - # Dynamically create the expected set of distcp files based on num_gpus - expected_distcp_files = {f"__{rank}_0.distcp" for rank in range(num_gpus)} - expected_files = expected_distcp_files.union({".metadata"}) - - assert ( - set(os.listdir(os.path.join(tmp_dir, "test_hf_and_dcp"))) == expected_files - ) - assert set(os.listdir(os.path.join(tmp_dir, "test_hf_and_dcp_tokenizer"))) == { - "tokenizer_config.json", - "tokenizer.json", - "special_tokens_map.json", - } - - converted_model = AutoModelForCausalLM.from_pretrained( - os.path.join(tmp_dir, "test_hf_and_dcp-hf") - ) - - hf_save_dir = os.path.join(tmp_dir, "test_hf_and_dcp-hf") - hf_files = set(os.listdir(hf_save_dir)) - - # Check the HF saved files structure: could be single or sharded - expected_common_hf_files = {"config.json", "generation_config.json"} - if "model.safetensors" in hf_files: - # Single file format (1 GPU or smaller model) - expected_hf_files = expected_common_hf_files.union({"model.safetensors"}) - else: - # Sharded format (>=2 GPUs or larger model) - expected_hf_files = expected_common_hf_files.union( - { - "model-00001-of-00002.safetensors", - "model-00002-of-00002.safetensors", - "model.safetensors.index.json", - } - ) - assert hf_files == expected_hf_files - 
- coverted_model = AutoModelForCausalLM.from_pretrained(hf_save_dir) - original_model = AutoModelForCausalLM.from_pretrained( - simple_policy_config["model_name"] - ) - - ## make sure converted model matches the original - check_dict_equality(converted_model.state_dict(), original_model.state_dict()) - - @pytest.mark.parametrize("num_gpus", [1, 2], ids=["1gpu", "2gpu"]) def test_convert_dcp_to_hf(policy, num_gpus): ## warm up with a forward pass @@ -374,8 +306,6 @@ def test_convert_dcp_to_hf(policy, num_gpus): with TemporaryDirectory() as tmp_dir: policy.save_checkpoint( os.path.join(tmp_dir, "test_hf_and_dcp"), - save_hf=True, - save_torch_dist=True, ) # Dynamically create the expected set of distcp files based on num_gpus @@ -387,25 +317,6 @@ def test_convert_dcp_to_hf(policy, num_gpus): set(os.listdir(os.path.join(tmp_dir, "test_hf_and_dcp"))) == expected_files ) - # Check the HF saved files structure: could be single or sharded - hf_save_dir = os.path.join(tmp_dir, "test_hf_and_dcp-hf") - hf_files = set(os.listdir(hf_save_dir)) - expected_common_hf_files = {"config.json", "generation_config.json"} - - if "model.safetensors" in hf_files: - # Single file format (1 GPU or smaller model) - expected_hf_files = expected_common_hf_files.union({"model.safetensors"}) - else: - # Sharded format (>=2 GPUs or larger model) - expected_hf_files = expected_common_hf_files.union( - { - "model-00001-of-00002.safetensors", - "model-00002-of-00002.safetensors", - "model.safetensors.index.json", - } - ) - assert hf_files == expected_hf_files - offline_converted_model_path = convert_dcp_to_hf( os.path.join(tmp_dir, "test_hf_and_dcp"), os.path.join(tmp_dir, "test_hf_and_dcp-hf-offline"), @@ -423,18 +334,11 @@ def test_convert_dcp_to_hf(policy, num_gpus): offline_converted_model_path ) - online_converted_model = AutoModelForCausalLM.from_pretrained( - os.path.join(tmp_dir, "test_hf_and_dcp-hf") - ) original_model = AutoModelForCausalLM.from_pretrained( simple_policy_config["model_name"] ) - ## make sure both conversions results in the same state dict - check_dict_equality( - online_converted_model.state_dict(), offline_converted_model.state_dict() - ) - # Ensure the offline one is different from the original + # Ensure the offline checkpoint is different from the original assert_recursive_dict_different( offline_converted_model.state_dict(), original_model.state_dict() ) From e882334b5c68498e302b4f5a0992d23bc3253396 Mon Sep 17 00:00:00 2001 From: KiddoZhu Date: Wed, 7 May 2025 21:07:26 -0700 Subject: [PATCH 3/7] rewrite code & tool use as environments Signed-off-by: KiddoZhu --- nemo_rl/environments/code_environment.py | 252 +++++++++++++++++++ nemo_rl/environments/tools/retriever.py | 107 ++++++++ nemo_rl/experience/rollouts.py | 2 + nemo_rl/tools/generation.py | 2 +- tests/unit/experience/test_code.py | 298 +++++++++++++++++++++++ tests/unit/experience/test_retriever.py | 159 ++++++++++++ 6 files changed, 819 insertions(+), 1 deletion(-) create mode 100644 nemo_rl/environments/code_environment.py create mode 100644 nemo_rl/environments/tools/retriever.py create mode 100644 tests/unit/experience/test_code.py create mode 100644 tests/unit/experience/test_retriever.py diff --git a/nemo_rl/environments/code_environment.py b/nemo_rl/environments/code_environment.py new file mode 100644 index 0000000000..325733fa37 --- /dev/null +++ b/nemo_rl/environments/code_environment.py @@ -0,0 +1,252 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import ast +import builtins +import os +import re +from io import IOBase +from types import ModuleType +from copy import copy +from collections.abc import Mapping, Sequence, Set +from tempfile import TemporaryDirectory +from contextlib import contextmanager +from typing import Any, Dict, List, Optional, Tuple, TypedDict + +import ray +import torch +from pprint import pformat + +from nemo_rl.data.interfaces import LLMMessageLogType +from nemo_rl.distributed.batched_data_dict import BatchedDataDict +from nemo_rl.distributed.virtual_cluster import PY_EXECUTABLES +from nemo_rl.environments.utils import chunk_list_to_workers +from nemo_rl.environments.interfaces import EnvironmentInterface, EnvironmentReturn +from nemo_rl.tools.interfaces import ToolInterface + + +class CodeEnvConfig(TypedDict): + num_workers: int + # whether to terminate the execution after expression evaluation + # if you want to execute multiple rounds of code, set this to False + # and wrap CodeEnvironment in another environment that terminates the generation + terminate_on_evaluation: bool + + +class CodeEnvMetadata(TypedDict): + context: Dict[str, Any] # Hold functions and variables defined in the code + working_dir: str # Working directory for file operations + + +@ray.remote +class CodeExecutionWorker: + DEFAULT_PY_EXECUTABLE = PY_EXECUTABLES.SYSTEM + """Helper class to process individual code execution steps.""" + + def __init__(self): + # Create sandbox with safe builtins + builtin_dict = {k: getattr(builtins, k) for k in dir(builtins)} + builtin_dict["open"] = self.safe_open + builtin_dict["__import__"] = self.safe_import + self.sandbox = {"__builtins__": builtin_dict} + + def sanitize(self, obj: Any) -> Any: + # TODO: better handling of unpicklable objects: custom __getstate__ and __setstate__ + # recursively remove all file objects as they are not picklable by ray + if isinstance(obj, (IOBase, ModuleType)): + # replace unpickable objects with a string representation + return repr(obj) + if isinstance(obj, Mapping): + return obj.__class__({self.sanitize(k): self.sanitize(v) for k, v in obj.items()}) + if isinstance(obj, Sequence) and not isinstance(obj, str): + return obj.__class__(self.sanitize(v) for v in obj) + if hasattr(obj, "__dict__"): + new_obj = copy(obj) + new_obj.__dict__ = {self.sanitize(k): self.sanitize(v) for k, v in obj.__dict__.items()} + return new_obj + return obj + + def format_result(self, result: Any, code: Optional[str] = None, lookahead: Optional[str] = None) -> str: + if result is None: + # no return value + return "" + result = pformat(result) + multiline = (code and "\n" in code) or "\n" in result + if multiline: + # multi-line format + result = f"\n\n\n{result}\n" + else: + # inline format + result = f"{result}" + if lookahead: + if result.startswith(lookahead): + # The generation may look like "\n" if ">\n" is a single token. + # We trim \n from the result if the model has already generated it. 
+ result = result[len(lookahead):] + return result + + def execute(self, message_batch: str, metadata_batch: List[CodeEnvMetadata]) -> str: + """Execute code in a sandboxed environment.""" + results = [] + terminateds = [] + + for message, metadata in zip(message_batch, metadata_batch): + match = re.search(rf"(.*)(.*)", message, re.DOTALL) + if not match: + results.append("") + terminateds.append(False) + continue + + code, lookahead = match.groups() + tree = ast.parse(code) + + if tree.body and isinstance(tree.body[-1], ast.Expr): + # Interactive mode + exec_code = ast.unparse(tree.body[:-1]) + eval_code = ast.unparse(tree.body[-1]) + else: + # Silent mode + exec_code = code + eval_code = None + + result = None + terminated = False + with self.chdir(metadata["working_dir"]): + try: + # isolate the code in a sandbox + # capture local variables in metadata["context"] + exec(exec_code, self.sandbox, metadata["context"]) + if eval_code: + result = eval(eval_code, self.sandbox, metadata["context"]) + terminated = True + except Exception as err: + result = err + + result = self.format_result(result, code, lookahead) + results.append(result) + terminateds.append(terminated) + + observations = [{"role": "environment", "content": result} for result in results] + metadata_batch = self.sanitize(metadata_batch) + + return observations, terminateds, metadata_batch + + @contextmanager + def chdir(self, dir: str): + """Change to temporary directory for file operations.""" + current_dir = os.getcwd() + os.chdir(dir) + try: + yield + finally: + os.chdir(current_dir) + + def safe_open(self, file: str, *args, **kwargs): + """Safe version of open() that only allows access to temporary directory.""" + real_file = os.path.realpath(file) + working_dir = os.path.realpath(os.getcwd()) + if os.path.commonpath([real_file, working_dir]) != working_dir: + raise PermissionError("Access beyond the temporary working directory is blocked") + return open(file, *args, **kwargs) + + def safe_import(self, name: str, *args, **kwargs): + """Safe version of import that blocks risky modules.""" + risky_modules = { + "os", "shutil", # erase filesystem + "sys", "signal", # exit the current program + "socket", # network communication + "subprocess", "threading", "multiprocessing", # spawn threads or processes + "builtins", "importlib", # bypass current blockers + } + if name in risky_modules: + raise PermissionError("Importing system and network modules is blocked") + return builtins.__import__(name, *args, **kwargs) + + +@ray.remote +class CodeEnvironment(EnvironmentInterface): + DEFAULT_PY_EXECUTABLE = PY_EXECUTABLES.SYSTEM + """Code execution environment that maintains state between steps.""" + + def __init__(self, cfg: CodeEnvConfig): + self.cfg = cfg + self.num_workers = cfg["num_workers"] + self.terminate_on_evaluation = cfg["terminate_on_evaluation"] + self.workers = [ + CodeExecutionWorker.options( + runtime_env={"py_executable": CodeExecutionWorker.DEFAULT_PY_EXECUTABLE} + ).remote() + for _ in range(self.num_workers) + ] + + def step( + self, + message_log_batch: List[LLMMessageLogType], + metadata_batch: List[CodeEnvMetadata], + ) -> EnvironmentReturn: + """Process a batch of code execution steps.""" + message_batch = [ml[-1]["content"] for ml in message_log_batch] + chunked_message_batch = chunk_list_to_workers( + message_batch, self.num_workers + ) + chunked_metadata_batch = chunk_list_to_workers( + metadata_batch, self.num_workers + ) + + # Process each chunk in parallel + futures = [ + 
self.workers[i].execute.remote(message_chunk, metadata_chunk) + for i, (message_chunk, metadata_chunk) in enumerate( + zip(chunked_message_batch, chunked_metadata_batch) + ) + ] + + results = ray.get(futures) + + # Unpack results + observations = [] + terminateds = [] + new_metadata_batch = [] + + for obs, term, meta in results: + observations += obs + terminateds += term + new_metadata_batch += meta + + if self.terminate_on_evaluation: + terminated_tensor = torch.tensor(terminateds, dtype=torch.bool) + else: + terminated_tensor = torch.zeros(len(terminateds), dtype=torch.bool) + rewards_tensor = torch.zeros_like(terminated_tensor, dtype=torch.float32) + + next_stop_strings = [[""]] * len(message_log_batch) + + return EnvironmentReturn( + observations=observations, + metadata=new_metadata_batch, + next_stop_strings=next_stop_strings, + rewards=rewards_tensor, + terminateds=terminated_tensor, + ) + + def shutdown(self): + # shutdown all workers + for worker in self.workers: + ray.kill(worker) + + def global_post_process_and_metrics( + self, batch: BatchedDataDict + ) -> Tuple[BatchedDataDict, dict]: + """Compute metrics for the batch.""" + # No specific metrics for code execution + return batch, {} \ No newline at end of file diff --git a/nemo_rl/environments/tools/retriever.py b/nemo_rl/environments/tools/retriever.py new file mode 100644 index 0000000000..40da06f092 --- /dev/null +++ b/nemo_rl/environments/tools/retriever.py @@ -0,0 +1,107 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
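For reference, the CodeEnvironment above is driven batch-wise: step() reads only the last message of every log, hands each chunk to a worker, and carries per-sample state in the metadata's context dict, so variables defined in one turn are visible in the next. A minimal sketch of that flow; the <code>...</code> delimiter is an assumption for illustration (the actual tag string is not visible in this patch text), while the metadata shape and attribute access match the unit tests below.

    from tempfile import TemporaryDirectory

    import ray

    from nemo_rl.environments.code_environment import CodeEnvironment, CodeEnvMetadata

    env = CodeEnvironment.remote({"num_workers": 1, "terminate_on_evaluation": True})

    scratch = TemporaryDirectory()  # keep the handle alive for the whole episode
    metadata = [CodeEnvMetadata(context={}, working_dir=scratch.name)]

    # Turn 1: an assignment only -> "silent mode"; nothing is reported,
    # but `x` is stored in the returned metadata's context.
    turn1 = [[{"role": "assistant", "content": "<code>x = 21</code>"}]]  # tag assumed
    out1 = ray.get(env.step.remote(turn1, metadata))

    # Turn 2: a trailing expression -> "interactive mode"; it is evaluated
    # against the carried-over context and returned as the observation.
    turn2 = [[{"role": "assistant", "content": "<code>x * 2</code>"}]]
    out2 = ray.get(env.step.remote(turn2, out1.metadata))
    print(out2.observations[0]["content"])  # expected to contain 42

    ray.kill(env)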
+from typing import Any, Dict, List, TypedDict + +import re +import ray +import torch +from datasets import load_dataset + +from nemo_rl.data.interfaces import LLMMessageLogType +from nemo_rl.distributed.batched_data_dict import BatchedDataDict +from nemo_rl.environments.interfaces import EnvironmentInterface, EnvironmentReturn +from nemo_rl.tools.tools import BM25Retriever + + +class RAGEnvConfig(TypedDict): + dataset_name: str # Name of the dataset to load + dataset_split: str # Split of the dataset to use + text_column: str # Column name containing the text to retrieve + num_results: int # Number of documents to retrieve + k1: float # BM25 parameter + b: float # BM25 parameter + device: str # Device to compute BM25 + + +@ray.remote +class RAGEnvironment(EnvironmentInterface): + """RAG environment that uses BM25 for document retrieval.""" + + def __init__(self, cfg: RAGEnvConfig): + self.cfg = cfg + + # Load dataset + dataset = load_dataset(cfg["dataset_name"], split=cfg["dataset_split"]) + documents = [sample[cfg["text_column"]] for sample in dataset] + + # Initialize BM25 retriever + self.retriever = BM25Retriever( + documents=documents, + num_result=cfg["num_results"], + k1=cfg["k1"], + b=cfg["b"], + device=cfg["device"], + ) + + def format_result(self, retrieved_docs: List[str]) -> str: + result = "\n" + for i, doc in enumerate(retrieved_docs): + result += f"<{i+1}>\n{doc}\n\n" + result += "\n" + return result + + def step( + self, + message_log_batch: List[LLMMessageLogType], + metadata_batch: List[Dict[str, Any]], + ) -> EnvironmentReturn: + """Process a batch of retrieval steps.""" + # Extract queries from the last message in each log + messages = [ml[-1]["content"] for ml in message_log_batch] + + # Retrieve documents for each query + results = [] + for message in messages: + match = re.search(rf"(.*)", message, re.DOTALL) + if not match: + results.append({"role": "environment", "content": "No retrieval query found!"}) + continue + query = match.group(1) + retrieved_docs = self.retriever(query) + result = self.format_result(retrieved_docs) + results.append({"role": "environment", "content": result}) + + batch_size = len(message_log_batch) + rewards_tensor = torch.zeros(batch_size, dtype=torch.float32) + terminated_tensor = torch.ones(batch_size, dtype=torch.bool) + next_stop_strings = [[""]] * batch_size + + return EnvironmentReturn( + observations=results, + metadata=metadata_batch, + next_stop_strings=next_stop_strings, + rewards=rewards_tensor, + terminateds=terminated_tensor, + ) + + def shutdown(self): + """Clean up resources.""" + pass + + def global_post_process_and_metrics( + self, batch: BatchedDataDict + ) -> tuple[BatchedDataDict, dict]: + """Compute metrics for the batch.""" + # No specific metrics for RAG + return batch, {} diff --git a/nemo_rl/experience/rollouts.py b/nemo_rl/experience/rollouts.py index 567add0dfc..5304e44cb9 100644 --- a/nemo_rl/experience/rollouts.py +++ b/nemo_rl/experience/rollouts.py @@ -304,6 +304,8 @@ def run_multi_turn_rollout( tokenized_obs = tokenizer( env_obs_content, return_tensors="pt", add_special_tokens=False )["input_ids"][0] + # tokenizer returns torch.float32 when env_obs_content is empty + tokenized_obs = tokenized_obs.to(dtype=torch.int64) # check if new message overflows max_seq_len if ( diff --git a/nemo_rl/tools/generation.py b/nemo_rl/tools/generation.py index f50dbfe3ae..06f2f966b3 100644 --- a/nemo_rl/tools/generation.py +++ b/nemo_rl/tools/generation.py @@ -150,7 +150,7 @@ def generate_with_code_and_tools( if result is 
None: # no return value result = "" - new_results.append(result) + new_results.extend(result) continue result = pformat(result) if "\n" in expr or "\n" in result: diff --git a/tests/unit/experience/test_code.py b/tests/unit/experience/test_code.py new file mode 100644 index 0000000000..1fb37464af --- /dev/null +++ b/tests/unit/experience/test_code.py @@ -0,0 +1,298 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest +import ray +import torch +from tempfile import TemporaryDirectory +from typing import List, Dict, Any +from transformers import AutoTokenizer + +from nemo_rl.data.interfaces import LLMMessageLogType +from nemo_rl.distributed.batched_data_dict import BatchedDataDict +from nemo_rl.distributed.virtual_cluster import RayVirtualCluster +from nemo_rl.environments.code_environment import CodeEnvironment, CodeEnvConfig, CodeEnvMetadata +from nemo_rl.experience.rollouts import run_multi_turn_rollout +from nemo_rl.models.generation.interfaces import configure_generation_config +from nemo_rl.models.generation.vllm import VllmConfig, VllmGeneration +from nemo_rl.models.policy.hf_policy import HfPolicy, PolicyConfig + +MODEL_NAME = "meta-llama/Llama-3.2-1B" + +cfg: CodeEnvConfig = { + "num_workers": 2, + "terminate_on_evaluation": True, +} + +# Define basic vLLM test config +basic_vllm_test_config: VllmConfig = { + "backend": "vllm", + "model_name": MODEL_NAME, + "tokenizer_name": None, + "dtype": "bfloat16", + "max_new_tokens": 100, + "temperature": 1.0, + "top_p": 1.0, + "top_k": None, + "stop_token_ids": None, + "stop_strings": None, + "vllm_cfg": { + "tensor_parallel_size": 1, + "gpu_memory_utilization": 0.3, + "max_model_len": 1024, + }, +} + +basic_hf_test_config: PolicyConfig = { + "model_name": MODEL_NAME, + "tokenizer_name": None, + "generation_batch_size": 1, + "generation": { + "backend": "hf", + "max_new_tokens": 100, + "temperature": 1.0, + "top_p": 1.0, + "top_k": None, + "stop_token_ids": None, + "stop_strings": None, + }, + # Required training parameters + "train_global_batch_size": 1, + "train_micro_batch_size": 1, + "learning_rate": 5e-6, + "logprob_batch_size": 1, + "max_new_tokens": 16, + "do_sample": False, + "precision": "float32", + "activation_checkpointing_enabled": False, + "fsdp_offload_enabled": False, + "optimizer": { + "name": "torch.optim.AdamW", + "kwargs": { + "lr": 5e-6, + "weight_decay": 0.01, + "betas": [0.9, 0.999], + "eps": 1e-8, + }, + }, + "dtensor_cfg": {"enabled": False}, +} + + +@pytest.fixture(scope="function") +def code_env(): + """Create a code environment for testing.""" + try: + env_actor = CodeEnvironment.remote(cfg) + yield env_actor + finally: + if env_actor: + ray.kill(env_actor) + + +@pytest.fixture(scope="function") +def tokenizer(): + """Loads the tokenizer for the tests.""" + print(f"Loading tokenizer: {MODEL_NAME}") + tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True) + if tokenizer.pad_token is None: + tokenizer.pad_token = 
tokenizer.eos_token + print( + f"Tokenizer loaded. Pad token: {tokenizer.pad_token} (ID: {tokenizer.pad_token_id}), EOS token: {tokenizer.eos_token} (ID: {tokenizer.eos_token_id})" + ) + return tokenizer + + +@pytest.fixture(scope="function") +def cluster(): + """Create a virtual cluster for testing.""" + cluster_instance = None + cluster_name = f"test-code-cluster-{id(cluster_instance)}" + print(f"\nCreating virtual cluster '{cluster_name}'...") + try: + cluster_instance = RayVirtualCluster( + name=cluster_name, + bundle_ct_per_node_list=[1], + use_gpus=True, + num_gpus_per_node=1, + max_colocated_worker_groups=2, + ) + yield cluster_instance + finally: + print(f"\nCleaning up cluster '{cluster_name}'...") + if cluster_instance: + cluster_instance.shutdown() + + +def test_untrusted_code(code_env): + """Test whether the code environment can block untrusted code.""" + codes = [ + "with open('allowed_file.txt', 'w') as fout:\n" + " fout.write('some content')\n" + "with open('allowed_file.txt') as fin:\n" + " content = fin.read()\n" + "content", + "with open('/etc/passwd', 'r') as fin:\n" + " fin.read()", + "import math\n" + "round(math.sqrt(8))", + "import os", + ] + results = [ + "\n\n\n'some content'\n", + "\n\n\nPermissionError('Access beyond the temporary working directory is blocked')\n", + "\n\n\n3\n", + "PermissionError('Importing system and network modules is blocked')", + ] + + message_log_batch = [ + [{"role": "user", "content": f"{code}"}] for code in codes + ] + temp_dirs = [TemporaryDirectory() for _ in codes] + metadata_batch = [ + CodeEnvMetadata( + context={}, working_dir=temp_dir.name, + ) for temp_dir in temp_dirs + ] + + # Execute the code + output = ray.get(code_env.step.remote(message_log_batch, metadata_batch)) + responses = [obs["content"] for obs in output.observations] + + assert responses == results, f"Got wrong output {responses}" + + +def test_vllm_execute_code(cluster, tokenizer, code_env): + """Test that vLLM can call the code executor.""" + # Prepare test data + codes = [ + "x = 3; y = 4\nThis is some regular text.\nx + y\n", + "\ndef f(x):\n return x * x\n\nf(2)\n\n", + ] + results = ["7", "\n\n4\n"] + + # Create message logs + message_logs = [] + metadata_batch = [] + temp_dirs = [] + for code in codes: + # Tokenize the message content + prompt = code * 4 + token_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)["input_ids"][0] + temp_dir = TemporaryDirectory() + message_logs.append([{"role": "user", "content": prompt, "token_ids": token_ids}]) + metadata_batch.append(CodeEnvMetadata(context={}, working_dir=temp_dir.name)) + temp_dirs.append(temp_dir) + + # Create initial batch + initial_batch = BatchedDataDict({ + "message_log": message_logs, + "extra_env_info": metadata_batch, + "task_name": ["code_execution"] * len(codes), + "stop_strings": [[""]] * len(codes), + }) + + # Create vLLM generation + vllm_config = basic_vllm_test_config.copy() + vllm_config = configure_generation_config(vllm_config, tokenizer, is_eval=True) + vllm_generation = VllmGeneration(cluster, vllm_config) + + # Create code environment + task_to_env = {"code_execution": code_env} + + # Run rollout + vllm_generation.prepare_for_generation() + final_batch, _ = run_multi_turn_rollout( + policy_generation=vllm_generation, + input_batch=initial_batch, + tokenizer=tokenizer, + task_to_env=task_to_env, + max_seq_len=256, + max_rollout_turns=2, + greedy=True, + ) + vllm_generation.finish_generation() + + # Check results + for i, msg_log in 
enumerate(final_batch["message_log"]): + # Get the last message which should contain the result + last_msg = msg_log[-1] + assert last_msg["role"] == "environment" + assert last_msg["content"] == results[i], f"Expected {results[i]}, got {last_msg['content']}" + + +def test_hf_execute_code(cluster, tokenizer, code_env): + """Test that Huggingface models can call the code executor.""" + # Prepare test data + codes = [ + "x = 3; y = 4\nThis is some regular text.\nx + y\n", + "\ndef f(x):\n return x * x\n\nf(2)\n\n", + ] + results = ["7", "\n\n4\n"] + + # Create message logs + message_logs = [] + metadata_batch = [] + temp_dirs = [] + for code in codes: + # Tokenize the message content + prompt = code * 4 + token_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)["input_ids"][0] + temp_dir = TemporaryDirectory() + message_logs.append([{"role": "user", "content": prompt, "token_ids": token_ids}]) + metadata_batch.append(CodeEnvMetadata(context={}, working_dir=temp_dir.name)) + temp_dirs.append(temp_dir) + + # Create initial batch + initial_batch = BatchedDataDict({ + "message_log": message_logs, + "extra_env_info": metadata_batch, + "task_name": ["code_execution"] * len(codes), + "stop_strings": [[""]] * len(codes), + }) + + # Create HF policy + hf_config = basic_hf_test_config.copy() + hf_config["generation"] = configure_generation_config( + hf_config["generation"], + tokenizer, + ) + hf_policy = HfPolicy( + cluster, hf_config, tokenizer, init_reference_model=False, init_optimizer=False + ) + + # Create code environment + task_to_env = {"code_execution": code_env} + + # Run rollout + hf_policy.prepare_for_generation() + final_batch, _ = run_multi_turn_rollout( + policy_generation=hf_policy, + input_batch=initial_batch, + tokenizer=tokenizer, + task_to_env=task_to_env, + max_seq_len=256, + max_rollout_turns=2, + greedy=True, + ) + hf_policy.finish_generation() + + # Check results + for i, msg_log in enumerate(final_batch["message_log"]): + # Get the last message which should contain the result + last_msg = msg_log[-1] + assert last_msg["role"] == "environment" + assert last_msg["content"] == results[i], f"Expected {results[i]}, got {last_msg['content']}" + + diff --git a/tests/unit/experience/test_retriever.py b/tests/unit/experience/test_retriever.py new file mode 100644 index 0000000000..359b7898c5 --- /dev/null +++ b/tests/unit/experience/test_retriever.py @@ -0,0 +1,159 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
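One detail worth calling out in the tests above: each sample gets its own scratch directory, and the TemporaryDirectory objects are deliberately kept in a list. CodeEnvMetadata stores only the path, so if the handle were dropped, its finalizer would delete the directory while the worker still chdir()s into it. A short sketch of the intended pattern (names and the code tag are illustrative assumptions):

    from tempfile import TemporaryDirectory

    prompts = ["<code>1 + 1</code>"]  # code tag assumed, as above

    temp_dirs = [TemporaryDirectory() for _ in prompts]  # keep references alive
    metadata_batch = [
        {"context": {}, "working_dir": d.name} for d in temp_dirs
    ]

    # ... run env.step / run_multi_turn_rollout with metadata_batch ...

    for d in temp_dirs:
        d.cleanup()  # release the scratch directories once the rollout is done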
+ +import pytest +import ray +import torch +from transformers import AutoTokenizer + +from nemo_rl.data.interfaces import LLMMessageLogType +from nemo_rl.distributed.batched_data_dict import BatchedDataDict +from nemo_rl.distributed.virtual_cluster import RayVirtualCluster +from nemo_rl.environments.tools.retriever import RAGEnvironment, RAGEnvConfig +from nemo_rl.experience.rollouts import run_multi_turn_rollout +from nemo_rl.models.generation.interfaces import configure_generation_config +from nemo_rl.models.generation.vllm import VllmConfig, VllmGeneration + +MODEL_NAME = "meta-llama/Llama-3.2-1B" + +cfg: RAGEnvConfig = { + "dataset_name": "rahular/simple-wikipedia", + "dataset_split": "train", + "text_column": "text", + "num_results": 1, + "k1": 1.5, + "b": 0.75, + "device": "cpu", +} + +# Define basic vLLM test config +basic_vllm_test_config: VllmConfig = { + "backend": "vllm", + "model_name": MODEL_NAME, + "tokenizer_name": None, + "dtype": "bfloat16", + "max_new_tokens": 100, + "temperature": 1.0, + "top_p": 1.0, + "top_k": None, + "stop_token_ids": None, + "stop_strings": None, + "vllm_cfg": { + "tensor_parallel_size": 1, + "gpu_memory_utilization": 0.3, + "max_model_len": 1024, + }, +} + + +@pytest.fixture(scope="function") +def rag_env(): + """Create a RAG environment for testing.""" + try: + env_actor = RAGEnvironment.remote(cfg) + yield env_actor + finally: + if env_actor: + ray.kill(env_actor) + + +@pytest.fixture(scope="function") +def tokenizer(): + """Loads the tokenizer for the tests.""" + print(f"Loading tokenizer: {MODEL_NAME}") + tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + print( + f"Tokenizer loaded. Pad token: {tokenizer.pad_token} (ID: {tokenizer.pad_token_id}), EOS token: {tokenizer.eos_token} (ID: {tokenizer.eos_token_id})" + ) + return tokenizer + + +@pytest.fixture(scope="function") +def cluster(): + """Create a virtual cluster for testing.""" + cluster_instance = None + cluster_name = f"test-rag-cluster-{id(cluster_instance)}" + print(f"\nCreating virtual cluster '{cluster_name}'...") + try: + cluster_instance = RayVirtualCluster( + name=cluster_name, + bundle_ct_per_node_list=[1], + use_gpus=True, + num_gpus_per_node=1, + max_colocated_worker_groups=2, + ) + yield cluster_instance + finally: + print(f"\nCleaning up cluster '{cluster_name}'...") + if cluster_instance: + cluster_instance.shutdown() + + +def test_vllm_retrieve(cluster, tokenizer, rag_env): + """Test that vLLM can use the RAG environment for document retrieval.""" + # Prepare test data + queries = [ + "Jen-Hsun Huang\n", + ] + expected_results = [ + "\n<1>\n" + "Nvidia was established in 1993 by Jen-Hsun Huang, Curtis Priem, and Chris Malachowsky. 
In 2000 Nvidia took intellectual possession of 3dfx, one of the biggest GPU producers in 1990s.\n" + "\n\n", + ] + + # Create message logs + message_logs = [] + for query in queries: + # Tokenize the message content + prompt = query * 4 + token_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)["input_ids"][0] + message_logs.append([{"role": "user", "content": prompt, "token_ids": token_ids}]) + + # Create initial batch + initial_batch = BatchedDataDict({ + "message_log": message_logs, + "extra_env_info": [{}] * len(queries), # No metadata needed for RAG + "task_name": ["document_retrieval"] * len(queries), + "stop_strings": [[""]] * len(queries), + }) + + # Create vLLM generation + vllm_config = basic_vllm_test_config.copy() + vllm_config = configure_generation_config(vllm_config, tokenizer, is_eval=True) + vllm_generation = VllmGeneration(cluster, vllm_config) + + # Create RAG environment + task_to_env = {"document_retrieval": rag_env} + + # Run rollout + vllm_generation.prepare_for_generation() + final_batch, _ = run_multi_turn_rollout( + policy_generation=vllm_generation, + input_batch=initial_batch, + tokenizer=tokenizer, + task_to_env=task_to_env, + max_seq_len=256, + max_rollout_turns=1, + greedy=True, + ) + vllm_generation.finish_generation() + + # Check results + for i, msg_log in enumerate(final_batch["message_log"]): + # Get the last message which should contain the result + last_msg = msg_log[-1] + assert last_msg["role"] == "environment" + assert last_msg["content"] == expected_results[i], f"Expected {expected_results[i]}, got {last_msg['content']}" From 487cd94a29946b97d6acbcd9ccfaae1d25673e2c Mon Sep 17 00:00:00 2001 From: KiddoZhu Date: Thu, 8 May 2025 15:50:51 -0700 Subject: [PATCH 4/7] fix lint check Signed-off-by: KiddoZhu --- nemo_rl/environments/code_environment.py | 75 ++++++++++--------- nemo_rl/environments/tools/retriever.py | 24 +++--- nemo_rl/models/policy/fsdp1_policy_worker.py | 2 +- tests/unit/experience/test_code.py | 78 ++++++++++++-------- tests/unit/experience/test_retriever.py | 30 +++++--- 5 files changed, 121 insertions(+), 88 deletions(-) diff --git a/nemo_rl/environments/code_environment.py b/nemo_rl/environments/code_environment.py index 325733fa37..cb72c1532e 100644 --- a/nemo_rl/environments/code_environment.py +++ b/nemo_rl/environments/code_environment.py @@ -15,24 +15,22 @@ import builtins import os import re +from collections.abc import Mapping, Sequence +from contextlib import contextmanager +from copy import copy from io import IOBase +from pprint import pformat from types import ModuleType -from copy import copy -from collections.abc import Mapping, Sequence, Set -from tempfile import TemporaryDirectory -from contextlib import contextmanager from typing import Any, Dict, List, Optional, Tuple, TypedDict import ray import torch -from pprint import pformat from nemo_rl.data.interfaces import LLMMessageLogType from nemo_rl.distributed.batched_data_dict import BatchedDataDict from nemo_rl.distributed.virtual_cluster import PY_EXECUTABLES -from nemo_rl.environments.utils import chunk_list_to_workers from nemo_rl.environments.interfaces import EnvironmentInterface, EnvironmentReturn -from nemo_rl.tools.interfaces import ToolInterface +from nemo_rl.environments.utils import chunk_list_to_workers class CodeEnvConfig(TypedDict): @@ -67,16 +65,22 @@ def sanitize(self, obj: Any) -> Any: # replace unpickable objects with a string representation return repr(obj) if isinstance(obj, Mapping): - return obj.__class__({self.sanitize(k): 
self.sanitize(v) for k, v in obj.items()}) + return obj.__class__( + {self.sanitize(k): self.sanitize(v) for k, v in obj.items()} + ) if isinstance(obj, Sequence) and not isinstance(obj, str): return obj.__class__(self.sanitize(v) for v in obj) if hasattr(obj, "__dict__"): new_obj = copy(obj) - new_obj.__dict__ = {self.sanitize(k): self.sanitize(v) for k, v in obj.__dict__.items()} + new_obj.__dict__ = { + self.sanitize(k): self.sanitize(v) for k, v in obj.__dict__.items() + } return new_obj return obj - def format_result(self, result: Any, code: Optional[str] = None, lookahead: Optional[str] = None) -> str: + def format_result( + self, result: Any, code: Optional[str] = None, lookahead: Optional[str] = None + ) -> str: if result is None: # no return value return "" @@ -92,16 +96,16 @@ def format_result(self, result: Any, code: Optional[str] = None, lookahead: Opti if result.startswith(lookahead): # The generation may look like "\n" if ">\n" is a single token. # We trim \n from the result if the model has already generated it. - result = result[len(lookahead):] + result = result[len(lookahead) :] return result def execute(self, message_batch: str, metadata_batch: List[CodeEnvMetadata]) -> str: """Execute code in a sandboxed environment.""" results = [] terminateds = [] - + for message, metadata in zip(message_batch, metadata_batch): - match = re.search(rf"(.*)(.*)", message, re.DOTALL) + match = re.search(r"(.*)(.*)", message, re.DOTALL) if not match: results.append("") terminateds.append(False) @@ -118,7 +122,7 @@ def execute(self, message_batch: str, metadata_batch: List[CodeEnvMetadata]) -> # Silent mode exec_code = code eval_code = None - + result = None terminated = False with self.chdir(metadata["working_dir"]): @@ -131,16 +135,18 @@ def execute(self, message_batch: str, metadata_batch: List[CodeEnvMetadata]) -> terminated = True except Exception as err: result = err - + result = self.format_result(result, code, lookahead) results.append(result) terminateds.append(terminated) - - observations = [{"role": "environment", "content": result} for result in results] + + observations = [ + {"role": "environment", "content": result} for result in results + ] metadata_batch = self.sanitize(metadata_batch) - + return observations, terminateds, metadata_batch - + @contextmanager def chdir(self, dir: str): """Change to temporary directory for file operations.""" @@ -156,17 +162,24 @@ def safe_open(self, file: str, *args, **kwargs): real_file = os.path.realpath(file) working_dir = os.path.realpath(os.getcwd()) if os.path.commonpath([real_file, working_dir]) != working_dir: - raise PermissionError("Access beyond the temporary working directory is blocked") + raise PermissionError( + "Access beyond the temporary working directory is blocked" + ) return open(file, *args, **kwargs) def safe_import(self, name: str, *args, **kwargs): """Safe version of import that blocks risky modules.""" risky_modules = { - "os", "shutil", # erase filesystem - "sys", "signal", # exit the current program + "os", + "shutil", # erase filesystem + "sys", + "signal", # exit the current program "socket", # network communication - "subprocess", "threading", "multiprocessing", # spawn threads or processes - "builtins", "importlib", # bypass current blockers + "subprocess", + "threading", + "multiprocessing", # spawn threads or processes + "builtins", + "importlib", # bypass current blockers } if name in risky_modules: raise PermissionError("Importing system and network modules is blocked") @@ -196,12 +209,8 @@ def step( ) -> 
EnvironmentReturn: """Process a batch of code execution steps.""" message_batch = [ml[-1]["content"] for ml in message_log_batch] - chunked_message_batch = chunk_list_to_workers( - message_batch, self.num_workers - ) - chunked_metadata_batch = chunk_list_to_workers( - metadata_batch, self.num_workers - ) + chunked_message_batch = chunk_list_to_workers(message_batch, self.num_workers) + chunked_metadata_batch = chunk_list_to_workers(metadata_batch, self.num_workers) # Process each chunk in parallel futures = [ @@ -222,7 +231,7 @@ def step( observations += obs terminateds += term new_metadata_batch += meta - + if self.terminate_on_evaluation: terminated_tensor = torch.tensor(terminateds, dtype=torch.bool) else: @@ -240,7 +249,7 @@ def step( ) def shutdown(self): - # shutdown all workers + # shutdown all workers for worker in self.workers: ray.kill(worker) @@ -249,4 +258,4 @@ def global_post_process_and_metrics( ) -> Tuple[BatchedDataDict, dict]: """Compute metrics for the batch.""" # No specific metrics for code execution - return batch, {} \ No newline at end of file + return batch, {} diff --git a/nemo_rl/environments/tools/retriever.py b/nemo_rl/environments/tools/retriever.py index 40da06f092..655f5a801f 100644 --- a/nemo_rl/environments/tools/retriever.py +++ b/nemo_rl/environments/tools/retriever.py @@ -11,9 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import re from typing import Any, Dict, List, TypedDict -import re import ray import torch from datasets import load_dataset @@ -40,11 +40,11 @@ class RAGEnvironment(EnvironmentInterface): def __init__(self, cfg: RAGEnvConfig): self.cfg = cfg - + # Load dataset dataset = load_dataset(cfg["dataset_name"], split=cfg["dataset_split"]) documents = [sample[cfg["text_column"]] for sample in dataset] - + # Initialize BM25 retriever self.retriever = BM25Retriever( documents=documents, @@ -53,14 +53,14 @@ def __init__(self, cfg: RAGEnvConfig): b=cfg["b"], device=cfg["device"], ) - + def format_result(self, retrieved_docs: List[str]) -> str: result = "\n" for i, doc in enumerate(retrieved_docs): - result += f"<{i+1}>\n{doc}\n\n" + result += f"<{i + 1}>\n{doc}\n\n" result += "\n" return result - + def step( self, message_log_batch: List[LLMMessageLogType], @@ -69,24 +69,26 @@ def step( """Process a batch of retrieval steps.""" # Extract queries from the last message in each log messages = [ml[-1]["content"] for ml in message_log_batch] - + # Retrieve documents for each query results = [] for message in messages: - match = re.search(rf"(.*)", message, re.DOTALL) + match = re.search(r"(.*)", message, re.DOTALL) if not match: - results.append({"role": "environment", "content": "No retrieval query found!"}) + results.append( + {"role": "environment", "content": "No retrieval query found!"} + ) continue query = match.group(1) retrieved_docs = self.retriever(query) result = self.format_result(retrieved_docs) results.append({"role": "environment", "content": result}) - + batch_size = len(message_log_batch) rewards_tensor = torch.zeros(batch_size, dtype=torch.float32) terminated_tensor = torch.ones(batch_size, dtype=torch.bool) next_stop_strings = [[""]] * batch_size - + return EnvironmentReturn( observations=results, metadata=metadata_batch, diff --git a/nemo_rl/models/policy/fsdp1_policy_worker.py b/nemo_rl/models/policy/fsdp1_policy_worker.py index 19523394ad..1d057271a4 100644 --- 
a/nemo_rl/models/policy/fsdp1_policy_worker.py +++ b/nemo_rl/models/policy/fsdp1_policy_worker.py @@ -289,7 +289,7 @@ def train( logits = self.model.lm_head(outputs.last_hidden_state) else: logits = outputs.logits - + # Divide logits by temperature if "generation" in self.cfg and self.cfg["generation"] is not None: logits.div_(self.cfg["generation"]["temperature"]) diff --git a/tests/unit/experience/test_code.py b/tests/unit/experience/test_code.py index 1fb37464af..7e06ffc45a 100644 --- a/tests/unit/experience/test_code.py +++ b/tests/unit/experience/test_code.py @@ -12,17 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. +from tempfile import TemporaryDirectory + import pytest import ray -import torch -from tempfile import TemporaryDirectory -from typing import List, Dict, Any from transformers import AutoTokenizer -from nemo_rl.data.interfaces import LLMMessageLogType from nemo_rl.distributed.batched_data_dict import BatchedDataDict from nemo_rl.distributed.virtual_cluster import RayVirtualCluster -from nemo_rl.environments.code_environment import CodeEnvironment, CodeEnvConfig, CodeEnvMetadata +from nemo_rl.environments.code_environment import ( + CodeEnvConfig, + CodeEnvironment, + CodeEnvMetadata, +) from nemo_rl.experience.rollouts import run_multi_turn_rollout from nemo_rl.models.generation.interfaces import configure_generation_config from nemo_rl.models.generation.vllm import VllmConfig, VllmGeneration @@ -143,10 +145,8 @@ def test_untrusted_code(code_env): "with open('allowed_file.txt') as fin:\n" " content = fin.read()\n" "content", - "with open('/etc/passwd', 'r') as fin:\n" - " fin.read()", - "import math\n" - "round(math.sqrt(8))", + "with open('/etc/passwd', 'r') as fin:\n fin.read()", + "import math\nround(math.sqrt(8))", "import os", ] results = [ @@ -162,8 +162,10 @@ def test_untrusted_code(code_env): temp_dirs = [TemporaryDirectory() for _ in codes] metadata_batch = [ CodeEnvMetadata( - context={}, working_dir=temp_dir.name, - ) for temp_dir in temp_dirs + context={}, + working_dir=temp_dir.name, + ) + for temp_dir in temp_dirs ] # Execute the code @@ -189,19 +191,25 @@ def test_vllm_execute_code(cluster, tokenizer, code_env): for code in codes: # Tokenize the message content prompt = code * 4 - token_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)["input_ids"][0] + token_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)[ + "input_ids" + ][0] temp_dir = TemporaryDirectory() - message_logs.append([{"role": "user", "content": prompt, "token_ids": token_ids}]) + message_logs.append( + [{"role": "user", "content": prompt, "token_ids": token_ids}] + ) metadata_batch.append(CodeEnvMetadata(context={}, working_dir=temp_dir.name)) temp_dirs.append(temp_dir) # Create initial batch - initial_batch = BatchedDataDict({ - "message_log": message_logs, - "extra_env_info": metadata_batch, - "task_name": ["code_execution"] * len(codes), - "stop_strings": [[""]] * len(codes), - }) + initial_batch = BatchedDataDict( + { + "message_log": message_logs, + "extra_env_info": metadata_batch, + "task_name": ["code_execution"] * len(codes), + "stop_strings": [[""]] * len(codes), + } + ) # Create vLLM generation vllm_config = basic_vllm_test_config.copy() @@ -229,7 +237,9 @@ def test_vllm_execute_code(cluster, tokenizer, code_env): # Get the last message which should contain the result last_msg = msg_log[-1] assert last_msg["role"] == "environment" - assert last_msg["content"] == results[i], 
f"Expected {results[i]}, got {last_msg['content']}" + assert last_msg["content"] == results[i], ( + f"Expected {results[i]}, got {last_msg['content']}" + ) def test_hf_execute_code(cluster, tokenizer, code_env): @@ -248,19 +258,25 @@ def test_hf_execute_code(cluster, tokenizer, code_env): for code in codes: # Tokenize the message content prompt = code * 4 - token_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)["input_ids"][0] + token_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)[ + "input_ids" + ][0] temp_dir = TemporaryDirectory() - message_logs.append([{"role": "user", "content": prompt, "token_ids": token_ids}]) + message_logs.append( + [{"role": "user", "content": prompt, "token_ids": token_ids}] + ) metadata_batch.append(CodeEnvMetadata(context={}, working_dir=temp_dir.name)) temp_dirs.append(temp_dir) # Create initial batch - initial_batch = BatchedDataDict({ - "message_log": message_logs, - "extra_env_info": metadata_batch, - "task_name": ["code_execution"] * len(codes), - "stop_strings": [[""]] * len(codes), - }) + initial_batch = BatchedDataDict( + { + "message_log": message_logs, + "extra_env_info": metadata_batch, + "task_name": ["code_execution"] * len(codes), + "stop_strings": [[""]] * len(codes), + } + ) # Create HF policy hf_config = basic_hf_test_config.copy() @@ -293,6 +309,6 @@ def test_hf_execute_code(cluster, tokenizer, code_env): # Get the last message which should contain the result last_msg = msg_log[-1] assert last_msg["role"] == "environment" - assert last_msg["content"] == results[i], f"Expected {results[i]}, got {last_msg['content']}" - - + assert last_msg["content"] == results[i], ( + f"Expected {results[i]}, got {last_msg['content']}" + ) diff --git a/tests/unit/experience/test_retriever.py b/tests/unit/experience/test_retriever.py index 359b7898c5..0c059c1453 100644 --- a/tests/unit/experience/test_retriever.py +++ b/tests/unit/experience/test_retriever.py @@ -14,13 +14,11 @@ import pytest import ray -import torch from transformers import AutoTokenizer -from nemo_rl.data.interfaces import LLMMessageLogType from nemo_rl.distributed.batched_data_dict import BatchedDataDict from nemo_rl.distributed.virtual_cluster import RayVirtualCluster -from nemo_rl.environments.tools.retriever import RAGEnvironment, RAGEnvConfig +from nemo_rl.environments.tools.retriever import RAGEnvConfig, RAGEnvironment from nemo_rl.experience.rollouts import run_multi_turn_rollout from nemo_rl.models.generation.interfaces import configure_generation_config from nemo_rl.models.generation.vllm import VllmConfig, VllmGeneration @@ -119,16 +117,22 @@ def test_vllm_retrieve(cluster, tokenizer, rag_env): for query in queries: # Tokenize the message content prompt = query * 4 - token_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)["input_ids"][0] - message_logs.append([{"role": "user", "content": prompt, "token_ids": token_ids}]) + token_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)[ + "input_ids" + ][0] + message_logs.append( + [{"role": "user", "content": prompt, "token_ids": token_ids}] + ) # Create initial batch - initial_batch = BatchedDataDict({ - "message_log": message_logs, - "extra_env_info": [{}] * len(queries), # No metadata needed for RAG - "task_name": ["document_retrieval"] * len(queries), - "stop_strings": [[""]] * len(queries), - }) + initial_batch = BatchedDataDict( + { + "message_log": message_logs, + "extra_env_info": [{}] * len(queries), # No metadata needed for RAG + "task_name": 
["document_retrieval"] * len(queries), + "stop_strings": [[""]] * len(queries), + } + ) # Create vLLM generation vllm_config = basic_vllm_test_config.copy() @@ -156,4 +160,6 @@ def test_vllm_retrieve(cluster, tokenizer, rag_env): # Get the last message which should contain the result last_msg = msg_log[-1] assert last_msg["role"] == "environment" - assert last_msg["content"] == expected_results[i], f"Expected {expected_results[i]}, got {last_msg['content']}" + assert last_msg["content"] == expected_results[i], ( + f"Expected {expected_results[i]}, got {last_msg['content']}" + ) From 9563db26575989fe5b4f0c7dd85a29036a84ccbc Mon Sep 17 00:00:00 2001 From: KiddoZhu Date: Mon, 9 Jun 2025 14:17:43 -0700 Subject: [PATCH 5/7] clean up old impleementation & test passed Signed-off-by: KiddoZhu --- nemo_rl/environments/tools/retriever.py | 99 ++++- nemo_rl/tools/__init__.py | 0 nemo_rl/tools/generation.py | 236 ------------ nemo_rl/tools/interfaces.py | 20 - nemo_rl/tools/tools.py | 199 ---------- .../test_code_environment.py} | 10 +- .../test_retriever.py | 9 +- tests/unit/tools/test_tools.py | 351 ------------------ 8 files changed, 113 insertions(+), 811 deletions(-) delete mode 100644 nemo_rl/tools/__init__.py delete mode 100644 nemo_rl/tools/generation.py delete mode 100644 nemo_rl/tools/interfaces.py delete mode 100644 nemo_rl/tools/tools.py rename tests/unit/{experience/test_code.py => environments/test_code_environment.py} (96%) rename tests/unit/{experience => environments}/test_retriever.py (95%) delete mode 100644 tests/unit/tools/test_tools.py diff --git a/nemo_rl/environments/tools/retriever.py b/nemo_rl/environments/tools/retriever.py index 655f5a801f..cc62d8a2af 100644 --- a/nemo_rl/environments/tools/retriever.py +++ b/nemo_rl/environments/tools/retriever.py @@ -12,16 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. import re +import math from typing import Any, Dict, List, TypedDict +from collections import Counter +from tqdm import tqdm import ray import torch from datasets import load_dataset +from transformers import AutoTokenizer from nemo_rl.data.interfaces import LLMMessageLogType from nemo_rl.distributed.batched_data_dict import BatchedDataDict from nemo_rl.environments.interfaces import EnvironmentInterface, EnvironmentReturn -from nemo_rl.tools.tools import BM25Retriever class RAGEnvConfig(TypedDict): @@ -34,6 +37,100 @@ class RAGEnvConfig(TypedDict): device: str # Device to compute BM25 +class BM25Retriever: + """Sparse BM25 retriever. + + Args: + documents: list of documents to retrieve from + num_result: retrieve top-k documents + k1: parameter of BM25. Values in [1.2, 2.0] are recommended. + b: parameter of BM25. 0.75 is recommended. 
+ device: device to compute BM25 + """ + + def __init__( + self, + documents: List[str] = None, + num_result: int = 10, + k1: float = 1.5, + b: float = 0.75, + device: str = "cpu", + ): + if documents is None: + dataset = load_dataset("wikimedia/wikipedia", "20231101.en") + self.documents = [sample["text"] for sample in dataset["train"]] + else: + self.documents = documents + self.tokenizer = AutoTokenizer.from_pretrained( + "bert-base-uncased", use_fast=True + ) + self.num_result = num_result + self.k1 = k1 + self.b = b + self.device = device + self.corpus_size = len(self.documents) + self.vocab_size = self.tokenizer.vocab_size + + self.build_index() + + def build_index(self): + doc_ids = [] + token_ids = [] + tfs = [] + lengths = [] + + for i, document in enumerate( + tqdm(self.documents, "Build index for BM25Retriever") + ): + input_ids = self.tokenizer.encode(document, add_special_tokens=False) + token2cnt = Counter(input_ids) + token_ids += token2cnt.keys() + tfs += token2cnt.values() + doc_ids += [i] * len(token2cnt) + lengths.append(len(input_ids)) + + avg_dl = sum(lengths) / self.corpus_size + for i, doc_id in enumerate(doc_ids): + tfs[i] = ( + tfs[i] + * (self.k1 + 1) + / (tfs[i] + self.k1 * (1 - self.b + self.b * lengths[doc_id] / avg_dl)) + ) + + indices = torch.tensor([doc_ids, token_ids], device=self.device) + values = torch.tensor(tfs, device=self.device) + self.doc_tfs = torch.sparse_coo_tensor( + indices, values, (self.corpus_size, self.vocab_size) + ) + + idfs = [0] * self.vocab_size + token2df = Counter(token_ids) + for token_id, df in token2df.items(): + idfs[token_id] = math.log((self.corpus_size - df + 0.5) / (df + 0.5) + 1) + self.idfs = idfs + + def __call__(self, query: str) -> List[str]: + input_ids = self.tokenizer.encode(query, add_special_tokens=False) + token2cnt = Counter(input_ids) + token_ids = [] + query_idfs = [] + for token_id, query_tf in token2cnt.items(): + token_ids.append(token_id) + query_idfs.append(query_tf * self.idfs[token_id]) + + indices = torch.tensor([token_ids, [0] * len(token_ids)], device=self.device) + values = torch.tensor(query_idfs, device=self.device) + query_idfs = torch.sparse_coo_tensor(indices, values, (self.vocab_size, 1)) + + scores = torch.sparse.mm(self.doc_tfs, query_idfs) + scores = scores.to_dense().squeeze(-1) + results = [] + for i in scores.topk(k=self.num_result).indices.tolist(): + results.append(self.documents[i]) + + return results + + @ray.remote class RAGEnvironment(EnvironmentInterface): """RAG environment that uses BM25 for document retrieval.""" diff --git a/nemo_rl/tools/__init__.py b/nemo_rl/tools/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/nemo_rl/tools/generation.py b/nemo_rl/tools/generation.py deleted file mode 100644 index 06f2f966b3..0000000000 --- a/nemo_rl/tools/generation.py +++ /dev/null @@ -1,236 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
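The BM25Retriever above precomputes the document side of BM25 as a sparse corpus-by-vocabulary matrix of saturated term frequencies tf*(k1+1)/(tf + k1*(1 - b + b*|d|/avgdl)), with idf(t) = ln((N - df(t) + 0.5)/(df(t) + 0.5) + 1); a query is then scored by multiplying that matrix with its idf- and count-weighted term vector. A minimal usage sketch with a toy corpus (the documents and query are illustrative only):

    from nemo_rl.environments.tools.retriever import BM25Retriever

    docs = [
        "Nvidia designs graphics processing units.",
        "BM25 is a bag-of-words ranking function.",
        "The Amazon is the largest rainforest on Earth.",
    ]
    retriever = BM25Retriever(documents=docs, num_result=2, k1=1.5, b=0.75, device="cpu")
    top_docs = retriever("Which company designs GPUs?")  # list of the 2 best-scoring documents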
-import re -import warnings -from pprint import pformat -from typing import Dict - -import ray -import torch -from torch.nn.utils.rnn import pad_sequence -from transformers import AutoTokenizer - -from nemo_rl.distributed.batched_data_dict import BatchedDataDict -from nemo_rl.models.generation.interfaces import ( - GenerationDatumSpec, - GenerationInterface, - GenerationOutputSpec, -) -from nemo_rl.tools.interfaces import ToolInterface -from nemo_rl.tools.tools import StatefulCodeExecutor - -LOGIT_INFINITY = 1000 - - -def generate_with_code_and_tools( - policy: GenerationInterface, - input_batch: BatchedDataDict[GenerationDatumSpec], - tokenizer: AutoTokenizer, - execute_code: bool = True, - tool_map: Dict[str, ToolInterface] = {}, - tag: str = "", - result_tag: str = "", - *args, - **kwargs, -) -> BatchedDataDict[GenerationOutputSpec]: - """Generate a batch of data with code execution and tool use. - - All code execution and tool calls in the generation will be executed on-the-fly, - of which the results will be appended to the output. Multiple code execution and tool calls - is supported. - - This function can be used as a drop-in replacement of `policy.generate()`. - - Args: - policy: policy to generate from. Can be either vllm or HuggingFace backend - input_batch: BatchedDataDict containing input_ids and input_lengths tensors - tokenizer: tokenizer from the pretrained model - execute_code: whether to execute code - tool_map: tools that the model can use - tag: xml tag to detect code snippet - result_tag: xml tag to output the result - *args, **kwargs: arguments and keyword arguments accepted by `policy.generate()` - """ - if tool_map and not execute_code: - warnings.warn( - "Tool use requires code execution, but code execution is disabled. All the tools will be ignored." - ) - - batch = input_batch.copy() - start_tag = tag - end_tag = tag.replace("<", " 0: - generation_outputs = policy.generate(active_batch, *args, **kwargs) - - output_ids = generation_outputs["output_ids"] - # only contains logprobs for newly generated tokens - logprobs = generation_outputs["logprobs"] - input_lengths = active_batch["input_lengths"] - total_lengths = generation_outputs["unpadded_sequence_lengths"] - if old_logprobs is not None: - # restore logprobs for tokens generated in previous iterations - for i, input_length in enumerate(input_lengths): - logprobs[i, :input_length] = old_logprobs[i, :input_length] - - # extract newly generated tokens - generated_ids = [] - for output_id, input_length, total_length in zip( - output_ids, input_lengths, total_lengths - ): - generated_ids.append(output_id[input_length:total_length]) - - generated_texts = tokenizer.batch_decode( - generated_ids, skip_special_tokens=True - ) - - is_code = [] - exprs = [] - lookaheads = [] - # parse newly generated texts - for i, (generated_text, active_index, total_length) in enumerate( - zip(generated_texts, active_indices, total_lengths) - ): - match = re.search( - rf"{start_tag}(.*){end_tag}(.*)", generated_text, re.DOTALL - ) - if match: - # stop is caused by code execution - # expr takes everything between and , including new lines - # lookahead takes everything after - is_code.append(i) - expr, lookahead = match.groups() - exprs.append(expr) - lookaheads.append(lookahead) - else: - # stop is not caused by code execution - # e.g. 
eos token, max length or other stop strings - completed_output_ids[active_index] = output_ids[i, :total_length] - completed_logprobs[active_index] = logprobs[i, :total_length] - if len(is_code) == 0: - break - - # execute all code in this batch - futures = [] - for i, expr, lookahead in zip(is_code, exprs, lookaheads): - active_index = active_indices[i] - # dispatch code to a pre-allocated executor for that sample - # so that functions and variables will be carried over - future = executors[active_index].__call__.remote(expr) - futures.append(future) - results = ray.get(futures) - - new_results = [] - for result in results: - if result is None: - # no return value - result = "" - new_results.extend(result) - continue - result = pformat(result) - if "\n" in expr or "\n" in result: - # multi-line format - result = f"\n\n{result_start}\n{result}\n{result_end}" - else: - # inline format - result = f"{result_start}{result}{result_end}" - if lookahead: - if result.startswith(lookahead): - # The generation may look like "\n" if ">\n" is a single token. - # We trim \n from the result if the model has already generated it. - result = result[len(lookahead) :] - else: - warnings.warn( - f"Expect the generation to stop at {repr(end_tag)}, but got {repr(end_tag + lookahead)}. " - "This is because some characters are merged into a single token by the tokenizer. " - "These extra characters will be kept in the generation." - ) - new_results.append(result) - - encodings = tokenizer( - new_results, - add_special_tokens=False, - padding=True, - padding_side="right", - return_tensors="pt", - ) - result_ids = encodings["input_ids"] - result_lengths = encodings["attention_mask"].sum(dim=1).to(torch.int32) - - is_code = torch.tensor(is_code) - # reduce active batch to those containing code - active_batch = active_batch.select_indices(is_code) - active_indices = active_indices[is_code] - output_ids = output_ids[is_code] - logprobs = logprobs[is_code] - total_lengths = total_lengths[is_code] - # max length before appending results - old_max_length = total_lengths.max() - # max length after appending results - new_max_length = (total_lengths + result_lengths).max() - new_output_ids = torch.full( - (len(active_indices), new_max_length), - tokenizer.pad_token_id, - dtype=output_ids.dtype, - ) - new_logprobs = torch.full( - (len(active_indices), new_max_length), 0, dtype=logprobs.dtype - ) - new_output_ids[:, :old_max_length] = output_ids[:, :old_max_length] - new_logprobs[:, :old_max_length] = logprobs[:, :old_max_length] - - # append results to generation - for i, (old_length, result_length) in enumerate( - zip(total_lengths, result_lengths) - ): - new_length = old_length + result_length - new_output_ids[i, old_length:new_length] = result_ids[i, :result_length] - new_logprobs[i, old_length:new_length] = LOGIT_INFINITY - - active_batch["input_ids"] = new_output_ids - active_batch["input_lengths"] = total_lengths + result_lengths - old_logprobs = new_logprobs - - output_ids = pad_sequence( - completed_output_ids, - batch_first=True, - padding_value=tokenizer.pad_token_id, - padding_side="right", - ) - logprobs = pad_sequence( - completed_logprobs, batch_first=True, padding_value=0.0, padding_side="right" - ) - total_lengths = torch.tensor([len(output_id) for output_id in completed_output_ids]) - generation_lengths = total_lengths - input_batch["input_lengths"] - - return { - "output_ids": output_ids, - "logprobs": logprobs, - "generation_lengths": generation_lengths, - "unpadded_sequence_lengths": total_lengths, - } 
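Both the CodeExecutionWorker.execute() added earlier and the StatefulCodeExecutor removed below decide whether a snippet has a value to report with the same ast check: if the last top-level node is an expression, everything before it is exec'd and the trailing expression is eval'd ("interactive mode"); otherwise the whole snippet is exec'd silently. A self-contained sketch of that split (function and variable names are illustrative):

    import ast

    def run_interactively(code: str, context: dict):
        """Exec the statements, but eval a trailing expression so its value can be returned."""
        tree = ast.parse(code)
        if tree.body and isinstance(tree.body[-1], ast.Expr):
            exec(ast.unparse(tree.body[:-1]), {}, context)
            return eval(ast.unparse(tree.body[-1]), {}, context)
        exec(code, {}, context)
        return None

    ctx = {}
    print(run_interactively("x = 3\ny = 4\nx + y", ctx))  # 7    (interactive mode)
    print(run_interactively("z = x * y", ctx))            # None (silent mode; z lands in ctx)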
diff --git a/nemo_rl/tools/interfaces.py b/nemo_rl/tools/interfaces.py deleted file mode 100644 index a37a3b6f10..0000000000 --- a/nemo_rl/tools/interfaces.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from abc import ABC, abstractmethod - - -class ToolInterface(ABC): - @abstractmethod - def __call__(self, *args, **kwargs): - pass diff --git a/nemo_rl/tools/tools.py b/nemo_rl/tools/tools.py deleted file mode 100644 index 1af1977926..0000000000 --- a/nemo_rl/tools/tools.py +++ /dev/null @@ -1,199 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import ast -import builtins -import math -import os -import tempfile -from collections import Counter -from contextlib import contextmanager -from typing import Any, Dict, List, Optional - -import ray -import torch -from datasets import load_dataset -from tqdm import tqdm -from transformers import AutoTokenizer - -from nemo_rl.tools.interfaces import ToolInterface - - -@ray.remote -class StatefulCodeExecutor(ToolInterface): - """Stateful code executor. - - Args: - context: classes, functions and variables accessible to the code executor. - By passing tools in context, the code executor also serves tool use. 
- """ - - def __init__(self, context: Dict[str, Any] = {}): - self.context = context.copy() - self.tmp_dir = tempfile.TemporaryDirectory() - - builtin_dict = {k: getattr(builtins, k) for k in dir(builtins)} - builtin_dict["open"] = self.safe_open - builtin_dict["__import__"] = self.safe_import - self.sandbox = {"__builtins__": builtin_dict} - - def __call__(self, code: str) -> Optional[str]: - tree = ast.parse(code) - - if tree.body and isinstance(tree.body[-1], ast.Expr): - # interactive mode - code = ast.unparse(tree.body[:-1]) - expr = ast.unparse(tree.body[-1]) - else: - # silent mode - expr = None - - try: - # isolate the code in a sandbox with globals={} - # capture local variables in self.context - with self.change_temporary_dir(): - exec(code, self.sandbox, self.context) - if expr: - return eval(expr, self.sandbox, self.context) - except Exception as err: - return err - - @contextmanager - def change_temporary_dir(self): - current_dir = os.getcwd() - os.chdir(self.tmp_dir.name) - try: - yield - finally: - os.chdir(current_dir) - - def safe_open(self, file, *args, **kwargs): - real_file = os.path.realpath(file) - tmp_dir = os.path.realpath(self.tmp_dir.name) - if os.path.commonpath([real_file, tmp_dir]) != tmp_dir: - # real_file is not inside tmp_dir - raise PermissionError( - "Access beyond the temporary working directory is blocked" - ) - return open(file, *args, **kwargs) - - def safe_import(self, name, *args, **kwargs): - risky_modules = { - "os", - "shutil", # erase filesystem - "sys", - "signal", # exit the current program - "socket", # network communication - "subprocess", - "threading", - "multiprocessing", # spawn threads or processes - "builtins", - "importlib", # bypass current blockers - } - if name in risky_modules: - raise PermissionError("Importing system and network modules is blocked") - return builtins.__import__(name, *args, **kwargs) - - -class BM25Retriever(ToolInterface): - """Sparse BM25 retriever. - - Args: - documents: list of documents to retrieve from - num_result: retrieve top-k documents - k1: parameter of BM25. Values in [1.2, 2.0] are recommended. - b: parameter of BM25. 0.75 is recommended. 
- device: device to compute BM25 - """ - - def __init__( - self, - documents: List[str] = None, - num_result: int = 10, - k1: float = 1.5, - b: float = 0.75, - device: str = "cpu", - ): - if documents is None: - dataset = load_dataset("wikimedia/wikipedia", "20231101.en") - self.documents = [sample["text"] for sample in dataset["train"]] - else: - self.documents = documents - self.tokenizer = AutoTokenizer.from_pretrained( - "bert-base-uncased", use_fast=True - ) - self.num_result = num_result - self.k1 = k1 - self.b = b - self.device = device - self.corpus_size = len(self.documents) - self.vocab_size = self.tokenizer.vocab_size - - self.build_index() - - def build_index(self): - doc_ids = [] - token_ids = [] - tfs = [] - lengths = [] - - for i, document in enumerate( - tqdm(self.documents, "Build index for BM25Retriever") - ): - input_ids = self.tokenizer.encode(document, add_special_tokens=False) - token2cnt = Counter(input_ids) - token_ids += token2cnt.keys() - tfs += token2cnt.values() - doc_ids += [i] * len(token2cnt) - lengths.append(len(input_ids)) - - avg_dl = sum(lengths) / self.corpus_size - for i, doc_id in enumerate(doc_ids): - tfs[i] = ( - tfs[i] - * (self.k1 + 1) - / (tfs[i] + self.k1 * (1 - self.b + self.b * lengths[doc_id] / avg_dl)) - ) - - indices = torch.tensor([doc_ids, token_ids], device=self.device) - values = torch.tensor(tfs, device=self.device) - self.doc_tfs = torch.sparse_coo_tensor( - indices, values, (self.corpus_size, self.vocab_size) - ) - - idfs = [0] * self.vocab_size - token2df = Counter(token_ids) - for token_id, df in token2df.items(): - idfs[token_id] = math.log((self.corpus_size - df + 0.5) / (df + 0.5) + 1) - self.idfs = idfs - - def __call__(self, query: str) -> List[str]: - input_ids = self.tokenizer.encode(query, add_special_tokens=False) - token2cnt = Counter(input_ids) - token_ids = [] - query_idfs = [] - for token_id, query_tf in token2cnt.items(): - token_ids.append(token_id) - query_idfs.append(query_tf * self.idfs[token_id]) - - indices = torch.tensor([token_ids, [0] * len(token_ids)], device=self.device) - values = torch.tensor(query_idfs, device=self.device) - query_idfs = torch.sparse_coo_tensor(indices, values, (self.vocab_size, 1)) - - scores = torch.sparse.mm(self.doc_tfs, query_idfs) - scores = scores.to_dense().squeeze(-1) - results = [] - for i in scores.topk(k=self.num_result).indices.tolist(): - results.append(self.documents[i]) - - return results diff --git a/tests/unit/experience/test_code.py b/tests/unit/environments/test_code_environment.py similarity index 96% rename from tests/unit/experience/test_code.py rename to tests/unit/environments/test_code_environment.py index 7e06ffc45a..732edf682b 100644 --- a/tests/unit/experience/test_code.py +++ b/tests/unit/environments/test_code_environment.py @@ -26,7 +26,7 @@ CodeEnvMetadata, ) from nemo_rl.experience.rollouts import run_multi_turn_rollout -from nemo_rl.models.generation.interfaces import configure_generation_config +from nemo_rl.models.generation import configure_generation_config from nemo_rl.models.generation.vllm import VllmConfig, VllmGeneration from nemo_rl.models.policy.hf_policy import HfPolicy, PolicyConfig @@ -50,9 +50,14 @@ "stop_token_ids": None, "stop_strings": None, "vllm_cfg": { + "async_engine": False, + "precision": "bfloat16", "tensor_parallel_size": 1, - "gpu_memory_utilization": 0.3, + "pipeline_parallel_size": 1, "max_model_len": 1024, + "disable_log_stats": True, + "disable_log_requests": True, + "gpu_memory_utilization": 0.6, }, } @@ -89,6 +94,7 @@ }, 
}, "dtensor_cfg": {"enabled": False}, + "dynamic_batching": {"enabled": False}, } diff --git a/tests/unit/experience/test_retriever.py b/tests/unit/environments/test_retriever.py similarity index 95% rename from tests/unit/experience/test_retriever.py rename to tests/unit/environments/test_retriever.py index 0c059c1453..457dc5bc4b 100644 --- a/tests/unit/experience/test_retriever.py +++ b/tests/unit/environments/test_retriever.py @@ -20,7 +20,7 @@ from nemo_rl.distributed.virtual_cluster import RayVirtualCluster from nemo_rl.environments.tools.retriever import RAGEnvConfig, RAGEnvironment from nemo_rl.experience.rollouts import run_multi_turn_rollout -from nemo_rl.models.generation.interfaces import configure_generation_config +from nemo_rl.models.generation import configure_generation_config from nemo_rl.models.generation.vllm import VllmConfig, VllmGeneration MODEL_NAME = "meta-llama/Llama-3.2-1B" @@ -48,9 +48,14 @@ "stop_token_ids": None, "stop_strings": None, "vllm_cfg": { + "async_engine": False, + "precision": "bfloat16", "tensor_parallel_size": 1, - "gpu_memory_utilization": 0.3, + "pipeline_parallel_size": 1, "max_model_len": 1024, + "disable_log_stats": True, + "disable_log_requests": True, + "gpu_memory_utilization": 0.6, }, } diff --git a/tests/unit/tools/test_tools.py b/tests/unit/tools/test_tools.py deleted file mode 100644 index a22ca03c3c..0000000000 --- a/tests/unit/tools/test_tools.py +++ /dev/null @@ -1,351 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from copy import deepcopy - -import pytest -import ray -import torch -from datasets import load_dataset -from transformers import AutoTokenizer - -from nemo_rl.distributed.batched_data_dict import BatchedDataDict -from nemo_rl.distributed.virtual_cluster import RayVirtualCluster -from nemo_rl.models.generation.interfaces import configure_generation_config -from nemo_rl.models.generation.vllm import VllmConfig, VllmGeneration -from nemo_rl.models.policy.hf_policy import HfPolicy, PolicyConfig -from nemo_rl.tools.generation import generate_with_code_and_tools -from nemo_rl.tools.tools import BM25Retriever, StatefulCodeExecutor - -MODEL_NAME = "meta-llama/Llama-3.2-1B" - - -# Define basic vLLM test config -basic_vllm_test_config: VllmConfig = { - "backend": "vllm", - "model_name": MODEL_NAME, - "tokenizer_name": None, - "dtype": "bfloat16", - "max_new_tokens": 100, - "temperature": 1.0, - "top_p": 1.0, - "top_k": None, - "stop_token_ids": None, - "stop_strings": None, - "vllm_cfg": { - "tensor_parallel_size": 1, - "gpu_memory_utilization": 0.3, - "max_model_len": 1024, - }, -} - -basic_hf_test_config: PolicyConfig = { - "model_name": MODEL_NAME, - "tokenizer_name": None, - "generation_batch_size": 1, - "generation": { - "backend": "hf", - "max_new_tokens": 100, - "temperature": 1.0, - "top_p": 1.0, - "top_k": None, - "stop_token_ids": None, - "stop_strings": None, - }, - # Required training parameters - "train_global_batch_size": 1, - "train_micro_batch_size": 1, - "learning_rate": 5e-6, - "logprob_batch_size": 1, - "max_new_tokens": 16, - "do_sample": False, - "precision": "float32", - "activation_checkpointing_enabled": False, - "fsdp_offload_enabled": False, - "optimizer": { - "name": "torch.optim.AdamW", - "kwargs": { - "lr": 5e-6, - "weight_decay": 0.01, - "betas": [0.9, 0.999], - "eps": 1e-8, - }, - }, - "dtensor_cfg": {"enabled": False}, -} - - -@pytest.fixture(scope="module") -def cluster(): - """Create a virtual cluster for testing.""" - # Create a cluster with 1 node that has 1 GPU bundles - virtual_cluster = RayVirtualCluster( - bundle_ct_per_node_list=[1], # 1 node with 1 GPU bundle - use_gpus=True, - max_colocated_worker_groups=2, - num_gpus_per_node=1, # Use available GPUs - name="vllm-test-cluster", - ) - yield virtual_cluster - virtual_cluster.shutdown() - - -@pytest.fixture(scope="function") -def tokenizer(): - """Loads the tokenizer for the tests.""" - print(f"Loading tokenizer: {MODEL_NAME}") - tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True) - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.eos_token - print( - f"Tokenizer loaded. 
Pad token: {tokenizer.pad_token} (ID: {tokenizer.pad_token_id}), EOS token: {tokenizer.eos_token} (ID: {tokenizer.eos_token_id})" - ) - return tokenizer - - -def test_vllm_execute_code(cluster, tokenizer): - """Test that vLLM can call the code executor.""" - # Prepare test data - codes = [ - "x = 3; y = 4\nThis is some regular text.\nx + y\n", - "\ndef f(x):\n return x * x\n\nf(2)\n\n", - ] - results = ["7", "\n\n4\n"] - results = [code + result for code, result in zip(codes, results)] - - test_prompts = [code * 4 for code in codes] - encodings = tokenizer( - test_prompts, - padding="max_length", - max_length=1024, - return_tensors="pt", - padding_side="right", - ) - input_lengths = encodings["attention_mask"].sum(dim=1).to(torch.int32) - batch = BatchedDataDict( - { - "input_ids": encodings["input_ids"], - "input_lengths": input_lengths, - } - ) - - # Create separate configs for each policy - vllm_config = basic_vllm_test_config.copy() - vllm_config = configure_generation_config(vllm_config, tokenizer, is_eval=True) - - # Create vLLM generation - vllm_generation = VllmGeneration(cluster, vllm_config) - - # Generate and check result - outputs = generate_with_code_and_tools( - vllm_generation, batch, tokenizer, greedy=True - ) - - all_output_ids = outputs["output_ids"] - logprobs = outputs["logprobs"] - input_lengths = outputs["unpadded_sequence_lengths"] - outputs["generation_lengths"] - output_lengths = outputs["unpadded_sequence_lengths"] - input_ids = [] - output_ids = [] - for all_output_id, input_length, output_length in zip( - all_output_ids, input_lengths, output_lengths - ): - input_ids.append(all_output_id[:input_length]) - output_ids.append(all_output_id[input_length:output_length]) - indices = torch.arange(all_output_ids.shape[-1]) - input_lengths = input_lengths.unsqueeze(-1) - output_lengths = output_lengths.unsqueeze(-1) - is_generated = (indices >= input_lengths) & (indices < output_lengths) - - input_texts = tokenizer.batch_decode(input_ids, skip_special_tokens=True) - output_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True) - - assert input_texts == test_prompts, "Unexpected modification to input texts" - assert output_texts == results, f"Expect {results}, got wrong output {output_texts}" - assert (logprobs[~is_generated] == 0.0).all(), ( - "Unexpected log probabilities on input tokens or paddings" - ) - assert (logprobs[is_generated] != 0.0).all(), ( - "Generated tokens must have non-trivial log probabilities" - ) - - # Clean up - vllm_generation.shutdown() - - -def test_hf_execute_code(cluster, tokenizer): - """Test that Huggingface models can call the code executor.""" - # Prepare test data - codes = [ - "x = 3; y = 4\nThis is some regular text.\nx + y\n", - "\ndef f(x):\n return x * x\n\nf(2)\n\n", - ] - results = ["7", "\n\n4\n"] - results = [code + result for code, result in zip(codes, results)] - - test_prompts = [code * 4 for code in codes] - encodings = tokenizer( - test_prompts, - padding="max_length", - max_length=1024, - return_tensors="pt", - padding_side="right", - ) - input_lengths = encodings["attention_mask"].sum(dim=1).to(torch.int32) - batch = BatchedDataDict( - { - "input_ids": encodings["input_ids"], - "input_lengths": input_lengths, - } - ) - - # Create separate configs for each policy - hf_config = deepcopy(basic_hf_test_config) - hf_config["generation"] = configure_generation_config( - hf_config["generation"], - tokenizer, # is_eval=True - ) - - # Create vLLM generation - hf_policy = HfPolicy( - cluster, hf_config, tokenizer, 
init_reference_model=False, init_optimizer=False - ) - - # Generate and check result - outputs = generate_with_code_and_tools(hf_policy, batch, tokenizer, greedy=True) - - all_output_ids = outputs["output_ids"] - logprobs = outputs["logprobs"] - input_lengths = outputs["unpadded_sequence_lengths"] - outputs["generation_lengths"] - output_lengths = outputs["unpadded_sequence_lengths"] - input_ids = [] - output_ids = [] - for all_output_id, input_length, output_length in zip( - all_output_ids, input_lengths, output_lengths - ): - input_ids.append(all_output_id[:input_length]) - output_ids.append(all_output_id[input_length:output_length]) - indices = torch.arange(all_output_ids.shape[-1]) - input_lengths = input_lengths.unsqueeze(-1) - output_lengths = output_lengths.unsqueeze(-1) - is_generated = (indices >= input_lengths) & (indices < output_lengths) - - input_texts = tokenizer.batch_decode(input_ids, skip_special_tokens=True) - output_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True) - - assert input_texts == test_prompts, "Unexpected modification to input texts" - assert output_texts == results, f"Expect {results}, got wrong output {output_texts}" - assert (logprobs[~is_generated] == 0.0).all(), ( - "Unexpected log probabilities on input tokens or paddings" - ) - assert (logprobs[is_generated] != 0.0).all(), ( - "Generated tokens must have non-trivial log probabilities" - ) - - # Clean up - hf_policy.shutdown() - - -def test_untrusted_code(cluster): - """Test whether the code executor can block untrusted code.""" - executor = StatefulCodeExecutor.remote() - - # accessing temporary files shouldn't be blocked - code = ( - "with open('allowed_file.txt', 'w') as fout:\n" - " fout.write('some content')\n" - "with open('allowed_file.txt') as fin:\n" - " content = fin.read()\n" - "content" - ) - result = ray.get(executor.__call__.remote(code)) - assert result == "some content" - - # accessing other files should be blocked - code = "with open('/etc/passwd', 'r') as fin:\n fin.read()" - result = ray.get(executor.__call__.remote(code)) - assert isinstance(result, PermissionError) - - # importing non-sensitive modules shouldn't be blocked - code = "import math\nround(math.sqrt(8))" - result = ray.get(executor.__call__.remote(code)) - assert result == 3 - - # importing sensitive modules should be blocked - code = "import os" - result = ray.get(executor.__call__.remote(code)) - assert isinstance(result, PermissionError) - - -@pytest.mark.timeout(150) -def test_vllm_use_tool(cluster, tokenizer): - """Test that vLLM can use tool in the code executor.""" - # Prepare test data - codes = ["retrieve('Jen-Hsun Huang')\n"] - results = [ - "\n\n" - "['Nvidia was established in 1993 by Jen-Hsun Huang, Curtis Priem, and Chris '\n" - " 'Malachowsky. 
In 2000 Nvidia took intellectual possession of 3dfx, one of the '\n" - " 'biggest GPU producers in 1990s.']\n" - "" - ] - results = [code + result for code, result in zip(codes, results)] - - test_prompts = [code * 4 for code in codes] - encodings = tokenizer( - test_prompts, - padding="max_length", - max_length=1024, - return_tensors="pt", - padding_side="right", - ) - input_lengths = encodings["attention_mask"].sum(dim=1).to(torch.int32) - batch = BatchedDataDict( - { - "input_ids": encodings["input_ids"], - "input_lengths": input_lengths, - } - ) - - # Construct retriever - dataset = load_dataset("rahular/simple-wikipedia") - documents = [sample["text"] for sample in dataset["train"]] - tool_map = {"retrieve": BM25Retriever(documents, num_result=1)} - - # Create separate configs for each policy - vllm_config = basic_vllm_test_config.copy() - vllm_config = configure_generation_config(vllm_config, tokenizer, is_eval=True) - - # Create vLLM generation - vllm_generation = VllmGeneration(cluster, vllm_config) - - # Generate and check result - outputs = generate_with_code_and_tools( - vllm_generation, batch, tokenizer, tool_map=tool_map, greedy=True - ) - - all_output_ids = outputs["output_ids"] - input_lengths = outputs["unpadded_sequence_lengths"] - outputs["generation_lengths"] - output_lengths = outputs["unpadded_sequence_lengths"] - output_ids = [] - for all_output_id, input_length, output_length in zip( - all_output_ids, input_lengths, output_lengths - ): - output_ids.append(all_output_id[input_length:output_length]) - - output_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True) - - assert output_texts == results, f"Expect {results}, got wrong output {output_texts}" - - # Clean up - vllm_generation.shutdown() From 5bed0757c119b64344cbe1f60d6133dba3a6c5cf Mon Sep 17 00:00:00 2001 From: KiddoZhu Date: Mon, 28 Jul 2025 16:16:27 -0700 Subject: [PATCH 6/7] remove hf path Signed-off-by: KiddoZhu --- .../ray_actor_environment_registry.py | 2 + nemo_rl/environments/code_environment.py | 4 +- nemo_rl/environments/tools/retriever.py | 6 +- .../environments/test_code_environment.py | 113 +----------------- tests/unit/environments/test_retriever.py | 8 ++ 5 files changed, 20 insertions(+), 113 deletions(-) diff --git a/nemo_rl/distributed/ray_actor_environment_registry.py b/nemo_rl/distributed/ray_actor_environment_registry.py index 277619bb92..e300aec54b 100644 --- a/nemo_rl/distributed/ray_actor_environment_registry.py +++ b/nemo_rl/distributed/ray_actor_environment_registry.py @@ -21,7 +21,9 @@ "nemo_rl.models.policy.dtensor_policy_worker.DTensorPolicyWorker": PY_EXECUTABLES.VLLM, "nemo_rl.models.policy.megatron_policy_worker.MegatronPolicyWorker": PY_EXECUTABLES.MCORE, "nemo_rl.environments.math_environment.MathEnvironment": PY_EXECUTABLES.SYSTEM, + "nemo_rl.environments.code_environment.CodeEnvironment": PY_EXECUTABLES.SYSTEM, "nemo_rl.environments.games.sliding_puzzle.SlidingPuzzleEnv": PY_EXECUTABLES.SYSTEM, + "nemo_rl.environments.tools.retriever.RAGEnvironment": PY_EXECUTABLES.SYSTEM, } diff --git a/nemo_rl/environments/code_environment.py b/nemo_rl/environments/code_environment.py index cb72c1532e..c340d1980b 100644 --- a/nemo_rl/environments/code_environment.py +++ b/nemo_rl/environments/code_environment.py @@ -48,7 +48,6 @@ class CodeEnvMetadata(TypedDict): @ray.remote class CodeExecutionWorker: - DEFAULT_PY_EXECUTABLE = PY_EXECUTABLES.SYSTEM """Helper class to process individual code execution steps.""" def __init__(self): @@ -188,7 +187,6 @@ def safe_import(self, 
name: str, *args, **kwargs): @ray.remote class CodeEnvironment(EnvironmentInterface): - DEFAULT_PY_EXECUTABLE = PY_EXECUTABLES.SYSTEM """Code execution environment that maintains state between steps.""" def __init__(self, cfg: CodeEnvConfig): @@ -197,7 +195,7 @@ def __init__(self, cfg: CodeEnvConfig): self.terminate_on_evaluation = cfg["terminate_on_evaluation"] self.workers = [ CodeExecutionWorker.options( - runtime_env={"py_executable": CodeExecutionWorker.DEFAULT_PY_EXECUTABLE} + runtime_env={"py_executable": PY_EXECUTABLES.SYSTEM} ).remote() for _ in range(self.num_workers) ] diff --git a/nemo_rl/environments/tools/retriever.py b/nemo_rl/environments/tools/retriever.py index cc62d8a2af..4109a21a05 100644 --- a/nemo_rl/environments/tools/retriever.py +++ b/nemo_rl/environments/tools/retriever.py @@ -11,15 +11,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import re import math -from typing import Any, Dict, List, TypedDict +import re from collections import Counter -from tqdm import tqdm +from typing import Any, Dict, List, TypedDict import ray import torch from datasets import load_dataset +from tqdm import tqdm from transformers import AutoTokenizer from nemo_rl.data.interfaces import LLMMessageLogType diff --git a/tests/unit/environments/test_code_environment.py b/tests/unit/environments/test_code_environment.py index 732edf682b..dd5b8de7a6 100644 --- a/tests/unit/environments/test_code_environment.py +++ b/tests/unit/environments/test_code_environment.py @@ -28,7 +28,6 @@ from nemo_rl.experience.rollouts import run_multi_turn_rollout from nemo_rl.models.generation import configure_generation_config from nemo_rl.models.generation.vllm import VllmConfig, VllmGeneration -from nemo_rl.models.policy.hf_policy import HfPolicy, PolicyConfig MODEL_NAME = "meta-llama/Llama-3.2-1B" @@ -58,43 +57,15 @@ "disable_log_stats": True, "disable_log_requests": True, "gpu_memory_utilization": 0.6, + "enforce_eager": "False", }, -} - -basic_hf_test_config: PolicyConfig = { - "model_name": MODEL_NAME, - "tokenizer_name": None, - "generation_batch_size": 1, - "generation": { - "backend": "hf", - "max_new_tokens": 100, - "temperature": 1.0, - "top_p": 1.0, - "top_k": None, - "stop_token_ids": None, - "stop_strings": None, - }, - # Required training parameters - "train_global_batch_size": 1, - "train_micro_batch_size": 1, - "learning_rate": 5e-6, - "logprob_batch_size": 1, - "max_new_tokens": 16, - "do_sample": False, - "precision": "float32", - "activation_checkpointing_enabled": False, - "fsdp_offload_enabled": False, - "optimizer": { - "name": "torch.optim.AdamW", - "kwargs": { - "lr": 5e-6, - "weight_decay": 0.01, - "betas": [0.9, 0.999], - "eps": 1e-8, + "colocated": { + "enabled": True, + "resources": { + "gpus_per_node": None, + "num_nodes": None, }, }, - "dtensor_cfg": {"enabled": False}, - "dynamic_batching": {"enabled": False}, } @@ -246,75 +217,3 @@ def test_vllm_execute_code(cluster, tokenizer, code_env): assert last_msg["content"] == results[i], ( f"Expected {results[i]}, got {last_msg['content']}" ) - - -def test_hf_execute_code(cluster, tokenizer, code_env): - """Test that Huggingface models can call the code executor.""" - # Prepare test data - codes = [ - "x = 3; y = 4\nThis is some regular text.\nx + y\n", - "\ndef f(x):\n return x * x\n\nf(2)\n\n", - ] - results = ["7", "\n\n4\n"] - - # Create message logs - message_logs = [] - metadata_batch = [] - 
temp_dirs = [] - for code in codes: - # Tokenize the message content - prompt = code * 4 - token_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)[ - "input_ids" - ][0] - temp_dir = TemporaryDirectory() - message_logs.append( - [{"role": "user", "content": prompt, "token_ids": token_ids}] - ) - metadata_batch.append(CodeEnvMetadata(context={}, working_dir=temp_dir.name)) - temp_dirs.append(temp_dir) - - # Create initial batch - initial_batch = BatchedDataDict( - { - "message_log": message_logs, - "extra_env_info": metadata_batch, - "task_name": ["code_execution"] * len(codes), - "stop_strings": [[""]] * len(codes), - } - ) - - # Create HF policy - hf_config = basic_hf_test_config.copy() - hf_config["generation"] = configure_generation_config( - hf_config["generation"], - tokenizer, - ) - hf_policy = HfPolicy( - cluster, hf_config, tokenizer, init_reference_model=False, init_optimizer=False - ) - - # Create code environment - task_to_env = {"code_execution": code_env} - - # Run rollout - hf_policy.prepare_for_generation() - final_batch, _ = run_multi_turn_rollout( - policy_generation=hf_policy, - input_batch=initial_batch, - tokenizer=tokenizer, - task_to_env=task_to_env, - max_seq_len=256, - max_rollout_turns=2, - greedy=True, - ) - hf_policy.finish_generation() - - # Check results - for i, msg_log in enumerate(final_batch["message_log"]): - # Get the last message which should contain the result - last_msg = msg_log[-1] - assert last_msg["role"] == "environment" - assert last_msg["content"] == results[i], ( - f"Expected {results[i]}, got {last_msg['content']}" - ) diff --git a/tests/unit/environments/test_retriever.py b/tests/unit/environments/test_retriever.py index 457dc5bc4b..a773d5dac0 100644 --- a/tests/unit/environments/test_retriever.py +++ b/tests/unit/environments/test_retriever.py @@ -56,6 +56,14 @@ "disable_log_stats": True, "disable_log_requests": True, "gpu_memory_utilization": 0.6, + "enforce_eager": "False", + }, + "colocated": { + "enabled": True, + "resources": { + "gpus_per_node": None, + "num_nodes": None, + }, }, } From 013fb299113500ba14fbaf992774f11ed52c0116 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Tue, 29 Jul 2025 21:13:17 +0000 Subject: [PATCH 7/7] terry nits Signed-off-by: Terry Kong --- nemo_rl/environments/code_environment.py | 4 ++-- nemo_rl/environments/tools/retriever.py | 2 +- tests/unit/environments/test_code_environment.py | 1 + tests/unit/environments/test_retriever.py | 1 + 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/nemo_rl/environments/code_environment.py b/nemo_rl/environments/code_environment.py index c340d1980b..029b9cd1c0 100644 --- a/nemo_rl/environments/code_environment.py +++ b/nemo_rl/environments/code_environment.py @@ -46,7 +46,7 @@ class CodeEnvMetadata(TypedDict): working_dir: str # Working directory for file operations -@ray.remote +@ray.remote # pragma: no cover class CodeExecutionWorker: """Helper class to process individual code execution steps.""" @@ -185,7 +185,7 @@ def safe_import(self, name: str, *args, **kwargs): return builtins.__import__(name, *args, **kwargs) -@ray.remote +@ray.remote # pragma: no cover class CodeEnvironment(EnvironmentInterface): """Code execution environment that maintains state between steps.""" diff --git a/nemo_rl/environments/tools/retriever.py b/nemo_rl/environments/tools/retriever.py index 4109a21a05..8f408fc92b 100644 --- a/nemo_rl/environments/tools/retriever.py +++ b/nemo_rl/environments/tools/retriever.py @@ -131,7 +131,7 @@ def __call__(self, query: str) 
-> List[str]: return results -@ray.remote +@ray.remote # pragma: no cover class RAGEnvironment(EnvironmentInterface): """RAG environment that uses BM25 for document retrieval.""" diff --git a/tests/unit/environments/test_code_environment.py b/tests/unit/environments/test_code_environment.py index dd5b8de7a6..27ada6d9bf 100644 --- a/tests/unit/environments/test_code_environment.py +++ b/tests/unit/environments/test_code_environment.py @@ -152,6 +152,7 @@ def test_untrusted_code(code_env): assert responses == results, f"Got wrong output {responses}" +@pytest.mark.hf_gated def test_vllm_execute_code(cluster, tokenizer, code_env): """Test that vLLM can call the code executor.""" # Prepare test data diff --git a/tests/unit/environments/test_retriever.py b/tests/unit/environments/test_retriever.py index a773d5dac0..824c09b041 100644 --- a/tests/unit/environments/test_retriever.py +++ b/tests/unit/environments/test_retriever.py @@ -113,6 +113,7 @@ def cluster(): cluster_instance.shutdown() +@pytest.mark.hf_gated def test_vllm_retrieve(cluster, tokenizer, rag_env): """Test that vLLM can use the RAG environment for document retrieval.""" # Prepare test data