From a1fe428b5269497093f5b01ea8bf17b0e4b468c8 Mon Sep 17 00:00:00 2001
From: Yi-Fu Wu
Date: Tue, 25 Mar 2025 15:10:53 -0700
Subject: [PATCH 01/27] WIP commit

Signed-off-by: Yi-Fu Wu
---
 examples/configs/grpo_math_1B.yaml           |  15 +-
 examples/run_grpo_math.py                    |   3 +-
 nemo_reinforcer/distributed/worker_groups.py |   1 +
 nemo_reinforcer/models/policy/hf_policy.py   | 214 +++++++++++++++++--
 4 files changed, 209 insertions(+), 24 deletions(-)

diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml
index 72aad000ce..a8149156e9 100644
--- a/examples/configs/grpo_math_1B.yaml
+++ b/examples/configs/grpo_math_1B.yaml
@@ -5,7 +5,7 @@ grpo:
   max_num_steps: 1000000
   normalize_rewards: true
   use_leave_one_out_baseline: true
-  val_period: 10
+  val_period: -1
   val_at_start: false
   max_val_samples: 256
   val_batch_size: 256
@@ -15,7 +15,7 @@ loss_fn:
   ratio_eps: 0.2
 
 checkpointing:
-  enabled: true
+  enabled: false
   checkpoint_dir: "results/grpo"
   metric_name: "val_reward"
   higher_is_better: true
@@ -29,7 +29,8 @@ policy:
   generation_batch_size: 32
   logprob_batch_size: 4
   max_total_sequence_length: 512
-  precision: "bfloat16"
+  # precision: "bfloat16"
+  precision: "float32"
 
   optimizer:
     name: "torch.optim.AdamW"
@@ -75,17 +76,17 @@ env:
 logger:
   log_dir: "logs" # Base directory for all logs
   num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal
-  wandb_enabled: false
+  wandb_enabled: true
   tensorboard_enabled: false
   monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard
   wandb:
-    project: "grpo-dev"
-    name: "grpo-dev-logger"
+    project: "grpo-dev-yifu"
+    name: "grpo-dev-offload2"
   tensorboard: {}
   gpu_monitoring:
     collection_interval: 10 # How often to collect GPU usage metrics (in seconds)
     flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds)
 
 cluster:
-  gpus_per_node: 1
+  gpus_per_node: 2
   num_nodes: 1
 
diff --git a/examples/run_grpo_math.py b/examples/run_grpo_math.py
index bc11687ab0..d3cb3fd805 100644
--- a/examples/run_grpo_math.py
+++ b/examples/run_grpo_math.py
@@ -23,7 +23,7 @@
 from transformers import AutoTokenizer
 from collections import defaultdict
 
-from nemo_reinforcer.algorithms.grpo import MasterConfig, grpo_train, setup
+from nemo_reinforcer.algorithms.grpo import MasterConfig, grpo_train, setup, refit_policy_generation
 from nemo_reinforcer.distributed.virtual_cluster import init_ray
 from nemo_reinforcer.utils.config import load_config
 from nemo_reinforcer.utils.logger import get_next_experiment_dir
@@ -271,6 +271,7 @@ def main():
         grpo_state,
         master_config,
     ) = setup(config, dataset, val_dataset)
+    # refit_policy_generation(policy, policy_generation)
     grpo_train(
         policy,
         policy_generation,
diff --git a/nemo_reinforcer/distributed/worker_groups.py b/nemo_reinforcer/distributed/worker_groups.py
index 9cc7798631..93d4808b75 100644
--- a/nemo_reinforcer/distributed/worker_groups.py
+++ b/nemo_reinforcer/distributed/worker_groups.py
@@ -125,6 +125,7 @@ def __call__(
             num_gpus=num_gpus,
             bundle_indices=bundle_indices,
         )
+        print("HERE worker_class", worker_class, "env_vars", env_vars)
 
         # Apply resource configuration
         if resources and "num_gpus" in resources:
diff --git a/nemo_reinforcer/models/policy/hf_policy.py b/nemo_reinforcer/models/policy/hf_policy.py
index 34a621063d..6478eda6e2 100644
--- a/nemo_reinforcer/models/policy/hf_policy.py
+++ b/nemo_reinforcer/models/policy/hf_policy.py
@@ -21,6 +21,8 @@
 import ray
 import torch
 from torch.distributed.device_mesh import init_device_mesh
+from torch.distributed.fsdp.api import CPUOffload
+from torch.distributed.fsdp._runtime_utils import _lazy_init
 from torch.distributed.fsdp import (
     FullyShardedDataParallel,
     FullStateDictConfig,
@@ -46,7 +48,163 @@
 from nemo_reinforcer.distributed.virtual_cluster import (
     PY_EXECUTABLES,
 )
-
+# def move_to_cpu(model):
+#     return model.to("cpu")
+
+# def move_to_cpu(model):
+#     for param in model.parameters():
+#         param.data = param.data.to("cpu")
+#     for buffer in model.buffers():
+#         buffer.data = buffer.data.to("cpu")
+#     if hasattr(model, "_fsdp_wrapped_module"):
+#         model._fsdp_wrapped_module.to("cpu")
+
+#     return model
+
+# def move_to_gpu(model):
+#     return model.to("cuda")
+
+def move_to_cpu(model):
+    # offload2
+    # if not isinstance(model, FullyShardedDataParallel):
+    #     return model.to("cpu")
+
+    for _, param in model.named_parameters():
+        param.data = param.data.to("cpu", non_blocking=True)
+        if hasattr(param, "_local_shard"):
+            # param._local_shard = param._local_shard.to("cpu", non_blocking=True)
+            param._local_shard = param.data
+        # param.data = param.data.to("cpu", non_blocking=True)
+        if param.grad is not None:
+            param.grad = param.grad.to("cpu", non_blocking=True)
+
+    # for buffer in model.buffers():
+    #     buffer.data = buffer.data.to("cpu")
+    # if torch.distributed.get_rank() == 0:
+    #     import pdb; pdb.set_trace()
+    if hasattr(model, "_fsdp_wrapped_module"):
+        move_to_cpu(model._fsdp_wrapped_module)
+        # model._fsdp_wrapped_module.to("cpu")
+
+    return model
+
+def move_to_gpu(model):
+    # offload2
+    # if not isinstance(model, FullyShardedDataParallel):
+    #     return model.to("cuda")
+
+    for _, param in model.named_parameters():
+        param.data = param.data.to("cuda", non_blocking=True)
+        if hasattr(param, "_local_shard"):
+            # param._local_shard = param._local_shard.to("cuda", non_blocking=True)
+            param._local_shard = param.data
+        # param.data = param.data.to("cuda", non_blocking=True)
+        if param.grad is not None:
+            param.grad = param.grad.to("cuda", non_blocking=True)
+
+    # for buffer in model.buffers():
+    #     buffer.data = buffer.data.to("cuda")
+    if hasattr(model, "_fsdp_wrapped_module"):
+        move_to_gpu(model._fsdp_wrapped_module)
+
+    return model
+
+
+# def move_to_cpu(model):
+#     if not isinstance(model, FullyShardedDataParallel):
+#         model = model.to("cpu")
+#         return model
+#     # lazy init FSDP model
+#     _lazy_init(model, model)
+#     # assert model._is_root, f"Only support root model offloading to CPU"
+#     # # if torch.distributed.get_rank() == 0:
+#     # #     import ipdb; ipdb.set_trace()
+#     for handle in model._all_handles:
+#         if handle._offload_params:
+#             continue
+#         flat_param = handle.flat_param
+#         # assert flat_param.data.data_ptr() == flat_param._local_shard.data_ptr() and \
+#         #     id(flat_param.data) != id(flat_param._local_shard) and \
+#         #     flat_param.data.size() == flat_param._local_shard.size()
+#         handle.flat_param_to(torch.device("cpu"), non_blocking=True)
+#         # the following still keeps id(._local_shard) != id(.data)
+#         flat_param._local_shard = flat_param.data
+#         # assert id(flat_param._local_shard) != id(flat_param.data)
+#     return model

+# def move_to_gpu(model):
+#     if not isinstance(model, FullyShardedDataParallel):
+#         model = model.to("cuda")
+#         return model
+#     # lazy init FSDP model
+#     _lazy_init(model, model)
+#     # assert model._is_root, f"Only support root model loading to GPU"
+#     device_id = torch.cuda.current_device()
+#     for handle in model._all_handles:
+#         if handle._offload_params:
+#             continue
+#         flat_param = handle.flat_param
+#         handle.flat_param_to(torch.device(f"cuda:{device_id}"), non_blocking=True)
+#         # the following still keeps id(._local_shard) != id(.data)
+#         flat_param._local_shard = flat_param.data
+#     return model
+
+
+# def move_to_cpu(model):
+#     if not isinstance(model, FullyShardedDataParallel):
+#         model = model.to("cpu")
+#         return model
+#     # lazy init FSDP model
+#     _lazy_init(model, model)
+#     assert model._is_root, f"Only support root model offloading to CPU"
+#     # if torch.distributed.get_rank() == 0:
+#     #     import ipdb; ipdb.set_trace()
+#     for handle in model._all_handles:
+#         if handle._offload_params:
+#             continue
+#         flat_param = handle.flat_param
+#         assert flat_param.data.data_ptr() == flat_param._local_shard.data_ptr() and \
+#             id(flat_param.data) != id(flat_param._local_shard) and \
+#             flat_param.data.size() == flat_param._local_shard.size()
+#         handle.flat_param_to(torch.device("cpu"), non_blocking=True)
+#         # the following still keeps id(._local_shard) != id(.data)
+#         flat_param._local_shard = flat_param.data
+#         assert id(flat_param._local_shard) != id(flat_param.data)
+
+#     for param in model.parameters():
+#         param.data = param.data.to("cpu")
+#     for buffer in model.buffers():
+#         buffer.data = buffer.data.to("cpu")
+#     if hasattr(model, "_fsdp_wrapped_module"):
+#         move_to_cpu(model._fsdp_wrapped_module)
+#         # model._fsdp_wrapped_module.to("cpu")
+
+#     return model
+
+# def move_to_gpu(model):
+#     if not isinstance(model, FullyShardedDataParallel):
+#         model = model.to("cuda")
+#         return model
+#     # lazy init FSDP model
+#     _lazy_init(model, model)
+#     assert model._is_root, f"Only support root model loading to GPU"
+#     device_id = torch.cuda.current_device()
+#     for handle in model._all_handles:
+#         if handle._offload_params:
+#             continue
+#         flat_param = handle.flat_param
+#         handle.flat_param_to(torch.device(f"cuda:{device_id}"), non_blocking=True)
+#         # the following still keeps id(._local_shard) != id(.data)
+#         flat_param._local_shard = flat_param.data
+
+#     for param in model.parameters():
+#         param.data = param.data.to("cuda")
+#     for buffer in model.buffers():
+#         buffer.data = buffer.data.to("cuda")
+#     if hasattr(model, "_fsdp_wrapped_module"):
+#         move_to_gpu(model._fsdp_wrapped_module)
+
+#     return model
 
 @ray.remote
 class HfPolicyWorker:
@@ -104,6 +262,15 @@ def __init__(
         # (Initialize device mesh, shard submodules, then shard entire model)
         # ------------------------------------------------
 
+        @staticmethod
+        def configure_worker(
+            num_gpus: int | float, bundle_indices: Optional[list] = None
+        ) -> tuple[dict, dict, dict]:
+            env_vars = {
+                "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"
+            }
+            return None, env_vars, None
+
         def do_fsdp(model):
             # Create a device mesh with 'world_size' GPUs in a 1D arrangement.
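             # A rough sketch of the intent, assuming a single node with
             # world_size == 2: init_device_mesh("cuda", (2,)) builds a 1-D
             # mesh over ranks 0-1, and FSDP then shards each auto-wrapped
             # submodule's flat parameter across that mesh, all-gathering
             # shards only around each forward/backward pass.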
             mesh = init_device_mesh("cuda", (world_size,))
@@ -113,20 +280,31 @@ def do_fsdp(model):
                 buffer_dtype=torch.float32,
             )
 
+            # return FullyShardedDataParallel(
+            #     model,
+            #     device_mesh=mesh,
+            #     auto_wrap_policy=size_based_auto_wrap_policy,
+            # )
             return FullyShardedDataParallel(
                 model,
                 device_mesh=mesh,
                 auto_wrap_policy=size_based_auto_wrap_policy,
                 mixed_precision=mp_policy,
+                # cpu_offload=CPUOffload(offload_params=True),
             )
 
-        self.model.to("cuda")
+        # self.model.to("cuda")
+        self.model = move_to_gpu(self.model)
         self.model = do_fsdp(self.model)
-        self.model = self.move_to_cpu(self.model)
-        self.reference_model.to("cuda")
+        self.model = move_to_cpu(self.model)
+        # self.reference_model.to("cuda")
+
+        self.reference_model = move_to_gpu(self.reference_model)
         self.reference_model = do_fsdp(self.reference_model)
-        self.reference_model = self.move_to_cpu(self.reference_model)
-        self.model.to("cuda")
+        self.reference_model = move_to_cpu(self.reference_model)
+
+        # self.model.to("cuda")
+        self.model = move_to_gpu(self.model)
         self._held_reference_model_params = None
         # register_fsdp_forward_method(self.model, "generate")
         if init_optimizer:
@@ -422,8 +600,9 @@ def use_reference_model(self):
             original_model = self.model
             original_reference_model = self.reference_model
 
-            self.model = self.move_to_cpu(self.model)
-            self.reference_model = self.reference_model.to("cuda")
+            self.model = move_to_cpu(self.model)
+            # self.reference_model = self.reference_model.to("cuda")
+            self.reference_model = move_to_gpu(self.reference_model)
 
             # Swap the references
             self.model, self.reference_model = self.reference_model, self.model
@@ -436,8 +615,9 @@ def use_reference_model(self):
 
         finally:
             # Restore original references and device placement
-            self.reference_model = self.move_to_cpu(original_reference_model)
-            self.model = original_model.to("cuda")
+            self.reference_model = move_to_cpu(original_reference_model)
+            # self.model = original_model.to("cuda")
+            self.model = move_to_gpu(original_model)
             gc.collect()
             torch.cuda.empty_cache()
 
@@ -704,19 +884,21 @@ def get_weight_ipc_handles(self, offload_model=True):
             data[name] = reduce_tensor(p.detach())
 
         if offload_model:
-            self.model = self.move_to_cpu(self.model)
+            self.model = move_to_cpu(self.model)
             gc.collect()
             torch.cuda.empty_cache()
 
         return {self.device_uuid: data}
 
     def prepare_for_lp_inference(self):
-        self.model.to("cuda")
+        # self.model.to("cuda")
+        self.model = move_to_gpu(self.model)
         self.model.eval()
         self.offload_before_refit()
 
     def prepare_for_training(self, *args, **kwargs):
         # onload models and optimizer state to cuda
-        self.model.to("cuda")
+        # self.model.to("cuda")
+        self.model = move_to_gpu(self.model)
         self.model.train()
 
         # Move optimizer state to CUDA if it exists
@@ -754,7 +936,7 @@ def offload_before_refit(self):
     @torch.no_grad()
     def offload_after_refit(self):
         # Offload as much as possible on the CPU
-        self.model = self.move_to_cpu(self.model)
+        self.model = move_to_cpu(self.model)
         self.model.eval()
         torch.randn(1).cuda()  # wake up torch allocator
         self.offload_before_refit()  # rerun the old offload function
@@ -771,8 +953,8 @@ def offload_after_refit(self):
         print(
             f"GPU Memory after refit complete: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved"
         )
-
-    def move_to_cpu(self, model):
+
+    def move_to_cpu_old1(self, model):
         for param in model.parameters():
             param.data = param.data.to("cpu")
 
From 39cd9561d7224e5da673faf899b5233a58826897 Mon Sep 17 00:00:00 2001
From: Parth Chadha
Date: Tue, 25 Mar 2025 17:12:24 -0700
Subject: [PATCH 02/27] fix: Remove reference of tokenizer from generation
 backend (#75) (#82)

Signed-off-by: Parth Chadha
---
 docs/design_docs/generation.md                | 14 +++++-
 examples/run_grpo_math.py                     |  5 ++-
 nemo_reinforcer/algorithms/grpo.py            |  7 +++
 .../models/generation/interfaces.py           |  4 +-
 nemo_reinforcer/models/generation/vllm.py     | 33 ++++++++++----
 .../models/generation/test_vllm_generation.py | 44 ++++++++++++++++++-
 6 files changed, 91 insertions(+), 16 deletions(-)

diff --git a/docs/design_docs/generation.md b/docs/design_docs/generation.md
index e9fa3ee2ee..8dda8a028a 100644
--- a/docs/design_docs/generation.md
+++ b/docs/design_docs/generation.md
@@ -95,20 +95,30 @@ The {py:class}`UpdatableVllmInternalWorker
 Tuple[
diff --git a/nemo_reinforcer/algorithms/grpo.py b/nemo_reinforcer/algorithms/grpo.py
--- a/nemo_reinforcer/algorithms/grpo.py
+++ b/nemo_reinforcer/algorithms/grpo.py
@@ -219,6 +220,12 @@ def setup(
     # vllm model loading prefers clean environment, initialize policy_generation before policy (#52 will fix this)
     backend = generation_config["backend"]
     generation_config["model_name"] = policy_config["model_name"]  # Needed for vLLM
+    generation_config["vllm_cfg"]["skip_tokenizer_init"] = True
+    # When https://github.com/NVIDIA/reinforcer/issues/57 is fixed, we should update stop_token_ids below.
+    generation_config["stop_token_ids"] = [tokenizer.eos_token_id]
+    generation_config["pad_token"] = tokenizer.pad_token_id
+    generation_config["vllm_cfg"]["load_format"] = "dummy"
+
     if backend == "hf":
         policy_generation = None
         print(f"  ✓ Using HF backend for generation with {policy_config['model_name']}")
diff --git a/nemo_reinforcer/models/generation/interfaces.py b/nemo_reinforcer/models/generation/interfaces.py
index 8ffa1d2945..138b70fbc1 100644
--- a/nemo_reinforcer/models/generation/interfaces.py
+++ b/nemo_reinforcer/models/generation/interfaces.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from abc import ABC, abstractmethod
-from typing import Any, TypedDict, Union, Tuple
+from typing import Any, TypedDict, Union, Tuple, List
 
 import torch
 
 from nemo_reinforcer.distributed.batched_data_dict import BatchedDataDict
@@ -101,6 +101,8 @@ class GenerationConfig(TypedDict):
     top_p: float
     top_k: int
     model_name: str
+    stop_token_ids: List[int]
+    pad_token: int
 
 
 class GenerationDatumSpec(TypedDict):
diff --git a/nemo_reinforcer/models/generation/vllm.py b/nemo_reinforcer/models/generation/vllm.py
index ebbe53cff5..cb60b7fe8c 100644
--- a/nemo_reinforcer/models/generation/vllm.py
+++ b/nemo_reinforcer/models/generation/vllm.py
@@ -39,6 +39,9 @@ class VllmSpecificArgs(TypedDict):
     tensor_parallel_size: int
     gpu_memory_utilization: float
     max_model_len: int
+    # Additional arguments for vLLM inserted by reinforcer based on the context of when vllm is used
+    skip_tokenizer_init: bool
+    load_format: str
 
 
 class VllmConfig(GenerationConfig):
@@ -110,6 +113,7 @@ def __init__(
                 Only needed for the first worker in each tied worker group.
         """
         self.cfg = config
+
         self.model_name = self.cfg["model_name"]
         self.tensor_parallel_size = self.cfg["vllm_cfg"]["tensor_parallel_size"]
         self.gpu_memory_utilization = self.cfg["vllm_cfg"]["gpu_memory_utilization"]
@@ -166,9 +170,11 @@ def __init__(
 
         self.llm = LLM(
             model=self.model_name,
-            load_format="dummy",
-            tensor_parallel_size=self.tensor_parallel_size,
-            gpu_memory_utilization=self.gpu_memory_utilization,
+            # Training pipeline will set this to "dummy" and eval will load real weights using 'auto'
+            load_format=self.cfg["vllm_cfg"]["load_format"],
+            skip_tokenizer_init=self.cfg["vllm_cfg"]["skip_tokenizer_init"],
+            tensor_parallel_size=self.cfg["vllm_cfg"]["tensor_parallel_size"],
+            gpu_memory_utilization=self.cfg["vllm_cfg"]["gpu_memory_utilization"],
             enable_prefix_caching=True,
             dtype="auto",
             enforce_eager=True,
@@ -176,13 +182,10 @@ def __init__(
             trust_remote_code=True,
             worker_cls=UpdatableVllmInternalWorker,
             enable_sleep_mode=True,
+            disable_log_stats=True,
             **vllm_kwargs,
         )
 
-        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
-        if self.tokenizer.pad_token is None:
-            self.tokenizer.pad_token = self.tokenizer.eos_token
-
     def llm(self):
         return self.llm
 
@@ -213,7 +216,7 @@ def generate(
                 f"input_ids and input_lengths must be present in the BatchedDataDict, got keys: {data.keys()}"
             )
         is_right_padded, error_msg = verify_right_padding(
-            data, pad_value=self.tokenizer.pad_token_id
+            data, pad_value=self.cfg["pad_token"]
         )
         if not is_right_padded:
             warnings.warn(
@@ -251,6 +254,7 @@ def generate(
             max_tokens=self.cfg["max_new_tokens"],
             logprobs=0,  # Return logprobs for the generated tokens
             stop=None,
+            stop_token_ids=self.cfg["stop_token_ids"],
         )
 
         # Generate outputs
@@ -276,7 +280,7 @@ def generate(
 
             # Create a new tensor with the right size and fill with padding token
             full_output = torch.full(
-                (total_length,), self.tokenizer.pad_token_id, dtype=input_ids.dtype
+                (total_length,), self.cfg["pad_token"], dtype=input_ids.dtype
             )
 
             # Copy original input (with padding) into the beginning
@@ -402,6 +406,17 @@ def __init__(
        """Initialize a vLLM policy with distributed workers."""
         # Store config
         self.cfg = config
+        # Ensure all required VllmConfig fields are present
+        missing_keys = [
+            key for key in VllmConfig.__annotations__ if key not in self.cfg
+        ]
+        assert not missing_keys, (
+            f"VLLM Configuration Error: Missing required keys in VllmConfig.\n"
+            f"Missing keys: {', '.join(missing_keys)}\n"
+            f"Provided keys: {', '.join(self.cfg.keys())}\n"
+            f"Please update your configuration to include all required VLLM parameters."
+        )
+
         self.tensor_parallel_size = self.cfg["vllm_cfg"]["tensor_parallel_size"]
 
         # Create worker builder for VllmGenerationWorker
diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py
index 9c89046af2..af8b5698e3 100644
--- a/tests/unit/models/generation/test_vllm_generation.py
+++ b/tests/unit/models/generation/test_vllm_generation.py
@@ -25,6 +25,7 @@
 
 # Define basic vLLM test config
 basic_vllm_test_config: VllmConfig = {
+    "backend": "vllm",
     "model_name": "meta-llama/Llama-3.2-1B",  # Small model for testing
     "dtype": "bfloat16",
     "max_new_tokens": 10,
@@ -39,6 +40,15 @@
 }
 
 
+def configure_vllm_with_tokenizer(vllm_config, tokenizer):
+    """Apply tokenizer-specific configurations to vLLM config."""
+    vllm_config["vllm_cfg"]["skip_tokenizer_init"] = True
+    vllm_config["vllm_cfg"]["load_format"] = "dummy"
+    vllm_config["pad_token"] = tokenizer.pad_token_id
+    vllm_config["stop_token_ids"] = [tokenizer.eos_token_id]
+    return vllm_config
+
+
 @pytest.fixture(scope="module")
 def check_vllm_available():
     """Skip tests if vLLM is not installed."""
@@ -74,9 +84,12 @@ def tokenizer():
 
 
 @pytest.fixture(scope="function")
-def policy(cluster, check_vllm_available):
+def policy(cluster, tokenizer, check_vllm_available):
     """Initialize the vLLM policy."""
-    policy = VllmGeneration(cluster, basic_vllm_test_config)
+    # Create separate configs for each policy
+    vllm_config = basic_vllm_test_config.copy()
+    vllm_config = configure_vllm_with_tokenizer(vllm_config, tokenizer)
+    policy = VllmGeneration(cluster, vllm_config)
     yield policy
 
     # Ensure policy is properly shutdown
@@ -121,6 +134,30 @@ def test_input_data(tokenizer):
     )
 
 
+def test_vllm_missing_required_config_key(cluster, check_vllm_available):
+    """Test that an assertion error is raised when a required config key is missing."""
+    # Create a config missing a required key by removing 'model_name'
+    incomplete_config = basic_vllm_test_config.copy()
+    del incomplete_config["model_name"]  # Remove a required key
+
+    # Also need to ensure skip_tokenizer_init and load_format are there
+    # since these are checked in VllmConfig.__annotations__
+    incomplete_config["skip_tokenizer_init"] = True
+    incomplete_config["load_format"] = "auto"
+
+    # Attempt to initialize VllmGeneration with incomplete config - should raise AssertionError
+    with pytest.raises(AssertionError) as excinfo:
+        VllmGeneration(cluster, incomplete_config)
+
+    # Verify the error message contains information about the missing key
+    error_message = str(excinfo.value)
+    assert "Missing required keys in VllmConfig" in error_message
+    assert "model_name" in error_message, (
+        "Error should mention the missing 'model_name' key"
+    )
+    print(f"Successfully caught missing config key with error: {error_message}")
+
+
 def test_vllm_policy_generation(policy, test_input_data, tokenizer):
     """Test vLLM policy generation capabilities."""
     # Test generation
@@ -171,6 +208,7 @@ def test_vllm_generation_with_hf_training(cluster, tokenizer):
 
     # Create separate configs for each policy
     vllm_config = basic_vllm_test_config.copy()
+    vllm_config = configure_vllm_with_tokenizer(vllm_config, tokenizer)
 
     # Create HF-specific config with required parameters
     hf_config = {
@@ -359,6 +397,7 @@ def test_vllm_policy_tensor_parallel(cluster, tokenizer):
     """Test vLLM policy with tensor parallelism > 1."""
     # Configure with tensor_parallel_size=2
     tp_config = basic_vllm_test_config.copy()
+    tp_config = configure_vllm_with_tokenizer(tp_config, tokenizer)
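+    # With skip_tokenizer_init=True vLLM never constructs a tokenizer, so the
+    # pad/stop token ids it would otherwise derive must be supplied up front;
+    # configure_vllm_with_tokenizer above fills exactly those config fields.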
     tp_config["tensor_parallel_size"] = 2
 
     # Ensure we specify the distributed executor backend
@@ -420,6 +459,7 @@ def test_vllm_policy_weight_update(cluster, tokenizer, tensor_parallel_size):
 
     # Create separate configs for each policy
     vllm_config = basic_vllm_test_config.copy()
+    vllm_config = configure_vllm_with_tokenizer(vllm_config, tokenizer)
     vllm_config["tensor_parallel_size"] = tensor_parallel_size
 
     # Add vllm_kwargs only if using tensor parallelism

From f8e51a5dafd4565d432088cd6b82a3e1d5f92912 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Tue, 25 Mar 2025 22:02:15 -0700
Subject: [PATCH 03/27] feat: unit test metric tracking (#40)

Signed-off-by: Terry Kong
Signed-off-by: Yi-Fu Wu
---
 .github/workflows/_run_test.yml               |  18 +-
 .github/workflows/cicd-main.yml               |   7 +
 docs/testing.md                               |  65 +++++++
 nemo_reinforcer/utils/logger.py               |  96 ++++++++--
 tests/run_unit.sh                             |   2 +-
 tests/unit/conftest.py                        | 168 ++++++++++++++++++
 .../unit/models/policy/test_hf_ray_policy.py  |   4 +-
 tests/unit/utils/test_logger.py               |   8 +-
 8 files changed, 347 insertions(+), 21 deletions(-)

diff --git a/.github/workflows/_run_test.yml b/.github/workflows/_run_test.yml
index 813719503c..3081d9b204 100644
--- a/.github/workflows/_run_test.yml
+++ b/.github/workflows/_run_test.yml
@@ -27,11 +27,16 @@ on:
         default: 10
       SCRIPT:
         type: string
-        description: Test script to execute
+        description: Test script to execute in container
         required: true
       AFTER_SCRIPT:
         type: string
-        description: Script to run after main test
+        description: Script to run after main test in container
         required: false
         default: ":"
+      FINAL_SCRIPT_EXTERNAL:
+        type: string
+        description: Script to run after SCRIPT and AFTER_SCRIPT, but outside container (useful for logging)
+        required: false
+        default: ":"
       IS_OPTIONAL:
@@ -163,6 +168,15 @@ jobs:
           )
           docker exec nemo_container_${{ github.run_id }} bash -eux -o pipefail -c "$cmd"
 
+      - name: final_script_external
+        if: always() && inputs.FINAL_SCRIPT_EXTERNAL != ':'
+        run: |
+          cmd=$(cat <<"RUN_TEST_EOF"
+          ${{ inputs.FINAL_SCRIPT_EXTERNAL }}
+          RUN_TEST_EOF
+          )
+          bash -eux -o pipefail -c "$cmd"
+
       - name: Container shutdown
         if: always()
         run: |
diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 0e2a289c62..05dcefaf86 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -154,6 +154,13 @@ jobs:
       SCRIPT: |
         cd ${REINFORCER_REPO_DIR}
         uv run --extra test bash -x ./tests/run_unit.sh
+      FINAL_SCRIPT_EXTERNAL: |
+        cat <
diff --git a/nemo_reinforcer/utils/logger.py b/nemo_reinforcer/utils/logger.py
 None:
         self.run.config.update(params)
 
 
+class GpuMetricSnapshot(TypedDict):
+    step: int
+    metrics: Dict[str, Any]
+
+
 class RayGpuMonitorLogger:
     """Monitor GPU utilization across a Ray cluster and log metrics to a parent logger."""
 
@@ -163,7 +168,9 @@ def __init__(
         self.collection_interval = collection_interval
         self.flush_interval = flush_interval
         self.parent_logger = parent_logger
-        self.metrics_buffer = []  # Store metrics with timestamps
+        self.metrics_buffer: list[
+            GpuMetricSnapshot
+        ] = []  # Store metrics with timestamps
         self.last_flush_time = time.time()
         self.is_running = False
         self.collection_thread = None
@@ -228,7 +235,9 @@ def _collection_loop(self):
                 time.sleep(self.collection_interval)
 
             except Exception as e:
-                print(f"Error in GPU monitoring collection loop: {e}")
+                print(
+                    f"Error in GPU monitoring collection loop or stopped abruptly: {e}"
+                )
                 time.sleep(self.collection_interval)  # Continue despite errors
 
@@ -241,7 +250,6 @@ def _parse_gpu_metric(self, sample: Sample, node_idx: int) -> Dict[str, Any]:
         Returns:
             Dictionary with metric name and value
         """
-        # TODO: Consider plumbing {'GpuDeviceName': 'NVIDIA H100 80GB HBM3'}
         # Expected labels for GPU metrics
         expected_labels = ["GpuIndex"]
         for label in expected_labels:
@@ -266,12 +274,72 @@ def _parse_gpu_metric(self, sample: Sample, node_idx: int) -> Dict[str, Any]:
         metric_name = f"node.{node_idx}.gpu.{index}.{metric_name}"
         return {metric_name: value}
 
+    def _parse_gpu_sku(self, sample: Sample, node_idx: int) -> Dict[str, str]:
+        """Parse the GPU device name (SKU) from a metric sample.
+
+        Args:
+            sample: Prometheus metric sample
+            node_idx: Index of the node
+
+        Returns:
+            Dictionary mapping a metric name to the GPU device name
+        """
+        # TODO: Consider plumbing {'GpuDeviceName': 'NVIDIA H100 80GB HBM3'}
+        # Expected labels for GPU metrics
+        expected_labels = ["GpuIndex", "GpuDeviceName"]
+        for label in expected_labels:
+            if label not in sample.labels:
+                # This is probably a CPU node
+                return {}
+
+        metric_name = sample.name
+        # Only return SKU if the metric is one of these which publish these metrics
+        if (
+            metric_name != "ray_node_gpus_utilization"
+            and metric_name != "ray_node_gram_used"
+        ):
+            # Skip unexpected metrics
+            return {}
+
+        labels = sample.labels
+        index = labels["GpuIndex"]
+        value = labels["GpuDeviceName"]
+
+        metric_name = f"node.{node_idx}.gpu.{index}.type"
+        return {metric_name: value}
+
+    def _collect_gpu_sku(self) -> Dict[str, str]:
+        """Collect GPU SKU from all Ray nodes.
+
+        Note: This is an internal API and users are not expected to call this.
+
+        Returns:
+            Dictionary of SKU types on all Ray nodes
+        """
+        # TODO: We can re-use the same path for metrics because even though both utilization and memory metrics duplicate
+        # the GPU metadata information; since the metadata is the same for each node, we can overwrite it and expect them to
+        # be the same
+        return self._collect(sku=True)
+
     def _collect_metrics(self) -> Dict[str, Any]:
         """Collect GPU metrics from all Ray nodes.
 
         Returns:
             Dictionary of collected metrics
         """
+        return self._collect(metrics=True)
+
+    def _collect(self, metrics: bool = False, sku: bool = False) -> Dict[str, Any]:
+        """Collect GPU metrics or SKUs from all Ray nodes.
+
+        Returns:
+            Dictionary of collected metrics
+        """
+        assert metrics ^ sku, (
+            f"Must collect either metrics or sku, not both: {metrics=}, {sku=}"
+        )
+        parser_fn = self._parse_gpu_metric if metrics else self._parse_gpu_sku
+
         if not ray.is_initialized():
             print("Ray is not initialized. Cannot collect GPU metrics.")
             return {}
@@ -295,7 +363,9 @@ def _collect_metrics(self) -> Dict[str, Any]:
         # Process each node's metrics
         collected_metrics = {}
         for node_idx, metric_address in enumerate(unique_metric_addresses):
-            gpu_metrics = self._fetch_and_parse_metrics(node_idx, metric_address)
+            gpu_metrics = self._fetch_and_parse_metrics(
+                node_idx, metric_address, parser_fn
+            )
             collected_metrics.update(gpu_metrics)
 
         return collected_metrics
@@ -304,7 +374,7 @@ def _collect_metrics(self) -> Dict[str, Any]:
             print(f"Error collecting GPU metrics: {e}")
             return {}
 
-    def _fetch_and_parse_metrics(self, node_idx, metric_address):
+    def _fetch_and_parse_metrics(self, node_idx, metric_address, parser_fn):
         """Fetch metrics from a node and parse GPU metrics.
 
         Args:
@@ -335,7 +405,7 @@ def _fetch_and_parse_metrics(self, node_idx, metric_address):
                     continue
 
                 for sample in family.samples:
-                    metrics = self._parse_gpu_metric(sample, node_idx)
+                    metrics = parser_fn(sample, node_idx)
                     gpu_metrics.update(metrics)
 
             return gpu_metrics
@@ -346,18 +416,16 @@ def _fetch_and_parse_metrics(self, node_idx, metric_address):
 
     def flush(self):
         """Flush collected metrics to the parent logger."""
-        if not self.parent_logger:
-            return
-
         with self.lock:
             if not self.metrics_buffer:
                 return
 
-            # Log each set of metrics with its original step
-            for entry in self.metrics_buffer:
-                step = entry["step"]
-                metrics = entry["metrics"]
-                self.parent_logger.log_metrics(metrics, step, prefix="ray")
+            if self.parent_logger:
+                # Log each set of metrics with its original step
+                for entry in self.metrics_buffer:
+                    step = entry["step"]
+                    metrics = entry["metrics"]
+                    self.parent_logger.log_metrics(metrics, step, prefix="ray")
 
             # Clear buffer after logging
             self.metrics_buffer = []
diff --git a/tests/run_unit.sh b/tests/run_unit.sh
index e826a18a60..f51ff49ff6 100755
--- a/tests/run_unit.sh
+++ b/tests/run_unit.sh
@@ -33,7 +33,7 @@ export RAY_DEDUP_LOGS=0
 
 # Run unit tests
 echo "Running unit tests..."
-if ! pytest unit/ -s -rA "$@"; then
+if ! pytest unit/ --cov=nemo_reinforcer --cov-report=term --cov-report=json -s -rA "$@"; then
     echo "[ERROR]: Unit tests failed."
     exit 1
 fi
diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
index 67de3a36af..b13d6588a4 100644
--- a/tests/unit/conftest.py
+++ b/tests/unit/conftest.py
@@ -11,7 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from io import StringIO
+import time
 import pytest
+from nemo_reinforcer.utils.logger import GPUMonitoringConfig
+from tests import unit
 import torch
 import torch.distributed as dist
 import torch.multiprocessing as mp
@@ -28,7 +32,153 @@
 import random
 from typing import Callable
 import ray
+import json
 from nemo_reinforcer.distributed.virtual_cluster import init_ray
+from typing import TypedDict
+from datetime import datetime
+
+dir_path = os.path.dirname(os.path.abspath(__file__))
+
+UNIT_RESULTS_FILE = os.path.join(dir_path, "unit_results.json")
+UNIT_RESULTS_FILE_DATED = os.path.join(
+    dir_path, f"unit_results/{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+)
+
+
+class UnitTestData(TypedDict):
+    exit_status: int | str
+    git_commit: str
+    start_time: str
+    metrics: dict
+    gpu_types: list[str]
+    coverage: dict
+
+
+def pytest_sessionstart(session):
+    # Delete the unit results file at the start of a new test session
+    if os.path.exists(UNIT_RESULTS_FILE):
+        try:
+            os.remove(UNIT_RESULTS_FILE)
+            print(f"Deleted existing results file: {UNIT_RESULTS_FILE}")
+        except Exception as e:
+            print(f"Warning: Failed to delete results file: {e}")
+
+    # Get the git commit hash
+    try:
+        import subprocess
+
+        result = subprocess.run(
+            ["git", "-C", dir_path, "rev-parse", "HEAD"],
+            capture_output=True,
+            text=True,
+            check=True,
+        )
+        git_commit = result.stdout.strip()
+    except Exception as e:
+        git_commit = f"Error getting git commit: {str(e)}"
+
+    session.config._unit_test_data = UnitTestData(
+        exit_status="was not set",
+        git_commit=git_commit,
+        start_time=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+        metrics={},
+        gpu_types=[],
+        coverage={},
+    )
+
+
+@pytest.fixture(scope="session", autouse=True)
+def session_data(request, init_ray_cluster):
+    """Session-level fixture to store and save metrics data.
+
+    This fixture tracks both metrics from tests and metadata about the test environment.
+    The metrics are stored in the 'metrics' dictionary.
+
+    It's set to autouse so that we track metadata and coverage even if no selected test
+    explicitly tracks metrics.
+    """
+    # Pass init_ray_cluster so that we can access ray metadata
+
+    ############################################################
+    # 1. Gather all the unit test data                         #
+    ############################################################
+    unit_test_data: UnitTestData = request.config._unit_test_data
+    yield unit_test_data
+
+    ############################################################
+    # 2. Gather the ray metadata                               #
+    ############################################################
+    from nemo_reinforcer.utils.logger import RayGpuMonitorLogger
+
+    logger = RayGpuMonitorLogger(
+        collection_interval=float("inf"),
+        flush_interval=float("inf"),
+        parent_logger=None,
+    )
+    unit_test_data["gpu_types"] = list(set(logger._collect_gpu_sku().values()))
+
+    ############################################################
+    # 3. Gather the coverage data                              #
+    ############################################################
+    # We directly access the coverage controller from the plugin manager
+    # so we can access the coverage total before the pytest session finishes.
+    if request.config.pluginmanager.hasplugin("_cov"):
+        plugin = request.config.pluginmanager.getplugin("_cov")
+        if plugin.cov_controller:
+            cov_controller = plugin.cov_controller
+            # We currently don't use the cov_report since we can always access the coverage.json later, but
+            # in the future if we want to report the coverage more granularly as part of the session finish,
+            # we can access it here.
+            cov_report = StringIO()
+            cov_total = cov_controller.summary(cov_report)
+            unit_test_data["coverage"] = cov_total
+
+
+@pytest.fixture
+def tracker(request, session_data, ray_gpu_monitor):
+    """Test-level fixture that automatically captures test function info."""
+    # Get fully qualified test name (module::test_function)
+    module_name = request.module.__name__
+    test_name = request.function.__name__
+    qualified_name = f"{module_name}::{test_name}"
+
+    # Initialize an empty dict for this test if it doesn't exist
+    if qualified_name not in session_data:
+        session_data["metrics"][qualified_name] = {}
+
+    class Tracker:
+        def track(self, metric_name: str, value):
+            """Tracking an arbitrary metric."""
+            session_data["metrics"][qualified_name][metric_name] = value
+
+        def get_max_mem(self):
+            metrics = ray_gpu_monitor._collect_metrics()
+            max_mem = 0
+            for m_name, m_value in metrics.items():
+                if m_name.endswith(".memory"):
+                    max_mem = max(max_mem, m_value)
+            return max_mem
+
+        def log_max_mem(self, metric_name: str):
+            session_data["metrics"][qualified_name][metric_name] = self.get_max_mem()
+
+    start_time = time.time()
+    yield Tracker()
+    end_time = time.time()
+    # Prefix with `_` to indicate it's automatically collected
+    session_data["metrics"][qualified_name]["_elapsed"] = end_time - start_time
+
+
+def pytest_sessionfinish(session, exitstatus):
+    data = session.config._unit_test_data
+    data["exit_status"] = exitstatus
+    print(f"\nSaving unit test data to {UNIT_RESULTS_FILE}")
+    print(f"and saving to {UNIT_RESULTS_FILE_DATED}")
+    with open(UNIT_RESULTS_FILE, "w") as f:
+        json.dump(data, f, indent=2)
+    os.makedirs(os.path.dirname(UNIT_RESULTS_FILE_DATED), exist_ok=True)
+    with open(UNIT_RESULTS_FILE_DATED, "w") as f:
+        json.dump(data, f, indent=2)
 
 
 @pytest.fixture(scope="session", autouse=True)
@@ -42,6 +192,24 @@ def init_ray_cluster():
     ray.shutdown()
 
 
+@pytest.fixture(scope="session", autouse=True)
+def ray_gpu_monitor(init_ray_cluster):
+    """Initialize Ray for the test module and clean up afterward.
+
+    This fixture doesn't need to be called directly.
+    """
+    from nemo_reinforcer.utils.logger import RayGpuMonitorLogger
+
+    gpu_monitor = RayGpuMonitorLogger(
+        collection_interval=1,
+        flush_interval=float("inf"),  # Disabling flushing since we will do it manually
+        parent_logger=None,
+    )
+    gpu_monitor.start()
+    yield gpu_monitor
+    gpu_monitor.stop()
+
+
 def _setup_distributed(rank, world_size, port, backend="nccl"):
     """Initialize the distributed environment for a test (internal use only)"""
     os.environ["MASTER_ADDR"] = "localhost"
diff --git a/tests/unit/models/policy/test_hf_ray_policy.py b/tests/unit/models/policy/test_hf_ray_policy.py
index ae80b3fd1e..29d8cbcbee 100644
--- a/tests/unit/models/policy/test_hf_ray_policy.py
+++ b/tests/unit/models/policy/test_hf_ray_policy.py
@@ -364,7 +364,7 @@ def generation_setup():
 
 
 @pytest.mark.timeout(180)
-def test_hf_policy_generation(generation_setup):
+def test_hf_policy_generation(generation_setup, tracker):
     policy, cluster, data, tokenizer, prompts, expected_generations = generation_setup
 
     # Verify resources were created properly
@@ -415,6 +415,7 @@ def test_hf_policy_generation(generation_setup):
         torch.exp(torch.abs(results["logprobs"] - fprop_results["logprobs"]))
     )
     print(f"avg prob mult error: {avg_prob_mult_error}")
+    tracker.track("avg_prob_mult_error", float(avg_prob_mult_error))
     assert avg_prob_mult_error <= 1.025
 
     # get logprobs for the expected generations
@@ -439,6 +440,7 @@ def test_hf_policy_generation(generation_setup):
 
     expected_logprobs = policy.get_logprobs(expected_data)["logprobs"]
     mean_lps = torch.mean(expected_logprobs * expected_tokenized["attention_mask"])
+    tracker.track("mean_lps", float(mean_lps))
     assert mean_lps > -1.7, "Expected logprobs should be greater than -1.7"
     assert mean_lps < -1.4, "Expected logprobs should be less than -1.4"
 
diff --git a/tests/unit/utils/test_logger.py b/tests/unit/utils/test_logger.py
index a8d982266b..dd1c24fb27 100644
--- a/tests/unit/utils/test_logger.py
+++ b/tests/unit/utils/test_logger.py
@@ -417,7 +417,9 @@ def test_fetch_and_parse_metrics(self, mock_get, mock_ray):
 
         # Call the method
         result = monitor._fetch_and_parse_metrics(
-            node_idx=2, metric_address="test_ip:test_port"
+            node_idx=2,
+            metric_address="test_ip:test_port",
+            parser_fn=monitor._parse_gpu_metric,
         )
 
         # Verify request was made correctly
@@ -460,8 +462,8 @@ def test_collect_metrics(self, mock_ray):
 
         # Verify _fetch_and_parse_metrics was called for each node
         assert mock_fetch.call_count == 2
-        mock_fetch.assert_any_call(0, "10.0.0.1:8080")
-        mock_fetch.assert_any_call(1, "10.0.0.2:8080")
+        mock_fetch.assert_any_call(0, "10.0.0.1:8080", monitor._parse_gpu_metric)
+        mock_fetch.assert_any_call(1, "10.0.0.2:8080", monitor._parse_gpu_metric)
 
         # Verify the result combines metrics from all nodes
         assert result == {

From 778f5b0927fcc9b58b82b2381d7d6701e6a95447 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Wed, 26 Mar 2025 10:29:07 -0700
Subject: [PATCH 04/27] fix: unit test error when coverage wasn't specified
 (#88)

Signed-off-by: Terry Kong
Signed-off-by: Yi-Fu Wu
---
 tests/unit/conftest.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
index b13d6588a4..1dbbd6a169 100644
--- a/tests/unit/conftest.py
+++ b/tests/unit/conftest.py
@@ -51,7 +51,7 @@ class UnitTestData(TypedDict):
     start_time: str
     metrics: dict
     gpu_types: list[str]
-    coverage: dict
+    coverage: str
 
 
 def pytest_sessionstart(session):
@@ -83,7 +83,7 @@ def pytest_sessionstart(session):
         start_time=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
         metrics={},
         gpu_types=[],
-        coverage={},
+        coverage="[n/a] run with --cov=nemo_reinforcer",
     )
 
 
@@ -122,10 +122,16 @@ def session_data(request, init_ray_cluster):
     ############################################################
     # We directly access the coverage controller from the plugin manager
     # so we can access the coverage total before the pytest session finishes.
+    cov_controller = None
     if request.config.pluginmanager.hasplugin("_cov"):
         plugin = request.config.pluginmanager.getplugin("_cov")
         if plugin.cov_controller:
             cov_controller = plugin.cov_controller
+
+    if not cov_controller:
+        # Means the user didn't run with --cov=...
+        return
+
     # We currently don't use the cov_report since we can always access the coverage.json later, but
     # in the future if we want to report the coverage more granularly as part of the session finish,
     # we can access it here.

From c46581ea74e3a87f985848abe5c8bcf7468d0437 Mon Sep 17 00:00:00 2001
From: Terry Kong
Date: Wed, 26 Mar 2025 11:54:25 -0700
Subject: [PATCH 05/27] ci: temporarily disable CI on main since PRs must be
 up to date before merge (#91)

Signed-off-by: Terry Kong
Signed-off-by: Yi-Fu Wu
---
 .github/workflows/cicd-main.yml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 05dcefaf86..2d2255e7f2 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -28,9 +28,10 @@ on:
         default: all
         type: string
         description: Comma-separated list of tests to run. Use "all" to run the full test suite.
-  push:
-    branches:
-      - 'main'
+  # TODO: Due to limited compute, disabling pushes to main. This is okay to do since we force PRs to be up to date and the CI tests on pull/$PR_NUM/merge
+  #push:
+  #  branches:
+  #    - 'main'
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}

From dc9d857cb616125b1c2bd20361ecd9628a1915c1 Mon Sep 17 00:00:00 2001
From: Yi-Fu Wu
Date: Wed, 26 Mar 2025 18:29:58 -0700
Subject: [PATCH 06/27] Cleanup + expandable_segments

Signed-off-by: Yi-Fu Wu
---
 nemo_reinforcer/distributed/worker_groups.py |   4 +-
 nemo_reinforcer/models/policy/hf_policy.py   | 177 ++-----------------
 2 files changed, 12 insertions(+), 169 deletions(-)

diff --git a/nemo_reinforcer/distributed/worker_groups.py b/nemo_reinforcer/distributed/worker_groups.py
index 93d4808b75..2497065b0f 100644
--- a/nemo_reinforcer/distributed/worker_groups.py
+++ b/nemo_reinforcer/distributed/worker_groups.py
@@ -125,7 +125,6 @@ def __call__(
             num_gpus=num_gpus,
             bundle_indices=bundle_indices,
         )
-        print("HERE worker_class", worker_class, "env_vars", env_vars)
 
         # Apply resource configuration
         if resources and "num_gpus" in resources:
@@ -135,7 +134,8 @@ def __call__(
         if env_vars:
             if "runtime_env" not in options:
                 options["runtime_env"] = {}
-            options["runtime_env"]["env_vars"] = env_vars
+            for k, v in env_vars.items():
+                options["runtime_env"]["env_vars"][k] = v
 
         # Apply initialization parameters
         if init_kwargs:
diff --git a/nemo_reinforcer/models/policy/hf_policy.py b/nemo_reinforcer/models/policy/hf_policy.py
index 6478eda6e2..08eb9061ce 100644
--- a/nemo_reinforcer/models/policy/hf_policy.py
+++ b/nemo_reinforcer/models/policy/hf_policy.py
@@ -21,8 +21,6 @@
 import ray
 import torch
 from torch.distributed.device_mesh import init_device_mesh
-from torch.distributed.fsdp.api import CPUOffload
-from torch.distributed.fsdp._runtime_utils import _lazy_init
 from torch.distributed.fsdp import (
     FullyShardedDataParallel,
     FullStateDictConfig,
@@ -48,164 +46,36 @@
 from nemo_reinforcer.distributed.virtual_cluster import (
     PY_EXECUTABLES,
 )
-# def move_to_cpu(model):
-#     return model.to("cpu")
 
-# def move_to_cpu(model):
-#     for param in model.parameters():
-#         param.data = param.data.to("cpu")
-#     for buffer in model.buffers():
-#         buffer.data = buffer.data.to("cpu")
-#     if hasattr(model, "_fsdp_wrapped_module"):
-#         model._fsdp_wrapped_module.to("cpu")
-
-#     return model
-
-# def move_to_gpu(model):
-#     return model.to("cuda")
 
 def move_to_cpu(model):
-    # offload2
-    # if not isinstance(model, FullyShardedDataParallel):
-    #     return model.to("cpu")
-
     for _, param in model.named_parameters():
         param.data = param.data.to("cpu", non_blocking=True)
         if hasattr(param, "_local_shard"):
-            # param._local_shard = param._local_shard.to("cpu", non_blocking=True)
             param._local_shard = param.data
-        # param.data = param.data.to("cpu", non_blocking=True)
         if param.grad is not None:
             param.grad = param.grad.to("cpu", non_blocking=True)
 
-    # for buffer in model.buffers():
-    #     buffer.data = buffer.data.to("cpu")
-    # if torch.distributed.get_rank() == 0:
-    #     import pdb; pdb.set_trace()
     if hasattr(model, "_fsdp_wrapped_module"):
         move_to_cpu(model._fsdp_wrapped_module)
-        # model._fsdp_wrapped_module.to("cpu")
 
     return model
 
+
 def move_to_gpu(model):
-    # offload2
-    # if not isinstance(model, FullyShardedDataParallel):
-    #     return model.to("cuda")
-
     for _, param in model.named_parameters():
         param.data = param.data.to("cuda", non_blocking=True)
         if hasattr(param, "_local_shard"):
-            # param._local_shard = param._local_shard.to("cuda", non_blocking=True)
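             # `_local_shard` is a private FSDP attribute that normally aliases
             # `flat_param.data`; re-pointing it at the just-moved tensor keeps
             # FSDP's bookkeeping consistent after a manual device move (this
             # leans on torch internals and may change between versions).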
             param._local_shard = param.data
-        # param.data = param.data.to("cuda", non_blocking=True)
         if param.grad is not None:
             param.grad = param.grad.to("cuda", non_blocking=True)
 
-    # for buffer in model.buffers():
-    #     buffer.data = buffer.data.to("cuda")
     if hasattr(model, "_fsdp_wrapped_module"):
         move_to_gpu(model._fsdp_wrapped_module)
 
     return model
 
-
-# def move_to_cpu(model):
-#     if not isinstance(model, FullyShardedDataParallel):
-#         model = model.to("cpu")
-#         return model
-#     # lazy init FSDP model
-#     _lazy_init(model, model)
-#     # assert model._is_root, f"Only support root model offloading to CPU"
-#     # # if torch.distributed.get_rank() == 0:
-#     # #     import ipdb; ipdb.set_trace()
-#     for handle in model._all_handles:
-#         if handle._offload_params:
-#             continue
-#         flat_param = handle.flat_param
-#         # assert flat_param.data.data_ptr() == flat_param._local_shard.data_ptr() and \
-#         #     id(flat_param.data) != id(flat_param._local_shard) and \
-#         #     flat_param.data.size() == flat_param._local_shard.size()
-#         handle.flat_param_to(torch.device("cpu"), non_blocking=True)
-#         # the following still keeps id(._local_shard) != id(.data)
-#         flat_param._local_shard = flat_param.data
-#         # assert id(flat_param._local_shard) != id(flat_param.data)
-#     return model
-
-# def move_to_gpu(model):
-#     if not isinstance(model, FullyShardedDataParallel):
-#         model = model.to("cuda")
-#         return model
-#     # lazy init FSDP model
-#     _lazy_init(model, model)
-#     # assert model._is_root, f"Only support root model loading to GPU"
-#     device_id = torch.cuda.current_device()
-#     for handle in model._all_handles:
-#         if handle._offload_params:
-#             continue
-#         flat_param = handle.flat_param
-#         handle.flat_param_to(torch.device(f"cuda:{device_id}"), non_blocking=True)
-#         # the following still keeps id(._local_shard) != id(.data)
-#         flat_param._local_shard = flat_param.data
-#     return model
-
-
-# def move_to_cpu(model):
-#     if not isinstance(model, FullyShardedDataParallel):
-#         model = model.to("cpu")
-#         return model
-#     # lazy init FSDP model
-#     _lazy_init(model, model)
-#     assert model._is_root, f"Only support root model offloading to CPU"
-#     # if torch.distributed.get_rank() == 0:
-#     #     import ipdb; ipdb.set_trace()
-#     for handle in model._all_handles:
-#         if handle._offload_params:
-#             continue
-#         flat_param = handle.flat_param
-#         assert flat_param.data.data_ptr() == flat_param._local_shard.data_ptr() and \
-#             id(flat_param.data) != id(flat_param._local_shard) and \
-#             flat_param.data.size() == flat_param._local_shard.size()
-#         handle.flat_param_to(torch.device("cpu"), non_blocking=True)
-#         # the following still keeps id(._local_shard) != id(.data)
-#         flat_param._local_shard = flat_param.data
-#         assert id(flat_param._local_shard) != id(flat_param.data)
-
-#     for param in model.parameters():
-#         param.data = param.data.to("cpu")
-#     for buffer in model.buffers():
-#         buffer.data = buffer.data.to("cpu")
-#     if hasattr(model, "_fsdp_wrapped_module"):
-#         move_to_cpu(model._fsdp_wrapped_module)
-#         # model._fsdp_wrapped_module.to("cpu")
-
-#     return model
-
-# def move_to_gpu(model):
-#     if not isinstance(model, FullyShardedDataParallel):
-#         model = model.to("cuda")
-#         return model
-#     # lazy init FSDP model
-#     _lazy_init(model, model)
-#     assert model._is_root, f"Only support root model loading to GPU"
-#     device_id = torch.cuda.current_device()
-#     for handle in model._all_handles:
-#         if handle._offload_params:
-#             continue
-#         flat_param = handle.flat_param
-#         handle.flat_param_to(torch.device(f"cuda:{device_id}"), non_blocking=True)
-#         # the following still keeps id(._local_shard) != id(.data)
-#         flat_param._local_shard = flat_param.data
-
-#     for param in model.parameters():
-#         param.data = param.data.to("cuda")
-#     for buffer in model.buffers():
-#         buffer.data = buffer.data.to("cuda")
-#     if hasattr(model, "_fsdp_wrapped_module"):
-#         move_to_gpu(model._fsdp_wrapped_module)
-
-#     return model
 
 @ray.remote
 class HfPolicyWorker:
     DEFAULT_PY_EXECUTABLE = PY_EXECUTABLES.DEFAULT_VENV
@@ -262,15 +132,6 @@ def __init__(
         # (Initialize device mesh, shard submodules, then shard entire model)
         # ------------------------------------------------
 
-        @staticmethod
-        def configure_worker(
-            num_gpus: int | float, bundle_indices: Optional[list] = None
-        ) -> tuple[dict, dict, dict]:
-            env_vars = {
-                "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"
-            }
-            return None, env_vars, None
-
         def do_fsdp(model):
             # Create a device mesh with 'world_size' GPUs in a 1D arrangement.
             mesh = init_device_mesh("cuda", (world_size,))
@@ -280,30 +141,21 @@ def do_fsdp(model):
                 buffer_dtype=torch.float32,
             )
 
-            # return FullyShardedDataParallel(
-            #     model,
-            #     device_mesh=mesh,
-            #     auto_wrap_policy=size_based_auto_wrap_policy,
-            # )
             return FullyShardedDataParallel(
                 model,
                 device_mesh=mesh,
                 auto_wrap_policy=size_based_auto_wrap_policy,
                 mixed_precision=mp_policy,
-                # cpu_offload=CPUOffload(offload_params=True),
             )
 
-        # self.model.to("cuda")
-        self.model = move_to_gpu(self.model)
+        self.model.to("cuda")
         self.model = do_fsdp(self.model)
         self.model = move_to_cpu(self.model)
-        # self.reference_model.to("cuda")
-
+        self.reference_model.to("cuda")
         self.reference_model = do_fsdp(self.reference_model)
         self.reference_model = move_to_cpu(self.reference_model)
-
-        # self.model.to("cuda")
+
         self.model = move_to_gpu(self.model)
         self._held_reference_model_params = None
         # register_fsdp_forward_method(self.model, "generate")
         if init_optimizer:
@@ -355,6 +207,13 @@ def do_fsdp(model):
                 "No weights path provided. Starting from scratch (default policy init)"
             )
 
+    @staticmethod
+    def configure_worker(
+        num_gpus: int | float, bundle_indices: Optional[list] = None
+    ) -> tuple[dict, dict, dict]:
+        env_vars = {"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"}
+        return None, env_vars, None
+
     def is_alive(self):
         return True
 
@@ -601,7 +460,6 @@ def use_reference_model(self):
             original_reference_model = self.reference_model
 
             self.model = move_to_cpu(self.model)
-            # self.reference_model = self.reference_model.to("cuda")
             self.reference_model = move_to_gpu(self.reference_model)
 
             # Swap the references
@@ -616,7 +474,6 @@ def use_reference_model(self):
         finally:
             # Restore original references and device placement
             self.reference_model = move_to_cpu(original_reference_model)
-            # self.model = original_model.to("cuda")
             self.model = move_to_gpu(original_model)
             gc.collect()
             torch.cuda.empty_cache()
@@ -890,14 +747,12 @@ def get_weight_ipc_handles(self, offload_model=True):
         return {self.device_uuid: data}
 
     def prepare_for_lp_inference(self):
-        # self.model.to("cuda")
         self.model = move_to_gpu(self.model)
         self.model.eval()
         self.offload_before_refit()
 
     def prepare_for_training(self, *args, **kwargs):
         # onload models and optimizer state to cuda
-        # self.model.to("cuda")
         self.model = move_to_gpu(self.model)
         self.model.train()
 
@@ -953,18 +808,6 @@ def offload_after_refit(self):
         print(
             f"GPU Memory after refit complete: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved"
         )
-
-    def move_to_cpu_old1(self, model):
-        for param in model.parameters():
-            param.data = param.data.to("cpu")
-
-        for buffer in model.buffers():
-            buffer.data = buffer.data.to("cpu")
-
-        if hasattr(model, "_fsdp_wrapped_module"):
-            model._fsdp_wrapped_module.to("cpu")
-
-        return model
 
     def save_checkpoint(
         self,

From fd0f613109e061c9a881166d7c306ab065805a15 Mon Sep 17 00:00:00 2001
From: Yi-Fu Wu
Date: Wed, 26 Mar 2025 18:34:24 -0700
Subject: [PATCH 07/27] More cleanup

Signed-off-by: Yi-Fu Wu
---
 examples/configs/grpo_math_1B.yaml         | 15 +++++++--------
 examples/run_grpo_math.py                  |  2 +-
 nemo_reinforcer/models/policy/hf_policy.py |  4 ++--
 3 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml
index a8149156e9..72aad000ce 100644
--- a/examples/configs/grpo_math_1B.yaml
+++ b/examples/configs/grpo_math_1B.yaml
@@ -5,7 +5,7 @@ grpo:
   max_num_steps: 1000000
   normalize_rewards: true
   use_leave_one_out_baseline: true
-  val_period: -1
+  val_period: 10
   val_at_start: false
   max_val_samples: 256
   val_batch_size: 256
@@ -15,7 +15,7 @@ loss_fn:
   ratio_eps: 0.2
 
 checkpointing:
-  enabled: false
+  enabled: true
   checkpoint_dir: "results/grpo"
   metric_name: "val_reward"
   higher_is_better: true
@@ -29,8 +29,7 @@ policy:
   generation_batch_size: 32
   logprob_batch_size: 4
   max_total_sequence_length: 512
-  # precision: "bfloat16"
-  precision: "float32"
+  precision: "bfloat16"
 
   optimizer:
     name: "torch.optim.AdamW"
@@ -76,17 +75,17 @@ env:
 logger:
   log_dir: "logs" # Base directory for all logs
   num_val_samples_to_print: 0 # Number of validation samples to pretty print on terminal
-  wandb_enabled: true
+  wandb_enabled: false
   tensorboard_enabled: false
   monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard
   wandb:
-    project: "grpo-dev-yifu"
-    name: "grpo-dev-offload2"
+    project: "grpo-dev"
+    name: "grpo-dev-logger"
   tensorboard: {}
   gpu_monitoring:
     collection_interval: 10 # How often to collect GPU usage metrics (in seconds)
     flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds)
 
 cluster:
-  gpus_per_node: 2
+  gpus_per_node: 1
   num_nodes: 1
 
diff --git a/examples/run_grpo_math.py b/examples/run_grpo_math.py
index b0085535ef..7e2f3e693d 100644
--- a/examples/run_grpo_math.py
+++ b/examples/run_grpo_math.py
@@ -23,7 +23,7 @@
 from transformers import AutoTokenizer
 from collections import defaultdict
 
-from nemo_reinforcer.algorithms.grpo import MasterConfig, grpo_train, setup, refit_policy_generation
+from nemo_reinforcer.algorithms.grpo import MasterConfig, grpo_train, setup
 from nemo_reinforcer.distributed.virtual_cluster import init_ray
 from nemo_reinforcer.utils.config import load_config
 from nemo_reinforcer.utils.logger import get_next_experiment_dir
diff --git a/nemo_reinforcer/models/policy/hf_policy.py b/nemo_reinforcer/models/policy/hf_policy.py
index 08eb9061ce..9813187914 100644
--- a/nemo_reinforcer/models/policy/hf_policy.py
+++ b/nemo_reinforcer/models/policy/hf_policy.py
@@ -49,7 +49,7 @@
 
 
 def move_to_cpu(model):
-    for _, param in model.named_parameters():
+    for param in model.parameters():
         param.data = param.data.to("cpu", non_blocking=True)
         if hasattr(param, "_local_shard"):
             param._local_shard = param.data
@@ -63,7 +63,7 @@ def move_to_cpu(model):
 
 
 def move_to_gpu(model):
-    for _, param in model.named_parameters():
+    for param in model.parameters():
         param.data = param.data.to("cuda", non_blocking=True)
         if hasattr(param, "_local_shard"):
             param._local_shard = param.data

From d9b7fd458a87885d550b6f16e15c807419a8de0a Mon Sep 17 00:00:00 2001
From: Yi-Fu Wu
Date: Mon, 31 Mar 2025 07:22:34 -0700
Subject: [PATCH 08/27] cpu offload

Signed-off-by: Yi-Fu Wu
---
 nemo_reinforcer/models/policy/hf_policy.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/nemo_reinforcer/models/policy/hf_policy.py b/nemo_reinforcer/models/policy/hf_policy.py
index 9813187914..a5fb7be767 100644
--- a/nemo_reinforcer/models/policy/hf_policy.py
+++ b/nemo_reinforcer/models/policy/hf_policy.py
@@ -22,6 +22,7 @@
 import torch
 from torch.distributed.device_mesh import init_device_mesh
 from torch.distributed.fsdp import (
+    CPUOffload,
     FullyShardedDataParallel,
     FullStateDictConfig,
     MixedPrecision,
@@ -49,6 +50,7 @@
 
 
 def move_to_cpu(model):
+    return model
     for param in model.parameters():
         param.data = param.data.to("cpu", non_blocking=True)
         if hasattr(param, "_local_shard"):
@@ -63,6 +65,7 @@ def move_to_cpu(model):
 
 
 def move_to_gpu(model):
+    return model
     for param in model.parameters():
         param.data = param.data.to("cuda", non_blocking=True)
         if hasattr(param, "_local_shard"):
@@ -146,6 +149,7 @@ def do_fsdp(model):
             device_mesh=mesh,
             auto_wrap_policy=size_based_auto_wrap_policy,
             mixed_precision=mp_policy,
+            cpu_offload=CPUOffload(offload_params=True),
         )
 
         self.model.to("cuda")
@@ -761,7 +765,8 @@ def prepare_for_training(self, *args, **kwargs):
         for state in self.optimizer.state.values():
             for k, v in state.items():
                 if torch.is_tensor(v) and not v.is_cuda:
-                    state[k] = v.to("cuda")
+                    ...
+                    # state[k] = v.to("cuda")
 
         torch.cuda.empty_cache()
 
@@ -773,10 +778,11 @@ def offload_before_refit(self):
         for state in self.optimizer.state.values():
             for k, v in state.items():
                 if torch.is_tensor(v):
-                    state[k] = v.to("cpu")
+                    ...
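+                    # Intentionally a no-op while FSDP CPUOffload is enabled:
+                    # the optimizer state is expected to already live on CPU,
+                    # so the old explicit copy is kept below only for reference.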
+ # state[k] = v.to("cpu") - for buffer in self.model.buffers(): - buffer.data = buffer.data.to("cpu") + # for buffer in self.model.buffers(): + # buffer.data = buffer.data.to("cpu") gc.collect() torch.cuda.empty_cache() From d24a9c340a20fbdedc08355bc243a939f22545b7 Mon Sep 17 00:00:00 2001 From: Yi-Fu Wu Date: Mon, 31 Mar 2025 18:05:36 -0700 Subject: [PATCH 09/27] Make everything configurable Signed-off-by: Yi-Fu Wu --- examples/configs/grpo_math_1B.yaml | 2 + nemo_reinforcer/distributed/worker_groups.py | 6 +- nemo_reinforcer/models/policy/__init__.py | 2 + nemo_reinforcer/models/policy/hf_policy.py | 133 ++++++++++--------- 4 files changed, 78 insertions(+), 65 deletions(-) diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml index 72aad000ce..f15031c536 100644 --- a/examples/configs/grpo_math_1B.yaml +++ b/examples/configs/grpo_math_1B.yaml @@ -30,6 +30,8 @@ policy: logprob_batch_size: 4 max_total_sequence_length: 512 precision: "bfloat16" + expandable_segments_enabled: false + fsdp_offload_enabled: false optimizer: name: "torch.optim.AdamW" diff --git a/nemo_reinforcer/distributed/worker_groups.py b/nemo_reinforcer/distributed/worker_groups.py index 2497065b0f..94af81b555 100644 --- a/nemo_reinforcer/distributed/worker_groups.py +++ b/nemo_reinforcer/distributed/worker_groups.py @@ -179,6 +179,7 @@ def __init__( workers_per_node: Optional[Union[int, List[int]]] = None, name_prefix: str = "", bundle_indices_list: Optional[List[tuple]] = None, + additional_env_vars: Optional[Dict[str, str]] = None, ): """Initialize a group of distributed Ray workers. @@ -191,13 +192,14 @@ def __init__( bundle_indices_list: Explicit list of (node_idx, [local_bundle_indices]) tuples. Each tuple defines a tied group of workers placed on the same node. If provided, workers_per_node is ignored. 
+ additional_env_vars: Additional environment variables to pass to the workers """ self._workers = [] self._worker_metadata = [] self.cluster = cluster self.name_prefix = name_prefix self.tied_workers_groups = [] - + self.additional_env_vars = additional_env_vars # Maps worker indices to their corresponding tied group index # For example, if worker with index 3 belongs to tied worker group 1, # then worker_to_tied_group_index[3] = 1 @@ -287,6 +289,8 @@ def _create_workers_from_bundle_indices( "NODE_RANK": str(node_idx), } ) + if self.additional_env_vars: + env_vars.update(self.additional_env_vars) # For tensor parallel groups, only the first worker gets bundle_indices worker_bundle_indices = ( diff --git a/nemo_reinforcer/models/policy/__init__.py b/nemo_reinforcer/models/policy/__init__.py index ee2bf2389e..0e57b62ca1 100644 --- a/nemo_reinforcer/models/policy/__init__.py +++ b/nemo_reinforcer/models/policy/__init__.py @@ -25,3 +25,5 @@ class PolicyConfig(TypedDict): logprob_batch_size: int generation: GenerationConfig precision: str + expandable_segments_enabled: bool + fsdp_offload_enabled: bool diff --git a/nemo_reinforcer/models/policy/hf_policy.py b/nemo_reinforcer/models/policy/hf_policy.py index a5fb7be767..ef4ec8460f 100644 --- a/nemo_reinforcer/models/policy/hf_policy.py +++ b/nemo_reinforcer/models/policy/hf_policy.py @@ -49,36 +49,6 @@ ) -def move_to_cpu(model): - return model - for param in model.parameters(): - param.data = param.data.to("cpu", non_blocking=True) - if hasattr(param, "_local_shard"): - param._local_shard = param.data - if param.grad is not None: - param.grad = param.grad.to("cpu", non_blocking=True) - - if hasattr(model, "_fsdp_wrapped_module"): - move_to_cpu(model._fsdp_wrapped_module) - - return model - - -def move_to_gpu(model): - return model - for param in model.parameters(): - param.data = param.data.to("cuda", non_blocking=True) - if hasattr(param, "_local_shard"): - param._local_shard = param.data - if param.grad is not None: - param.grad = param.grad.to("cuda", non_blocking=True) - - if hasattr(model, "_fsdp_wrapped_module"): - move_to_gpu(model._fsdp_wrapped_module) - - return model - - @ray.remote class HfPolicyWorker: DEFAULT_PY_EXECUTABLE = PY_EXECUTABLES.DEFAULT_VENV @@ -144,23 +114,29 @@ def do_fsdp(model): buffer_dtype=torch.float32, ) + cpu_offload = ( + CPUOffload(offload_params=True) + if self.cfg["fsdp_offload_enabled"] + else None + ) + return FullyShardedDataParallel( model, device_mesh=mesh, auto_wrap_policy=size_based_auto_wrap_policy, mixed_precision=mp_policy, - cpu_offload=CPUOffload(offload_params=True), + cpu_offload=cpu_offload, ) self.model.to("cuda") self.model = do_fsdp(self.model) - self.model = move_to_cpu(self.model) + self.model = self.manual_offload_to_cpu(self.model) self.reference_model.to("cuda") self.reference_model = do_fsdp(self.reference_model) - self.reference_model = move_to_cpu(self.reference_model) + self.reference_model = self.manual_offload_to_cpu(self.reference_model) - self.model = move_to_gpu(self.model) + self.model = self.manual_load_to_gpu(self.model) self._held_reference_model_params = None # register_fsdp_forward_method(self.model, "generate") if init_optimizer: @@ -211,13 +187,6 @@ def do_fsdp(model): "No weights path provided. 
Starting from scratch (default policy init)" ) - @staticmethod - def configure_worker( - num_gpus: int | float, bundle_indices: Optional[list] = None - ) -> tuple[dict, dict, dict]: - env_vars = {"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"} - return None, env_vars, None - def is_alive(self): return True @@ -463,8 +432,8 @@ def use_reference_model(self): original_model = self.model original_reference_model = self.reference_model - self.model = move_to_cpu(self.model) - self.reference_model = move_to_gpu(self.reference_model) + self.model = self.manual_offload_to_cpu(self.model) + self.reference_model = self.manual_load_to_gpu(self.reference_model) # Swap the references self.model, self.reference_model = self.reference_model, self.model @@ -477,8 +446,8 @@ def use_reference_model(self): finally: # Restore original references and device placement - self.reference_model = move_to_cpu(original_reference_model) - self.model = move_to_gpu(original_model) + self.reference_model = self.manual_offload_to_cpu(original_reference_model) + self.model = self.manual_load_to_gpu(original_model) gc.collect() torch.cuda.empty_cache() @@ -745,28 +714,28 @@ def get_weight_ipc_handles(self, offload_model=True): data[name] = reduce_tensor(p.detach()) if offload_model: - self.model = move_to_cpu(self.model) + self.model = self.manual_offload_to_cpu(self.model) gc.collect() torch.cuda.empty_cache() return {self.device_uuid: data} def prepare_for_lp_inference(self): - self.model = move_to_gpu(self.model) + self.model = self.manual_load_to_gpu(self.model) self.model.eval() self.offload_before_refit() def prepare_for_training(self, *args, **kwargs): # onload models and optimizer state to cuda - self.model = move_to_gpu(self.model) + self.model = self.manual_load_to_gpu(self.model) self.model.train() - # Move optimizer state to CUDA if it exists - if hasattr(self, "optimizer") and self.optimizer is not None: - for state in self.optimizer.state.values(): - for k, v in state.items(): - if torch.is_tensor(v) and not v.is_cuda: - ... - # state[k] = v.to("cuda") + if not self.cfg["fsdp_offload_enabled"]: + # Move optimizer state to CUDA if it exists + if hasattr(self, "optimizer") and self.optimizer is not None: + for state in self.optimizer.state.values(): + for k, v in state.items(): + if torch.is_tensor(v) and not v.is_cuda: + state[k] = v.to("cuda") torch.cuda.empty_cache() @@ -774,15 +743,15 @@ def prepare_for_training(self, *args, **kwargs): def offload_before_refit(self): """Offload the optimizer and buffers to the CPU.""" torch.randn(1).cuda() # wake up torch allocator - if hasattr(self, "optimizer") and self.optimizer is not None: - for state in self.optimizer.state.values(): - for k, v in state.items(): - if torch.is_tensor(v): - ... 
- # state[k] = v.to("cpu") + if not self.cfg["fsdp_offload_enabled"]: + if hasattr(self, "optimizer") and self.optimizer is not None: + for state in self.optimizer.state.values(): + for k, v in state.items(): + if torch.is_tensor(v): + state[k] = v.to("cpu") - # for buffer in self.model.buffers(): - # buffer.data = buffer.data.to("cpu") + for buffer in self.model.buffers(): + buffer.data = buffer.data.to("cpu") gc.collect() torch.cuda.empty_cache() @@ -797,7 +766,7 @@ def offload_before_refit(self): @torch.no_grad() def offload_after_refit(self): # Offload as much as possible on the CPU - self.model = move_to_cpu(self.model) + self.model = self.manual_offload_to_cpu(self.model) self.model.eval() torch.randn(1).cuda() # wake up torch allocator self.offload_before_refit() # rerun the old offload function @@ -815,6 +784,38 @@ def offload_after_refit(self): f"GPU Memory after refit complete: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved" ) + def manual_offload_to_cpu(self, model): + if self.cfg["fsdp_offload_enabled"]: + return model + + for param in model.parameters(): + param.data = param.data.to("cpu", non_blocking=True) + if hasattr(param, "_local_shard"): + param._local_shard = param.data + if param.grad is not None: + param.grad = param.grad.to("cpu", non_blocking=True) + + if hasattr(model, "_fsdp_wrapped_module"): + self.manual_offload_to_cpu(model._fsdp_wrapped_module) + + return model + + def manual_load_to_gpu(self, model): + if self.cfg["fsdp_offload_enabled"]: + return model + + for param in model.parameters(): + param.data = param.data.to("cuda", non_blocking=True) + if hasattr(param, "_local_shard"): + param._local_shard = param.data + if param.grad is not None: + param.grad = param.grad.to("cuda", non_blocking=True) + + if hasattr(model, "_fsdp_wrapped_module"): + self.manual_load_to_gpu(model._fsdp_wrapped_module) + + return model + def save_checkpoint( self, weights_path: str, @@ -921,11 +922,15 @@ def __init__( weights_path=weights_path, optimizer_path=optimizer_path, ) + additional_env_vars = {} + if config["expandable_segments_enabled"]: + additional_env_vars["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" self.worker_group = RayWorkerGroup( cluster, worker_builder, name_prefix=name_prefix, workers_per_node=workers_per_node, + additional_env_vars=additional_env_vars, ) self.dp_size = self.worker_group.world_size self.cfg = config From 11fbef755cce4045cfbea76746b30b790e12176f Mon Sep 17 00:00:00 2001 From: Yi-Fu Wu Date: Wed, 2 Apr 2025 04:06:16 -0700 Subject: [PATCH 10/27] expandable_segments and fsdp offload configs Signed-off-by: Yi-Fu Wu --- examples/configs/grpo_math_8B.yaml | 2 ++ examples/configs/sft.yaml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/examples/configs/grpo_math_8B.yaml b/examples/configs/grpo_math_8B.yaml index d747e9249f..3408e84e32 100644 --- a/examples/configs/grpo_math_8B.yaml +++ b/examples/configs/grpo_math_8B.yaml @@ -9,6 +9,8 @@ policy: logprob_batch_size: 2 max_total_sequence_length: 4096 precision: "bfloat16" + expandable_segments_enabled: false + fsdp_offload_enabled: false optimizer: name: "torch.optim.AdamW" diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml index 1282285fc3..8bb0c7615b 100644 --- a/examples/configs/sft.yaml +++ b/examples/configs/sft.yaml @@ -22,6 +22,8 @@ policy: train_micro_batch_size: 1 max_total_sequence_length: 2048 precision: "float32" + expandable_segments_enabled: false + fsdp_offload_enabled: false optimizer: name: "torch.optim.AdamW" From 
ad48c1db004e7031b9e6c4571e9547d47a098a88 Mon Sep 17 00:00:00 2001 From: Yi-Fu Wu Date: Wed, 2 Apr 2025 04:06:42 -0700 Subject: [PATCH 11/27] Make port configurable in ray.sub Signed-off-by: Yi-Fu Wu --- ray.sub | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ray.sub b/ray.sub index 7258aec045..3fb8b76585 100644 --- a/ray.sub +++ b/ray.sub @@ -17,6 +17,7 @@ set -eoux pipefail CONTAINER=$CONTAINER MOUNTS=$MOUNTS COMMAND=${COMMAND:-} # This is a script relative to the SLURM_SUBMIT_DIR. If left empty, it will leave the cluster idle after it's brought up. +PORT=${PORT:-41993} ######################################################## COMMON_SRUN_ARGS="" @@ -54,7 +55,7 @@ done head_node=${nodes_array[0]} head_node_ip=${ip_addresses_array[0]} -port=41993 +port=$PORT ip_head=$head_node_ip:$port # First we start the head of the ray cluster on one of the physical nodes From 83bd55d38da8c8808a457b659a12584669eb92ae Mon Sep 17 00:00:00 2001 From: Parth Chadha Date: Wed, 26 Mar 2025 16:22:43 -0700 Subject: [PATCH 12/27] fix: error out early if ray cluster does not have resources (#89) Signed-off-by: Parth Chadha Signed-off-by: Yi-Fu Wu --- .../distributed/virtual_cluster.py | 43 ++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/nemo_reinforcer/distributed/virtual_cluster.py b/nemo_reinforcer/distributed/virtual_cluster.py index 3c45697345..d909f17b6b 100644 --- a/nemo_reinforcer/distributed/virtual_cluster.py +++ b/nemo_reinforcer/distributed/virtual_cluster.py @@ -173,6 +173,30 @@ def _init_placement_groups(self, strategy: str): if self._node_placement_groups is not None: return self._node_placement_groups + # Check available resources in the Ray cluster + cluster_resources = ray.cluster_resources() + total_available_gpus = int(cluster_resources.get("GPU", 0)) + total_available_cpus = int(cluster_resources.get("CPU", 0)) + + # Calculate required resources + total_requested_gpus = ( + sum(self._bundle_ct_per_node_list) if self.use_gpus else 0 + ) + total_requested_cpus = ( + sum(self._bundle_ct_per_node_list) * self.max_colocated_worker_groups + ) + + # Validate resources + if self.use_gpus and total_requested_gpus > total_available_gpus: + raise ValueError( + f"Not enough GPUs available. Requested {total_requested_gpus} GPUs, but only {total_available_gpus} are available in the cluster." + ) + + if total_requested_cpus > total_available_cpus: + raise ValueError( + f"Not enough CPUs available. Requested {total_requested_cpus} CPUs, but only {total_available_cpus} are available in the cluster." + ) + num_cpus_per_bundle = self.max_colocated_worker_groups # num_gpus_per_bundle == 1 indicates that there is 1 GPU per process num_gpus_per_bundle = 1 if self.use_gpus else 0 @@ -192,7 +216,24 @@ def _init_placement_groups(self, strategy: str): for i, bundles in enumerate(resources) ] - ray.get([pg.ready() for pg in self._node_placement_groups]) + # Add timeout to prevent hanging indefinitely + try: + ray.get( + [pg.ready() for pg in self._node_placement_groups], timeout=180 + ) # 3-minute timeout + except (TimeoutError, ray.exceptions.GetTimeoutError): + # Clean up any created placement groups + for pg in self._node_placement_groups: + try: + remove_placement_group(pg) + except Exception: + pass + self._node_placement_groups = None + raise TimeoutError( + "Timed out waiting for placement groups to be ready. The cluster may not have enough resources " + "to satisfy the requested configuration, or the resources may be busy with other tasks." 
+ ) + return self._node_placement_groups def get_placement_groups(self): From f6a3d91a777814e2603668d692c9232ed31eaadc Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 26 Mar 2025 16:56:26 -0700 Subject: [PATCH 13/27] ci: skip functional until more capacity available and/or tests speed up (#94) Signed-off-by: Terry Kong Signed-off-by: Yi-Fu Wu --- .github/workflows/cicd-main.yml | 42 +++++++++++++++++---------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 2d2255e7f2..7902c02146 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -165,23 +165,25 @@ jobs: secrets: HF_TOKEN: ${{ secrets.HF_TOKEN }} - functional-tests: - name: ${{ matrix.test_case }} - needs: [build-container, pre-flight] - uses: ./.github/workflows/_run_test.yml - if: ${{ needs.pre-flight.outputs.run_ci == 'true' }} - strategy: - matrix: - test_case: - - sft.sh - - grpo.sh - with: - # TODO: For now, allow these to fail since the checks are not robust. - IS_OPTIONAL: true - RUNNER: self-hosted-azure - TIMEOUT: 15 - SCRIPT: | - cd ${REINFORCER_REPO_DIR} - uv run bash ./tests/functional/${{ matrix.test_case }} - secrets: - HF_TOKEN: ${{ secrets.HF_TOKEN }} \ No newline at end of file + # TODO: Temporarily disable functional tests until we have more capacity and tests run quicker + # Related: https://github.com/NVIDIA/reinforcer/pull/27 + #functional-tests: + # name: ${{ matrix.test_case }} + # needs: [build-container, pre-flight] + # uses: ./.github/workflows/_run_test.yml + # if: ${{ needs.pre-flight.outputs.run_ci == 'true' }} + # strategy: + # matrix: + # test_case: + # - sft.sh + # - grpo.sh + # with: + # # TODO: For now, allow these to fail since the checks are not robust. 
+ # IS_OPTIONAL: true + # RUNNER: self-hosted-azure + # TIMEOUT: 15 + # SCRIPT: | + # cd ${REINFORCER_REPO_DIR} + # uv run bash ./tests/functional/${{ matrix.test_case }} + # secrets: + # HF_TOKEN: ${{ secrets.HF_TOKEN }} \ No newline at end of file From 4eb1d6dcf25c1f053a82ed0a02b281be91201d93 Mon Sep 17 00:00:00 2001 From: yuki <48991475+yuki-666@users.noreply.github.com> Date: Thu, 27 Mar 2025 09:00:25 +0800 Subject: [PATCH 14/27] feat: evaluation implement (#16) Signed-off-by: Yuki Huang Signed-off-by: Yi-Fu Wu --- docs/guides/eval.md | 33 +++ docs/index.md | 1 + examples/configs/eval.yaml | 30 +++ examples/run_eval.py | 145 +++++++++++++ examples/run_grpo_math.py | 14 +- nemo_reinforcer/data/__init__.py | 5 + nemo_reinforcer/data/datasets.py | 51 +++++ nemo_reinforcer/data/llm_message_utils.py | 28 ++- nemo_reinforcer/evals/eval.py | 191 ++++++++++++++++++ nemo_reinforcer/evals/run_env_eval.py | 13 -- nemo_reinforcer/models/generation/vllm.py | 70 ++++++- .../models/generation/test_vllm_generation.py | 45 ++++- 12 files changed, 602 insertions(+), 24 deletions(-) create mode 100644 docs/guides/eval.md create mode 100644 examples/configs/eval.yaml create mode 100644 examples/run_eval.py create mode 100644 nemo_reinforcer/evals/eval.py delete mode 100644 nemo_reinforcer/evals/run_env_eval.py diff --git a/docs/guides/eval.md b/docs/guides/eval.md new file mode 100644 index 0000000000..8ac5ab5675 --- /dev/null +++ b/docs/guides/eval.md @@ -0,0 +1,33 @@ +# Evaluation + +## Start Evaluation + +### Start Script +```sh +# To run the evaluation with default config (examples/configs/eval.yaml) +uv run python examples/run_eval.py + +# Specify a custom config file +uv run python examples/run_eval.py --config path/to/custom_config.yaml + +# Override specific config values via command line +uv run python examples/run_eval.py generation.model_name="Qwen/Qwen2.5-Math-7B-Instruct" +``` + +### Example Output + +``` +============================================================ +model_name='Qwen2.5-Math-1.5B-Instruct' dataset_name='aime_2024' +score=0.10 (3.0/30) +============================================================ +``` + +## Configuration + +An example Evaluation configuration file can be found [here](../../examples/configs/eval.yaml). + +### Prompt Template Configuration +Always remember to use the same `prompt_file` and `system_prompt_file` that were used during training. + +For open-source models, we recommend setting `prompt_file=null` and `system_prompt_file=null` to allow them to use their native chat templates. 
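As an aside on the override mechanism documented above, the following is a minimal sketch of how dotlist-style CLI overrides merge into the base YAML config. It mirrors the `OmegaConf.from_dotlist`/`OmegaConf.merge` calls used by `examples/run_eval.py`; the config values here are illustrative only.

```python
# Sketch: merging CLI overrides such as `generation.model_name=...` into the
# base config (keys follow examples/configs/eval.yaml; values are examples).
from omegaconf import OmegaConf

base = OmegaConf.create({
    "generation": {"model_name": "Qwen/Qwen2.5-Math-1.5B-Instruct", "temperature": 0.0},
    "data": {"dataset_name": "HuggingFaceH4/aime_2024"},
})

# Each trailing CLI token becomes one dotlist entry.
overrides = OmegaConf.from_dotlist(["generation.model_name=Qwen/Qwen2.5-Math-7B-Instruct"])
merged = OmegaConf.merge(base, overrides)

assert merged.generation.model_name == "Qwen/Qwen2.5-Math-7B-Instruct"
assert merged.generation.temperature == 0.0  # keys not overridden keep base values
```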
diff --git a/docs/index.md b/docs/index.md
index 56cd64ac1b..0628f19953 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -18,6 +18,7 @@ cluster.md
 adding_new_models.md
 guides/sft.md
 guides/grpo.md
+guides/eval.md
 ```
 
 ```{toctree}
diff --git a/examples/configs/eval.yaml b/examples/configs/eval.yaml
new file mode 100644
index 0000000000..a867e9617f
--- /dev/null
+++ b/examples/configs/eval.yaml
@@ -0,0 +1,30 @@
+# Evaluation Configuration
+generation:
+  backend: "vllm" # only vllm is supported for evaluation
+  max_new_tokens: ${generation.vllm_cfg.max_model_len}
+  temperature: 0.0
+  top_p: 1.0
+  top_k: -1 # disable
+  num_prompts_per_step: -1 # -1 means pass all prompts at once
+  model_name: "Qwen/Qwen2.5-Math-1.5B-Instruct"
+  vllm_cfg:
+    tensor_parallel_size: 1
+    gpu_memory_utilization: 0.9
+    max_model_len: 2048
+
+data:
+  max_input_seq_length: ${generation.vllm_cfg.max_model_len} # unused here, since evaluation feeds prompt strings directly
+  prompt_file: null
+  system_prompt_file: null
+  dataset_name: "HuggingFaceH4/aime_2024"
+  dataset_key: "train"
+  problem_key: "problem"
+  solution_key: "answer"
+
+env:
+  math:
+    num_workers: 8
+
+cluster:
+  gpus_per_node: 1
+  num_nodes: 1
diff --git a/examples/run_eval.py b/examples/run_eval.py
new file mode 100644
index 0000000000..54358ad260
--- /dev/null
+++ b/examples/run_eval.py
@@ -0,0 +1,145 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import argparse +import os +import pprint +import sys + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from datasets import load_dataset +from omegaconf import OmegaConf +from transformers import AutoTokenizer + +from examples.run_grpo_math import math_data_processor +from nemo_reinforcer.data import MathDataConfig +from nemo_reinforcer.data.datasets import AllTaskProcessedDataset +from nemo_reinforcer.data.interfaces import TaskDataSpec +from nemo_reinforcer.data.llm_message_utils import remap_dataset_keys +from nemo_reinforcer.distributed.virtual_cluster import init_ray +from nemo_reinforcer.environments.math_environment import MathEnvironment +from nemo_reinforcer.evals.eval import MasterConfig, run_env_eval, setup +from nemo_reinforcer.models.generation.interfaces import GenerationConfig + + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser(description="Run Evaluation with configuration") + parser.add_argument( + "--config", type=str, default=None, help="Path to YAML config file" + ) + + # Parse known args for the script + args, remaining = parser.parse_known_args() + + # Convert remaining args to OmegaConf format + overrides = OmegaConf.from_dotlist(remaining) + + return args, overrides + + +def setup_data( + data_config: MathDataConfig, generation_config: GenerationConfig, env_configs +): + print("\n▶ Setting up data...") + math_task_spec = TaskDataSpec( + task_name="math", + prompt_file=data_config["prompt_file"], + system_prompt_file=data_config["system_prompt_file"], + ) + + # load dataset + base_dataset = load_dataset(data_config["dataset_name"]) + if data_config["dataset_key"] is not None: + base_dataset = base_dataset[data_config["dataset_key"]] + # remap problem and solution keys + remapped_dataset = remap_dataset_keys( + base_dataset, + mapping_dict={ + data_config["problem_key"]: "problem", + data_config["solution_key"]: "expected_answer", + }, + ) + + tokenizer = AutoTokenizer.from_pretrained(generation_config["model_name"]) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + math_env = MathEnvironment.options( + runtime_env={"py_executable": MathEnvironment.DEFAULT_PY_EXECUTABLE} + ).remote(env_configs["math"]) + + dataset = AllTaskProcessedDataset( + dataset=remapped_dataset, + tokenizer=tokenizer, + default_task_data_spec=math_task_spec, + task_data_processors=math_data_processor, + max_seq_length=data_config["max_input_seq_length"], + ) + + return dataset, math_env, tokenizer + + +def main(): + """Main entry point.""" + # Parse arguments + args, overrides = parse_args() + + if not args.config: + args.config = os.path.join(os.path.dirname(__file__), "configs", "eval.yaml") + + config = OmegaConf.load(args.config) + print(f"Loaded configuration from: {args.config}") + + if overrides: + override_conf = OmegaConf.from_cli() + print(f"Overrides: {override_conf}") + config = OmegaConf.merge(config, override_conf) + + config: MasterConfig = OmegaConf.to_container(config, resolve=True) + print("Applied CLI overrides") + + # Print config + print("Final config:") + pprint.pprint(config) + + # Init ray + init_ray() + + # Setup data + ( + dataset, + math_env, + tokenizer, + ) = setup_data(config["data"], config["generation"], config["env"]) + + # Setup + ( + vllm_generation, + dataloader, + master_config, + ) = setup(config, tokenizer, dataset) + + # Run evaluation + run_env_eval( + vllm_generation, + dataloader, + math_env, + master_config, + ) + + +if __name__ == 
"__main__": + main() diff --git a/examples/run_grpo_math.py b/examples/run_grpo_math.py index 7e2f3e693d..b87c7037b3 100644 --- a/examples/run_grpo_math.py +++ b/examples/run_grpo_math.py @@ -122,6 +122,8 @@ def math_data_processor( template = task_data_spec.custom_template message_log: LLMMessageLogType = [] + + # system prompt if task_data_spec.system_prompt: sys_message = {"role": "system", "content": task_data_spec.system_prompt} message = tokenizer.apply_chat_template( @@ -135,10 +137,11 @@ def math_data_processor( 0 ] message_log.append(sys_message) - user_message = { - "role": "user", - "content": task_data_spec.prompt.format(problem), - } + + # user prompt + if task_data_spec.prompt: + problem = task_data_spec.prompt.format(problem) + user_message = {"role": "user", "content": problem} message = tokenizer.apply_chat_template( [user_message], chat_template=template, @@ -167,8 +170,9 @@ def math_data_processor( "extra_env_info": extra_env_info, "loss_multiplier": loss_multiplier, "idx": idx, - "task_name": datum_dict["task_name"], } + if "task_name" in datum_dict: + output["task_name"] = datum_dict["task_name"] return output diff --git a/nemo_reinforcer/data/__init__.py b/nemo_reinforcer/data/__init__.py index 63aad516b2..09eaf35fb5 100644 --- a/nemo_reinforcer/data/__init__.py +++ b/nemo_reinforcer/data/__init__.py @@ -21,3 +21,8 @@ class DataConfig(TypedDict): system_prompt_file: Optional[str] dataset_name: str val_dataset_name: Optional[str] + + +class MathDataConfig(DataConfig): + problem_key: str + solution_key: str diff --git a/nemo_reinforcer/data/datasets.py b/nemo_reinforcer/data/datasets.py index 033cee05d7..8a81c85fb2 100644 --- a/nemo_reinforcer/data/datasets.py +++ b/nemo_reinforcer/data/datasets.py @@ -130,3 +130,54 @@ def rl_collate_fn(data_batch: List[DatumSpec]) -> BatchedDataDict: batch_max_length=batch_max_length, ) return output + + +def eval_collate_fn(data_batch: List[DatumSpec]) -> BatchedDataDict: + """Collate function for evaluation. + + Takes a list of data samples and combines them into a single batched dictionary + for model evaluation. + + Args: + data_batch: List of data samples with message_log, extra_env_info, and idx fields. + + Returns: + BatchedDataDict with message_log, extra_env_info, and idx fields. + + Examples: + ```{doctest} + >>> import torch + >>> from nemo_reinforcer.data.datasets import eval_collate_fn + >>> from nemo_reinforcer.data.interfaces import DatumSpec + >>> data_batch = [ + ... DatumSpec( + ... message_log=[{"role": "user", "content": "Hello", "token_ids": torch.tensor([1, 2, 3])}], + ... extra_env_info={'ground_truth': '1'}, + ... idx=0, + ... ), + ... DatumSpec( + ... message_log=[{"role": "assistant", "content": "Hi there", "token_ids": torch.tensor([4, 5, 6, 7])}], + ... extra_env_info={'ground_truth': '2'}, + ... idx=1, + ... ), + ... 
] + >>> output = eval_collate_fn(data_batch) + >>> output['message_log'][0] + [{'role': 'user', 'content': 'Hello', 'token_ids': tensor([1, 2, 3])}] + >>> output['message_log'][1] + [{'role': 'assistant', 'content': 'Hi there', 'token_ids': tensor([4, 5, 6, 7])}] + >>> output['extra_env_info'] + [{'ground_truth': '1'}, {'ground_truth': '2'}] + >>> output['idx'] + [0, 1] + """ + message_log = [datum_spec["message_log"] for datum_spec in data_batch] + extra_env_info = [datum_spec["extra_env_info"] for datum_spec in data_batch] + idx = [datum_spec["idx"] for datum_spec in data_batch] + + output = BatchedDataDict( + message_log=message_log, + extra_env_info=extra_env_info, + idx=idx, + ) + return output diff --git a/nemo_reinforcer/data/llm_message_utils.py b/nemo_reinforcer/data/llm_message_utils.py index 43e24fc1ce..5ae8bee9a8 100644 --- a/nemo_reinforcer/data/llm_message_utils.py +++ b/nemo_reinforcer/data/llm_message_utils.py @@ -11,10 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict, List, Union - +from typing import Dict, List import torch +from datasets import Dataset from nemo_reinforcer.data.interfaces import ( LLMMessageLogType, @@ -390,3 +390,27 @@ def get_formatted_message_log( prev_formatted_message = formatted_message return message_log + + +def remap_dataset_keys( + dataset: Dataset, + mapping_dict: Dict[str, str], +) -> Dataset: + """Remap dataset keys as per mapping. + + Args: + dataset: The input dataset to remap keys in + mapping_dict: A dictionary mapping input keys to output keys + + Returns: + Dataset: A new dataset with remapped keys + """ + # no need to remap if the keys are already correct + if all(k == v for k, v in mapping_dict.items()): + return dataset + + # return the remapped dataset + return dataset.map( + lambda x: {v: x[k] for k, v in mapping_dict.items()}, + remove_columns=list(mapping_dict.keys()), + ) diff --git a/nemo_reinforcer/evals/eval.py b/nemo_reinforcer/evals/eval.py new file mode 100644 index 0000000000..33d486a4d5 --- /dev/null +++ b/nemo_reinforcer/evals/eval.py @@ -0,0 +1,191 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import os
+from typing import Tuple, TypedDict
+
+import ray
+from torch.utils.data import DataLoader
+from transformers import AutoTokenizer
+
+from nemo_reinforcer.data import MathDataConfig
+from nemo_reinforcer.data.datasets import AllTaskProcessedDataset, eval_collate_fn
+from nemo_reinforcer.data.llm_message_utils import get_keys_from_message_log
+from nemo_reinforcer.distributed.batched_data_dict import BatchedDataDict
+from nemo_reinforcer.distributed.virtual_cluster import ClusterConfig, RayVirtualCluster
+from nemo_reinforcer.environments.math_environment import MathEnvConfig
+from nemo_reinforcer.models.generation.interfaces import GenerationConfig
+from nemo_reinforcer.models.generation.vllm import VllmGeneration
+
+
+# ===============================================================================
+# Configuration
+# ===============================================================================
+
+
+class MasterConfig(TypedDict):
+    generation: GenerationConfig
+    data: MathDataConfig
+    env: MathEnvConfig
+    cluster: ClusterConfig
+
+
+# ===============================================================================
+# Setup & Initialization
+# ===============================================================================
+
+
+def setup(
+    master_config: MasterConfig,
+    tokenizer: AutoTokenizer,
+    dataset: AllTaskProcessedDataset,
+) -> Tuple[
+    VllmGeneration,
+    DataLoader,
+    MasterConfig,
+]:
+    """Set up components for model evaluation.
+
+    Initializes the vLLM model and data loader.
+
+    Args:
+        master_config: Configuration settings.
+        tokenizer: Tokenizer for the model being evaluated.
+        dataset: Dataset to evaluate on.
+
+    Returns:
+        vLLM model, data loader, and config.
+    """
+    # Extract individual configs for easier access
+    generation_config = master_config["generation"]
+    cluster_config = master_config["cluster"]
+
+    # ==========================
+    # Data
+    # ==========================
+    if generation_config["num_prompts_per_step"] == -1:
+        generation_config["num_prompts_per_step"] = len(dataset)
+    dataloader = DataLoader(
+        dataset,
+        batch_size=generation_config["num_prompts_per_step"],
+        shuffle=False,
+        collate_fn=eval_collate_fn,
+    )
+    print(f" ✓ Evaluation dataset loaded with {len(dataset)} samples")
+
+    # ==========================
+    # Cluster
+    # ==========================
+    print("\n▶ Setting up compute cluster...")
+    cluster = RayVirtualCluster(
+        name="eval_cluster",
+        bundle_ct_per_node_list=[cluster_config["gpus_per_node"]]
+        * cluster_config["num_nodes"],
+        use_gpus=True,
+        num_gpus_per_node=cluster_config["gpus_per_node"],
+        max_colocated_worker_groups=1,
+    )
+    print(f" ✓ Ray cluster initialized with {cluster_config['num_nodes']} nodes")
+
+    # ==========================
+    # Model
+    # ==========================
+    print("\n▶ Setting up model...")
+    # check backend
+    backend = generation_config["backend"]
+    assert backend == "vllm", "Only vLLM backend is supported for evaluation"
+
+    # set vllm config
+    generation_config["vllm_cfg"]["load_format"] = "auto"
+    generation_config["vllm_cfg"]["skip_tokenizer_init"] = False
+    generation_config["stop_token_ids"] = [tokenizer.eos_token_id]
+    generation_config["pad_token"] = tokenizer.pad_token_id
+
+    # initialize vllm generation
+    vllm_generation = VllmGeneration(cluster=cluster, config=generation_config)
+    print(
+        f" ✓ Using vLLM backend for generation with {generation_config['model_name']}"
+    )
+
+    print("\n" + "=" * 60)
+    print(" " * 18 + "SETUP COMPLETE")
+    print("=" * 60 + "\n")
+
+    return (
+        vllm_generation,
+        dataloader,
+        master_config,
+    )
+
+
+# 
=============================================================================== +# Evaluation +# =============================================================================== + + +def run_env_eval(vllm_generation, dataloader, env, master_config): + """Main entry point for running evaluation using environment. + + Generates model responses and evaluates them by env. + + Args: + vllm_generation: Model for generating responses. + dataloader: Data loader with evaluation samples. + env: Environment that scores responses. + master_config: Configuration settings. + """ + # Run evaluation loop + score, count = 0.0, 0 + for batch in dataloader: + # get input prompt from message_log + prompts = [] + for message_log in batch["message_log"]: + content = [message["content"] for message in message_log] + content = "\n".join(content) + prompts.append(content) + # generate by vllm + inputs = BatchedDataDict({"prompts": prompts}) + outputs = vllm_generation.generate_text(inputs)["texts"] + + # append to message_log + for idx, output in enumerate(outputs): + batch["message_log"][idx].append( + { + "role": "assistant", + "content": output, + } + ) + + # evaluate generations with the environment + to_env = [ + get_keys_from_message_log(batch["message_log"][i], ["role", "content"]) + for i in range(len(batch["message_log"])) + ] + _, _, rewards, _ = ray.get(env.step.remote(to_env, batch["extra_env_info"])) + + score += rewards.sum().item() + count += len(rewards) + + # Cleanup before printing results + ray.get(env.shutdown.remote()) + vllm_generation.shutdown() + + # Print results + dataset_name = os.path.basename(master_config["data"]["dataset_name"]) + model_name = os.path.basename(master_config["generation"]["model_name"]) + average_score = score / count + + print("\n" + "=" * 60) + print(f"{model_name=} {dataset_name=}") + print(f"score={average_score:.2f} ({score}/{count})") + print("=" * 60 + "\n") diff --git a/nemo_reinforcer/evals/run_env_eval.py b/nemo_reinforcer/evals/run_env_eval.py deleted file mode 100644 index 341a77c5bc..0000000000 --- a/nemo_reinforcer/evals/run_env_eval.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
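To make the prompt construction in `run_env_eval` concrete, here is a small self-contained sketch of the flattening step: each sample's `message_log` becomes a single prompt string by joining message contents with newlines (roles are not re-serialized at this point). The messages below are illustrative.

```python
# Sketch of the per-sample prompt flattening performed in run_env_eval above.
message_log = [
    {"role": "system", "content": "You are a helpful math assistant."},
    {"role": "user", "content": "What is 2 + 2?"},
]

prompt = "\n".join(message["content"] for message in message_log)
print(prompt)
# You are a helpful math assistant.
# What is 2 + 2?
```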
diff --git a/nemo_reinforcer/models/generation/vllm.py b/nemo_reinforcer/models/generation/vllm.py index cb60b7fe8c..2395b65e34 100644 --- a/nemo_reinforcer/models/generation/vllm.py +++ b/nemo_reinforcer/models/generation/vllm.py @@ -200,6 +200,7 @@ def generate( Args: data: BatchedDataDict containing input_ids and input_lengths tensors + greedy: Whether to use greedy decoding instead of sampling Returns: BatchedDataDict conforming to GenerationOutputSpec: @@ -246,7 +247,7 @@ def generate( # Read generation parameters from config top_k = self.cfg["top_k"] if self.cfg["top_k"] is not None else -1 sampling_params = self.SamplingParams( - temperature=self.cfg["temperature"], + temperature=self.cfg["temperature"] if not greedy else 0, top_p=self.cfg["top_p"], top_k=top_k if not greedy @@ -330,6 +331,37 @@ def generate( return return_data + def generate_text( + self, data: BatchedDataDict[GenerationDatumSpec], greedy: bool = False + ) -> BatchedDataDict[GenerationOutputSpec]: + """Generate text responses using vLLM generation. + + Args: + data: BatchedDataDict containing prompts with text strings + greedy: Whether to use greedy decoding instead of sampling + + Returns: + BatchedDataDict containing: + - texts: List of generated text responses + """ + # Read generation parameters from config + top_k = self.cfg["top_k"] if self.cfg["top_k"] is not None else -1 + sampling_params = self.SamplingParams( + temperature=self.cfg["temperature"] if not greedy else 0, + top_p=self.cfg["top_p"], + top_k=top_k if not greedy else 1, + max_tokens=self.cfg["max_new_tokens"], + stop=self.cfg.get("stop_sequences", None), + ) + + # Generate outputs + outputs = self.llm.generate(data["prompts"], sampling_params) + texts = [output.outputs[0].text for output in outputs] + + # Convert to BatchedDataDict + return_data = BatchedDataDict({"texts": texts}) + return return_data + def shutdown(self): """Clean up vLLM resources.""" try: @@ -537,6 +569,42 @@ def generate( return combined + def generate_text( + self, data: BatchedDataDict[GenerationDatumSpec], greedy: bool = False + ) -> BatchedDataDict[GenerationOutputSpec]: + """Generate text responses using vLLM.""" + assert isinstance(data, BatchedDataDict), ( + f"data must be a BatchedDataDict, got type: {type(data)}" + ) + + # Get total batch size + batch_size = len(data["prompts"]) + + # Shard the data across the tied worker groups + sharded_data = data.shard_by_batch_size(self.dp_size, batch_size=batch_size) + future_bundle = self.worker_group.run_all_workers_multiple_data( + "generate_text", + sharded_data, + common_kwargs={"greedy": greedy}, + respect_tied_workers=True, + ) + + # Get results from the workers, respecting tied worker groups (only one result per tied worker group) + results = self.worker_group.get_all_worker_results(future_bundle) + + # Combine results from all tied worker groups + combined = BatchedDataDict.from_batches(results) + + # Verify the output has all required fields + required_keys = ["texts"] + missing_keys = [key for key in required_keys if key not in combined] + if missing_keys: + raise ValueError( + f"Missing required keys for GenerationOutputSpec: {missing_keys}" + ) + + return combined + def prepare_for_generation(self, *args, **kwargs): """Abstract method that must be implemented by subclasses.""" try: diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py index af8b5698e3..8c810e31dc 100644 --- a/tests/unit/models/generation/test_vllm_generation.py +++ 
b/tests/unit/models/generation/test_vllm_generation.py @@ -40,10 +40,14 @@ } -def configure_vllm_with_tokenizer(vllm_config, tokenizer): +def configure_vllm_with_tokenizer(vllm_config, tokenizer, is_eval=False): """Apply tokenizer-specific configurations to vLLM config.""" - vllm_config["vllm_cfg"]["skip_tokenizer_init"] = True - vllm_config["vllm_cfg"]["load_format"] = "dummy" + if is_eval: + vllm_config["vllm_cfg"]["skip_tokenizer_init"] = False + vllm_config["vllm_cfg"]["load_format"] = "auto" + else: + vllm_config["vllm_cfg"]["skip_tokenizer_init"] = True + vllm_config["vllm_cfg"]["load_format"] = "dummy" vllm_config["pad_token"] = tokenizer.pad_token_id vllm_config["stop_token_ids"] = [tokenizer.eos_token_id] return vllm_config @@ -532,3 +536,38 @@ def test_vllm_policy_weight_update(cluster, tokenizer, tensor_parallel_size): # Clean up vllm_policy.shutdown() + + +def test_vllm_generate_text(cluster, tokenizer): + """Test that vLLM can generate text.""" + # Prepare test data + test_prompts = [ + "Hello, my name is", + "The capital of France is", + ] + test_prompts = BatchedDataDict({"prompts": test_prompts}) + + # Create separate configs for each policy + vllm_config = basic_vllm_test_config.copy() + vllm_config = configure_vllm_with_tokenizer(vllm_config, tokenizer, is_eval=True) + + # Ensure we can get same output + assert vllm_config["model_name"] == "meta-llama/Llama-3.2-1B", ( + "Model name should be meta-llama/Llama-3.2-1B to get expected output" + ) + assert vllm_config["vllm_cfg"]["tensor_parallel_size"] == 1, ( + "Tensor parallel size should be 1 to get expected output" + ) + + # Create vLLM generation + vllm_generation = VllmGeneration(cluster, vllm_config) + + # Generate and check result + output = vllm_generation.generate_text(test_prompts, greedy=True) + assert output["texts"] == [ + " Kelsey and I am a 2018 graduate", + " Paris. 
The city is located in the north of",
+    ], "Output should be the same as the expected output"
+
+    # Clean up
+    vllm_generation.shutdown()

From 8062086f60af0ca4ce7990eb8d4ce59ad8b3394c Mon Sep 17 00:00:00 2001
From: Parth Chadha
Date: Wed, 26 Mar 2025 21:03:49 -0700
Subject: [PATCH 15/27] fix: gradient should be averaged instead of summed across mbs (#86)

Signed-off-by: Parth Chadha
Signed-off-by: Yi-Fu Wu
---
 nemo_reinforcer/models/policy/hf_policy.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/nemo_reinforcer/models/policy/hf_policy.py b/nemo_reinforcer/models/policy/hf_policy.py
index ef4ec8460f..bc5d0c4a5d 100644
--- a/nemo_reinforcer/models/policy/hf_policy.py
+++ b/nemo_reinforcer/models/policy/hf_policy.py
@@ -278,6 +278,12 @@ def train(
         for gb_start in range(0, dataset_size, local_gbs):
             self.optimizer.zero_grad()
             mb_losses = []
+
+            # Calculate number of microbatches to process
+            # make_microbatch_iterator assumes that the batch size is a multiple of the microbatch size
+            # so it's safe to not check for the case where the last data slice is smaller than mbs
+            num_microbatches = min(local_gbs, dataset_size - gb_start) // mbs
+
             for mb in data.slice(
                 gb_start, gb_start + local_gbs
             ).make_microbatch_iterator(mbs):
@@ -308,6 +314,9 @@ def train(
                     loss_metrics["lr"] = self.optimizer.param_groups[0]["lr"]
 
                 # Backward pass
+
+                # Loss is accumulated across microbatches, so we need to scale by the number of microbatches
+                loss = loss / num_microbatches
                 if not eval_mode:
                     loss.backward()
                 mb_losses.append(loss.item())
@@ -320,7 +329,7 @@ def train(
                 # Update parameters
                 self.optimizer.step()
                 self.scheduler.step()
-            losses.append(torch.tensor(mb_losses).mean().item())
+            losses.append(torch.tensor(mb_losses).sum().item())
 
         # Compute global loss across all ranks
         with torch.no_grad():

From 1ce29010f9c242b78938046fa82b9e5e8872018a Mon Sep 17 00:00:00 2001
From: Yi-Fu Wu
Date: Mon, 31 Mar 2025 09:15:56 -0700
Subject: [PATCH 16/27] fix: Use separate step_metric for GPU Monitoring (#92)

Signed-off-by: Yi-Fu Wu
Co-authored-by: Terry Kong
Signed-off-by: Yi-Fu Wu
---
 nemo_reinforcer/utils/logger.py |  89 ++++++++--
 tests/unit/conftest.py          |   4 +
 tests/unit/utils/test_logger.py | 304 ++++++++++++++++++++++++++++++--
 3 files changed, 372 insertions(+), 25 deletions(-)

diff --git a/nemo_reinforcer/utils/logger.py b/nemo_reinforcer/utils/logger.py
index 42143e7593..bc0157d564 100644
--- a/nemo_reinforcer/utils/logger.py
+++ b/nemo_reinforcer/utils/logger.py
@@ -68,7 +68,11 @@ class LoggerInterface(ABC):
     @abstractmethod
     def log_metrics(
-        self, metrics: Dict[str, Any], step: int, prefix: Optional[str] = ""
+        self,
+        metrics: Dict[str, Any],
+        step: int,
+        prefix: Optional[str] = "",
+        step_metric: Optional[str] = None,
     ) -> None:
         """Log a dictionary of metrics."""
         pass
@@ -87,7 +91,11 @@ def __init__(self, cfg: TensorboardConfig, log_dir: Optional[str] = None):
         print(f"Initialized TensorboardLogger at {log_dir}")
 
     def log_metrics(
-        self, metrics: Dict[str, Any], step: int, prefix: Optional[str] = ""
+        self,
+        metrics: Dict[str, Any],
+        step: int,
+        prefix: Optional[str] = "",
+        step_metric: Optional[str] = None,  # ignored in TensorBoard
     ) -> None:
         """Log metrics to Tensorboard.
@@ -95,6 +103,7 @@ def log_metrics( metrics: Dict of metrics to log step: Global step value prefix: Optional prefix for metric names + step_metric: Optional step metric name (ignored in TensorBoard) """ for name, value in metrics.items(): if prefix: @@ -120,8 +129,25 @@ def __init__(self, cfg: WandbConfig, log_dir: Optional[str] = None): f"Initialized WandbLogger for project {cfg.get('project')}, run {cfg.get('name')} at {log_dir}" ) + def define_metric( + self, + name: str, + step_metric: Optional[str] = None, + ) -> None: + """Define a metric with custom step metric. + + Args: + name: Name of the metric or pattern (e.g. 'ray/*') + step_metric: Optional name of the step metric to use + """ + self.run.define_metric(name, step_metric=step_metric) + def log_metrics( - self, metrics: Dict[str, Any], step: int, prefix: Optional[str] = "" + self, + metrics: Dict[str, Any], + step: int, + prefix: Optional[str] = "", + step_metric: Optional[str] = None, ) -> None: """Log metrics to wandb. @@ -129,11 +155,21 @@ def log_metrics( metrics: Dict of metrics to log step: Global step value prefix: Optional prefix for metric names + step_metric: Optional name of a field in metrics to use as step instead + of the provided step value """ if prefix: - metrics = {f"{prefix}/{k}": v for k, v in metrics.items()} - - self.run.log(metrics, step=step) + metrics = { + f"{prefix}/{k}" if k != step_metric else k: v + for k, v in metrics.items() + } + + # If step_metric is provided, use the corresponding value from metrics as step + if step_metric and step_metric in metrics: + # commit=False so the step does not get incremented + self.run.log(metrics, commit=False) + else: + self.run.log(metrics, step=step) def log_hyperparams(self, params: Dict[str, Any]) -> None: """Log hyperparameters to wandb. @@ -156,6 +192,8 @@ def __init__( self, collection_interval: int | float, flush_interval: int | float, + metric_prefix: str, + step_metric: str, parent_logger: Optional["Logger"] = None, ): """Initialize the GPU monitor. 
@@ -163,10 +201,13 @@ def __init__( Args: collection_interval: Interval in seconds to collect GPU metrics flush_interval: Interval in seconds to flush metrics to parent logger + step_metric: Name of the field to use as the step metric parent_logger: Logger to receive the collected metrics """ self.collection_interval = collection_interval self.flush_interval = flush_interval + self.metric_prefix = metric_prefix + self.step_metric = step_metric self.parent_logger = parent_logger self.metrics_buffer: list[ GpuMetricSnapshot @@ -425,7 +466,17 @@ def flush(self): for entry in self.metrics_buffer: step = entry["step"] metrics = entry["metrics"] - self.parent_logger.log_metrics(metrics, step, prefix="ray") + + # Add the step metric directly to metrics for use as step_metric + metrics[self.step_metric] = step + + # Pass step_metric as the step_metric to use it as the step value in wandb + self.parent_logger.log_metrics( + metrics, + step=step, + prefix=self.metric_prefix, + step_metric=self.step_metric, + ) # Clear buffer after logging self.metrics_buffer = [] @@ -448,6 +499,7 @@ def __init__(self, cfg: LoggerConfig): - gpu_flush_interval """ self.loggers = [] + self.wandb_logger = None self.base_log_dir = cfg["log_dir"] os.makedirs(self.base_log_dir, exist_ok=True) @@ -455,8 +507,8 @@ def __init__(self, cfg: LoggerConfig): if cfg["wandb_enabled"]: wandb_log_dir = os.path.join(self.base_log_dir, "wandb") os.makedirs(wandb_log_dir, exist_ok=True) - wandb_logger = WandbLogger(cfg["wandb"], log_dir=wandb_log_dir) - self.loggers.append(wandb_logger) + self.wandb_logger = WandbLogger(cfg["wandb"], log_dir=wandb_log_dir) + self.loggers.append(self.wandb_logger) if cfg["tensorboard_enabled"]: tensorboard_log_dir = os.path.join(self.base_log_dir, "tensorboard") @@ -469,9 +521,18 @@ def __init__(self, cfg: LoggerConfig): # Initialize GPU monitoring if requested self.gpu_monitor = None if cfg["monitor_gpus"]: + metric_prefix = "ray" + step_metric = f"{metric_prefix}/ray_step" + if cfg["wandb_enabled"] and self.wandb_logger: + self.wandb_logger.define_metric( + f"{metric_prefix}/*", step_metric=step_metric + ) + self.gpu_monitor = RayGpuMonitorLogger( collection_interval=cfg["gpu_monitoring"]["collection_interval"], flush_interval=cfg["gpu_monitoring"]["flush_interval"], + metric_prefix=metric_prefix, + step_metric=step_metric, parent_logger=self, ) self.gpu_monitor.start() @@ -480,7 +541,11 @@ def __init__(self, cfg: LoggerConfig): print("No loggers initialized") def log_metrics( - self, metrics: Dict[str, Any], step: int, prefix: Optional[str] = "" + self, + metrics: Dict[str, Any], + step: int, + prefix: Optional[str] = "", + step_metric: Optional[str] = None, ) -> None: """Log metrics to all enabled backends. @@ -488,9 +553,11 @@ def log_metrics( metrics: Dict of metrics to log step: Global step value prefix: Optional prefix for metric names + step_metric: Optional name of a field in metrics to use as step instead + of the provided step value (currently only needed for wandb) """ for logger in self.loggers: - logger.log_metrics(metrics, step, prefix) + logger.log_metrics(metrics, step, prefix, step_metric) def log_hyperparams(self, params: Dict[str, Any]) -> None: """Log hyperparameters to all enabled backends. 
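For context on the wandb behavior the logger changes above rely on, here is a minimal sketch using the same two public wandb APIs the diff calls (`define_metric` and `log(..., commit=False)`); the project name and metric values are illustrative.

```python
import wandb

run = wandb.init(project="grpo-dev")

# Metrics matching "ray/*" are plotted against the custom "ray/ray_step"
# field instead of wandb's global step.
run.define_metric("ray/*", step_metric="ray/ray_step")

# A GPU-monitor flush later supplies its own step value; commit=False keeps
# this out-of-band log call from incrementing the global step.
run.log({"ray/node.0.gpu.0.gpu": 75.5, "ray/ray_step": 10}, commit=False)
```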
diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 1dbbd6a169..6056effa57 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -113,6 +113,8 @@ def session_data(request, init_ray_cluster): logger = RayGpuMonitorLogger( collection_interval=float("inf"), flush_interval=float("inf"), + metric_prefix="test", + step_metric="test/step", parent_logger=None, ) unit_test_data["gpu_types"] = list(set(logger._collect_gpu_sku().values())) @@ -209,6 +211,8 @@ def ray_gpu_monitor(init_ray_cluster): gpu_monitor = RayGpuMonitorLogger( collection_interval=1, flush_interval=float("inf"), # Disabling flushing since we will do it manually + metric_prefix="test", + step_metric="test/step", parent_logger=None, ) gpu_monitor.start() diff --git a/tests/unit/utils/test_logger.py b/tests/unit/utils/test_logger.py index dd1c24fb27..ac2ce9cc43 100644 --- a/tests/unit/utils/test_logger.py +++ b/tests/unit/utils/test_logger.py @@ -186,6 +186,67 @@ def test_log_metrics_with_prefix(self, mock_wandb): expected_metrics = {"train/loss": 0.5, "train/accuracy": 0.8} mock_run.log.assert_called_once_with(expected_metrics, step=step) + @patch("nemo_reinforcer.utils.logger.wandb") + def test_log_metrics_with_step_metric(self, mock_wandb): + """Test logging metrics with a step metric to WandbLogger.""" + cfg = {} + logger = WandbLogger(cfg) + + # Define step metric + step_metric = "iteration" + + # Include the step metric in the metrics + metrics = {"loss": 0.5, "accuracy": 0.8, "iteration": 15} + step = 10 # This should be ignored when step_metric is provided + + logger.log_metrics(metrics, step, step_metric=step_metric) + + # Check that log was called with metrics and commit=False + # When using step_metric, step should be ignored and commit=False should be used + mock_run = mock_wandb.init.return_value + mock_run.log.assert_called_once_with(metrics, commit=False) + + @patch("nemo_reinforcer.utils.logger.wandb") + def test_log_metrics_with_prefix_and_step_metric(self, mock_wandb): + """Test logging metrics with both prefix and step metric.""" + cfg = {} + logger = WandbLogger(cfg) + + # Define prefix and step metric + prefix = "train" + step_metric = "train/iteration" + + # Include the step metric in the metrics + metrics = {"loss": 0.5, "accuracy": 0.8, "iteration": 15} + step = 10 # This should be ignored when step_metric is provided + + logger.log_metrics(metrics, step, prefix=prefix, step_metric=step_metric) + + # Check that log was called with prefixed metrics and commit=False + # The step_metric key gets prefixed based on the current implementation + mock_run = mock_wandb.init.return_value + expected_metrics = { + "train/loss": 0.5, + "train/accuracy": 0.8, + "train/iteration": 15, + } + mock_run.log.assert_called_once_with(expected_metrics, commit=False) + + @patch("nemo_reinforcer.utils.logger.wandb") + def test_define_metric(self, mock_wandb): + """Test defining a metric with a custom step metric.""" + cfg = {} + logger = WandbLogger(cfg) + + # Define metric pattern and step metric + logger.define_metric("ray/*", step_metric="ray/ray_step") + + # Check that define_metric was called + mock_run = mock_wandb.init.return_value + mock_run.define_metric.assert_called_once_with( + "ray/*", step_metric="ray/ray_step" + ) + @patch("nemo_reinforcer.utils.logger.wandb") def test_log_hyperparams(self, mock_wandb): """Test logging hyperparameters to WandbLogger.""" @@ -219,11 +280,13 @@ def __init__(self): self.logged_metrics = [] self.logged_steps = [] self.logged_prefixes = [] + 
self.logged_step_metrics = [] - def log_metrics(self, metrics, step, prefix=""): + def log_metrics(self, metrics, step, prefix="", step_metric=None): self.logged_metrics.append(metrics) self.logged_steps.append(step) self.logged_prefixes.append(prefix) + self.logged_step_metrics.append(step_metric) return MockLogger() @@ -235,12 +298,18 @@ def test_init(self, mock_ray): # Initialize the monitor with standard settings monitor = RayGpuMonitorLogger( - collection_interval=10.0, flush_interval=60.0, parent_logger=None + collection_interval=10.0, + flush_interval=60.0, + metric_prefix="test", + step_metric="test/step", + parent_logger=None, ) # Verify initialization parameters assert monitor.collection_interval == 10.0 assert monitor.flush_interval == 60.0 + assert monitor.metric_prefix == "test" + assert monitor.step_metric == "test/step" assert monitor.parent_logger is None assert monitor.metrics_buffer == [] assert monitor.is_running is False @@ -255,7 +324,11 @@ def test_start(self, mock_thread, mock_ray): # Initialize the monitor monitor = RayGpuMonitorLogger( - collection_interval=10.0, flush_interval=60.0, parent_logger=None + collection_interval=10.0, + flush_interval=60.0, + metric_prefix="test", + step_metric="test/step", + parent_logger=None, ) # Start the monitor @@ -277,7 +350,11 @@ def test_start_ray_not_initialized(self, mock_ray): # Initialize the monitor monitor = RayGpuMonitorLogger( - collection_interval=10.0, flush_interval=60.0, parent_logger=None + collection_interval=10.0, + flush_interval=60.0, + metric_prefix="test", + step_metric="test/step", + parent_logger=None, ) # Starting should raise a ValueError @@ -293,7 +370,11 @@ def test_stop(self, mock_thread, mock_ray): # Initialize the monitor monitor = RayGpuMonitorLogger( - collection_interval=10.0, flush_interval=60.0, parent_logger=None + collection_interval=10.0, + flush_interval=60.0, + metric_prefix="test", + step_metric="test/step", + parent_logger=None, ) # Start the monitor @@ -318,7 +399,11 @@ def test_parse_gpu_metric(self, mock_ray): # Initialize the monitor monitor = RayGpuMonitorLogger( - collection_interval=10.0, flush_interval=60.0, parent_logger=None + collection_interval=10.0, + flush_interval=60.0, + metric_prefix="test", + step_metric="test/step", + parent_logger=None, ) # Create a sample with GPU utilization metric @@ -405,7 +490,11 @@ def test_fetch_and_parse_metrics(self, mock_get, mock_ray): # Initialize the monitor monitor = RayGpuMonitorLogger( - collection_interval=10.0, flush_interval=60.0, parent_logger=None + collection_interval=10.0, + flush_interval=60.0, + metric_prefix="test", + step_metric="test/step", + parent_logger=None, ) # Mock the _parse_gpu_metric method to return expected values @@ -447,7 +536,11 @@ def test_collect_metrics(self, mock_ray): # Initialize the monitor monitor = RayGpuMonitorLogger( - collection_interval=10.0, flush_interval=60.0, parent_logger=None + collection_interval=10.0, + flush_interval=60.0, + metric_prefix="test", + step_metric="test/step", + parent_logger=None, ) # Mock the _fetch_and_parse_metrics method @@ -483,6 +576,8 @@ def test_flush_empty_buffer(self, mock_ray, mock_parent_logger): monitor = RayGpuMonitorLogger( collection_interval=10.0, flush_interval=60.0, + metric_prefix="ray", + step_metric="ray/ray_step", parent_logger=mock_parent_logger, ) @@ -502,6 +597,8 @@ def test_flush(self, mock_ray, mock_parent_logger): monitor = RayGpuMonitorLogger( collection_interval=10.0, flush_interval=60.0, + metric_prefix="ray", + step_metric="ray/ray_step", 
parent_logger=mock_parent_logger, ) @@ -522,23 +619,68 @@ def test_flush(self, mock_ray, mock_parent_logger): # Verify parent logger's log_metrics was called for each entry assert len(mock_parent_logger.logged_metrics) == 2 - assert mock_parent_logger.logged_metrics[0] == { + + # First metrics entry should include the step metric + expected_first_metrics = { "node.0.gpu.0.gpu": 75.5, "node.0.gpu.0.memory": 4096.0, + "ray/ray_step": 10, # Step metric added } + assert mock_parent_logger.logged_metrics[0] == expected_first_metrics assert mock_parent_logger.logged_steps[0] == 10 assert mock_parent_logger.logged_prefixes[0] == "ray" + assert mock_parent_logger.logged_step_metrics[0] == "ray/ray_step" - assert mock_parent_logger.logged_metrics[1] == { + # Second metrics entry should include the step metric + expected_second_metrics = { "node.0.gpu.0.gpu": 80.0, "node.0.gpu.0.memory": 5120.0, + "ray/ray_step": 20, # Step metric added } + assert mock_parent_logger.logged_metrics[1] == expected_second_metrics assert mock_parent_logger.logged_steps[1] == 20 assert mock_parent_logger.logged_prefixes[1] == "ray" + assert mock_parent_logger.logged_step_metrics[1] == "ray/ray_step" # Verify buffer was cleared assert monitor.metrics_buffer == [] + @patch("nemo_reinforcer.utils.logger.ray") + def test_flush_with_custom_prefix(self, mock_ray, mock_parent_logger): + """Test flush method with custom metric prefix.""" + # Mock ray.is_initialized to return True + mock_ray.is_initialized.return_value = True + + # Initialize the monitor with parent logger and custom prefix + custom_prefix = "custom_metrics" + custom_step_metric = "custom_metrics/step" + monitor = RayGpuMonitorLogger( + collection_interval=10.0, + flush_interval=60.0, + metric_prefix=custom_prefix, + step_metric=custom_step_metric, + parent_logger=mock_parent_logger, + ) + + # Add test metrics to buffer + monitor.metrics_buffer = [ + { + "step": 15, + "metrics": {"node.0.gpu.0.gpu": 60.0}, + } + ] + + # Call flush + monitor.flush() + + # Verify parent logger's log_metrics was called with the custom prefix + assert len(mock_parent_logger.logged_metrics) == 1 + expected_metrics = {"node.0.gpu.0.gpu": 60.0, "custom_metrics/step": 15} + assert mock_parent_logger.logged_metrics[0] == expected_metrics + assert mock_parent_logger.logged_steps[0] == 15 + assert mock_parent_logger.logged_prefixes[0] == custom_prefix + assert mock_parent_logger.logged_step_metrics[0] == custom_step_metric + @patch("nemo_reinforcer.utils.logger.ray") @patch("nemo_reinforcer.utils.logger.time") def test_collection_loop(self, mock_time, mock_ray): @@ -556,7 +698,11 @@ def test_collection_loop(self, mock_time, mock_ray): # Initialize the monitor monitor = RayGpuMonitorLogger( - collection_interval=10.0, flush_interval=60.0, parent_logger=None + collection_interval=10.0, + flush_interval=60.0, + metric_prefix="test", + step_metric="test/step", + parent_logger=None, ) # Set start time and running flag @@ -593,6 +739,89 @@ def side_effect(): # Verify flush was called (flush_interval elapsed) mock_flush.assert_called_once() + @patch("nemo_reinforcer.utils.logger.WandbLogger") + @patch("nemo_reinforcer.utils.logger.TensorboardLogger") + @patch("nemo_reinforcer.utils.logger.RayGpuMonitorLogger") + def test_init_with_gpu_monitoring( + self, mock_gpu_monitor, mock_tb_logger, mock_wandb_logger, temp_dir + ): + """Test initialization with GPU monitoring enabled.""" + cfg = { + "wandb_enabled": True, + "tensorboard_enabled": True, + "monitor_gpus": True, + "gpu_monitoring": { + 
"collection_interval": 15.0, + "flush_interval": 45.0, + }, + "wandb": {"project": "test-project"}, + "tensorboard": {"log_dir": "test_logs"}, + "log_dir": temp_dir, + } + logger = Logger(cfg) + + # Check that regular loggers were initialized + assert len(logger.loggers) == 2 + mock_wandb_logger.assert_called_once() + mock_tb_logger.assert_called_once() + + # Check that GPU monitor was initialized with correct parameters + mock_gpu_monitor.assert_called_once_with( + collection_interval=15.0, + flush_interval=45.0, + metric_prefix="ray", + step_metric="ray/ray_step", + parent_logger=logger, + ) + + # Check that GPU monitor was started + mock_gpu_instance = mock_gpu_monitor.return_value + mock_gpu_instance.start.assert_called_once() + + # Check that wandb metrics are defined with the step metric + mock_wandb_instance = mock_wandb_logger.return_value + mock_wandb_instance.define_metric.assert_called_once_with( + "ray/*", step_metric="ray/ray_step" + ) + + @patch("nemo_reinforcer.utils.logger.WandbLogger") + @patch("nemo_reinforcer.utils.logger.TensorboardLogger") + @patch("nemo_reinforcer.utils.logger.RayGpuMonitorLogger") + def test_gpu_monitoring_without_wandb( + self, mock_gpu_monitor, mock_tb_logger, mock_wandb_logger, temp_dir + ): + """Test GPU monitoring initialization when wandb is disabled.""" + cfg = { + "wandb_enabled": False, + "tensorboard_enabled": True, + "monitor_gpus": True, + "gpu_monitoring": { + "collection_interval": 15.0, + "flush_interval": 45.0, + }, + "tensorboard": {"log_dir": "test_logs"}, + "log_dir": temp_dir, + } + logger = Logger(cfg) + + # Check that only tensorboard logger was initialized + assert len(logger.loggers) == 1 + mock_wandb_logger.assert_not_called() + mock_tb_logger.assert_called_once() + + # Check that GPU monitor was initialized with correct parameters + mock_gpu_monitor.assert_called_once_with( + collection_interval=15.0, + flush_interval=45.0, + metric_prefix="ray", + step_metric="ray/ray_step", + parent_logger=logger, + ) + + # Since wandb is disabled, define_metric should not be called + mock_wandb_instance = mock_wandb_logger.return_value + assert not mock_wandb_instance.define_metric.called + class TestLogger: """Test the main Logger class.""" @@ -704,8 +933,8 @@ def test_log_metrics(self, mock_tb_logger, mock_wandb_logger, temp_dir): logger.log_metrics(metrics, step) # Check that log_metrics was called on both loggers - mock_wandb_instance.log_metrics.assert_called_once_with(metrics, step, "") - mock_tb_instance.log_metrics.assert_called_once_with(metrics, step, "") + mock_wandb_instance.log_metrics.assert_called_once_with(metrics, step, "", None) + mock_tb_instance.log_metrics.assert_called_once_with(metrics, step, "", None) @patch("nemo_reinforcer.utils.logger.WandbLogger") @patch("nemo_reinforcer.utils.logger.TensorboardLogger") @@ -760,9 +989,56 @@ def test_init_with_gpu_monitoring( # Check that GPU monitor was initialized with correct parameters mock_gpu_monitor.assert_called_once_with( - collection_interval=15.0, flush_interval=45.0, parent_logger=logger + collection_interval=15.0, + flush_interval=45.0, + metric_prefix="ray", + step_metric="ray/ray_step", + parent_logger=logger, ) # Check that GPU monitor was started mock_gpu_instance = mock_gpu_monitor.return_value mock_gpu_instance.start.assert_called_once() + + # Check that wandb metrics are defined with the step metric + mock_wandb_instance = mock_wandb_logger.return_value + mock_wandb_instance.define_metric.assert_called_once_with( + "ray/*", step_metric="ray/ray_step" + ) + + 
@patch("nemo_reinforcer.utils.logger.WandbLogger")
+    @patch("nemo_reinforcer.utils.logger.TensorboardLogger")
+    def test_log_metrics_with_prefix_and_step_metric(
+        self, mock_tb_logger, mock_wandb_logger, temp_dir
+    ):
+        """Test logging metrics with prefix and step_metric."""
+        cfg = {
+            "wandb_enabled": True,
+            "tensorboard_enabled": True,
+            "monitor_gpus": False,
+            "wandb": {"project": "test-project"},
+            "tensorboard": {"log_dir": "test_logs"},
+            "log_dir": temp_dir,
+        }
+        logger = Logger(cfg)
+
+        # Create mock logger instances
+        mock_wandb_instance = mock_wandb_logger.return_value
+        mock_tb_instance = mock_tb_logger.return_value
+
+        # Create metrics with a step metric field
+        metrics = {"loss": 0.5, "accuracy": 0.8, "iteration": 15}
+        step = 10
+        prefix = "train"
+        step_metric = "iteration"
+
+        # Log metrics with prefix and step_metric
+        logger.log_metrics(metrics, step, prefix=prefix, step_metric=step_metric)
+
+        # Check that log_metrics was called on both loggers with correct parameters
+        mock_wandb_instance.log_metrics.assert_called_once_with(
+            metrics, step, prefix, step_metric
+        )
+        mock_tb_instance.log_metrics.assert_called_once_with(
+            metrics, step, prefix, step_metric
+        )

From e93d6979821fedee49da74f3165e9189008f2118 Mon Sep 17 00:00:00 2001
From: Anna Shors
Date: Mon, 31 Mar 2025 11:30:57 -0700
Subject: [PATCH 17/27] feat: Update sft config to use single GPU (#90)

Signed-off-by: ashors1
Signed-off-by: Oleksii Kuchaiev
Co-authored-by: Oleksii Kuchaiev
Co-authored-by: Terry Kong
Signed-off-by: Yi-Fu Wu
---
 README.md                 | 14 ++++++++------
 examples/configs/sft.yaml | 27 ++++++++++-----------------
 2 files changed, 18 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index ded81200f7..aeab19f33c 100644
--- a/README.md
+++ b/README.md
@@ -60,20 +60,22 @@ We provide a sample SFT experiment that uses the [SQuAD dataset](https://rajpurk
 
 #### Single Node
 
-The experiment is set up to run on 8 GPUs. If using a machine that has access to 8 GPUs, you can launch the experiment as follows:
+The default SFT experiment is configured to run on a single GPU. To launch the experiment, run:
 
 ```sh
 uv run python examples/run_sft.py
 ```
 
-This trains `Llama3.1-8B` on 8 GPUs. To run on a single GPU, we'll have to override a few of the experiment settings. We replace the 8B model with a smaller 1B model, decrease the batch size, and update the cluster configuration to use a single gpu:
+This trains `Llama3.2-1B` on one GPU using SQUAD dataset.
+
+If you have access to more GPUs, you can update the experiment accordingly. To run on 8 GPUs, we update the cluster configuration. We also switch to an 8B Llama base model and increase the batch size:
 
 ```sh
 uv run python examples/run_sft.py \
-    policy.model_name="meta-llama/Llama-3.2-1B" \
-    policy.train_global_batch_size=16 \
-    sft.val_global_batch_size=16 \
-    cluster.gpus_per_node=1
+    policy.model_name="meta-llama/Meta-Llama-3-8B" \
+    policy.train_global_batch_size=128 \
+    sft.val_global_batch_size=128 \
+    cluster.gpus_per_node=8
 ```
 
 Refer to [sft.yaml](examples/configs/sft.yaml) for a full list of parameters that can be overridden.
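For context on how these dotted-path overrides are applied: at this point in the series the example scripts merge them as plain OmegaConf dotlist entries (a later patch in this series, #80, replaces that with Hydra-style override parsing). Below is a minimal sketch of the dotlist mechanism; the config values are illustrative stand-ins for sft.yaml, not loaded from it:

```python
from omegaconf import OmegaConf

# Stand-in for the loaded sft.yaml config (values illustrative).
base = OmegaConf.create(
    {
        "policy": {"model_name": "meta-llama/Llama-3.2-1B", "train_global_batch_size": 32},
        "sft": {"val_global_batch_size": 32},
        "cluster": {"gpus_per_node": 1, "num_nodes": 1},
    }
)

# Each CLI token such as "cluster.gpus_per_node=8" becomes one dotlist entry.
overrides = OmegaConf.from_dotlist(
    [
        "policy.model_name=meta-llama/Meta-Llama-3-8B",
        "policy.train_global_batch_size=128",
        "sft.val_global_batch_size=128",
        "cluster.gpus_per_node=8",
    ]
)

# Later keys win, so the merged config reflects the CLI overrides.
merged = OmegaConf.merge(base, overrides)
assert merged.cluster.gpus_per_node == 8
assert merged.policy.model_name == "meta-llama/Meta-Llama-3-8B"
```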
diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml index 8bb0c7615b..c028e83dc2 100644 --- a/examples/configs/sft.yaml +++ b/examples/configs/sft.yaml @@ -1,9 +1,9 @@ # SFT Algorithm Configuration sft: - max_num_steps: 1000 + max_num_steps: 60 val_period: 10 val_batches: 8 - val_global_batch_size: 128 + val_global_batch_size: 32 val_micro_batch_size: 1 val_at_start: true seed: 42 @@ -17,10 +17,10 @@ checkpointing: save_period: 10 policy: - model_name: "meta-llama/Meta-Llama-3-8B" - train_global_batch_size: 128 + model_name: "meta-llama/Llama-3.2-1B" + train_global_batch_size: 32 train_micro_batch_size: 1 - max_total_sequence_length: 2048 + max_total_sequence_length: 1024 precision: "float32" expandable_segments_enabled: false fsdp_offload_enabled: false @@ -32,13 +32,6 @@ policy: weight_decay: 0.1 betas: [0.9, 0.98] eps: 1e-5 - - scheduler: - name: "torch.optim.lr_scheduler.LinearLR" - kwargs: - start_factor: 0.0196078 - end_factor: 1.0 - total_iters: 50 data: max_input_seq_length: ${policy.max_total_sequence_length} @@ -46,18 +39,18 @@ data: logger: log_dir: "logs" # Base directory for all logs - wandb_enabled: false - tensorboard_enabled: false + wandb_enabled: true # Make sure you do ``wandb login [Your API key]'' before run + tensorboard_enabled: true monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard wandb: project: "sft-dev" - name: "sft-dev-logger" + name: "sft-dev-${data.dataset_name}" tensorboard: - log_dir: "tb_logs" + log_dir: "tb_logs-sft-dev-${data.dataset_name}" gpu_monitoring: collection_interval: 10 # How often to collect GPU usage metrics (in seconds) flush_interval: 10 # How often to flush GPU usage metrics to the loggers (in seconds) cluster: - gpus_per_node: 8 + gpus_per_node: 1 num_nodes: 1 From c0aa9896dbe694046e65d90fb2acefb4819dd1bb Mon Sep 17 00:00:00 2001 From: Sahil Jain <48468750+SahilJain314@users.noreply.github.com> Date: Mon, 31 Mar 2025 11:42:48 -0700 Subject: [PATCH 18/27] fix: Grammar nit (#98) Signed-off-by: Sahil Jain Signed-off-by: Yi-Fu Wu --- README.md | 2 +- examples/configs/sft.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index aeab19f33c..4b0b5cbba9 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,7 @@ The default SFT experiment is configured to run on a single GPU. To launch the e uv run python examples/run_sft.py ``` -This trains `Llama3.2-1B` on one GPU using SQUAD dataset. +This trains `Llama3.2-1B` on one GPU using the SQUAD dataset. If you have access to more GPUs, you can update the experiment accordingly. To run on 8 GPUs, we update the cluster configuration. 
We also switch to an 8B Llama base model and increase the batch size: diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml index c028e83dc2..3b20e4cc93 100644 --- a/examples/configs/sft.yaml +++ b/examples/configs/sft.yaml @@ -39,7 +39,7 @@ data: logger: log_dir: "logs" # Base directory for all logs - wandb_enabled: true # Make sure you do ``wandb login [Your API key]'' before run + wandb_enabled: true # Make sure you do a ``wandb login [Your API key]'' before running tensorboard_enabled: true monitor_gpus: false # If true, will monitor GPU usage and log to wandb and/or tensorboard wandb: From e27b5fd6b7a341aec09bf3e0e2b4efd0eddd1293 Mon Sep 17 00:00:00 2001 From: Parth Chadha Date: Mon, 31 Mar 2025 12:11:37 -0700 Subject: [PATCH 19/27] =?UTF-8?q?feat:=20add=20capability=20to=20set=20min?= =?UTF-8?q?/max=20eps=20separately=20as=20proposed=20in=20the=E2=80=A6=20(?= =?UTF-8?q?#95)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Parth Chadha Co-authored-by: Sahil Jain <48468750+SahilJain314@users.noreply.github.com> Signed-off-by: Yi-Fu Wu --- .gitignore | 1 + examples/configs/grpo_math_1B.yaml | 3 ++- nemo_reinforcer/algorithms/loss_functions.py | 14 +++++++++++--- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 79a00631e6..0d7a81c424 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,4 @@ docker/ wandb/ checkpoints/ results/ +code_snapshots/ diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml index f15031c536..a7cb644114 100644 --- a/examples/configs/grpo_math_1B.yaml +++ b/examples/configs/grpo_math_1B.yaml @@ -12,7 +12,8 @@ grpo: loss_fn: reference_policy_kl_penalty: 0.01 - ratio_eps: 0.2 + ratio_eps_min: 0.2 + ratio_eps_max: 0.2 checkpointing: enabled: true diff --git a/nemo_reinforcer/algorithms/loss_functions.py b/nemo_reinforcer/algorithms/loss_functions.py index 8504dac007..6a5d6b593d 100644 --- a/nemo_reinforcer/algorithms/loss_functions.py +++ b/nemo_reinforcer/algorithms/loss_functions.py @@ -25,7 +25,8 @@ class ClippedPGLossConfig(TypedDict): reference_policy_kl_penalty: float - ratio_eps: float + ratio_eps_min: float + ratio_eps_max: float class ClippedPGLossDataDict(TypedDict): @@ -57,6 +58,10 @@ class ClippedPGLossFn(LossFunction): - r_t(θ) = π_θ(a_t|s_t) / π_θ_old(a_t|s_t) is the probability ratio - A_t is the advantage estimate - ε is the clip parameter (ratio_eps) + - As proposed in the DAPO paper (https://arxiv.org/pdf/2503.14476), + we allow setting a distinct minimum and maximum value for the clip parameter (set to the same value for PPO/GRPO/etc.) + - ratio_eps_min: minimum value for the clip parameter + - ratio_eps_max: maximum value for the clip parameter - β is the KL penalty coefficient (reference_policy_kl_penalty) - KL(π_θ || π_ref) is the KL divergence between the current policy and reference policy (Schulman Approx.) @@ -65,7 +70,8 @@ class ClippedPGLossFn(LossFunction): """ def __init__(self, cfg: ClippedPGLossConfig): - self.ratio_eps = cfg["ratio_eps"] + self.ratio_eps_min = cfg["ratio_eps_min"] + self.ratio_eps_max = cfg["ratio_eps_max"] self.reference_policy_kl_penalty = cfg["reference_policy_kl_penalty"] self.disable_ppo_ratio = cfg.get("disable_ppo_ratio", False) @@ -108,7 +114,9 @@ def __call__( # Calculate clipped loss function if ppo ratio is enabled. 
if not self.disable_ppo_ratio: ratios = (curr_logprobs - prev_logprobs).exp() - ratios_clamped = ratios.clamp(1.0 - self.ratio_eps, 1.0 + self.ratio_eps) + ratios_clamped = ratios.clamp( + 1.0 - self.ratio_eps_min, 1.0 + self.ratio_eps_max + ) else: ratios = curr_logprobs ratios_clamped = curr_logprobs From a7b2e2bd78088596f9794b4281f95bed12794e43 Mon Sep 17 00:00:00 2001 From: Zhaocheng Zhu Date: Mon, 31 Mar 2025 14:55:45 -0700 Subject: [PATCH 20/27] fix: change format messages to out of place (#77) Signed-off-by: KiddoZhu Signed-off-by: Sahil Jain Co-authored-by: Parth Chadha Co-authored-by: Sahil Jain <48468750+SahilJain314@users.noreply.github.com> Signed-off-by: Yi-Fu Wu --- nemo_reinforcer/algorithms/loss_functions.py | 2 +- nemo_reinforcer/data/llm_message_utils.py | 15 +++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/nemo_reinforcer/algorithms/loss_functions.py b/nemo_reinforcer/algorithms/loss_functions.py index 6a5d6b593d..158c9824eb 100644 --- a/nemo_reinforcer/algorithms/loss_functions.py +++ b/nemo_reinforcer/algorithms/loss_functions.py @@ -58,7 +58,7 @@ class ClippedPGLossFn(LossFunction): - r_t(θ) = π_θ(a_t|s_t) / π_θ_old(a_t|s_t) is the probability ratio - A_t is the advantage estimate - ε is the clip parameter (ratio_eps) - - As proposed in the DAPO paper (https://arxiv.org/pdf/2503.14476), + - As proposed in the DAPO paper (https://arxiv.org/pdf/2503.14476), we allow setting a distinct minimum and maximum value for the clip parameter (set to the same value for PPO/GRPO/etc.) - ratio_eps_min: minimum value for the clip parameter - ratio_eps_max: maximum value for the clip parameter diff --git a/nemo_reinforcer/data/llm_message_utils.py b/nemo_reinforcer/data/llm_message_utils.py index 5ae8bee9a8..f6cb3c8079 100644 --- a/nemo_reinforcer/data/llm_message_utils.py +++ b/nemo_reinforcer/data/llm_message_utils.py @@ -353,14 +353,13 @@ def get_formatted_message_log( Returns: The message log with updated 'token_ids' and 'content' fields. 
""" - cu_message = [] + new_message_log = [] prev_formatted_message = "" template = task_data_spec.custom_template for i, message in enumerate(message_log): - cu_message.append(message.copy()) formatted_message = tokenizer.apply_chat_template( - cu_message, + message_log[: i + 1], chat_template=template, add_generation_prompt=False, tokenize=False, @@ -383,13 +382,17 @@ def get_formatted_message_log( message_chunk = message_chunk.rstrip("\n") if not message_chunk.endswith(tokenizer.eos_token): message_chunk += tokenizer.eos_token - message["token_ids"] = tokenizer( + + new_message = message.copy() + new_message["token_ids"] = tokenizer( message_chunk, return_tensors="pt", add_special_tokens=False )["input_ids"][0] - message["content"] = message_chunk + new_message["content"] = message_chunk + new_message_log.append(new_message) + prev_formatted_message = formatted_message - return message_log + return new_message_log def remap_dataset_keys( From b17069c70a2f7419de75e5d4aed17185b2476424 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Mon, 31 Mar 2025 19:03:54 -0700 Subject: [PATCH 21/27] fix: correct version and use setuptools.dynamic metadata for version/readme (#104) Signed-off-by: Terry Kong Signed-off-by: Yi-Fu Wu --- nemo_reinforcer/package_info.py | 4 ++-- pyproject.toml | 10 ++++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/nemo_reinforcer/package_info.py b/nemo_reinforcer/package_info.py index ea2977b049..8ca9d4a97b 100644 --- a/nemo_reinforcer/package_info.py +++ b/nemo_reinforcer/package_info.py @@ -14,9 +14,9 @@ MAJOR = 0 -MINOR = 1 +MINOR = 2 PATCH = 0 -PRE_RELEASE = "" +PRE_RELEASE = "dev" # Use the following formatting: (major, minor, patch, pre-release) VERSION = (MAJOR, MINOR, PATCH, PRE_RELEASE) diff --git a/pyproject.toml b/pyproject.toml index eddc57a15b..e1e538304e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,9 +4,11 @@ build-backend = "setuptools.build_meta" [project] name = "nemo-reinforcer" -version = "0.0.1" +dynamic = [ + "version", + "readme", +] description = "Nemo-Reinforcer: A Scalable and Efficient Post-Training Library for Models Ranging from 1 GPU to 1000s, and from Tiny to >100B Parameters" -readme = "README.md" requires-python = ">=3.10" license = {text = "Apache 2.0"} dependencies = [ @@ -29,6 +31,10 @@ dependencies = [ [tool.setuptools] packages = ["nemo_reinforcer"] +[tool.setuptools.dynamic] +version = {attr = "nemo_reinforcer.__version__"} # any module attribute compatible with ast.literal_eval +readme = {file = "README.md", content-type = "text/markdown"} + [project.optional-dependencies] build = [ "torch==2.6.0", From 6f2c31e7cb653e90c944744c70aac9be9e56ce47 Mon Sep 17 00:00:00 2001 From: Parth Chadha Date: Tue, 1 Apr 2025 07:01:24 -0700 Subject: [PATCH 22/27] =?UTF-8?q?fix:=20remove=20usage=20of=20vllm=20to=20?= =?UTF-8?q?get=20device=20uuid=20and=20instead=20use=20nvidia-m=E2=80=A6?= =?UTF-8?q?=20(#105)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Parth Chadha Signed-off-by: Yi-Fu Wu --- nemo_reinforcer/models/generation/vllm.py | 4 +- .../models/generation/vllm_backend.py | 5 +- nemo_reinforcer/models/policy/hf_policy.py | 17 +++-- nemo_reinforcer/utils/nvml.py | 66 +++++++++++++++++++ pyproject.toml | 1 + tests/unit/utils/test_pynvml.py | 61 +++++++++++++++++ 6 files changed, 143 insertions(+), 11 deletions(-) create mode 100644 nemo_reinforcer/utils/nvml.py create mode 100644 tests/unit/utils/test_pynvml.py diff --git 
a/nemo_reinforcer/models/generation/vllm.py b/nemo_reinforcer/models/generation/vllm.py index 2395b65e34..edef3c69ae 100644 --- a/nemo_reinforcer/models/generation/vllm.py +++ b/nemo_reinforcer/models/generation/vllm.py @@ -97,6 +97,7 @@ def configure_worker( # Force vllm to use v0 runtime (will be enabled by default in #51) env_vars["VLLM_USE_V1"] = "0" + return resources, env_vars, init_kwargs def __init__( @@ -379,9 +380,6 @@ def shutdown(self): return False def report_device_id(self) -> str: - # from vllm.platforms import current_platform - # self.device_uuid = current_platform.get_device_uuid(self.rank) - # return self.device_uuid return self.llm.collective_rpc("report_device_id", args=tuple())[0] def update_weights_from_ipc_handles(self, ipc_handles): diff --git a/nemo_reinforcer/models/generation/vllm_backend.py b/nemo_reinforcer/models/generation/vllm_backend.py index a8ded9b519..3507d7dc55 100644 --- a/nemo_reinforcer/models/generation/vllm_backend.py +++ b/nemo_reinforcer/models/generation/vllm_backend.py @@ -25,10 +25,9 @@ class UpdatableVllmInternalWorker(Worker): def report_device_id(self) -> str: - from vllm.platforms import current_platform + from nemo_reinforcer.utils.nvml import get_device_uuid - self.device_uuid = current_platform.get_device_uuid(self.device.index) - return self.device_uuid + return get_device_uuid(self.device.index) def update_weights_from_ipc_handles(self, ipc_handles): """Update weights from IPC handles. diff --git a/nemo_reinforcer/models/policy/hf_policy.py b/nemo_reinforcer/models/policy/hf_policy.py index bc5d0c4a5d..8d0f67f230 100644 --- a/nemo_reinforcer/models/policy/hf_policy.py +++ b/nemo_reinforcer/models/policy/hf_policy.py @@ -696,10 +696,17 @@ def zero_out_weights(self): torch.cuda.synchronize() def report_device_id(self) -> str: - from vllm.platforms import current_platform + """Report the UUID of the current CUDA device using NVML. - self.device_uuid = current_platform.get_device_uuid(torch.cuda.current_device()) - return self.device_uuid + Returns: + str: UUID of the device in the format "GPU-xxxxx" + """ + from nemo_reinforcer.utils.nvml import get_device_uuid + + # Get current device index from torch + device_idx = torch.cuda.current_device() + # Get device UUID using NVML + return get_device_uuid(device_idx) @torch.no_grad() def get_weight_ipc_handles(self, offload_model=True): @@ -718,7 +725,7 @@ def get_weight_ipc_handles(self, offload_model=True): params = dtype_params self._held_reference_model_params = params data = {} - self.device_uuid = self.report_device_id() + device_uuid = self.report_device_id() for name, p in params.items(): data[name] = reduce_tensor(p.detach()) @@ -726,7 +733,7 @@ def get_weight_ipc_handles(self, offload_model=True): self.model = self.manual_offload_to_cpu(self.model) gc.collect() torch.cuda.empty_cache() - return {self.device_uuid: data} + return {device_uuid: data} def prepare_for_lp_inference(self): self.model = self.manual_load_to_gpu(self.model) diff --git a/nemo_reinforcer/utils/nvml.py b/nemo_reinforcer/utils/nvml.py new file mode 100644 index 0000000000..2f684effef --- /dev/null +++ b/nemo_reinforcer/utils/nvml.py @@ -0,0 +1,66 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import contextlib
+import os
+import pynvml
+
+
+@contextlib.contextmanager
+def nvml_context():
+    """Context manager for NVML initialization and shutdown.
+
+    Raises:
+        RuntimeError: If NVML initialization or a subsequent NVML call fails.
+    """
+    try:
+        pynvml.nvmlInit()
+        yield
+    except pynvml.NVMLError as e:
+        raise RuntimeError(f"NVML error: {e}") from e
+    finally:
+        try:
+            pynvml.nvmlShutdown()
+        except pynvml.NVMLError:
+            pass
+
+
+def device_id_to_physical_device_id(device_id: int) -> int:
+    """Convert a logical device ID to a physical device ID considering CUDA_VISIBLE_DEVICES."""
+    if "CUDA_VISIBLE_DEVICES" in os.environ:
+        device_ids = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
+        try:
+            physical_device_id = int(device_ids[device_id])
+            return physical_device_id
+        except (ValueError, IndexError):
+            raise RuntimeError(
+                f"Failed to convert logical device ID {device_id} to physical device ID. Available devices are: {device_ids}."
+            )
+    else:
+        return device_id
+
+
+def get_device_uuid(device_idx: int) -> str:
+    """Get the UUID of a CUDA device using NVML."""
+    # Convert logical device index to physical device index
+    global_device_idx = device_id_to_physical_device_id(device_idx)
+
+    # Get the device handle and UUID
+    with nvml_context():
+        try:
+            handle = pynvml.nvmlDeviceGetHandleByIndex(global_device_idx)
+            return pynvml.nvmlDeviceGetUUID(handle)
+        except pynvml.NVMLError as e:
+            raise RuntimeError(
+                f"Failed to get device UUID for device {device_idx} (global index: {global_device_idx}): {e}"
+            )
diff --git a/pyproject.toml b/pyproject.toml
index e1e538304e..8397211ce6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,6 +26,7 @@ dependencies = [
     "omegaconf",
     "torchdata",
     "vllm==0.8.0",
+    "nvidia-ml-py",
 ]
 
 [tool.setuptools]
diff --git a/tests/unit/utils/test_pynvml.py b/tests/unit/utils/test_pynvml.py
new file mode 100644
index 0000000000..cf7044654f
--- /dev/null
+++ b/tests/unit/utils/test_pynvml.py
@@ -0,0 +1,61 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os +from unittest.mock import patch + +from nemo_reinforcer.utils.nvml import ( + nvml_context, + device_id_to_physical_device_id, + get_device_uuid, +) + + +@patch("nemo_reinforcer.utils.nvml.pynvml") +def test_nvml_context(mock_pynvml): + """Test that nvml_context initializes and shuts down NVML.""" + with nvml_context(): + pass + + # Verify init and shutdown were called + mock_pynvml.nvmlInit.assert_called_once() + mock_pynvml.nvmlShutdown.assert_called_once() + + +def test_device_id_conversion(): + """Test device ID conversion with and without CUDA_VISIBLE_DEVICES.""" + with patch.dict(os.environ, {}, clear=True): + assert device_id_to_physical_device_id(0) == 0 + + with patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "2,3"}): + assert device_id_to_physical_device_id(0) == 2 + assert device_id_to_physical_device_id(1) == 3 + + +@patch("nemo_reinforcer.utils.nvml.device_id_to_physical_device_id") +@patch("nemo_reinforcer.utils.nvml.pynvml") +def test_get_device_uuid(mock_pynvml, mock_convert_id): + """Test that get_device_uuid correctly retrieves a UUID.""" + + # Setup + mock_convert_id.return_value = 1 + mock_handle = mock_pynvml.nvmlDeviceGetHandleByIndex.return_value + mock_pynvml.nvmlDeviceGetUUID.return_value = b"GPU-12345" + + # Call function + uuid = get_device_uuid(0) + + # Verify + assert uuid == b"GPU-12345" + mock_convert_id.assert_called_once_with(0) + mock_pynvml.nvmlDeviceGetHandleByIndex.assert_called_once_with(1) From 4e20786b70e8730f974de1af2f621ee03c7d4e21 Mon Sep 17 00:00:00 2001 From: Hemil Desai Date: Tue, 1 Apr 2025 11:21:35 -0700 Subject: [PATCH 23/27] fix: Change optional-dependencies to dependency-groups (#81) Signed-off-by: Hemil Desai Signed-off-by: Yi-Fu Wu --- .github/workflows/cicd-main.yml | 20 ++++++++++---------- docker/Dockerfile | 2 +- docs/documentation.md | 6 +++--- pyproject.toml | 6 +++--- tests/run_unit_in_docker.sh | 2 +- 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 7902c02146..4f20b77cda 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -16,8 +16,8 @@ name: "CICD Reinforcer" on: pull_request: branches: - - 'main' - - 'r**' + - "main" + - "r**" types: [labeled] merge_group: types: [checks_requested] @@ -81,14 +81,14 @@ jobs: # Some output that's helpful for debugging echo "Docs changed: $CHANGED_DOCS" echo "Src changed: $CHANGED_SRC" - + # echo "DOCS_ONLY: $DOCS_ONLY" echo "LABEL: $LABEL" echo "IS_PULLREQUEST: $IS_PULLREQUEST" - + # Run CI only (on main or if label is attached) and if it's not only docs echo run_ci=$([[ ("$LABEL" = "true" || "$IS_PULLREQUEST" = "false" || "$MERGE_GROUP" = "true") && "$DOCS_ONLY" = "false" ]] && echo "true" || echo "false") | tee -a "$GITHUB_OUTPUT" - + lint-check: name: Lint check needs: [pre-flight] @@ -102,7 +102,7 @@ jobs: pip install pre-commit pre-commit install pre-commit run --all-files --show-diff-on-failure --color=always - + sphinx-build: name: Sphinx build needs: [pre-flight] @@ -115,7 +115,7 @@ jobs: run: | pip install uv cd docs/ - uv run --extra docs sphinx-build . _build/html + uv run --group docs sphinx-build . _build/html build-container: if: ${{ needs.pre-flight.outputs.run_ci == 'true' }} @@ -140,7 +140,7 @@ jobs: TIMEOUT: 10 SCRIPT: | cd ${REINFORCER_REPO_DIR}/docs - uv run --extra docs sphinx-build -b doctest . _build/doctest + uv run --group docs sphinx-build -b doctest . 
_build/doctest secrets: HF_TOKEN: ${{ secrets.HF_TOKEN }} @@ -154,7 +154,7 @@ jobs: TIMEOUT: 15 SCRIPT: | cd ${REINFORCER_REPO_DIR} - uv run --extra test bash -x ./tests/run_unit.sh + uv run --group test bash -x ./tests/run_unit.sh FINAL_SCRIPT_EXTERNAL: | cat < Date: Tue, 1 Apr 2025 12:06:51 -0700 Subject: [PATCH 24/27] feat: Add support for hydra style overrides (#80) Signed-off-by: Hemil Desai Signed-off-by: Yi-Fu Wu --- examples/run_grpo_math.py | 26 +++++++-------- nemo_reinforcer/utils/config.py | 31 ++++++++++++++++++ pyproject.toml | 1 + tests/unit/utils/test_config.py | 58 +++++++++++++++++++++++++++++++++ 4 files changed, 101 insertions(+), 15 deletions(-) diff --git a/examples/run_grpo_math.py b/examples/run_grpo_math.py index b87c7037b3..c67829e80e 100644 --- a/examples/run_grpo_math.py +++ b/examples/run_grpo_math.py @@ -15,24 +15,23 @@ import argparse import os import pprint - -from omegaconf import OmegaConf -from typing import Dict, Any +from collections import defaultdict +from typing import Any, Dict from datasets import load_dataset +from omegaconf import OmegaConf from transformers import AutoTokenizer -from collections import defaultdict from nemo_reinforcer.algorithms.grpo import MasterConfig, grpo_train, setup -from nemo_reinforcer.distributed.virtual_cluster import init_ray -from nemo_reinforcer.utils.config import load_config -from nemo_reinforcer.utils.logger import get_next_experiment_dir -from nemo_reinforcer.data.interfaces import TaskDataSpec, DatumSpec, LLMMessageLogType from nemo_reinforcer.data import DataConfig -from nemo_reinforcer.models.policy import PolicyConfig from nemo_reinforcer.data.datasets import AllTaskProcessedDataset, rl_collate_fn -from nemo_reinforcer.environments.math_environment import MathEnvironment from nemo_reinforcer.data.hf_datasets.openmathinstruct2 import OpenMathInstruct2Dataset +from nemo_reinforcer.data.interfaces import DatumSpec, LLMMessageLogType, TaskDataSpec +from nemo_reinforcer.distributed.virtual_cluster import init_ray +from nemo_reinforcer.environments.math_environment import MathEnvironment +from nemo_reinforcer.models.policy import PolicyConfig +from nemo_reinforcer.utils.config import load_config, parse_hydra_overrides +from nemo_reinforcer.utils.logger import get_next_experiment_dir def parse_args(): @@ -43,10 +42,7 @@ def parse_args(): ) # Parse known args for the script - args, remaining = parser.parse_known_args() - - # Convert remaining args to OmegaConf format - overrides = OmegaConf.from_dotlist(remaining) + args, overrides = parser.parse_known_args() return args, overrides @@ -242,7 +238,7 @@ def main(): if overrides: print(f"Overrides: {overrides}") - config = OmegaConf.merge(config, overrides) + config = parse_hydra_overrides(config, overrides) config: MasterConfig = OmegaConf.to_container(config, resolve=True) print("Applied CLI overrides") diff --git a/nemo_reinforcer/utils/config.py b/nemo_reinforcer/utils/config.py index 51418e3b1b..82c4be462b 100644 --- a/nemo_reinforcer/utils/config.py +++ b/nemo_reinforcer/utils/config.py @@ -15,6 +15,8 @@ from pathlib import Path from typing import Optional, Union +from hydra._internal.config_loader_impl import ConfigLoaderImpl +from hydra.core.override_parser.overrides_parser import OverridesParser from omegaconf import DictConfig, ListConfig, OmegaConf @@ -130,3 +132,32 @@ def load_config(config_path: Union[str, Path]) -> DictConfig: Merged config dictionary """ return load_config_with_inheritance(config_path) + + +class OverridesError(Exception): + """Custom 
exception for Hydra override parsing errors.""" + + pass + + +def parse_hydra_overrides(cfg: DictConfig, overrides: list[str]) -> DictConfig: + """Parse and apply Hydra overrides to an OmegaConf config. + + Args: + cfg: OmegaConf config to apply overrides to + overrides: List of Hydra override strings + + Returns: + Updated config with overrides applied + + Raises: + OverridesError: If there's an error parsing or applying overrides + """ + try: + OmegaConf.set_struct(cfg, True) + parser = OverridesParser.create() + parsed = parser.parse_overrides(overrides=overrides) + ConfigLoaderImpl._apply_overrides_to_config(overrides=parsed, cfg=cfg) + return cfg + except Exception as e: + raise OverridesError(f"Failed to parse Hydra overrides: {str(e)}") from e diff --git a/pyproject.toml b/pyproject.toml index 752f25047d..5f72e283f9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ dependencies = [ "torchdata", "vllm==0.8.0", "nvidia-ml-py", + "hydra-core", ] [tool.setuptools] diff --git a/tests/unit/utils/test_config.py b/tests/unit/utils/test_config.py index 245d9e0053..b8aa5328d2 100644 --- a/tests/unit/utils/test_config.py +++ b/tests/unit/utils/test_config.py @@ -196,3 +196,61 @@ def test_interpolation(temp_config_dir): config = load_config(child_path) assert config.base_value == 43 assert config.derived.value == 43 # Interpolation uses child's base_value + + +def test_parse_hydra_overrides(): + """Test parsing and applying Hydra overrides.""" + from omegaconf import OmegaConf + + from nemo_reinforcer.utils.config import OverridesError, parse_hydra_overrides + + # Create initial config + cfg = OmegaConf.create( + { + "model": {"type": "default", "hidden_size": 768}, + "training": {"batch_size": 32, "learning_rate": 1e-4}, + } + ) + + # Test basic override + overrides = ["model.type=transformer"] + updated_cfg = parse_hydra_overrides(cfg, overrides) + assert updated_cfg.model.type == "transformer" + assert updated_cfg.model.hidden_size == 768 # Unchanged + + # Test nested override + overrides = ["model.hidden_size=1024"] + updated_cfg = parse_hydra_overrides(cfg, overrides) + assert updated_cfg.model.hidden_size == 1024 + + # Test multiple overrides + overrides = ["training.batch_size=64", "training.learning_rate=2e-4"] + updated_cfg = parse_hydra_overrides(cfg, overrides) + assert updated_cfg.training.batch_size == 64 + assert updated_cfg.training.learning_rate == 2e-4 + + # Test invalid override + overrides = ["nonexistent.key=value"] + with pytest.raises(OverridesError): + parse_hydra_overrides(cfg, overrides) + + # Test invalid syntax + overrides = ["invalid.syntax"] + with pytest.raises(OverridesError): + parse_hydra_overrides(cfg, overrides) + + # Test empty overrides + overrides = [] + updated_cfg = parse_hydra_overrides(cfg, overrides) + assert updated_cfg == cfg # Config should be unchanged + + # Test override additions and deletions + overrides = [ + "+model.num_layers=12", + "++model.type=transformer", + "~training.batch_size", + ] + updated_cfg = parse_hydra_overrides(cfg, overrides) + assert updated_cfg.model.num_layers == 12 + assert updated_cfg.model.type == "transformer" + assert "batch_size" not in updated_cfg.training From f87474667e140f653435d66620aac37312e3b517 Mon Sep 17 00:00:00 2001 From: Anna Shors Date: Tue, 1 Apr 2025 12:44:29 -0700 Subject: [PATCH 25/27] fix: Do not initialize reference model for sft (#71) Signed-off-by: ashors1 Signed-off-by: Anna Shors Co-authored-by: Terry Kong --- nemo_reinforcer/algorithms/sft.py | 1 + 
nemo_reinforcer/models/policy/hf_policy.py | 26 +++++++++++-------- .../unit/models/policy/test_hf_ray_policy.py | 10 ++++--- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/nemo_reinforcer/algorithms/sft.py b/nemo_reinforcer/algorithms/sft.py index 0d06ad6366..5ff77e11e9 100644 --- a/nemo_reinforcer/algorithms/sft.py +++ b/nemo_reinforcer/algorithms/sft.py @@ -182,6 +182,7 @@ def setup( if last_checkpoint_path else None, init_optimizer=True, + init_reference_model=False, ) loss_fn = NLLLoss() print(f" ✓ Model initialized") diff --git a/nemo_reinforcer/models/policy/hf_policy.py b/nemo_reinforcer/models/policy/hf_policy.py index 8d0f67f230..9c80fbbc8b 100644 --- a/nemo_reinforcer/models/policy/hf_policy.py +++ b/nemo_reinforcer/models/policy/hf_policy.py @@ -69,6 +69,7 @@ def __init__( weights_path: Optional[str] = None, optimizer_path: Optional[str] = None, init_optimizer: bool = True, + init_reference_model: bool = True, ): self.cfg = config # torch distributed init. Envars for rank, world_size, and master_addr and master_port are set from the ray remote call @@ -89,12 +90,14 @@ def __init__( device_map="cpu", # load weights onto CPU initially torch_dtype=torch.float32, # use full precision in sft until https://github.com/NVIDIA/reinforcer/issues/13 is fixed ) - self.reference_model = AutoModelForCausalLM.from_pretrained( - model_name, - device_map="cpu", # load weights onto CPU initially - torch_dtype=torch.float32, # use full precision in sft until https://github.com/NVIDIA/reinforcer/issues/13 is fixed - ) - + if init_reference_model: + self.reference_model = AutoModelForCausalLM.from_pretrained( + model_name, + device_map="cpu", # load weights onto CPU initially + torch_dtype=torch.float32, # use full precision in sft until https://github.com/NVIDIA/reinforcer/issues/13 is fixed + ) + else: + self.reference_model = None self.tokenizer = AutoTokenizer.from_pretrained(model_name) # If no pad token is defined, you might need: if self.tokenizer.pad_token is None: @@ -131,11 +134,10 @@ def do_fsdp(model): self.model.to("cuda") self.model = do_fsdp(self.model) self.model = self.manual_offload_to_cpu(self.model) - - self.reference_model.to("cuda") - self.reference_model = do_fsdp(self.reference_model) - self.reference_model = self.manual_offload_to_cpu(self.reference_model) - + if self.reference_model is not None: + self.reference_model.to("cuda") + self.reference_model = do_fsdp(self.reference_model) + self.reference_model = self.manual_offload_to_cpu(self.reference_model) self.model = self.manual_load_to_gpu(self.model) self._held_reference_model_params = None # register_fsdp_forward_method(self.model, "generate") @@ -925,6 +927,7 @@ def __init__( init_optimizer: bool = True, weights_path: Optional[str] = None, optimizer_path: Optional[str] = None, + init_reference_model: bool = True, ): if weights_path: weights_path = os.path.abspath(weights_path) @@ -937,6 +940,7 @@ def __init__( init_optimizer=init_optimizer, weights_path=weights_path, optimizer_path=optimizer_path, + init_reference_model=init_reference_model, ) additional_env_vars = {} if config["expandable_segments_enabled"]: diff --git a/tests/unit/models/policy/test_hf_ray_policy.py b/tests/unit/models/policy/test_hf_ray_policy.py index 29d8cbcbee..ded244feac 100644 --- a/tests/unit/models/policy/test_hf_ray_policy.py +++ b/tests/unit/models/policy/test_hf_ray_policy.py @@ -201,7 +201,7 @@ def training_setup(): config = basic_llama_test_config print("Creating training HfPolicy...") - policy = 
HfPolicy(cluster=cluster, config=config, init_reference_model=False)
 
     # Create a test batch
     print("Creating test batch...")
@@ -278,7 +278,7 @@ def verify_loss_tensor(loss_tensor):
 
 
 @pytest.fixture
-def generation_setup():
+def generation_setup(request):
     """Setup and teardown specifically for generation tests."""
     policy = None
     cluster = None
@@ -300,7 +300,9 @@ def generation_setup():
         config = basic_llama_test_config
 
         print("Creating generation HfPolicy...")
-        policy = HfPolicy(cluster=cluster, config=config)
+        policy = HfPolicy(
+            cluster=cluster, config=config, init_reference_model=request.param
+        )
 
         # Create a test batch
         print("Creating test batch...")
@@ -364,6 +366,7 @@ def generation_setup():
 
 
 @pytest.mark.timeout(180)
+@pytest.mark.parametrize("generation_setup", [False], indirect=True)
 def test_hf_policy_generation(generation_setup, tracker):
     policy, cluster, data, tokenizer, prompts, expected_generations = generation_setup
 
@@ -450,6 +453,7 @@ def test_hf_policy_generation(generation_setup, tracker):
 
 
 @pytest.mark.timeout(180)
+@pytest.mark.parametrize("generation_setup", [True], indirect=True)
 def test_all_hf_policy_generation_lps_ref_training(generation_setup):
     policy, cluster, data, tokenizer, prompts, expected_generations = generation_setup
 

From 0c0aa6dc36448e58ef5cfbd74851704018eb0a4f Mon Sep 17 00:00:00 2001
From: Parth Chadha
Date: Tue, 1 Apr 2025 15:39:43 -0700
Subject: [PATCH 26/27] =?UTF-8?q?fix:=20change=20grpo=20default=20to=20use?=
 =?UTF-8?q?=2064=20prompts=20per=20step=20and=2032=20generation=E2=80=A6?=
 =?UTF-8?q?=20(#111)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Parth Chadha
Signed-off-by: Yi-Fu Wu
---
 examples/configs/grpo_math_1B.yaml | 2 +-
 examples/configs/grpo_math_8B.yaml | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml
index a7cb644114..c79c9bde5b 100644
--- a/examples/configs/grpo_math_1B.yaml
+++ b/examples/configs/grpo_math_1B.yaml
@@ -27,7 +27,7 @@ policy:
   model_name: "meta-llama/Llama-3.2-1B-Instruct"
   train_global_batch_size: 512
   train_micro_batch_size: 4
-  generation_batch_size: 32
+  generation_batch_size: 32 # Only used when generating with the HF backend
   logprob_batch_size: 4
   max_total_sequence_length: 512
   precision: "bfloat16"
diff --git a/examples/configs/grpo_math_8B.yaml b/examples/configs/grpo_math_8B.yaml
index 3408e84e32..06005425cb 100644
--- a/examples/configs/grpo_math_8B.yaml
+++ b/examples/configs/grpo_math_8B.yaml
@@ -1,11 +1,15 @@
 # GRPO Algorithm Configuration
 defaults: "grpo_math_1B.yaml"
 
+grpo:
+  num_prompts_per_step: 64
+  num_generations_per_prompt: 32
+
 policy:
   model_name: "meta-llama/Llama-3.1-8B-Instruct"
   train_global_batch_size: 512
   train_micro_batch_size: 1
-  generation_batch_size: 32
+  generation_batch_size: 32 # Only used when generating with the HF backend
   logprob_batch_size: 2
   max_total_sequence_length: 4096
   precision: "bfloat16"

From 97c5e1b73594c18202f82b7ab3eae69a8bb35abf Mon Sep 17 00:00:00 2001
From: Parth Chadha
Date: Tue, 1 Apr 2025 15:58:53 -0700
Subject: [PATCH 27/27] feat: use cuda_graph by default for vllm (#116)

Signed-off-by: Parth Chadha
Signed-off-by: Yi-Fu Wu
---
 nemo_reinforcer/models/generation/vllm.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nemo_reinforcer/models/generation/vllm.py b/nemo_reinforcer/models/generation/vllm.py
index edef3c69ae..e22110bf55 100644
---
a/nemo_reinforcer/models/generation/vllm.py
+++ b/nemo_reinforcer/models/generation/vllm.py
@@ -178,7 +178,8 @@ def __init__(
             gpu_memory_utilization=self.cfg["vllm_cfg"]["gpu_memory_utilization"],
             enable_prefix_caching=True,
             dtype="auto",
-            enforce_eager=True,
+            # Use CUDA graphs by default for performance; set enforce_eager=True to fall back to eager execution.
+            enforce_eager=False,
             max_model_len=self.cfg["vllm_cfg"]["max_model_len"],
             trust_remote_code=True,
             worker_cls=UpdatableVllmInternalWorker,
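For context on the `enforce_eager` flag flipped above: with `enforce_eager=False`, vLLM captures CUDA graphs for the decode path during engine startup, trading extra GPU memory and a slower start for faster generation; `enforce_eager=True` skips graph capture and is mainly useful for debugging. A minimal sketch of constructing an engine directly with this flag (model name and sizes are illustrative, not taken from this repo's configs):

```python
from vllm import LLM, SamplingParams

# enforce_eager=False mirrors the new default above: capture CUDA graphs for
# faster decoding at the cost of extra GPU memory and longer startup.
llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",
    enforce_eager=False,  # set True to debug with eager execution
    gpu_memory_utilization=0.6,
    max_model_len=512,
)

outputs = llm.generate(
    ["Briefly explain CUDA graphs:"], SamplingParams(max_tokens=64)
)
print(outputs[0].outputs[0].text)
```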