From d9e3f4090e53ab83f38f3c96ead9461890eefe9b Mon Sep 17 00:00:00 2001 From: Jimmy Zhang Date: Mon, 30 Jun 2025 09:08:06 -0700 Subject: [PATCH 1/3] vllm CG Signed-off-by: Jimmy Zhang --- examples/configs/grpo-deepscaler-1.5b-8K.yaml | 1 + examples/configs/grpo_math_1B.yaml | 1 + examples/configs/grpo_math_8B.yaml | 1 + .../recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml | 1 + .../grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml | 2 +- .../grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.yaml | 1 + .../llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.yaml | 1 + .../grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.v3.yaml | 1 + .../llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.v3.yaml | 1 + .../recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.v3.yaml | 1 + .../llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v3.yaml | 1 + .../grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.yaml | 1 + nemo_rl/models/generation/vllm.py | 6 ++++-- tests/unit/experience/test_rollouts.py | 1 + tests/unit/models/generation/test_vllm_generation.py | 1 + tests/unit/models/generation/test_vllm_large_model.py | 1 + 16 files changed, 19 insertions(+), 3 deletions(-) diff --git a/examples/configs/grpo-deepscaler-1.5b-8K.yaml b/examples/configs/grpo-deepscaler-1.5b-8K.yaml index 1013f3d4c2..ce5ed73c17 100644 --- a/examples/configs/grpo-deepscaler-1.5b-8K.yaml +++ b/examples/configs/grpo-deepscaler-1.5b-8K.yaml @@ -99,6 +99,7 @@ policy: pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: ${policy.max_total_sequence_length} + enforce_eager: False # For most cases, use "dummy" to load the initial weights, since they will be overwritten during refit # For Gemma models, we need to use "auto" due to a vllm bug load_format: dummy diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml index 1842b01497..fd944fa9e7 100644 --- a/examples/configs/grpo_math_1B.yaml +++ b/examples/configs/grpo_math_1B.yaml @@ -107,6 +107,7 @@ policy: pipeline_parallel_size: 1 
gpu_memory_utilization: 0.6 max_model_len: ${policy.max_total_sequence_length} + enforce_eager: False colocated: # true: generation shares training GPUs # false: uses dedicated generation resources diff --git a/examples/configs/grpo_math_8B.yaml b/examples/configs/grpo_math_8B.yaml index 429a1d7663..a857b08858 100644 --- a/examples/configs/grpo_math_8B.yaml +++ b/examples/configs/grpo_math_8B.yaml @@ -58,6 +58,7 @@ policy: tensor_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: ${policy.max_total_sequence_length} + enforce_eager: False cluster: gpus_per_node: 8 diff --git a/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml b/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml index 1248c28622..6bbcd95edd 100644 --- a/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml +++ b/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml @@ -89,6 +89,7 @@ policy: pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: 512 + enforce_eager: False colocated: enabled: true resources: diff --git a/examples/configs/recipes/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml b/examples/configs/recipes/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml index 2458739e2e..7425fd9b8c 100644 --- a/examples/configs/recipes/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml +++ b/examples/configs/recipes/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml @@ -90,12 +90,12 @@ policy: pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: 16384 + enforce_eager: False colocated: enabled: true resources: gpus_per_node: null num_nodes: null -data: max_input_seq_length: 16384 prompt_file: examples/prompts/cot.txt system_prompt_file: null diff --git a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.yaml b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.yaml index 8f6327e1e9..b854eb7d38 100644 --- 
a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.yaml +++ b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.yaml @@ -90,6 +90,7 @@ policy: pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: 4096 + enforce_eager: False colocated: enabled: true resources: diff --git a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.yaml b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.yaml index cd05c86dbb..9f92be089b 100644 --- a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.yaml +++ b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.yaml @@ -90,6 +90,7 @@ policy: pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: 512 + enforce_eager: False colocated: enabled: true resources: diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.v3.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.v3.yaml index c5ebb4f8eb..2a1a151ea5 100644 --- a/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.v3.yaml +++ b/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.v3.yaml @@ -90,6 +90,7 @@ policy: pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: 16384 + enforce_eager: False colocated: enabled: true resources: diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.v3.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.v3.yaml index 6d7a858749..06ae6b4637 100644 --- a/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.v3.yaml +++ b/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.v3.yaml @@ -90,6 +90,7 @@ policy: pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: 16384 + enforce_eager: False colocated: enabled: true resources: diff --git 
a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.v3.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.v3.yaml index bd22cd760e..fe2de660ce 100644 --- a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.v3.yaml +++ b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.v3.yaml @@ -87,6 +87,7 @@ policy: pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: 4096 + enforce_eager: False colocated: enabled: true resources: diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v3.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v3.yaml index d6176ddd22..00a40de4d0 100644 --- a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v3.yaml +++ b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v3.yaml @@ -90,6 +90,7 @@ policy: pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: 4096 + enforce_eager: False colocated: enabled: true resources: diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.yaml index d1303bb444..d3bbc266f2 100644 --- a/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.yaml +++ b/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.yaml @@ -90,6 +90,7 @@ policy: pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: 512 + enforce_eager: False colocated: enabled: true resources: diff --git a/nemo_rl/models/generation/vllm.py b/nemo_rl/models/generation/vllm.py index 9506a063d3..64e97c3314 100644 --- a/nemo_rl/models/generation/vllm.py +++ b/nemo_rl/models/generation/vllm.py @@ -131,6 +131,9 @@ def configure_worker( seed = node_idx * 1024 + bundle_id init_kwargs["seed"] = seed + # Need to give each DP group its own vllm cache to address: + # 
https://github.com/vllm-project/vllm/issues/18851 + env_vars["VLLM_CACHE_ROOT"] = os.path.expanduser(f"~/.cache/vllm_{seed}") # Check if this worker is part of a parallel group (TP or TP+PP). # A worker is part of a parallel group if it's a secondary member (local_bundle_indices is None) @@ -334,8 +337,7 @@ def _patch_vllm_init_workers_ray(): enable_prefix_caching=torch.cuda.get_device_capability()[0] >= 8, dtype=self.cfg["vllm_cfg"]["precision"], seed=seed, - # Don't use cuda-graph by default as it leads to convergence issues (see https://github.com/NVIDIA-NeMo/RL/issues/186) - enforce_eager=True, + enforce_eager=self.cfg["vllm_cfg"]["enforce_eager"], max_model_len=self.cfg["vllm_cfg"]["max_model_len"], trust_remote_code=True, worker_extension_cls="nemo_rl.models.generation.vllm_backend.VllmInternalWorkerExtension", diff --git a/tests/unit/experience/test_rollouts.py b/tests/unit/experience/test_rollouts.py index 08d1c0ffd6..db41fe2d39 100644 --- a/tests/unit/experience/test_rollouts.py +++ b/tests/unit/experience/test_rollouts.py @@ -241,6 +241,7 @@ def initial_multi_step_calculator_batch(rollout_tokenizer): "disable_log_stats": True, "disable_log_requests": True, "gpu_memory_utilization": 0.6, + "enforce_eager": False, }, "colocated": { "enabled": True, diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py index 1404b02337..8371fababb 100644 --- a/tests/unit/models/generation/test_vllm_generation.py +++ b/tests/unit/models/generation/test_vllm_generation.py @@ -56,6 +56,7 @@ "async_engine": False, # Default to False for synchronous tests "skip_tokenizer_init": False, "load_format": "auto", + "enforce_eager": False, }, "colocated": { "enabled": True, diff --git a/tests/unit/models/generation/test_vllm_large_model.py b/tests/unit/models/generation/test_vllm_large_model.py index 9735b5f03d..d24a0c0f31 100644 --- a/tests/unit/models/generation/test_vllm_large_model.py +++ 
b/tests/unit/models/generation/test_vllm_large_model.py @@ -50,6 +50,7 @@ "async_engine": True, "skip_tokenizer_init": False, "load_format": "auto", + "enforce_eager": False, }, "colocated": { "enabled": True, From 1ac69d2978285b444511ab6b8b749063e79652af Mon Sep 17 00:00:00 2001 From: Jimmy Zhang <133159885+jiemingz@users.noreply.github.com> Date: Mon, 30 Jun 2025 12:09:49 -0400 Subject: [PATCH 2/3] Update grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml Signed-off-by: Jimmy Zhang <133159885+jiemingz@users.noreply.github.com> --- .../llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/configs/recipes/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml b/examples/configs/recipes/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml index 7425fd9b8c..af4bb6945d 100644 --- a/examples/configs/recipes/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml +++ b/examples/configs/recipes/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml @@ -96,6 +96,7 @@ policy: resources: gpus_per_node: null num_nodes: null +data: max_input_seq_length: 16384 prompt_file: examples/prompts/cot.txt system_prompt_file: null From e545a486a0738f30577e69a0e3b936982cd7043b Mon Sep 17 00:00:00 2001 From: Jimmy Zhang Date: Tue, 1 Jul 2025 16:47:46 -0700 Subject: [PATCH 3/3] add key Signed-off-by: Jimmy Zhang --- examples/configs/evals/eval.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/configs/evals/eval.yaml b/examples/configs/evals/eval.yaml index eab0f1db21..85e193dcae 100644 --- a/examples/configs/evals/eval.yaml +++ b/examples/configs/evals/eval.yaml @@ -22,6 +22,7 @@ generation: pipeline_parallel_size: 1 gpu_memory_utilization: 0.9 max_model_len: 2048 + enforce_eager: False colocated: # true: generation shares training GPUs # false: uses dedicated generation resources