From d9e3f4090e53ab83f38f3c96ead9461890eefe9b Mon Sep 17 00:00:00 2001 From: Jimmy Zhang Date: Mon, 30 Jun 2025 09:08:06 -0700 Subject: [PATCH 1/3] vllm CG Signed-off-by: Jimmy Zhang --- examples/configs/grpo-deepscaler-1.5b-8K.yaml | 1 + examples/configs/grpo_math_1B.yaml | 1 + examples/configs/grpo_math_8B.yaml | 1 + .../recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml | 1 + .../grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml | 2 +- .../grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.yaml | 1 + .../llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.yaml | 1 + .../grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.v3.yaml | 1 + .../llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.v3.yaml | 1 + .../recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.v3.yaml | 1 + .../llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v3.yaml | 1 + .../grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.yaml | 1 + nemo_rl/models/generation/vllm.py | 6 ++++-- tests/unit/experience/test_rollouts.py | 1 + tests/unit/models/generation/test_vllm_generation.py | 1 + tests/unit/models/generation/test_vllm_large_model.py | 1 + 16 files changed, 19 insertions(+), 3 deletions(-) diff --git a/examples/configs/grpo-deepscaler-1.5b-8K.yaml b/examples/configs/grpo-deepscaler-1.5b-8K.yaml index 1013f3d4c2..ce5ed73c17 100644 --- a/examples/configs/grpo-deepscaler-1.5b-8K.yaml +++ b/examples/configs/grpo-deepscaler-1.5b-8K.yaml @@ -99,6 +99,7 @@ policy: pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: ${policy.max_total_sequence_length} + enforce_eager: False # For most cases, use "dummy" to load the initial weights, since they will be overwritten during refit # For Gemma models, we need to use "auto" due to a vllm bug load_format: dummy diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml index 1842b01497..fd944fa9e7 100644 --- a/examples/configs/grpo_math_1B.yaml +++ b/examples/configs/grpo_math_1B.yaml @@ -107,6 +107,7 @@ policy: pipeline_parallel_size: 1 
gpu_memory_utilization: 0.6 max_model_len: ${policy.max_total_sequence_length} + enforce_eager: False colocated: # true: generation shares training GPUs # false: uses dedicated generation resources diff --git a/examples/configs/grpo_math_8B.yaml b/examples/configs/grpo_math_8B.yaml index 429a1d7663..a857b08858 100644 --- a/examples/configs/grpo_math_8B.yaml +++ b/examples/configs/grpo_math_8B.yaml @@ -58,6 +58,7 @@ policy: tensor_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: ${policy.max_total_sequence_length} + enforce_eager: False cluster: gpus_per_node: 8 diff --git a/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml b/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml index 1248c28622..6bbcd95edd 100644 --- a/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml +++ b/examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml @@ -89,6 +89,7 @@ policy: pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: 512 + enforce_eager: False colocated: enabled: true resources: diff --git a/examples/configs/recipes/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml b/examples/configs/recipes/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml index 2458739e2e..7425fd9b8c 100644 --- a/examples/configs/recipes/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml +++ b/examples/configs/recipes/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml @@ -90,12 +90,12 @@ policy: pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: 16384 + enforce_eager: False colocated: enabled: true resources: gpus_per_node: null num_nodes: null -data: max_input_seq_length: 16384 prompt_file: examples/prompts/cot.txt system_prompt_file: null diff --git a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.yaml b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.yaml index 8f6327e1e9..b854eb7d38 100644 --- 
a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.yaml +++ b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.yaml @@ -90,6 +90,7 @@ policy: pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: 4096 + enforce_eager: False colocated: enabled: true resources: diff --git a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.yaml b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.yaml index cd05c86dbb..9f92be089b 100644 --- a/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.yaml +++ b/examples/configs/recipes/llm/grpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v3.yaml @@ -90,6 +90,7 @@ policy: pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: 512 + enforce_eager: False colocated: enabled: true resources: diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.v3.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.v3.yaml index c5ebb4f8eb..2a1a151ea5 100644 --- a/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.v3.yaml +++ b/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt-long.v3.yaml @@ -90,6 +90,7 @@ policy: pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: 16384 + enforce_eager: False colocated: enabled: true resources: diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.v3.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.v3.yaml index 6d7a858749..06ae6b4637 100644 --- a/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.v3.yaml +++ b/examples/configs/recipes/llm/grpo-qwen2.5-32b-16n8g-fsdp2tp8sp-actckpt.v3.yaml @@ -90,6 +90,7 @@ policy: pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: 16384 + enforce_eager: False colocated: enabled: true resources: diff --git 
a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.v3.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.v3.yaml index bd22cd760e..fe2de660ce 100644 --- a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.v3.yaml +++ b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp1.v3.yaml @@ -87,6 +87,7 @@ policy: pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: 4096 + enforce_eager: False colocated: enabled: true resources: diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v3.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v3.yaml index d6176ddd22..00a40de4d0 100644 --- a/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v3.yaml +++ b/examples/configs/recipes/llm/grpo-qwen2.5-7b-instruct-4n8g-fsdp2tp4sp.v3.yaml @@ -90,6 +90,7 @@ policy: pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: 4096 + enforce_eager: False colocated: enabled: true resources: diff --git a/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.yaml b/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.yaml index d1303bb444..d3bbc266f2 100644 --- a/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.yaml +++ b/examples/configs/recipes/llm/grpo-qwen2.5-math-1.5b-instruct-1n8g-fsdp2tp1.v3.yaml @@ -90,6 +90,7 @@ policy: pipeline_parallel_size: 1 gpu_memory_utilization: 0.6 max_model_len: 512 + enforce_eager: False colocated: enabled: true resources: diff --git a/nemo_rl/models/generation/vllm.py b/nemo_rl/models/generation/vllm.py index 9506a063d3..64e97c3314 100644 --- a/nemo_rl/models/generation/vllm.py +++ b/nemo_rl/models/generation/vllm.py @@ -131,6 +131,9 @@ def configure_worker( seed = node_idx * 1024 + bundle_id init_kwargs["seed"] = seed + # Need to give each DP group its own vllm cache to address: + # 
https://github.com/vllm-project/vllm/issues/18851 + env_vars["VLLM_CACHE_ROOT"] = os.path.expanduser(f"~/.cache/vllm_{seed}") # Check if this worker is part of a parallel group (TP or TP+PP). # A worker is part of a parallel group if it's a secondary member (local_bundle_indices is None) @@ -334,8 +337,7 @@ def _patch_vllm_init_workers_ray(): enable_prefix_caching=torch.cuda.get_device_capability()[0] >= 8, dtype=self.cfg["vllm_cfg"]["precision"], seed=seed, - # Don't use cuda-graph by default as it leads to convergence issues (see https://github.com/NVIDIA-NeMo/RL/issues/186) - enforce_eager=True, + enforce_eager=self.cfg["vllm_cfg"]["enforce_eager"], max_model_len=self.cfg["vllm_cfg"]["max_model_len"], trust_remote_code=True, worker_extension_cls="nemo_rl.models.generation.vllm_backend.VllmInternalWorkerExtension", diff --git a/tests/unit/experience/test_rollouts.py b/tests/unit/experience/test_rollouts.py index 08d1c0ffd6..db41fe2d39 100644 --- a/tests/unit/experience/test_rollouts.py +++ b/tests/unit/experience/test_rollouts.py @@ -241,6 +241,7 @@ def initial_multi_step_calculator_batch(rollout_tokenizer): "disable_log_stats": True, "disable_log_requests": True, "gpu_memory_utilization": 0.6, + "enforce_eager": False, }, "colocated": { "enabled": True, diff --git a/tests/unit/models/generation/test_vllm_generation.py b/tests/unit/models/generation/test_vllm_generation.py index 1404b02337..8371fababb 100644 --- a/tests/unit/models/generation/test_vllm_generation.py +++ b/tests/unit/models/generation/test_vllm_generation.py @@ -56,6 +56,7 @@ "async_engine": False, # Default to False for synchronous tests "skip_tokenizer_init": False, "load_format": "auto", + "enforce_eager": False, }, "colocated": { "enabled": True, diff --git a/tests/unit/models/generation/test_vllm_large_model.py b/tests/unit/models/generation/test_vllm_large_model.py index 9735b5f03d..d24a0c0f31 100644 --- a/tests/unit/models/generation/test_vllm_large_model.py +++ 
b/tests/unit/models/generation/test_vllm_large_model.py @@ -50,6 +50,7 @@ "async_engine": True, "skip_tokenizer_init": False, "load_format": "auto", + "enforce_eager": False, }, "colocated": { "enabled": True, From 1ac69d2978285b444511ab6b8b749063e79652af Mon Sep 17 00:00:00 2001 From: Jimmy Zhang <133159885+jiemingz@users.noreply.github.com> Date: Mon, 30 Jun 2025 12:09:49 -0400 Subject: [PATCH 2/3] Update grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml Signed-off-by: Jimmy Zhang <133159885+jiemingz@users.noreply.github.com> --- .../llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/configs/recipes/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml b/examples/configs/recipes/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml index 7425fd9b8c..af4bb6945d 100644 --- a/examples/configs/recipes/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml +++ b/examples/configs/recipes/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml @@ -96,6 +96,7 @@ policy: resources: gpus_per_node: null num_nodes: null +data: max_input_seq_length: 16384 prompt_file: examples/prompts/cot.txt system_prompt_file: null From e545a486a0738f30577e69a0e3b936982cd7043b Mon Sep 17 00:00:00 2001 From: Jimmy Zhang Date: Tue, 1 Jul 2025 16:47:46 -0700 Subject: [PATCH 3/3] add key Signed-off-by: Jimmy Zhang --- examples/configs/evals/eval.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/configs/evals/eval.yaml b/examples/configs/evals/eval.yaml index eab0f1db21..85e193dcae 100644 --- a/examples/configs/evals/eval.yaml +++ b/examples/configs/evals/eval.yaml @@ -22,6 +22,7 @@ generation: pipeline_parallel_size: 1 gpu_memory_utilization: 0.9 max_model_len: 2048 + enforce_eager: False colocated: # true: generation shares training GPUs # false: uses dedicated generation resources