From 445e6293d651472e167df552976e44acf6af5803 Mon Sep 17 00:00:00 2001 From: Parth Chadha Date: Mon, 14 Apr 2025 18:38:04 -0700 Subject: [PATCH] fix: don't use cuda-graphs for vllm generation Signed-off-by: Parth Chadha --- nemo_reinforcer/models/generation/vllm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_reinforcer/models/generation/vllm.py b/nemo_reinforcer/models/generation/vllm.py index f8c527dd06..c9676a2f2b 100644 --- a/nemo_reinforcer/models/generation/vllm.py +++ b/nemo_reinforcer/models/generation/vllm.py @@ -177,8 +177,8 @@ def __init__( gpu_memory_utilization=self.cfg["vllm_cfg"]["gpu_memory_utilization"], enable_prefix_caching=True, dtype="auto", - # Use cuda-graph by default for performance, set to True to use eager execution - enforce_eager=False, + # Don't use cuda-graph by default as it leads to a convergence issue (see https://github.com/NVIDIA/reinforcer/issues/186) + enforce_eager=True, max_model_len=self.cfg["vllm_cfg"]["max_model_len"], trust_remote_code=True, worker_cls=UpdatableVllmInternalWorker,