Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 5 additions & 10 deletions nemo_reinforcer/models/generation/vllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@

import ray
import torch
from transformers import AutoTokenizer

from nemo_reinforcer.models.generation.interfaces import (
GenerationInterface,
Expand Down Expand Up @@ -109,8 +108,7 @@ def configure_worker(
env_vars["RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES"] = "1"
init_kwargs["fraction_of_gpus"] = num_gpus

# Force vllm to use v0 runtime (will be enabled by default in #51)
env_vars["VLLM_USE_V1"] = "0"
env_vars["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"

return resources, env_vars, init_kwargs

Expand Down Expand Up @@ -150,12 +148,9 @@ def __init__(
self.world_size = 1

try:
from vllm import LLM, SamplingParams
from nemo_reinforcer.models.generation.vllm_backend import (
UpdatableVllmInternalWorker,
)
import vllm
Comment thread
parthchadha marked this conversation as resolved.

self.SamplingParams = SamplingParams
self.SamplingParams = vllm.SamplingParams
except ImportError:
raise ImportError(
"vLLM is not installed. Please install it with `pip install nemo-reinforcer[vllm]` "
Expand Down Expand Up @@ -184,7 +179,7 @@ def __init__(
# For non-TP mode, explicitly set executor to None to avoid Ray issues
vllm_kwargs["distributed_executor_backend"] = None

self.llm = LLM(
self.llm = vllm.LLM(
model=self.model_name,
# Training pipeline will set this to "dummy" and eval will load real weights using 'auto'
load_format=self.cfg["vllm_cfg"]["load_format"],
Expand All @@ -198,7 +193,7 @@ def __init__(
enforce_eager=True,
max_model_len=self.cfg["vllm_cfg"]["max_model_len"],
trust_remote_code=True,
worker_cls=UpdatableVllmInternalWorker,
worker_extension_cls="nemo_reinforcer.models.generation.vllm_backend.VllmInternalWorkerExtension",
enable_sleep_mode=True,
disable_log_stats=True,
**vllm_kwargs,
Expand Down
6 changes: 3 additions & 3 deletions nemo_reinforcer/models/generation/vllm_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import torch

try:
from vllm.worker.worker import Worker
import vllm
except ImportError:
raise ImportError(
"vLLM is not installed. Please install it with `pip install nemo-reinforcer[vllm]` "
Expand All @@ -23,7 +23,7 @@
)


class UpdatableVllmInternalWorker(Worker):
class VllmInternalWorkerExtension:
def report_device_id(self) -> str:
from nemo_reinforcer.utils.nvml import get_device_uuid

Expand Down Expand Up @@ -60,6 +60,6 @@ def update_weights_from_ipc_handles(self, ipc_handles):
return True
except Exception as e:
print(
f"Error in UpdatableVllmInternalWorker.update_weights_from_ipc_handles: {e}"
f"Error in VllmInternalWorkerExtension.update_weights_from_ipc_handles: {e}"
)
return False
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ readme = {file = "README.md", content-type = "text/markdown"}

[project.optional-dependencies]
vllm = [
"vllm==0.8.0",
"vllm==0.8.2",
]

[dependency-groups]
Expand Down Expand Up @@ -109,4 +109,4 @@ convention = "google"
# --link-mode=copy (slower but more reliable; suppresses warning)
# --link-mode=symlink (fastest option when uv cache and venv on different file-system; caveat: venv is brittle since it depends on the environment/container)
#
#link-mode = "symlink"
#link-mode = "symlink"
22 changes: 18 additions & 4 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.