diff --git a/nemo_reinforcer/models/generation/vllm.py b/nemo_reinforcer/models/generation/vllm.py index 4e8ff364c7..f7138689e3 100644 --- a/nemo_reinforcer/models/generation/vllm.py +++ b/nemo_reinforcer/models/generation/vllm.py @@ -18,7 +18,6 @@ import ray import torch -from transformers import AutoTokenizer from nemo_reinforcer.models.generation.interfaces import ( GenerationInterface, @@ -109,8 +108,7 @@ def configure_worker( env_vars["RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES"] = "1" init_kwargs["fraction_of_gpus"] = num_gpus - # Force vllm to use v0 runtime (will be enabled by default in #51) - env_vars["VLLM_USE_V1"] = "0" + env_vars["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" return resources, env_vars, init_kwargs @@ -150,12 +148,9 @@ def __init__( self.world_size = 1 try: - from vllm import LLM, SamplingParams - from nemo_reinforcer.models.generation.vllm_backend import ( - UpdatableVllmInternalWorker, - ) + import vllm - self.SamplingParams = SamplingParams + self.SamplingParams = vllm.SamplingParams except ImportError: raise ImportError( "vLLM is not installed. Please install it with `pip install nemo-reinforcer[vllm]` " @@ -184,7 +179,7 @@ def __init__( # For non-TP mode, explicitly set executor to None to avoid Ray issues vllm_kwargs["distributed_executor_backend"] = None - self.llm = LLM( + self.llm = vllm.LLM( model=self.model_name, # Training pipeline will set this to "dummy" and eval will load real weights using 'auto' load_format=self.cfg["vllm_cfg"]["load_format"], @@ -198,7 +193,7 @@ def __init__( enforce_eager=True, max_model_len=self.cfg["vllm_cfg"]["max_model_len"], trust_remote_code=True, - worker_cls=UpdatableVllmInternalWorker, + worker_extension_cls="nemo_reinforcer.models.generation.vllm_backend.VllmInternalWorkerExtension", enable_sleep_mode=True, disable_log_stats=True, **vllm_kwargs, diff --git a/nemo_reinforcer/models/generation/vllm_backend.py b/nemo_reinforcer/models/generation/vllm_backend.py index 09e94f2815..a7fd12aa26 100644 --- a/nemo_reinforcer/models/generation/vllm_backend.py +++ b/nemo_reinforcer/models/generation/vllm_backend.py @@ -14,7 +14,7 @@ import torch try: - from vllm.worker.worker import Worker + import vllm except ImportError: raise ImportError( "vLLM is not installed. Please install it with `pip install nemo-reinforcer[vllm]` " @@ -23,7 +23,7 @@ ) -class UpdatableVllmInternalWorker(Worker): +class VllmInternalWorkerExtension: def report_device_id(self) -> str: from nemo_reinforcer.utils.nvml import get_device_uuid @@ -60,6 +60,6 @@ def update_weights_from_ipc_handles(self, ipc_handles): return True except Exception as e: print( - f"Error in UpdatableVllmInternalWorker.update_weights_from_ipc_handles: {e}" + f"Error in VllmInternalWorkerExtension.update_weights_from_ipc_handles: {e}" ) return False diff --git a/pyproject.toml b/pyproject.toml index febbf9c5f1..b7f8260ff8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,7 +38,7 @@ readme = {file = "README.md", content-type = "text/markdown"} [project.optional-dependencies] vllm = [ - "vllm==0.8.0", + "vllm==0.8.2", ] [dependency-groups] @@ -109,4 +109,4 @@ convention = "google" # --link-mode=copy (slower but more reliable; supresses warning) # --link-mode=symlink (fastest option when uv cache and venv on different file-system; caveat: venv is brittle since it depends on the environment/container) # -#link-mode = "symlink" \ No newline at end of file +#link-mode = "symlink" diff --git a/uv.lock b/uv.lock index b5d6bbb4f0..d546f25e64 100644 --- a/uv.lock +++ b/uv.lock @@ -1361,6 +1361,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6d/eb/a5e8b06b924b4149cf498e1598116bad1e91ab23046c2dfc2c498154d393/latex2sympy2_extended-1.10.1-py3-none-any.whl", hash = "sha256:917a23e8f3b6edea88a56978fbbe87ed9fca4197f8277646be57b4660710347c", size = 207460 }, ] +[[package]] +name = "llguidance" +version = "0.7.11" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/7c/4b/92f81aa9d98e2c0721e2760e0fa1ae1691380bd27f2bf530310671a777d9/llguidance-0.7.11.tar.gz", hash = "sha256:226409610f1d1e0ecd62f15d1dd47851879513eb1eb56129c56de8188b80fa8d", size = 384121 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c2/38/bb5e0e185f84e4702ca079b0874de88b0d1b7245c48fc6449b766bce6103/llguidance-0.7.11-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:c1639466113196cf6d274461deaafbe6011b60d459f773ca97045df1ee87e195", size = 3065620 }, + { url = "https://files.pythonhosted.org/packages/c3/c3/14f1173407a0ba18e1f57d26eae4da49d6336d5e0405336b9cbcb749848b/llguidance-0.7.11-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:1e6899df33f3372ec86d7c1939e33891fda9e9a533dcd7f7f8c556897446765b", size = 2957459 }, + { url = "https://files.pythonhosted.org/packages/5f/07/6064f1253708c879c96ce0b74bacd7ab2845c0e8199ff13d84681a5041ad/llguidance-0.7.11-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32edcdc60922bdc97dcbae4d18e2d6dca451571959303ced7b7821dbbd344c0f", size = 13561497 }, + { url = "https://files.pythonhosted.org/packages/e1/9e/96d96fab0c27adb9f51dabc42682d12dfe4602e7637a71614b916879ae7a/llguidance-0.7.11-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b167f7d4da85747378c0c58393cd078b459a90d6e8a60e676692784a78a6f61", size = 13687114 }, + { url = "https://files.pythonhosted.org/packages/c3/72/f5ed95fd29faf6b197d6af543671306ef154741f804b197c3e3f7ad15a8b/llguidance-0.7.11-cp39-abi3-win_amd64.whl", hash = "sha256:585cb3b52a702303240ae91cc0633735dab3a1db2c062af8ffb4ef3ca4737236", size = 2611515 }, +] + [[package]] name = "llvmlite" version = "0.43.0" @@ -1823,7 +1836,7 @@ requires-dist = [ { name = "torch", specifier = "==2.6.0" }, { name = "torchdata" }, { name = "transformers" }, - { name = "vllm", marker = "extra == 'vllm'", specifier = "==0.8.0" }, + { name = "vllm", marker = "extra == 'vllm'", specifier = "==0.8.2" }, { name = "wandb" }, ] provides-extras = ["vllm"] @@ -4153,7 +4166,7 @@ wheels = [ [[package]] name = "vllm" -version = "0.8.0" +version = "0.8.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp" }, @@ -4168,6 +4181,7 @@ dependencies = [ { name = "gguf" }, { name = "importlib-metadata" }, { name = "lark" }, + { name = "llguidance", marker = "platform_machine == 'aarch64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" }, { name = "lm-format-enforcer" }, { name = "mistral-common", extra = ["opencv"] }, { name = "msgspec" }, @@ -4205,9 +4219,9 @@ dependencies = [ { name = "xformers", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, { name = "xgrammar", marker = "platform_machine == 'aarch64' or platform_machine == 'x86_64'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/d2/27/73a54707964c5160067e253398cc328943e3ddbaa3099265ab593e6ec766/vllm-0.8.0.tar.gz", hash = "sha256:449e6651d30d6d5025d0d42499cf1a02d983915ef3b3670547db14a0431aa9bd", size = 6407594 } +sdist = { url = "https://files.pythonhosted.org/packages/df/4d/6b27cc14d0c35e578a743a767953500a801ba296694b7e44cca709738b41/vllm-0.8.2.tar.gz", hash = "sha256:9b337b1c4072ccb94b1bf2b716593fadbe2dcb8d091f9bcbd6b5c6d37f9842ac", size = 6450146 } wheels = [ - { url = "https://files.pythonhosted.org/packages/15/77/7beca2061aadfdfd2d81411102e6445b459bcfedfc46671d4712de6a00fb/vllm-0.8.0-cp38-abi3-manylinux1_x86_64.whl", hash = "sha256:d3660eda448560b0ce6a1524466d7d36ec0024e772c9dbf562dbead980e7d480", size = 265290109 }, + { url = "https://files.pythonhosted.org/packages/57/49/207364110b96d76139a4e80617e5831d46884abe824941b15c8a748ca5e0/vllm-0.8.2-cp38-abi3-manylinux1_x86_64.whl", hash = "sha256:32442b686c5dad8e6ddcf5a8b0cf3f741359fed6a9e9e940009f1daf80ae15de", size = 293643693 }, ] [[package]]