From 3fa9131f0c19b728d1a008784c526b272891428a Mon Sep 17 00:00:00 2001
From: Parth Chadha <pchadha@nvidia.com>
Date: Mon, 31 Mar 2025 16:52:12 -0700
Subject: [PATCH] fix: remove usage of vllm to get device uuid and instead use
 nvidia-ml-py library to get device uuid

Signed-off-by: Parth Chadha <pchadha@nvidia.com>
---
 nemo_reinforcer/models/generation/vllm.py     |  4 +-
 .../models/generation/vllm_backend.py         |  5 +-
 nemo_reinforcer/models/policy/hf_policy.py    | 17 +++--
 nemo_reinforcer/utils/nvml.py                 | 66 +++++++++++++++++++
 pyproject.toml                                |  1 +
 tests/unit/utils/test_pynvml.py               | 61 +++++++++++++++++
 6 files changed, 143 insertions(+), 11 deletions(-)
 create mode 100644 nemo_reinforcer/utils/nvml.py
 create mode 100644 tests/unit/utils/test_pynvml.py

diff --git a/nemo_reinforcer/models/generation/vllm.py b/nemo_reinforcer/models/generation/vllm.py
index 2395b65e34..edef3c69ae 100644
--- a/nemo_reinforcer/models/generation/vllm.py
+++ b/nemo_reinforcer/models/generation/vllm.py
@@ -97,6 +97,7 @@ def configure_worker(
 
         # Force vllm to use v0 runtime (will be enabled by default in #51)
         env_vars["VLLM_USE_V1"] = "0"
+
         return resources, env_vars, init_kwargs
 
     def __init__(
@@ -379,9 +380,6 @@ def shutdown(self):
             return False
 
     def report_device_id(self) -> str:
-        # from vllm.platforms import current_platform
-        # self.device_uuid = current_platform.get_device_uuid(self.rank)
-        # return self.device_uuid
         return self.llm.collective_rpc("report_device_id", args=tuple())[0]
 
     def update_weights_from_ipc_handles(self, ipc_handles):
diff --git a/nemo_reinforcer/models/generation/vllm_backend.py b/nemo_reinforcer/models/generation/vllm_backend.py
index a8ded9b519..3507d7dc55 100644
--- a/nemo_reinforcer/models/generation/vllm_backend.py
+++ b/nemo_reinforcer/models/generation/vllm_backend.py
@@ -25,10 +25,9 @@
 
 class UpdatableVllmInternalWorker(Worker):
     def report_device_id(self) -> str:
-        from vllm.platforms import current_platform
+        from nemo_reinforcer.utils.nvml import get_device_uuid
 
-        self.device_uuid = current_platform.get_device_uuid(self.device.index)
-        return self.device_uuid
+        return get_device_uuid(self.device.index)
 
     def update_weights_from_ipc_handles(self, ipc_handles):
         """Update weights from IPC handles.
diff --git a/nemo_reinforcer/models/policy/hf_policy.py b/nemo_reinforcer/models/policy/hf_policy.py
index 30fb77adeb..b2b004bddb 100644
--- a/nemo_reinforcer/models/policy/hf_policy.py
+++ b/nemo_reinforcer/models/policy/hf_policy.py
@@ -686,10 +686,17 @@ def zero_out_weights(self):
         torch.cuda.synchronize()
 
     def report_device_id(self) -> str:
-        from vllm.platforms import current_platform
+        """Report the UUID of the current CUDA device using NVML.
 
-        self.device_uuid = current_platform.get_device_uuid(torch.cuda.current_device())
-        return self.device_uuid
+        Returns:
+            str: UUID of the device in the format "GPU-xxxxx"
+        """
+        from nemo_reinforcer.utils.nvml import get_device_uuid
+
+        # Get current device index from torch
+        device_idx = torch.cuda.current_device()
+        # Get device UUID using NVML
+        return get_device_uuid(device_idx)
 
     @torch.no_grad()
     def get_weight_ipc_handles(self, offload_model=True):
@@ -708,7 +715,7 @@ def get_weight_ipc_handles(self, offload_model=True):
         params = dtype_params
         self._held_reference_model_params = params
         data = {}
-        self.device_uuid = self.report_device_id()
+        device_uuid = self.report_device_id()
         for name, p in params.items():
             data[name] = reduce_tensor(p.detach())
 
@@ -716,7 +723,7 @@ def get_weight_ipc_handles(self, offload_model=True):
             self.model = self.move_to_cpu(self.model)
             gc.collect()
             torch.cuda.empty_cache()
-        return {self.device_uuid: data}
+        return {device_uuid: data}
 
     def prepare_for_lp_inference(self):
         self.model.to("cuda")
diff --git a/nemo_reinforcer/utils/nvml.py b/nemo_reinforcer/utils/nvml.py
new file mode 100644
index 0000000000..2f684effef
--- /dev/null
+++ b/nemo_reinforcer/utils/nvml.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import contextlib
+import os
+import pynvml
+
+
+@contextlib.contextmanager
+def nvml_context():
+    """Context manager for NVML initialization and shutdown.
+
+    Raises:
+        RuntimeError: If NVML initialization fails
+    """
+    try:
+        pynvml.nvmlInit()
+        yield
+    except pynvml.NVMLError as e:
+        raise RuntimeError(f"Failed to initialize NVML: {e}")
+    finally:
+        try:
+            pynvml.nvmlShutdown()
+        except:
+            pass
+
+
+def device_id_to_physical_device_id(device_id: int) -> int:
+    """Convert a logical device ID to a physical device ID considering CUDA_VISIBLE_DEVICES."""
+    if "CUDA_VISIBLE_DEVICES" in os.environ:
+        device_ids = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
+        try:
+            physical_device_id = int(device_ids[device_id])
+            return physical_device_id
+        except ValueError:
+            raise RuntimeError(
+                f"Failed to convert logical device ID {device_id} to physical device ID. Available devices are: {device_ids}."
+            )
+    else:
+        return device_id
+
+
+def get_device_uuid(device_idx: int) -> str:
+    """Get the UUID of a CUDA device using NVML."""
+    # Convert logical device index to physical device index
+    global_device_idx = device_id_to_physical_device_id(device_idx)
+
+    # Get the device handle and UUID
+    with nvml_context():
+        try:
+            handle = pynvml.nvmlDeviceGetHandleByIndex(global_device_idx)
+            return pynvml.nvmlDeviceGetUUID(handle)
+        except pynvml.NVMLError as e:
+            raise RuntimeError(
+                f"Failed to get device UUID for device {device_idx} (global index: {global_device_idx}): {e}"
+            )
diff --git a/pyproject.toml b/pyproject.toml
index eddc57a15b..e25116a33c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -24,6 +24,7 @@ dependencies = [
     "omegaconf",
     "torchdata",
     "vllm==0.8.0",
+    "nvidia-ml-py",
 ]
 
 [tool.setuptools]
diff --git a/tests/unit/utils/test_pynvml.py b/tests/unit/utils/test_pynvml.py
new file mode 100644
index 0000000000..cf7044654f
--- /dev/null
+++ b/tests/unit/utils/test_pynvml.py
@@ -0,0 +1,61 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from unittest.mock import patch
+
+from nemo_reinforcer.utils.nvml import (
+    nvml_context,
+    device_id_to_physical_device_id,
+    get_device_uuid,
+)
+
+
+@patch("nemo_reinforcer.utils.nvml.pynvml")
+def test_nvml_context(mock_pynvml):
+    """Test that nvml_context initializes and shuts down NVML."""
+    with nvml_context():
+        pass
+
+    # Verify init and shutdown were called
+    mock_pynvml.nvmlInit.assert_called_once()
+    mock_pynvml.nvmlShutdown.assert_called_once()
+
+
+def test_device_id_conversion():
+    """Test device ID conversion with and without CUDA_VISIBLE_DEVICES."""
+    with patch.dict(os.environ, {}, clear=True):
+        assert device_id_to_physical_device_id(0) == 0
+
+    with patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "2,3"}):
+        assert device_id_to_physical_device_id(0) == 2
+        assert device_id_to_physical_device_id(1) == 3
+
+
+@patch("nemo_reinforcer.utils.nvml.device_id_to_physical_device_id")
+@patch("nemo_reinforcer.utils.nvml.pynvml")
+def test_get_device_uuid(mock_pynvml, mock_convert_id):
+    """Test that get_device_uuid correctly retrieves a UUID."""
+
+    # Setup
+    mock_convert_id.return_value = 1
+    mock_handle = mock_pynvml.nvmlDeviceGetHandleByIndex.return_value
+    mock_pynvml.nvmlDeviceGetUUID.return_value = b"GPU-12345"
+
+    # Call function
+    uuid = get_device_uuid(0)
+
+    # Verify
+    assert uuid == b"GPU-12345"
+    mock_convert_id.assert_called_once_with(0)
+    mock_pynvml.nvmlDeviceGetHandleByIndex.assert_called_once_with(1)