Skip to content
7 changes: 4 additions & 3 deletions cuda_bindings/cuda/bindings/_path_finder/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,13 @@ strategy for locating NVIDIA shared libraries:
- Falls back to native loader:
- `dlopen()` on Linux
- `LoadLibraryW()` on Windows
- Conda installations are expected to be discovered:
- Linux: Via `$ORIGIN/../lib` on `RPATH` (of the `python` binary;
note that this preempts `LD_LIBRARY_PATH` and `/etc/ld.so.conf.d/`)
- Windows: Via `%CONDA_PREFIX%\Library\bin` on system `PATH`
- CTK installations with system config updates are expected to be discovered:
- Linux: Via `/etc/ld.so.conf.d/*cuda*.conf`
- Windows: Via `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y\bin` on system `PATH`
- Conda installations are expected to be discovered:
- Linux: Via `$ORIGIN/../lib` on `RPATH` (of the `python` binary)
- Windows: Via `%CONDA_PREFIX%\Library\bin` on system `PATH`

3. **Environment variables**
- Relies on `CUDA_HOME` or `CUDA_PATH` environment variables if set
Expand Down
47 changes: 5 additions & 42 deletions cuda_bindings/cuda/bindings/_path_finder/load_dl_windows.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
# Copyright 2025 NVIDIA Corporation. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

import ctypes
import ctypes.wintypes
from typing import Optional

import pywintypes
Expand Down Expand Up @@ -36,46 +34,11 @@ def add_dll_directory(dll_abs_path: str) -> None:


def abs_path_for_dynamic_library(libname: str, handle: pywintypes.HANDLE) -> str:
"""Get the absolute path of a loaded dynamic library on Windows.

Args:
handle: The library handle

Returns:
The absolute path to the DLL file

Raises:
OSError: If GetModuleFileNameW fails
RuntimeError: If the required path length is unreasonably long
"""
MAX_ITERATIONS = 10 # Allows for extremely long paths (up to ~266,000 chars)
buf_size = 260 # Start with traditional MAX_PATH

for _ in range(MAX_ITERATIONS):
buf = ctypes.create_unicode_buffer(buf_size)
n_chars = ctypes.windll.kernel32.GetModuleFileNameW(ctypes.wintypes.HMODULE(handle), buf, buf_size)

if n_chars == 0:
raise OSError(
f"GetModuleFileNameW failed ({libname=!r}, {buf_size=}). "
"Long paths may require enabling the "
"Windows 10+ long path registry setting. See: "
"https://docs.python.org/3/using/windows.html#removing-the-max-path-limitation"
)
if n_chars < buf_size - 1:
return buf.value

buf_size *= 2 # Double the buffer size and try again

raise RuntimeError(
f"Failed to retrieve the full path after {MAX_ITERATIONS} attempts "
f"(final buffer size: {buf_size} characters). "
"This may indicate:\n"
" 1. An extremely long path requiring Windows long path support, or\n"
" 2. An invalid or corrupt library handle, or\n"
" 3. An unexpected system error.\n"
"See: https://docs.python.org/3/using/windows.html#removing-the-max-path-limitation"
)
"""Get the absolute path of a loaded dynamic library on Windows."""
try:
return win32api.GetModuleFileName(handle)
except Exception as e:
raise RuntimeError(f"GetModuleFileName failed for {libname!r} (exception type: {type(e)})") from e


def check_if_already_loaded_from_elsewhere(libname: str) -> Optional[LoadedDL]:
Expand Down
89 changes: 0 additions & 89 deletions cuda_bindings/tests/run_python_code_safely.py

This file was deleted.

126 changes: 126 additions & 0 deletions cuda_bindings/tests/spawned_process_runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
# Copyright 2025 NVIDIA Corporation. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

import multiprocessing
import queue # for Empty
import sys
import traceback
from dataclasses import dataclass
from io import StringIO
from typing import Any, Callable, Optional, Sequence

# Sentinel return codes reported in CompletedProcess.returncode when the child
# did not deliver a result of its own.

# Reported when the child was still alive after the timeout and was terminated.
# NOTE(review): -9 presumably mirrors the POSIX "killed by SIGKILL" convention — confirm.
PROCESS_KILLED = -9
# Reported when the child exited without putting a result on the queue.
PROCESS_NO_RESULT = -999


# Similar to https://docs.python.org/3/library/subprocess.html#subprocess.CompletedProcess
# (args, check_returncode() are intentionally not supported here.)
@dataclass
class CompletedProcess:
    """Outcome of a function run in a child process.

    A minimal counterpart of ``subprocess.CompletedProcess`` that carries only
    the exit status and the captured text streams.
    """

    # Exit status of the child.
    returncode: int
    # Text the child wrote to sys.stdout.
    stdout: str
    # Text the child wrote to sys.stderr.
    stderr: str


class ChildProcessWrapper:
    """Callable that runs ``target`` with captured stdout/stderr and reports the outcome.

    When called, the instance invokes ``target(*args, **kwargs)`` with
    ``sys.stdout`` and ``sys.stderr`` temporarily replaced by in-memory buffers,
    then puts a ``(returncode, stdout, stderr)`` tuple on ``result_queue``.

    Args:
        result_queue: Object with a ``put()`` method that receives the result tuple.
        target: The callable to run.
        args: Positional arguments for ``target``; ``None`` means no arguments.
        kwargs: Keyword arguments for ``target``; ``None`` means no keyword arguments.
    """

    def __init__(self, result_queue, target, args, kwargs):
        self.target = target
        self.args = () if args is None else args
        self.kwargs = {} if kwargs is None else kwargs
        self.result_queue = result_queue

    def __call__(self):
        """Run the target and put ``(returncode, stdout, stderr)`` on the result queue.

        The return code is 0 on normal completion, 1 if the target raised an
        exception (the traceback goes to the captured stderr), and follows the
        ``sys.exit()`` convention if the target raised ``SystemExit``.
        """
        # Capture stdout/stderr
        old_stdout = sys.stdout
        old_stderr = sys.stderr
        sys.stdout = StringIO()
        sys.stderr = StringIO()

        try:
            self.target(*self.args, **self.kwargs)
            returncode = 0
        except SystemExit as e:  # Handle sys.exit()
            # Mirror the interpreter's handling of SystemExit: None means
            # success, an int is the exit status, and any other object is
            # printed to stderr with exit status 1.
            if e.code is None:
                returncode = 0
            elif isinstance(e.code, int):
                returncode = e.code
            else:
                print(e.code, file=sys.stderr)
                returncode = 1
        except BaseException:
            traceback.print_exc()
            returncode = 1
        finally:
            # Collect outputs and restore streams
            stdout = sys.stdout.getvalue()
            stderr = sys.stderr.getvalue()
            sys.stdout = old_stdout
            sys.stderr = old_stderr
            try:  # noqa: SIM105
                self.result_queue.put((returncode, stdout, stderr))
            except Exception:  # nosec B110
                # Deliberately best effort: if the queue is broken (e.g., parent
                # gone) there is nobody left to report to.
                pass


def run_in_spawned_child_process(
    target: Callable[..., None],
    *,
    args: Optional[Sequence[Any]] = None,
    kwargs: Optional[dict[str, Any]] = None,
    timeout: Optional[float] = None,
    rethrow: bool = False,
) -> CompletedProcess:
    """Run `target` in a spawned child process, capturing stdout/stderr.

    The provided `target` must be defined at the top level of a module, and must
    be importable in the spawned child process. Lambdas, closures, or interactively
    defined functions (e.g., in Jupyter notebooks) will not work.

    If `rethrow=True` and the child process exits with a nonzero code,
    raises ChildProcessError with the captured stderr.

    Args:
        target: The callable to run in the child process.
        args: Positional arguments for `target` (default: none).
        kwargs: Keyword arguments for `target` (default: none).
        timeout: Maximum number of seconds to wait for the child to finish;
            `None` waits indefinitely.
        rethrow: If true, raise instead of returning a nonzero result.

    Returns:
        A CompletedProcess with the child's return code and captured output.
        The return code is PROCESS_KILLED if the child was terminated after
        the timeout, and PROCESS_NO_RESULT if it exited without reporting.

    Raises:
        ChildProcessError: If `rethrow` is true and the return code is nonzero.
    """
    import time  # function-local: only needed here for deadline bookkeeping

    # How long each queue poll blocks before re-checking child liveness/deadline.
    poll_interval = 0.05

    ctx = multiprocessing.get_context("spawn")
    result_queue = ctx.Queue()
    process = ctx.Process(target=ChildProcessWrapper(result_queue, target, args, kwargs))
    process.start()
    deadline = None if timeout is None else time.monotonic() + timeout

    try:
        # Read the result BEFORE joining. A child that has put data on a
        # multiprocessing queue does not exit until that data has been flushed
        # to the pipe, so joining first can block (or be misreported as a
        # timeout) when the captured output exceeds the pipe buffer.
        payload = None
        timed_out = False
        while True:
            try:
                payload = result_queue.get(timeout=poll_interval)
                break
            except EOFError:
                break
            except queue.Empty:
                pass
            if not process.is_alive():
                # The child may have delivered its result just before exiting;
                # give the queue one last chance before declaring "no result".
                try:
                    payload = result_queue.get(timeout=1.0)
                except (queue.Empty, EOFError):
                    pass
                break
            if deadline is not None and time.monotonic() >= deadline:
                timed_out = True
                break

        if payload is not None:
            # The result is in hand; the timeout still bounds process exit.
            # Allow at least one poll interval so a result that arrived right
            # at the deadline is not discarded.
            process.join(None if deadline is None else max(poll_interval, deadline - time.monotonic()))
            timed_out = process.is_alive()

        if timed_out:
            process.terminate()
            process.join()
            result = CompletedProcess(
                returncode=PROCESS_KILLED,
                stdout="",
                stderr=f"Process timed out after {timeout} seconds and was terminated.",
            )
        elif payload is None:
            result = CompletedProcess(
                returncode=PROCESS_NO_RESULT,
                stdout="",
                stderr="Process exited or crashed before returning results.",
            )
        else:
            returncode, stdout, stderr = payload
            result = CompletedProcess(
                returncode=returncode,
                stdout=stdout,
                stderr=stderr,
            )

        if rethrow and result.returncode != 0:
            raise ChildProcessError(
                f"Child process exited with code {result.returncode}.\n"
                "--- stderr-from-child-process ---\n"
                f"{result.stderr}"
                "<end-of-stderr-from-child-process>\n"
            )

        return result

    finally:
        # Best-effort cleanup: failures here must not mask the result or the
        # exception that is already propagating.
        try:
            result_queue.close()
            result_queue.join_thread()
        except Exception:  # nosec B110
            pass
        if process.is_alive():
            process.kill()
            process.join()
65 changes: 32 additions & 33 deletions cuda_bindings/tests/test_path_finder_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import sys

import pytest
from run_python_code_safely import run_python_code_safely
import spawned_process_runner

from cuda.bindings import path_finder
from cuda.bindings._path_finder import supported_libs
Expand Down Expand Up @@ -38,46 +38,45 @@ def test_all_libnames_expected_lib_symbols_consistency():
assert tuple(sorted(ALL_LIBNAMES)) == tuple(sorted(supported_libs.EXPECTED_LIB_SYMBOLS.keys()))


def build_subprocess_failed_for_libname_message(libname, result):
def build_child_process_failed_for_libname_message(libname, result):
return (
f"Subprocess failed for {libname=!r} with exit code {result.returncode}\n"
f"--- stdout-from-subprocess ---\n{result.stdout}<end-of-stdout-from-subprocess>\n"
f"--- stderr-from-subprocess ---\n{result.stderr}<end-of-stderr-from-subprocess>\n"
f"Child process failed for {libname=!r} with exit code {result.returncode}\n"
f"--- stdout-from-child-process ---\n{result.stdout}<end-of-stdout-from-child-process>\n"
f"--- stderr-from-child-process ---\n{result.stderr}<end-of-stderr-from-child-process>\n"
)


def child_process_func(libname):
    """Load `libname` three ways and check that the results are consistent.

    Raises RuntimeError naming the failed condition if any check fails;
    otherwise prints the repr of the absolute path of the loaded library.
    """
    import os

    from cuda.bindings._path_finder.load_nvidia_dynamic_library import _load_nvidia_dynamic_library_no_cache
    from cuda.bindings.path_finder import _load_nvidia_dynamic_library

    # First load: the library must not have been loaded by anything else.
    loaded_dl_fresh = _load_nvidia_dynamic_library(libname)
    if loaded_dl_fresh.was_already_loaded_from_elsewhere:
        raise RuntimeError("loaded_dl_fresh.was_already_loaded_from_elsewhere")

    # Second load: must return the very same object as the first load.
    loaded_dl_from_cache = _load_nvidia_dynamic_library(libname)
    if loaded_dl_from_cache is not loaded_dl_fresh:
        raise RuntimeError("loaded_dl_from_cache is not loaded_dl_fresh")

    # No-cache load: must report the library as already loaded, from the same file.
    loaded_dl_no_cache = _load_nvidia_dynamic_library_no_cache(libname)
    if not loaded_dl_no_cache.was_already_loaded_from_elsewhere:
        # The message states the condition that failed, like the other checks.
        raise RuntimeError("not loaded_dl_no_cache.was_already_loaded_from_elsewhere")
    if not os.path.samefile(loaded_dl_no_cache.abs_path, loaded_dl_fresh.abs_path):
        raise RuntimeError(f"not os.path.samefile({loaded_dl_no_cache.abs_path=!r}, {loaded_dl_fresh.abs_path=!r})")

    print(f"{loaded_dl_fresh.abs_path!r}")


@pytest.mark.parametrize("libname", TEST_FIND_OR_LOAD_LIBNAMES)
def test_find_or_load_nvidia_dynamic_library(info_summary_append, libname):
# We intentionally run each dynamic library operation in a subprocess
# We intentionally run each dynamic library operation in a child process
# to ensure isolation of global dynamic linking state (e.g., dlopen handles).
# Without subprocesses, loading/unloading libraries during testing could
# Without child processes, loading/unloading libraries during testing could
# interfere across test cases and lead to nondeterministic or platform-specific failures.
#
# Defining the subprocess code snippets as strings ensures each subprocess
# runs a minimal, independent script tailored to the specific libname and API being tested.
code = f"""\
import os
from cuda.bindings.path_finder import _load_nvidia_dynamic_library
from cuda.bindings._path_finder.load_nvidia_dynamic_library import _load_nvidia_dynamic_library_no_cache

loaded_dl_fresh = _load_nvidia_dynamic_library({libname!r})
if loaded_dl_fresh.was_already_loaded_from_elsewhere:
raise RuntimeError("loaded_dl_fresh.was_already_loaded_from_elsewhere")

loaded_dl_from_cache = _load_nvidia_dynamic_library({libname!r})
if loaded_dl_from_cache is not loaded_dl_fresh:
raise RuntimeError("loaded_dl_from_cache is not loaded_dl_fresh")

loaded_dl_no_cache = _load_nvidia_dynamic_library_no_cache({libname!r})
if not loaded_dl_no_cache.was_already_loaded_from_elsewhere:
raise RuntimeError("loaded_dl_no_cache.was_already_loaded_from_elsewhere")
if not os.path.samefile(loaded_dl_no_cache.abs_path, loaded_dl_fresh.abs_path):
raise RuntimeError(f"not os.path.samefile({{loaded_dl_no_cache.abs_path=!r}}, {{loaded_dl_fresh.abs_path=!r}})")

print(f"{{loaded_dl_fresh.abs_path!r}}")
"""
result = run_python_code_safely(code, timeout=30)
result = spawned_process_runner.run_in_spawned_child_process(child_process_func, args=(libname,), timeout=30)
if result.returncode == 0:
info_summary_append(f"abs_path={result.stdout.rstrip()}")
else:
raise RuntimeError(build_subprocess_failed_for_libname_message(libname, result))
raise RuntimeError(build_child_process_failed_for_libname_message(libname, result))
Loading
Loading