NVIDIA · rwgk · May 9, 2025 · May 7, 2025 · May 7, 2025 · May 7, 2025
diff --git a/cuda_bindings/cuda/bindings/_path_finder/README.md b/cuda_bindings/cuda/bindings/_path_finder/README.md
@@ -31,12 +31,13 @@ strategy for locating NVIDIA shared libraries:
    - Falls back to native loader:
      - `dlopen()` on Linux
      - `LoadLibraryW()` on Windows
+   - Conda installations are expected to be discovered:
+     - Linux: Via `$ORIGIN/../lib` on `RPATH` (of the `python` binary;
+       note that this preempts `LD_LIBRARY_PATH` and `/etc/ld.so.conf.d/`)
+     - Windows: Via `%CONDA_PREFIX%\Library\bin` on system `PATH`
    - CTK installations with system config updates are expected to be discovered:
      - Linux: Via `/etc/ld.so.conf.d/*cuda*.conf`
      - Windows: Via `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y\bin` on system `PATH`
-   - Conda installations are expected to be discovered:
-     - Linux: Via `$ORIGIN/../lib` on `RPATH` (of the `python` binary)
-     - Windows: Via `%CONDA_PREFIX%\Library\bin` on system `PATH`
 
 3. **Environment variables**
    - Relies on `CUDA_HOME` or `CUDA_PATH` environment variables if set

diff --git a/cuda_bindings/cuda/bindings/_path_finder/load_dl_windows.py b/cuda_bindings/cuda/bindings/_path_finder/load_dl_windows.py
@@ -1,8 +1,6 @@
 # Copyright 2025 NVIDIA Corporation.  All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-import ctypes
-import ctypes.wintypes
 from typing import Optional
 
 import pywintypes
@@ -36,46 +34,11 @@ def add_dll_directory(dll_abs_path: str) -> None:
 
 
 def abs_path_for_dynamic_library(libname: str, handle: pywintypes.HANDLE) -> str:
-    """Get the absolute path of a loaded dynamic library on Windows.
-
-    Args:
-        handle: The library handle
-
-    Returns:
-        The absolute path to the DLL file
-
-    Raises:
-        OSError: If GetModuleFileNameW fails
-        RuntimeError: If the required path length is unreasonably long
-    """
-    MAX_ITERATIONS = 10  # Allows for extremely long paths (up to ~266,000 chars)
-    buf_size = 260  # Start with traditional MAX_PATH
-
-    for _ in range(MAX_ITERATIONS):
-        buf = ctypes.create_unicode_buffer(buf_size)
-        n_chars = ctypes.windll.kernel32.GetModuleFileNameW(ctypes.wintypes.HMODULE(handle), buf, buf_size)
-
-        if n_chars == 0:
-            raise OSError(
-                f"GetModuleFileNameW failed ({libname=!r}, {buf_size=}). "
-                "Long paths may require enabling the "
-                "Windows 10+ long path registry setting. See: "
-                "https://docs.python.org/3/using/windows.html#removing-the-max-path-limitation"
-            )
-        if n_chars < buf_size - 1:
-            return buf.value
-
-        buf_size *= 2  # Double the buffer size and try again
-
-    raise RuntimeError(
-        f"Failed to retrieve the full path after {MAX_ITERATIONS} attempts "
-        f"(final buffer size: {buf_size} characters). "
-        "This may indicate:\n"
-        "  1. An extremely long path requiring Windows long path support, or\n"
-        "  2. An invalid or corrupt library handle, or\n"
-        "  3. An unexpected system error.\n"
-        "See: https://docs.python.org/3/using/windows.html#removing-the-max-path-limitation"
-    )
+    """Get the absolute path of a loaded dynamic library on Windows."""
+    try:
+        return win32api.GetModuleFileName(handle)
+    except Exception as e:
+        raise RuntimeError(f"GetModuleFileName failed for {libname!r} (exception type: {type(e)})") from e
 
 
 def check_if_already_loaded_from_elsewhere(libname: str) -> Optional[LoadedDL]:

diff --git a/cuda_bindings/tests/run_python_code_safely.py b/cuda_bindings/tests/run_python_code_safely.py
diff --git a/cuda_bindings/tests/spawned_process_runner.py b/cuda_bindings/tests/spawned_process_runner.py
@@ -0,0 +1,126 @@
+# Copyright 2025 NVIDIA Corporation.  All rights reserved.
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+import multiprocessing
+import queue  # for Empty
+import sys
+import traceback
+from dataclasses import dataclass
+from io import StringIO
+from typing import Any, Callable, Optional, Sequence
+
+PROCESS_KILLED = -9
+PROCESS_NO_RESULT = -999
+
+
+# Similar to https://docs.python.org/3/library/subprocess.html#subprocess.CompletedProcess
+# (args, check_returncode() are intentionally not supported here.)
+@dataclass
+class CompletedProcess:
+    returncode: int
+    stdout: str
+    stderr: str
+
+
+class ChildProcessWrapper:
+    def __init__(self, result_queue, target, args, kwargs):
+        self.target = target
+        self.args = () if args is None else args
+        self.kwargs = {} if kwargs is None else kwargs
+        self.result_queue = result_queue
+
+    def __call__(self):
+        # Capture stdout/stderr
+        old_stdout = sys.stdout
+        old_stderr = sys.stderr
+        sys.stdout = StringIO()
+        sys.stderr = StringIO()
+
+        try:
+            self.target(*self.args, **self.kwargs)
+            returncode = 0
+        except SystemExit as e:  # Handle sys.exit()
+            returncode = e.code if isinstance(e.code, int) else 0
+        except BaseException:
+            traceback.print_exc()
+            returncode = 1
+        finally:
+            # Collect outputs and restore streams
+            stdout = sys.stdout.getvalue()
+            stderr = sys.stderr.getvalue()
+            sys.stdout = old_stdout
+            sys.stderr = old_stderr
+            try:  # noqa: SIM105
+                self.result_queue.put((returncode, stdout, stderr))
+            except Exception:  # nosec B110
+                # If the queue is broken (e.g., parent gone), give up silently (best effort; nothing is logged)
+                pass
+
+
+def run_in_spawned_child_process(
+    target: Callable[..., None],
+    *,
+    args: Optional[Sequence[Any]] = None,
+    kwargs: Optional[dict[str, Any]] = None,
+    timeout: Optional[float] = None,
+    rethrow: bool = False,
+) -> CompletedProcess:
+    """Run `target` in a spawned child process, capturing stdout/stderr.
+
+    The provided `target` must be defined at the top level of a module, and must
+    be importable in the spawned child process. Lambdas, closures, or interactively
+    defined functions (e.g., in Jupyter notebooks) will not work.
+
+    If `rethrow=True` and the child process exits with a nonzero code,
+    raises ChildProcessError with the captured stderr.
+    """
+    ctx = multiprocessing.get_context("spawn")
+    result_queue = ctx.Queue()
+    process = ctx.Process(target=ChildProcessWrapper(result_queue, target, args, kwargs))
+    process.start()
+
+    try:
+        process.join(timeout)
+        if process.is_alive():
+            process.terminate()
+            process.join()
+            result = CompletedProcess(
+                returncode=PROCESS_KILLED,
+                stdout="",
+                stderr=f"Process timed out after {timeout} seconds and was terminated.",
+            )
+        else:
+            try:
+                returncode, stdout, stderr = result_queue.get(timeout=1.0)
+            except (queue.Empty, EOFError):
+                result = CompletedProcess(
+                    returncode=PROCESS_NO_RESULT,
+                    stdout="",
+                    stderr="Process exited or crashed before returning results.",
+                )
+            else:
+                result = CompletedProcess(
+                    returncode=returncode,
+                    stdout=stdout,
+                    stderr=stderr,
+                )
+
+        if rethrow and result.returncode != 0:
+            raise ChildProcessError(
+                f"Child process exited with code {result.returncode}.\n"
+                "--- stderr-from-child-process ---\n"
+                f"{result.stderr}"
+                "<end-of-stderr-from-child-process>\n"
+            )
+
+        return result
+
+    finally:
+        try:
+            result_queue.close()
+            result_queue.join_thread()
+        except Exception:  # nosec B110
+            pass
+        if process.is_alive():
+            process.kill()
+            process.join()
diff --git a/cuda_bindings/tests/test_path_finder_load.py b/cuda_bindings/tests/test_path_finder_load.py
@@ -5,7 +5,7 @@
 import sys
 
 import pytest
-from run_python_code_safely import run_python_code_safely
+import spawned_process_runner
 
 from cuda.bindings import path_finder
 from cuda.bindings._path_finder import supported_libs
@@ -38,46 +38,45 @@ def test_all_libnames_expected_lib_symbols_consistency():
     assert tuple(sorted(ALL_LIBNAMES)) == tuple(sorted(supported_libs.EXPECTED_LIB_SYMBOLS.keys()))
 
 
-def build_subprocess_failed_for_libname_message(libname, result):
+def build_child_process_failed_for_libname_message(libname, result):
     return (
-        f"Subprocess failed for {libname=!r} with exit code {result.returncode}\n"
-        f"--- stdout-from-subprocess ---\n{result.stdout}<end-of-stdout-from-subprocess>\n"
-        f"--- stderr-from-subprocess ---\n{result.stderr}<end-of-stderr-from-subprocess>\n"
+        f"Child process failed for {libname=!r} with exit code {result.returncode}\n"
+        f"--- stdout-from-child-process ---\n{result.stdout}<end-of-stdout-from-child-process>\n"
+        f"--- stderr-from-child-process ---\n{result.stderr}<end-of-stderr-from-child-process>\n"
     )
 
 
+def child_process_func(libname):
+    import os
+
+    from cuda.bindings._path_finder.load_nvidia_dynamic_library import _load_nvidia_dynamic_library_no_cache
+    from cuda.bindings.path_finder import _load_nvidia_dynamic_library
+
+    loaded_dl_fresh = _load_nvidia_dynamic_library(libname)
+    if loaded_dl_fresh.was_already_loaded_from_elsewhere:
+        raise RuntimeError("loaded_dl_fresh.was_already_loaded_from_elsewhere")
+
+    loaded_dl_from_cache = _load_nvidia_dynamic_library(libname)
+    if loaded_dl_from_cache is not loaded_dl_fresh:
+        raise RuntimeError("loaded_dl_from_cache is not loaded_dl_fresh")
+
+    loaded_dl_no_cache = _load_nvidia_dynamic_library_no_cache(libname)
+    if not loaded_dl_no_cache.was_already_loaded_from_elsewhere:
+        raise RuntimeError("loaded_dl_no_cache.was_already_loaded_from_elsewhere")
+    if not os.path.samefile(loaded_dl_no_cache.abs_path, loaded_dl_fresh.abs_path):
+        raise RuntimeError(f"not os.path.samefile({loaded_dl_no_cache.abs_path=!r}, {loaded_dl_fresh.abs_path=!r})")
+
+    print(f"{loaded_dl_fresh.abs_path!r}")
+
+
 @pytest.mark.parametrize("libname", TEST_FIND_OR_LOAD_LIBNAMES)
 def test_find_or_load_nvidia_dynamic_library(info_summary_append, libname):
-    # We intentionally run each dynamic library operation in a subprocess
+    # We intentionally run each dynamic library operation in a child process
     # to ensure isolation of global dynamic linking state (e.g., dlopen handles).
-    # Without subprocesses, loading/unloading libraries during testing could
+    # Without child processes, loading/unloading libraries during testing could
     # interfere across test cases and lead to nondeterministic or platform-specific failures.
-    #
-    # Defining the subprocess code snippets as strings ensures each subprocess
-    # runs a minimal, independent script tailored to the specific libname and API being tested.
-    code = f"""\
-import os
-from cuda.bindings.path_finder import _load_nvidia_dynamic_library
-from cuda.bindings._path_finder.load_nvidia_dynamic_library import _load_nvidia_dynamic_library_no_cache
-
-loaded_dl_fresh = _load_nvidia_dynamic_library({libname!r})
-if loaded_dl_fresh.was_already_loaded_from_elsewhere:
-    raise RuntimeError("loaded_dl_fresh.was_already_loaded_from_elsewhere")
-
-loaded_dl_from_cache = _load_nvidia_dynamic_library({libname!r})
-if loaded_dl_from_cache is not loaded_dl_fresh:
-    raise RuntimeError("loaded_dl_from_cache is not loaded_dl_fresh")
-
-loaded_dl_no_cache = _load_nvidia_dynamic_library_no_cache({libname!r})
-if not loaded_dl_no_cache.was_already_loaded_from_elsewhere:
-    raise RuntimeError("loaded_dl_no_cache.was_already_loaded_from_elsewhere")
-if not os.path.samefile(loaded_dl_no_cache.abs_path, loaded_dl_fresh.abs_path):
-    raise RuntimeError(f"not os.path.samefile({{loaded_dl_no_cache.abs_path=!r}}, {{loaded_dl_fresh.abs_path=!r}})")
-
-print(f"{{loaded_dl_fresh.abs_path!r}}")
-"""
-    result = run_python_code_safely(code, timeout=30)
+    result = spawned_process_runner.run_in_spawned_child_process(child_process_func, args=(libname,), timeout=30)
     if result.returncode == 0:
         info_summary_append(f"abs_path={result.stdout.rstrip()}")
     else:
-        raise RuntimeError(build_subprocess_failed_for_libname_message(libname, result))
+        raise RuntimeError(build_child_process_failed_for_libname_message(libname, result))