From 048e689e5a4cdf2f6c3edb4a38b1c19d37cef309 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Wed, 7 May 2025 11:40:13 -0700
Subject: [PATCH 1/9] Use win32api.GetModuleFileName() in
 abs_path_for_dynamic_library(). With this, load_dl_windows.py consistently
 uses win32api. ctypes is no longer needed, which eliminates the potential for
 confusion due to different types of handles.

---
 .../bindings/_path_finder/load_dl_windows.py  | 47 ++-----------------
 1 file changed, 5 insertions(+), 42 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/_path_finder/load_dl_windows.py b/cuda_bindings/cuda/bindings/_path_finder/load_dl_windows.py
index ec305be927..0d13680a6b 100644
--- a/cuda_bindings/cuda/bindings/_path_finder/load_dl_windows.py
+++ b/cuda_bindings/cuda/bindings/_path_finder/load_dl_windows.py
@@ -1,8 +1,6 @@
 # Copyright 2025 NVIDIA Corporation.  All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-import ctypes
-import ctypes.wintypes
 from typing import Optional
 
 import pywintypes
@@ -36,46 +34,11 @@ def add_dll_directory(dll_abs_path: str) -> None:
 
 
 def abs_path_for_dynamic_library(libname: str, handle: pywintypes.HANDLE) -> str:
-    """Get the absolute path of a loaded dynamic library on Windows.
-
-    Args:
-        handle: The library handle
-
-    Returns:
-        The absolute path to the DLL file
-
-    Raises:
-        OSError: If GetModuleFileNameW fails
-        RuntimeError: If the required path length is unreasonably long
-    """
-    MAX_ITERATIONS = 10  # Allows for extremely long paths (up to ~266,000 chars)
-    buf_size = 260  # Start with traditional MAX_PATH
-
-    for _ in range(MAX_ITERATIONS):
-        buf = ctypes.create_unicode_buffer(buf_size)
-        n_chars = ctypes.windll.kernel32.GetModuleFileNameW(ctypes.wintypes.HMODULE(handle), buf, buf_size)
-
-        if n_chars == 0:
-            raise OSError(
-                f"GetModuleFileNameW failed ({libname=!r}, {buf_size=}). "
-                "Long paths may require enabling the "
-                "Windows 10+ long path registry setting. See: "
-                "https://docs.python.org/3/using/windows.html#removing-the-max-path-limitation"
-            )
-        if n_chars < buf_size - 1:
-            return buf.value
-
-        buf_size *= 2  # Double the buffer size and try again
-
-    raise RuntimeError(
-        f"Failed to retrieve the full path after {MAX_ITERATIONS} attempts "
-        f"(final buffer size: {buf_size} characters). "
-        "This may indicate:\n"
-        "  1. An extremely long path requiring Windows long path support, or\n"
-        "  2. An invalid or corrupt library handle, or\n"
-        "  3. An unexpected system error.\n"
-        "See: https://docs.python.org/3/using/windows.html#removing-the-max-path-limitation"
-    )
+    """Get the absolute path of a loaded dynamic library on Windows."""
+    try:
+        return win32api.GetModuleFileName(handle)
+    except Exception as e:
+        raise RuntimeError(f"GetModuleFileName failed for {libname!r} (exception type: {type(e)})") from e
 
 
 def check_if_already_loaded_from_elsewhere(libname: str) -> Optional[LoadedDL]:

From d539b4337bfe056e1ef028856bb9efd6a1a33e86 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Wed, 7 May 2025 11:48:49 -0700
Subject: [PATCH 2/9] Address review comment
 https://github.com/NVIDIA/cuda-python/pull/604#discussion_r2075856805 by
 at-kkraus14

---
 cuda_bindings/cuda/bindings/_path_finder/README.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/_path_finder/README.md b/cuda_bindings/cuda/bindings/_path_finder/README.md
index fa51b56fa7..1e115f2eda 100644
--- a/cuda_bindings/cuda/bindings/_path_finder/README.md
+++ b/cuda_bindings/cuda/bindings/_path_finder/README.md
@@ -31,12 +31,13 @@ strategy for locating NVIDIA shared libraries:
    - Falls back to native loader:
      - `dlopen()` on Linux
      - `LoadLibraryW()` on Windows
+   - Conda installations are expected to be discovered:
+     - Linux: Via `$ORIGIN/../lib` on `RPATH` (of the `python` binary;
+       note that this preempts `LD_LIBRARY_PATH` and `/etc/ld.so.conf.d/`)
+     - Windows: Via `%CONDA_PREFIX%\Library\bin` on system `PATH`
    - CTK installations with system config updates are expected to be discovered:
      - Linux: Via `/etc/ld.so.conf.d/*cuda*.conf`
      - Windows: Via `C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y\bin` on system `PATH`
-   - Conda installations are expected to be discovered:
-     - Linux: Via `$ORIGIN/../lib` on `RPATH` (of the `python` binary)
-     - Windows: Via `%CONDA_PREFIX%\Library\bin` on system `PATH`
 
 3. **Environment variables**
    - Relies on `CUDA_HOME` or `CUDA_PATH` environment variables if set

From 8cc1b2b8a14c64ce9e0167b9ff304fae073a7d3f Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Wed, 7 May 2025 12:22:32 -0700
Subject: [PATCH 3/9] =?UTF-8?q?Rename=20function=20run=5Fpython=5Fcode=5Fs?=
 =?UTF-8?q?afely()=20=E2=86=92=20run=5Fin=5Fspawned=5Fchild=5Fprocess()?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cuda_bindings/tests/run_python_code_safely.py | 4 ++--
 cuda_bindings/tests/test_path_finder_load.py  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cuda_bindings/tests/run_python_code_safely.py b/cuda_bindings/tests/run_python_code_safely.py
index 349ed96826..5a5fc38c20 100644
--- a/cuda_bindings/tests/run_python_code_safely.py
+++ b/cuda_bindings/tests/run_python_code_safely.py
@@ -42,8 +42,8 @@ def __call__(self):
                 pass
 
 
-def run_python_code_safely(python_code, *, timeout=None):
-    """Run Python code in a spawned subprocess, capturing stdout/stderr/output."""
+def run_in_spawned_child_process(python_code, *, timeout=None):
+    """Run Python code in a spawned child process, capturing stdout/stderr/output."""
     ctx = multiprocessing.get_context("spawn")
     result_queue = ctx.Queue()
     process = ctx.Process(target=Worker(python_code, result_queue))
diff --git a/cuda_bindings/tests/test_path_finder_load.py b/cuda_bindings/tests/test_path_finder_load.py
index 5c21e8a058..f5f0814c6f 100644
--- a/cuda_bindings/tests/test_path_finder_load.py
+++ b/cuda_bindings/tests/test_path_finder_load.py
@@ -5,7 +5,7 @@
 import sys
 
 import pytest
-from run_python_code_safely import run_python_code_safely
+import run_python_code_safely
 
 from cuda.bindings import path_finder
 from cuda.bindings._path_finder import supported_libs
@@ -76,7 +76,7 @@ def test_find_or_load_nvidia_dynamic_library(info_summary_append, libname):
 
 print(f"{{loaded_dl_fresh.abs_path!r}}")
 """
-    result = run_python_code_safely(code, timeout=30)
+    result = run_python_code_safely.run_in_spawned_child_process(code, timeout=30)
     if result.returncode == 0:
         info_summary_append(f"abs_path={result.stdout.rstrip()}")
     else:

From b3dee33ea3a5cef87cdb05dc77b997163da28f0c Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Wed, 7 May 2025 14:41:58 -0700
Subject: [PATCH 4/9] Change run_in_spawned_child_process() to accept a
 callable function instead of a string with Python code.

---
 cuda_bindings/tests/run_python_code_safely.py | 32 ++++++----
 cuda_bindings/tests/test_path_finder_load.py  | 63 +++++++++----------
 2 files changed, 51 insertions(+), 44 deletions(-)

diff --git a/cuda_bindings/tests/run_python_code_safely.py b/cuda_bindings/tests/run_python_code_safely.py
index 5a5fc38c20..7b601a0073 100644
--- a/cuda_bindings/tests/run_python_code_safely.py
+++ b/cuda_bindings/tests/run_python_code_safely.py
@@ -3,15 +3,17 @@
 
 import multiprocessing
 import queue  # for Empty
-import subprocess  # nosec B404
 import sys
 import traceback
+from dataclasses import dataclass
 from io import StringIO
 
 
 class Worker:
-    def __init__(self, python_code, result_queue):
-        self.python_code = python_code
+    def __init__(self, result_queue, func, args, kwargs):
+        self.func = func
+        self.args = args or ()
+        self.kwargs = kwargs or {}
         self.result_queue = result_queue
 
     def __call__(self):
@@ -22,7 +24,7 @@ def __call__(self):
         sys.stderr = StringIO()
 
         try:
-            exec(self.python_code, {"__name__": "__main__"})  # nosec B102
+            self.func(*self.args, **self.kwargs)
             returncode = 0
         except SystemExit as e:  # Handle sys.exit()
             returncode = e.code if isinstance(e.code, int) else 0
@@ -42,11 +44,20 @@ def __call__(self):
                 pass
 
 
-def run_in_spawned_child_process(python_code, *, timeout=None):
+# Similar to https://docs.python.org/3/library/subprocess.html#subprocess.CompletedProcess
+# (args, check_returncode() are intentionally not supported here.)
+@dataclass
+class CompletedProcess:
+    returncode: int
+    stdout: str
+    stderr: str
+
+
+def run_in_spawned_child_process(func, *, args=None, kwargs=None, timeout=None):
     """Run Python code in a spawned child process, capturing stdout/stderr/output."""
     ctx = multiprocessing.get_context("spawn")
     result_queue = ctx.Queue()
-    process = ctx.Process(target=Worker(python_code, result_queue))
+    process = ctx.Process(target=Worker(result_queue, func, args, kwargs))
     process.start()
 
     try:
@@ -54,8 +65,7 @@ def run_in_spawned_child_process(python_code, *, timeout=None):
         if process.is_alive():
             process.terminate()
             process.join()
-            return subprocess.CompletedProcess(
-                args=[sys.executable, "-c", python_code],
+            return CompletedProcess(
                 returncode=-9,
                 stdout="",
                 stderr=f"Process timed out after {timeout} seconds and was terminated.",
@@ -64,15 +74,13 @@ def run_in_spawned_child_process(python_code, *, timeout=None):
         try:
             returncode, stdout, stderr = result_queue.get(timeout=1.0)
         except (queue.Empty, EOFError):
-            return subprocess.CompletedProcess(
-                args=[sys.executable, "-c", python_code],
+            return CompletedProcess(
                 returncode=-999,
                 stdout="",
                 stderr="Process exited or crashed before returning results.",
             )
 
-        return subprocess.CompletedProcess(
-            args=[sys.executable, "-c", python_code],
+        return CompletedProcess(
             returncode=returncode,
             stdout=stdout,
             stderr=stderr,
diff --git a/cuda_bindings/tests/test_path_finder_load.py b/cuda_bindings/tests/test_path_finder_load.py
index f5f0814c6f..a5d0a1e1d1 100644
--- a/cuda_bindings/tests/test_path_finder_load.py
+++ b/cuda_bindings/tests/test_path_finder_load.py
@@ -38,46 +38,45 @@ def test_all_libnames_expected_lib_symbols_consistency():
     assert tuple(sorted(ALL_LIBNAMES)) == tuple(sorted(supported_libs.EXPECTED_LIB_SYMBOLS.keys()))
 
 
-def build_subprocess_failed_for_libname_message(libname, result):
+def build_child_process_failed_for_libname_message(libname, result):
     return (
-        f"Subprocess failed for {libname=!r} with exit code {result.returncode}\n"
-        f"--- stdout-from-subprocess ---\n{result.stdout}<end-of-stdout-from-subprocess>\n"
-        f"--- stderr-from-subprocess ---\n{result.stderr}<end-of-stderr-from-subprocess>\n"
+        f"Child process failed for {libname=!r} with exit code {result.returncode}\n"
+        f"--- stdout-from-child-process ---\n{result.stdout}<end-of-stdout-from-child-process>\n"
+        f"--- stderr-from-child-process ---\n{result.stderr}<end-of-stderr-from-child-process>\n"
     )
 
 
+def child_process_func(libname):
+    import os
+
+    from cuda.bindings._path_finder.load_nvidia_dynamic_library import _load_nvidia_dynamic_library_no_cache
+    from cuda.bindings.path_finder import _load_nvidia_dynamic_library
+
+    loaded_dl_fresh = _load_nvidia_dynamic_library(libname)
+    if loaded_dl_fresh.was_already_loaded_from_elsewhere:
+        raise RuntimeError("loaded_dl_fresh.was_already_loaded_from_elsewhere")
+
+    loaded_dl_from_cache = _load_nvidia_dynamic_library(libname)
+    if loaded_dl_from_cache is not loaded_dl_fresh:
+        raise RuntimeError("loaded_dl_from_cache is not loaded_dl_fresh")
+
+    loaded_dl_no_cache = _load_nvidia_dynamic_library_no_cache(libname)
+    if not loaded_dl_no_cache.was_already_loaded_from_elsewhere:
+        raise RuntimeError("loaded_dl_no_cache.was_already_loaded_from_elsewhere")
+    if not os.path.samefile(loaded_dl_no_cache.abs_path, loaded_dl_fresh.abs_path):
+        raise RuntimeError(f"not os.path.samefile({loaded_dl_no_cache.abs_path=!r}, {loaded_dl_fresh.abs_path=!r})")
+
+    print(f"{loaded_dl_fresh.abs_path!r}")
+
+
 @pytest.mark.parametrize("libname", TEST_FIND_OR_LOAD_LIBNAMES)
 def test_find_or_load_nvidia_dynamic_library(info_summary_append, libname):
-    # We intentionally run each dynamic library operation in a subprocess
+    # We intentionally run each dynamic library operation in a child process
     # to ensure isolation of global dynamic linking state (e.g., dlopen handles).
-    # Without subprocesses, loading/unloading libraries during testing could
+    # Without child processes, loading/unloading libraries during testing could
     # interfere across test cases and lead to nondeterministic or platform-specific failures.
-    #
-    # Defining the subprocess code snippets as strings ensures each subprocess
-    # runs a minimal, independent script tailored to the specific libname and API being tested.
-    code = f"""\
-import os
-from cuda.bindings.path_finder import _load_nvidia_dynamic_library
-from cuda.bindings._path_finder.load_nvidia_dynamic_library import _load_nvidia_dynamic_library_no_cache
-
-loaded_dl_fresh = _load_nvidia_dynamic_library({libname!r})
-if loaded_dl_fresh.was_already_loaded_from_elsewhere:
-    raise RuntimeError("loaded_dl_fresh.was_already_loaded_from_elsewhere")
-
-loaded_dl_from_cache = _load_nvidia_dynamic_library({libname!r})
-if loaded_dl_from_cache is not loaded_dl_fresh:
-    raise RuntimeError("loaded_dl_from_cache is not loaded_dl_fresh")
-
-loaded_dl_no_cache = _load_nvidia_dynamic_library_no_cache({libname!r})
-if not loaded_dl_no_cache.was_already_loaded_from_elsewhere:
-    raise RuntimeError("loaded_dl_no_cache.was_already_loaded_from_elsewhere")
-if not os.path.samefile(loaded_dl_no_cache.abs_path, loaded_dl_fresh.abs_path):
-    raise RuntimeError(f"not os.path.samefile({{loaded_dl_no_cache.abs_path=!r}}, {{loaded_dl_fresh.abs_path=!r}})")
-
-print(f"{{loaded_dl_fresh.abs_path!r}}")
-"""
-    result = run_python_code_safely.run_in_spawned_child_process(code, timeout=30)
+    result = run_python_code_safely.run_in_spawned_child_process(child_process_func, args=(libname,), timeout=30)
     if result.returncode == 0:
         info_summary_append(f"abs_path={result.stdout.rstrip()}")
     else:
-        raise RuntimeError(build_subprocess_failed_for_libname_message(libname, result))
+        raise RuntimeError(build_child_process_failed_for_libname_message(libname, result))

From fe1a17437fc185958deb16c4e91386dfe6db829b Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Wed, 7 May 2025 15:04:39 -0700
Subject: [PATCH 5/9] ChatGPT suggestions

---
 cuda_bindings/tests/run_python_code_safely.py | 35 ++++++++++++-------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/cuda_bindings/tests/run_python_code_safely.py b/cuda_bindings/tests/run_python_code_safely.py
index 7b601a0073..2815517884 100644
--- a/cuda_bindings/tests/run_python_code_safely.py
+++ b/cuda_bindings/tests/run_python_code_safely.py
@@ -8,12 +8,24 @@
 from dataclasses import dataclass
 from io import StringIO
 
+PROCESS_KILLED = -9
+PROCESS_NO_RESULT = -999
+
+
+# Similar to https://docs.python.org/3/library/subprocess.html#subprocess.CompletedProcess
+# (args, check_returncode() are intentionally not supported here.)
+@dataclass
+class CompletedProcess:
+    returncode: int
+    stdout: str
+    stderr: str
+
 
 class Worker:
     def __init__(self, result_queue, func, args, kwargs):
         self.func = func
-        self.args = args or ()
-        self.kwargs = kwargs or {}
+        self.args = () if args is None else args
+        self.kwargs = {} if kwargs is None else kwargs
         self.result_queue = result_queue
 
     def __call__(self):
@@ -44,17 +56,14 @@ def __call__(self):
                 pass
 
 
-# Similar to https://docs.python.org/3/library/subprocess.html#subprocess.CompletedProcess
-# (args, check_returncode() are intentionally not supported here.)
-@dataclass
-class CompletedProcess:
-    returncode: int
-    stdout: str
-    stderr: str
+def run_in_spawned_child_process(func, *, args=None, kwargs=None, timeout=None):
+    """Run `func` in a spawned child process, capturing stdout/stderr.
 
+    The provided `func` must be defined at the top level of a module, and must
+    be importable in the spawned child process. Lambdas, closures, or interactively
+    defined functions (e.g., in Jupyter notebooks) will not work.
+    """
 
-def run_in_spawned_child_process(func, *, args=None, kwargs=None, timeout=None):
-    """Run Python code in a spawned child process, capturing stdout/stderr/output."""
     ctx = multiprocessing.get_context("spawn")
     result_queue = ctx.Queue()
     process = ctx.Process(target=Worker(result_queue, func, args, kwargs))
@@ -66,7 +75,7 @@ def run_in_spawned_child_process(func, *, args=None, kwargs=None, timeout=None):
             process.terminate()
             process.join()
             return CompletedProcess(
-                returncode=-9,
+                returncode=PROCESS_KILLED,
                 stdout="",
                 stderr=f"Process timed out after {timeout} seconds and was terminated.",
             )
@@ -75,7 +84,7 @@ def run_in_spawned_child_process(func, *, args=None, kwargs=None, timeout=None):
             returncode, stdout, stderr = result_queue.get(timeout=1.0)
         except (queue.Empty, EOFError):
             return CompletedProcess(
-                returncode=-999,
+                returncode=PROCESS_NO_RESULT,
                 stdout="",
                 stderr="Process exited or crashed before returning results.",
             )

From d620613732c681675fefc2b06de1eb505a418e49 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Wed, 7 May 2025 15:12:11 -0700
Subject: [PATCH 6/9] Add rethrow as suggested by ChatGPT

---
 cuda_bindings/tests/run_python_code_safely.py | 43 ++++++++++++-------
 1 file changed, 27 insertions(+), 16 deletions(-)

diff --git a/cuda_bindings/tests/run_python_code_safely.py b/cuda_bindings/tests/run_python_code_safely.py
index 2815517884..8e6fefc70a 100644
--- a/cuda_bindings/tests/run_python_code_safely.py
+++ b/cuda_bindings/tests/run_python_code_safely.py
@@ -56,14 +56,16 @@ def __call__(self):
                 pass
 
 
-def run_in_spawned_child_process(func, *, args=None, kwargs=None, timeout=None):
+def run_in_spawned_child_process(func, *, args=None, kwargs=None, timeout=None, rethrow=False):
     """Run `func` in a spawned child process, capturing stdout/stderr.
 
     The provided `func` must be defined at the top level of a module, and must
     be importable in the spawned child process. Lambdas, closures, or interactively
     defined functions (e.g., in Jupyter notebooks) will not work.
-    """
 
+    If `rethrow=True` and the child process exits with a nonzero code,
+    raises ChildProcessError with the captured stderr.
+    """
     ctx = multiprocessing.get_context("spawn")
     result_queue = ctx.Queue()
     process = ctx.Process(target=Worker(result_queue, func, args, kwargs))
@@ -74,26 +76,35 @@ def run_in_spawned_child_process(func, *, args=None, kwargs=None, timeout=None):
         if process.is_alive():
             process.terminate()
             process.join()
-            return CompletedProcess(
+            result = CompletedProcess(
                 returncode=PROCESS_KILLED,
                 stdout="",
                 stderr=f"Process timed out after {timeout} seconds and was terminated.",
             )
-
-        try:
-            returncode, stdout, stderr = result_queue.get(timeout=1.0)
-        except (queue.Empty, EOFError):
-            return CompletedProcess(
-                returncode=PROCESS_NO_RESULT,
-                stdout="",
-                stderr="Process exited or crashed before returning results.",
+        else:
+            try:
+                returncode, stdout, stderr = result_queue.get(timeout=1.0)
+            except (queue.Empty, EOFError):
+                result = CompletedProcess(
+                    returncode=PROCESS_NO_RESULT,
+                    stdout="",
+                    stderr="Process exited or crashed before returning results.",
+                )
+            else:
+                result = CompletedProcess(
+                    returncode=returncode,
+                    stdout=stdout,
+                    stderr=stderr,
+                )
+
+        if rethrow and result.returncode != 0:
+            raise ChildProcessError(
+                f"Child process exited with code {result.returncode}.\n"
+                f"--- stderr-from-child-process ---\n{result.stderr}"
+                "<end-of-stderr-from-child-process>\n"
             )
 
-        return CompletedProcess(
-            returncode=returncode,
-            stdout=stdout,
-            stderr=stderr,
-        )
+        return result
 
     finally:
         try:

From e22b601f74c3576a691036dd737cd456aa7d5060 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Wed, 7 May 2025 15:19:28 -0700
Subject: [PATCH 7/9] =?UTF-8?q?Better=20names:=20Worker=20=E2=86=92=20Chil?=
 =?UTF-8?q?dProcessWrapper,=20func=20=E2=86=92=20target?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 cuda_bindings/tests/run_python_code_safely.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/cuda_bindings/tests/run_python_code_safely.py b/cuda_bindings/tests/run_python_code_safely.py
index 8e6fefc70a..5e75087d77 100644
--- a/cuda_bindings/tests/run_python_code_safely.py
+++ b/cuda_bindings/tests/run_python_code_safely.py
@@ -21,9 +21,9 @@ class CompletedProcess:
     stderr: str
 
 
-class Worker:
-    def __init__(self, result_queue, func, args, kwargs):
-        self.func = func
+class ChildProcessWrapper:
+    def __init__(self, result_queue, target, args, kwargs):
+        self.target = target
         self.args = () if args is None else args
         self.kwargs = {} if kwargs is None else kwargs
         self.result_queue = result_queue
@@ -36,7 +36,7 @@ def __call__(self):
         sys.stderr = StringIO()
 
         try:
-            self.func(*self.args, **self.kwargs)
+            self.target(*self.args, **self.kwargs)
             returncode = 0
         except SystemExit as e:  # Handle sys.exit()
             returncode = e.code if isinstance(e.code, int) else 0
@@ -56,10 +56,10 @@ def __call__(self):
                 pass
 
 
-def run_in_spawned_child_process(func, *, args=None, kwargs=None, timeout=None, rethrow=False):
-    """Run `func` in a spawned child process, capturing stdout/stderr.
+def run_in_spawned_child_process(target, *, args=None, kwargs=None, timeout=None, rethrow=False):
+    """Run `target` in a spawned child process, capturing stdout/stderr.
 
-    The provided `func` must be defined at the top level of a module, and must
+    The provided `target` must be defined at the top level of a module, and must
     be importable in the spawned child process. Lambdas, closures, or interactively
     defined functions (e.g., in Jupyter notebooks) will not work.
 
@@ -68,7 +68,7 @@ def run_in_spawned_child_process(func, *, args=None, kwargs=None, timeout=None,
     """
     ctx = multiprocessing.get_context("spawn")
     result_queue = ctx.Queue()
-    process = ctx.Process(target=Worker(result_queue, func, args, kwargs))
+    process = ctx.Process(target=ChildProcessWrapper(result_queue, target, args, kwargs))
     process.start()
 
     try:

From 4719b22373d8c49fe594b899395411892ff7a5ff Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Wed, 7 May 2025 15:25:42 -0700
Subject: [PATCH 8/9] ChatGPT suggestions

---
 ...hon_code_safely.py => spawned_process_runner.py} | 13 +++++++++++--
 cuda_bindings/tests/test_path_finder_load.py        |  4 ++--
 2 files changed, 13 insertions(+), 4 deletions(-)
 rename cuda_bindings/tests/{run_python_code_safely.py => spawned_process_runner.py} (90%)

diff --git a/cuda_bindings/tests/run_python_code_safely.py b/cuda_bindings/tests/spawned_process_runner.py
similarity index 90%
rename from cuda_bindings/tests/run_python_code_safely.py
rename to cuda_bindings/tests/spawned_process_runner.py
index 5e75087d77..3a13362fe2 100644
--- a/cuda_bindings/tests/run_python_code_safely.py
+++ b/cuda_bindings/tests/spawned_process_runner.py
@@ -7,6 +7,7 @@
 import traceback
 from dataclasses import dataclass
 from io import StringIO
+from typing import Any, Callable, Optional, Sequence
 
 PROCESS_KILLED = -9
 PROCESS_NO_RESULT = -999
@@ -56,7 +57,14 @@ def __call__(self):
                 pass
 
 
-def run_in_spawned_child_process(target, *, args=None, kwargs=None, timeout=None, rethrow=False):
+def run_in_spawned_child_process(
+    target: Callable[..., None],
+    *,
+    args: Optional[Sequence[Any]] = None,
+    kwargs: Optional[dict[str, Any]] = None,
+    timeout: Optional[float] = None,
+    rethrow: bool = False,
+) -> CompletedProcess:
     """Run `target` in a spawned child process, capturing stdout/stderr.
 
     The provided `target` must be defined at the top level of a module, and must
@@ -100,7 +108,8 @@ def run_in_spawned_child_process(target, *, args=None, kwargs=None, timeout=None
         if rethrow and result.returncode != 0:
             raise ChildProcessError(
                 f"Child process exited with code {result.returncode}.\n"
-                f"--- stderr-from-child-process ---\n{result.stderr}"
+                "--- stderr-from-child-process ---\n"
+                f"{result.stderr}"
                 "<end-of-stderr-from-child-process>\n"
             )
 
diff --git a/cuda_bindings/tests/test_path_finder_load.py b/cuda_bindings/tests/test_path_finder_load.py
index a5d0a1e1d1..66defe2bac 100644
--- a/cuda_bindings/tests/test_path_finder_load.py
+++ b/cuda_bindings/tests/test_path_finder_load.py
@@ -5,7 +5,7 @@
 import sys
 
 import pytest
-import run_python_code_safely
+import spawned_process_runner
 
 from cuda.bindings import path_finder
 from cuda.bindings._path_finder import supported_libs
@@ -75,7 +75,7 @@ def test_find_or_load_nvidia_dynamic_library(info_summary_append, libname):
     # to ensure isolation of global dynamic linking state (e.g., dlopen handles).
     # Without child processes, loading/unloading libraries during testing could
     # interfere across test cases and lead to nondeterministic or platform-specific failures.
-    result = run_python_code_safely.run_in_spawned_child_process(child_process_func, args=(libname,), timeout=30)
+    result = spawned_process_runner.run_in_spawned_child_process(child_process_func, args=(libname,), timeout=30)
     if result.returncode == 0:
         info_summary_append(f"abs_path={result.stdout.rstrip()}")
     else:

From 3ecf55d2a758d1db6589053db5b72b959bceef70 Mon Sep 17 00:00:00 2001
From: "Ralf W. Grosse-Kunstleve" <rgrossekunst@nvidia.com>
Date: Wed, 7 May 2025 15:33:57 -0700
Subject: [PATCH 9/9] Add minimal test_spawned_process_runner.py as generated
 by ChatGPT

---
 .../tests/test_spawned_process_runner.py      | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 cuda_bindings/tests/test_spawned_process_runner.py

diff --git a/cuda_bindings/tests/test_spawned_process_runner.py b/cuda_bindings/tests/test_spawned_process_runner.py
new file mode 100644
index 0000000000..644ed8a839
--- /dev/null
+++ b/cuda_bindings/tests/test_spawned_process_runner.py
@@ -0,0 +1,21 @@
+# Copyright 2025 NVIDIA Corporation.  All rights reserved.
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+# Note: This only covers what is not covered already in test_path_finder_load.py
+
+import pytest
+from spawned_process_runner import run_in_spawned_child_process
+
+
+def child_crashes():
+    raise RuntimeError("this is an intentional failure")
+
+
+def test_rethrow_child_exception():
+    with pytest.raises(ChildProcessError) as excinfo:
+        run_in_spawned_child_process(child_crashes, rethrow=True)
+
+    msg = str(excinfo.value)
+    assert "Child process exited with code 1" in msg
+    assert "this is an intentional failure" in msg
+    assert "--- stderr-from-child-process ---" in msg