From 0ac74984bb5c65ef1ecf89040ec69d6ec1647ee9 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Thu, 16 Apr 2026 10:23:09 -0400 Subject: [PATCH 1/2] test(cufile): prime libcufile before parameter-set tests to avoid SIGFPE Under pytest-randomly, the cuFile test module fatally crashes with SIGFPE in CUFileDrv::ReadVersionInfo (unguarded div %rcx with rcx=0) inside libcufile.so cuFileDriverOpen+0xe. The crash is deterministic given specific test orderings and was reproducible with seed 2758108007. Root cause is a libcufile 1.17.1 bug. Calling cuFileSetParameterSizeT (or other pre-open configuration APIs) BEFORE the first cuFileDriverOpen leaves an internal version list uninitialized; the next driver_open then divides by its zero length. Minimal repro: pytest tests/test_cufile.py::test_set_get_parameter_size_t \ tests/test_cufile.py::test_buf_register_invalid_flags Fix: add a module-scope autouse _cufile_driver_prewarm fixture that performs one driver_open/driver_close before any test in the module runs. That single cycle initializes libcufile's version list; both test regimes (driver-open tests via the function-scope `driver` fixture, and driver-closed parameter-set tests) then work under any ordering. Also swap test_set_parameter_posix_pool_slab_array's inline driver_open/close for the `driver` fixture. pytest fixture ordering guarantees driver_config (which calls set_parameter_posix_pool_slab_array while closed) runs before `driver` opens, matching the previous manual ordering. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- cuda_bindings/tests/test_cufile.py | 56 +++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 8 deletions(-) diff --git a/cuda_bindings/tests/test_cufile.py b/cuda_bindings/tests/test_cufile.py index a4400f637a3..0a68ab5433e 100644 --- a/cuda_bindings/tests/test_cufile.py +++ b/cuda_bindings/tests/test_cufile.py @@ -140,6 +140,52 @@ def ctx(): cuda.cuDevicePrimaryCtxRelease(device) +@pytest.fixture(scope="module", autouse=True) +def _cufile_driver_prewarm(): + """Prime libcufile with one driver_open/close cycle before any test runs. + + The cuFile test module mixes two incompatible regimes: + + - Driver-open tests (buf_register_*, cufile_read_write, batch_io, stats, + etc.) need cuFileDriverOpen; they use the function-scope `driver` + fixture to open/close per test. + - Driver-closed tests (test_set_get_parameter_*, test_set_parameter_posix_*) + must run with the driver CLOSED — libcufile rejects parameter-set calls + when the driver is open (DRIVER_ALREADY_OPEN, 5026). + + Workaround for NVIDIA libcufile 1.17.1 bug: calling cuFileSetParameterSizeT + (or similar pre-open configuration APIs) BEFORE the first cuFileDriverOpen + leaves an internal version list uninitialized such that a later + cuFileDriverOpen SIGFPEs in CUFileDrv::ReadVersionInfo (div-by-zero). + Under random ordering, a driver-closed test can run before any + driver-open test, poisoning libcufile and tearing down pytest with a fatal + signal on the next driver_open. + + One open/close cycle up front primes libcufile's version list. After that, + both regimes work: the per-test `driver` fixture can open/close freely, + and parameter-set tests run against the (now properly initialized) closed + driver. + + Note: per-test driver_open/close is not ideal on throughput grounds, but + it is forced by the libcufile API — parameter-set tests cannot coexist + with a session-wide open driver. 
+ """ + (err,) = cuda.cuInit(0) + assert err == cuda.CUresult.CUDA_SUCCESS + err, device = cuda.cuDeviceGet(0) + assert err == cuda.CUresult.CUDA_SUCCESS + err, dctx = cuda.cuDevicePrimaryCtxRetain(device) + assert err == cuda.CUresult.CUDA_SUCCESS + (err,) = cuda.cuCtxSetCurrent(dctx) + assert err == cuda.CUresult.CUDA_SUCCESS + try: + cufile.driver_open() + cufile.driver_close() + finally: + cuda.cuDevicePrimaryCtxRelease(device) + yield + + @pytest.fixture def driver(ctx): cufile.driver_open() @@ -1896,8 +1942,7 @@ def driver_config(slab_sizes, slab_counts): @pytest.mark.skipif( cufileVersionLessThan(1150), reason="cuFile parameter APIs require cuFile library version 13.0 or later" ) -@pytest.mark.usefixtures("ctx") -def test_set_parameter_posix_pool_slab_array(slab_sizes, slab_counts, driver_config): +def test_set_parameter_posix_pool_slab_array(slab_sizes, slab_counts, driver_config, driver): """Test cuFile POSIX pool slab array configuration.""" # After setting parameters, retrieve them back to verify n_slab_sizes = len(slab_sizes) @@ -1907,12 +1952,7 @@ def test_set_parameter_posix_pool_slab_array(slab_sizes, slab_counts, driver_con retrieved_sizes_addr = ctypes.addressof(retrieved_sizes) retrieved_counts_addr = ctypes.addressof(retrieved_counts) - # Open cuFile driver AFTER setting parameters - cufile.driver_open() - try: - cufile.get_parameter_posix_pool_slab_array(retrieved_sizes_addr, retrieved_counts_addr, n_slab_sizes) - finally: - cufile.driver_close() + cufile.get_parameter_posix_pool_slab_array(retrieved_sizes_addr, retrieved_counts_addr, n_slab_sizes) # Verify they match what we set assert list(retrieved_sizes) == slab_sizes From f3f35b7e7d980bc4407ca8938ddcd0b13b000c16 Mon Sep 17 00:00:00 2001 From: Phillip Cloud <417981+cpcloud@users.noreply.github.com> Date: Thu, 16 Apr 2026 12:46:03 -0400 Subject: [PATCH 2/2] test(cufile): drop useless yield from prewarm fixture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit The _cufile_driver_prewarm fixture has no teardown — the open/close cycle is setup-only. Keeping a trailing `yield` made ruff's PT022 (pytest-useless-yield-fixture) flag it. Drop the yield so the fixture runs as pure setup. Co-Authored-By: Claude Opus 4.6 (1M context) --- cuda_bindings/tests/test_cufile.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cuda_bindings/tests/test_cufile.py b/cuda_bindings/tests/test_cufile.py index 0a68ab5433e..f6dbc2b50cb 100644 --- a/cuda_bindings/tests/test_cufile.py +++ b/cuda_bindings/tests/test_cufile.py @@ -183,7 +183,6 @@ def _cufile_driver_prewarm(): cufile.driver_close() finally: cuda.cuDevicePrimaryCtxRelease(device) - yield @pytest.fixture