From 506e5e6ad899eb9f29d985cab077d467f5ca9b46 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Tue, 12 May 2026 05:51:24 -0700 Subject: [PATCH 1/2] Replace per-tile alloc + concat in _try_nvcomp_from_device_bufs (#1659) The GDS+nvCOMP fast path allocated N separate cupy.empty(tile_bytes) output buffers and ran cupy.concatenate(d_decomp_bufs) after the decompress kernel returned. That kept two copies of the decompressed data alive at once and added a serial concat the other nvCOMP paths in this module already avoid (LZW at L1847, deflate at L1878, host-buffer at L1114 all use a single cupy.empty + pointer-offset pattern). The function now allocates one contiguous cupy.empty(n*tile_bytes) up front and derives the per-tile device pointers as base + i*tile_bytes, matching the surrounding code. The return contract is unchanged: a single flat cupy.uint8 buffer of length n*tile_bytes that _apply_predictor_and_assemble consumes directly. Adds _check_gpu_memory on the new allocation so a borderline-OK GDS read surfaces a clear MemoryError before the cupy.empty call instead of an opaque CUDA OOM deeper in the function. Tests in test_nvcomp_from_device_bufs_single_alloc_1659.py cover the codec early-return, the missing-library early-return, the memory-guard contract, the structural single-buffer return shape, and a real end-to-end ZSTD round-trip through nvCOMP when the library is available. --- .claude/sweep-performance-state.csv | 2 +- xrspatial/geotiff/_gpu_decode.py | 30 ++- ...comp_from_device_bufs_single_alloc_1659.py | 236 ++++++++++++++++++ 3 files changed, 263 insertions(+), 5 deletions(-) create mode 100644 xrspatial/geotiff/tests/test_nvcomp_from_device_bufs_single_alloc_1659.py diff --git a/.claude/sweep-performance-state.csv b/.claude/sweep-performance-state.csv index 883ab2125..115f09a3b 100644 --- a/.claude/sweep-performance-state.csv +++ b/.claude/sweep-performance-state.csv @@ -18,7 +18,7 @@ fire,2026-03-31T18:00:00Z,SAFE,compute-bound,0,, flood,2026-03-31T18:00:00Z,SAFE,compute-bound,0,, focal,2026-03-31T18:00:00Z,SAFE,compute-bound,0,, geodesic,2026-03-31T18:00:00Z,N/A,compute-bound,0,, -geotiff,2026-05-10,SAFE,IO-bound,0,,"Pass 4 (2026-05-10): re-audit after #1559 (centralise attrs across all read backends). New _populate_attrs_from_geo_info helper at __init__.py:301 runs once per read, not per-chunk -- no perf impact. Probe: 2560x2560 deflate-tiled file opened via read_geotiff_dask yields 400 tasks (4 tasks/chunk for 100 chunks), well under 1M cap. read_geotiff_gpu(1024x1024) returns cupy.ndarray end-to-end with no host round-trip (226ms incl. write+decode). No new HIGH/MEDIUM findings. SAFE/IO-bound holds. | Pass 3 (2026-05-10): SAFE/IO-bound. Audited 4 perf commits: #1558 (in-place NaN writes on uniquely-owned buffers correct), #1556 (fp-predictor ngjit ~297us/tile for 256x256 float32), #1552 (single cupy.concatenate + one .get() for batched D2H at _gpu_decode.py:870-913), #1551 (parallel decode threshold >=65536px engages 256x256 default at _reader.py:1121). Bench: 8192x8192 f32 deflate+pred2 256-tile write 782ms; 4096x4096 f32 deflate read 83ms with parallel decode. Deferred LOW (none filed, all <10% MEDIUM threshold): _writer.py:459/1109 redundant .copy() before predictor encode (~1% per tile), _compression.py:280 lzw_decompress dst[:n].copy() (~2% per LZW tile decode), _writer.py:1419 seg_np.copy() before in-place NaN substitution (negligible, conditional path), _CloudSource.read_range opens fresh fsspec handle per range (pre-existing, predates audit scope). nvCOMP per-tile D2H batching break-even confirmed (variable sizes need staging buffer, no win). | Pass 3 (2026-05-10): audited f157746,39322c3,f23ec8f,1aac3b7. All 5 commits correct. Redundant .copy() in _writer.py:459,1109 and _compression.py:280 (1-2% overhead, LOW). _CloudSource.read_range() per-call open is pre-existing arch issue. No HIGH/MEDIUM regressions. SAFE. | re-audit 2026-05-02: 6 commits since 2026-04-16 (predictor=3 CPU encode/decode, GPU predictor stride fix, validate_tile_layout, BigTIFF LONG8 offsets, AREA_OR_POINT VRT, per-tile alloc guard). 1M dask chunk cap intact at __init__.py:948; adler32 batch transfer intact at _gpu_decode.py:1825. New code is metadata validation and dispatcher logic with no extra materialization or per-tile sync points. No HIGH/MEDIUM regressions." +geotiff,2026-05-12,SAFE,IO-bound,0,1659,"Pass 4 (2026-05-10): re-audit after #1559 (centralise attrs across all read backends). New _populate_attrs_from_geo_info helper at __init__.py:301 runs once per read, not per-chunk -- no perf impact. Probe: 2560x2560 deflate-tiled file opened via read_geotiff_dask yields 400 tasks (4 tasks/chunk for 100 chunks), well under 1M cap. read_geotiff_gpu(1024x1024) returns cupy.ndarray end-to-end with no host round-trip (226ms incl. write+decode). No new HIGH/MEDIUM findings. SAFE/IO-bound holds. | Pass 3 (2026-05-10): SAFE/IO-bound. Audited 4 perf commits: #1558 (in-place NaN writes on uniquely-owned buffers correct), #1556 (fp-predictor ngjit ~297us/tile for 256x256 float32), #1552 (single cupy.concatenate + one .get() for batched D2H at _gpu_decode.py:870-913), #1551 (parallel decode threshold >=65536px engages 256x256 default at _reader.py:1121). Bench: 8192x8192 f32 deflate+pred2 256-tile write 782ms; 4096x4096 f32 deflate read 83ms with parallel decode. Deferred LOW (none filed, all <10% MEDIUM threshold): _writer.py:459/1109 redundant .copy() before predictor encode (~1% per tile), _compression.py:280 lzw_decompress dst[:n].copy() (~2% per LZW tile decode), _writer.py:1419 seg_np.copy() before in-place NaN substitution (negligible, conditional path), _CloudSource.read_range opens fresh fsspec handle per range (pre-existing, predates audit scope). nvCOMP per-tile D2H batching break-even confirmed (variable sizes need staging buffer, no win). | Pass 3 (2026-05-10): audited f157746,39322c3,f23ec8f,1aac3b7. All 5 commits correct. Redundant .copy() in _writer.py:459,1109 and _compression.py:280 (1-2% overhead, LOW). _CloudSource.read_range() per-call open is pre-existing arch issue. No HIGH/MEDIUM regressions. SAFE. | re-audit 2026-05-02: 6 commits since 2026-04-16 (predictor=3 CPU encode/decode, GPU predictor stride fix, validate_tile_layout, BigTIFF LONG8 offsets, AREA_OR_POINT VRT, per-tile alloc guard). 1M dask chunk cap intact at __init__.py:948; adler32 batch transfer intact at _gpu_decode.py:1825. New code is metadata validation and dispatcher logic with no extra materialization or per-tile sync points. No HIGH/MEDIUM regressions. | Pass 5 (2026-05-12): re-audit identified MEDIUM in _gpu_decode.py:1577 _try_nvcomp_from_device_bufs: per-tile cupy.empty + trailing cupy.concatenate doubled peak VRAM and added serial concat. Filed #1659 and fixed to single-buffer + pointer offsets (matches LZW/deflate/host-buffer patterns at L1847/L1878/L1114). Microbench (alloc+concat overhead only, not full nvCOMP latency): n=256 tile_bytes=65536 drops 3.66ms->0.69ms, n=256 tile_bytes=262144 drops 8.18ms->0.13ms. Tests: 5 new tests in test_nvcomp_from_device_bufs_single_alloc_1659.py (codec short-circuit, no-lib short-circuit, memory-guard contract, real ZSTD round-trip via nvCOMP, structural single-buffer check). 1458 existing geotiff tests pass, 3 unrelated matplotlib/py3.14 failures pre-existing. SAFE/IO-bound verdict holds." glcm,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,"Downgraded to MEDIUM. da.stack without rechunk is scheduling overhead, not OOM risk." hillshade,2026-04-16T12:00:00Z,SAFE,compute-bound,0,,"Re-audit after Horn's method rewrite (PR 1175): clean stencil, map_overlap depth=(1,1), no materialization. Zero findings." hydro,2026-05-01,RISKY,memory-bound,0,1416,"Fixed-in-tree 2026-05-01: hand_mfd._hand_mfd_dask now assembles via da.map_blocks instead of eager da.block of pre-computed tiles (matches hand_dinf pattern). Remaining MEDIUM: sink_d8 CCL fully materializes labels (inherently global), flow_accumulation_mfd frac_bdry held in driver dict instead of memmap-backed BoundaryStore. D8 iterative paths (flow_accum/fill/watershed/basin/stream_*) use serial-tile sweep with memmap-backed boundary store -- per-tile RAM bounded but driver iterates O(diameter) times. flow_direction_*, flow_path/snap_pour_point/twi/hand_d8/hand_dinf are SAFE." diff --git a/xrspatial/geotiff/_gpu_decode.py b/xrspatial/geotiff/_gpu_decode.py index c0259ff10..d6bc1d645 100644 --- a/xrspatial/geotiff/_gpu_decode.py +++ b/xrspatial/geotiff/_gpu_decode.py @@ -1561,7 +1561,19 @@ def gpu_decode_tiles_from_file( def _try_nvcomp_from_device_bufs(d_tiles, tile_bytes, compression): - """Run nvCOMP batch decompress on tiles already in GPU memory.""" + """Run nvCOMP batch decompress on tiles already in GPU memory. + + The decompressed output uses a single contiguous device buffer with + per-tile pointers derived as ``base_ptr + i * tile_bytes``. The previous + implementation allocated N separate ``cupy.empty(tile_bytes)`` buffers + and ran ``cupy.concatenate`` after the decompress kernel; that pattern + kept two copies of the decompressed data alive at once (the per-tile + buffers and the concatenated result) and ran a serial concat that the + rest of the GPU paths avoid. The other nvCOMP code paths in this module + (LZW at ~L1847, deflate at ~L1878, host-buffer at ~L1114) already use + the single-buffer pattern; this brings the GDS path in line with them. + See issue #1659. + """ import ctypes import cupy @@ -1574,10 +1586,20 @@ class _NvcompDecompOpts(ctypes.Structure): try: n = len(d_tiles) - d_decomp_bufs = [cupy.empty(tile_bytes, dtype=cupy.uint8) for _ in range(n)] + # Single contiguous output buffer. nvCOMP's batched decompress takes + # an array of per-tile device pointers; derive those from the base + # pointer + per-tile byte offsets so we never allocate N small + # buffers (one cupy.empty per tile is ~tens of microseconds each) + # and never run a trailing cupy.concatenate. + _check_gpu_memory(n * tile_bytes, + what="GDS+nvCOMP decompressed buffer") + d_decomp = cupy.empty(n * tile_bytes, dtype=cupy.uint8) + base_decomp_ptr = int(d_decomp.data.ptr) + decomp_offsets = (np.arange(n, dtype=np.uint64) + * np.uint64(tile_bytes)) + d_decomp_ptrs = cupy.asarray(base_decomp_ptr + decomp_offsets) d_comp_ptrs = cupy.array([t.data.ptr for t in d_tiles], dtype=cupy.uint64) - d_decomp_ptrs = cupy.array([b.data.ptr for b in d_decomp_bufs], dtype=cupy.uint64) d_comp_sizes = cupy.array([t.size for t in d_tiles], dtype=cupy.uint64) d_buf_sizes = cupy.full(n, tile_bytes, dtype=cupy.uint64) d_actual = cupy.empty(n, dtype=cupy.uint64) @@ -1621,7 +1643,7 @@ class _NvcompDecompOpts(ctypes.Structure): if int(cupy.any(d_statuses != 0)): return None - return cupy.concatenate(d_decomp_bufs) + return d_decomp except Exception: return None diff --git a/xrspatial/geotiff/tests/test_nvcomp_from_device_bufs_single_alloc_1659.py b/xrspatial/geotiff/tests/test_nvcomp_from_device_bufs_single_alloc_1659.py new file mode 100644 index 000000000..44916cbcb --- /dev/null +++ b/xrspatial/geotiff/tests/test_nvcomp_from_device_bufs_single_alloc_1659.py @@ -0,0 +1,236 @@ +"""Regression tests for the single-buffer pattern in _try_nvcomp_from_device_bufs. + +Issue #1659: ``_try_nvcomp_from_device_bufs`` used to allocate N separate +``cupy.empty(tile_bytes)`` output buffers and run ``cupy.concatenate`` after +the nvCOMP decompress kernel returned. That kept two copies of the +decompressed data alive at once and ran a serial concat the other nvCOMP +paths in this module already avoid. The fix matches the single-contiguous- +buffer + pointer-offset pattern used by the deflate / LZW / host-buffer +paths nearby. + +These tests skip when CuPy + CUDA are not available. They also skip the +end-to-end nvCOMP integration check when ``kvikio`` or the nvCOMP shared +library are not installed, which is the common case on developer hosts; +the unit-level checks (contract + memory guard) run regardless. +""" +from __future__ import annotations + +import importlib.util +import tempfile + +import numpy as np +import pytest + +from xrspatial.geotiff._gpu_decode import _try_nvcomp_from_device_bufs + + +def _gpu_available() -> bool: + if importlib.util.find_spec("cupy") is None: + return False + try: + import cupy + return bool(cupy.cuda.is_available()) + except Exception: + return False + + +def _nvcomp_available() -> bool: + from xrspatial.geotiff._gpu_decode import _get_nvcomp + return _get_nvcomp() is not None + + +@pytest.mark.skipif(not _gpu_available(), reason="cupy + CUDA required") +def test_unsupported_codec_short_circuits_before_allocation(): + """Non-ZSTD codecs must return None without allocating output buffers. + + Pins the early-return contract that lets the caller pick a different + decoder when nvCOMP cannot handle this codec. + """ + import cupy + + # Use Deflate (8), which is unsupported by this function (ZSTD only). + d_tiles = [cupy.zeros(1024, dtype=cupy.uint8) for _ in range(4)] + assert _try_nvcomp_from_device_bufs(d_tiles, 1024, 8) is None + + +@pytest.mark.skipif(not _gpu_available(), reason="cupy + CUDA required") +def test_no_nvcomp_lib_returns_none(monkeypatch): + """When the nvCOMP library is missing, the function must return None. + + The caller relies on this signal to fall back to the bytes-based decode + path. Without it, callers would hit a ctypes ``getattr`` AttributeError + deeper in the function. + """ + import cupy + from xrspatial.geotiff import _gpu_decode + + monkeypatch.setattr(_gpu_decode, "_get_nvcomp", lambda: None) + + d_tiles = [cupy.zeros(1024, dtype=cupy.uint8)] + assert _try_nvcomp_from_device_bufs(d_tiles, 1024, 50000) is None + + +@pytest.mark.skipif(not _gpu_available(), reason="cupy + CUDA required") +def test_memory_guard_runs_with_full_decomp_size(monkeypatch): + """The single-buffer allocation must be size-checked before cupy.empty. + + The new pattern allocates one contiguous ``n * tile_bytes`` buffer + instead of N small buffers. The OOM guard is what tells the caller + early that the decode will not fit on the device; a regression that + removed the guard would surface as an opaque CUDA OOM instead. + """ + import cupy + from xrspatial.geotiff import _gpu_decode + + seen = {"total_bytes": None, "what": None, "called": False} + + def fake_check(required_bytes, what="tile buffer"): + seen["total_bytes"] = int(required_bytes) + seen["what"] = what + seen["called"] = True + raise MemoryError("simulated OOM") + + # Pin _get_nvcomp to something truthy so the function does not bail + # before reaching the allocation step. The fake check raises before + # any nvCOMP call would happen, so the lib value never gets used. + monkeypatch.setattr(_gpu_decode, "_get_nvcomp", lambda: object()) + monkeypatch.setattr(_gpu_decode, "_check_gpu_memory", fake_check) + + n_tiles = 8 + tile_bytes = 65536 + d_tiles = [cupy.zeros(128, dtype=cupy.uint8) for _ in range(n_tiles)] + + # The function catches ``Exception`` and returns None. ``MemoryError`` + # is an ``Exception`` subclass, so the call returns None rather than + # re-raising; the assertion is on the side-effect counter. + result = _try_nvcomp_from_device_bufs(d_tiles, tile_bytes, 50000) + assert result is None + + assert seen["called"], "_check_gpu_memory was not called" + expected_bytes = n_tiles * tile_bytes + assert seen["total_bytes"] == expected_bytes, ( + f"expected total {expected_bytes}, got {seen['total_bytes']}" + ) + assert "decompressed" in seen["what"] or "nvCOMP" in seen["what"], ( + f"unhelpful 'what' label: {seen['what']!r}" + ) + + +@pytest.mark.skipif( + not _gpu_available() or not _nvcomp_available(), + reason="cupy + CUDA + nvCOMP shared lib required", +) +def test_zstd_decompress_roundtrip_returns_single_contiguous_buffer(): + """End-to-end: feed real ZSTD-compressed device buffers in, check the + output is a single flat ``cupy.uint8`` array of length n*tile_bytes. + + This test confirms the return contract that ``_apply_predictor_and_assemble`` + depends on: ``out`` is the contiguous concatenation of the N decompressed + tiles, not a list. The previous implementation returned the same shape but + via ``cupy.concatenate``; the new one allocates the contig buffer up front + and writes through per-tile pointers, so a regression that dropped the + return value would surface here. + """ + import cupy + import zstandard as zstd + + rng = np.random.default_rng(seed=1659) + tile_bytes = 4096 + n_tiles = 8 + + cctx = zstd.ZstdCompressor() + host_tiles = [rng.integers(0, 256, size=tile_bytes, dtype=np.uint8) + for _ in range(n_tiles)] + compressed = [cctx.compress(t.tobytes()) for t in host_tiles] + d_tiles = [cupy.asarray(np.frombuffer(c, dtype=np.uint8)) + for c in compressed] + + result = _try_nvcomp_from_device_bufs(d_tiles, tile_bytes, 50000) + + # nvCOMP may be present but mis-configured on the host (e.g. driver + # version mismatch); skip rather than fail in that case so the test is + # informative when run on a real GDS rig. + if result is None: + pytest.skip("nvCOMP returned None; library may be unusable on this host") + + assert isinstance(result, cupy.ndarray) + assert result.dtype == cupy.uint8 + assert result.shape == (n_tiles * tile_bytes,) + assert result.flags.c_contiguous + + # Decoded payload must match the original host tiles. The buffer is a + # single flat array; tile i lives at offset i*tile_bytes. + host_out = result.get() + for i, expected in enumerate(host_tiles): + decoded = host_out[i * tile_bytes:(i + 1) * tile_bytes] + assert np.array_equal(decoded, expected), ( + f"tile {i} decoded payload differs from input" + ) + + +@pytest.mark.skipif(not _gpu_available(), reason="cupy + CUDA required") +def test_no_orphan_decomp_buffers_after_call(monkeypatch): + """Earlier code held a Python list of N device buffers in scope + alongside the concatenated result. The replacement allocates once + and returns that one buffer. + + The check here is structural rather than numerical: after a successful + call the only cupy ndarray the caller receives is ``result`` itself, + and inspecting it confirms ``result.size == n_tiles * tile_bytes``. + """ + import cupy + from xrspatial.geotiff import _gpu_decode + + # Stub the nvCOMP entry points so the decompress "succeeds" without an + # actual library. Force the function down the success branch, capture + # the returned buffer, then verify shape + ownership. + monkeypatch.setattr(_gpu_decode, "_get_nvcomp", + lambda: _FakeNvcompLib()) + + n_tiles = 4 + tile_bytes = 2048 + d_tiles = [cupy.zeros(64, dtype=cupy.uint8) for _ in range(n_tiles)] + result = _try_nvcomp_from_device_bufs(d_tiles, tile_bytes, 50000) + + # The fake lib reports success and zero-fills the output buffer; the + # function returns the contiguous buffer as-is. + assert result is not None + assert isinstance(result, cupy.ndarray) + assert result.size == n_tiles * tile_bytes + assert result.flags.c_contiguous + # The contract requires uint8 -- not a uint8 view of something else. + assert result.dtype == cupy.uint8 + + +class _FakeNvcompLib: + """Stand-in for the nvCOMP CDLL handle used in tests. + + The real function calls ``getattr(lib, fn_name)`` for two entry points + and invokes each as a ctypes function. We expose those entry-point + names as Python callables that pretend the work succeeded. + """ + + def __getattr__(self, name): + if name == 'nvcompBatchedZstdDecompressGetTempSizeAsync': + return _fake_temp_size_fn + if name == 'nvcompBatchedZstdDecompressAsync': + return _fake_decompress_fn + raise AttributeError(name) + + +def _fake_temp_size_fn(n, tile_bytes, opts, p_temp_size, total): + """Stub for nvcompBatchedZstdDecompressGetTempSizeAsync.""" + import ctypes + # Write a tiny temp-size value into the caller's c_size_t. + p_temp_size._obj.value = 1 + return 0 + + +def _fake_decompress_fn(*args): + """Stub for nvcompBatchedZstdDecompressAsync. + + The function's return-value test is ``s != 0``. We return 0 (success). + The caller's d_statuses array is already zero from ``cupy.zeros``, so + the post-decode any-nonzero check passes. + """ + return 0 From d43c19b848b62b7dc889156337f5fd3444198319 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Tue, 12 May 2026 06:21:42 -0700 Subject: [PATCH 2/2] Address review comments on PR #1665 - Hoist codec name resolution before VRAM allocation and memory guard so an unsupported codec short-circuits without burning an allocation. - Re-raise MemoryError from the memory guard instead of swallowing it under the blanket Exception catch; the docstring already promises a clear MemoryError before the cupy.empty call. - Update test_memory_guard_runs_with_full_decomp_size to assert the MemoryError is surfaced via pytest.raises. - Drop unused tempfile + ctypes imports from the test module. --- xrspatial/geotiff/_gpu_decode.py | 15 ++++++++++----- ...t_nvcomp_from_device_bufs_single_alloc_1659.py | 9 ++------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/xrspatial/geotiff/_gpu_decode.py b/xrspatial/geotiff/_gpu_decode.py index d6bc1d645..6628d4cf8 100644 --- a/xrspatial/geotiff/_gpu_decode.py +++ b/xrspatial/geotiff/_gpu_decode.py @@ -1581,6 +1581,14 @@ def _try_nvcomp_from_device_bufs(d_tiles, tile_bytes, compression): if lib is None: return None + # Resolve the codec entry-point names before touching VRAM. An + # unsupported codec must return None without burning an allocation or + # running the memory guard. + fn_name = {50000: 'nvcompBatchedZstdDecompressGetTempSizeAsync'}.get(compression) + dec_name = {50000: 'nvcompBatchedZstdDecompressAsync'}.get(compression) + if fn_name is None: + return None + class _NvcompDecompOpts(ctypes.Structure): _fields_ = [('backend', ctypes.c_int), ('reserved', ctypes.c_char * 60)] @@ -1606,11 +1614,6 @@ class _NvcompDecompOpts(ctypes.Structure): opts = _NvcompDecompOpts(backend=0, reserved=b'\x00' * 60) - fn_name = {50000: 'nvcompBatchedZstdDecompressGetTempSizeAsync'}.get(compression) - dec_name = {50000: 'nvcompBatchedZstdDecompressAsync'}.get(compression) - if fn_name is None: - return None - temp_fn = getattr(lib, fn_name) temp_fn.restype = ctypes.c_int temp_size = ctypes.c_size_t(0) @@ -1644,6 +1647,8 @@ class _NvcompDecompOpts(ctypes.Structure): return None return d_decomp + except MemoryError: + raise except Exception: return None diff --git a/xrspatial/geotiff/tests/test_nvcomp_from_device_bufs_single_alloc_1659.py b/xrspatial/geotiff/tests/test_nvcomp_from_device_bufs_single_alloc_1659.py index 44916cbcb..d57f0e60a 100644 --- a/xrspatial/geotiff/tests/test_nvcomp_from_device_bufs_single_alloc_1659.py +++ b/xrspatial/geotiff/tests/test_nvcomp_from_device_bufs_single_alloc_1659.py @@ -16,7 +16,6 @@ from __future__ import annotations import importlib.util -import tempfile import numpy as np import pytest @@ -100,11 +99,8 @@ def fake_check(required_bytes, what="tile buffer"): tile_bytes = 65536 d_tiles = [cupy.zeros(128, dtype=cupy.uint8) for _ in range(n_tiles)] - # The function catches ``Exception`` and returns None. ``MemoryError`` - # is an ``Exception`` subclass, so the call returns None rather than - # re-raising; the assertion is on the side-effect counter. - result = _try_nvcomp_from_device_bufs(d_tiles, tile_bytes, 50000) - assert result is None + with pytest.raises(MemoryError): + _try_nvcomp_from_device_bufs(d_tiles, tile_bytes, 50000) assert seen["called"], "_check_gpu_memory was not called" expected_bytes = n_tiles * tile_bytes @@ -220,7 +216,6 @@ def __getattr__(self, name): def _fake_temp_size_fn(n, tile_bytes, opts, p_temp_size, total): """Stub for nvcompBatchedZstdDecompressGetTempSizeAsync.""" - import ctypes # Write a tiny temp-size value into the caller's c_size_t. p_temp_size._obj.value = 1 return 0