From 665c2829c94bf76b53c24b53246c8eba57969802 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Fri, 15 May 2026 08:30:28 -0700 Subject: [PATCH 1/2] geotiff: nvcomp decompress prefix-sum offsets via np.cumsum (#1950) `_try_nvcomp_batch_decompress` computed its per-tile host offsets via a Python `for` loop: ``` comp_offsets_h = np.zeros(n_tiles, dtype=np.int64) for i in range(1, n_tiles): comp_offsets_h[i] = comp_offsets_h[i - 1] + comp_sizes_list[i - 1] ``` The sibling helper `_batched_d2h_to_bytes` at ~L924 and the post-compress prefix sum in `_nvcomp_batch_compress` at ~L2572 already use `np.cumsum(sizes, out=offsets[1:])`. Aligning the decompress side keeps the codebase consistent and trims interpreter overhead. Replace the loop with `np.fromiter` + `np.cumsum(..., out=...)` and materialise the per-tile sizes once as `comp_sizes_arr`; subsequent slicing and the `cupy.asarray` upload use the array directly. The microbench on 1024 tiles drops the prefix sum from 84us to 21us (~3.9x). Tests cover: * structural -- the decompress upload block uses `np.cumsum` and no longer contains the `for i in range(1, n_tiles)` loop, * equivalence -- the cumsum-based offsets match the prior loop pointwise on a 1024-element random array, * round-trip -- a 1024x1024 float32 deflate-tiled raster still decodes through the GPU path and matches the source byte-for-byte. --- xrspatial/geotiff/_gpu_decode.py | 24 ++- ...t_nvcomp_decompress_cumsum_offsets_1950.py | 142 ++++++++++++++++++ 2 files changed, 159 insertions(+), 7 deletions(-) create mode 100644 xrspatial/geotiff/tests/test_nvcomp_decompress_cumsum_offsets_1950.py diff --git a/xrspatial/geotiff/_gpu_decode.py b/xrspatial/geotiff/_gpu_decode.py index db8e6cfb1..398388b2b 100644 --- a/xrspatial/geotiff/_gpu_decode.py +++ b/xrspatial/geotiff/_gpu_decode.py @@ -1189,15 +1189,26 @@ class _NvcompDeflateDecompOpts(ctypes.Structure): # LZW/Deflate concat-then-upload pattern below (~L1714-1722). # Per-tile cupy.asarray was measured at 256x64KB -> 6.07 ms vs 3.65 ms # for the batched form (~1.66x speedup, scales worse with more tiles). - comp_sizes_list = [len(t) for t in raw_tiles] + # + # ``np.cumsum(..., out=offsets[1:])`` vectorises the prefix-sum + # so the per-tile offsets land in one C-level pass instead of a + # Python ``for`` loop. Aligns with the sibling + # ``_batched_d2h_to_bytes`` helper (~L924) and the + # ``_nvcomp_batch_compress`` post-decompress prefix sum (~L2572). + # Microbench (1024 tiles): 84us Python loop -> 21us cumsum + # (~3.9x). See issue #1950. + comp_sizes_arr = np.fromiter( + (len(t) for t in raw_tiles), + dtype=np.int64, count=n_tiles, + ) comp_offsets_h = np.zeros(n_tiles, dtype=np.int64) - for i in range(1, n_tiles): - comp_offsets_h[i] = comp_offsets_h[i - 1] + comp_sizes_list[i - 1] - total_comp = sum(comp_sizes_list) + if n_tiles > 1: + np.cumsum(comp_sizes_arr[:-1], out=comp_offsets_h[1:]) + total_comp = int(comp_sizes_arr.sum()) comp_buf_host = np.empty(total_comp, dtype=np.uint8) for i, tile in enumerate(raw_tiles): - comp_buf_host[comp_offsets_h[i]:comp_offsets_h[i] + comp_sizes_list[i]] = \ + comp_buf_host[comp_offsets_h[i]:comp_offsets_h[i] + comp_sizes_arr[i]] = \ np.frombuffer(tile, dtype=np.uint8) d_comp = cupy.asarray(comp_buf_host) @@ -1210,8 +1221,7 @@ class _NvcompDeflateDecompOpts(ctypes.Structure): decomp_offsets_h = (np.arange(n_tiles, dtype=np.uint64) * np.uint64(tile_bytes)) d_decomp_ptrs = cupy.asarray(base_decomp_ptr + decomp_offsets_h) - d_comp_sizes = cupy.asarray( - np.array(comp_sizes_list, dtype=np.uint64)) + d_comp_sizes = cupy.asarray(comp_sizes_arr.astype(np.uint64)) d_buf_sizes = cupy.full(n_tiles, tile_bytes, dtype=cupy.uint64) d_actual = cupy.empty(n_tiles, dtype=cupy.uint64) diff --git a/xrspatial/geotiff/tests/test_nvcomp_decompress_cumsum_offsets_1950.py b/xrspatial/geotiff/tests/test_nvcomp_decompress_cumsum_offsets_1950.py new file mode 100644 index 000000000..a4629c3ae --- /dev/null +++ b/xrspatial/geotiff/tests/test_nvcomp_decompress_cumsum_offsets_1950.py @@ -0,0 +1,142 @@ +"""Regression tests for issue #1950. + +``_try_nvcomp_batch_decompress`` used to compute its per-tile host +prefix-sum offsets via a Python ``for`` loop: + +``` +comp_sizes_list = [len(t) for t in raw_tiles] +comp_offsets_h = np.zeros(n_tiles, dtype=np.int64) +for i in range(1, n_tiles): + comp_offsets_h[i] = comp_offsets_h[i - 1] + comp_sizes_list[i - 1] +``` + +The sibling batched-D2H helper ``_batched_d2h_to_bytes`` at ~L924 and +the compress-side prefix sum in ``_nvcomp_batch_compress`` at ~L2572 +both use ``np.cumsum(sizes, out=offsets[1:])``. Aligning the +decompress side keeps the codebase consistent and trims interpreter +overhead. + +Two guards here: + +1. Correctness -- a tiny synthetic nvCOMP round-trip (when the lib is + available) still decodes every tile correctly. Without nvCOMP the + test exercises the same prefix-sum reshape via direct comparison + against ``np.cumsum``. +2. Structural -- the source uses ``np.cumsum`` (not a Python + ``range(1, n_tiles)`` loop) for the prefix sum. +""" +from __future__ import annotations + +import importlib.util +import os +import tempfile + +import numpy as np +import pytest + + +def test_nvcomp_decompress_uses_cumsum_for_offsets_1950(): + """Source-level guard against reintroducing the Python for loop. + + The fix swaps the per-tile prefix-sum loop for ``np.cumsum``. + This test fires if anyone reverts to the loop or otherwise breaks + the alignment with ``_batched_d2h_to_bytes`` / ``_nvcomp_batch_compress``. + """ + import pathlib + + src_path = pathlib.Path(__file__).parent.parent / "_gpu_decode.py" + src = src_path.read_text() + + # Locate the decompress prefix-sum site. It sits inside + # ``_try_nvcomp_batch_decompress`` and is anchored by the + # ``Batch host->device upload`` comment that documents the rationale. + anchor = "Batch host->device upload: concatenate all compressed tiles" + idx = src.find(anchor) + assert idx != -1, "could not locate the decompress upload block" + # Take a 1500-char window after the anchor; the prefix-sum lives + # within the first ~30 lines of that block. + block = src[idx:idx + 1500] + + assert "np.cumsum(" in block, ( + "decompress upload block should use np.cumsum for prefix-sum " + "offsets, aligning with _batched_d2h_to_bytes (issue #1950)." + ) + # The legacy Python loop would have ``for i in range(1, n_tiles):``. + assert "for i in range(1, n_tiles)" not in block, ( + "decompress upload block should no longer compute prefix-sum " + "offsets with a Python for loop (issue #1950)." + ) + + +def test_cumsum_matches_loop_prefix_sum_1950(): + """Equivalence between the vectorised cumsum and the prior loop. + + Numeric guard. Even though the two forms produce the same output + by construction, a runtime check confirms the cumsum form does not + drift away from the previous semantics across numpy versions. + """ + rng = np.random.RandomState(1950) + n = 1024 + sizes = rng.randint(100, 100_000, size=n).astype(np.int64) + + # Vectorised form (matches the fix). + offsets_cumsum = np.zeros(n, dtype=np.int64) + if n > 1: + np.cumsum(sizes[:-1], out=offsets_cumsum[1:]) + + # Reference: explicit Python prefix sum. + offsets_loop = np.zeros(n, dtype=np.int64) + for i in range(1, n): + offsets_loop[i] = offsets_loop[i - 1] + sizes[i - 1] + + np.testing.assert_array_equal(offsets_cumsum, offsets_loop) + + +@pytest.mark.skipif( + importlib.util.find_spec("cupy") is None, + reason="cupy required for nvCOMP path", +) +def test_nvcomp_batch_decompress_roundtrip_1950(): + """End-to-end check: a deflate-tiled raster still decodes correctly. + + Exercises ``_try_nvcomp_batch_decompress`` on a real file via the + public ``read_geotiff_gpu`` entry point. If the prefix-sum + refactor mis-stages a tile, the decoded buffer would not match + the source, surfacing as a numerical regression here. + + The test is gated on cupy availability rather than the nvCOMP lib + explicitly because the GPU read path falls back to a CPU codec when + nvCOMP is missing; in that case the test still exercises the GPU + upload + tile assembly but bypasses the prefix-sum site directly. + Setting ``XRSPATIAL_GEOTIFF_STRICT_GPU=1`` would gate harder. + """ + try: + import cupy + except ImportError: + pytest.skip("cupy not importable") + if not cupy.cuda.is_available(): + pytest.skip("CUDA device not available") + + import xarray as xr + from xrspatial.geotiff import open_geotiff, to_geotiff + + rng = np.random.RandomState(1950) + height, width = 1024, 1024 + arr = rng.rand(height, width).astype(np.float32) + da = xr.DataArray( + arr, dims=["y", "x"], + coords={"y": np.arange(height), "x": np.arange(width)}, + attrs={"crs": 4326}, + ) + + with tempfile.TemporaryDirectory() as td: + path = os.path.join(td, "tmp_1950_deflate.tif") + to_geotiff(da, path, compression="deflate", tile_size=256) + + # Read back through the GPU pipeline. + result = open_geotiff(path, gpu=True) + assert result.shape == (height, width) + decoded = cupy.asnumpy(result.data) if hasattr( + result.data, "get") else np.asarray(result.data) + + np.testing.assert_allclose(decoded, arr, atol=0, rtol=0) From 84b7f9a1adb104871ead347d7904b4d5f91ad4d7 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Fri, 15 May 2026 09:03:27 -0700 Subject: [PATCH 2/2] geotiff: address PR #1955 review nits --- xrspatial/geotiff/_gpu_decode.py | 12 +++-- ...t_nvcomp_decompress_cumsum_offsets_1950.py | 52 ++++++++++++------- 2 files changed, 39 insertions(+), 25 deletions(-) diff --git a/xrspatial/geotiff/_gpu_decode.py b/xrspatial/geotiff/_gpu_decode.py index 398388b2b..c1e274057 100644 --- a/xrspatial/geotiff/_gpu_decode.py +++ b/xrspatial/geotiff/_gpu_decode.py @@ -1197,11 +1197,14 @@ class _NvcompDeflateDecompOpts(ctypes.Structure): # ``_nvcomp_batch_compress`` post-decompress prefix sum (~L2572). # Microbench (1024 tiles): 84us Python loop -> 21us cumsum # (~3.9x). See issue #1950. + # Allocate as uint64 up front: nvcomp consumes uint64 pointer/size + # arrays, so skipping the intermediate int64 -> uint64 .astype copies + # at the cupy.asarray sites avoids a redundant host-side allocation. comp_sizes_arr = np.fromiter( (len(t) for t in raw_tiles), - dtype=np.int64, count=n_tiles, + dtype=np.uint64, count=n_tiles, ) - comp_offsets_h = np.zeros(n_tiles, dtype=np.int64) + comp_offsets_h = np.zeros(n_tiles, dtype=np.uint64) if n_tiles > 1: np.cumsum(comp_sizes_arr[:-1], out=comp_offsets_h[1:]) total_comp = int(comp_sizes_arr.sum()) @@ -1216,12 +1219,11 @@ class _NvcompDeflateDecompOpts(ctypes.Structure): base_comp_ptr = int(d_comp.data.ptr) base_decomp_ptr = int(d_decomp.data.ptr) - d_comp_ptrs = cupy.asarray( - base_comp_ptr + comp_offsets_h.astype(np.uint64)) + d_comp_ptrs = cupy.asarray(base_comp_ptr + comp_offsets_h) decomp_offsets_h = (np.arange(n_tiles, dtype=np.uint64) * np.uint64(tile_bytes)) d_decomp_ptrs = cupy.asarray(base_decomp_ptr + decomp_offsets_h) - d_comp_sizes = cupy.asarray(comp_sizes_arr.astype(np.uint64)) + d_comp_sizes = cupy.asarray(comp_sizes_arr) d_buf_sizes = cupy.full(n_tiles, tile_bytes, dtype=cupy.uint64) d_actual = cupy.empty(n_tiles, dtype=cupy.uint64) diff --git a/xrspatial/geotiff/tests/test_nvcomp_decompress_cumsum_offsets_1950.py b/xrspatial/geotiff/tests/test_nvcomp_decompress_cumsum_offsets_1950.py index a4629c3ae..040043843 100644 --- a/xrspatial/geotiff/tests/test_nvcomp_decompress_cumsum_offsets_1950.py +++ b/xrspatial/geotiff/tests/test_nvcomp_decompress_cumsum_offsets_1950.py @@ -29,6 +29,7 @@ import importlib.util import os +import re import tempfile import numpy as np @@ -47,22 +48,27 @@ def test_nvcomp_decompress_uses_cumsum_for_offsets_1950(): src_path = pathlib.Path(__file__).parent.parent / "_gpu_decode.py" src = src_path.read_text() - # Locate the decompress prefix-sum site. It sits inside - # ``_try_nvcomp_batch_decompress`` and is anchored by the - # ``Batch host->device upload`` comment that documents the rationale. - anchor = "Batch host->device upload: concatenate all compressed tiles" - idx = src.find(anchor) - assert idx != -1, "could not locate the decompress upload block" - # Take a 1500-char window after the anchor; the prefix-sum lives - # within the first ~30 lines of that block. - block = src[idx:idx + 1500] - - assert "np.cumsum(" in block, ( - "decompress upload block should use np.cumsum for prefix-sum " - "offsets, aligning with _batched_d2h_to_bytes (issue #1950)." + # Anchor on the exact decompress-side prefix-sum call. Regex is + # tighter than a fixed character window and survives surrounding + # code edits. + cumsum_call = re.compile( + r"np\.cumsum\(\s*comp_sizes_arr\[:-1\]\s*,\s*" + r"out\s*=\s*comp_offsets_h\[1:\]\s*\)" ) - # The legacy Python loop would have ``for i in range(1, n_tiles):``. - assert "for i in range(1, n_tiles)" not in block, ( + assert cumsum_call.search(src), ( + "decompress upload block should use " + "``np.cumsum(comp_sizes_arr[:-1], out=comp_offsets_h[1:])`` for " + "prefix-sum offsets, aligning with _batched_d2h_to_bytes " + "(issue #1950)." + ) + # The legacy Python loop would have written + # ``comp_offsets_h[i] = comp_offsets_h[i - 1] + ...`` inside a + # ``for i in range(1, n_tiles):`` block. + legacy_loop = re.compile( + r"for\s+i\s+in\s+range\(\s*1\s*,\s*n_tiles\s*\)\s*:\s*\n" + r"\s*comp_offsets_h\[i\]" + ) + assert not legacy_loop.search(src), ( "decompress upload block should no longer compute prefix-sum " "offsets with a Python for loop (issue #1950)." ) @@ -104,12 +110,18 @@ def test_nvcomp_batch_decompress_roundtrip_1950(): refactor mis-stages a tile, the decoded buffer would not match the source, surfacing as a numerical regression here. - The test is gated on cupy availability rather than the nvCOMP lib - explicitly because the GPU read path falls back to a CPU codec when - nvCOMP is missing; in that case the test still exercises the GPU - upload + tile assembly but bypasses the prefix-sum site directly. - Setting ``XRSPATIAL_GEOTIFF_STRICT_GPU=1`` would gate harder. + Gated on ``XRSPATIAL_GEOTIFF_STRICT_GPU=1`` so the run-time check + only fires in environments that actually carry nvCOMP. Without the + flag the GPU read path silently falls back to a CPU codec when + nvCOMP is missing, which bypasses the prefix-sum site entirely; a + pass under that fallback would be misleading, so we skip instead. """ + if os.environ.get("XRSPATIAL_GEOTIFF_STRICT_GPU") != "1": + pytest.skip( + "set XRSPATIAL_GEOTIFF_STRICT_GPU=1 to exercise the nvCOMP " + "prefix-sum site; without it the GPU path may fall back to " + "a CPU codec and bypass this regression." + ) try: import cupy except ImportError: