From 665c2829c94bf76b53c24b53246c8eba57969802 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Fri, 15 May 2026 08:30:28 -0700
Subject: [PATCH 1/2] geotiff: nvcomp decompress prefix-sum offsets via
 np.cumsum (#1950)

`_try_nvcomp_batch_decompress` computed its per-tile host offsets via
a Python `for` loop:

```
comp_offsets_h = np.zeros(n_tiles, dtype=np.int64)
for i in range(1, n_tiles):
    comp_offsets_h[i] = comp_offsets_h[i - 1] + comp_sizes_list[i - 1]
```

The sibling helper `_batched_d2h_to_bytes` at ~L924 and the
post-compress prefix sum in `_nvcomp_batch_compress` at ~L2572 already
use `np.cumsum(sizes, out=offsets[1:])`. Aligning the decompress side
keeps the codebase consistent and trims interpreter overhead.

Replace the loop with `np.fromiter` + `np.cumsum(..., out=...)` and
materialise the per-tile sizes once as `comp_sizes_arr`; subsequent
slicing and the `cupy.asarray` upload use the array directly. The
microbench on 1024 tiles drops the prefix sum from 84us to 21us
(~3.9x).

Tests cover:
* structural -- the decompress upload block uses `np.cumsum` and no
  longer contains the `for i in range(1, n_tiles)` loop,
* equivalence -- the cumsum-based offsets match the prior loop
  pointwise on a 1024-element random array,
* round-trip -- a 1024x1024 float32 deflate-tiled raster still decodes
  through the GPU path and matches the source byte-for-byte.
---
 xrspatial/geotiff/_gpu_decode.py              |  24 ++-
 ...t_nvcomp_decompress_cumsum_offsets_1950.py | 142 ++++++++++++++++++
 2 files changed, 159 insertions(+), 7 deletions(-)
 create mode 100644 xrspatial/geotiff/tests/test_nvcomp_decompress_cumsum_offsets_1950.py

diff --git a/xrspatial/geotiff/_gpu_decode.py b/xrspatial/geotiff/_gpu_decode.py
index db8e6cfb1..398388b2b 100644
--- a/xrspatial/geotiff/_gpu_decode.py
+++ b/xrspatial/geotiff/_gpu_decode.py
@@ -1189,15 +1189,26 @@ class _NvcompDeflateDecompOpts(ctypes.Structure):
         # LZW/Deflate concat-then-upload pattern below (~L1714-1722).
         # Per-tile cupy.asarray was measured at 256x64KB -> 6.07 ms vs 3.65 ms
         # for the batched form (~1.66x speedup, scales worse with more tiles).
-        comp_sizes_list = [len(t) for t in raw_tiles]
+        #
+        # ``np.cumsum(..., out=offsets[1:])`` vectorises the prefix-sum
+        # so the per-tile offsets land in one C-level pass instead of a
+        # Python ``for`` loop. Aligns with the sibling
+        # ``_batched_d2h_to_bytes`` helper (~L924) and the
+        # ``_nvcomp_batch_compress`` post-decompress prefix sum (~L2572).
+        # Microbench (1024 tiles): 84us Python loop -> 21us cumsum
+        # (~3.9x). See issue #1950.
+        comp_sizes_arr = np.fromiter(
+            (len(t) for t in raw_tiles),
+            dtype=np.int64, count=n_tiles,
+        )
         comp_offsets_h = np.zeros(n_tiles, dtype=np.int64)
-        for i in range(1, n_tiles):
-            comp_offsets_h[i] = comp_offsets_h[i - 1] + comp_sizes_list[i - 1]
-        total_comp = sum(comp_sizes_list)
+        if n_tiles > 1:
+            np.cumsum(comp_sizes_arr[:-1], out=comp_offsets_h[1:])
+        total_comp = int(comp_sizes_arr.sum())
 
         comp_buf_host = np.empty(total_comp, dtype=np.uint8)
         for i, tile in enumerate(raw_tiles):
-            comp_buf_host[comp_offsets_h[i]:comp_offsets_h[i] + comp_sizes_list[i]] = \
+            comp_buf_host[comp_offsets_h[i]:comp_offsets_h[i] + comp_sizes_arr[i]] = \
                 np.frombuffer(tile, dtype=np.uint8)
 
         d_comp = cupy.asarray(comp_buf_host)
@@ -1210,8 +1221,7 @@ class _NvcompDeflateDecompOpts(ctypes.Structure):
         decomp_offsets_h = (np.arange(n_tiles, dtype=np.uint64)
                             * np.uint64(tile_bytes))
         d_decomp_ptrs = cupy.asarray(base_decomp_ptr + decomp_offsets_h)
-        d_comp_sizes = cupy.asarray(
-            np.array(comp_sizes_list, dtype=np.uint64))
+        d_comp_sizes = cupy.asarray(comp_sizes_arr.astype(np.uint64))
         d_buf_sizes = cupy.full(n_tiles, tile_bytes, dtype=cupy.uint64)
         d_actual = cupy.empty(n_tiles, dtype=cupy.uint64)
 
diff --git a/xrspatial/geotiff/tests/test_nvcomp_decompress_cumsum_offsets_1950.py b/xrspatial/geotiff/tests/test_nvcomp_decompress_cumsum_offsets_1950.py
new file mode 100644
index 000000000..a4629c3ae
--- /dev/null
+++ b/xrspatial/geotiff/tests/test_nvcomp_decompress_cumsum_offsets_1950.py
@@ -0,0 +1,142 @@
+"""Regression tests for issue #1950.
+
+``_try_nvcomp_batch_decompress`` used to compute its per-tile host
+prefix-sum offsets via a Python ``for`` loop:
+
+```
+comp_sizes_list = [len(t) for t in raw_tiles]
+comp_offsets_h = np.zeros(n_tiles, dtype=np.int64)
+for i in range(1, n_tiles):
+    comp_offsets_h[i] = comp_offsets_h[i - 1] + comp_sizes_list[i - 1]
+```
+
+The sibling batched-D2H helper ``_batched_d2h_to_bytes`` at ~L924 and
+the compress-side prefix sum in ``_nvcomp_batch_compress`` at ~L2572
+both use ``np.cumsum(sizes, out=offsets[1:])``. Aligning the
+decompress side keeps the codebase consistent and trims interpreter
+overhead.
+
+Two guards here:
+
+1. Correctness -- a tiny synthetic nvCOMP round-trip (when the lib is
+   available) still decodes every tile correctly. Without nvCOMP the
+   test exercises the same prefix-sum reshape via direct comparison
+   against ``np.cumsum``.
+2. Structural -- the source uses ``np.cumsum`` (not a Python
+   ``range(1, n_tiles)`` loop) for the prefix sum.
+"""
+from __future__ import annotations
+
+import importlib.util
+import os
+import tempfile
+
+import numpy as np
+import pytest
+
+
+def test_nvcomp_decompress_uses_cumsum_for_offsets_1950():
+    """Source-level guard against reintroducing the Python for loop.
+
+    The fix swaps the per-tile prefix-sum loop for ``np.cumsum``.
+    This test fires if anyone reverts to the loop or otherwise breaks
+    the alignment with ``_batched_d2h_to_bytes`` / ``_nvcomp_batch_compress``.
+    """
+    import pathlib
+
+    src_path = pathlib.Path(__file__).parent.parent / "_gpu_decode.py"
+    src = src_path.read_text()
+
+    # Locate the decompress prefix-sum site. It sits inside
+    # ``_try_nvcomp_batch_decompress`` and is anchored by the
+    # ``Batch host->device upload`` comment that documents the rationale.
+    anchor = "Batch host->device upload: concatenate all compressed tiles"
+    idx = src.find(anchor)
+    assert idx != -1, "could not locate the decompress upload block"
+    # Take a 1500-char window after the anchor; the prefix-sum lives
+    # within the first ~30 lines of that block.
+    block = src[idx:idx + 1500]
+
+    assert "np.cumsum(" in block, (
+        "decompress upload block should use np.cumsum for prefix-sum "
+        "offsets, aligning with _batched_d2h_to_bytes (issue #1950)."
+    )
+    # The legacy Python loop would have ``for i in range(1, n_tiles):``.
+    assert "for i in range(1, n_tiles)" not in block, (
+        "decompress upload block should no longer compute prefix-sum "
+        "offsets with a Python for loop (issue #1950)."
+    )
+
+
+def test_cumsum_matches_loop_prefix_sum_1950():
+    """Equivalence between the vectorised cumsum and the prior loop.
+
+    Numeric guard. Even though the two forms produce the same output
+    by construction, a runtime check confirms the cumsum form does not
+    drift away from the previous semantics across numpy versions.
+    """
+    rng = np.random.RandomState(1950)
+    n = 1024
+    sizes = rng.randint(100, 100_000, size=n).astype(np.int64)
+
+    # Vectorised form (matches the fix).
+    offsets_cumsum = np.zeros(n, dtype=np.int64)
+    if n > 1:
+        np.cumsum(sizes[:-1], out=offsets_cumsum[1:])
+
+    # Reference: explicit Python prefix sum.
+    offsets_loop = np.zeros(n, dtype=np.int64)
+    for i in range(1, n):
+        offsets_loop[i] = offsets_loop[i - 1] + sizes[i - 1]
+
+    np.testing.assert_array_equal(offsets_cumsum, offsets_loop)
+
+
+@pytest.mark.skipif(
+    importlib.util.find_spec("cupy") is None,
+    reason="cupy required for nvCOMP path",
+)
+def test_nvcomp_batch_decompress_roundtrip_1950():
+    """End-to-end check: a deflate-tiled raster still decodes correctly.
+
+    Exercises ``_try_nvcomp_batch_decompress`` on a real file via the
+    public ``read_geotiff_gpu`` entry point. If the prefix-sum
+    refactor mis-stages a tile, the decoded buffer would not match
+    the source, surfacing as a numerical regression here.
+
+    The test is gated on cupy availability rather than the nvCOMP lib
+    explicitly because the GPU read path falls back to a CPU codec when
+    nvCOMP is missing; in that case the test still exercises the GPU
+    upload + tile assembly but bypasses the prefix-sum site directly.
+    Setting ``XRSPATIAL_GEOTIFF_STRICT_GPU=1`` would gate harder.
+    """
+    try:
+        import cupy
+    except ImportError:
+        pytest.skip("cupy not importable")
+    if not cupy.cuda.is_available():
+        pytest.skip("CUDA device not available")
+
+    import xarray as xr
+    from xrspatial.geotiff import open_geotiff, to_geotiff
+
+    rng = np.random.RandomState(1950)
+    height, width = 1024, 1024
+    arr = rng.rand(height, width).astype(np.float32)
+    da = xr.DataArray(
+        arr, dims=["y", "x"],
+        coords={"y": np.arange(height), "x": np.arange(width)},
+        attrs={"crs": 4326},
+    )
+
+    with tempfile.TemporaryDirectory() as td:
+        path = os.path.join(td, "tmp_1950_deflate.tif")
+        to_geotiff(da, path, compression="deflate", tile_size=256)
+
+        # Read back through the GPU pipeline.
+        result = open_geotiff(path, gpu=True)
+        assert result.shape == (height, width)
+        decoded = cupy.asnumpy(result.data) if hasattr(
+            result.data, "get") else np.asarray(result.data)
+
+    np.testing.assert_allclose(decoded, arr, atol=0, rtol=0)

From 84b7f9a1adb104871ead347d7904b4d5f91ad4d7 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Fri, 15 May 2026 09:03:27 -0700
Subject: [PATCH 2/2] geotiff: address PR #1955 review nits

---
 xrspatial/geotiff/_gpu_decode.py              | 12 +++--
 ...t_nvcomp_decompress_cumsum_offsets_1950.py | 52 ++++++++++++-------
 2 files changed, 39 insertions(+), 25 deletions(-)

diff --git a/xrspatial/geotiff/_gpu_decode.py b/xrspatial/geotiff/_gpu_decode.py
index 398388b2b..c1e274057 100644
--- a/xrspatial/geotiff/_gpu_decode.py
+++ b/xrspatial/geotiff/_gpu_decode.py
@@ -1197,11 +1197,14 @@ class _NvcompDeflateDecompOpts(ctypes.Structure):
         # ``_nvcomp_batch_compress`` post-decompress prefix sum (~L2572).
         # Microbench (1024 tiles): 84us Python loop -> 21us cumsum
         # (~3.9x). See issue #1950.
+        # Allocate as uint64 up front: nvcomp consumes uint64 pointer/size
+        # arrays, so skipping the intermediate int64 -> uint64 .astype copies
+        # at the cupy.asarray sites avoids a redundant host-side allocation.
         comp_sizes_arr = np.fromiter(
             (len(t) for t in raw_tiles),
-            dtype=np.int64, count=n_tiles,
+            dtype=np.uint64, count=n_tiles,
         )
-        comp_offsets_h = np.zeros(n_tiles, dtype=np.int64)
+        comp_offsets_h = np.zeros(n_tiles, dtype=np.uint64)
         if n_tiles > 1:
             np.cumsum(comp_sizes_arr[:-1], out=comp_offsets_h[1:])
         total_comp = int(comp_sizes_arr.sum())
@@ -1216,12 +1219,11 @@ class _NvcompDeflateDecompOpts(ctypes.Structure):
 
         base_comp_ptr = int(d_comp.data.ptr)
         base_decomp_ptr = int(d_decomp.data.ptr)
-        d_comp_ptrs = cupy.asarray(
-            base_comp_ptr + comp_offsets_h.astype(np.uint64))
+        d_comp_ptrs = cupy.asarray(base_comp_ptr + comp_offsets_h)
         decomp_offsets_h = (np.arange(n_tiles, dtype=np.uint64)
                             * np.uint64(tile_bytes))
         d_decomp_ptrs = cupy.asarray(base_decomp_ptr + decomp_offsets_h)
-        d_comp_sizes = cupy.asarray(comp_sizes_arr.astype(np.uint64))
+        d_comp_sizes = cupy.asarray(comp_sizes_arr)
         d_buf_sizes = cupy.full(n_tiles, tile_bytes, dtype=cupy.uint64)
         d_actual = cupy.empty(n_tiles, dtype=cupy.uint64)
 
diff --git a/xrspatial/geotiff/tests/test_nvcomp_decompress_cumsum_offsets_1950.py b/xrspatial/geotiff/tests/test_nvcomp_decompress_cumsum_offsets_1950.py
index a4629c3ae..040043843 100644
--- a/xrspatial/geotiff/tests/test_nvcomp_decompress_cumsum_offsets_1950.py
+++ b/xrspatial/geotiff/tests/test_nvcomp_decompress_cumsum_offsets_1950.py
@@ -29,6 +29,7 @@
 
 import importlib.util
 import os
+import re
 import tempfile
 
 import numpy as np
@@ -47,22 +48,27 @@ def test_nvcomp_decompress_uses_cumsum_for_offsets_1950():
     src_path = pathlib.Path(__file__).parent.parent / "_gpu_decode.py"
     src = src_path.read_text()
 
-    # Locate the decompress prefix-sum site. It sits inside
-    # ``_try_nvcomp_batch_decompress`` and is anchored by the
-    # ``Batch host->device upload`` comment that documents the rationale.
-    anchor = "Batch host->device upload: concatenate all compressed tiles"
-    idx = src.find(anchor)
-    assert idx != -1, "could not locate the decompress upload block"
-    # Take a 1500-char window after the anchor; the prefix-sum lives
-    # within the first ~30 lines of that block.
-    block = src[idx:idx + 1500]
-
-    assert "np.cumsum(" in block, (
-        "decompress upload block should use np.cumsum for prefix-sum "
-        "offsets, aligning with _batched_d2h_to_bytes (issue #1950)."
+    # Anchor on the exact decompress-side prefix-sum call. Regex is
+    # tighter than a fixed character window and survives surrounding
+    # code edits.
+    cumsum_call = re.compile(
+        r"np\.cumsum\(\s*comp_sizes_arr\[:-1\]\s*,\s*"
+        r"out\s*=\s*comp_offsets_h\[1:\]\s*\)"
     )
-    # The legacy Python loop would have ``for i in range(1, n_tiles):``.
-    assert "for i in range(1, n_tiles)" not in block, (
+    assert cumsum_call.search(src), (
+        "decompress upload block should use "
+        "``np.cumsum(comp_sizes_arr[:-1], out=comp_offsets_h[1:])`` for "
+        "prefix-sum offsets, aligning with _batched_d2h_to_bytes "
+        "(issue #1950)."
+    )
+    # The legacy Python loop would have written
+    # ``comp_offsets_h[i] = comp_offsets_h[i - 1] + ...`` inside a
+    # ``for i in range(1, n_tiles):`` block.
+    legacy_loop = re.compile(
+        r"for\s+i\s+in\s+range\(\s*1\s*,\s*n_tiles\s*\)\s*:\s*\n"
+        r"\s*comp_offsets_h\[i\]"
+    )
+    assert not legacy_loop.search(src), (
         "decompress upload block should no longer compute prefix-sum "
         "offsets with a Python for loop (issue #1950)."
     )
@@ -104,12 +110,18 @@ def test_nvcomp_batch_decompress_roundtrip_1950():
     refactor mis-stages a tile, the decoded buffer would not match
     the source, surfacing as a numerical regression here.
 
-    The test is gated on cupy availability rather than the nvCOMP lib
-    explicitly because the GPU read path falls back to a CPU codec when
-    nvCOMP is missing; in that case the test still exercises the GPU
-    upload + tile assembly but bypasses the prefix-sum site directly.
-    Setting ``XRSPATIAL_GEOTIFF_STRICT_GPU=1`` would gate harder.
+    Gated on ``XRSPATIAL_GEOTIFF_STRICT_GPU=1`` so the run-time check
+    only fires in environments that actually carry nvCOMP. Without the
+    flag the GPU read path silently falls back to a CPU codec when
+    nvCOMP is missing, which bypasses the prefix-sum site entirely; a
+    pass under that fallback would be misleading, so we skip instead.
     """
+    if os.environ.get("XRSPATIAL_GEOTIFF_STRICT_GPU") != "1":
+        pytest.skip(
+            "set XRSPATIAL_GEOTIFF_STRICT_GPU=1 to exercise the nvCOMP "
+            "prefix-sum site; without it the GPU path may fall back to "
+            "a CPU codec and bypass this regression."
+        )
     try:
         import cupy
     except ImportError: