From 4ba17ec2316463aec41c08ee10ab59f12f5326d0 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Sun, 19 Apr 2026 02:14:46 -0700
Subject: [PATCH] Fix GPU predictor kernel stride for multi-sample tiled TIFFs
 (#1220)

Both call sites of _predictor_decode_kernel passed
width=tile_width*samples and bytes_per_sample=itemsize*samples. The
kernel computes row_bytes = width * bytes_per_sample, so row_bytes
ended up tile_width * samples**2 * itemsize instead of
tile_width * samples * itemsize. The inner loop walked past the end
of each tile row and, on the last tile, past the end of the d_decomp
buffer (OOB GPU write).

Fix: pass width=tile_width at both call sites. bytes_per_sample stays
itemsize*samples. Matches the CPU call convention in
_reader._apply_predictor.

Regression test builds tiled multi-sample TIFFs (RGB/RGBA, uint8/uint16,
even and uneven tile grids) with predictor=2, decodes via the GPU path,
and asserts byte-for-byte equality with the CPU decode.
---
 .claude/sweep-security-state.json             |   4 +-
 xrspatial/geotiff/_gpu_decode.py              |  11 +-
 .../tests/test_predictor_multisample.py       | 108 ++++++++++++++++++
 3 files changed, 119 insertions(+), 4 deletions(-)
 create mode 100644 xrspatial/geotiff/tests/test_predictor_multisample.py

diff --git a/.claude/sweep-security-state.json b/.claude/sweep-security-state.json
index cdd13ee3..198b2b43 100644
--- a/.claude/sweep-security-state.json
+++ b/.claude/sweep-security-state.json
@@ -9,8 +9,10 @@
     "geotiff": {
       "last_inspected": "2026-04-17",
       "issue": 1215,
+      "followup_issues": [1220],
       "severity_max": "HIGH",
-      "categories_found": [1, 4]
+      "categories_found": [1, 4],
+      "notes": "Follow-up #1220: GPU predictor=2 kernel over-indexed d_decomp for multi-sample tiled TIFFs (silent OOB GPU write). Fixed by passing width=tile_width instead of tile_width*samples."
     }
   }
 }
diff --git a/xrspatial/geotiff/_gpu_decode.py b/xrspatial/geotiff/_gpu_decode.py
index 420908d0..242852cf 100644
--- a/xrspatial/geotiff/_gpu_decode.py
+++ b/xrspatial/geotiff/_gpu_decode.py
@@ -1336,8 +1336,11 @@ def _apply_predictor_and_assemble(d_decomp, d_decomp_offsets, n_tiles,
         total_rows = n_tiles * tile_height
         tpb = min(256, total_rows)
         bpg = math.ceil(total_rows / tpb)
+        # Kernel uses row_bytes = width * bytes_per_sample, so pass pixel
+        # width and full per-pixel size (itemsize * samples). Matches CPU
+        # call at _reader.py _apply_predictor(..., bytes_per_sample * samples).
         _predictor_decode_kernel[bpg, tpb](
-            d_decomp, tile_width * samples, total_rows, dtype.itemsize * samples)
+            d_decomp, tile_width, total_rows, dtype.itemsize * samples)
         cuda.synchronize()
     elif predictor == 3:
         total_rows = n_tiles * tile_height
@@ -1579,9 +1582,11 @@ def gpu_decode_tiles(
         total_rows = n_tiles * tile_height
         tpb = min(256, total_rows)
         bpg = math.ceil(total_rows / tpb)
-        # Reshape so each tile's rows are contiguous (they already are)
+        # Reshape so each tile's rows are contiguous (they already are).
+        # Kernel uses row_bytes = width * bytes_per_sample, so pass pixel
+        # width and full per-pixel size (itemsize * samples).
         _predictor_decode_kernel[bpg, tpb](
-            d_decomp, tile_width * samples, total_rows, dtype.itemsize * samples)
+            d_decomp, tile_width, total_rows, dtype.itemsize * samples)
         cuda.synchronize()
 
     elif predictor == 3:
diff --git a/xrspatial/geotiff/tests/test_predictor_multisample.py b/xrspatial/geotiff/tests/test_predictor_multisample.py
new file mode 100644
index 00000000..e1704a85
--- /dev/null
+++ b/xrspatial/geotiff/tests/test_predictor_multisample.py
@@ -0,0 +1,108 @@
+"""Regression tests for GPU predictor decode on multi-sample tiled TIFFs.
+
+Covers issue #1220: the GPU predictor=2 decode path passed
+`width=tile_width * samples` and `bytes_per_sample=itemsize * samples`
+to `_predictor_decode_kernel`, causing `row_bytes` to be
+`tile_width * samples**2 * itemsize` instead of
+`tile_width * samples * itemsize`. That walks the cumulative-sum loop
+past the end of each tile row and, on the last tile, past the end of
+the `d_decomp` buffer (silent OOB GPU write).
+
+These tests build multi-sample tiled TIFFs with predictor=2 on the
+CPU path, decode them via the GPU path, and assert byte-for-byte
+equality with the CPU decode.
+"""
+from __future__ import annotations
+
+import os
+import tempfile
+
+import numpy as np
+import pytest
+import xarray as xr
+
+from xrspatial.geotiff import open_geotiff, to_geotiff
+
+
+try:
+    import cupy  # noqa: F401
+    from numba import cuda
+    _HAS_GPU = cuda.is_available()
+except Exception:
+    _HAS_GPU = False
+
+
+pytestmark = pytest.mark.skipif(
+    not _HAS_GPU, reason="Requires CuPy + CUDA for GPU decode path")
+
+
+def _gpu_to_numpy(da: xr.DataArray) -> np.ndarray:
+    """Pull a CuPy-backed DataArray to host memory."""
+    arr = da.data
+    if hasattr(arr, 'get'):
+        return arr.get()
+    return np.asarray(arr)
+
+
+@pytest.mark.parametrize("samples,dtype_str", [
+    (3, "uint8"),   # RGB uint8
+    (4, "uint8"),   # RGBA uint8
+    (3, "uint16"),  # RGB uint16
+])
+def test_gpu_predictor2_multisample_matches_cpu(tmp_path, samples, dtype_str):
+    """GPU decode of a tiled multi-sample TIFF with predictor=2 matches CPU.
+
+    Before the fix, the GPU predictor=2 kernel over-indexed the
+    decompressed buffer by a factor of `samples` on the inner loop,
+    corrupting the decoded pixels and writing past the end of
+    ``d_decomp`` on the last tile.
+    """
+    dtype = np.dtype(dtype_str)
+    h, w = 16, 16
+    rng = np.random.RandomState(42)
+    if dtype.kind == 'u' and dtype.itemsize == 1:
+        data = rng.randint(0, 256, size=(h, w, samples), dtype=dtype)
+    else:
+        high = np.iinfo(dtype).max if dtype.kind in ('u', 'i') else 1000
+        data = rng.randint(0, high, size=(h, w, samples), dtype=dtype)
+
+    da = xr.DataArray(data, dims=['y', 'x', 'band'])
+
+    path = str(tmp_path / f"rgb_pred_{samples}_{dtype_str}.tif")
+    # tile_size smaller than image so we exercise more than one tile.
+    to_geotiff(da, path, compression='deflate', tile_size=8, predictor=True)
+
+    cpu_arr = open_geotiff(path).values
+    assert cpu_arr.shape == (h, w, samples)
+    assert cpu_arr.dtype == dtype
+    # Sanity: CPU round-trip reproduces the source exactly.
+    np.testing.assert_array_equal(cpu_arr, data)
+
+    gpu_da = open_geotiff(path, gpu=True)
+    gpu_arr = _gpu_to_numpy(gpu_da)
+
+    assert gpu_arr.shape == cpu_arr.shape
+    assert gpu_arr.dtype == cpu_arr.dtype
+    # Byte-for-byte equality is the whole point: predictor=2 is exact.
+    np.testing.assert_array_equal(gpu_arr, cpu_arr)
+
+
+def test_gpu_predictor2_multisample_uneven_tiles(tmp_path):
+    """Image size not a multiple of tile size, multi-sample, predictor=2.
+
+    Exercises partial edge tiles which share the same predictor decode
+    path and are most likely to trip any lingering stride mismatch.
+    """
+    h, w, samples = 20, 20, 3  # 20 is not a multiple of 8
+    rng = np.random.RandomState(7)
+    data = rng.randint(0, 256, size=(h, w, samples), dtype=np.uint8)
+    da = xr.DataArray(data, dims=['y', 'x', 'band'])
+
+    path = str(tmp_path / "rgb_pred_uneven.tif")
+    to_geotiff(da, path, compression='deflate', tile_size=8, predictor=True)
+
+    cpu_arr = open_geotiff(path).values
+    gpu_arr = _gpu_to_numpy(open_geotiff(path, gpu=True))
+
+    np.testing.assert_array_equal(gpu_arr, cpu_arr)
+    np.testing.assert_array_equal(gpu_arr, data)