From 4ba17ec2316463aec41c08ee10ab59f12f5326d0 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Sun, 19 Apr 2026 02:14:46 -0700 Subject: [PATCH] Fix GPU predictor kernel stride for multi-sample tiled TIFFs (#1220) Both call sites of _predictor_decode_kernel passed width=tile_width*samples and bytes_per_sample=itemsize*samples. The kernel computes row_bytes = width * bytes_per_sample, so row_bytes ended up tile_width * samples**2 * itemsize instead of tile_width * samples * itemsize. The inner loop walked past the end of each tile row and, on the last tile, past the end of the d_decomp buffer (OOB GPU write). Fix: pass width=tile_width at both call sites. bytes_per_sample stays itemsize*samples. Matches the CPU call convention in _reader._apply_predictor. Regression test builds tiled multi-sample TIFFs (RGB/RGBA, uint8/uint16, even and uneven tile grids) with predictor=2, decodes via the GPU path, and asserts byte-for-byte equality with the CPU decode. --- .claude/sweep-security-state.json | 4 +- xrspatial/geotiff/_gpu_decode.py | 11 +- .../tests/test_predictor_multisample.py | 108 ++++++++++++++++++ 3 files changed, 119 insertions(+), 4 deletions(-) create mode 100644 xrspatial/geotiff/tests/test_predictor_multisample.py diff --git a/.claude/sweep-security-state.json b/.claude/sweep-security-state.json index cdd13ee3..198b2b43 100644 --- a/.claude/sweep-security-state.json +++ b/.claude/sweep-security-state.json @@ -9,8 +9,10 @@ "geotiff": { "last_inspected": "2026-04-17", "issue": 1215, + "followup_issues": [1220], "severity_max": "HIGH", - "categories_found": [1, 4] + "categories_found": [1, 4], + "notes": "Follow-up #1220: GPU predictor=2 kernel over-indexed d_decomp for multi-sample tiled TIFFs (silent OOB GPU write). Fixed by passing width=tile_width instead of tile_width*samples." } } } diff --git a/xrspatial/geotiff/_gpu_decode.py b/xrspatial/geotiff/_gpu_decode.py index 420908d0..242852cf 100644 --- a/xrspatial/geotiff/_gpu_decode.py +++ b/xrspatial/geotiff/_gpu_decode.py @@ -1336,8 +1336,11 @@ def _apply_predictor_and_assemble(d_decomp, d_decomp_offsets, n_tiles, total_rows = n_tiles * tile_height tpb = min(256, total_rows) bpg = math.ceil(total_rows / tpb) + # Kernel uses row_bytes = width * bytes_per_sample, so pass pixel + # width and full per-pixel size (itemsize * samples). Matches CPU + # call at _reader.py _apply_predictor(..., bytes_per_sample * samples). _predictor_decode_kernel[bpg, tpb]( - d_decomp, tile_width * samples, total_rows, dtype.itemsize * samples) + d_decomp, tile_width, total_rows, dtype.itemsize * samples) cuda.synchronize() elif predictor == 3: total_rows = n_tiles * tile_height @@ -1579,9 +1582,11 @@ def gpu_decode_tiles( total_rows = n_tiles * tile_height tpb = min(256, total_rows) bpg = math.ceil(total_rows / tpb) - # Reshape so each tile's rows are contiguous (they already are) + # Reshape so each tile's rows are contiguous (they already are). + # Kernel uses row_bytes = width * bytes_per_sample, so pass pixel + # width and full per-pixel size (itemsize * samples). _predictor_decode_kernel[bpg, tpb]( - d_decomp, tile_width * samples, total_rows, dtype.itemsize * samples) + d_decomp, tile_width, total_rows, dtype.itemsize * samples) cuda.synchronize() elif predictor == 3: diff --git a/xrspatial/geotiff/tests/test_predictor_multisample.py b/xrspatial/geotiff/tests/test_predictor_multisample.py new file mode 100644 index 00000000..e1704a85 --- /dev/null +++ b/xrspatial/geotiff/tests/test_predictor_multisample.py @@ -0,0 +1,108 @@ +"""Regression tests for GPU predictor decode on multi-sample tiled TIFFs. + +Covers issue #1220: the GPU predictor=2 decode path passed +`width=tile_width * samples` and `bytes_per_sample=itemsize * samples` +to `_predictor_decode_kernel`, causing `row_bytes` to be +`tile_width * samples**2 * itemsize` instead of +`tile_width * samples * itemsize`. That walks the cumulative-sum loop +past the end of each tile row and, on the last tile, past the end of +the `d_decomp` buffer (silent OOB GPU write). + +These tests build multi-sample tiled TIFFs with predictor=2 on the +CPU path, decode them via the GPU path, and assert byte-for-byte +equality with the CPU decode. +""" +from __future__ import annotations + +import os +import tempfile + +import numpy as np +import pytest +import xarray as xr + +from xrspatial.geotiff import open_geotiff, to_geotiff + + +try: + import cupy # noqa: F401 + from numba import cuda + _HAS_GPU = cuda.is_available() +except Exception: + _HAS_GPU = False + + +pytestmark = pytest.mark.skipif( + not _HAS_GPU, reason="Requires CuPy + CUDA for GPU decode path") + + +def _gpu_to_numpy(da: xr.DataArray) -> np.ndarray: + """Pull a CuPy-backed DataArray to host memory.""" + arr = da.data + if hasattr(arr, 'get'): + return arr.get() + return np.asarray(arr) + + +@pytest.mark.parametrize("samples,dtype_str", [ + (3, "uint8"), # RGB uint8 + (4, "uint8"), # RGBA uint8 + (3, "uint16"), # RGB uint16 +]) +def test_gpu_predictor2_multisample_matches_cpu(tmp_path, samples, dtype_str): + """GPU decode of a tiled multi-sample TIFF with predictor=2 matches CPU. + + Before the fix, the GPU predictor=2 kernel over-indexed the + decompressed buffer by a factor of `samples` on the inner loop, + corrupting the decoded pixels and writing past the end of + ``d_decomp`` on the last tile. + """ + dtype = np.dtype(dtype_str) + h, w = 16, 16 + rng = np.random.RandomState(42) + if dtype.kind == 'u' and dtype.itemsize == 1: + data = rng.randint(0, 256, size=(h, w, samples), dtype=dtype) + else: + high = np.iinfo(dtype).max if dtype.kind in ('u', 'i') else 1000 + data = rng.randint(0, high, size=(h, w, samples), dtype=dtype) + + da = xr.DataArray(data, dims=['y', 'x', 'band']) + + path = str(tmp_path / f"rgb_pred_{samples}_{dtype_str}.tif") + # tile_size smaller than image so we exercise more than one tile. + to_geotiff(da, path, compression='deflate', tile_size=8, predictor=True) + + cpu_arr = open_geotiff(path).values + assert cpu_arr.shape == (h, w, samples) + assert cpu_arr.dtype == dtype + # Sanity: CPU round-trip reproduces the source exactly. + np.testing.assert_array_equal(cpu_arr, data) + + gpu_da = open_geotiff(path, gpu=True) + gpu_arr = _gpu_to_numpy(gpu_da) + + assert gpu_arr.shape == cpu_arr.shape + assert gpu_arr.dtype == cpu_arr.dtype + # Byte-for-byte equality is the whole point: predictor=2 is exact. + np.testing.assert_array_equal(gpu_arr, cpu_arr) + + +def test_gpu_predictor2_multisample_uneven_tiles(tmp_path): + """Image size not a multiple of tile size, multi-sample, predictor=2. + + Exercises partial edge tiles which share the same predictor decode + path and are most likely to trip any lingering stride mismatch. + """ + h, w, samples = 20, 20, 3 # 20 is not a multiple of 8 + rng = np.random.RandomState(7) + data = rng.randint(0, 256, size=(h, w, samples), dtype=np.uint8) + da = xr.DataArray(data, dims=['y', 'x', 'band']) + + path = str(tmp_path / "rgb_pred_uneven.tif") + to_geotiff(da, path, compression='deflate', tile_size=8, predictor=True) + + cpu_arr = open_geotiff(path).values + gpu_arr = _gpu_to_numpy(open_geotiff(path, gpu=True)) + + np.testing.assert_array_equal(gpu_arr, cpu_arr) + np.testing.assert_array_equal(gpu_arr, data)