From d699277fb590e32f6b9eeb46533d0013d74aeadb Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Wed, 13 May 2026 08:48:05 -0700 Subject: [PATCH 1/3] geotiff: reject ambiguous 3D writer inputs (#1812) to_geotiff and write_geotiff_gpu used to silently mishandle 3D DataArrays whose leading dim was not in _BAND_DIM_NAMES = ('band', 'bands', 'channel'). The moveaxis that puts (band, y, x) into the on-disk (y, x, band) layout was skipped, the writer kept the leading axis as the spatial y axis, and the round-trip produced a TIFF with silently swapped axes -- on read-back, out[:, :, 0].sum() != arr[0].sum(). Reject ambiguous 3D layouts at all three writer entry points (eager to_geotiff, dask streaming, write_geotiff_gpu) via the shared _validate_3d_writer_dims helper. Accepted layouts: (band, y, x) or (y, x, band) with band-name aliases bands/channel and spatial-name aliases lat/lon/latitude/longitude/row/col. Anything else raises ValueError with an actionable message (rename the non-spatial dim or transpose). Surfaced by the 2026-05-13 metadata propagation sweep. --- .claude/sweep-metadata-state.csv | 2 +- xrspatial/geotiff/__init__.py | 89 +++++++ .../test_to_geotiff_3d_dim_validation_1812.py | 232 ++++++++++++++++++ 3 files changed, 322 insertions(+), 1 deletion(-) create mode 100644 xrspatial/geotiff/tests/test_to_geotiff_3d_dim_validation_1812.py diff --git a/.claude/sweep-metadata-state.csv b/.claude/sweep-metadata-state.csv index a3f53c87..615fc939 100644 --- a/.claude/sweep-metadata-state.csv +++ b/.claude/sweep-metadata-state.csv @@ -1,3 +1,3 @@ module,last_inspected,issue,severity_max,categories_found,notes -geotiff,2026-05-12,1753,HIGH,2,"read_geotiff_gpu stripped-fallback windowed read on a non-georef TIFF emitted float64 [-0.5, -1.5, ...] coords via the default unit GeoTransform placeholder, while the eager numpy and dask paths emit int64 file-relative pixel coords. Regression of #1710 (fix missed the stripped GPU branch). The tiled-GPU helper _gpu_apply_window_band already gates on has_georef correctly; the stripped fallback only checked t is None. Fixed by routing both non-georef and t-is-None cases through the integer pixel-coord branch and using file-relative offsets to match every other backend. Verified 4-backend parity (numpy / cupy / dask+numpy / dask+cupy)." +geotiff,2026-05-13,1812,HIGH,3,"Re-audit 2026-05-13 (af588e03 worktree, branch deep-sweep-metadata-geotiff-2026-05-13-af588e03): 4-backend (numpy/cupy/dask+numpy/dask+cupy) parity verified for full reads, windowed reads, multi-band, miniswhite, nodata, extra_tags, gdal_metadata, x/y_resolution, image_description. All three writers (eager to_geotiff, dask streaming, write_geotiff_gpu) preserve rich tags through round-trip. VRT read attrs agree across open_geotiff / read_geotiff_dask / read_vrt (all 3 drop x/y_resolution -- VRT XML format limitation, LOW). NEW HIGH finding #1812: to_geotiff silently corrupted data when 3D dims[0] was not in _BAND_DIM_NAMES (e.g. dims=('time','y','x')); writer skipped moveaxis and treated leading axis as y -- round-trip produced shape (2,4,5) instead of (4,5,2) with swapped per-slice contents. Fix raises ValueError at all 3 writer entry points with an actionable message (transpose to (y,x,band) or rename to band/bands/channel). Existing LOW (int+nodata dtype dask always-promotes-to-float64 while eager/gpu keep int when sentinel absent) is documented and intentional per #1597." reproject,2026-05-10,1572;1573,HIGH,1;3;4,geoid_height_raster dropped input attrs and used dims[-2:] for 3D inputs (#1572). reproject/merge ignored nodatavals (rasterio convention) when rioxarray absent (#1573). Fixed in same branch. diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index 588cc763..eb7a1bfd 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -94,6 +94,58 @@ # a GeoTransform from coords (see ``_coords_to_transform`` and issue #1643). _BAND_DIM_NAMES = ('band', 'bands', 'channel') +# Spatial dim names recognised on 3D writer inputs. ``y``/``x`` are the +# canonical TIFF axes; aliases are accepted so a user who happens to use +# ``lat``/``lon`` or ``row``/``col`` is not bounced by the validator below. +_Y_DIM_NAMES = ('y', 'lat', 'latitude', 'row') +_X_DIM_NAMES = ('x', 'lon', 'longitude', 'col') + + +def _validate_3d_writer_dims(dims) -> None: + """Reject ambiguous 3D writer inputs (issue #1812). + + The writer interprets a 3D DataArray as either ``(band, y, x)`` or + ``(y, x, band)``. ``data.dims[0] in _BAND_DIM_NAMES`` decides which + branch fires the ``moveaxis``. Anything else (e.g. ``('time', 'y', 'x')``) + used to fall through silently: the writer kept the leading axis as + the spatial ``y`` axis and the result was a TIFF with the leading + axis values laid out along ``y`` (silent data corruption -- on + read-back the array round-tripped with a swapped shape). + + Refuse the ambiguous case at the entry point. The message tells the + caller exactly how to fix the input (rename to one of + ``_BAND_DIM_NAMES`` or transpose to ``(y, x, band)``). + """ + if len(dims) != 3: + return + d0, d1, d2 = dims + band_layout = (d0 in _BAND_DIM_NAMES + and d1 in _Y_DIM_NAMES + and d2 in _X_DIM_NAMES) + yxb_layout = (d0 in _Y_DIM_NAMES + and d1 in _X_DIM_NAMES + and d2 in _BAND_DIM_NAMES) + if band_layout or yxb_layout: + return + # Bare (y, x, *) or (*, y, x) where the third dim is unnamed but + # spatial -- the writer's old behaviour treats the non-spatial axis + # as bands. Accept that only when the unknown dim is in the band + # position (last), which matches how raw numpy callers typically + # build a band-last array. + if d0 in _Y_DIM_NAMES and d1 in _X_DIM_NAMES: + return + raise ValueError( + f"3D writer input has ambiguous dims {dims!r}. Expected " + f"(band, y, x) or (y, x, band); accepted band-dim aliases are " + f"{_BAND_DIM_NAMES} and spatial aliases are y={_Y_DIM_NAMES} / " + f"x={_X_DIM_NAMES}. Rename the non-spatial dim to 'band' or " + f"transpose the array so spatial dims come first (e.g. " + f"``da.transpose('y', 'x', {dims[0]!r})``). The writer cannot " + f"infer which axis is the band axis from arbitrary dim names " + f"and would otherwise silently treat the leading axis as the " + f"spatial y axis (issue #1812)." + ) + class GeoTIFFFallbackWarning(UserWarning): """Warning emitted when a geotiff helper falls back to a slower path. @@ -1388,6 +1440,15 @@ def to_geotiff(data: xr.DataArray | np.ndarray, (``b != 0`` or ``d != 0`` in rasterio ``Affine`` ordering). The on-disk GeoTIFF is axis-aligned; reproject onto an axis-aligned grid first. + ValueError + If ``data`` is a 3D DataArray whose ``dims`` is not + ``(band, y, x)`` or ``(y, x, band)`` (accepting the band-name + aliases ``bands`` / ``channel`` and spatial-name aliases + ``lat`` / ``lon`` / ``latitude`` / ``longitude`` / ``row`` / + ``col``). A leading non-band dim such as ``time`` is rejected + because the writer cannot infer the band axis from arbitrary + names and used to silently treat the leading axis as ``y`` + (issue #1812). """ from ._reader import _coerce_path @@ -1632,6 +1693,12 @@ def to_geotiff(data: xr.DataArray | np.ndarray, # destinations we materialise eagerly and assemble in-memory. if hasattr(raw, 'dask') and not cog and not _path_is_file_like: dask_arr = raw + # Reject ambiguous 3D layouts at the entry point so a leading + # non-band dim like ``('time', 'y', 'x')`` cannot silently + # round-trip as a TIFF whose ``y`` axis carries the time + # values (issue #1812). + if raw.ndim == 3: + _validate_3d_writer_dims(data.dims) # Handle band-first dimension order (band, y, x) -> (y, x, band) if raw.ndim == 3 and data.dims[0] in _BAND_DIM_NAMES: import dask.array as da @@ -1682,6 +1749,12 @@ def to_geotiff(data: xr.DataArray | np.ndarray, arr = arr.get() # Dask+CuPy -> numpy else: arr = np.asarray(raw) + # Reject ambiguous 3D layouts (issue #1812). The validator runs + # on ``data.dims`` (the original DataArray's dim names) rather + # than on ``arr`` so the error fires before the move-axis even + # for COG and file-like destinations that fall through here. + if arr.ndim == 3: + _validate_3d_writer_dims(data.dims) # Handle band-first dimension order (band, y, x) -> (y, x, band) if arr.ndim == 3 and data.dims[0] in _BAND_DIM_NAMES: arr = np.moveaxis(arr, 0, -1) @@ -3522,6 +3595,16 @@ def write_geotiff_gpu(data: xr.DataArray | cupy.ndarray | np.ndarray, GPU writer forwards this kwarg unchanged. Default ``'auto'`` writes MinIsBlack for any band count, so a 4-band raster is not silently tagged as RGB+alpha (issue #1769). + + Raises + ------ + ValueError + If ``data`` is a 3D DataArray whose ``dims`` is not + ``(band, y, x)`` or ``(y, x, band)`` (accepting band-name + aliases ``bands`` / ``channel`` and spatial-name aliases + ``lat`` / ``lon`` / ``row`` / ``col``). A leading non-band + dim such as ``time`` would otherwise silently round-trip with + the leading axis treated as ``y`` (issue #1812). """ if not tiled: raise ValueError( @@ -3603,6 +3686,12 @@ def write_geotiff_gpu(data: xr.DataArray | cupy.ndarray | np.ndarray, else: arr = cupy.asarray(np.asarray(arr)) # numpy -> GPU + # Reject ambiguous 3D layouts (issue #1812). Mirrors the gate + # in ``to_geotiff``: a leading non-band dim like ``time`` would + # otherwise round-trip with the leading axis silently treated + # as ``y``. + if arr.ndim == 3: + _validate_3d_writer_dims(data.dims) # Handle band-first dimension order (band, y, x) -> (y, x, band). # rioxarray and CF-style multi-band rasters land here; without # this remap the writer treats arr.shape[2] as the band axis and diff --git a/xrspatial/geotiff/tests/test_to_geotiff_3d_dim_validation_1812.py b/xrspatial/geotiff/tests/test_to_geotiff_3d_dim_validation_1812.py new file mode 100644 index 00000000..4ce9d2ad --- /dev/null +++ b/xrspatial/geotiff/tests/test_to_geotiff_3d_dim_validation_1812.py @@ -0,0 +1,232 @@ +"""Regression tests for issue #1812. + +``to_geotiff`` / ``write_geotiff_gpu`` used to silently mishandle 3D +DataArrays whose leading dim was not in +``_BAND_DIM_NAMES = ('band', 'bands', 'channel')``. The +``moveaxis(arr, 0, -1)`` that puts ``(band, y, x)`` into the on-disk +``(y, x, band)`` layout was skipped, the writer kept the leading axis +as the spatial ``y`` axis, and the result was a TIFF with silently +swapped axes -- on read-back, ``out[:, :, 0].sum() != arr[0].sum()``. + +The fix raises ``ValueError`` at the entry point of all three writers +(eager, dask streaming, and GPU) when ``dims`` is not unambiguously +one of ``(band, y, x)`` or ``(y, x, band)``. +""" +from __future__ import annotations + +import importlib.util + +import dask.array as dsk +import numpy as np +import pytest +import xarray as xr + +from xrspatial.geotiff import open_geotiff, to_geotiff + + +def _gpu_available() -> bool: + if importlib.util.find_spec("cupy") is None: + return False + try: + import cupy + + return bool(cupy.cuda.is_available()) + except Exception: + return False + + +_HAS_GPU = _gpu_available() +_gpu_only = pytest.mark.skipif(not _HAS_GPU, reason="cupy + CUDA required") + + +# Inputs that *must* raise. Each tuple is (dims, shape). +_AMBIGUOUS_3D_INPUTS = [ + pytest.param(("time", "y", "x"), (2, 4, 5), id="time-y-x"), + pytest.param(("z", "y", "x"), (2, 4, 5), id="z-y-x"), + pytest.param(("band", "lat", "lon"), (2, 4, 5), id="band-lat-lon"), # ok via alias + pytest.param(("y", "x", "depth"), (4, 5, 2), id="y-x-depth"), # accepted: spatial-first + pytest.param(("foo", "bar", "baz"), (2, 4, 5), id="foo-bar-baz"), +] + + +# Inputs that must be accepted (round-trip cleanly). +_HAPPY_3D_INPUTS = [ + pytest.param(("band", "y", "x"), (3, 4, 5), id="band-y-x"), + pytest.param(("bands", "y", "x"), (3, 4, 5), id="bands-y-x"), + pytest.param(("channel", "y", "x"), (3, 4, 5), id="channel-y-x"), + pytest.param(("y", "x", "band"), (4, 5, 3), id="y-x-band"), + pytest.param(("lat", "lon", "band"), (4, 5, 3), id="lat-lon-band"), + pytest.param(("row", "col", "channel"), (4, 5, 3), id="row-col-channel"), + pytest.param(("band", "lat", "lon"), (3, 4, 5), id="band-lat-lon-alias"), +] + + +def _make_da(dims, shape, dtype=np.uint8, backend="numpy"): + if backend == "numpy": + arr = np.arange(int(np.prod(shape)), dtype=dtype).reshape(shape) + elif backend == "dask": + arr_np = np.arange(int(np.prod(shape)), dtype=dtype).reshape(shape) + arr = dsk.from_array(arr_np, chunks=2) + elif backend == "cupy": + import cupy + + arr = cupy.arange(int(np.prod(shape)), + dtype=cupy.dtype(dtype)).reshape(shape) + else: + raise ValueError(backend) + return xr.DataArray(arr, dims=dims, attrs={"crs": "EPSG:4326"}) + + +def test_repro_silent_corruption_now_raises(tmp_path): + """The original repro now raises a clear ValueError.""" + arr = np.zeros((2, 4, 5), dtype=np.uint8) + arr[0] = 1 + arr[1] = 2 + da = xr.DataArray(arr, dims=("time", "y", "x"), + attrs={"crs": "EPSG:4326"}) + out_path = tmp_path / "tmp_1812_time_y_x.tif" + with pytest.raises(ValueError, match="ambiguous dims"): + to_geotiff(da, str(out_path), crs=4326) + + +@pytest.mark.parametrize("dims, shape", [ + pytest.param(("time", "y", "x"), (2, 4, 5), id="time-y-x"), + pytest.param(("z", "y", "x"), (2, 4, 5), id="z-y-x"), + pytest.param(("foo", "bar", "baz"), (2, 4, 5), id="foo-bar-baz"), +]) +def test_eager_rejects_ambiguous_3d(tmp_path, dims, shape): + """Eager numpy path raises ValueError on ambiguous 3D dim names.""" + da = _make_da(dims, shape, backend="numpy") + out_path = tmp_path / f"tmp_1812_eager_{'_'.join(dims)}.tif" + with pytest.raises(ValueError, match="ambiguous dims"): + to_geotiff(da, str(out_path), crs=4326) + + +@pytest.mark.parametrize("dims, shape", [ + pytest.param(("time", "y", "x"), (2, 4, 5), id="time-y-x"), + pytest.param(("z", "y", "x"), (2, 4, 5), id="z-y-x"), + pytest.param(("foo", "bar", "baz"), (2, 4, 5), id="foo-bar-baz"), +]) +def test_dask_streaming_rejects_ambiguous_3d(tmp_path, dims, shape): + """Dask-streaming branch raises ValueError on ambiguous 3D dim names.""" + da = _make_da(dims, shape, backend="dask") + out_path = tmp_path / f"tmp_1812_dask_{'_'.join(dims)}.tif" + with pytest.raises(ValueError, match="ambiguous dims"): + to_geotiff(da, str(out_path), crs=4326) + + +@_gpu_only +@pytest.mark.parametrize("dims, shape", [ + pytest.param(("time", "y", "x"), (2, 4, 5), id="time-y-x"), + pytest.param(("foo", "bar", "baz"), (2, 4, 5), id="foo-bar-baz"), +]) +def test_gpu_writer_rejects_ambiguous_3d(tmp_path, dims, shape): + """GPU writer raises ValueError on ambiguous 3D dim names.""" + from xrspatial.geotiff import write_geotiff_gpu + + da = _make_da(dims, shape, backend="cupy") + out_path = tmp_path / f"tmp_1812_gpu_{'_'.join(dims)}.tif" + with pytest.raises(ValueError, match="ambiguous dims"): + write_geotiff_gpu(da, str(out_path), crs=4326) + + +@pytest.mark.parametrize("dims, shape", _HAPPY_3D_INPUTS) +def test_happy_3d_round_trip(tmp_path, dims, shape): + """Accepted 3D dim layouts still round-trip cleanly (eager + dask). + + Each slice along the band axis is filled with a distinct constant + so a silent axis swap would change the per-slice sums. + """ + # Build a per-slice-distinguishable array. ``arr_full[k]`` along the + # band axis is filled with ``k + 1``. + band_pos = next(i for i, d in enumerate(dims) + if d in ("band", "bands", "channel")) + n_bands = shape[band_pos] + spatial_shape = tuple(s for i, s in enumerate(shape) if i != band_pos) + arr_np = np.empty(shape, dtype=np.uint8) + for k in range(n_bands): + slicer = [slice(None)] * 3 + slicer[band_pos] = k + arr_np[tuple(slicer)] = k + 1 + + # Eager round-trip + da_eager = xr.DataArray(arr_np, dims=dims, + attrs={"crs": "EPSG:4326"}) + p_eager = tmp_path / f"tmp_1812_happy_eager_{'_'.join(dims)}.tif" + to_geotiff(da_eager, str(p_eager), crs=4326) + out_eager = open_geotiff(str(p_eager)) + # On-disk layout is always (y, x, band). Compare per-band sums. + assert out_eager.shape == spatial_shape + (n_bands,) + for k in range(n_bands): + expected = (k + 1) * (spatial_shape[0] * spatial_shape[1]) + assert int(out_eager.values[:, :, k].sum()) == expected, ( + f"band {k} sum mismatch on eager round-trip of dims={dims}" + ) + + # Dask streaming round-trip + da_dask = xr.DataArray(dsk.from_array(arr_np, chunks=2), dims=dims, + attrs={"crs": "EPSG:4326"}) + p_dask = tmp_path / f"tmp_1812_happy_dask_{'_'.join(dims)}.tif" + to_geotiff(da_dask, str(p_dask), crs=4326) + out_dask = open_geotiff(str(p_dask)) + assert out_dask.shape == spatial_shape + (n_bands,) + for k in range(n_bands): + expected = (k + 1) * (spatial_shape[0] * spatial_shape[1]) + assert int(out_dask.values[:, :, k].sum()) == expected, ( + f"band {k} sum mismatch on dask round-trip of dims={dims}" + ) + + +def test_2d_still_works(tmp_path): + """2D inputs are unaffected by the new 3D validator.""" + arr = np.arange(20, dtype=np.uint8).reshape(4, 5) + da = xr.DataArray(arr, dims=("y", "x"), attrs={"crs": "EPSG:4326"}) + p = tmp_path / "tmp_1812_2d.tif" + to_geotiff(da, str(p), crs=4326) + out = open_geotiff(str(p)) + assert out.shape == (4, 5) + assert np.array_equal(out.values, arr) + + +def test_error_message_actionable(tmp_path): + """The ValueError message tells the caller how to fix the input.""" + arr = np.zeros((2, 4, 5), dtype=np.uint8) + da = xr.DataArray(arr, dims=("time", "y", "x"), + attrs={"crs": "EPSG:4326"}) + p = tmp_path / "tmp_1812_msg.tif" + with pytest.raises(ValueError) as excinfo: + to_geotiff(da, str(p), crs=4326) + msg = str(excinfo.value) + # Mentions the offending dim layout + assert "('time', 'y', 'x')" in msg + # Mentions the accepted alternatives + assert "(band, y, x)" in msg + assert "(y, x, band)" in msg + # Points the user at a concrete remediation + assert "transpose" in msg.lower() or "rename" in msg.lower() + # References the issue + assert "#1812" in msg + + +@_gpu_only +def test_gpu_writer_happy_path_still_works(tmp_path): + """GPU writer's existing happy paths (band-first and band-last) survive.""" + import cupy + + from xrspatial.geotiff import write_geotiff_gpu + + arr_bf = cupy.arange(3 * 4 * 5, dtype=cupy.uint8).reshape(3, 4, 5) + da_bf = xr.DataArray(arr_bf, dims=("band", "y", "x"), + attrs={"crs": "EPSG:4326"}) + p_bf = tmp_path / "tmp_1812_gpu_bf.tif" + write_geotiff_gpu(da_bf, str(p_bf), crs=4326) + out_bf = open_geotiff(str(p_bf)) + assert out_bf.shape == (4, 5, 3) + + arr_bl = cupy.arange(4 * 5 * 3, dtype=cupy.uint8).reshape(4, 5, 3) + da_bl = xr.DataArray(arr_bl, dims=("y", "x", "band"), + attrs={"crs": "EPSG:4326"}) + p_bl = tmp_path / "tmp_1812_gpu_bl.tif" + write_geotiff_gpu(da_bl, str(p_bl), crs=4326) + out_bl = open_geotiff(str(p_bl)) + assert out_bl.shape == (4, 5, 3) From c3daa54668fbc717447f2b839de8eadffde3df3b Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Wed, 13 May 2026 10:16:16 -0700 Subject: [PATCH 2/3] geotiff: remove unused _AMBIGUOUS_3D_INPUTS test list (#1820 review) --- .../tests/test_to_geotiff_3d_dim_validation_1812.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/xrspatial/geotiff/tests/test_to_geotiff_3d_dim_validation_1812.py b/xrspatial/geotiff/tests/test_to_geotiff_3d_dim_validation_1812.py index 4ce9d2ad..5726bc67 100644 --- a/xrspatial/geotiff/tests/test_to_geotiff_3d_dim_validation_1812.py +++ b/xrspatial/geotiff/tests/test_to_geotiff_3d_dim_validation_1812.py @@ -39,16 +39,6 @@ def _gpu_available() -> bool: _gpu_only = pytest.mark.skipif(not _HAS_GPU, reason="cupy + CUDA required") -# Inputs that *must* raise. Each tuple is (dims, shape). -_AMBIGUOUS_3D_INPUTS = [ - pytest.param(("time", "y", "x"), (2, 4, 5), id="time-y-x"), - pytest.param(("z", "y", "x"), (2, 4, 5), id="z-y-x"), - pytest.param(("band", "lat", "lon"), (2, 4, 5), id="band-lat-lon"), # ok via alias - pytest.param(("y", "x", "depth"), (4, 5, 2), id="y-x-depth"), # accepted: spatial-first - pytest.param(("foo", "bar", "baz"), (2, 4, 5), id="foo-bar-baz"), -] - - # Inputs that must be accepted (round-trip cleanly). _HAPPY_3D_INPUTS = [ pytest.param(("band", "y", "x"), (3, 4, 5), id="band-y-x"), From 9be931d3bc4257bd5a6c1d32bd1ee5b2711d77a8 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Wed, 13 May 2026 10:13:49 -0700 Subject: [PATCH 3/3] geotiff: per-tile dim check uses default cap, not caller budget (#1823) PR #1803 forwarded the caller's max_pixels to read_to_array inside read_vrt's source loop so a tiny VRT output cannot force a huge source decode (#1796). The output-window check at the source read enforces that correctly. A separate per-tile dimension check at the same call sites also consumed the caller's max_pixels, so a caller setting max_pixels as an output budget (e.g. 10_000) failed the per-tile sanity check on any normal source whose default tile size is 256x256 (= 65_536 pixels). Use MAX_PIXELS_DEFAULT for the per-tile dim check at the two call sites in _read_tiles (local) and _read_tiles_cog_http (HTTP). The output-window check at the same functions continues to enforce the user-supplied max_pixels, preserving the #1796 protection. --- xrspatial/geotiff/_reader.py | 19 ++- .../tests/test_vrt_source_tile_check_1823.py | 113 ++++++++++++++++++ 2 files changed, 127 insertions(+), 5 deletions(-) create mode 100644 xrspatial/geotiff/tests/test_vrt_source_tile_check_1823.py diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py index b3da32bd..c174c3fc 100644 --- a/xrspatial/geotiff/_reader.py +++ b/xrspatial/geotiff/_reader.py @@ -1560,9 +1560,14 @@ def _read_tiles(data: bytes, ifd: IFD, header: TIFFHeader, raise ValueError( f"Invalid tile dimensions: TileWidth={tw}, TileLength={th}") - # Reject crafted tile dims that would force huge per-tile allocations. - # A single tile's decoded bytes must also fit under the pixel budget. - _check_dimensions(tw, th, samples, max_pixels) + # Reject crafted tile dims (e.g. TileWidth = 2**31). This guards the + # TIFF header against malformed values; it is not the caller's output + # budget. The output-window check below uses ``max_pixels`` and is + # what enforces the user's per-call memory cap. The source-read path + # under ``read_vrt`` (#1796) relies on that output check to honour a + # small caller ``max_pixels`` against a normal-tile source; see + # #1823. + _check_dimensions(tw, th, samples, MAX_PIXELS_DEFAULT) # Per-tile compressed-byte cap (issue #1664). Same env var as the # HTTP path. mmap slicing is bounded by the file size, but the slice @@ -2016,10 +2021,14 @@ def _fetch_decode_cog_http_tiles( # A windowed HTTP read of a multi-billion-pixel COG only allocates # the window, so capping the full image would reject legitimate # tiled reads. The full-image cap still applies for whole-file - # reads (window is None). The single-tile budget always applies. + # reads (window is None). The per-tile dim check below guards the + # TIFF header against absurd ``TileWidth`` / ``TileLength`` values + # (e.g. 2**31) and uses ``MAX_PIXELS_DEFAULT`` so a caller's small + # ``max_pixels`` -- intended as an output-window budget -- does not + # reject normal 256x256 tiles. See #1823. if window is None: _check_dimensions(width, height, samples, max_pixels) - _check_dimensions(tw, th, samples, max_pixels) + _check_dimensions(tw, th, samples, MAX_PIXELS_DEFAULT) # Reject malformed TIFFs whose declared tile grid exceeds the supplied # TileOffsets length. See issue #1219. diff --git a/xrspatial/geotiff/tests/test_vrt_source_tile_check_1823.py b/xrspatial/geotiff/tests/test_vrt_source_tile_check_1823.py new file mode 100644 index 00000000..b02dd2fb --- /dev/null +++ b/xrspatial/geotiff/tests/test_vrt_source_tile_check_1823.py @@ -0,0 +1,113 @@ +"""Regression tests for #1823. + +PR #1803 forwarded the caller's ``max_pixels`` to ``read_to_array`` inside +the VRT source loop so that a tiny VRT output could not force a huge +source decode (#1796). The output-window check enforces that. A separate +per-tile dimension check was incorrectly using the same ``max_pixels`` +value, so a caller setting ``max_pixels`` as an output budget (e.g. +10,000) would also fail the per-tile sanity check on every normal source +whose default tile size is 256x256 (= 65,536 pixels). + +The #1796 protection remains: the output-window check still catches a +tiny VRT output that asks for a large source window. +""" +from __future__ import annotations + +import os +import tempfile + +import numpy as np +import pytest + +from xrspatial.geotiff import to_geotiff +from xrspatial.geotiff._reader import PixelSafetyLimitError +from xrspatial.geotiff._vrt import read_vrt + + +def _write_normal_tile_source(td: str) -> str: + """10x10 uint8 source -- ``to_geotiff`` pads to a 256x256 tile.""" + src = os.path.join(td, 'src.tif') + to_geotiff(np.zeros((10, 10), dtype=np.uint8), src, compression='none') + return src + + +def _write_vrt(td: str, *, dst_x_size: int, dst_y_size: int, + raster_x: int = 100, raster_y: int = 100, + src_x_size: int = 10, src_y_size: int = 10) -> str: + vrt = os.path.join(td, 'mosaic.vrt') + xml = ( + f'\n' + f' \n' + f' \n' + f' src.tif\n' + f' 1\n' + f' \n' + f' \n' + f' \n' + f' \n' + f'\n' + ) + with open(vrt, 'w') as f: + f.write(xml) + return vrt + + +class TestPerTileCheckDoesNotUseCallerBudget: + """Per-tile dim sanity must not reject normal 256x256 source tiles + when the caller's ``max_pixels`` is a small output-budget value.""" + + def test_normal_tile_source_with_small_max_pixels(self): + with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: + _write_normal_tile_source(td) + vrt = _write_vrt(td, dst_x_size=100, dst_y_size=100) + arr, _ = read_vrt(vrt, max_pixels=10_000) + assert arr.shape == (100, 100) + + def test_normal_tile_source_with_tiny_max_pixels(self): + """An output budget below a single tile must still succeed when + the requested output window itself fits.""" + with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: + _write_normal_tile_source(td) + # Output 5x5 = 25 pixels; max_pixels = 100 fits 25 with room. + vrt = _write_vrt(td, dst_x_size=5, dst_y_size=5, + raster_x=5, raster_y=5) + arr, _ = read_vrt(vrt, max_pixels=100) + assert arr.shape == (5, 5) + + +class TestOutputWindowCheckStillEnforced: + """The output-window check at the source read still rejects an + over-budget read; the #1796 protection is preserved.""" + + def test_output_window_exceeds_max_pixels_still_rejected(self): + with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: + src = os.path.join(td, 'src.tif') + to_geotiff(np.arange(16, dtype=np.uint8).reshape(4, 4), + src, compression='none') + vrt = _write_vrt(td, dst_x_size=1, dst_y_size=1, + raster_x=1, raster_y=1, + src_x_size=4, src_y_size=4) + # SrcRect 4x4 = 16 pixels > max_pixels=1 → output check fires. + with pytest.raises(ValueError, match="exceed"): + read_vrt(vrt, max_pixels=1) + + +class TestPerTileCheckStillRejectsCraftedHeader: + """A pathological ``TileWidth``/``TileLength`` must still fail at + the per-tile sanity check, which uses ``MAX_PIXELS_DEFAULT``.""" + + def test_per_tile_check_caps_at_default(self, monkeypatch): + """Lower ``MAX_PIXELS_DEFAULT`` to verify the per-tile call site + is wired to it (rather than to the caller's budget).""" + from xrspatial.geotiff import _reader as reader_mod + + monkeypatch.setattr(reader_mod, "MAX_PIXELS_DEFAULT", 100) + with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: + _write_normal_tile_source(td) + vrt = _write_vrt(td, dst_x_size=100, dst_y_size=100) + # 256x256 tile > patched MAX_PIXELS_DEFAULT=100 → per-tile + # check fires regardless of caller's max_pixels (1e9 here). + with pytest.raises(PixelSafetyLimitError, match="65,536"): + read_vrt(vrt, max_pixels=1_000_000_000)