From 36b9d51e9829a26f3ea239edbdc22e888d7f2c0f Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Wed, 13 May 2026 08:38:54 -0700 Subject: [PATCH 1/3] geotiff: forward missing_sources from open_geotiff to read_vrt (#1810) read_vrt gained a missing_sources='warn'|'raise' policy kwarg in #1806 but the documented dispatcher open_geotiff did not expose it. Callers who wanted strict failure on broken VRT sources had to call read_vrt directly, defeating the dispatcher contract. Same class of bug as #1561 / #1605 / #1685 / #1795: a backend kwarg reachable on the routed-to function is unreachable through the documented entry point. Fix mirrors the on_gpu_failure pattern that open_geotiff already uses for GPU-only kwargs: - Add missing_sources= to open_geotiff with a sentinel default so the dispatcher can tell "caller never set it" (skip forwarding, let the read_vrt default of 'warn' apply) from "caller set it" (forward verbatim). - Reject missing_sources on non-VRT sources with a clear ValueError so callers learn the policy is being ignored instead of getting a silent drop. - Document the kwarg in the open_geotiff docstring with the same values and semantics as read_vrt. Regression tests in test_open_geotiff_missing_sources_1810.py cover the signature, warn/raise/invalid policies through the dispatcher, non-VRT rejection, and the no-kwarg backward-compatible path. --- xrspatial/geotiff/__init__.py | 38 ++++- .../test_open_geotiff_missing_sources_1810.py | 146 ++++++++++++++++++ 2 files changed, 182 insertions(+), 2 deletions(-) create mode 100644 xrspatial/geotiff/tests/test_open_geotiff_missing_sources_1810.py diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index 588cc763..7231ce6b 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -87,6 +87,12 @@ # None is itself a value a caller could supply alongside crs=. See # issue #1715. _CRS_WKT_DEPRECATED_SENTINEL = object() +# ``open_geotiff`` needs to tell "caller never set missing_sources" (default +# sentinel: skip forwarding so the read_vrt default applies, and reject the +# kwarg up front for non-VRT sources) from "caller set missing_sources=" +# (forward verbatim to read_vrt). Mirrors the on_gpu_failure pattern. See +# issue #1810. +_MISSING_SOURCES_SENTINEL = object() # Names of dims that ``to_geotiff`` / ``write_geotiff_gpu`` treat as the # non-spatial band axis. Used both to remap ``(band, y, x)`` inputs to @@ -692,6 +698,7 @@ def open_geotiff(source: str | BinaryIO, *, gpu: bool = False, max_pixels: int | None = None, on_gpu_failure: str = _ON_GPU_FAILURE_SENTINEL, + missing_sources: str = _MISSING_SOURCES_SENTINEL, ) -> xr.DataArray: """Read a GeoTIFF, COG, or VRT file into an xarray.DataArray. @@ -737,6 +744,14 @@ def open_geotiff(source: str | BinaryIO, *, Passing this kwarg with ``gpu=False`` raises ``ValueError`` because the policy only applies to the GPU pipeline. See ``read_geotiff_gpu`` for the full description. + missing_sources : {'warn', 'raise'}, optional + Forwarded to ``read_vrt`` when the source is a ``.vrt`` file. + ``'warn'`` preserves the historical behavior: emit + ``GeoTIFFFallbackWarning``, record ``attrs['vrt_holes']``, and + return a partial mosaic. ``'raise'`` fails immediately. Passing + this kwarg with a non-VRT source raises ``ValueError`` because + the policy only applies to the VRT pipeline. See ``read_vrt`` + for the full description. Returns ------- @@ -782,8 +797,24 @@ def open_geotiff(source: str | BinaryIO, *, "Pass gpu=True to enable the GPU pipeline, or drop " "on_gpu_failure to keep the default CPU path.") + # ``missing_sources`` is VRT-only. Reject it up front when the source + # is not a ``.vrt`` file so callers learn the policy is being ignored + # instead of getting a silent drop -- same pattern ``on_gpu_failure`` + # uses above for the GPU-only kwarg, and the same class of dispatcher + # silently-drops-backend-kwarg bug #1561 / #1605 / #1685 / #1795 fixed + # for the other VRT/GPU kwargs. See issue #1810. + missing_sources_passed = ( + missing_sources is not _MISSING_SOURCES_SENTINEL) + _is_vrt_source = ( + isinstance(source, str) and source.lower().endswith('.vrt')) + if missing_sources_passed and not _is_vrt_source: + raise ValueError( + "missing_sources only applies to VRT sources. " + "Pass a .vrt path to enable the VRT pipeline, or drop " + "missing_sources to keep the default GeoTIFF path.") + # VRT files (string paths only -- VRT XML references other files on disk) - if isinstance(source, str) and source.lower().endswith('.vrt'): + if _is_vrt_source: # ``read_vrt`` does not accept ``overview_level`` (the VRT XML # references its own source files; overview selection would need # to apply to each one). Silently dropping the kwarg was the same @@ -810,9 +841,12 @@ def open_geotiff(source: str | BinaryIO, *, "VRT reads do not go through the GPU decoder pipeline; " "drop the kwarg or call read_geotiff_gpu directly on a " ".tif source.") + vrt_kwargs = {} + if missing_sources_passed: + vrt_kwargs['missing_sources'] = missing_sources return read_vrt(source, dtype=dtype, window=window, band=band, name=name, chunks=chunks, gpu=gpu, - max_pixels=max_pixels) + max_pixels=max_pixels, **vrt_kwargs) # File-like buffers don't support the GPU or dask code paths because # those re-open the source by path from worker tasks or device-side diff --git a/xrspatial/geotiff/tests/test_open_geotiff_missing_sources_1810.py b/xrspatial/geotiff/tests/test_open_geotiff_missing_sources_1810.py new file mode 100644 index 00000000..5e23bacd --- /dev/null +++ b/xrspatial/geotiff/tests/test_open_geotiff_missing_sources_1810.py @@ -0,0 +1,146 @@ +"""Regression test for #1810: ``open_geotiff`` did not accept +``missing_sources`` and did not forward it to ``read_vrt`` when the +source was a VRT. + +The api-consistency sweep on 2026-05-13 flagged that ``read_vrt`` had +gained a ``missing_sources='warn'|'raise'`` policy kwarg (#1806) but +the documented dispatcher entry point ``open_geotiff`` did not expose +it. Callers wanting strict failure had to call ``read_vrt`` directly. +Same class of dispatcher-drops-backend-kwarg bug as #1561, #1605, +#1685, #1795. +""" +from __future__ import annotations + +import inspect + +import numpy as np +import pytest +import xarray as xr + +from xrspatial.geotiff import ( + GeoTIFFFallbackWarning, + open_geotiff, + to_geotiff, + write_vrt, +) + + +def _write_missing_source_vrt(path): + """Write a VRT pointing at a non-existent source file. + + Mirrors the helper in ``test_vrt_missing_sources_policy_1799.py`` so + the warn-vs-raise branches surface the same way through the + dispatcher as they do via the direct ``read_vrt`` call. + """ + path.write_text( + '\n' + ' \n' + ' \n' + ' missing.tif' + '\n' + ' 1\n' + ' \n' + ' \n' + ' \n' + ' \n' + '\n' + ) + + +def test_open_geotiff_accepts_missing_sources(): + """The signature exposes ``missing_sources`` so IDE autocomplete and + ``inspect.signature`` see the kwarg without parsing the docstring. + """ + sig = inspect.signature(open_geotiff) + assert 'missing_sources' in sig.parameters + + +def test_open_geotiff_vrt_missing_sources_warn(tmp_path): + """``missing_sources='warn'`` matches the historic behavior: + ``GeoTIFFFallbackWarning`` is emitted and ``attrs['vrt_holes']`` is + populated. Going through the dispatcher must produce the same + result as calling ``read_vrt`` directly. + """ + vrt = tmp_path / "tmp_1810_missing_warn.vrt" + _write_missing_source_vrt(vrt) + + with pytest.warns(GeoTIFFFallbackWarning, match="could not be read"): + da = open_geotiff(str(vrt), missing_sources='warn') + + assert 'vrt_holes' in da.attrs + assert da.attrs['vrt_holes'][0]['source'].endswith('missing.tif') + + +def test_open_geotiff_vrt_missing_sources_raise(tmp_path): + """``missing_sources='raise'`` propagates through the dispatcher and + fails fast instead of silently returning a partial mosaic. + """ + vrt = tmp_path / "tmp_1810_missing_raise.vrt" + _write_missing_source_vrt(vrt) + + with pytest.raises((OSError, ValueError)): + open_geotiff(str(vrt), missing_sources='raise') + + +def test_open_geotiff_vrt_missing_sources_validates_policy(tmp_path): + """The validator on ``read_vrt`` fires through the dispatcher path + so callers get the same parameter-named error from either entry + point. + """ + vrt = tmp_path / "tmp_1810_missing_bad_policy.vrt" + _write_missing_source_vrt(vrt) + + with pytest.raises(ValueError, match="missing_sources"): + open_geotiff(str(vrt), missing_sources='ignore') + + +def test_open_geotiff_rejects_missing_sources_on_tif(tmp_path): + """``missing_sources`` is VRT-only; passing it with a ``.tif`` source + raises rather than silently doing nothing. Mirrors the + ``on_gpu_failure`` rejection pattern. + """ + arr = np.arange(16, dtype=np.uint16).reshape(4, 4) + da = xr.DataArray( + arr, dims=['y', 'x'], + coords={'y': np.arange(4, dtype=np.float64), + 'x': np.arange(4, dtype=np.float64)}, + attrs={'crs': 4326}, + ) + tif_path = tmp_path / "single_tile.tif" + to_geotiff(da, str(tif_path)) + + with pytest.raises(ValueError, match="missing_sources only applies"): + open_geotiff(str(tif_path), missing_sources='raise') + + +def test_open_geotiff_vrt_without_missing_sources_kwarg_still_works(tmp_path): + """Omitting ``missing_sources`` is a no-op: behaviour matches the + pre-fix dispatcher (and the ``read_vrt`` default of ``'warn'``). + The existing two-tile VRT exercises the happy path so callers + upgrading from #1806 do not see a regression. + """ + arr_a = np.arange(16, dtype=np.uint16).reshape(4, 4) + da_a = xr.DataArray( + arr_a, dims=['y', 'x'], + coords={'y': np.array([0.5, 1.5, 2.5, 3.5]), + 'x': np.array([0.5, 1.5, 2.5, 3.5])}, + attrs={'crs': 4326}, + ) + tile_a = tmp_path / "tile_a.tif" + to_geotiff(da_a, str(tile_a)) + + arr_b = np.arange(16, 32, dtype=np.uint16).reshape(4, 4) + da_b = xr.DataArray( + arr_b, dims=['y', 'x'], + coords={'y': np.array([0.5, 1.5, 2.5, 3.5]), + 'x': np.array([4.5, 5.5, 6.5, 7.5])}, + attrs={'crs': 4326}, + ) + tile_b = tmp_path / "tile_b.tif" + to_geotiff(da_b, str(tile_b)) + + vrt_path = tmp_path / "mosaic_1810.vrt" + write_vrt(str(vrt_path), [str(tile_a), str(tile_b)]) + + da = open_geotiff(str(vrt_path)) + assert da.shape == (4, 8) From a58db09b7734a8c091ef67596b1bcd015dbd6724 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Wed, 13 May 2026 08:39:04 -0700 Subject: [PATCH 2/3] sweep: update api-consistency state for geotiff (v5) Record the v5 api-consistency sweep against geotiff. One MEDIUM finding filed and fixed (#1810). Cross-sibling write-return-type drift remains deferred at LOW. --- .claude/sweep-api-consistency-state.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.claude/sweep-api-consistency-state.csv b/.claude/sweep-api-consistency-state.csv index 76ec19a2..d1e60c54 100644 --- a/.claude/sweep-api-consistency-state.csv +++ b/.claude/sweep-api-consistency-state.csv @@ -1,3 +1,3 @@ module,last_inspected,issue,severity_max,categories_found,notes -geotiff,2026-05-13,1775,MEDIUM,3,"Sweep v4 (deep-sweep-api-consistency-geotiff-2026-05-13). 1 MEDIUM finding filed and fixed: #1775 reader trio dtype kwarg lacks str|np.dtype|None annotation (Cat 3). Annotation added on open_geotiff, read_geotiff_dask, read_geotiff_gpu, and read_vrt; pinning tests extended in test_signature_annotations_1654.py. Cross-sibling return-type drift surfaced for follow-up but not fixed (Cat 2): write_vrt returns str while to_geotiff and write_geotiff_gpu return None -- behaviour change deferred. Prior sweep findings (#1654 #1683 #1684 #1685 #1705 #1754) all confirmed fixed. cuda-validated." +geotiff,2026-05-13,1810,MEDIUM,5,"Sweep v5 (deep-sweep-api-consistency-geotiff-2026-05-13). 1 MEDIUM finding filed and fixed: #1810 open_geotiff dispatcher dropped missing_sources kwarg when routing to read_vrt (Cat 5, same class as #1561/#1605/#1685/#1795). Fix mirrors the on_gpu_failure pattern: sentinel default, forward to read_vrt for .vrt sources, reject for non-VRT sources. Regression test in test_open_geotiff_missing_sources_1810.py. Prior sweep findings (#1654 #1683 #1684 #1685 #1705 #1715 #1754 #1775) all confirmed fixed. Cross-sibling return-type drift (Cat 2): write_vrt returns str while to_geotiff and write_geotiff_gpu return None -- still deferred (LOW, callers do not substitute these writers). cuda-validated." reproject,2026-05-10,1570,HIGH,2;5,"Filed cross-module attrs['vertical_crs'] type collision (string vs EPSG int) vs xrspatial.geotiff. Fixed in PR (TBD): reproject now writes EPSG int and preserves friendly token under vertical_datum. MEDIUM kwarg-order drift (transform_precision vs chunk_size) and missing type hints vs geotiff documented but not fixed (cosmetic, kwarg-only)." From 4666bf8b8380477b37a4b75111ffa6445e63f629 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Wed, 13 May 2026 10:13:49 -0700 Subject: [PATCH 3/3] geotiff: per-tile dim check uses default cap, not caller budget (#1823) PR #1803 forwarded the caller's max_pixels to read_to_array inside read_vrt's source loop so a tiny VRT output cannot force a huge source decode (#1796). The output-window check at the source read enforces that correctly. A separate per-tile dimension check at the same call sites also consumed the caller's max_pixels, so a caller setting max_pixels as an output budget (e.g. 10_000) failed the per-tile sanity check on any normal source whose default tile size is 256x256 (= 65_536 pixels). Use MAX_PIXELS_DEFAULT for the per-tile dim check at the two call sites in _read_tiles (local) and _read_tiles_cog_http (HTTP). The output-window check at the same functions continues to enforce the user-supplied max_pixels, preserving the #1796 protection. --- xrspatial/geotiff/_reader.py | 19 ++- .../tests/test_vrt_source_tile_check_1823.py | 113 ++++++++++++++++++ 2 files changed, 127 insertions(+), 5 deletions(-) create mode 100644 xrspatial/geotiff/tests/test_vrt_source_tile_check_1823.py diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py index b3da32bd..c174c3fc 100644 --- a/xrspatial/geotiff/_reader.py +++ b/xrspatial/geotiff/_reader.py @@ -1560,9 +1560,14 @@ def _read_tiles(data: bytes, ifd: IFD, header: TIFFHeader, raise ValueError( f"Invalid tile dimensions: TileWidth={tw}, TileLength={th}") - # Reject crafted tile dims that would force huge per-tile allocations. - # A single tile's decoded bytes must also fit under the pixel budget. - _check_dimensions(tw, th, samples, max_pixels) + # Reject crafted tile dims (e.g. TileWidth = 2**31). This guards the + # TIFF header against malformed values; it is not the caller's output + # budget. The output-window check below uses ``max_pixels`` and is + # what enforces the user's per-call memory cap. The source-read path + # under ``read_vrt`` (#1796) relies on that output check to honour a + # small caller ``max_pixels`` against a normal-tile source; see + # #1823. + _check_dimensions(tw, th, samples, MAX_PIXELS_DEFAULT) # Per-tile compressed-byte cap (issue #1664). Same env var as the # HTTP path. mmap slicing is bounded by the file size, but the slice @@ -2016,10 +2021,14 @@ def _fetch_decode_cog_http_tiles( # A windowed HTTP read of a multi-billion-pixel COG only allocates # the window, so capping the full image would reject legitimate # tiled reads. The full-image cap still applies for whole-file - # reads (window is None). The single-tile budget always applies. + # reads (window is None). The per-tile dim check below guards the + # TIFF header against absurd ``TileWidth`` / ``TileLength`` values + # (e.g. 2**31) and uses ``MAX_PIXELS_DEFAULT`` so a caller's small + # ``max_pixels`` -- intended as an output-window budget -- does not + # reject normal 256x256 tiles. See #1823. if window is None: _check_dimensions(width, height, samples, max_pixels) - _check_dimensions(tw, th, samples, max_pixels) + _check_dimensions(tw, th, samples, MAX_PIXELS_DEFAULT) # Reject malformed TIFFs whose declared tile grid exceeds the supplied # TileOffsets length. See issue #1219. diff --git a/xrspatial/geotiff/tests/test_vrt_source_tile_check_1823.py b/xrspatial/geotiff/tests/test_vrt_source_tile_check_1823.py new file mode 100644 index 00000000..b02dd2fb --- /dev/null +++ b/xrspatial/geotiff/tests/test_vrt_source_tile_check_1823.py @@ -0,0 +1,113 @@ +"""Regression tests for #1823. + +PR #1803 forwarded the caller's ``max_pixels`` to ``read_to_array`` inside +the VRT source loop so that a tiny VRT output could not force a huge +source decode (#1796). The output-window check enforces that. A separate +per-tile dimension check was incorrectly using the same ``max_pixels`` +value, so a caller setting ``max_pixels`` as an output budget (e.g. +10,000) would also fail the per-tile sanity check on every normal source +whose default tile size is 256x256 (= 65,536 pixels). + +The #1796 protection remains: the output-window check still catches a +tiny VRT output that asks for a large source window. +""" +from __future__ import annotations + +import os +import tempfile + +import numpy as np +import pytest + +from xrspatial.geotiff import to_geotiff +from xrspatial.geotiff._reader import PixelSafetyLimitError +from xrspatial.geotiff._vrt import read_vrt + + +def _write_normal_tile_source(td: str) -> str: + """10x10 uint8 source -- ``to_geotiff`` pads to a 256x256 tile.""" + src = os.path.join(td, 'src.tif') + to_geotiff(np.zeros((10, 10), dtype=np.uint8), src, compression='none') + return src + + +def _write_vrt(td: str, *, dst_x_size: int, dst_y_size: int, + raster_x: int = 100, raster_y: int = 100, + src_x_size: int = 10, src_y_size: int = 10) -> str: + vrt = os.path.join(td, 'mosaic.vrt') + xml = ( + f'\n' + f' \n' + f' \n' + f' src.tif\n' + f' 1\n' + f' \n' + f' \n' + f' \n' + f' \n' + f'\n' + ) + with open(vrt, 'w') as f: + f.write(xml) + return vrt + + +class TestPerTileCheckDoesNotUseCallerBudget: + """Per-tile dim sanity must not reject normal 256x256 source tiles + when the caller's ``max_pixels`` is a small output-budget value.""" + + def test_normal_tile_source_with_small_max_pixels(self): + with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: + _write_normal_tile_source(td) + vrt = _write_vrt(td, dst_x_size=100, dst_y_size=100) + arr, _ = read_vrt(vrt, max_pixels=10_000) + assert arr.shape == (100, 100) + + def test_normal_tile_source_with_tiny_max_pixels(self): + """An output budget below a single tile must still succeed when + the requested output window itself fits.""" + with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: + _write_normal_tile_source(td) + # Output 5x5 = 25 pixels; max_pixels = 100 fits 25 with room. + vrt = _write_vrt(td, dst_x_size=5, dst_y_size=5, + raster_x=5, raster_y=5) + arr, _ = read_vrt(vrt, max_pixels=100) + assert arr.shape == (5, 5) + + +class TestOutputWindowCheckStillEnforced: + """The output-window check at the source read still rejects an + over-budget read; the #1796 protection is preserved.""" + + def test_output_window_exceeds_max_pixels_still_rejected(self): + with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: + src = os.path.join(td, 'src.tif') + to_geotiff(np.arange(16, dtype=np.uint8).reshape(4, 4), + src, compression='none') + vrt = _write_vrt(td, dst_x_size=1, dst_y_size=1, + raster_x=1, raster_y=1, + src_x_size=4, src_y_size=4) + # SrcRect 4x4 = 16 pixels > max_pixels=1 → output check fires. + with pytest.raises(ValueError, match="exceed"): + read_vrt(vrt, max_pixels=1) + + +class TestPerTileCheckStillRejectsCraftedHeader: + """A pathological ``TileWidth``/``TileLength`` must still fail at + the per-tile sanity check, which uses ``MAX_PIXELS_DEFAULT``.""" + + def test_per_tile_check_caps_at_default(self, monkeypatch): + """Lower ``MAX_PIXELS_DEFAULT`` to verify the per-tile call site + is wired to it (rather than to the caller's budget).""" + from xrspatial.geotiff import _reader as reader_mod + + monkeypatch.setattr(reader_mod, "MAX_PIXELS_DEFAULT", 100) + with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: + _write_normal_tile_source(td) + vrt = _write_vrt(td, dst_x_size=100, dst_y_size=100) + # 256x256 tile > patched MAX_PIXELS_DEFAULT=100 → per-tile + # check fires regardless of caller's max_pixels (1e9 here). + with pytest.raises(PixelSafetyLimitError, match="65,536"): + read_vrt(vrt, max_pixels=1_000_000_000)