From e6d86c8d5465e52c642a92b82fe1eba3656b35e8 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Tue, 12 May 2026 13:11:14 -0700 Subject: [PATCH 1/2] geotiff: surface VRT skipped sources as attrs['vrt_holes'] (#1734) The default lenient mode of read_vrt warns once per unreadable source and continues, returning a mosaic with zero-filled holes that downstream integer-VRT code cannot distinguish from real data. The warning is easy to miss in a pipeline that filters UserWarnings. Add a structured side-channel: VRTDataset gains a .holes list that the read loop populates with {source, band, dst_rect, error} on each skipped source. The public read_vrt copies a non-empty list into the returned DataArray as attrs['vrt_holes'] so callers can detect partial mosaics by a single attribute lookup. The fallback warning message now points at attrs['vrt_holes'] or the XRSPATIAL_GEOTIFF_STRICT env var, so the recovery path is discoverable from a single captured warning. Strict mode (XRSPATIAL_GEOTIFF_STRICT=1) is unchanged and still raises. Adds regression coverage in test_vrt_holes_attr_1734.py. --- xrspatial/geotiff/__init__.py | 10 ++ xrspatial/geotiff/_vrt.py | 30 +++- .../geotiff/tests/test_vrt_holes_attr_1734.py | 149 ++++++++++++++++++ 3 files changed, 188 insertions(+), 1 deletion(-) create mode 100644 xrspatial/geotiff/tests/test_vrt_holes_attr_1734.py diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index 3e67eebf..5203a2a2 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -3474,6 +3474,16 @@ def read_vrt(source: str, *, dtype=None, attrs['crs_wkt'] = vrt.crs_wkt if vrt.raster_type == 'point': attrs['raster_type'] = 'point' + # Surface skipped-source records as ``attrs['vrt_holes']`` so + # callers can detect a partial mosaic by attribute lookup. Under + # lenient mode (the default), the underlying ``_vrt.read_vrt`` + # already warned per skipped source but the warning is easy to + # miss in a pipeline; the attr lets downstream code branch on + # ``"vrt_holes" in da.attrs`` instead of monitoring the warnings + # stream. Empty list is omitted so the attr only appears when + # there is actually a hole. See issue #1734. + if vrt.holes: + attrs['vrt_holes'] = list(vrt.holes) # When a specific band is selected, source its nodata from that # band's instead of band 0's. Otherwise multi-band # VRTs with per-band sentinels would mis-mask the read: attrs would diff --git a/xrspatial/geotiff/_vrt.py b/xrspatial/geotiff/_vrt.py index 090cff0b..608fecf0 100644 --- a/xrspatial/geotiff/_vrt.py +++ b/xrspatial/geotiff/_vrt.py @@ -181,6 +181,14 @@ class VRTDataset: # coords must be emitted without the shift. Parsed from # ``Point``. raster_type: str = 'area' # 'area' or 'point' + # Per-load record of sources skipped by ``read_vrt`` under the + # lenient default (not strict mode). Each entry is a dict with + # ``source``, ``band`` (1-based), ``dst_rect`` (xoff, yoff, + # xsize, ysize), and ``error`` keys. Empty when no sources failed + # to read. Populated by :func:`read_vrt` so callers can detect + # holes by attribute lookup instead of parsing a UserWarning + # message (issue #1734). + holes: list[dict] = field(default_factory=list) def _parse_rect(elem) -> _Rect: @@ -656,6 +664,17 @@ def read_vrt(vrt_path: str, *, window=None, # so partial mosaics are caught in CI. Default mode warns # once per missing source then continues, preserving the # historical behaviour. See issue #1662. + # + # The lenient default leaves zero-filled holes in + # integer VRTs that downstream code cannot tell from + # real data unless the band has a nodata sentinel set. + # Recording the skipped source on ``vrt.holes`` lets the + # public ``read_vrt`` surface a machine-readable + # ``attrs['vrt_holes']`` entry on the returned + # DataArray, alongside the warning. Callers that need + # to fail loudly can either inspect that attr or set + # ``XRSPATIAL_GEOTIFF_STRICT=1`` to raise here. + # See issue #1734. import warnings from . import _geotiff_strict_mode, GeoTIFFFallbackWarning if _geotiff_strict_mode(): @@ -663,10 +682,19 @@ def read_vrt(vrt_path: str, *, window=None, warnings.warn( f"VRT source {src.filename!r} could not be read " f"({type(e).__name__}: {e}); skipping. The output " - f"mosaic will have a hole at this tile.", + f"mosaic will have a hole at this tile. Inspect " + f"``DataArray.attrs['vrt_holes']`` or set " + f"XRSPATIAL_GEOTIFF_STRICT=1 to raise instead.", GeoTIFFFallbackWarning, stacklevel=2, ) + vrt.holes.append({ + 'source': src.filename, + 'band': src.band, + 'dst_rect': (dr.x_off, dr.y_off, + dr.x_size, dr.y_size), + 'error': f"{type(e).__name__}: {e}", + }) continue # skip missing/unreadable sources # Handle source nodata. Cast the sentinel to the *source* diff --git a/xrspatial/geotiff/tests/test_vrt_holes_attr_1734.py b/xrspatial/geotiff/tests/test_vrt_holes_attr_1734.py new file mode 100644 index 00000000..ba39c00a --- /dev/null +++ b/xrspatial/geotiff/tests/test_vrt_holes_attr_1734.py @@ -0,0 +1,149 @@ +"""Regression test for issue #1734. + +Under the lenient default (``XRSPATIAL_GEOTIFF_STRICT`` unset), +``read_vrt`` warns once per unreadable source and continues, producing +a mosaic with zero-filled holes for integer VRTs that downstream code +cannot distinguish from real data. The warning is easy to miss in a +pipeline that ignores ``UserWarning``s. + +This module pins the new behaviour: the returned DataArray now carries +an ``attrs['vrt_holes']`` list describing each skipped source so +callers can detect a partial mosaic with a single attribute lookup. +Strict mode is unchanged and still raises. +""" +from __future__ import annotations + +import warnings + +import pytest + +from xrspatial.geotiff import GeoTIFFFallbackWarning, read_vrt + + +@pytest.fixture +def clear_strict_env(monkeypatch): + monkeypatch.delenv('XRSPATIAL_GEOTIFF_STRICT', raising=False) + + +@pytest.fixture +def set_strict_env(monkeypatch): + monkeypatch.setenv('XRSPATIAL_GEOTIFF_STRICT', '1') + + +def _write_vrt_with_missing_source(vrt_path, missing_src) -> None: + vrt_path.write_text( + '\n' + ' \n' + ' 0, 1, 0, 0, 0, -1\n' + ' \n' + ' -9999\n' + ' \n' + f' {missing_src}' + '\n' + ' 1\n' + ' \n' + ' \n' + ' \n' + ' \n' + '\n' + ) + + +def test_skipped_source_records_vrt_holes_attr( + clear_strict_env, tmp_path, +): + """A VRT with a missing source returns a DataArray whose attrs + carry a ``vrt_holes`` entry naming the source, band, dst_rect, + and underlying error.""" + vrt_path = tmp_path / 'mosaic_1734_missing.vrt' + missing_src = f'{tmp_path}/does_not_exist_1734.tif' + _write_vrt_with_missing_source(vrt_path, missing_src) + + with warnings.catch_warnings(): + warnings.simplefilter('ignore', GeoTIFFFallbackWarning) + da = read_vrt(str(vrt_path)) + + assert 'vrt_holes' in da.attrs + holes = da.attrs['vrt_holes'] + assert isinstance(holes, list) + assert len(holes) == 1 + h = holes[0] + assert h['source'].endswith('does_not_exist_1734.tif') + assert h['band'] == 1 + assert h['dst_rect'] == (0, 0, 4, 4) + assert 'error' in h + assert h['error'] # non-empty + + +def test_no_holes_attr_when_all_sources_read(clear_strict_env, tmp_path): + """A successful VRT read does not advertise an empty ``vrt_holes`` + attr; the key is omitted entirely so ``"vrt_holes" in attrs`` is a + cheap completeness check.""" + import numpy as np + import xarray as xr + from xrspatial.geotiff import to_geotiff + + # Write a real source the VRT can reference. + src_path = tmp_path / 'src_1734.tif' + arr = np.arange(16, dtype=np.float32).reshape(4, 4) + da_src = xr.DataArray( + arr, dims=['y', 'x'], + coords={'y': np.linspace(3.5, 0.5, 4), + 'x': np.linspace(0.5, 3.5, 4)}, + attrs={'crs': 4326}, + ) + to_geotiff(da_src, str(src_path), compression='none') + + vrt_path = tmp_path / 'mosaic_1734_ok.vrt' + vrt_path.write_text( + '\n' + ' \n' + ' 0, 1, 0, 0, 0, -1\n' + ' \n' + ' \n' + f' {src_path}' + '\n' + ' 1\n' + ' \n' + ' \n' + ' \n' + ' \n' + '\n' + ) + + with warnings.catch_warnings(): + warnings.simplefilter('error', GeoTIFFFallbackWarning) + da = read_vrt(str(vrt_path)) + + assert 'vrt_holes' not in da.attrs + + +def test_strict_mode_still_raises(set_strict_env, tmp_path): + """Strict mode is unchanged: the missing source raises and no + DataArray is returned.""" + vrt_path = tmp_path / 'mosaic_1734_strict.vrt' + missing_src = f'{tmp_path}/does_not_exist_1734_strict.tif' + _write_vrt_with_missing_source(vrt_path, missing_src) + + with pytest.raises(Exception): + read_vrt(str(vrt_path)) + + +def test_warning_mentions_how_to_detect_holes(clear_strict_env, tmp_path): + """The fallback warning now points callers at the attr or the + strict env var so the recovery path is discoverable from a single + captured warning.""" + vrt_path = tmp_path / 'mosaic_1734_msg.vrt' + missing_src = f'{tmp_path}/does_not_exist_1734_msg.tif' + _write_vrt_with_missing_source(vrt_path, missing_src) + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter('always') + read_vrt(str(vrt_path)) + + fallback = [ + x for x in w if issubclass(x.category, GeoTIFFFallbackWarning) + ] + assert fallback, "expected at least one GeoTIFFFallbackWarning" + msg = ' '.join(str(x.message) for x in fallback) + assert 'vrt_holes' in msg or 'XRSPATIAL_GEOTIFF_STRICT' in msg From 5915bd94c5687819d5617e5eccf5d385142342ae Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Tue, 12 May 2026 13:36:06 -0700 Subject: [PATCH 2/2] test(vrt): tighten #1734 regression to integer band and FileNotFoundError Address PR #1747 review comments: * The helper that wrote the VRT used `dataType="Float32"` with a `NoDataValue`, so the missing-source hole came out as NaN, which is not the failure mode #1734 documents. Switch to `Int32` and drop the NoDataValue so the lenient path produces the zero-filled integer hole the attr is meant to surface. Add an explicit `(da.values == 0).all()` assert on the integer dtype so the regression is pinned, not just the attr surface. * The strict-mode test caught `Exception`, which would have happily swallowed any unrelated failure. The missing-source path raises `FileNotFoundError` (an `OSError` subclass) from `_FileSource` via `read_to_array`; assert that concrete class with a `match=` against the file basename so an unrelated read-path bug surfaces as a real test failure. --- .../geotiff/tests/test_vrt_holes_attr_1734.py | 41 ++++++++++++++++--- 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/xrspatial/geotiff/tests/test_vrt_holes_attr_1734.py b/xrspatial/geotiff/tests/test_vrt_holes_attr_1734.py index ba39c00a..5894bd02 100644 --- a/xrspatial/geotiff/tests/test_vrt_holes_attr_1734.py +++ b/xrspatial/geotiff/tests/test_vrt_holes_attr_1734.py @@ -31,12 +31,20 @@ def set_strict_env(monkeypatch): def _write_vrt_with_missing_source(vrt_path, missing_src) -> None: + """Write a VRT with an Int32 band whose only source is missing. + + Integer ``dataType`` is the failure mode issue #1734 was about: the + pre-fix lenient path zero-fills the output buffer (``fill = 0`` for + integer dtypes) and the user cannot distinguish that hole from real + zero-valued data. ``NoDataValue`` is omitted on purpose -- having + one would let downstream code mask the hole and side-step the + regression. See the module docstring. + """ vrt_path.write_text( '\n' ' \n' ' 0, 1, 0, 0, 0, -1\n' - ' \n' - ' -9999\n' + ' \n' ' \n' f' {missing_src}' '\n' @@ -54,7 +62,14 @@ def test_skipped_source_records_vrt_holes_attr( ): """A VRT with a missing source returns a DataArray whose attrs carry a ``vrt_holes`` entry naming the source, band, dst_rect, - and underlying error.""" + and underlying error. + + Uses an Int32 VRT so the hole is zero-filled (the exact failure + mode #1734 was about): without the attr there is no way to tell + the all-zeros tile from real data. + """ + import numpy as np + vrt_path = tmp_path / 'mosaic_1734_missing.vrt' missing_src = f'{tmp_path}/does_not_exist_1734.tif' _write_vrt_with_missing_source(vrt_path, missing_src) @@ -63,6 +78,12 @@ def test_skipped_source_records_vrt_holes_attr( warnings.simplefilter('ignore', GeoTIFFFallbackWarning) da = read_vrt(str(vrt_path)) + # Confirm the integer-specific failure mode is in play: the hole is + # filled with zeros (not NaN), indistinguishable from real data + # without the attr. + assert np.issubdtype(da.dtype, np.integer) + assert (da.values == 0).all() + assert 'vrt_holes' in da.attrs holes = da.attrs['vrt_holes'] assert isinstance(holes, list) @@ -119,13 +140,21 @@ def test_no_holes_attr_when_all_sources_read(clear_strict_env, tmp_path): def test_strict_mode_still_raises(set_strict_env, tmp_path): - """Strict mode is unchanged: the missing source raises and no - DataArray is returned.""" + """Strict mode is unchanged: the missing source surfaces the + underlying ``FileNotFoundError`` (an ``OSError`` subclass) from + ``read_to_array`` instead of warning-and-skipping. + + Asserting the concrete exception class -- not a bare ``Exception`` + -- keeps the regression test honest: an unrelated bug somewhere in + the read path that happens to raise a different exception will + fail this test instead of silently satisfying it. + """ vrt_path = tmp_path / 'mosaic_1734_strict.vrt' missing_src = f'{tmp_path}/does_not_exist_1734_strict.tif' _write_vrt_with_missing_source(vrt_path, missing_src) - with pytest.raises(Exception): + with pytest.raises(FileNotFoundError, + match='does_not_exist_1734_strict.tif'): read_vrt(str(vrt_path))