From e296f216f78834b9e437988cc67224663db8da09 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Mon, 18 May 2026 14:42:26 -0700 Subject: [PATCH 1/2] geotiff: activate mixed-band-metadata fail-closed check (#1987 PR 5) Registers ``_check_read_mixed_band_metadata`` so VRT reads whose bands declare disagreeing per-band ```` sentinels raise ``MixedBandMetadataError`` by default. The check and its ``band_nodata=`` opt-out were staged in #2031 but not registered: the activation needed the existing VRT test sites that mosaic distinct per-band sentinels to migrate to ``band_nodata='first'`` first. Wires the opt-out into ``open_geotiff`` and ``read_geotiff_dask`` so all three VRT entry points expose the same kwarg, with a clear ValueError on non-VRT sources (same dispatch-kwarg guard pattern as ``missing_sources`` / ``on_gpu_failure`` / ``max_cloud_bytes``). Migrates the four VRT test files whose fixtures intentionally mosaic bands with distinct sentinels: - ``test_vrt_band_nodata_1598.py`` - ``test_vrt_multiband_int_nodata_1611.py`` - ``test_vrt_int_source_float_dtype_1616.py`` - ``test_vrt_multiband_dtype_1696.py`` Each migrated call adds ``band_nodata='first'`` so the regression keeps asserting the legacy flatten-to-first-band semantics, now via the documented opt-out instead of implicitly. Adds ``test_mixed_band_metadata_fail_closed_1987.py`` with 11 cases pinning the fail-closed contract end-to-end: default raise, chunked raise, opt-out passthrough, shared-sentinel acceptance, one-band-only acceptance, ``open_geotiff`` / ``read_geotiff_dask`` propagation, and the non-VRT-source rejection on both entry points. Refs #1987. --- xrspatial/geotiff/__init__.py | 26 ++ xrspatial/geotiff/_backends/dask.py | 19 ++ xrspatial/geotiff/_validation.py | 18 +- ...st_mixed_band_metadata_fail_closed_1987.py | 270 ++++++++++++++++++ .../tests/test_reader_kwarg_order_1935.py | 1 + .../tests/test_remaining_fail_closed_1987.py | 9 +- .../tests/test_vrt_band_nodata_1598.py | 14 +- .../test_vrt_int_source_float_dtype_1616.py | 8 +- .../tests/test_vrt_multiband_dtype_1696.py | 6 +- .../test_vrt_multiband_int_nodata_1611.py | 23 +- 10 files changed, 367 insertions(+), 27 deletions(-) create mode 100644 xrspatial/geotiff/tests/test_mixed_band_metadata_fail_closed_1987.py diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index 442d7ec3f..215fa354d 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -254,6 +254,7 @@ def open_geotiff(source: str | BinaryIO, *, missing_sources: str = _MISSING_SOURCES_SENTINEL, allow_rotated: bool = False, allow_unparseable_crs: bool = False, + band_nodata: str | None = None, mask_nodata: bool = True, ) -> xr.DataArray: """Read a GeoTIFF, COG, or VRT file into an xarray.DataArray. @@ -325,6 +326,17 @@ def open_geotiff(source: str | BinaryIO, *, return a partial mosaic. Passing this kwarg with a non-VRT source raises ``ValueError`` because the policy only applies to the VRT pipeline. See ``read_vrt`` for the full description. + band_nodata : {'first', None}, optional + VRT-only. Opt-out for the fail-closed check that rejects VRT + sources whose bands declare disagreeing per-band nodata + sentinels (issue #1987 PR 5). When ``None`` (the default), a VRT + that mosaics bands with different sentinels raises + ``MixedBandMetadataError``; flattening to one value would let + one band's valid pixels collide with another band's sentinel. + Pass ``band_nodata='first'`` to keep the legacy behaviour of + using band 0's sentinel for the whole mosaic. Passing this + kwarg with a non-VRT source raises ``ValueError`` because the + policy only applies to the VRT pipeline. mask_nodata : bool, default True If True (the default), replace the nodata sentinel with ``NaN``; integer rasters get promoted to ``float64`` first so NaN can be @@ -409,6 +421,19 @@ def open_geotiff(source: str | BinaryIO, *, "Pass a .vrt path to enable the VRT pipeline, or drop " "missing_sources to keep the default GeoTIFF path.") + # ``band_nodata`` is the #1987 PR 5 opt-out for the mixed-band + # metadata fail-closed check. It only has meaning on the VRT pipeline + # (a plain GeoTIFF has one nodata sentinel per file, not per band), + # so reject the kwarg up front on non-VRT sources rather than letting + # it leak into ``read_vrt`` and confuse the caller about what the + # opt-out actually controls. + if band_nodata is not None and not _is_vrt_source: + raise ValueError( + "band_nodata only applies to VRT sources. " + "Pass a .vrt path to enable the VRT pipeline, or drop " + "band_nodata to keep the default GeoTIFF path. " + "See issue #1987.") + # ``max_cloud_bytes`` is the eager fsspec-read budget. Only # ``_read_to_array`` on the eager non-VRT, non-GPU, non-dask branch # consumes it; the GPU (``read_geotiff_gpu``), dask @@ -473,6 +498,7 @@ def open_geotiff(source: str | BinaryIO, *, max_pixels=max_pixels, allow_rotated=allow_rotated, allow_unparseable_crs=allow_unparseable_crs, + band_nodata=band_nodata, mask_nodata=mask_nodata, **vrt_kwargs) diff --git a/xrspatial/geotiff/_backends/dask.py b/xrspatial/geotiff/_backends/dask.py index bfc7851c0..4ce41e42f 100644 --- a/xrspatial/geotiff/_backends/dask.py +++ b/xrspatial/geotiff/_backends/dask.py @@ -41,6 +41,7 @@ def read_geotiff_dask(source: str, *, max_pixels: int | None = None, allow_rotated: bool = False, allow_unparseable_crs: bool = False, + band_nodata: str | None = None, mask_nodata: bool = True) -> xr.DataArray: """Read a GeoTIFF as a dask-backed DataArray for out-of-core processing. @@ -74,6 +75,11 @@ def read_geotiff_dask(source: str, *, directly. name : str or None Name for the DataArray. + band_nodata : {'first', None}, optional + VRT-only opt-out for the fail-closed mixed-band-metadata check + (issue #1987 PR 5). Forwarded verbatim to ``read_vrt`` when the + source is a ``.vrt`` file. Passing it with a non-VRT GeoTIFF + source raises ``ValueError``. mask_nodata : bool, default True If True, replace the nodata sentinel with NaN per chunk (integer rasters get promoted to ``float64``). If False, skip the @@ -114,8 +120,21 @@ def read_geotiff_dask(source: str, *, return read_vrt( source, dtype=dtype, window=window, band=band, name=name, chunks=chunks, max_pixels=max_pixels, + allow_rotated=allow_rotated, + allow_unparseable_crs=allow_unparseable_crs, + band_nodata=band_nodata, mask_nodata=mask_nodata, ) + # ``band_nodata`` only has meaning for the VRT path (per-band sentinel + # ambiguity). Reject the kwarg up front on non-VRT GeoTIFF inputs so + # callers learn the opt-out is being dropped, matching the + # ``open_geotiff`` guard. See issue #1987 PR 5. + if band_nodata is not None: + raise ValueError( + "band_nodata only applies to VRT sources. " + "Pass a .vrt path to enable the VRT pipeline, or drop " + "band_nodata to keep the default GeoTIFF path. " + "See issue #1987.") # P5: HTTP COG sources used to fire one IFD/header GET per chunk # task. Parse metadata once here so every delayed task can reuse it. diff --git a/xrspatial/geotiff/_validation.py b/xrspatial/geotiff/_validation.py index 658d5c81d..a608c43a7 100644 --- a/xrspatial/geotiff/_validation.py +++ b/xrspatial/geotiff/_validation.py @@ -950,11 +950,13 @@ def _same_nodata(a: float, b: float) -> bool: return a == b -# NOT registered by default. The mixed-band check has a much larger -# migration cost than its sibling read-side checks (around 35 existing -# test sites would need to opt in via ``band_nodata='first'`` to keep -# their legacy assertions), so it lands as a follow-up PR that bundles -# the check activation with the test migration. The check itself and -# the ``band_nodata=`` VRT kwarg ship now so the follow-up is a -# one-line registration call plus the test sweep. -# register_read_metadata_check(_check_read_mixed_band_metadata) +# Registered as of issue #1987 PR 5. The check was staged in the +# preceding bundle (#2031) along with the ``band_nodata=`` VRT kwarg +# but not registered, because activation needed a coordinated migration +# of the VRT test sites that read fixtures with disagreeing per-band +# sentinels. The migration sweep ships alongside this registration. +# Callers that still want the legacy flatten-to-first-band behaviour +# pass ``band_nodata='first'`` to ``read_vrt`` / ``open_geotiff`` / +# ``read_geotiff_dask``; the explicit opt-in surfaces the per-band +# ambiguity at the call site instead of papering over it silently. +register_read_metadata_check(_check_read_mixed_band_metadata) diff --git a/xrspatial/geotiff/tests/test_mixed_band_metadata_fail_closed_1987.py b/xrspatial/geotiff/tests/test_mixed_band_metadata_fail_closed_1987.py new file mode 100644 index 000000000..24323d046 --- /dev/null +++ b/xrspatial/geotiff/tests/test_mixed_band_metadata_fail_closed_1987.py @@ -0,0 +1,270 @@ +"""Fail-closed coverage for ``MixedBandMetadataError`` (issue #1987 PR 5). + +A VRT can mosaic source bands that declare disagreeing per-band +```` sentinels. The legacy reader picked band 0's sentinel +for the whole mosaic, which let band N's valid pixels collide with +band 0's sentinel after the flatten-to-scalar step. The fail-closed +default refuses the ambiguity; ``band_nodata='first'`` keeps the legacy +behaviour explicitly. + +The validator check itself, the ``band_nodata=`` kwarg on +``read_vrt``, and the per-VRT-read context plumbing landed in +the #2031 bundle. This PR registers the check and forwards +``band_nodata`` through ``open_geotiff`` and ``read_geotiff_dask``. +The tests below pin the four entry-point contracts: + +* default ``read_vrt`` raises on disagreeing per-band sentinels +* default ``read_vrt(chunks=...)`` raises before any task fires +* ``band_nodata='first'`` opts back into the legacy reader +* ``band_nodata`` raises on non-VRT sources for ``open_geotiff`` and + ``read_geotiff_dask`` +""" +from __future__ import annotations + +import os + +import numpy as np +import pytest + +from xrspatial.geotiff import ( + GeoTIFFAmbiguousMetadataError, + MixedBandMetadataError, + open_geotiff, + read_geotiff_dask, + read_vrt, + to_geotiff, +) +from xrspatial.geotiff._writer import write + + +def _write_mixed_band_vrt(tmp_path, *, sentinel_a=65535, sentinel_b=65000): + """Two uint16 sources with distinct sentinels, mosaiced into one VRT. + + ``[1, 1]`` is the sentinel pixel in each source so the legacy + reader's flatten-to-first-band step would mis-mask band 1. + """ + band0 = np.array([[1, 2], [3, sentinel_a]], dtype=np.uint16) + band1 = np.array([[7, 8], [9, sentinel_b]], dtype=np.uint16) + p0 = str(tmp_path / 'mixed_band_1987_a.tif') + p1 = str(tmp_path / 'mixed_band_1987_b.tif') + write(band0, p0, nodata=sentinel_a, compression='none', tiled=False) + write(band1, p1, nodata=sentinel_b, compression='none', tiled=False) + vrt_path = str(tmp_path / 'mixed_band_1987.vrt') + vrt_xml = f""" + 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 + + {sentinel_a} + + {p0} + 1 + + + + + + {sentinel_b} + + {p1} + 1 + + + + +""" + with open(vrt_path, 'w') as f: + f.write(vrt_xml) + return vrt_path + + +def _write_shared_sentinel_vrt(tmp_path, *, sentinel=65535): + """Two uint16 sources mosaiced with the *same* per-band sentinel. + + The fail-closed check must not trigger on a VRT whose per-band + nodata agrees; the check is about ambiguity, not about per-band + declarations. + """ + band0 = np.array([[1, 2], [3, sentinel]], dtype=np.uint16) + band1 = np.array([[7, 8], [9, sentinel]], dtype=np.uint16) + p0 = str(tmp_path / 'shared_band_1987_a.tif') + p1 = str(tmp_path / 'shared_band_1987_b.tif') + write(band0, p0, nodata=sentinel, compression='none', tiled=False) + write(band1, p1, nodata=sentinel, compression='none', tiled=False) + vrt_path = str(tmp_path / 'shared_band_1987.vrt') + vrt_xml = f""" + 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 + + {sentinel} + + {p0} + 1 + + + + + + {sentinel} + + {p1} + 1 + + + + +""" + with open(vrt_path, 'w') as f: + f.write(vrt_xml) + return vrt_path + + +def _write_one_band_no_sentinel_vrt(tmp_path, *, sentinel=65535): + """One band declares a sentinel, the other does not. + + ``[b.nodata for b in vrt.bands]`` is ``[sentinel, None]``. The + check considers ``None`` entries "no declaration" rather than a + distinct sentinel, so only one concrete value is present and the + check must accept. + """ + band0 = np.array([[1, 2], [3, sentinel]], dtype=np.uint16) + band1 = np.array([[7, 8], [9, 99]], dtype=np.uint16) + p0 = str(tmp_path / 'one_band_sentinel_a.tif') + p1 = str(tmp_path / 'one_band_sentinel_b.tif') + write(band0, p0, nodata=sentinel, compression='none', tiled=False) + write(band1, p1, nodata=None, compression='none', tiled=False) + vrt_path = str(tmp_path / 'one_band_sentinel.vrt') + vrt_xml = f""" + 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 + + {sentinel} + + {p0} + 1 + + + + + + + {p1} + 1 + + + + +""" + with open(vrt_path, 'w') as f: + f.write(vrt_xml) + return vrt_path + + +def test_read_vrt_rejects_mixed_per_band_nodata(tmp_path): + """Default ``read_vrt`` raises on disagreeing per-band sentinels.""" + vrt_path = _write_mixed_band_vrt(tmp_path) + with pytest.raises(MixedBandMetadataError) as exc_info: + read_vrt(vrt_path) + msg = str(exc_info.value) + assert "65535" in msg and "65000" in msg + assert "band_nodata='first'" in msg + assert "#1987" in msg + + +def test_read_vrt_chunked_rejects_mixed_per_band_nodata(tmp_path): + """The chunked dispatch validates before scheduling any task.""" + vrt_path = _write_mixed_band_vrt(tmp_path) + with pytest.raises(MixedBandMetadataError): + read_vrt(vrt_path, chunks=1) + + +def test_read_vrt_band_nodata_first_opts_back_in(tmp_path): + """``band_nodata='first'`` keeps the legacy flatten-to-first path.""" + vrt_path = _write_mixed_band_vrt(tmp_path) + r = read_vrt(vrt_path, band_nodata='first') + # Legacy semantics: band 0's sentinel becomes ``attrs['nodata']``. + assert r.attrs.get('nodata') == 65535.0 + + +def test_read_vrt_shared_sentinel_accepts(tmp_path): + """A VRT whose bands agree on a single sentinel is not ambiguous.""" + vrt_path = _write_shared_sentinel_vrt(tmp_path) + r = read_vrt(vrt_path) + assert r.attrs.get('nodata') == 65535.0 + + +def test_read_vrt_only_one_band_declares_sentinel_accepts(tmp_path): + """``None`` per-band entries count as "no declaration", not a value. + + A VRT with one declared sentinel and one undeclared band has only + one concrete sentinel so the fail-closed check must accept. + """ + vrt_path = _write_one_band_no_sentinel_vrt(tmp_path) + # Should not raise. + read_vrt(vrt_path) + + +def test_open_geotiff_propagates_mixed_band_rejection(tmp_path): + """``open_geotiff`` on a ``.vrt`` source routes through ``read_vrt``.""" + vrt_path = _write_mixed_band_vrt(tmp_path) + with pytest.raises(MixedBandMetadataError): + open_geotiff(vrt_path) + + +def test_open_geotiff_band_nodata_first_passes_through(tmp_path): + """``open_geotiff(band_nodata='first')`` forwards to ``read_vrt``.""" + vrt_path = _write_mixed_band_vrt(tmp_path) + r = open_geotiff(vrt_path, band_nodata='first') + assert r.attrs.get('nodata') == 65535.0 + + +def test_read_geotiff_dask_band_nodata_first_passes_through(tmp_path): + """``read_geotiff_dask`` on a ``.vrt`` source forwards the kwarg.""" + vrt_path = _write_mixed_band_vrt(tmp_path) + r = read_geotiff_dask(vrt_path, chunks=1, band_nodata='first') + assert r.attrs.get('nodata') == 65535.0 + + +def test_open_geotiff_band_nodata_rejected_on_non_vrt_source(tmp_path): + """``band_nodata`` is VRT-only; reject up front on plain GeoTIFFs. + + Same dispatch-kwarg pattern as ``missing_sources``, ``on_gpu_failure``, + and ``max_cloud_bytes``: silently dropping a backend-specific kwarg + is the bug class #1561 / #1605 / #1685 / #1795 / #1810 / #1974 closed + on the rest of the surface. + """ + arr = np.zeros((2, 2), dtype=np.uint16) + arr_da = _wrap_2d(arr) + p = tmp_path / 'plain.tif' + to_geotiff(arr_da, str(p), compression='none', tiled=False) + with pytest.raises(ValueError, match="band_nodata only applies to VRT"): + open_geotiff(str(p), band_nodata='first') + + +def test_read_geotiff_dask_band_nodata_rejected_on_non_vrt_source(tmp_path): + """``read_geotiff_dask`` matches ``open_geotiff``'s VRT-only guard.""" + arr = np.zeros((2, 2), dtype=np.uint16) + arr_da = _wrap_2d(arr) + p = tmp_path / 'plain.tif' + to_geotiff(arr_da, str(p), compression='none', tiled=False) + with pytest.raises(ValueError, match="band_nodata only applies to VRT"): + read_geotiff_dask(str(p), chunks=1, band_nodata='first') + + +def test_mixed_band_metadata_error_subclasses_base(tmp_path): + """``MixedBandMetadataError`` is catchable via the base class.""" + vrt_path = _write_mixed_band_vrt(tmp_path) + with pytest.raises(GeoTIFFAmbiguousMetadataError): + read_vrt(vrt_path) + + +def _wrap_2d(arr): + """Wrap a 2D numpy array as a minimal DataArray for ``to_geotiff``.""" + import xarray as xr + h, w = arr.shape + return xr.DataArray( + arr, + dims=('y', 'x'), + coords={ + 'y': np.arange(h, dtype=np.float64), + 'x': np.arange(w, dtype=np.float64), + }, + attrs={'crs': 4326, + 'transform': (1.0, 0.0, 0.0, 0.0, -1.0, 0.0)}, + ) diff --git a/xrspatial/geotiff/tests/test_reader_kwarg_order_1935.py b/xrspatial/geotiff/tests/test_reader_kwarg_order_1935.py index ceb97be9b..36378a63e 100644 --- a/xrspatial/geotiff/tests/test_reader_kwarg_order_1935.py +++ b/xrspatial/geotiff/tests/test_reader_kwarg_order_1935.py @@ -44,6 +44,7 @@ "missing_sources", "allow_rotated", "allow_unparseable_crs", + "band_nodata", "mask_nodata", ) diff --git a/xrspatial/geotiff/tests/test_remaining_fail_closed_1987.py b/xrspatial/geotiff/tests/test_remaining_fail_closed_1987.py index 1c36dacf9..ee874883f 100644 --- a/xrspatial/geotiff/tests/test_remaining_fail_closed_1987.py +++ b/xrspatial/geotiff/tests/test_remaining_fail_closed_1987.py @@ -19,10 +19,11 @@ entry in ``attrs['nodatavals']``. Opt-out via explicit ``nodata=`` kwarg. -``MixedBandMetadataError`` (#1987 PR 5) is intentionally NOT -registered in this PR (see the corresponding ``# register_...`` comment -in ``_validation.py``); the migration cost on existing VRT fixtures is -its own follow-up. +``MixedBandMetadataError`` (#1987 PR 5) was staged but unregistered in +the bundle above. The activation, opt-out wiring on ``open_geotiff`` / +``read_geotiff_dask``, and VRT test sweep ship in a follow-up PR; the +dedicated fail-closed coverage for that case lives in +``test_mixed_band_metadata_fail_closed_1987.py``. """ from __future__ import annotations diff --git a/xrspatial/geotiff/tests/test_vrt_band_nodata_1598.py b/xrspatial/geotiff/tests/test_vrt_band_nodata_1598.py index 44fdea848..29a743f10 100644 --- a/xrspatial/geotiff/tests/test_vrt_band_nodata_1598.py +++ b/xrspatial/geotiff/tests/test_vrt_band_nodata_1598.py @@ -63,9 +63,15 @@ def test_read_vrt_band0_uses_band0_nodata(tmp_path): """Sanity check the band-0 selection still works after the fix. Confirms the refactor did not flip the index. + + The fixture mosaics two bands with distinct per-band sentinels, so + after #1987 PR 5 the default read raises ``MixedBandMetadataError``. + The pre-#1987 flatten-to-first-band semantics this regression tests + are still reachable via ``band_nodata='first'``; the opt-in surfaces + at the call site that the test is exercising the legacy behaviour. """ vrt_path = _write_two_band_per_band_nodata_vrt(tmp_path) - r = read_vrt(vrt_path, band=0) + r = read_vrt(vrt_path, band=0, band_nodata='first') assert r.dtype == np.float64 assert r.attrs.get('nodata') == 65535.0 assert np.isnan(r.values[1, 1]) @@ -79,7 +85,7 @@ def test_read_vrt_band1_uses_band1_nodata(tmp_path): [9,65000]] and attrs['nodata']=65535. """ vrt_path = _write_two_band_per_band_nodata_vrt(tmp_path) - r = read_vrt(vrt_path, band=1) + r = read_vrt(vrt_path, band=1, band_nodata='first') assert r.dtype == np.float64, ( "band=1 read kept uint16 dtype; per-band nodata regression." ) @@ -104,7 +110,7 @@ def test_read_vrt_no_band_keeps_band0_nodata_attr(tmp_path): "first band wins" contract for multi-band reads. """ vrt_path = _write_two_band_per_band_nodata_vrt(tmp_path) - r = read_vrt(vrt_path) + r = read_vrt(vrt_path, band_nodata='first') assert r.attrs.get('nodata') == 65535.0 @@ -126,7 +132,7 @@ def test_read_vrt_out_of_range_band_raises(tmp_path): """ vrt_path = _write_two_band_per_band_nodata_vrt(tmp_path) with pytest.raises(ValueError, match="out of range"): - read_vrt(vrt_path, band=5) + read_vrt(vrt_path, band=5, band_nodata='first') def test_read_vrt_non_integer_band_raises(tmp_path): diff --git a/xrspatial/geotiff/tests/test_vrt_int_source_float_dtype_1616.py b/xrspatial/geotiff/tests/test_vrt_int_source_float_dtype_1616.py index 4fa00555f..a4cdc0bbe 100644 --- a/xrspatial/geotiff/tests/test_vrt_int_source_float_dtype_1616.py +++ b/xrspatial/geotiff/tests/test_vrt_int_source_float_dtype_1616.py @@ -181,14 +181,18 @@ def test_float_vrt_int_source_with_band_select(tmp_path): with open(vrt_path, 'w') as f: f.write(vrt_xml) + # Mixed per-band sentinels in the VRT XML: after #1987 PR 5 this + # raises by default. ``band_nodata='first'`` opts back into the + # legacy flatten-to-first-band behaviour, which is what the + # band-selection regression in this test is asserting. # band 0 -> 65535 sentinel masked - r0 = read_vrt(vrt_path, band=0) + r0 = read_vrt(vrt_path, band=0, band_nodata='first') assert r0.dtype == np.float32 assert np.isnan(r0.values[1, 1]) assert r0.attrs.get('nodata') == 65535.0 # band 1 -> 65000 sentinel masked, not 65535 - r1 = read_vrt(vrt_path, band=1) + r1 = read_vrt(vrt_path, band=1, band_nodata='first') assert r1.dtype == np.float32 # band b had its sentinel at the same [1, 1] cell assert np.isnan(r1.values[1, 1]) diff --git a/xrspatial/geotiff/tests/test_vrt_multiband_dtype_1696.py b/xrspatial/geotiff/tests/test_vrt_multiband_dtype_1696.py index 0f753df33..10bf1b23a 100644 --- a/xrspatial/geotiff/tests/test_vrt_multiband_dtype_1696.py +++ b/xrspatial/geotiff/tests/test_vrt_multiband_dtype_1696.py @@ -279,7 +279,11 @@ def test_nodata_round_trip_through_widened_int_dtype(tmp_path): """ out = tmp_path / 'mixed.vrt' out.write_text(vrt_path) - r = read_vrt(str(out)) + # ``255`` (band 0) and ``-9999`` (band 1) are distinct sentinels, so + # after #1987 PR 5 the default read raises. The widened-dtype + # behaviour this regression covers is still reachable through + # ``band_nodata='first'``. + r = read_vrt(str(out), band_nodata='first') # Per-band nodata masking in __init__.py promotes uint/int VRT # buffers to float64 when at least one band's sentinel hits a pixel. # Either we land on float64 (NaN-masked) or stay int16 (sentinel diff --git a/xrspatial/geotiff/tests/test_vrt_multiband_int_nodata_1611.py b/xrspatial/geotiff/tests/test_vrt_multiband_int_nodata_1611.py index b392c6308..827aefff0 100644 --- a/xrspatial/geotiff/tests/test_vrt_multiband_int_nodata_1611.py +++ b/xrspatial/geotiff/tests/test_vrt_multiband_int_nodata_1611.py @@ -14,6 +14,13 @@ This file mirrors test_vrt_band_nodata_1598.py for the multi-band ``band=None`` path. PR #1602 fixed ``band=N`` single-band selection. + +After #1987 PR 5 (mixed-band-metadata fail-closed), the fixtures here -- +which mosaic bands with deliberately distinct per-band sentinels -- +raise ``MixedBandMetadataError`` by default. Each call passes +``band_nodata='first'`` to assert the legacy flatten-to-first-band +behaviour is still reachable via the documented opt-out, which is the +exact semantics the regression covers. """ from __future__ import annotations @@ -80,7 +87,7 @@ def test_multiband_uint16_per_band_sentinel_each_masked(tmp_path): as NaN but band 1's (1,1) cell as the literal 65000.0. """ vrt_path = _write_two_band_per_band_nodata_vrt(tmp_path) - r = read_vrt(vrt_path) + r = read_vrt(vrt_path, band_nodata="first") assert r.shape == (2, 2, 2) assert r.dtype == np.float64, ( f"expected float64 promotion, got {r.dtype}" @@ -111,7 +118,7 @@ def test_multiband_int32_negative_per_band_sentinel(tmp_path): tmp_path, dtype_str="Int32", np_dtype=np.int32, band0_sentinel=-9999, band1_sentinel=-7777, band0_other=(10, 20, 30), band1_other=(40, 50, 60)) - r = read_vrt(vrt_path) + r = read_vrt(vrt_path, band_nodata="first") assert r.dtype == np.float64 assert np.isnan(r.values[1, 1, 0]) assert np.isnan(r.values[1, 1, 1]) @@ -138,7 +145,7 @@ def test_multiband_only_one_band_has_sentinel_present(tmp_path): write(b1_no_sentinel, p1, nodata=65000, compression='none', tiled=False) - r = read_vrt(vrt_path) + r = read_vrt(vrt_path, band_nodata="first") assert r.dtype == np.float64, ( "Even when only band 0 has a present sentinel, the array still " "needs promotion so band 0's NaN can be expressed." @@ -164,7 +171,7 @@ def test_multiband_no_sentinel_present_anywhere_keeps_int_dtype(tmp_path): write(b0, p0, nodata=65535, compression='none', tiled=False) write(b1, p1, nodata=65000, compression='none', tiled=False) - r = read_vrt(vrt_path) + r = read_vrt(vrt_path, band_nodata="first") # Sentinels not present -> integer dtype preserved (matches the # eager open_geotiff fast-path which also skips promotion when the # mask is empty). @@ -194,7 +201,7 @@ def test_multiband_per_band_out_of_range_sentinel_is_no_op(tmp_path): f.write(xml) # Should not raise and should still mask band 0. - r = read_vrt(vrt_path) + r = read_vrt(vrt_path, band_nodata="first") assert np.isnan(r.values[1, 1, 0]) # band 0 sentinel still masked # Band 1's sentinel (-9999) is out of uint16 range; the value 10 # in band1[1,1] survives unchanged. @@ -209,8 +216,8 @@ def test_multiband_band_kwarg_still_per_band_post_pr1602(tmp_path): sentinel. """ vrt_path = _write_two_band_per_band_nodata_vrt(tmp_path) - r0 = read_vrt(vrt_path, band=0) - r1 = read_vrt(vrt_path, band=1) + r0 = read_vrt(vrt_path, band=0, band_nodata="first") + r1 = read_vrt(vrt_path, band=1, band_nodata="first") assert r0.dtype == np.float64 assert r1.dtype == np.float64 assert r0.attrs.get('nodata') == 65535.0 @@ -225,5 +232,5 @@ def test_multiband_attrs_nodata_still_band0(tmp_path): The pixel-level fix must not change that contract. """ vrt_path = _write_two_band_per_band_nodata_vrt(tmp_path) - r = read_vrt(vrt_path) + r = read_vrt(vrt_path, band_nodata="first") assert r.attrs.get('nodata') == 65535.0 From 7dda3600f8e56557d471965d3c33bd7e388f80c1 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Mon, 18 May 2026 19:50:46 -0700 Subject: [PATCH 2/2] geotiff: validate band_nodata value at the VRT boundary read_vrt accepted any string for band_nodata, but the registered mixed-band-metadata check (issue #1987 PR 5) only treats 'first' as the opt-out and silently falls back to strict mode for anything else. A typo like band_nodata='firs' looked like an explicit opt-out at the call site but raised MixedBandMetadataError at run time. Reject unknown values at the read_vrt entry point so the error surfaces where the typo lives, matching the missing_sources value-validation pattern already in this function. open_geotiff and read_geotiff_dask route through read_vrt for VRT sources, so one validation site covers all three public entry points. Also fills in the read_vrt docstring entry for band_nodata, which had been left undocumented since #2031 staged the kwarg. Refs #1987. --- xrspatial/geotiff/_backends/vrt.py | 27 ++++++++++++++++++ ...st_mixed_band_metadata_fail_closed_1987.py | 28 +++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/xrspatial/geotiff/_backends/vrt.py b/xrspatial/geotiff/_backends/vrt.py index 9d57b5f81..18baf20aa 100644 --- a/xrspatial/geotiff/_backends/vrt.py +++ b/xrspatial/geotiff/_backends/vrt.py @@ -85,6 +85,16 @@ def read_vrt(source: str, *, sentinel (or zero on integer bands without a sentinel). ``XRSPATIAL_GEOTIFF_STRICT=1`` forces a raise across the whole module regardless of this kwarg. + band_nodata : {'first', None}, optional + Opt-out for the fail-closed mixed-band-metadata check (issue + #1987 PR 5). ``None`` (the default) rejects a VRT whose bands + declare disagreeing per-band ```` sentinels with + ``MixedBandMetadataError``; flattening to one value would + otherwise let one band's valid pixels collide with another + band's sentinel after the flatten. Pass ``band_nodata='first'`` + to keep the legacy flatten-to-band-0 semantics explicitly. Any + other value raises ``ValueError`` at the boundary so typos + surface up front instead of degrading silently into strict mode. mask_nodata : bool, default True If True, run the integer-sentinel-to-NaN promotion on the assembled mosaic. If False, skip it and keep the source dtype @@ -154,6 +164,23 @@ def read_vrt(source: str, *, f"missing_sources must be 'warn' or 'raise', got " f"{missing_sources!r}") + # ``band_nodata`` accepts only ``None`` (strict, the default) and + # ``'first'`` (legacy flatten-to-band-0 opt-out for the fail-closed + # mixed-band-metadata check, issue #1987 PR 5). Any other value would + # silently degrade to strict mode because the registered check treats + # anything other than ``'first'`` as "no opt-out", which means a typo + # like ``band_nodata='firs'`` looks like opt-out at the call site but + # raises ``MixedBandMetadataError`` at run time. Mirror the + # ``missing_sources`` value-validation pattern above so the typo + # surfaces at the boundary instead. + if band_nodata not in (None, 'first'): + raise ValueError( + f"band_nodata must be None or 'first', got {band_nodata!r}. " + f"Pass ``band_nodata='first'`` to opt back into the legacy " + f"flatten-to-first-band semantics for VRT sources with " + f"disagreeing per-band nodata sentinels, or drop the kwarg " + f"to keep the fail-closed default. See issue #1987.") + # Lazy chunked path (issue #1814). The eager call below materialises # the full mosaic on host RAM and then wraps the array via # ``.chunk()``, so chunks= gave no memory protection and gpu=True + diff --git a/xrspatial/geotiff/tests/test_mixed_band_metadata_fail_closed_1987.py b/xrspatial/geotiff/tests/test_mixed_band_metadata_fail_closed_1987.py index 24323d046..c8eff7ec7 100644 --- a/xrspatial/geotiff/tests/test_mixed_band_metadata_fail_closed_1987.py +++ b/xrspatial/geotiff/tests/test_mixed_band_metadata_fail_closed_1987.py @@ -254,6 +254,34 @@ def test_mixed_band_metadata_error_subclasses_base(tmp_path): read_vrt(vrt_path) +def test_read_vrt_band_nodata_rejects_unknown_value(tmp_path): + """Typos like ``band_nodata='firs'`` raise at the boundary. + + Without the value check, an unknown string degrades to strict mode + (the registered check treats anything other than ``'first'`` as "no + opt-out") and the caller sees ``MixedBandMetadataError`` from a call + site that looked like an explicit opt-out. Mirrors the + ``missing_sources`` value-validation pattern in ``read_vrt``. + """ + vrt_path = _write_mixed_band_vrt(tmp_path) + with pytest.raises(ValueError, match="band_nodata must be None or 'first'"): + read_vrt(vrt_path, band_nodata='firs') + + +def test_open_geotiff_band_nodata_rejects_unknown_value(tmp_path): + """``open_geotiff`` surfaces the same value-validation on VRT sources.""" + vrt_path = _write_mixed_band_vrt(tmp_path) + with pytest.raises(ValueError, match="band_nodata must be None or 'first'"): + open_geotiff(vrt_path, band_nodata='legacy') + + +def test_read_geotiff_dask_band_nodata_rejects_unknown_value(tmp_path): + """``read_geotiff_dask`` forwards the value-validation on VRT sources.""" + vrt_path = _write_mixed_band_vrt(tmp_path) + with pytest.raises(ValueError, match="band_nodata must be None or 'first'"): + read_geotiff_dask(vrt_path, chunks=1, band_nodata='banana') + + def _wrap_2d(arr): """Wrap a 2D numpy array as a minimal DataArray for ``to_geotiff``.""" import xarray as xr