diff --git a/.claude/sweep-metadata-state.csv b/.claude/sweep-metadata-state.csv index dedc2336..6f8ebcf6 100644 --- a/.claude/sweep-metadata-state.csv +++ b/.claude/sweep-metadata-state.csv @@ -1,3 +1,4 @@ module,last_inspected,issue,severity_max,categories_found,notes +geotiff,2026-05-11,1598,MEDIUM,4,"read_vrt(band=N) used vrt.bands[0].nodata for attrs and integer-promotion mask, mis-masking band N reads with per-band sentinels (#1598)." geotiff,2026-05-11,1597,HIGH,4;5,read_geotiff_dask dropped nodata mask when sentinel only hit non-first chunks (per-chunk dtype divergence; #1597). reproject,2026-05-10,1572;1573,HIGH,1;3;4,geoid_height_raster dropped input attrs and used dims[-2:] for 3D inputs (#1572). reproject/merge ignored nodatavals (rasterio convention) when rioxarray absent (#1573). Fixed in same branch. diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index 2ad5c8d9..e66e6dc9 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -2769,9 +2769,18 @@ def read_vrt(source: str, *, dtype=None, window=None, attrs['crs_wkt'] = vrt.crs_wkt if vrt.raster_type == 'point': attrs['raster_type'] = 'point' + # When a specific band is selected, source its nodata from that + # band's instead of band 0's. Otherwise multi-band + # VRTs with per-band sentinels would mis-mask the read: attrs would + # advertise band 0's sentinel, the integer-promotion block below + # would mask against band 0's sentinel, and band N's actual nodata + # pixels would survive as literal integers. See issue #1598. + # ``band`` has already been validated by ``_vrt.read_vrt`` as + # 0 <= band < len(vrt.bands), so a simple lookup is safe here. nodata = None if vrt.bands: - nodata = vrt.bands[0].nodata + band_idx_for_nodata = band if band is not None else 0 + nodata = vrt.bands[band_idx_for_nodata].nodata if nodata is not None: attrs['nodata'] = nodata diff --git a/xrspatial/geotiff/_vrt.py b/xrspatial/geotiff/_vrt.py index 7afba86e..a29ae767 100644 --- a/xrspatial/geotiff/_vrt.py +++ b/xrspatial/geotiff/_vrt.py @@ -242,6 +242,23 @@ def read_vrt(vrt_path: str, *, window=None, vrt_dir = os.path.dirname(os.path.abspath(vrt_path)) vrt = parse_vrt(xml_str, vrt_dir) + # Validate ``band`` against the parsed band count. Python list + # indexing would silently accept negative values (``vrt.bands[-1]`` + # returns the last band) and raise an opaque ``IndexError`` for + # out-of-range positive values, neither of which is the contract the + # public API documents (``band`` is a 0-based positive index). Reject + # both up front with a clear ``ValueError`` so callers do not get + # band-N data paired with band-0's nodata sentinel or a downstream + # IndexError from deep in the read path. + if band is not None: + if not isinstance(band, (int, np.integer)) or isinstance(band, bool): + raise ValueError( + f"band must be a non-negative int, got {band!r}") + if band < 0 or band >= len(vrt.bands): + raise ValueError( + f"band index {band} out of range for VRT with " + f"{len(vrt.bands)} band(s)") + if window is not None: r0, c0, r1, c1 = window r0 = max(0, r0) diff --git a/xrspatial/geotiff/tests/test_vrt_band_nodata_1598.py b/xrspatial/geotiff/tests/test_vrt_band_nodata_1598.py new file mode 100644 index 00000000..44fdea84 --- /dev/null +++ b/xrspatial/geotiff/tests/test_vrt_band_nodata_1598.py @@ -0,0 +1,141 @@ +"""Regression tests for issue #1598. + +``read_vrt(path, band=N)`` used to always source the nodata sentinel +from ``vrt.bands[0]`` rather than the requested band, so a multi-band +VRT with per-band ```` would mis-mask reads of any band +other than band 0: + +* ``attrs['nodata']`` advertised band 0's sentinel (wrong). +* The integer-to-float64 promotion ran against band 0's sentinel, so + band N's actual nodata pixels stayed as literal integers. +* The returned dtype was integer when it should have been float64. + +The fix uses ``vrt.bands[band].nodata`` when a band is selected. +""" +from __future__ import annotations + +import numpy as np +import pytest + +from xrspatial.geotiff import read_vrt +from xrspatial.geotiff._writer import write + + +def _write_two_band_per_band_nodata_vrt(tmp_path): + """Two single-band uint16 sources, each with a distinct nodata + sentinel, exposed as bands 1 and 2 of a hand-rolled VRT. + """ + band0 = np.array([[1, 2], [3, 65535]], dtype=np.uint16) + band1 = np.array([[7, 8], [9, 65000]], dtype=np.uint16) + p0 = str(tmp_path / 'vrt_band0_1598.tif') + p1 = str(tmp_path / 'vrt_band1_1598.tif') + write(band0, p0, nodata=65535, compression='none', tiled=False) + write(band1, p1, nodata=65000, compression='none', tiled=False) + + vrt_path = str(tmp_path / 'two_band_per_band_nodata_1598.vrt') + vrt_xml = f""" + 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 + + 65535 + + {p0} + 1 + + + + + + 65000 + + {p1} + 1 + + + + +""" + with open(vrt_path, 'w') as f: + f.write(vrt_xml) + return vrt_path + + +def test_read_vrt_band0_uses_band0_nodata(tmp_path): + """Sanity check the band-0 selection still works after the fix. + + Confirms the refactor did not flip the index. + """ + vrt_path = _write_two_band_per_band_nodata_vrt(tmp_path) + r = read_vrt(vrt_path, band=0) + assert r.dtype == np.float64 + assert r.attrs.get('nodata') == 65535.0 + assert np.isnan(r.values[1, 1]) + assert r.values[0, 0] == 1 + + +def test_read_vrt_band1_uses_band1_nodata(tmp_path): + """The previously-broken case: band=1 must use band 1's sentinel. + + Before the fix this returned dtype=uint16 with values=[[7,8], + [9,65000]] and attrs['nodata']=65535. + """ + vrt_path = _write_two_band_per_band_nodata_vrt(tmp_path) + r = read_vrt(vrt_path, band=1) + assert r.dtype == np.float64, ( + "band=1 read kept uint16 dtype; per-band nodata regression." + ) + assert r.attrs.get('nodata') == 65000.0, ( + f"attrs['nodata'] was {r.attrs.get('nodata')}, " + f"expected 65000 from band 1's ." + ) + assert np.isnan(r.values[1, 1]), ( + "band 1's sentinel pixel was not NaN-masked; " + "promotion ran against the wrong sentinel." + ) + assert r.values[0, 0] == 7 + assert r.values[1, 0] == 9 + + +def test_read_vrt_no_band_keeps_band0_nodata_attr(tmp_path): + """Unselected reads still surface band 0's sentinel. + + Multi-band VRTs with mixed sentinels return all bands stacked, and + the canonical attr cannot encode per-band values; advertising + band 0's sentinel matches the prior behavior and the documented + "first band wins" contract for multi-band reads. + """ + vrt_path = _write_two_band_per_band_nodata_vrt(tmp_path) + r = read_vrt(vrt_path) + assert r.attrs.get('nodata') == 65535.0 + + +def test_read_vrt_negative_band_raises(tmp_path): + """Negative band indices used to be silently accepted via Python + list indexing (``vrt.bands[-1]`` returned the last band) while the + public reader's nodata lookup rejected them, producing band-N data + with no nodata sentinel. They are now a clear ValueError up front. + """ + vrt_path = _write_two_band_per_band_nodata_vrt(tmp_path) + with pytest.raises(ValueError, match="band"): + read_vrt(vrt_path, band=-1) + + +def test_read_vrt_out_of_range_band_raises(tmp_path): + """Out-of-range band indices used to raise IndexError from deep in + the read path. They are now a ValueError that names the available + band count. + """ + vrt_path = _write_two_band_per_band_nodata_vrt(tmp_path) + with pytest.raises(ValueError, match="out of range"): + read_vrt(vrt_path, band=5) + + +def test_read_vrt_non_integer_band_raises(tmp_path): + """A non-int ``band`` would previously have raised TypeError on the + list index. ValueError here matches the rest of the input + validation surface. + """ + vrt_path = _write_two_band_per_band_nodata_vrt(tmp_path) + with pytest.raises(ValueError, match="band"): + read_vrt(vrt_path, band="1") + with pytest.raises(ValueError, match="band"): + read_vrt(vrt_path, band=True)