From 1fa6ca9b81fa2a610bba24feab41e11ea91b33a8 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Tue, 19 May 2026 07:34:30 -0700 Subject: [PATCH 1/2] geotiff: thread masked decision through _set_nodata_attrs (#2092) `attrs['masked_nodata']` was inferred purely from the final array dtype. With `mask_nodata=False` on a float file with a non-NaN sentinel, the masking step was skipped, the buffer kept the literal sentinel pixels, but the attr still claimed True; downstream code that trusted the attr treated those pixels as already-NaN. Change `_set_nodata_attrs` to take an explicit `masked: bool` argument and have every read path pass the actual decision. The eager / dask / GPU paths compute it as `mask_nodata and final_dtype.kind == 'f'`. VRT keeps the dtype-driven rule because its internal reader inlines float NaN-masking unconditionally. Closes #2092 --- xrspatial/geotiff/__init__.py | 16 +- xrspatial/geotiff/_attrs.py | 38 +-- xrspatial/geotiff/_backends/dask.py | 17 +- xrspatial/geotiff/_backends/gpu.py | 34 ++- xrspatial/geotiff/_backends/vrt.py | 27 +- .../tests/test_masked_nodata_attr_2092.py | 263 ++++++++++++++++++ .../tests/test_nodata_semantics_split_1988.py | 30 +- 7 files changed, 374 insertions(+), 51 deletions(-) create mode 100644 xrspatial/geotiff/tests/test_masked_nodata_attr_2092.py diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index 442d7ec3f..6efab881f 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -618,10 +618,18 @@ def open_geotiff(source: str | BinaryIO, *, _validate_dtype_cast(arr.dtype, target) arr = arr.astype(target) - # Set ``attrs['nodata']`` + ``attrs['masked_nodata']`` after the - # mask + optional dtype cast so ``masked_nodata`` reflects the - # final array dtype (issue #1988). - _set_nodata_attrs(attrs, nodata, array_dtype=arr.dtype) + # ``attrs['masked_nodata']`` reflects whether the function + # actually replaced sentinel pixels with NaN (#2092). The pre-fix + # implementation inferred it from final dtype, which lied when + # ``mask_nodata=False`` left literal sentinel values in a float + # buffer. True iff the caller opted into masking and the result + # is a float-dtype buffer (the mask block above either ran the + # float-replacement step or promoted an int input to float64; + # either way the buffer holds NaN where the sentinel used to be). + _set_nodata_attrs( + attrs, nodata, + masked=(mask_nodata and arr.dtype.kind == 'f'), + ) if arr.ndim == 3: dims = ['y', 'x', 'band'] diff --git a/xrspatial/geotiff/_attrs.py b/xrspatial/geotiff/_attrs.py index d0da7c148..5792053c8 100644 --- a/xrspatial/geotiff/_attrs.py +++ b/xrspatial/geotiff/_attrs.py @@ -443,7 +443,7 @@ def _extent_to_window(transform, file_height, file_width, return (row_start, col_start, row_stop, col_stop) -def _set_nodata_attrs(attrs: dict, nodata, *, array_dtype) -> None: +def _set_nodata_attrs(attrs: dict, nodata, *, masked: bool) -> None: """Set ``attrs['nodata']`` and ``attrs['masked_nodata']`` on a read. Splits the two meanings previously fused into ``attrs['nodata']`` @@ -452,25 +452,31 @@ def _set_nodata_attrs(attrs: dict, nodata, *, array_dtype) -> None: * ``attrs['nodata']`` -- declared file sentinel, as a scalar of the source dtype. Set whenever the source declared one, regardless of whether the array is float-with-NaN or int-with-sentinels. - * ``attrs['masked_nodata']`` -- boolean flag. ``True`` iff the in- - memory array has been NaN-masked (i.e. it is float dtype and the - reader's sentinel-to-NaN step ran). ``False`` iff the array still - carries the literal integer sentinel value. - - Callers pass ``array_dtype`` as the final post-mask, post-cast dtype - of the array that will be wrapped in the returned DataArray. The - float/non-float split drives the ``masked_nodata`` value: any float - output is treated as NaN-aware (NaN is the sentinel proxy), any - integer output still carries the raw sentinel. - - ``masked_nodata`` is only emitted when ``nodata is not None``. With - no declared sentinel, the flag is meaningless and its absence is the - signal. + * ``attrs['masked_nodata']`` -- boolean flag. ``True`` iff the + reader replaced sentinel pixels with NaN (or the buffer is + otherwise NaN-aware as a result of the reader's masking step). + ``False`` iff the literal sentinel values are still present in + the buffer. + + Callers pass ``masked`` as the actual decision made by the read + path. The pre-#2092 implementation inferred this from the final + array dtype, which lied when ``mask_nodata=False`` left literal + sentinel values in a float buffer; downstream code that trusted + the attr treated those literal values as already-NaN. The eager, + dask, and GPU paths compute ``masked`` as + ``mask_nodata and final_dtype.kind == 'f'``. The VRT path inlines + NaN-masking on float sources unconditionally, so it keeps the + dtype-driven rule (``final_dtype.kind == 'f'``) which is still + accurate for that backend. See issue #2092. + + ``masked_nodata`` is only emitted when ``nodata is not None``. + With no declared sentinel, the flag is meaningless and its + absence is the signal. """ if nodata is None: return attrs['nodata'] = nodata - attrs['masked_nodata'] = bool(np.dtype(array_dtype).kind == 'f') + attrs['masked_nodata'] = bool(masked) def _validate_read_geo_info( diff --git a/xrspatial/geotiff/_backends/dask.py b/xrspatial/geotiff/_backends/dask.py index bfc7851c0..3c9a2ce97 100644 --- a/xrspatial/geotiff/_backends/dask.py +++ b/xrspatial/geotiff/_backends/dask.py @@ -320,12 +320,17 @@ def read_geotiff_dask(source: str, *, attrs = {} _populate_attrs_from_geo_info(attrs, geo_info, window=window) - # ``masked_nodata`` reflects the declared dask graph dtype: a float - # graph means every chunk runs the sentinel-to-NaN promotion (or - # already arrived as float), so the in-memory array is NaN-masked. - # An integer graph means each chunk keeps the literal sentinel - # value. See issue #1988. - _set_nodata_attrs(attrs, nodata_attr, array_dtype=target_dtype) + # ``masked_nodata`` reflects whether per-chunk masking actually + # runs in the lazy graph (#2092). With ``mask_nodata=False`` each + # chunk skips the sentinel-to-NaN step, so even if the graph + # dtype happens to be float (e.g. caller-supplied ``dtype=float64`` + # on an int file), the in-memory buffers hold literal sentinel + # values. True iff the caller opted into masking and the graph + # dtype is float. + _set_nodata_attrs( + attrs, nodata_attr, + masked=(mask_nodata and target_dtype.kind == 'f'), + ) if isinstance(chunks, int): ch_h = ch_w = chunks diff --git a/xrspatial/geotiff/_backends/gpu.py b/xrspatial/geotiff/_backends/gpu.py index 4333a00e6..5c98dab42 100644 --- a/xrspatial/geotiff/_backends/gpu.py +++ b/xrspatial/geotiff/_backends/gpu.py @@ -421,11 +421,15 @@ def read_geotiff_gpu(source: str, *, target = np.dtype(dtype) _validate_dtype_cast(np.dtype(str(arr_gpu.dtype)), target) arr_gpu = arr_gpu.astype(target) - # ``attrs['nodata']`` + ``attrs['masked_nodata']`` reflect the - # post-mask, post-cast array dtype (issue #1988). + # ``attrs['masked_nodata']`` reflects whether the function + # actually replaced sentinel pixels with NaN (#2092). With + # ``mask_nodata=False`` the GPU mask kernel above is + # skipped, so the buffer holds literal sentinel values + # even if the final dtype is float. _set_nodata_attrs( attrs, nodata, - array_dtype=np.dtype(str(arr_gpu.dtype)), + masked=(mask_nodata + and np.dtype(str(arr_gpu.dtype)).kind == 'f'), ) # ``read_to_array`` already applied window + band slicing, so # ``arr_gpu`` is at output shape. Compute coords for that @@ -740,10 +744,15 @@ def _read_once(): attrs = {} _populate_attrs_from_geo_info(attrs, geo_info, window=window) - # ``attrs['nodata']`` + ``attrs['masked_nodata']`` reflect the - # post-mask, post-cast array dtype (issue #1988). + # ``attrs['masked_nodata']`` reflects whether the function + # actually replaced sentinel pixels with NaN (#2092). The GPU + # mask kernel runs only when ``mask_nodata=True`` and a sentinel + # is declared, so the post-cast float dtype alone is not enough + # to claim masking. _set_nodata_attrs( - attrs, nodata, array_dtype=np.dtype(str(arr_gpu.dtype)), + attrs, nodata, + masked=(mask_nodata + and np.dtype(str(arr_gpu.dtype)).kind == 'f'), ) # Apply window/band slicing post-decode. Coords are derived from the @@ -1209,9 +1218,16 @@ def _chunk_task(meta, r0, c0, r1, c1): attrs = {} _populate_attrs_from_geo_info(attrs, geo_info, window=window) - # ``masked_nodata`` reflects the declared dask graph dtype; mirrors - # the dask+numpy backend contract (issue #1988). - _set_nodata_attrs(attrs, nodata, array_dtype=declared_dtype) + # ``masked_nodata`` reflects whether per-chunk masking actually + # runs in the lazy graph (#2092); mirrors the dask+numpy backend + # contract. With ``mask_nodata=False`` ``declared_dtype`` stays + # equal to ``file_dtype`` (see the float-promotion gate earlier + # in this function), so the rule below is equivalent to "graph + # dtype is float AND the caller opted into masking." + _set_nodata_attrs( + attrs, nodata, + masked=(mask_nodata and declared_dtype.kind == 'f'), + ) if name is None: import os diff --git a/xrspatial/geotiff/_backends/vrt.py b/xrspatial/geotiff/_backends/vrt.py index 9d57b5f81..adf586687 100644 --- a/xrspatial/geotiff/_backends/vrt.py +++ b/xrspatial/geotiff/_backends/vrt.py @@ -316,11 +316,20 @@ def read_vrt(source: str, *, _validate_dtype_cast(np.dtype(str(arr.dtype)), target) arr = arr.astype(target) - # Record ``nodata`` + ``masked_nodata`` from the final array dtype - # (issue #1988). On GPU the dtype is read off the CuPy array via - # ``str(arr.dtype)`` so ``np.dtype`` accepts it without a CuPy - # dependency in ``_set_nodata_attrs``. - _set_nodata_attrs(attrs, nodata, array_dtype=np.dtype(str(arr.dtype))) + # Record ``nodata`` + ``masked_nodata``. The VRT internal reader + # NaN-masks float source arrays inline (see ``_vrt._read_data``) + # regardless of the ``mask_nodata`` kwarg, so a float final dtype + # really does mean "sentinel pixels are NaN in the buffer." The + # ``mask_nodata`` kwarg only gates the integer-sentinel mask + # helper that runs above; when it's skipped on an int source the + # final dtype stays integer and the rule below correctly reports + # False. The dtype-driven rule is therefore still accurate for + # this backend; the per-backend explanation is in + # ``_set_nodata_attrs`` (#2092). + _set_nodata_attrs( + attrs, nodata, + masked=(np.dtype(str(arr.dtype)).kind == 'f'), + ) if arr.ndim == 3: dims = ['y', 'x', 'band'] @@ -681,7 +690,13 @@ def _read_vrt_chunked(source, *, window, band, name, chunks, gpu, dtype, if vrt.bands: band_idx_for_nodata = band if band is not None else 0 nodata_meta = vrt.bands[band_idx_for_nodata].nodata - _set_nodata_attrs(attrs, nodata_meta, array_dtype=final_dtype) + # VRT chunked path: the per-task VRT reader inlines float + # NaN-masking unconditionally, so a float graph dtype really does + # mean "buffer is NaN-aware." See ``_set_nodata_attrs`` (#2092). + _set_nodata_attrs( + attrs, nodata_meta, + masked=(final_dtype.kind == 'f'), + ) # Static hole detection: mirror the eager-path ``attrs['vrt_holes']`` # contract (#1734) by scanning every source referenced in the parsed diff --git a/xrspatial/geotiff/tests/test_masked_nodata_attr_2092.py b/xrspatial/geotiff/tests/test_masked_nodata_attr_2092.py new file mode 100644 index 000000000..22dca8d65 --- /dev/null +++ b/xrspatial/geotiff/tests/test_masked_nodata_attr_2092.py @@ -0,0 +1,263 @@ +"""``attrs['masked_nodata']`` must reflect whether masking ran (#2092). + +The pre-fix ``_set_nodata_attrs`` set ``attrs['masked_nodata']`` +purely from the final array dtype: any float output got ``True``. +With ``mask_nodata=False`` on a float file with a non-NaN sentinel +(e.g. -9999), the masking step is skipped, the buffer keeps the +literal sentinel pixels, but the attr still claimed ``True``. Any +downstream code that trusted the attr ("NaN means missing, sentinels +have been replaced") then treated the -9999 pixels as already-masked +valid data. + +The fix threads the actual masking decision through to the helper +and computes it as ``mask_nodata and final_dtype.kind == 'f'`` for +the eager / dask / GPU paths. VRT keeps the dtype-driven rule +because its internal reader inlines float NaN-masking +unconditionally; with ``mask_nodata=False`` on an integer source +the dtype stays integer and the rule correctly reports False. + +These tests cover eager numpy, dask, VRT, and the GPU path (gated +on cuda) for both directions: literal sentinel preserved when +masking is opt-out, NaN replacement when masking is opt-in. +""" +from __future__ import annotations + +import importlib.util + +import numpy as np +import pytest +import xarray as xr + +from xrspatial.geotiff import open_geotiff, read_geotiff_dask, to_geotiff + + +def _gpu_available() -> bool: + if importlib.util.find_spec("cupy") is None: + return False + try: + import cupy + return bool(cupy.cuda.is_available()) + except Exception: + return False + + +_HAS_GPU = _gpu_available() +_gpu_only = pytest.mark.skipif(not _HAS_GPU, reason="cupy + CUDA required") + + +def _make_float_raster_with_nodata(path): + """Float32 raster with -9999 sentinel embedded in two cells.""" + da = xr.DataArray( + np.array( + [[1.0, 2.0, -9999.0], + [4.0, -9999.0, 6.0]], + dtype=np.float32, + ), + coords={'y': np.array([0.5, 1.5]), 'x': np.array([0.5, 1.5, 2.5])}, + dims=('y', 'x'), + attrs={'nodata': -9999.0}, + ) + to_geotiff(da, path) + return da + + +# --- Eager numpy path --------------------------------------------------- + + +def test_eager_mask_nodata_false_reports_false(tmp_path): + """Float file + nodata=-9999 + mask_nodata=False: buffer keeps + literal sentinel pixels; attr must say False so downstream code + knows the sentinel is still present (#2092 bug case).""" + path = str(tmp_path / "tmp_2092_eager_unmasked.tif") + _make_float_raster_with_nodata(path) + + out = open_geotiff(path, mask_nodata=False) + assert out.attrs.get('nodata') == -9999.0 + assert out.attrs.get('masked_nodata') is False, ( + f"buffer holds literal -9999 pixels but attrs say " + f"masked_nodata={out.attrs.get('masked_nodata')!r}") + # And the literal sentinel really is in the buffer. + assert -9999.0 in out.values + + +def test_eager_mask_nodata_true_reports_true(tmp_path): + """Default behaviour: float file + nodata=-9999 + mask_nodata=True + replaces -9999 with NaN. Attr must say True. Pin the canonical + direction so the fix doesn't accidentally flip it everywhere.""" + path = str(tmp_path / "tmp_2092_eager_masked.tif") + _make_float_raster_with_nodata(path) + + out = open_geotiff(path) # mask_nodata defaults to True + assert out.attrs.get('nodata') == -9999.0 + assert out.attrs.get('masked_nodata') is True + # -9999 replaced with NaN. + assert np.isnan(out.values).sum() == 2 + assert -9999.0 not in out.values[~np.isnan(out.values)] + + +def test_eager_int_file_mask_nodata_true_no_match_reports_false(tmp_path): + """Int file + mask_nodata=True but sentinel is out of range, so + no pixel matches and no cast to float happens. Buffer stays int + with literal sentinel; attr must say False. The pre-fix rule + already got this right (dtype is int -> False); regression + check that the new rule doesn't break it.""" + # uint16 file with a -9999 declared sentinel that cannot match + # any pixel (uint16 range is 0..65535). + da = xr.DataArray( + np.array([[10, 20, 30], [40, 50, 60]], dtype=np.uint16), + coords={'y': np.array([0.5, 1.5]), 'x': np.array([0.5, 1.5, 2.5])}, + dims=('y', 'x'), + attrs={'nodata': -9999}, + ) + path = str(tmp_path / "tmp_2092_int_oor_sentinel.tif") + to_geotiff(da, path) + + out = open_geotiff(path) + assert out.attrs.get('nodata') == -9999 + # No pixel matched, no cast, buffer stays uint16. + assert out.dtype.kind == 'u' + assert out.attrs.get('masked_nodata') is False + + +# --- Dask path ---------------------------------------------------------- + + +def test_dask_mask_nodata_false_reports_false(tmp_path): + path = str(tmp_path / "tmp_2092_dask_unmasked.tif") + _make_float_raster_with_nodata(path) + + out = read_geotiff_dask(path, chunks=2, mask_nodata=False) + assert out.attrs.get('masked_nodata') is False + computed = out.values + assert -9999.0 in computed + + +def test_dask_mask_nodata_true_reports_true(tmp_path): + path = str(tmp_path / "tmp_2092_dask_masked.tif") + _make_float_raster_with_nodata(path) + + out = read_geotiff_dask(path, chunks=2) + assert out.attrs.get('masked_nodata') is True + computed = out.values + assert np.isnan(computed).sum() == 2 + + +def test_dask_explicit_float_dtype_mask_off_reports_false(tmp_path): + """Edge: caller passes ``dtype=np.float64`` on an int file with + ``mask_nodata=False``. Old rule would have said + ``masked_nodata=True`` (final dtype is float) even though the + buffer holds literal sentinel values. New rule correctly says + False because the caller opted out of masking.""" + da = xr.DataArray( + np.array([[10, 20, 30], [40, 50, 60]], dtype=np.int16), + coords={'y': np.array([0.5, 1.5]), 'x': np.array([0.5, 1.5, 2.5])}, + dims=('y', 'x'), + attrs={'nodata': 30}, + ) + path = str(tmp_path / "tmp_2092_dask_int_to_float_unmasked.tif") + to_geotiff(da, path) + + out = read_geotiff_dask( + path, chunks=2, mask_nodata=False, dtype=np.float64, + ) + assert out.dtype == np.float64 + assert out.attrs.get('masked_nodata') is False + computed = out.values + # The literal 30 is still in the float buffer (cast, not masked). + assert 30.0 in computed + + +# --- VRT path ----------------------------------------------------------- + + +def _write_int_vrt(tmp_path, src_basename, vrt_basename, sentinel=30): + """Build a single-band VRT pointing at an int16 source TIFF. + + The VRT XML follows the working pattern from + ``test_vrt_multiband_int_nodata_1611``: ``GeoTransform`` plus + explicit ``SrcRect`` and ``DstRect`` are required by the + in-repo VRT reader; without them the reader returns a zero-fill + buffer instead of decoding the source. + """ + import tifffile + src = str(tmp_path / src_basename) + tifffile.imwrite(src, np.array( + [[10, 20, 30], [40, 50, 60]], dtype=np.int16, + ), metadata=None) + vrt = str(tmp_path / vrt_basename) + vrt_xml = f""" + 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 + + {sentinel} + + {src} + 1 + + + + + +""" + with open(vrt, 'w') as fh: + fh.write(vrt_xml) + return vrt + + +def test_vrt_int_source_mask_nodata_false_reports_false(tmp_path): + """VRT with int source + mask_nodata=False: buffer stays int with + literal sentinel; attr must say False. (Float VRT sources get + inline NaN-masking from the VRT internal reader regardless of + mask_nodata, so the int case is where the kwarg actually changes + behaviour. This is the bug-class test for the VRT path.)""" + vrt = _write_int_vrt( + tmp_path, + "tmp_2092_vrt_src.tif", + "tmp_2092_vrt_unmasked.vrt", + ) + out = open_geotiff(vrt, mask_nodata=False) + assert out.dtype.kind == 'i', f"expected int dtype, got {out.dtype}" + assert out.attrs.get('masked_nodata') is False + # The literal sentinel is still in the buffer. + assert 30 in out.values + + +def test_vrt_int_source_mask_nodata_true_reports_true(tmp_path): + """VRT with int source + mask_nodata=True: helper promotes to + float64 and replaces sentinel pixels with NaN. Attr says True. + Baseline direction so the fix doesn't silently flip the canonical + behaviour.""" + vrt = _write_int_vrt( + tmp_path, + "tmp_2092_vrt_src2.tif", + "tmp_2092_vrt_masked.vrt", + ) + out = open_geotiff(vrt) # mask_nodata defaults to True + assert out.dtype == np.float64, ( + f"expected float64 promotion, got {out.dtype}") + assert out.attrs.get('masked_nodata') is True + assert np.isnan(out.values).sum() == 1 # the lone 30 cell + + +# --- GPU path ----------------------------------------------------------- + + +@_gpu_only +def test_gpu_mask_nodata_false_reports_false(tmp_path): + from xrspatial.geotiff import read_geotiff_gpu + + path = str(tmp_path / "tmp_2092_gpu_unmasked.tif") + _make_float_raster_with_nodata(path) + + out = read_geotiff_gpu(path, mask_nodata=False) + assert out.attrs.get('masked_nodata') is False + + +@_gpu_only +def test_gpu_mask_nodata_true_reports_true(tmp_path): + from xrspatial.geotiff import read_geotiff_gpu + + path = str(tmp_path / "tmp_2092_gpu_masked.tif") + _make_float_raster_with_nodata(path) + + out = read_geotiff_gpu(path) + assert out.attrs.get('masked_nodata') is True diff --git a/xrspatial/geotiff/tests/test_nodata_semantics_split_1988.py b/xrspatial/geotiff/tests/test_nodata_semantics_split_1988.py index 514038499..9f5768dd3 100644 --- a/xrspatial/geotiff/tests/test_nodata_semantics_split_1988.py +++ b/xrspatial/geotiff/tests/test_nodata_semantics_split_1988.py @@ -506,32 +506,42 @@ def test_int_source_with_out_of_range_sentinel(tmp_path): class TestSetNodataAttrsHelper: - """Direct coverage of :func:`_set_nodata_attrs` in ``_attrs.py``.""" + """Direct coverage of :func:`_set_nodata_attrs` in ``_attrs.py``. - def test_float_dtype_marks_masked(self): + The helper signature changed in #2092 from ``array_dtype`` (dtype + inference) to ``masked`` (explicit decision passed by the + caller). These tests pin the new contract. + """ + + def test_masked_true_marks_masked(self): from xrspatial.geotiff._attrs import _set_nodata_attrs attrs: dict = {} - _set_nodata_attrs(attrs, -9999, array_dtype=np.float64) + _set_nodata_attrs(attrs, -9999, masked=True) assert attrs == {"nodata": -9999, "masked_nodata": True} - def test_int_dtype_marks_unmasked(self): + def test_masked_false_marks_unmasked(self): from xrspatial.geotiff._attrs import _set_nodata_attrs attrs: dict = {} - _set_nodata_attrs(attrs, -9999, array_dtype=np.uint16) + _set_nodata_attrs(attrs, -9999, masked=False) assert attrs == {"nodata": -9999, "masked_nodata": False} def test_none_nodata_is_noop(self): from xrspatial.geotiff._attrs import _set_nodata_attrs attrs: dict = {} - _set_nodata_attrs(attrs, None, array_dtype=np.uint16) + _set_nodata_attrs(attrs, None, masked=False) + assert attrs == {} + _set_nodata_attrs(attrs, None, masked=True) assert attrs == {} - def test_accepts_dtype_string(self): - """``array_dtype`` may be a numpy dtype object, type, or string.""" + def test_masked_coerced_to_bool(self): + """Non-bool truthy/falsy values are coerced to bool so the + attr is always a plain Python bool (downstream serialisers + can't always handle numpy scalars or 0/1 ints).""" from xrspatial.geotiff._attrs import _set_nodata_attrs attrs: dict = {} - _set_nodata_attrs(attrs, 0, array_dtype="float32") + _set_nodata_attrs(attrs, 0, masked=np.True_) assert attrs["masked_nodata"] is True + assert type(attrs["masked_nodata"]) is bool attrs = {} - _set_nodata_attrs(attrs, 0, array_dtype="uint8") + _set_nodata_attrs(attrs, 0, masked=0) assert attrs["masked_nodata"] is False From 1953135be8833f285e90249bb8a76ce9da71383a Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Tue, 19 May 2026 07:44:40 -0700 Subject: [PATCH 2/2] Address review nits: VRT pre-cast dtype, docs, tests (#2092) Follow-up to review feedback: * VRT eager (`vrt.py:329-353`) and chunked (`vrt.py:719-738`) paths now read the pre-cast / declared dtype, not the post-cast one. Fixes the corner case where `mask_nodata=False` + `dtype=float64` on an int VRT source claimed `masked_nodata=True` even though the buffer still held literal sentinels. * `attrs_contract.rst` updated to describe the actual rule (replaced pixels with NaN) instead of the old coupled "float dtype + step ran" wording. * `_set_nodata_attrs` docstring now leads with the one-line contract; history moves below. * Two new tests pin the int-source + mask_off + float-cast case for the eager (`__init__.py`) and VRT paths, mirroring the existing dask coverage. --- docs/source/user_guide/attrs_contract.rst | 12 ++--- xrspatial/geotiff/_attrs.py | 40 ++++++++--------- xrspatial/geotiff/_backends/vrt.py | 38 +++++++++------- .../tests/test_masked_nodata_attr_2092.py | 44 +++++++++++++++++++ 4 files changed, 92 insertions(+), 42 deletions(-) diff --git a/docs/source/user_guide/attrs_contract.rst b/docs/source/user_guide/attrs_contract.rst index 4b9d31a6a..4f53d588e 100644 --- a/docs/source/user_guide/attrs_contract.rst +++ b/docs/source/user_guide/attrs_contract.rst @@ -68,11 +68,13 @@ write. ``_resolve_nodata_attr``. * - ``masked_nodata`` - bool - - Paired with ``nodata``. ``True`` when the in-memory array is - float dtype and the reader's sentinel-to-NaN step ran; - ``False`` when the array still carries the literal integer - sentinel. Only set when ``nodata`` is set; absence means no - declared sentinel. + - Paired with ``nodata``. ``True`` when the reader actually + replaced sentinel pixels with NaN (so the buffer is NaN-aware); + ``False`` when the array still carries the literal sentinel + values, including the case where the array is float dtype + because the caller passed ``mask_nodata=False`` together with + ``dtype=float...``. Only set when ``nodata`` is set; absence + means no declared sentinel. See issue #2092. * - ``raster_type`` - str - ``'point'`` when the file declares ``RasterPixelIsPoint``; diff --git a/xrspatial/geotiff/_attrs.py b/xrspatial/geotiff/_attrs.py index 2d90fda85..24a21aa81 100644 --- a/xrspatial/geotiff/_attrs.py +++ b/xrspatial/geotiff/_attrs.py @@ -244,32 +244,28 @@ def _should_restore_nan_sentinel(attrs) -> bool: def _set_nodata_attrs(attrs: dict, nodata, *, masked: bool) -> None: """Set ``attrs['nodata']`` and ``attrs['masked_nodata']`` on a read. - Splits the two meanings previously fused into ``attrs['nodata']`` - (issue #1988): + ``masked`` is the actual mask-decision the read path made: True iff + sentinel pixels in the in-memory buffer have been replaced with NaN + (or the buffer is NaN-aware as a result of the reader's masking + step). False iff the literal sentinel values are still present in + the buffer. + + Contract (splits the two meanings previously fused into + ``attrs['nodata']`` per issue #1988): * ``attrs['nodata']`` -- declared file sentinel, as a scalar of the source dtype. Set whenever the source declared one, regardless of whether the array is float-with-NaN or int-with-sentinels. - * ``attrs['masked_nodata']`` -- boolean flag. ``True`` iff the - reader replaced sentinel pixels with NaN (or the buffer is - otherwise NaN-aware as a result of the reader's masking step). - ``False`` iff the literal sentinel values are still present in - the buffer. - - Callers pass ``masked`` as the actual decision made by the read - path. The pre-#2092 implementation inferred this from the final - array dtype, which lied when ``mask_nodata=False`` left literal - sentinel values in a float buffer; downstream code that trusted - the attr treated those literal values as already-NaN. The eager, - dask, and GPU paths compute ``masked`` as - ``mask_nodata and final_dtype.kind == 'f'``. The VRT path inlines - NaN-masking on float sources unconditionally, so it keeps the - dtype-driven rule (``final_dtype.kind == 'f'``) which is still - accurate for that backend. See issue #2092. - - ``masked_nodata`` is only emitted when ``nodata is not None``. - With no declared sentinel, the flag is meaningless and its - absence is the signal. + * ``attrs['masked_nodata']`` -- the ``masked`` value the caller + passed, coerced to bool. Only emitted when ``nodata is not + None``; absence of the flag means there is no declared sentinel. + + Pre-#2092 the helper inferred ``masked`` from the final array + dtype, which lied when ``mask_nodata=False`` left literal sentinel + values in a float buffer; downstream code that trusted the attr + treated those literal values as already-NaN. The eager, dask, GPU, + and VRT paths now compute ``masked`` as + ``mask_nodata and final_dtype.kind == 'f'``. See issue #2092. """ if nodata is None: return diff --git a/xrspatial/geotiff/_backends/vrt.py b/xrspatial/geotiff/_backends/vrt.py index 96a8ecc3e..2a78d3482 100644 --- a/xrspatial/geotiff/_backends/vrt.py +++ b/xrspatial/geotiff/_backends/vrt.py @@ -320,6 +320,17 @@ def read_vrt(source: str, *, if mask_nodata: arr = _vrt_apply_integer_sentinel_mask(arr, vrt, band) + # Capture pre-cast dtype: ``_vrt._read_data`` NaN-masks float source + # arrays (and int sources feeding a float VRT dataType) inline, and + # ``_vrt_apply_integer_sentinel_mask`` above promotes int-with-sentinel + # to float64 with NaNs written in place. Any of those paths yield a + # float pre-cast dtype; an int pre-cast dtype means literal sentinels + # are still in the buffer. The optional user dtype cast below may + # promote int -> float without masking, so reading dtype after the + # cast would falsely claim ``masked_nodata=True`` (issue #2092 + # follow-up). + pre_cast_dtype = np.dtype(str(arr.dtype)) + # Surface the source GeoTransform in the same rasterio ordering used # by open_geotiff: (pixel_width, 0, origin_x, 0, pixel_height, origin_y). # vrt.geo_transform is GDAL ordering, so reorder. For a windowed read @@ -343,19 +354,9 @@ def read_vrt(source: str, *, _validate_dtype_cast(np.dtype(str(arr.dtype)), target) arr = arr.astype(target) - # Record ``nodata`` + ``masked_nodata``. The VRT internal reader - # NaN-masks float source arrays inline (see ``_vrt._read_data``) - # regardless of the ``mask_nodata`` kwarg, so a float final dtype - # really does mean "sentinel pixels are NaN in the buffer." The - # ``mask_nodata`` kwarg only gates the integer-sentinel mask - # helper that runs above; when it's skipped on an int source the - # final dtype stays integer and the rule below correctly reports - # False. The dtype-driven rule is therefore still accurate for - # this backend; the per-backend explanation is in - # ``_set_nodata_attrs`` (#2092). _set_nodata_attrs( attrs, nodata, - masked=(np.dtype(str(arr.dtype)).kind == 'f'), + masked=(pre_cast_dtype.kind == 'f'), ) if arr.ndim == 3: @@ -717,12 +718,19 @@ def _read_vrt_chunked(source, *, window, band, name, chunks, gpu, dtype, if vrt.bands: band_idx_for_nodata = band if band is not None else 0 nodata_meta = vrt.bands[band_idx_for_nodata].nodata - # VRT chunked path: the per-task VRT reader inlines float - # NaN-masking unconditionally, so a float graph dtype really does - # mean "buffer is NaN-aware." See ``_set_nodata_attrs`` (#2092). + # VRT chunked path: the per-task VRT reader NaN-masks float source + # arrays inline, and ``declared_dtype`` is promoted to float64 only + # when ``mask_nodata`` is on and an integer band has a representable + # sentinel (see the ``declared_dtype`` block earlier in this + # function). Either way, ``declared_dtype.kind == 'f'`` is the + # correct gate for "buffer is NaN-aware." A user-supplied + # ``dtype=`` cast happens on top of the lazy graph above (see + # ``final_dtype`` block) and must not flip this attr, so we read + # the pre-cast ``declared_dtype`` here rather than ``final_dtype`` + # (#2092 follow-up). _set_nodata_attrs( attrs, nodata_meta, - masked=(final_dtype.kind == 'f'), + masked=(declared_dtype.kind == 'f'), ) # Static hole detection: mirror the eager-path ``attrs['vrt_holes']`` diff --git a/xrspatial/geotiff/tests/test_masked_nodata_attr_2092.py b/xrspatial/geotiff/tests/test_masked_nodata_attr_2092.py index 22dca8d65..70b5e5d05 100644 --- a/xrspatial/geotiff/tests/test_masked_nodata_attr_2092.py +++ b/xrspatial/geotiff/tests/test_masked_nodata_attr_2092.py @@ -119,6 +119,29 @@ def test_eager_int_file_mask_nodata_true_no_match_reports_false(tmp_path): assert out.attrs.get('masked_nodata') is False +def test_eager_explicit_float_dtype_mask_off_reports_false(tmp_path): + """Eager path: int file + mask_nodata=False + dtype=float64. + The mask block at __init__.py is gated on ``mask_nodata``, then + the explicit ``dtype=`` cast promotes the int buffer to float + with literal sentinels still in it. The rule must report False + (mask_nodata short-circuits the conjunction). Mirrors the dask + edge in test_dask_explicit_float_dtype_mask_off_reports_false.""" + da = xr.DataArray( + np.array([[10, 20, 30], [40, 50, 60]], dtype=np.int16), + coords={'y': np.array([0.5, 1.5]), 'x': np.array([0.5, 1.5, 2.5])}, + dims=('y', 'x'), + attrs={'nodata': 30}, + ) + path = str(tmp_path / "tmp_2092_eager_int_to_float_unmasked.tif") + to_geotiff(da, path) + + out = open_geotiff(path, mask_nodata=False, dtype=np.float64) + assert out.dtype == np.float64 + assert out.attrs.get('masked_nodata') is False + # The literal 30 is still in the float buffer (cast, not masked). + assert 30.0 in out.values + + # --- Dask path ---------------------------------------------------------- @@ -238,6 +261,27 @@ def test_vrt_int_source_mask_nodata_true_reports_true(tmp_path): assert np.isnan(out.values).sum() == 1 # the lone 30 cell +def test_vrt_int_source_mask_off_with_float_cast_reports_false(tmp_path): + """VRT int source + mask_nodata=False + dtype=float64 cast. + + Initial PR landed a dtype-only VRT rule (``arr.dtype.kind == 'f'``) + which mis-claimed ``masked_nodata=True`` here -- the integer mask + helper is skipped under ``mask_nodata=False``, then the explicit + dtype cast promotes the int buffer to float64 with literal + sentinels still in it. The fix reads the pre-cast dtype, so + pre_cast is int and the attr is False. Pins the follow-up fix.""" + vrt = _write_int_vrt( + tmp_path, + "tmp_2092_vrt_src_cast.tif", + "tmp_2092_vrt_unmasked_cast.vrt", + ) + out = open_geotiff(vrt, mask_nodata=False, dtype=np.float64) + assert out.dtype == np.float64 + assert out.attrs.get('masked_nodata') is False + # The literal 30 is still in the float buffer (cast, not masked). + assert 30.0 in out.values + + # --- GPU path -----------------------------------------------------------