From 4e7f01edebe0f7d713aa2dcc02fc51267d2ecdbb Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Tue, 19 May 2026 11:54:30 -0700 Subject: [PATCH 1/3] geotiff: split overloaded masked_nodata into lifecycle signals (#2135) Adds two additive attrs alongside the existing nodata / masked_nodata pair so consumers can answer separate questions without a single bit that lies in the cast-after-mask case: - nodata_pixels_present: bool, True iff the read window contained any pixel matching the declared sentinel before masking. Emitted by the eager numpy, GPU, and VRT paths; left unset on the dask backends so the lazy contract is preserved (a strict per-chunk reduction would force eager compute). - nodata_dtype_cast: target dtype name (e.g. "float64") when the caller passed an explicit dtype= kwarg. Lets downstream tell float-because-masked from float-because-promoted. The existing nodata and masked_nodata keys keep their post-#2092 semantics; nothing renames or deprecates. attrs_contract.rst documents both new keys. --- docs/source/user_guide/attrs_contract.rst | 17 + xrspatial/geotiff/__init__.py | 68 +++- xrspatial/geotiff/_attrs.py | 53 ++- xrspatial/geotiff/_backends/_gpu_helpers.py | 51 +++ xrspatial/geotiff/_backends/dask.py | 9 + xrspatial/geotiff/_backends/gpu.py | 26 +- xrspatial/geotiff/_backends/vrt.py | 47 ++- xrspatial/geotiff/_vrt.py | 95 +++++ .../tests/test_nodata_lifecycle_attrs_2135.py | 330 ++++++++++++++++++ 9 files changed, 687 insertions(+), 9 deletions(-) create mode 100644 xrspatial/geotiff/tests/test_nodata_lifecycle_attrs_2135.py diff --git a/docs/source/user_guide/attrs_contract.rst b/docs/source/user_guide/attrs_contract.rst index 7bb80a310..f7550ca29 100644 --- a/docs/source/user_guide/attrs_contract.rst +++ b/docs/source/user_guide/attrs_contract.rst @@ -75,6 +75,23 @@ write. because the caller passed ``mask_nodata=False`` together with ``dtype=float...``. Only set when ``nodata`` is set; absence means no declared sentinel. See issue #2092. + * - ``nodata_pixels_present`` + - bool + - Paired with ``nodata``. ``True`` iff the read window contained + at least one pixel matching the declared sentinel before + masking. Lets QA and writer code answer "any nodata in this + tile" without rescanning the buffer. Only emitted by the + eager-numpy, GPU, and VRT paths; the dask path leaves the attr + unset because a strict per-chunk reduction would force eager + ``.compute()``. See issue #2135. + * - ``nodata_dtype_cast`` + - str + - Paired with ``nodata``. Set to the resolved target dtype name + (e.g. ``"float64"``) when the caller passed an explicit + ``dtype=`` kwarg, otherwise absent. Distinguishes + float-because-masked from float-because-promoted, which a + ``masked_nodata`` lookup alone cannot disambiguate. See issue + #2135. * - ``raster_type`` - str - ``'point'`` when the file declares ``RasterPixelIsPoint``; diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index 342811f00..a77793968 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -640,6 +640,12 @@ def open_geotiff(source: str | BinaryIO, *, # write is safe; an extra ``arr.copy()`` would just double peak # memory for a multi-MB raster. nodata = geo_info.nodata + # Track whether any sentinel pixel was present in the read window. + # Only meaningful when a declared sentinel exists; reported via + # ``attrs['nodata_pixels_present']`` (issue #2135) so consumers can + # answer "any nodata in this tile" without rescanning. ``None`` + # keeps the attr out when ``mask_nodata=False`` and no scan happened. + nodata_pixels_present: bool | None = None if nodata is not None and mask_nodata: # When the reader applied MinIsWhite, the sentinel-equality mask # must compare against the inverted sentinel value (issue #1809). @@ -651,7 +657,20 @@ def open_geotiff(source: str | BinaryIO, *, nodata_sentinel = getattr(geo_info, '_mask_nodata', nodata) if arr.dtype.kind == 'f': if nodata_sentinel is not None and not np.isnan(nodata_sentinel): - arr[arr == arr.dtype.type(nodata_sentinel)] = np.nan + mask_f = arr == arr.dtype.type(nodata_sentinel) + nodata_pixels_present = bool(mask_f.any()) + if nodata_pixels_present: + arr[mask_f] = np.nan + else: + # NaN-only sentinel on a float buffer: ``mask_nodata`` is + # a no-op, but downstream may want to know if any NaN + # pixels already exist in the source so the attr stays + # informative. + nodata_pixels_present = ( + bool(np.isnan(arr).any()) + if nodata_sentinel is not None + else False + ) elif arr.dtype.kind in ('u', 'i'): # Integer arrays: convert to float to represent NaN. # An out-of-range sentinel (e.g. uint16 file with @@ -677,14 +696,55 @@ def open_geotiff(source: str | BinaryIO, *, info = np.iinfo(arr.dtype) if info.min <= nodata_int <= info.max: mask = arr == arr.dtype.type(nodata_int) - if mask.any(): + nodata_pixels_present = bool(mask.any()) + if nodata_pixels_present: arr = arr.astype(np.float64) arr[mask] = np.nan - + else: + # Sentinel cannot match any pixel in this dtype's + # range. No mask was run; no pixels were present. + nodata_pixels_present = False + else: + nodata_pixels_present = False + elif nodata is not None and not mask_nodata: + # ``mask_nodata=False``: the masking branch above is skipped, + # but ``attrs['nodata_pixels_present']`` should still surface so + # callers know whether literal sentinel pixels survive in the + # buffer (issue #2135). Mirror the same dtype / range / integer + # gates as the masking branch so an out-of-range or non-integer + # sentinel cleanly resolves to ``False`` rather than crashing in + # the equality check. + nodata_sentinel = getattr(geo_info, '_mask_nodata', nodata) + if nodata_sentinel is not None: + if arr.dtype.kind == 'f': + if np.isnan(nodata_sentinel): + nodata_pixels_present = bool(np.isnan(arr).any()) + else: + nodata_pixels_present = bool( + (arr == arr.dtype.type(nodata_sentinel)).any() + ) + elif arr.dtype.kind in ('u', 'i'): + if (np.isfinite(nodata_sentinel) + and float(nodata_sentinel).is_integer()): + nodata_int = int(nodata_sentinel) + info = np.iinfo(arr.dtype) + if info.min <= nodata_int <= info.max: + nodata_pixels_present = bool( + (arr == arr.dtype.type(nodata_int)).any() + ) + else: + nodata_pixels_present = False + else: + nodata_pixels_present = False + + dtype_cast_attr: str | None = None if dtype is not None: target = np.dtype(dtype) _validate_dtype_cast(arr.dtype, target) arr = arr.astype(target) + # Record the post-mask cast so downstream can tell + # float-because-masked from float-because-promoted (issue #2135). + dtype_cast_attr = target.name # ``attrs['masked_nodata']`` reflects whether the function # actually replaced sentinel pixels with NaN (#2092). The pre-fix @@ -697,6 +757,8 @@ def open_geotiff(source: str | BinaryIO, *, _set_nodata_attrs( attrs, nodata, masked=(mask_nodata and arr.dtype.kind == 'f'), + pixels_present=nodata_pixels_present, + dtype_cast=dtype_cast_attr, ) if arr.ndim == 3: diff --git a/xrspatial/geotiff/_attrs.py b/xrspatial/geotiff/_attrs.py index 9f60c31d2..27d94e86a 100644 --- a/xrspatial/geotiff/_attrs.py +++ b/xrspatial/geotiff/_attrs.py @@ -46,6 +46,18 @@ step ran; ``False`` iff the array still carries the literal integer sentinel. Only emitted when ``nodata`` is set; absence is the "no declared sentinel" signal. See ``_set_nodata_attrs``. +- ``nodata_pixels_present`` (#2135): bool, only emitted when + ``nodata`` is set and the backend computed the answer cheaply. + True iff the read window contained at least one pixel matching the + declared sentinel before masking. Lets QA and writer code answer + "are any sentinel pixels in this tile" without scanning the buffer. + The dask path leaves this unset because a strict per-chunk + reduction would force an eager ``.compute()``. +- ``nodata_dtype_cast`` (#2135): string dtype name (e.g. + ``"float64"``), only emitted when ``nodata`` is set and the caller + passed an explicit ``dtype=`` kwarg. Records that a post-mask cast + happened so consumers can tell float-because-masked from + float-because-promoted. - ``raster_type``: ``'area'`` (implicit / RasterPixelIsArea) or ``'point'`` (explicit / RasterPixelIsPoint). - ``extra_tags``: list of ``(tag_id, type_id, count, value)`` tuples for @@ -245,8 +257,15 @@ def _should_restore_nan_sentinel(attrs) -> bool: return value is not False -def _set_nodata_attrs(attrs: dict, nodata, *, masked: bool) -> None: - """Set ``attrs['nodata']`` and ``attrs['masked_nodata']`` on a read. +def _set_nodata_attrs( + attrs: dict, + nodata, + *, + masked: bool, + pixels_present: bool | None = None, + dtype_cast: str | None = None, +) -> None: + """Set the nodata lifecycle attrs on a read. ``masked`` is the actual mask-decision the read path made: True iff sentinel pixels in the in-memory buffer have been replaced with NaN @@ -254,8 +273,24 @@ def _set_nodata_attrs(attrs: dict, nodata, *, masked: bool) -> None: step). False iff the literal sentinel values are still present in the buffer. + ``pixels_present`` is the lifecycle signal added in issue #2135. If + not ``None``, the read path computed whether the read window + contained at least one pixel matching the declared sentinel before + masking; the value is forwarded to ``attrs['nodata_pixels_present']`` + so consumers can answer "any nodata in this tile" without scanning + the buffer. Pass ``None`` when the backend cannot cheaply produce + the value (e.g. dask, where a strict per-chunk reduction would + force eager compute). + + ``dtype_cast`` is the second lifecycle signal added in issue #2135. + If the caller passed an explicit ``dtype=`` kwarg, the backend + forwards the resolved target dtype string (e.g. ``"float64"``) so + consumers can distinguish "float because masking promoted it" from + "float because the caller cast it". ``None`` means no caller cast + happened; the attr is omitted in that case. + Contract (splits the two meanings previously fused into - ``attrs['nodata']`` per issue #1988): + ``attrs['nodata']`` per issue #1988, extended for #2135): * ``attrs['nodata']`` -- declared file sentinel, as a scalar of the source dtype. Set whenever the source declared one, regardless of @@ -263,6 +298,14 @@ def _set_nodata_attrs(attrs: dict, nodata, *, masked: bool) -> None: * ``attrs['masked_nodata']`` -- the ``masked`` value the caller passed, coerced to bool. Only emitted when ``nodata is not None``; absence of the flag means there is no declared sentinel. + * ``attrs['nodata_pixels_present']`` (additive, #2135) -- bool, + only emitted when ``nodata is not None`` and ``pixels_present`` + is not ``None``. Tracks whether the read window contained any + sentinel pixel before masking. + * ``attrs['nodata_dtype_cast']`` (additive, #2135) -- string dtype + name (e.g. ``"float64"``), only emitted when ``nodata is not + None`` and ``dtype_cast`` is not ``None``. Records that a + caller-requested cast happened after masking. Pre-#2092 the helper inferred ``masked`` from the final array dtype, which lied when ``mask_nodata=False`` left literal sentinel @@ -275,6 +318,10 @@ def _set_nodata_attrs(attrs: dict, nodata, *, masked: bool) -> None: return attrs['nodata'] = nodata attrs['masked_nodata'] = bool(masked) + if pixels_present is not None: + attrs['nodata_pixels_present'] = bool(pixels_present) + if dtype_cast is not None: + attrs['nodata_dtype_cast'] = str(dtype_cast) def _validate_read_geo_info( diff --git a/xrspatial/geotiff/_backends/_gpu_helpers.py b/xrspatial/geotiff/_backends/_gpu_helpers.py index 723a477e9..7a84eb672 100644 --- a/xrspatial/geotiff/_backends/_gpu_helpers.py +++ b/xrspatial/geotiff/_backends/_gpu_helpers.py @@ -42,6 +42,57 @@ def _is_gpu_data(data) -> bool: return isinstance(data, _cupy_type) +def _apply_nodata_mask_gpu_with_presence(arr_gpu, nodata): + """Same as :func:`_apply_nodata_mask_gpu` but also reports presence. + + Returns ``(arr_gpu, pixels_present)`` where ``pixels_present`` is a + bool indicating whether any pixel in ``arr_gpu`` matched the + declared sentinel before masking. Lets the eager GPU read path + surface ``attrs['nodata_pixels_present']`` (issue #2135) without + rescanning the buffer. + + ``pixels_present`` is ``False`` when ``nodata`` is ``None`` (no + declared sentinel), when the sentinel is out of range for an + integer buffer, or when the sentinel is fractional / non-finite + and cannot match an integer pixel -- the same fast-paths the + masking branch takes. On a NaN-only sentinel against a float + buffer, ``pixels_present`` is whether the buffer already contains + NaN. + """ + import cupy + + if nodata is None: + return arr_gpu, False + arr_dtype = np.dtype(str(arr_gpu.dtype)) + if arr_dtype.kind == 'f': + if np.isnan(nodata): + # NaN-only sentinel: no replacement happens, but report + # whether any NaN already lives in the buffer so the attr + # stays informative. + return arr_gpu, bool(cupy.isnan(arr_gpu).any().item()) + sentinel = arr_dtype.type(nodata) + mask = arr_gpu == sentinel + present = bool(mask.any().item()) + if present: + cupy.putmask(arr_gpu, mask, arr_dtype.type('nan')) + return arr_gpu, present + if arr_dtype.kind in ('u', 'i'): + if not (np.isfinite(nodata) and float(nodata).is_integer()): + return arr_gpu, False + nodata_int = int(nodata) + info = np.iinfo(arr_dtype) + if not (info.min <= nodata_int <= info.max): + return arr_gpu, False + sentinel = arr_dtype.type(nodata_int) + mask = arr_gpu == sentinel + present = bool(mask.any().item()) + if present: + arr_gpu = arr_gpu.astype(cupy.float64) + cupy.putmask(arr_gpu, mask, cupy.float64('nan')) + return arr_gpu, present + return arr_gpu, False + + def _apply_nodata_mask_gpu(arr_gpu, nodata): """Replace nodata sentinel values with NaN on a CuPy array. diff --git a/xrspatial/geotiff/_backends/dask.py b/xrspatial/geotiff/_backends/dask.py index ccb52f96c..30dd19d97 100644 --- a/xrspatial/geotiff/_backends/dask.py +++ b/xrspatial/geotiff/_backends/dask.py @@ -354,9 +354,18 @@ def read_geotiff_dask(source: str, *, # on an int file), the in-memory buffers hold literal sentinel # values. True iff the caller opted into masking and the graph # dtype is float. + # ``nodata_pixels_present`` is intentionally left unset on the + # dask path (issue #2135). A strict per-chunk reduction would force + # an eager ``.compute()`` here, defeating the lazy contract; callers + # that need the answer can fall back to scanning the materialised + # array. ``nodata_dtype_cast`` is recorded when the caller passed an + # explicit ``dtype=`` kwarg so downstream can tell float-by-cast + # apart from float-by-masking on the lazy output. _set_nodata_attrs( attrs, nodata_attr, masked=(mask_nodata and target_dtype.kind == 'f'), + pixels_present=None, + dtype_cast=(np.dtype(dtype).name if dtype is not None else None), ) if isinstance(chunks, int): diff --git a/xrspatial/geotiff/_backends/gpu.py b/xrspatial/geotiff/_backends/gpu.py index fb5201ac1..69e90a845 100644 --- a/xrspatial/geotiff/_backends/gpu.py +++ b/xrspatial/geotiff/_backends/gpu.py @@ -41,6 +41,7 @@ ) from ._gpu_helpers import ( _apply_nodata_mask_gpu, + _apply_nodata_mask_gpu_with_presence, _apply_orientation_geo_info, _apply_orientation_gpu, _gpu_apply_window_band, @@ -443,13 +444,18 @@ def read_geotiff_gpu(source: str, *, # ``_mask_nodata`` when applicable; fall back to the original # sentinel otherwise (#1809). nodata = geo_info.nodata + nodata_pixels_present_attr: bool | None = None if nodata is not None and mask_nodata: mask_value = getattr(_stripped_geo, '_mask_nodata', nodata) - arr_gpu = _apply_nodata_mask_gpu(arr_gpu, mask_value) + arr_gpu, nodata_pixels_present_attr = ( + _apply_nodata_mask_gpu_with_presence(arr_gpu, mask_value) + ) + dtype_cast_attr: str | None = None if dtype is not None: target = np.dtype(dtype) _validate_dtype_cast(np.dtype(str(arr_gpu.dtype)), target) arr_gpu = arr_gpu.astype(target) + dtype_cast_attr = target.name # ``attrs['masked_nodata']`` reflects whether the function # actually replaced sentinel pixels with NaN (#2092). With # ``mask_nodata=False`` the GPU mask kernel above is @@ -459,6 +465,8 @@ def read_geotiff_gpu(source: str, *, attrs, nodata, masked=(mask_nodata and np.dtype(str(arr_gpu.dtype)).kind == 'f'), + pixels_present=nodata_pixels_present_attr, + dtype_cast=dtype_cast_attr, ) # ``read_to_array`` already applied window + band slicing, so # ``arr_gpu`` is at output shape. Compute coords for that @@ -780,6 +788,7 @@ def _read_once(): # dtype cast so the float promotion for masked integer rasters doesn't # surprise a user-supplied dtype. nodata = geo_info.nodata + nodata_pixels_present_attr: bool | None = None if nodata is not None and mask_nodata: # When MinIsWhite was applied, the mask must use the inverted # sentinel; otherwise the original sentinel. The pure GPU path @@ -794,12 +803,17 @@ def _read_once(): _cpu_fallback_geo, '_mask_nodata', nodata) else: _gpu_mask_value = nodata - arr_gpu = _apply_nodata_mask_gpu(arr_gpu, _gpu_mask_value) + arr_gpu, nodata_pixels_present_attr = ( + _apply_nodata_mask_gpu_with_presence( + arr_gpu, _gpu_mask_value) + ) + dtype_cast_attr: str | None = None if dtype is not None: target = np.dtype(dtype) _validate_dtype_cast(np.dtype(str(arr_gpu.dtype)), target) arr_gpu = arr_gpu.astype(target) + dtype_cast_attr = target.name # Build DataArray if name is None: @@ -823,6 +837,8 @@ def _read_once(): attrs, nodata, masked=(mask_nodata and np.dtype(str(arr_gpu.dtype)).kind == 'f'), + pixels_present=nodata_pixels_present_attr, + dtype_cast=dtype_cast_attr, ) # Apply window/band slicing post-decode. Coords are derived from the @@ -1373,9 +1389,15 @@ def _chunk_task(meta, r0, c0, r1, c1): # equal to ``file_dtype`` (see the float-promotion gate earlier # in this function), so the rule below is equivalent to "graph # dtype is float AND the caller opted into masking." + # ``nodata_pixels_present`` stays unset on the dask+GPU path for the + # same reason as the dask+numpy path: a strict per-chunk reduction + # would force an eager ``.compute()`` (issue #2135). ``dtype_cast`` + # records the caller-supplied ``dtype=`` kwarg when present. _set_nodata_attrs( attrs, nodata, masked=(mask_nodata and declared_dtype.kind == 'f'), + pixels_present=None, + dtype_cast=(np.dtype(dtype).name if dtype is not None else None), ) if name is None: diff --git a/xrspatial/geotiff/_backends/vrt.py b/xrspatial/geotiff/_backends/vrt.py index 0156ba8f0..53cef0369 100644 --- a/xrspatial/geotiff/_backends/vrt.py +++ b/xrspatial/geotiff/_backends/vrt.py @@ -149,6 +149,8 @@ def read_vrt(source: str, *, from .._vrt import ( read_vrt as _read_vrt_internal, _apply_integer_sentinel_mask as _vrt_apply_integer_sentinel_mask, + _apply_integer_sentinel_mask_with_presence as _vrt_mask_with_presence, + _scan_for_sentinel as _vrt_scan_for_sentinel, ) source = _coerce_path(source) @@ -326,8 +328,21 @@ def read_vrt(source: str, *, # (issue #1825) so behaviour stays in lockstep. See issue #1611. # ``mask_nodata=False`` skips this so callers can preserve an # integer source dtype via ``dtype=...`` (issue #2052). + # ``nodata_pixels_present`` tracks whether the read window contained + # any sentinel pixel before masking (issue #2135). On the VRT eager + # path it has two sources: the integer-sentinel helper below (which + # returns ``True`` iff at least one band hit its sentinel during + # promotion), and the inline float-NaN masking inside ``_read_data`` + # (proxied via NaN presence on the resulting float buffer when the + # source declared a sentinel). Stays ``None`` only when no sentinel + # was declared. + nodata_pixels_present: bool | None = None if mask_nodata: - arr = _vrt_apply_integer_sentinel_mask(arr, vrt, band) + arr, nodata_pixels_present = _vrt_mask_with_presence(arr, vrt, band) + elif nodata is not None: + # ``mask_nodata=False``: skip the masking helper but still scan + # for the literal sentinel so callers can branch on the attr. + nodata_pixels_present = _vrt_scan_for_sentinel(arr, vrt, band) # Capture pre-cast dtype: ``_vrt._read_data`` NaN-masks float source # arrays (and int sources feeding a float VRT dataType) inline, and @@ -340,6 +355,24 @@ def read_vrt(source: str, *, # follow-up). pre_cast_dtype = np.dtype(str(arr.dtype)) + # When the inline float-NaN masking inside ``_vrt._read_data`` already + # ran (float source + float VRT dataType + declared sentinel), the + # integer helper above is a no-op but the resulting float buffer may + # already carry NaNs at the sentinel locations. Promote + # ``nodata_pixels_present`` to reflect that, since a downstream "any + # nodata?" check should answer yes for those tiles too. + if (nodata is not None + and pre_cast_dtype.kind == 'f' + and not nodata_pixels_present): + # ``arr`` is still on the CPU at this point (the optional + # ``cupy.asarray`` lift happens further down). + try: + nodata_pixels_present = bool(np.isnan(arr).any()) + except (TypeError, ValueError): + # Defensive: an unexpected dtype that fails ``np.isnan`` falls + # back to the helper's answer rather than raising mid-read. + pass + # Surface the source GeoTransform in the same rasterio ordering used # by open_geotiff: (pixel_width, 0, origin_x, 0, pixel_height, origin_y). # vrt.geo_transform is GDAL ordering, so reorder. For a windowed read @@ -358,14 +391,18 @@ def read_vrt(source: str, *, import cupy arr = cupy.asarray(arr) + dtype_cast_attr: str | None = None if dtype is not None: target = np.dtype(dtype) _validate_dtype_cast(np.dtype(str(arr.dtype)), target) arr = arr.astype(target) + dtype_cast_attr = target.name _set_nodata_attrs( attrs, nodata, masked=(pre_cast_dtype.kind == 'f'), + pixels_present=nodata_pixels_present, + dtype_cast=dtype_cast_attr, ) if arr.ndim == 3: @@ -740,9 +777,17 @@ def _read_vrt_chunked(source, *, window, band, name, chunks, gpu, dtype, # ``final_dtype`` block) and must not flip this attr, so we read # the pre-cast ``declared_dtype`` here rather than ``final_dtype`` # (#2092 follow-up). + # ``nodata_pixels_present`` is intentionally left unset on the + # chunked VRT path: a per-chunk reduction would force eager + # ``.compute()`` (matches the dask backend's policy for issue + # #2135). ``dtype_cast`` records the caller-supplied ``dtype=`` + # kwarg when present so downstream can tell float-by-cast apart + # from float-by-masking even on the lazy output. _set_nodata_attrs( attrs, nodata_meta, masked=(declared_dtype.kind == 'f'), + pixels_present=None, + dtype_cast=(np.dtype(dtype).name if dtype is not None else None), ) # Static hole detection: mirror the eager-path ``attrs['vrt_holes']`` diff --git a/xrspatial/geotiff/_vrt.py b/xrspatial/geotiff/_vrt.py index 68ba21c4f..953466f29 100644 --- a/xrspatial/geotiff/_vrt.py +++ b/xrspatial/geotiff/_vrt.py @@ -850,6 +850,101 @@ def _apply_integer_sentinel_mask(arr, vrt, band): return arr +def _apply_integer_sentinel_mask_with_presence(arr, vrt, band): + """Same as :func:`_apply_integer_sentinel_mask` but also reports presence. + + Returns ``(arr, pixels_present)`` where ``pixels_present`` is a bool + indicating whether any sentinel pixel was found in the integer + buffer before masking. Used by the VRT eager and chunked paths to + surface ``attrs['nodata_pixels_present']`` (issue #2135) without + rescanning the (possibly promoted) float buffer. + """ + if arr.dtype.kind not in ('u', 'i'): + return arr, False + if arr.ndim == 3 and band is None and vrt.bands: + int_arr = arr + int_dtype = int_arr.dtype + any_hit = False + for i, vrt_band in enumerate(vrt.bands): + if i >= int_arr.shape[-1]: + break + sentinel = _sentinel_for_dtype(vrt_band.nodata, int_dtype) + if sentinel is None: + continue + mask = int_arr[..., i] == sentinel + if not mask.any(): + continue + any_hit = True + if arr.dtype != np.float64: + arr = arr.astype(np.float64) + arr[..., i][mask] = np.nan + return arr, any_hit + band_idx = band if band is not None else 0 + nodata = None + if vrt.bands and 0 <= band_idx < len(vrt.bands): + nodata = vrt.bands[band_idx].nodata + if nodata is None: + return arr, False + sentinel = _sentinel_for_dtype(nodata, arr.dtype) + if sentinel is None: + return arr, False + mask = arr == sentinel + present = bool(mask.any()) + if present: + arr = arr.astype(np.float64) + arr[mask] = np.nan + return arr, present + + +def _scan_for_sentinel(arr, vrt, band): + """Detect whether ``arr`` contains any pixel matching a declared sentinel. + + Used by the VRT eager path when ``mask_nodata=False`` so + ``attrs['nodata_pixels_present']`` (issue #2135) still surfaces a + meaningful answer even though the masking branch was skipped. The + multi-band case checks each band against its own ````; + the single-band case checks the selected band's sentinel. + + Returns a plain bool. ``False`` covers the no-declared-sentinel and + sentinel-out-of-range cases (mirrors the masking-branch gates). + """ + if arr.ndim == 3 and band is None and vrt.bands: + for i, vrt_band in enumerate(vrt.bands): + if i >= arr.shape[-1]: + break + if vrt_band.nodata is None: + continue + if arr.dtype.kind == 'f': + if np.isnan(vrt_band.nodata): + if bool(np.isnan(arr[..., i]).any()): + return True + else: + if bool((arr[..., i] + == arr.dtype.type(vrt_band.nodata)).any()): + return True + else: + sentinel = _sentinel_for_dtype(vrt_band.nodata, arr.dtype) + if sentinel is None: + continue + if bool((arr[..., i] == sentinel).any()): + return True + return False + band_idx = band if band is not None else 0 + nodata = None + if vrt.bands and 0 <= band_idx < len(vrt.bands): + nodata = vrt.bands[band_idx].nodata + if nodata is None: + return False + if arr.dtype.kind == 'f': + if np.isnan(nodata): + return bool(np.isnan(arr).any()) + return bool((arr == arr.dtype.type(nodata)).any()) + sentinel = _sentinel_for_dtype(nodata, arr.dtype) + if sentinel is None: + return False + return bool((arr == sentinel).any()) + + def read_vrt(vrt_path: str, *, window=None, band: int | None = None, max_pixels: int | None = None, diff --git a/xrspatial/geotiff/tests/test_nodata_lifecycle_attrs_2135.py b/xrspatial/geotiff/tests/test_nodata_lifecycle_attrs_2135.py new file mode 100644 index 000000000..b24a4868b --- /dev/null +++ b/xrspatial/geotiff/tests/test_nodata_lifecycle_attrs_2135.py @@ -0,0 +1,330 @@ +"""Split overloaded ``masked_nodata`` into separate lifecycle signals (#2135). + +``attrs['masked_nodata']`` was a single boolean trying to describe a +multi-stage process (declared sentinel exists, masking step ran, +sentinel pixels actually present, dtype cast after masking). Issue +#2135 keeps the existing flag for backward compatibility and adds two +additive lifecycle attrs: + +* ``nodata_pixels_present`` -- bool, ``True`` iff at least one pixel in + the read window matched the declared sentinel before masking. Lets + consumer code answer "any nodata in this tile" without rescanning. + The dask path leaves this attr unset because a strict per-chunk + reduction would force eager ``.compute()``. +* ``nodata_dtype_cast`` -- string dtype name (e.g. ``"float64"``), + only emitted when the caller passed an explicit ``dtype=`` kwarg. + Distinguishes float-because-masked from float-because-promoted. + +These tests cover the eager / dask / GPU / VRT paths and pin the +emission rules so future changes do not silently drop or rename keys. +""" +from __future__ import annotations + +import importlib.util + +import numpy as np +import pytest +import xarray as xr + +from xrspatial.geotiff import open_geotiff, read_geotiff_dask, to_geotiff + + +def _gpu_available() -> bool: + if importlib.util.find_spec("cupy") is None: + return False + try: + import cupy + return bool(cupy.cuda.is_available()) + except Exception: + return False + + +_HAS_GPU = _gpu_available() +_gpu_only = pytest.mark.skipif(not _HAS_GPU, reason="cupy + CUDA required") + + +def _make_float_raster(path, sentinel=-9999.0, plant_sentinel=True): + """Float32 raster: 2x3 with one (or zero) sentinel pixels.""" + if plant_sentinel: + data = np.array( + [[1.0, 2.0, sentinel], [4.0, 5.0, 6.0]], dtype=np.float32, + ) + else: + data = np.array( + [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=np.float32, + ) + da = xr.DataArray( + data, + coords={'y': np.array([0.5, 1.5]), 'x': np.array([0.5, 1.5, 2.5])}, + dims=('y', 'x'), + attrs={'nodata': sentinel}, + ) + to_geotiff(da, path) + return da + + +def _make_int_raster(path, sentinel=30, plant_sentinel=True): + """Int16 raster with sentinel optionally embedded.""" + if plant_sentinel: + data = np.array([[10, 20, sentinel], [40, 50, 60]], dtype=np.int16) + else: + data = np.array([[10, 20, 25], [40, 50, 60]], dtype=np.int16) + da = xr.DataArray( + data, + coords={'y': np.array([0.5, 1.5]), 'x': np.array([0.5, 1.5, 2.5])}, + dims=('y', 'x'), + attrs={'nodata': sentinel}, + ) + to_geotiff(da, path) + return da + + +# --- Eager numpy path --------------------------------------------------- + + +def test_eager_float_sentinel_present_masked(tmp_path): + """Float file + sentinel embedded + mask_nodata=True: + nodata_pixels_present=True, nodata_dtype_cast absent.""" + path = str(tmp_path / "tmp_2135_eager_float_present.tif") + _make_float_raster(path) + out = open_geotiff(path) + assert out.attrs.get('masked_nodata') is True + assert out.attrs.get('nodata_pixels_present') is True + assert 'nodata_dtype_cast' not in out.attrs + + +def test_eager_float_sentinel_absent_masked(tmp_path): + """Float file + sentinel NOT embedded + mask_nodata=True: + nodata_pixels_present=False.""" + path = str(tmp_path / "tmp_2135_eager_float_absent.tif") + _make_float_raster(path, plant_sentinel=False) + out = open_geotiff(path) + assert out.attrs.get('masked_nodata') is True + assert out.attrs.get('nodata_pixels_present') is False + + +def test_eager_float_sentinel_present_unmasked(tmp_path): + """Float file + sentinel embedded + mask_nodata=False: + masking branch skipped but presence scan still runs.""" + path = str(tmp_path / "tmp_2135_eager_float_present_unmasked.tif") + _make_float_raster(path) + out = open_geotiff(path, mask_nodata=False) + assert out.attrs.get('masked_nodata') is False + assert out.attrs.get('nodata_pixels_present') is True + + +def test_eager_int_sentinel_present(tmp_path): + """Int file + sentinel embedded + mask_nodata=True: + promotion fires, nodata_pixels_present=True.""" + path = str(tmp_path / "tmp_2135_eager_int_present.tif") + _make_int_raster(path) + out = open_geotiff(path) + assert out.dtype == np.float64 + assert out.attrs.get('masked_nodata') is True + assert out.attrs.get('nodata_pixels_present') is True + + +def test_eager_int_out_of_range_sentinel(tmp_path): + """Int (uint16) file + sentinel out of range: + no cast, nodata_pixels_present=False.""" + da = xr.DataArray( + np.array([[10, 20, 30], [40, 50, 60]], dtype=np.uint16), + coords={'y': np.array([0.5, 1.5]), 'x': np.array([0.5, 1.5, 2.5])}, + dims=('y', 'x'), + attrs={'nodata': -9999}, + ) + path = str(tmp_path / "tmp_2135_eager_int_oor.tif") + to_geotiff(da, path) + out = open_geotiff(path) + assert out.attrs.get('nodata') == -9999 + assert out.attrs.get('masked_nodata') is False + assert out.attrs.get('nodata_pixels_present') is False + + +def test_eager_dtype_cast_records_target(tmp_path): + """``dtype=`` kwarg surfaces as nodata_dtype_cast.""" + path = str(tmp_path / "tmp_2135_eager_dtype_cast.tif") + _make_int_raster(path) + out = open_geotiff(path, mask_nodata=False, dtype=np.float64) + assert out.dtype == np.float64 + assert out.attrs.get('masked_nodata') is False + assert out.attrs.get('nodata_dtype_cast') == 'float64' + # Literal sentinel still in buffer (cast, not masked). + assert 30.0 in out.values + # Pixel-presence scan should still confirm the sentinel is there. + assert out.attrs.get('nodata_pixels_present') is True + + +def test_eager_dtype_cast_absent_without_dtype_kwarg(tmp_path): + """No ``dtype=`` kwarg: ``nodata_dtype_cast`` absent from attrs.""" + path = str(tmp_path / "tmp_2135_eager_no_dtype.tif") + _make_float_raster(path) + out = open_geotiff(path) + assert 'nodata_dtype_cast' not in out.attrs + + +def test_eager_no_declared_sentinel(tmp_path): + """File without GDAL_NODATA: no nodata-related attrs surface.""" + da = xr.DataArray( + np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=np.float32), + coords={'y': np.array([0.5, 1.5]), 'x': np.array([0.5, 1.5, 2.5])}, + dims=('y', 'x'), + ) + path = str(tmp_path / "tmp_2135_eager_no_sentinel.tif") + to_geotiff(da, path) + out = open_geotiff(path) + assert 'nodata' not in out.attrs + assert 'masked_nodata' not in out.attrs + assert 'nodata_pixels_present' not in out.attrs + assert 'nodata_dtype_cast' not in out.attrs + + +# --- Dask path ---------------------------------------------------------- + + +def test_dask_leaves_pixels_present_unset(tmp_path): + """Dask path: per-chunk reduction would force eager compute, so + ``nodata_pixels_present`` stays unset by design (#2135).""" + path = str(tmp_path / "tmp_2135_dask_present.tif") + _make_float_raster(path) + out = read_geotiff_dask(path, chunks=2) + assert out.attrs.get('masked_nodata') is True + assert 'nodata_pixels_present' not in out.attrs + + +def test_dask_dtype_cast_records_target(tmp_path): + """Dask path emits ``nodata_dtype_cast`` when caller passes dtype=.""" + path = str(tmp_path / "tmp_2135_dask_cast.tif") + _make_int_raster(path) + out = read_geotiff_dask( + path, chunks=2, mask_nodata=False, dtype=np.float64, + ) + assert out.attrs.get('masked_nodata') is False + assert out.attrs.get('nodata_dtype_cast') == 'float64' + assert 'nodata_pixels_present' not in out.attrs + + +def test_dask_no_dtype_cast_attr_absent(tmp_path): + """Dask path without dtype=: nodata_dtype_cast absent.""" + path = str(tmp_path / "tmp_2135_dask_no_cast.tif") + _make_float_raster(path) + out = read_geotiff_dask(path, chunks=2) + assert 'nodata_dtype_cast' not in out.attrs + + +# --- VRT path ----------------------------------------------------------- + + +def _write_int_vrt(tmp_path, src_basename, vrt_basename, sentinel=30, + plant_sentinel=True): + tifffile = pytest.importorskip("tifffile") + src = str(tmp_path / src_basename) + if plant_sentinel: + data = np.array([[10, 20, sentinel], [40, 50, 60]], dtype=np.int16) + else: + data = np.array([[10, 20, 25], [40, 50, 60]], dtype=np.int16) + tifffile.imwrite(src, data, metadata=None) + vrt = str(tmp_path / vrt_basename) + vrt_xml = f""" + 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 + + {sentinel} + + {src} + 1 + + + + + +""" + with open(vrt, 'w') as fh: + fh.write(vrt_xml) + return vrt + + +def test_vrt_int_sentinel_present_masked(tmp_path): + """VRT int source + sentinel embedded + mask_nodata=True: + helper promotes to float, nodata_pixels_present=True.""" + vrt = _write_int_vrt( + tmp_path, "tmp_2135_vrt_src.tif", "tmp_2135_vrt_present.vrt", + ) + out = open_geotiff(vrt) + assert out.dtype == np.float64 + assert out.attrs.get('masked_nodata') is True + assert out.attrs.get('nodata_pixels_present') is True + + +def test_vrt_int_sentinel_absent_masked(tmp_path): + """VRT int source + sentinel NOT embedded + mask_nodata=True: + helper does not promote, nodata_pixels_present=False.""" + vrt = _write_int_vrt( + tmp_path, "tmp_2135_vrt_src_absent.tif", + "tmp_2135_vrt_absent.vrt", + plant_sentinel=False, + ) + out = open_geotiff(vrt) + assert out.dtype.kind == 'i' # no promotion + assert out.attrs.get('masked_nodata') is False + assert out.attrs.get('nodata_pixels_present') is False + + +def test_vrt_int_unmasked_still_scans(tmp_path): + """VRT int + mask_nodata=False: presence scan still runs.""" + vrt = _write_int_vrt( + tmp_path, "tmp_2135_vrt_src_unmasked.tif", + "tmp_2135_vrt_unmasked.vrt", + ) + out = open_geotiff(vrt, mask_nodata=False) + assert out.dtype.kind == 'i' + assert out.attrs.get('masked_nodata') is False + assert out.attrs.get('nodata_pixels_present') is True + + +def test_vrt_dtype_cast_records_target(tmp_path): + """VRT + dtype=float64 + mask_nodata=False: cast attr surfaces.""" + vrt = _write_int_vrt( + tmp_path, "tmp_2135_vrt_src_cast.tif", + "tmp_2135_vrt_cast.vrt", + ) + out = open_geotiff(vrt, mask_nodata=False, dtype=np.float64) + assert out.dtype == np.float64 + assert out.attrs.get('masked_nodata') is False + assert out.attrs.get('nodata_dtype_cast') == 'float64' + assert out.attrs.get('nodata_pixels_present') is True + + +# --- GPU path ----------------------------------------------------------- + + +@_gpu_only +def test_gpu_float_sentinel_present_masked(tmp_path): + from xrspatial.geotiff import read_geotiff_gpu + + path = str(tmp_path / "tmp_2135_gpu_float_present.tif") + _make_float_raster(path) + out = read_geotiff_gpu(path) + assert out.attrs.get('masked_nodata') is True + assert out.attrs.get('nodata_pixels_present') is True + + +@_gpu_only +def test_gpu_int_sentinel_absent(tmp_path): + from xrspatial.geotiff import read_geotiff_gpu + + path = str(tmp_path / "tmp_2135_gpu_int_absent.tif") + _make_int_raster(path, plant_sentinel=False) + out = read_geotiff_gpu(path) + # No sentinel pixel: helper short-circuits, buffer stays int. + assert out.attrs.get('masked_nodata') is False + assert out.attrs.get('nodata_pixels_present') is False + + +@_gpu_only +def test_gpu_dtype_cast_records_target(tmp_path): + from xrspatial.geotiff import read_geotiff_gpu + + path = str(tmp_path / "tmp_2135_gpu_cast.tif") + _make_int_raster(path) + out = read_geotiff_gpu(path, mask_nodata=False, dtype=np.float64) + assert out.attrs.get('nodata_dtype_cast') == 'float64' From ebbf34d5bbbd04e1801bb5a2e8a52d0cfb5f4f59 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Tue, 19 May 2026 11:56:17 -0700 Subject: [PATCH 2/3] tests: cover matrix row 5 (int + mask_off + no cast) for #2135 --- .../tests/test_nodata_lifecycle_attrs_2135.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/xrspatial/geotiff/tests/test_nodata_lifecycle_attrs_2135.py b/xrspatial/geotiff/tests/test_nodata_lifecycle_attrs_2135.py index b24a4868b..f9b6d4a92 100644 --- a/xrspatial/geotiff/tests/test_nodata_lifecycle_attrs_2135.py +++ b/xrspatial/geotiff/tests/test_nodata_lifecycle_attrs_2135.py @@ -141,6 +141,19 @@ def test_eager_int_out_of_range_sentinel(tmp_path): assert out.attrs.get('nodata_pixels_present') is False +def test_eager_int_sentinel_present_unmasked(tmp_path): + """Matrix row (5): int file + sentinel embedded + mask_nodata=False + + no dtype= kwarg. Buffer stays int with literal sentinel, + nodata_pixels_present=True from the no-mask scan branch.""" + path = str(tmp_path / "tmp_2135_eager_int_present_unmasked.tif") + _make_int_raster(path) + out = open_geotiff(path, mask_nodata=False) + assert out.dtype.kind == 'i' + assert out.attrs.get('masked_nodata') is False + assert out.attrs.get('nodata_pixels_present') is True + assert 'nodata_dtype_cast' not in out.attrs + + def test_eager_dtype_cast_records_target(tmp_path): """``dtype=`` kwarg surfaces as nodata_dtype_cast.""" path = str(tmp_path / "tmp_2135_eager_dtype_cast.tif") From 17c23b9915bf86f18380743726720c5509575800 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Tue, 19 May 2026 11:57:47 -0700 Subject: [PATCH 3/3] Address review nits: clarify lifecycle attr invariants (#2135) - vrt.py: document the single-branch invariant the float-NaN proxy scan relies on so a future maintainer does not accidentally double-scan a buffer that both the integer helper and the inline reader masked. - __init__.py: explain why nodata_pixels_present stays None on exotic dtypes rather than being defaulted to False. - _gpu_helpers.py / _vrt.py: note why each presence-reporting helper is a sibling of its base rather than a folded variant; defer the refactor until a third variant exists. --- xrspatial/geotiff/__init__.py | 8 +++++++- xrspatial/geotiff/_backends/_gpu_helpers.py | 9 +++++++++ xrspatial/geotiff/_backends/vrt.py | 8 ++++++++ xrspatial/geotiff/_vrt.py | 6 ++++++ 4 files changed, 30 insertions(+), 1 deletion(-) diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index a77793968..15d82a9ae 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -644,7 +644,13 @@ def open_geotiff(source: str | BinaryIO, *, # Only meaningful when a declared sentinel exists; reported via # ``attrs['nodata_pixels_present']`` (issue #2135) so consumers can # answer "any nodata in this tile" without rescanning. ``None`` - # keeps the attr out when ``mask_nodata=False`` and no scan happened. + # keeps the attr out when no scan happened: that includes the no + # sentinel declared case (early-return in ``_set_nodata_attrs``) and + # any exotic dtype branch (e.g. complex source) that neither the + # float nor integer mask gates handle. Do not "fix" the asymmetry by + # forcing a False default -- the absence-means-unknown contract is + # deliberate so downstream can distinguish "scanned and saw nothing" + # from "did not scan." nodata_pixels_present: bool | None = None if nodata is not None and mask_nodata: # When the reader applied MinIsWhite, the sentinel-equality mask diff --git a/xrspatial/geotiff/_backends/_gpu_helpers.py b/xrspatial/geotiff/_backends/_gpu_helpers.py index 7a84eb672..23129a899 100644 --- a/xrspatial/geotiff/_backends/_gpu_helpers.py +++ b/xrspatial/geotiff/_backends/_gpu_helpers.py @@ -45,6 +45,15 @@ def _is_gpu_data(data) -> bool: def _apply_nodata_mask_gpu_with_presence(arr_gpu, nodata): """Same as :func:`_apply_nodata_mask_gpu` but also reports presence. + Kept as a sibling helper rather than collapsed with + ``_apply_nodata_mask_gpu`` because the original is a hot inner-loop + callee from the GPU writers and the per-call cost of computing the + presence bit (one extra ``.any().item()`` plus a small Python-side + tuple) is paid even when the caller does not need it. A future + refactor can fold both into one helper with an opt-in flag if a + third variant appears; for now the duplication keeps the original + no-overhead path intact. + Returns ``(arr_gpu, pixels_present)`` where ``pixels_present`` is a bool indicating whether any pixel in ``arr_gpu`` matched the declared sentinel before masking. Lets the eager GPU read path diff --git a/xrspatial/geotiff/_backends/vrt.py b/xrspatial/geotiff/_backends/vrt.py index 53cef0369..885b4403c 100644 --- a/xrspatial/geotiff/_backends/vrt.py +++ b/xrspatial/geotiff/_backends/vrt.py @@ -361,6 +361,14 @@ def read_vrt(source: str, *, # already carry NaNs at the sentinel locations. Promote # ``nodata_pixels_present`` to reflect that, since a downstream "any # nodata?" check should answer yes for those tiles too. + # + # Invariant: at most one branch sets ``nodata_pixels_present`` to True + # before this block runs. ``_vrt_mask_with_presence`` only sets True + # on integer buffers (it short-circuits on float dtype), and + # ``_vrt_scan_for_sentinel`` is gated on the ``mask_nodata=False`` + # branch above. Either way, the float-NaN proxy below is the only + # presence signal for float buffers, so the ``not present`` guard + # is sufficient -- we will not double-scan the same buffer. if (nodata is not None and pre_cast_dtype.kind == 'f' and not nodata_pixels_present): diff --git a/xrspatial/geotiff/_vrt.py b/xrspatial/geotiff/_vrt.py index 953466f29..02ff3302c 100644 --- a/xrspatial/geotiff/_vrt.py +++ b/xrspatial/geotiff/_vrt.py @@ -858,6 +858,12 @@ def _apply_integer_sentinel_mask_with_presence(arr, vrt, band): buffer before masking. Used by the VRT eager and chunked paths to surface ``attrs['nodata_pixels_present']`` (issue #2135) without rescanning the (possibly promoted) float buffer. + + Kept as a sibling of :func:`_apply_integer_sentinel_mask` rather + than collapsed into one helper with an opt-in flag because the + original helper is shared with the chunked-VRT per-task path that + cannot afford an extra Python-side branch per chunk. A future + refactor can unify them once a third variant is needed. """ if arr.dtype.kind not in ('u', 'i'): return arr, False