Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions docs/source/user_guide/attrs_contract.rst
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,23 @@ write.
because the caller passed ``mask_nodata=False`` together with
``dtype=float...``. Only set when ``nodata`` is set; absence
means no declared sentinel. See issue #2092.
* - ``nodata_pixels_present``
- bool
- Paired with ``nodata``. ``True`` iff the read window contained
at least one pixel matching the declared sentinel before
masking. Lets QA and writer code answer "any nodata in this
tile" without rescanning the buffer. Only emitted by the
eager-numpy, GPU, and VRT paths; the dask path leaves the attr
unset because a strict per-chunk reduction would force eager
``.compute()``. See issue #2135.
* - ``nodata_dtype_cast``
- str
- Paired with ``nodata``. Set to the resolved target dtype name
(e.g. ``"float64"``) when the caller passed an explicit
``dtype=`` kwarg, otherwise absent. Distinguishes
float-because-masked from float-because-promoted, which a
``masked_nodata`` lookup alone cannot disambiguate. See issue
#2135.
* - ``raster_type``
- str
- ``'point'`` when the file declares ``RasterPixelIsPoint``;
Expand Down
74 changes: 71 additions & 3 deletions xrspatial/geotiff/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -642,6 +642,18 @@ def open_geotiff(source: str | BinaryIO, *,
# write is safe; an extra ``arr.copy()`` would just double peak
# memory for a multi-MB raster.
nodata = geo_info.nodata
# Track whether any sentinel pixel was present in the read window.
# Only meaningful when a declared sentinel exists; reported via
# ``attrs['nodata_pixels_present']`` (issue #2135) so consumers can
# answer "any nodata in this tile" without rescanning. ``None``
# keeps the attr out when no scan happened: that includes the no
# sentinel declared case (early-return in ``_set_nodata_attrs``) and
# any exotic dtype branch (e.g. complex source) that neither the
# float nor integer mask gates handle. Do not "fix" the asymmetry by
# forcing a False default -- the absence-means-unknown contract is
# deliberate so downstream can distinguish "scanned and saw nothing"
# from "did not scan."
nodata_pixels_present: bool | None = None
if nodata is not None and mask_nodata:
# When the reader applied MinIsWhite, the sentinel-equality mask
# must compare against the inverted sentinel value (issue #1809).
Expand All @@ -653,7 +665,20 @@ def open_geotiff(source: str | BinaryIO, *,
nodata_sentinel = getattr(geo_info, '_mask_nodata', nodata)
if arr.dtype.kind == 'f':
if nodata_sentinel is not None and not np.isnan(nodata_sentinel):
arr[arr == arr.dtype.type(nodata_sentinel)] = np.nan
mask_f = arr == arr.dtype.type(nodata_sentinel)
nodata_pixels_present = bool(mask_f.any())
if nodata_pixels_present:
arr[mask_f] = np.nan
else:
# NaN-only sentinel on a float buffer: ``mask_nodata`` is
# a no-op, but downstream may want to know if any NaN
# pixels already exist in the source so the attr stays
# informative.
nodata_pixels_present = (
bool(np.isnan(arr).any())
if nodata_sentinel is not None
else False
)
elif arr.dtype.kind in ('u', 'i'):
# Integer arrays: convert to float to represent NaN.
# An out-of-range sentinel (e.g. uint16 file with
Expand All @@ -679,14 +704,55 @@ def open_geotiff(source: str | BinaryIO, *,
info = np.iinfo(arr.dtype)
if info.min <= nodata_int <= info.max:
mask = arr == arr.dtype.type(nodata_int)
if mask.any():
nodata_pixels_present = bool(mask.any())
if nodata_pixels_present:
arr = arr.astype(np.float64)
arr[mask] = np.nan

else:
# Sentinel cannot match any pixel in this dtype's
# range. No mask was run; no pixels were present.
nodata_pixels_present = False
else:
nodata_pixels_present = False
elif nodata is not None and not mask_nodata:
# ``mask_nodata=False``: the masking branch above is skipped,
# but ``attrs['nodata_pixels_present']`` should still surface so
# callers know whether literal sentinel pixels survive in the
# buffer (issue #2135). Mirror the same dtype / range / integer
# gates as the masking branch so an out-of-range or non-integer
# sentinel cleanly resolves to ``False`` rather than crashing in
# the equality check.
nodata_sentinel = getattr(geo_info, '_mask_nodata', nodata)
if nodata_sentinel is not None:
if arr.dtype.kind == 'f':
if np.isnan(nodata_sentinel):
nodata_pixels_present = bool(np.isnan(arr).any())
else:
nodata_pixels_present = bool(
(arr == arr.dtype.type(nodata_sentinel)).any()
)
elif arr.dtype.kind in ('u', 'i'):
if (np.isfinite(nodata_sentinel)
and float(nodata_sentinel).is_integer()):
nodata_int = int(nodata_sentinel)
info = np.iinfo(arr.dtype)
if info.min <= nodata_int <= info.max:
nodata_pixels_present = bool(
(arr == arr.dtype.type(nodata_int)).any()
)
else:
nodata_pixels_present = False
else:
nodata_pixels_present = False

dtype_cast_attr: str | None = None
if dtype is not None:
target = np.dtype(dtype)
_validate_dtype_cast(arr.dtype, target)
arr = arr.astype(target)
# Record the post-mask cast so downstream can tell
# float-because-masked from float-because-promoted (issue #2135).
dtype_cast_attr = target.name

# ``attrs['masked_nodata']`` reflects whether the function
# actually replaced sentinel pixels with NaN (#2092). The pre-fix
Expand All @@ -699,6 +765,8 @@ def open_geotiff(source: str | BinaryIO, *,
_set_nodata_attrs(
attrs, nodata,
masked=(mask_nodata and arr.dtype.kind == 'f'),
pixels_present=nodata_pixels_present,
dtype_cast=dtype_cast_attr,
)

if arr.ndim == 3:
Expand Down
53 changes: 50 additions & 3 deletions xrspatial/geotiff/_attrs.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,18 @@
step ran; ``False`` iff the array still carries the literal integer
sentinel. Only emitted when ``nodata`` is set; absence is the
"no declared sentinel" signal. See ``_set_nodata_attrs``.
- ``nodata_pixels_present`` (#2135): bool, only emitted when
``nodata`` is set and the backend computed the answer cheaply.
True iff the read window contained at least one pixel matching the
declared sentinel before masking. Lets QA and writer code answer
"are any sentinel pixels in this tile" without scanning the buffer.
The dask path leaves this unset because a strict per-chunk
reduction would force an eager ``.compute()``.
- ``nodata_dtype_cast`` (#2135): string dtype name (e.g.
``"float64"``), only emitted when ``nodata`` is set and the caller
passed an explicit ``dtype=`` kwarg. Records that a post-mask cast
happened so consumers can tell float-because-masked from
float-because-promoted.
- ``raster_type``: ``'area'`` (implicit / RasterPixelIsArea) or ``'point'``
(explicit / RasterPixelIsPoint).
- ``extra_tags``: list of ``(tag_id, type_id, count, value)`` tuples for
Expand Down Expand Up @@ -245,24 +257,55 @@ def _should_restore_nan_sentinel(attrs) -> bool:
return value is not False


def _set_nodata_attrs(attrs: dict, nodata, *, masked: bool) -> None:
"""Set ``attrs['nodata']`` and ``attrs['masked_nodata']`` on a read.
def _set_nodata_attrs(
attrs: dict,
nodata,
*,
masked: bool,
pixels_present: bool | None = None,
dtype_cast: str | None = None,
) -> None:
"""Set the nodata lifecycle attrs on a read.

``masked`` is the actual mask-decision the read path made: True iff
sentinel pixels in the in-memory buffer have been replaced with NaN
(or the buffer is NaN-aware as a result of the reader's masking
step). False iff the literal sentinel values are still present in
the buffer.

``pixels_present`` is the lifecycle signal added in issue #2135. If
not ``None``, the read path computed whether the read window
contained at least one pixel matching the declared sentinel before
masking; the value is forwarded to ``attrs['nodata_pixels_present']``
so consumers can answer "any nodata in this tile" without scanning
the buffer. Pass ``None`` when the backend cannot cheaply produce
the value (e.g. dask, where a strict per-chunk reduction would
force eager compute).

``dtype_cast`` is the second lifecycle signal added in issue #2135.
If the caller passed an explicit ``dtype=`` kwarg, the backend
forwards the resolved target dtype string (e.g. ``"float64"``) so
consumers can distinguish "float because masking promoted it" from
"float because the caller cast it". ``None`` means no caller cast
happened; the attr is omitted in that case.

Contract (splits the two meanings previously fused into
``attrs['nodata']`` per issue #1988):
``attrs['nodata']`` per issue #1988, extended for #2135):

* ``attrs['nodata']`` -- declared file sentinel, as a scalar of the
source dtype. Set whenever the source declared one, regardless of
whether the array is float-with-NaN or int-with-sentinels.
* ``attrs['masked_nodata']`` -- the ``masked`` value the caller
passed, coerced to bool. Only emitted when ``nodata is not
None``; absence of the flag means there is no declared sentinel.
* ``attrs['nodata_pixels_present']`` (additive, #2135) -- bool,
only emitted when ``nodata is not None`` and ``pixels_present``
is not ``None``. Tracks whether the read window contained any
sentinel pixel before masking.
* ``attrs['nodata_dtype_cast']`` (additive, #2135) -- string dtype
name (e.g. ``"float64"``), only emitted when ``nodata is not
None`` and ``dtype_cast`` is not ``None``. Records that a
caller-requested cast happened after masking.

Pre-#2092 the helper inferred ``masked`` from the final array
dtype, which lied when ``mask_nodata=False`` left literal sentinel
Expand All @@ -275,6 +318,10 @@ def _set_nodata_attrs(attrs: dict, nodata, *, masked: bool) -> None:
return
attrs['nodata'] = nodata
attrs['masked_nodata'] = bool(masked)
if pixels_present is not None:
attrs['nodata_pixels_present'] = bool(pixels_present)
if dtype_cast is not None:
attrs['nodata_dtype_cast'] = str(dtype_cast)


def _validate_read_geo_info(
Expand Down
60 changes: 60 additions & 0 deletions xrspatial/geotiff/_backends/_gpu_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,66 @@ def _is_gpu_data(data) -> bool:
return isinstance(data, _cupy_type)


def _apply_nodata_mask_gpu_with_presence(arr_gpu, nodata):
"""Same as :func:`_apply_nodata_mask_gpu` but also reports presence.

Kept as a sibling helper rather than collapsed with
``_apply_nodata_mask_gpu`` because the original is a hot inner-loop
callee from the GPU writers and the per-call cost of computing the
presence bit (one extra ``.any().item()`` plus a small Python-side
tuple) is paid even when the caller does not need it. A future
refactor can fold both into one helper with an opt-in flag if a
third variant appears; for now the duplication keeps the original
no-overhead path intact.

Returns ``(arr_gpu, pixels_present)`` where ``pixels_present`` is a
bool indicating whether any pixel in ``arr_gpu`` matched the
declared sentinel before masking. Lets the eager GPU read path
surface ``attrs['nodata_pixels_present']`` (issue #2135) without
rescanning the buffer.

``pixels_present`` is ``False`` when ``nodata`` is ``None`` (no
declared sentinel), when the sentinel is out of range for an
integer buffer, or when the sentinel is fractional / non-finite
and cannot match an integer pixel -- the same fast-paths the
masking branch takes. On a NaN-only sentinel against a float
buffer, ``pixels_present`` is whether the buffer already contains
NaN.
"""
import cupy

if nodata is None:
return arr_gpu, False
arr_dtype = np.dtype(str(arr_gpu.dtype))
if arr_dtype.kind == 'f':
if np.isnan(nodata):
# NaN-only sentinel: no replacement happens, but report
# whether any NaN already lives in the buffer so the attr
# stays informative.
return arr_gpu, bool(cupy.isnan(arr_gpu).any().item())
sentinel = arr_dtype.type(nodata)
mask = arr_gpu == sentinel
present = bool(mask.any().item())
if present:
cupy.putmask(arr_gpu, mask, arr_dtype.type('nan'))
return arr_gpu, present
if arr_dtype.kind in ('u', 'i'):
if not (np.isfinite(nodata) and float(nodata).is_integer()):
return arr_gpu, False
nodata_int = int(nodata)
info = np.iinfo(arr_dtype)
if not (info.min <= nodata_int <= info.max):
return arr_gpu, False
sentinel = arr_dtype.type(nodata_int)
mask = arr_gpu == sentinel
present = bool(mask.any().item())
if present:
arr_gpu = arr_gpu.astype(cupy.float64)
cupy.putmask(arr_gpu, mask, cupy.float64('nan'))
return arr_gpu, present
return arr_gpu, False


def _apply_nodata_mask_gpu(arr_gpu, nodata):
"""Replace nodata sentinel values with NaN on a CuPy array.

Expand Down
9 changes: 9 additions & 0 deletions xrspatial/geotiff/_backends/dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,9 +354,18 @@ def read_geotiff_dask(source: str, *,
# on an int file), the in-memory buffers hold literal sentinel
# values. True iff the caller opted into masking and the graph
# dtype is float.
# ``nodata_pixels_present`` is intentionally left unset on the
# dask path (issue #2135). A strict per-chunk reduction would force
# an eager ``.compute()`` here, defeating the lazy contract; callers
# that need the answer can fall back to scanning the materialised
# array. ``nodata_dtype_cast`` is recorded when the caller passed an
# explicit ``dtype=`` kwarg so downstream can tell float-by-cast
# apart from float-by-masking on the lazy output.
_set_nodata_attrs(
attrs, nodata_attr,
masked=(mask_nodata and target_dtype.kind == 'f'),
pixels_present=None,
dtype_cast=(np.dtype(dtype).name if dtype is not None else None),
)

if isinstance(chunks, int):
Expand Down
26 changes: 24 additions & 2 deletions xrspatial/geotiff/_backends/gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
)
from ._gpu_helpers import (
_apply_nodata_mask_gpu,
_apply_nodata_mask_gpu_with_presence,
_apply_orientation_geo_info,
_apply_orientation_gpu,
_gpu_apply_window_band,
Expand Down Expand Up @@ -443,13 +444,18 @@ def read_geotiff_gpu(source: str, *,
# ``_mask_nodata`` when applicable; fall back to the original
# sentinel otherwise (#1809).
nodata = geo_info.nodata
nodata_pixels_present_attr: bool | None = None
if nodata is not None and mask_nodata:
mask_value = getattr(_stripped_geo, '_mask_nodata', nodata)
arr_gpu = _apply_nodata_mask_gpu(arr_gpu, mask_value)
arr_gpu, nodata_pixels_present_attr = (
_apply_nodata_mask_gpu_with_presence(arr_gpu, mask_value)
)
dtype_cast_attr: str | None = None
if dtype is not None:
target = np.dtype(dtype)
_validate_dtype_cast(np.dtype(str(arr_gpu.dtype)), target)
arr_gpu = arr_gpu.astype(target)
dtype_cast_attr = target.name
# ``attrs['masked_nodata']`` reflects whether the function
# actually replaced sentinel pixels with NaN (#2092). With
# ``mask_nodata=False`` the GPU mask kernel above is
Expand All @@ -459,6 +465,8 @@ def read_geotiff_gpu(source: str, *,
attrs, nodata,
masked=(mask_nodata
and np.dtype(str(arr_gpu.dtype)).kind == 'f'),
pixels_present=nodata_pixels_present_attr,
dtype_cast=dtype_cast_attr,
)
# ``read_to_array`` already applied window + band slicing, so
# ``arr_gpu`` is at output shape. Compute coords for that
Expand Down Expand Up @@ -780,6 +788,7 @@ def _read_once():
# dtype cast so the float promotion for masked integer rasters doesn't
# surprise a user-supplied dtype.
nodata = geo_info.nodata
nodata_pixels_present_attr: bool | None = None
if nodata is not None and mask_nodata:
# When MinIsWhite was applied, the mask must use the inverted
# sentinel; otherwise the original sentinel. The pure GPU path
Expand All @@ -794,12 +803,17 @@ def _read_once():
_cpu_fallback_geo, '_mask_nodata', nodata)
else:
_gpu_mask_value = nodata
arr_gpu = _apply_nodata_mask_gpu(arr_gpu, _gpu_mask_value)
arr_gpu, nodata_pixels_present_attr = (
_apply_nodata_mask_gpu_with_presence(
arr_gpu, _gpu_mask_value)
)

dtype_cast_attr: str | None = None
if dtype is not None:
target = np.dtype(dtype)
_validate_dtype_cast(np.dtype(str(arr_gpu.dtype)), target)
arr_gpu = arr_gpu.astype(target)
dtype_cast_attr = target.name

# Build DataArray
if name is None:
Expand All @@ -823,6 +837,8 @@ def _read_once():
attrs, nodata,
masked=(mask_nodata
and np.dtype(str(arr_gpu.dtype)).kind == 'f'),
pixels_present=nodata_pixels_present_attr,
dtype_cast=dtype_cast_attr,
)

# Apply window/band slicing post-decode. Coords are derived from the
Expand Down Expand Up @@ -1373,9 +1389,15 @@ def _chunk_task(meta, r0, c0, r1, c1):
# equal to ``file_dtype`` (see the float-promotion gate earlier
# in this function), so the rule below is equivalent to "graph
# dtype is float AND the caller opted into masking."
# ``nodata_pixels_present`` stays unset on the dask+GPU path for the
# same reason as the dask+numpy path: a strict per-chunk reduction
# would force an eager ``.compute()`` (issue #2135). ``dtype_cast``
# records the caller-supplied ``dtype=`` kwarg when present.
_set_nodata_attrs(
attrs, nodata,
masked=(mask_nodata and declared_dtype.kind == 'f'),
pixels_present=None,
dtype_cast=(np.dtype(dtype).name if dtype is not None else None),
)

if name is None:
Expand Down
Loading
Loading