Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions docs/source/user_guide/attrs_contract.rst
Original file line number Diff line number Diff line change
Expand Up @@ -68,11 +68,13 @@ write.
``_resolve_nodata_attr``.
* - ``masked_nodata``
- bool
- Paired with ``nodata``. ``True`` when the in-memory array is
float dtype and the reader's sentinel-to-NaN step ran;
``False`` when the array still carries the literal integer
sentinel. Only set when ``nodata`` is set; absence means no
declared sentinel.
- Paired with ``nodata``. ``True`` when the reader actually
replaced sentinel pixels with NaN (so the buffer is NaN-aware);
``False`` when the array still carries the literal sentinel
values, including the case where the array is float dtype
because the caller passed ``mask_nodata=False`` together with
``dtype=float...``. Only set when ``nodata`` is set; absence
means no declared sentinel. See issue #2092.
* - ``raster_type``
- str
- ``'point'`` when the file declares ``RasterPixelIsPoint``;
Expand Down
16 changes: 12 additions & 4 deletions xrspatial/geotiff/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -686,10 +686,18 @@ def open_geotiff(source: str | BinaryIO, *,
_validate_dtype_cast(arr.dtype, target)
arr = arr.astype(target)

# Set ``attrs['nodata']`` + ``attrs['masked_nodata']`` after the
# mask + optional dtype cast so ``masked_nodata`` reflects the
# final array dtype (issue #1988).
_set_nodata_attrs(attrs, nodata, array_dtype=arr.dtype)
# ``attrs['masked_nodata']`` reflects whether the function
# actually replaced sentinel pixels with NaN (#2092). The pre-fix
# implementation inferred it from final dtype, which lied when
# ``mask_nodata=False`` left literal sentinel values in a float
# buffer. True iff the caller opted into masking and the result
# is a float-dtype buffer (the mask block above either ran the
# float-replacement step or promoted an int input to float64;
# either way the buffer holds NaN where the sentinel used to be).
_set_nodata_attrs(
attrs, nodata,
masked=(mask_nodata and arr.dtype.kind == 'f'),
)

if arr.ndim == 3:
dims = ['y', 'x', 'band']
Expand Down
38 changes: 20 additions & 18 deletions xrspatial/geotiff/_attrs.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,34 +241,36 @@ def _should_restore_nan_sentinel(attrs) -> bool:
return value is not False


def _set_nodata_attrs(attrs: dict, nodata, *, array_dtype) -> None:
def _set_nodata_attrs(attrs: dict, nodata, *, masked: bool) -> None:
"""Set ``attrs['nodata']`` and ``attrs['masked_nodata']`` on a read.

Splits the two meanings previously fused into ``attrs['nodata']``
(issue #1988):
``masked`` is the actual mask-decision the read path made: True iff
sentinel pixels in the in-memory buffer have been replaced with NaN
(or the buffer is NaN-aware as a result of the reader's masking
step). False iff the literal sentinel values are still present in
the buffer.

Contract (splits the two meanings previously fused into
``attrs['nodata']`` per issue #1988):

* ``attrs['nodata']`` -- declared file sentinel, as a scalar of the
source dtype. Set whenever the source declared one, regardless of
whether the array is float-with-NaN or int-with-sentinels.
* ``attrs['masked_nodata']`` -- boolean flag. ``True`` iff the in-
memory array has been NaN-masked (i.e. it is float dtype and the
reader's sentinel-to-NaN step ran). ``False`` iff the array still
carries the literal integer sentinel value.

Callers pass ``array_dtype`` as the final post-mask, post-cast dtype
of the array that will be wrapped in the returned DataArray. The
float/non-float split drives the ``masked_nodata`` value: any float
output is treated as NaN-aware (NaN is the sentinel proxy), any
integer output still carries the raw sentinel.

``masked_nodata`` is only emitted when ``nodata is not None``. With
no declared sentinel, the flag is meaningless and its absence is the
signal.
* ``attrs['masked_nodata']`` -- the ``masked`` value the caller
passed, coerced to bool. Only emitted when ``nodata is not
None``; absence of the flag means there is no declared sentinel.

Pre-#2092 the helper inferred ``masked`` from the final array
dtype, which lied when ``mask_nodata=False`` left literal sentinel
values in a float buffer; downstream code that trusted the attr
treated those literal values as already-NaN. The eager, dask, GPU,
and VRT paths now compute ``masked`` as
``mask_nodata and final_dtype.kind == 'f'``. See issue #2092.
"""
if nodata is None:
return
attrs['nodata'] = nodata
attrs['masked_nodata'] = bool(np.dtype(array_dtype).kind == 'f')
attrs['masked_nodata'] = bool(masked)


def _validate_read_geo_info(
Expand Down
17 changes: 11 additions & 6 deletions xrspatial/geotiff/_backends/dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,12 +340,17 @@ def read_geotiff_dask(source: str, *,

attrs = {}
_populate_attrs_from_geo_info(attrs, geo_info, window=window)
# ``masked_nodata`` reflects the declared dask graph dtype: a float
# graph means every chunk runs the sentinel-to-NaN promotion (or
# already arrived as float), so the in-memory array is NaN-masked.
# An integer graph means each chunk keeps the literal sentinel
# value. See issue #1988.
_set_nodata_attrs(attrs, nodata_attr, array_dtype=target_dtype)
# ``masked_nodata`` reflects whether per-chunk masking actually
# runs in the lazy graph (#2092). With ``mask_nodata=False`` each
# chunk skips the sentinel-to-NaN step, so even if the graph
# dtype happens to be float (e.g. caller-supplied ``dtype=float64``
# on an int file), the in-memory buffers hold literal sentinel
# values. True iff the caller opted into masking and the graph
# dtype is float.
_set_nodata_attrs(
attrs, nodata_attr,
masked=(mask_nodata and target_dtype.kind == 'f'),
)

if isinstance(chunks, int):
ch_h = ch_w = chunks
Expand Down
34 changes: 25 additions & 9 deletions xrspatial/geotiff/_backends/gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -450,11 +450,15 @@ def read_geotiff_gpu(source: str, *,
target = np.dtype(dtype)
_validate_dtype_cast(np.dtype(str(arr_gpu.dtype)), target)
arr_gpu = arr_gpu.astype(target)
# ``attrs['nodata']`` + ``attrs['masked_nodata']`` reflect the
# post-mask, post-cast array dtype (issue #1988).
# ``attrs['masked_nodata']`` reflects whether the function
# actually replaced sentinel pixels with NaN (#2092). With
# ``mask_nodata=False`` the GPU mask kernel above is
# skipped, so the buffer holds literal sentinel values
# even if the final dtype is float.
_set_nodata_attrs(
attrs, nodata,
array_dtype=np.dtype(str(arr_gpu.dtype)),
masked=(mask_nodata
and np.dtype(str(arr_gpu.dtype)).kind == 'f'),
)
# ``read_to_array`` already applied window + band slicing, so
# ``arr_gpu`` is at output shape. Compute coords for that
Expand Down Expand Up @@ -810,10 +814,15 @@ def _read_once():

attrs = {}
_populate_attrs_from_geo_info(attrs, geo_info, window=window)
# ``attrs['nodata']`` + ``attrs['masked_nodata']`` reflect the
# post-mask, post-cast array dtype (issue #1988).
# ``attrs['masked_nodata']`` reflects whether the function
# actually replaced sentinel pixels with NaN (#2092). The GPU
# mask kernel runs only when ``mask_nodata=True`` and a sentinel
# is declared, so the post-cast float dtype alone is not enough
# to claim masking.
_set_nodata_attrs(
attrs, nodata, array_dtype=np.dtype(str(arr_gpu.dtype)),
attrs, nodata,
masked=(mask_nodata
and np.dtype(str(arr_gpu.dtype)).kind == 'f'),
)

# Apply window/band slicing post-decode. Coords are derived from the
Expand Down Expand Up @@ -1358,9 +1367,16 @@ def _chunk_task(meta, r0, c0, r1, c1):

attrs = {}
_populate_attrs_from_geo_info(attrs, geo_info, window=window)
# ``masked_nodata`` reflects the declared dask graph dtype; mirrors
# the dask+numpy backend contract (issue #1988).
_set_nodata_attrs(attrs, nodata, array_dtype=declared_dtype)
# ``masked_nodata`` reflects whether per-chunk masking actually
# runs in the lazy graph (#2092); mirrors the dask+numpy backend
# contract. With ``mask_nodata=False`` ``declared_dtype`` stays
# equal to ``file_dtype`` (see the float-promotion gate earlier
# in this function), so the rule below is equivalent to "graph
# dtype is float AND the caller opted into masking."
_set_nodata_attrs(
attrs, nodata,
masked=(mask_nodata and declared_dtype.kind == 'f'),
)

if name is None:
import os
Expand Down
35 changes: 29 additions & 6 deletions xrspatial/geotiff/_backends/vrt.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,17 @@ def read_vrt(source: str, *,
if mask_nodata:
arr = _vrt_apply_integer_sentinel_mask(arr, vrt, band)

# Capture pre-cast dtype: ``_vrt._read_data`` NaN-masks float source
# arrays (and int sources feeding a float VRT dataType) inline, and
# ``_vrt_apply_integer_sentinel_mask`` above promotes int-with-sentinel
# to float64 with NaNs written in place. Any of those paths yield a
# float pre-cast dtype; an int pre-cast dtype means literal sentinels
# are still in the buffer. The optional user dtype cast below may
# promote int -> float without masking, so reading dtype after the
# cast would falsely claim ``masked_nodata=True`` (issue #2092
# follow-up).
pre_cast_dtype = np.dtype(str(arr.dtype))

# Surface the source GeoTransform in the same rasterio ordering used
# by open_geotiff: (pixel_width, 0, origin_x, 0, pixel_height, origin_y).
# vrt.geo_transform is GDAL ordering, so reorder. For a windowed read
Expand All @@ -343,11 +354,10 @@ def read_vrt(source: str, *,
_validate_dtype_cast(np.dtype(str(arr.dtype)), target)
arr = arr.astype(target)

# Record ``nodata`` + ``masked_nodata`` from the final array dtype
# (issue #1988). On GPU the dtype is read off the CuPy array via
# ``str(arr.dtype)`` so ``np.dtype`` accepts it without a CuPy
# dependency in ``_set_nodata_attrs``.
_set_nodata_attrs(attrs, nodata, array_dtype=np.dtype(str(arr.dtype)))
_set_nodata_attrs(
attrs, nodata,
masked=(pre_cast_dtype.kind == 'f'),
)

if arr.ndim == 3:
dims = ['y', 'x', 'band']
Expand Down Expand Up @@ -708,7 +718,20 @@ def _read_vrt_chunked(source, *, window, band, name, chunks, gpu, dtype,
if vrt.bands:
band_idx_for_nodata = band if band is not None else 0
nodata_meta = vrt.bands[band_idx_for_nodata].nodata
_set_nodata_attrs(attrs, nodata_meta, array_dtype=final_dtype)
# VRT chunked path: the per-task VRT reader NaN-masks float source
# arrays inline, and ``declared_dtype`` is promoted to float64 only
# when ``mask_nodata`` is on and an integer band has a representable
# sentinel (see the ``declared_dtype`` block earlier in this
# function). Either way, ``declared_dtype.kind == 'f'`` is the
# correct gate for "buffer is NaN-aware." A user-supplied
# ``dtype=`` cast happens on top of the lazy graph above (see
# ``final_dtype`` block) and must not flip this attr, so we read
# the pre-cast ``declared_dtype`` here rather than ``final_dtype``
# (#2092 follow-up).
_set_nodata_attrs(
attrs, nodata_meta,
masked=(declared_dtype.kind == 'f'),
)

# Static hole detection: mirror the eager-path ``attrs['vrt_holes']``
# contract (#1734) by scanning every source referenced in the parsed
Expand Down
Loading
Loading