Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 35 additions & 14 deletions xrspatial/geotiff/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,7 @@ def open_geotiff(source: str | BinaryIO, *,
missing_sources: str = _MISSING_SOURCES_SENTINEL,
allow_rotated: bool = False,
allow_unparseable_crs: bool = False,
mask_nodata: bool = True,
) -> xr.DataArray:
"""Read a GeoTIFF, COG, or VRT file into an xarray.DataArray.

Expand Down Expand Up @@ -322,6 +323,17 @@ def open_geotiff(source: str | BinaryIO, *,
return a partial mosaic. Passing this kwarg with a non-VRT
source raises ``ValueError`` because the policy only applies to
the VRT pipeline. See ``read_vrt`` for the full description.
mask_nodata : bool, default True
If True (the default), replace the nodata sentinel with ``NaN``;
integer rasters get promoted to ``float64`` first so NaN can be
represented. If False, skip the sentinel-to-NaN step and keep
the source dtype. ``attrs['nodata']`` still carries the raw
sentinel either way, so downstream code can mask explicitly.
Pass ``mask_nodata=False`` when you want to preserve an integer
source dtype via ``dtype=``: the default ``mask_nodata=True``
promotes to ``float64`` whenever the sentinel matches an actual
pixel, and ``dtype=<integer>`` then raises ``ValueError`` on the
float-to-int cast.

Returns
-------
Expand All @@ -346,10 +358,14 @@ def open_geotiff(source: str | BinaryIO, *,

Integer rasters with a nodata sentinel are silently promoted to
``float64`` with NaN replacing the sentinel so downstream NaN-aware
code works uniformly. Pass ``dtype=...`` to keep the source dtype
(the cast will fail with ``ValueError`` for float-to-int because that
is lossy in a way users rarely intend; cast explicitly after read if
you need it).
code works uniformly. To keep the source dtype on a file whose
sentinel matches actual pixels, pass ``mask_nodata=False``; the raw
sentinel stays in the data and ``attrs['nodata']`` still carries it.
Passing ``dtype=<integer>`` on its own is not enough: the
sentinel-to-NaN promotion runs first and the subsequent integer cast
then raises ``ValueError`` (float-to-int is lossy in a way users
rarely intend). When the file has no in-range sentinel match, the
promotion is skipped and ``dtype=<integer>`` works either way.
"""
from ._reader import _coerce_path

Expand Down Expand Up @@ -447,6 +463,7 @@ def open_geotiff(source: str | BinaryIO, *,
max_pixels=max_pixels,
allow_rotated=allow_rotated,
allow_unparseable_crs=allow_unparseable_crs,
mask_nodata=mask_nodata,
**vrt_kwargs)

# File-like buffers don't support the GPU or dask code paths because
Expand Down Expand Up @@ -474,6 +491,7 @@ def open_geotiff(source: str | BinaryIO, *,
max_pixels=max_pixels,
allow_rotated=allow_rotated,
allow_unparseable_crs=allow_unparseable_crs,
mask_nodata=mask_nodata,
**gpu_kwargs)

# Dask path (CPU)
Expand All @@ -483,7 +501,8 @@ def open_geotiff(source: str | BinaryIO, *,
window=window, band=band,
max_pixels=max_pixels, name=name,
allow_rotated=allow_rotated,
allow_unparseable_crs=allow_unparseable_crs)
allow_unparseable_crs=allow_unparseable_crs,
mask_nodata=mask_nodata)

kwargs = {}
if max_pixels is not None:
Expand Down Expand Up @@ -543,16 +562,18 @@ def open_geotiff(source: str | BinaryIO, *,
# write is safe; an extra ``arr.copy()`` would just double peak
# memory for a multi-MB raster.
nodata = geo_info.nodata
if nodata is not None:
if nodata is not None and mask_nodata:
# When the reader applied MinIsWhite, the sentinel-equality mask
# must compare against the inverted sentinel value (issue #1809).
# ``read_to_array`` / ``_read_cog_http`` stash that value on
# ``geo_info._mask_nodata``; fall back to the original sentinel
# on non-MinIsWhite files.
mask_nodata = getattr(geo_info, '_mask_nodata', nodata)
# on non-MinIsWhite files. Renamed from ``mask_nodata`` to
# ``nodata_sentinel`` so the local does not shadow the
# ``mask_nodata`` opt-out kwarg (#2052).
nodata_sentinel = getattr(geo_info, '_mask_nodata', nodata)
if arr.dtype.kind == 'f':
if mask_nodata is not None and not np.isnan(mask_nodata):
arr[arr == arr.dtype.type(mask_nodata)] = np.nan
if nodata_sentinel is not None and not np.isnan(nodata_sentinel):
arr[arr == arr.dtype.type(nodata_sentinel)] = np.nan
elif arr.dtype.kind in ('u', 'i'):
# Integer arrays: convert to float to represent NaN.
# An out-of-range sentinel (e.g. uint16 file with
Expand All @@ -571,10 +592,10 @@ def open_geotiff(source: str | BinaryIO, *,
# ``_writer.py`` / ``_vrt.py`` pattern used for #1564 / #1616).
# attrs['nodata'] still carries the raw sentinel so a write
# round-trip preserves the tag.
if (mask_nodata is not None
and np.isfinite(mask_nodata)
and float(mask_nodata).is_integer()):
nodata_int = int(mask_nodata)
if (nodata_sentinel is not None
and np.isfinite(nodata_sentinel)
and float(nodata_sentinel).is_integer()):
nodata_int = int(nodata_sentinel)
info = np.iinfo(arr.dtype)
if info.min <= nodata_int <= info.max:
mask = arr == arr.dtype.type(nodata_int)
Expand Down
24 changes: 21 additions & 3 deletions xrspatial/geotiff/_backends/dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ def read_geotiff_dask(source: str, *,
chunks: int | tuple = 512,
max_pixels: int | None = None,
allow_rotated: bool = False,
allow_unparseable_crs: bool = False) -> xr.DataArray:
allow_unparseable_crs: bool = False,
mask_nodata: bool = True) -> xr.DataArray:
"""Read a GeoTIFF as a dask-backed DataArray for out-of-core processing.

Each chunk is loaded lazily via windowed reads.
Expand Down Expand Up @@ -73,6 +74,14 @@ def read_geotiff_dask(source: str, *,
directly.
name : str or None
Name for the DataArray.
mask_nodata : bool, default True
If True, replace the nodata sentinel with NaN per chunk (integer
rasters get promoted to ``float64``). If False, skip the
sentinel-to-NaN step so the source dtype survives. The raw
sentinel is still carried on ``attrs['nodata']`` either way.
Pass ``mask_nodata=False`` together with ``dtype=<integer>`` to
keep an integer source dtype; the default promotes to
``float64`` and the cast then raises. See issue #2052.

Returns
-------
Expand Down Expand Up @@ -105,6 +114,7 @@ def read_geotiff_dask(source: str, *,
return read_vrt(
source, dtype=dtype, window=window, band=band, name=name,
chunks=chunks, max_pixels=max_pixels,
mask_nodata=mask_nodata,
)

# P5: HTTP COG sources used to fire one IFD/header GET per chunk
Expand Down Expand Up @@ -208,7 +218,8 @@ def read_geotiff_dask(source: str, *,
# pass an exotic ``nodata`` type (e.g. complex) on the no-op path
# rather than surfacing an opaque error here.
effective_dtype = file_dtype
if (nodata is not None
if (mask_nodata
and nodata is not None
and file_dtype.kind in ('u', 'i')
and np.isfinite(nodata)
and float(nodata).is_integer()):
Expand Down Expand Up @@ -379,11 +390,18 @@ def read_geotiff_dask(source: str, *,
# actual dtype (uint16), silently casting later float64
# chunks back to int and converting their NaNs to 0. See
# issue #1597.
# Per-chunk nodata mask is skipped when ``mask_nodata=False``;
# passing ``nodata=None`` short-circuits both the float-NaN and
# int-promotion branches in ``_delayed_read_window``. The
# original sentinel is still carried in ``attrs['nodata']`` via
# ``nodata_attr`` so write round-trips preserve the tag. See
# issue #2052.
chunk_nodata = nodata if mask_nodata else None
block = da.from_delayed(
_delayed_read_window(source,
r0 + win_r0, c0 + win_c0,
r1 + win_r0, c1 + win_c0,
overview_level, nodata,
overview_level, chunk_nodata,
band_arg,
target_dtype=target_dtype,
http_meta_key=http_meta_key,
Expand Down
27 changes: 21 additions & 6 deletions xrspatial/geotiff/_backends/gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ def read_geotiff_gpu(source: str, *,
on_gpu_failure: str = _ON_GPU_FAILURE_SENTINEL,
allow_rotated: bool = False,
allow_unparseable_crs: bool = False,
mask_nodata: bool = True,
gpu: str = _GPU_DEPRECATED_SENTINEL,
) -> xr.DataArray:
"""Read a GeoTIFF with GPU-accelerated decompression via Numba CUDA.
Expand Down Expand Up @@ -171,6 +172,13 @@ def read_geotiff_gpu(source: str, *,
``TypeError``. The old name shipped with values ``'auto'`` /
``'strict'`` and was easy to confuse with the boolean ``gpu=``
kwarg on ``open_geotiff`` / ``to_geotiff`` / ``read_vrt``.
mask_nodata : bool, default True
If True, replace the nodata sentinel with NaN (integer rasters
get promoted to ``float64`` first). If False, keep the source
dtype and leave the raw sentinel in the data. ``attrs['nodata']``
carries the sentinel either way. Pass ``mask_nodata=False``
together with ``dtype=<integer>`` to preserve an integer source
dtype on a file with a matching sentinel. See issue #2052.

Returns
-------
Expand Down Expand Up @@ -241,6 +249,7 @@ def read_geotiff_gpu(source: str, *,
name=name, max_pixels=max_pixels,
allow_rotated=allow_rotated,
allow_unparseable_crs=allow_unparseable_crs,
mask_nodata=mask_nodata,
)

from .._reader import (
Expand Down Expand Up @@ -405,7 +414,7 @@ def read_geotiff_gpu(source: str, *,
# ``_mask_nodata`` when applicable; fall back to the original
# sentinel otherwise (#1809).
nodata = geo_info.nodata
if nodata is not None:
if nodata is not None and mask_nodata:
mask_value = getattr(_stripped_geo, '_mask_nodata', nodata)
arr_gpu = _apply_nodata_mask_gpu(arr_gpu, mask_value)
if dtype is not None:
Expand Down Expand Up @@ -697,7 +706,7 @@ def _read_once():
# dtype cast so the float promotion for masked integer rasters doesn't
# surprise a user-supplied dtype.
nodata = geo_info.nodata
if nodata is not None:
if nodata is not None and mask_nodata:
# When MinIsWhite was applied, the mask must use the inverted
# sentinel; otherwise the original sentinel. The pure GPU path
# records the inverted sentinel in ``_mw_mask_nodata`` above; the
Expand Down Expand Up @@ -902,7 +911,8 @@ def _decode_window_gpu_direct(file_path, all_offsets, all_byte_counts,
def _read_geotiff_gpu_chunked(source, *, dtype, chunks, overview_level,
window, band, name, max_pixels,
allow_rotated: bool = False,
allow_unparseable_crs: bool = False):
allow_unparseable_crs: bool = False,
mask_nodata: bool = True):
"""Lazy Dask+CuPy backend for ``read_geotiff_gpu(chunks=...)``.

Two paths produce the same shape of dask graph:
Expand Down Expand Up @@ -964,6 +974,7 @@ def _read_geotiff_gpu_chunked(source, *, dtype, chunks, overview_level,
name=name, max_pixels=max_pixels,
allow_rotated=allow_rotated,
allow_unparseable_crs=allow_unparseable_crs,
mask_nodata=mask_nodata,
)
except Exception:
# GDS qualification failed; fall back to the CPU path. The
Expand All @@ -977,6 +988,7 @@ def _read_geotiff_gpu_chunked(source, *, dtype, chunks, overview_level,
max_pixels=max_pixels, name=name,
allow_rotated=allow_rotated,
allow_unparseable_crs=allow_unparseable_crs,
mask_nodata=mask_nodata,
)

cpu_dask_arr = cpu_da.data
Expand All @@ -1000,7 +1012,8 @@ def _read_geotiff_gpu_chunked_gds(source, ifd, geo_info, header, *,
dtype, chunks, window, band, name,
max_pixels,
allow_rotated: bool = False,
allow_unparseable_crs: bool = False):
allow_unparseable_crs: bool = False,
mask_nodata: bool = True):
"""Build a Dask+CuPy graph that decodes each chunk disk->GPU.

Caller must have verified that the source qualifies via
Expand Down Expand Up @@ -1108,8 +1121,10 @@ def _read_geotiff_gpu_chunked_gds(source, ifd, geo_info, header, *,

# Determine declared dtype for the dask graph. Nodata masking
# promotes integer rasters to float64; mirror the CPU dask path.
# When ``mask_nodata=False`` the masking is skipped, so no promotion.
declared_dtype = file_dtype
if nodata is not None and file_dtype.kind in ('u', 'i'):
if (mask_nodata and nodata is not None
and file_dtype.kind in ('u', 'i')):
if np.isfinite(nodata) and float(nodata).is_integer():
info = np.iinfo(file_dtype)
if info.min <= int(nodata) <= info.max:
Expand All @@ -1127,7 +1142,7 @@ def _chunk_task(meta, r0, c0, r1, c1):
r0, c0, r1, c1,
masked_fill=masked_fill,
)
if nodata is not None:
if nodata is not None and mask_nodata:
arr = _apply_nodata_mask_gpu(arr, nodata)
if dtype is not None:
target = np.dtype(dtype)
Expand Down
34 changes: 27 additions & 7 deletions xrspatial/geotiff/_backends/vrt.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ def read_vrt(source: str, *,
missing_sources: str = 'raise',
allow_rotated: bool = False,
allow_unparseable_crs: bool = False,
band_nodata: str | None = None) -> xr.DataArray:
band_nodata: str | None = None,
mask_nodata: bool = True) -> xr.DataArray:
"""Read a GDAL Virtual Raster Table (.vrt) into an xarray.DataArray.

The VRT's source GeoTIFFs are read via windowed reads and assembled
Expand Down Expand Up @@ -84,6 +85,16 @@ def read_vrt(source: str, *,
sentinel (or zero on integer bands without a sentinel).
``XRSPATIAL_GEOTIFF_STRICT=1`` forces a raise across the whole
module regardless of this kwarg.
mask_nodata : bool, default True
If True, run the integer-sentinel-to-NaN promotion on the
assembled mosaic. If False, skip it and keep the source dtype
with the raw sentinel still in the data. ``attrs['nodata']``
carries the sentinel either way. Pass ``mask_nodata=False``
together with ``dtype=<integer>`` when you need to preserve an
integer source dtype on a VRT whose declared sentinel matches
actual pixels. See issue #2052. Float source bands are NaN-aware
by virtue of how the internal reader handles them, so this kwarg
is most useful for integer-dtype mosaics.

Returns
-------
Expand Down Expand Up @@ -164,6 +175,7 @@ def read_vrt(source: str, *,
allow_rotated=allow_rotated,
allow_unparseable_crs=allow_unparseable_crs,
band_nodata=band_nodata,
mask_nodata=mask_nodata,
)

# Issue #1987 ambiguous-metadata checks for the eager VRT path. Parse
Expand Down Expand Up @@ -276,7 +288,10 @@ def read_vrt(source: str, *,
# promoting ``arr`` to float64 on the first sentinel hit and writing
# NaNs in place on the promoted view. Shared with the chunked path
# (issue #1825) so behaviour stays in lockstep. See issue #1611.
arr = _vrt_apply_integer_sentinel_mask(arr, vrt, band)
# ``mask_nodata=False`` skips this so callers can preserve an
# integer source dtype via ``dtype=...`` (issue #2052).
if mask_nodata:
arr = _vrt_apply_integer_sentinel_mask(arr, vrt, band)

# Surface the source GeoTransform in the same rasterio ordering used
# by open_geotiff: (pixel_width, 0, origin_x, 0, pixel_height, origin_y).
Expand Down Expand Up @@ -323,7 +338,8 @@ def read_vrt(source: str, *,

def _vrt_chunk_read(source, r0, c0, r1, c1, *,
band, max_pixels, missing_sources,
declared_dtype, gpu, parsed_vrt):
declared_dtype, gpu, parsed_vrt,
mask_nodata: bool = True):
"""Decode a single chunk window from a VRT.

Called by ``dask.delayed`` from :func:`_read_vrt_chunked`. The
Expand Down Expand Up @@ -360,8 +376,10 @@ def _vrt_chunk_read(source, r0, c0, r1, c1, *,
# promotes to float64 when sentinels hit. The surrounding dask graph
# already declared float64 when any band has a representable integer
# sentinel, so any chunk that actually fires the mask returns a
# buffer whose dtype matches the declared one.
arr = _apply_integer_sentinel_mask(arr, vrt, band)
# buffer whose dtype matches the declared one. Skip the helper when
# ``mask_nodata=False`` so the source integer dtype survives (#2052).
if mask_nodata:
arr = _apply_integer_sentinel_mask(arr, vrt, band)

if declared_dtype is not None and arr.dtype != declared_dtype:
arr = arr.astype(declared_dtype)
Expand All @@ -377,7 +395,8 @@ def _read_vrt_chunked(source, *, window, band, name, chunks, gpu, dtype,
max_pixels, missing_sources,
allow_rotated: bool = False,
allow_unparseable_crs: bool = False,
band_nodata: str | None = None):
band_nodata: str | None = None,
mask_nodata: bool = True):
"""Lazy ``read_vrt`` dispatch when ``chunks=`` is set (issue #1814).

Parses the VRT XML once to recover the extent, CRS, GeoTransform,
Expand Down Expand Up @@ -539,7 +558,7 @@ def _read_vrt_chunked(source, *, window, band, name, chunks, gpu, dtype,
# See also Copilot review on PR #1822.
declared_dtype = _effective_dtype_for_bands(selected_bands)

if declared_dtype.kind in ('u', 'i'):
if mask_nodata and declared_dtype.kind in ('u', 'i'):
promotes = False
for vrt_band in selected_bands:
if _sentinel_for_dtype(vrt_band.nodata,
Expand Down Expand Up @@ -602,6 +621,7 @@ def _read_vrt_chunked(source, *, window, band, name, chunks, gpu, dtype,
declared_dtype=declared_dtype,
gpu=gpu,
parsed_vrt=parsed_vrt_key,
mask_nodata=mask_nodata,
)
block = da.from_delayed(d, shape=block_shape,
dtype=declared_dtype, meta=meta)
Expand Down
Loading
Loading