diff --git a/CHANGELOG.md b/CHANGELOG.md index 6fe2c450e..efd616bb9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,10 @@ - Default internal `_vrt.read_vrt` `missing_sources` to `'raise'` so an unreadable VRT source no longer produces a silent zero-fill hole on integer rasters; pass `missing_sources='warn'` to opt back into the previous lenient behaviour (#1843) - Deprecate read-side emission of matplotlib colormap-derived attrs (cmap, colormap_rgba) on palette TIFFs; the writer cannot set Photometric=3 so they do not round-trip. Construct ListedColormap from attrs['colormap'] in caller code. These attrs still emit for now but trigger a DeprecationWarning. Removal planned for a future release. (#1984) - Reject writes whose `attrs['crs']` and `attrs['crs_wkt']` resolve to different CRSes (after pyproj canonicalisation) instead of silently emitting the EPSG and dropping the WKT. The new `ConflictingCRSError` (subclass of `ValueError` and `GeoTIFFAmbiguousMetadataError`) names the offending attrs; pass `crs=` explicitly to override both attrs and bypass the check. Read-back DataArrays carrying both attrs continue to round-trip because the reader's two attrs derive from the same on-disk CRS. (#1987) +- Reject reads whose CRS string cannot be parsed by pyproj instead of emitting it verbatim in `attrs['crs_wkt']` and letting downstream code crash on first use. Raises `UnparseableCRSError` (subclass of `ValueError`); pass `allow_unparseable_crs=True` to `open_geotiff` / `read_geotiff_dask` / `read_geotiff_gpu` / `read_vrt` to keep the legacy behaviour. The existing write-side raise from `_validate_crs_fallback` was retyped from plain `ValueError` to the new `UnparseableCRSError` subclass (no behaviour change). (#1987) +- Reject reads whose affine transform has non-zero rotation/shear terms instead of returning an axis-misaligned grid that downstream xrspatial ops (slope, aspect, hillshade, proximity, zonal) silently compute wrong results on. Raises `RotatedTransformError`; pass `allow_rotated=True` to `open_geotiff` / `read_geotiff_dask` / `read_geotiff_gpu` / `read_vrt` to read the pixel grid without the axis-aligned-grid assumption. (#1987) +- Reject writes whose `coords['y']` or `coords['x']` are not uniformly spaced instead of silently using the first two values as the pixel size and misrepresenting the rest of the axis. Raises `NonUniformCoordsError`; the existing int-dtype sentinel convention from #1969 (used by the no-georef coord fallback) is exempted. (#1987) +- Reject writes whose `attrs['nodata']` disagrees with every concrete entry in `attrs['nodatavals']` instead of silently picking the canonical scalar and dropping the rioxarray tuple. Raises `ConflictingNodataError`; pass `nodata=` explicitly to the writer to override both attrs and bypass the check. `_FillValue` continues to be deprioritised per the existing resolver convention. (#1987) ### Version 0.9.9 - 2026-05-05 diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index c4a9d6bdd..2e2122df3 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -79,6 +79,7 @@ _extent_to_window, _extract_rich_tags, _populate_attrs_from_geo_info, + _validate_read_geo_info, _resolve_nodata_attr, _set_nodata_attrs, ) @@ -251,6 +252,8 @@ def open_geotiff(source: str | BinaryIO, *, max_cloud_bytes=_MAX_CLOUD_BYTES_SENTINEL, on_gpu_failure: str = _ON_GPU_FAILURE_SENTINEL, missing_sources: str = _MISSING_SOURCES_SENTINEL, + allow_rotated: bool = False, + allow_unparseable_crs: bool = False, ) -> xr.DataArray: """Read a GeoTIFF, COG, or VRT file into an xarray.DataArray. @@ -441,7 +444,10 @@ def open_geotiff(source: str | BinaryIO, *, vrt_kwargs['missing_sources'] = missing_sources return read_vrt(source, dtype=dtype, window=window, band=band, name=name, chunks=chunks, gpu=gpu, - max_pixels=max_pixels, **vrt_kwargs) + max_pixels=max_pixels, + allow_rotated=allow_rotated, + allow_unparseable_crs=allow_unparseable_crs, + **vrt_kwargs) # File-like buffers don't support the GPU or dask code paths because # those re-open the source by path from worker tasks or device-side @@ -466,6 +472,8 @@ def open_geotiff(source: str | BinaryIO, *, window=window, band=band, name=name, chunks=chunks, max_pixels=max_pixels, + allow_rotated=allow_rotated, + allow_unparseable_crs=allow_unparseable_crs, **gpu_kwargs) # Dask path (CPU) @@ -473,7 +481,9 @@ def open_geotiff(source: str | BinaryIO, *, return read_geotiff_dask(source, dtype=dtype, chunks=chunks, overview_level=overview_level, window=window, band=band, - max_pixels=max_pixels, name=name) + max_pixels=max_pixels, name=name, + allow_rotated=allow_rotated, + allow_unparseable_crs=allow_unparseable_crs) kwargs = {} if max_pixels is not None: @@ -514,6 +524,14 @@ def open_geotiff(source: str | BinaryIO, *, import os name = os.path.splitext(os.path.basename(source))[0] + # Issue #1987 ambiguous-metadata checks. Run before attrs population + # so a rejected file does not leak a partly-populated attrs dict. + _validate_read_geo_info( + geo_info, window=window, + allow_rotated=allow_rotated, + allow_unparseable_crs=allow_unparseable_crs, + ) + attrs = {} _populate_attrs_from_geo_info(attrs, geo_info, window=window) diff --git a/xrspatial/geotiff/_attrs.py b/xrspatial/geotiff/_attrs.py index 47d5a6b9c..2d6752026 100644 --- a/xrspatial/geotiff/_attrs.py +++ b/xrspatial/geotiff/_attrs.py @@ -467,6 +467,57 @@ def _set_nodata_attrs(attrs: dict, nodata, *, array_dtype) -> None: attrs['masked_nodata'] = bool(np.dtype(array_dtype).kind == 'f') +def _validate_read_geo_info( + geo_info, + *, + window=None, + allow_rotated: bool = False, + allow_unparseable_crs: bool = False, +) -> None: + """Run issue #1987 read-side ambiguous-metadata checks against ``geo_info``. + + Centralised helper so the eager numpy, dask, GPU, and VRT read + paths run the same checks before constructing the returned + DataArray. Forwards ``allow_rotated`` / ``allow_unparseable_crs`` + to the registered checks (``_check_read_rotated_transform`` and + ``_check_read_unparseable_crs`` today; sibling checks attach via + the registry). + + Raises whichever ``GeoTIFFAmbiguousMetadataError`` subclass a + registered check picks. The hook is a no-op when no check is + registered, so callers can use this helper unconditionally without + coupling each backend to the current check list. + + Note: the transform tuple built here is always axis-aligned + (``b == 0`` / ``d == 0``) because ``_transform_tuple_from_pixel_geometry`` + only carries origin + pixel size, and the upstream TIFF reader + rejects rotated ``ModelTransformationTag`` entries with + ``NotImplementedError`` in ``_geotags._extract_transform_and_georef`` + before we reach this helper. The rotated-transform check therefore + fires only on the VRT path, which builds its context from the GDAL + ``geo_transform`` via ``_gdal_geotransform_to_affine_tuple``. + """ + from ._validation import validate_read_metadata + transform_for_check = ( + _transform_tuple_from_pixel_geometry( + geo_info.transform.origin_x, + geo_info.transform.origin_y, + geo_info.transform.pixel_width, + geo_info.transform.pixel_height, + window=window, + ) + if (geo_info.transform is not None + and getattr(geo_info, 'has_georef', True)) + else None + ) + validate_read_metadata({ + 'allow_rotated': allow_rotated, + 'allow_unparseable_crs': allow_unparseable_crs, + 'transform': transform_for_check, + 'crs_wkt': geo_info.crs_wkt, + }) + + def _populate_attrs_from_geo_info(attrs: dict, geo_info, *, window=None) -> None: """Populate ``attrs`` with all GeoTIFF metadata from ``geo_info``. diff --git a/xrspatial/geotiff/_backends/dask.py b/xrspatial/geotiff/_backends/dask.py index b559ea3ae..d36a6b224 100644 --- a/xrspatial/geotiff/_backends/dask.py +++ b/xrspatial/geotiff/_backends/dask.py @@ -17,7 +17,11 @@ import numpy as np import xarray as xr -from .._attrs import _populate_attrs_from_geo_info, _set_nodata_attrs +from .._attrs import ( + _populate_attrs_from_geo_info, + _set_nodata_attrs, + _validate_read_geo_info, +) from .._coords import ( coords_from_geo_info as _coords_from_geo_info, geo_to_coords as _geo_to_coords, @@ -34,7 +38,9 @@ def read_geotiff_dask(source: str, *, band: int | None = None, name: str | None = None, chunks: int | tuple = 512, - max_pixels: int | None = None) -> xr.DataArray: + max_pixels: int | None = None, + allow_rotated: bool = False, + allow_unparseable_crs: bool = False) -> xr.DataArray: """Read a GeoTIFF as a dask-backed DataArray for out-of-core processing. Each chunk is loaded lazily via windowed reads. @@ -294,6 +300,13 @@ def read_geotiff_dask(source: str, *, import os name = os.path.splitext(os.path.basename(source))[0] + # Issue #1987 ambiguous-metadata checks. + _validate_read_geo_info( + geo_info, window=window, + allow_rotated=allow_rotated, + allow_unparseable_crs=allow_unparseable_crs, + ) + attrs = {} _populate_attrs_from_geo_info(attrs, geo_info, window=window) # ``masked_nodata`` reflects the declared dask graph dtype: a float diff --git a/xrspatial/geotiff/_backends/gpu.py b/xrspatial/geotiff/_backends/gpu.py index ceccff97e..175e6f792 100644 --- a/xrspatial/geotiff/_backends/gpu.py +++ b/xrspatial/geotiff/_backends/gpu.py @@ -20,7 +20,11 @@ import numpy as np import xarray as xr -from .._attrs import _populate_attrs_from_geo_info, _set_nodata_attrs +from .._attrs import ( + _populate_attrs_from_geo_info, + _set_nodata_attrs, + _validate_read_geo_info, +) from .._coords import ( coords_from_geo_info as _coords_from_geo_info, ) @@ -80,6 +84,8 @@ def read_geotiff_gpu(source: str, *, chunks: int | tuple | None = None, max_pixels: int | None = None, on_gpu_failure: str = _ON_GPU_FAILURE_SENTINEL, + allow_rotated: bool = False, + allow_unparseable_crs: bool = False, gpu: str = _GPU_DEPRECATED_SENTINEL, ) -> xr.DataArray: """Read a GeoTIFF with GPU-accelerated decompression via Numba CUDA. @@ -233,6 +239,8 @@ def read_geotiff_gpu(source: str, *, source, dtype=dtype, chunks=chunks, overview_level=overview_level, window=window, band=band, name=name, max_pixels=max_pixels, + allow_rotated=allow_rotated, + allow_unparseable_crs=allow_unparseable_crs, ) from .._reader import ( @@ -382,6 +390,11 @@ def read_geotiff_gpu(source: str, *, if name is None: import os name = os.path.splitext(os.path.basename(source))[0] + _validate_read_geo_info( + geo_info, window=window, + allow_rotated=allow_rotated, + allow_unparseable_crs=allow_unparseable_crs, + ) attrs = {} _populate_attrs_from_geo_info(attrs, geo_info, window=window) # Apply nodata mask + record sentinel so the GPU read agrees @@ -710,6 +723,12 @@ def _read_once(): import os name = os.path.splitext(os.path.basename(source))[0] + _validate_read_geo_info( + geo_info, window=window, + allow_rotated=allow_rotated, + allow_unparseable_crs=allow_unparseable_crs, + ) + attrs = {} _populate_attrs_from_geo_info(attrs, geo_info, window=window) # ``attrs['nodata']`` + ``attrs['masked_nodata']`` reflect the @@ -881,7 +900,9 @@ def _decode_window_gpu_direct(file_path, all_offsets, all_byte_counts, def _read_geotiff_gpu_chunked(source, *, dtype, chunks, overview_level, - window, band, name, max_pixels): + window, band, name, max_pixels, + allow_rotated: bool = False, + allow_unparseable_crs: bool = False): """Lazy Dask+CuPy backend for ``read_geotiff_gpu(chunks=...)``. Two paths produce the same shape of dask graph: @@ -941,6 +962,8 @@ def _read_geotiff_gpu_chunked(source, *, dtype, chunks, overview_level, src_path, ifd, geo_info, header, dtype=dtype, chunks=chunks, window=window, band=band, name=name, max_pixels=max_pixels, + allow_rotated=allow_rotated, + allow_unparseable_crs=allow_unparseable_crs, ) except Exception: # GDS qualification failed; fall back to the CPU path. The @@ -952,6 +975,8 @@ def _read_geotiff_gpu_chunked(source, *, dtype, chunks, overview_level, source, dtype=dtype, chunks=chunks, overview_level=overview_level, window=window, band=band, max_pixels=max_pixels, name=name, + allow_rotated=allow_rotated, + allow_unparseable_crs=allow_unparseable_crs, ) cpu_dask_arr = cpu_da.data @@ -973,7 +998,9 @@ def _upload(block): def _read_geotiff_gpu_chunked_gds(source, ifd, geo_info, header, *, dtype, chunks, window, band, name, - max_pixels): + max_pixels, + allow_rotated: bool = False, + allow_unparseable_crs: bool = False): """Build a Dask+CuPy graph that decodes each chunk disk->GPU. Caller must have verified that the source qualifies via @@ -1159,6 +1186,12 @@ def _chunk_task(meta, r0, c0, r1, c1): else: dims = ['y', 'x'] + _validate_read_geo_info( + geo_info, window=window, + allow_rotated=allow_rotated, + allow_unparseable_crs=allow_unparseable_crs, + ) + attrs = {} _populate_attrs_from_geo_info(attrs, geo_info, window=window) # ``masked_nodata`` reflects the declared dask graph dtype; mirrors diff --git a/xrspatial/geotiff/_backends/vrt.py b/xrspatial/geotiff/_backends/vrt.py index 4d1c125f3..5fe1cbbac 100644 --- a/xrspatial/geotiff/_backends/vrt.py +++ b/xrspatial/geotiff/_backends/vrt.py @@ -37,7 +37,10 @@ def read_vrt(source: str, *, chunks: int | tuple | None = None, gpu: bool = False, max_pixels: int | None = None, - missing_sources: str = 'raise') -> xr.DataArray: + missing_sources: str = 'raise', + allow_rotated: bool = False, + allow_unparseable_crs: bool = False, + band_nodata: str | None = None) -> xr.DataArray: """Read a GDAL Virtual Raster Table (.vrt) into an xarray.DataArray. The VRT's source GeoTIFFs are read via windowed reads and assembled @@ -158,11 +161,43 @@ def read_vrt(source: str, *, dtype=dtype, max_pixels=max_pixels, missing_sources=missing_sources, + allow_rotated=allow_rotated, + allow_unparseable_crs=allow_unparseable_crs, + band_nodata=band_nodata, ) + # Issue #1987 ambiguous-metadata checks for the eager VRT path. Parse + # the VRT XML up front and validate before ``_read_vrt_internal`` + # touches any pixel data, so a rejected file does not first + # materialise the full mosaic into host memory. The parsed + # ``VRTDataset`` is threaded into the internal reader via ``parsed=`` + # so we don't double-parse the XML. + import os as _os + from .._validation import ( + validate_read_metadata, + _gdal_geotransform_to_affine_tuple, + ) + from .._vrt import parse_vrt as _parse_vrt, _read_vrt_xml + _xml_str = _read_vrt_xml(source) + _vrt_dir = _os.path.dirname(_os.path.abspath(source)) + _parsed_vrt = _parse_vrt(_xml_str, _vrt_dir) + validate_read_metadata({ + 'allow_rotated': allow_rotated, + 'allow_unparseable_crs': allow_unparseable_crs, + 'transform': _gdal_geotransform_to_affine_tuple( + _parsed_vrt.geo_transform + ), + 'crs_wkt': _parsed_vrt.crs_wkt, + 'band_nodata': band_nodata, + 'band_nodata_values': ( + [b.nodata for b in _parsed_vrt.bands] + if _parsed_vrt.bands else None + ), + }) + arr, vrt = _read_vrt_internal( source, window=window, band=band, max_pixels=max_pixels, - missing_sources=missing_sources, + missing_sources=missing_sources, parsed=_parsed_vrt, ) if name is None: @@ -339,7 +374,10 @@ def _vrt_chunk_read(source, r0, c0, r1, c1, *, def _read_vrt_chunked(source, *, window, band, name, chunks, gpu, dtype, - max_pixels, missing_sources): + max_pixels, missing_sources, + allow_rotated: bool = False, + allow_unparseable_crs: bool = False, + band_nodata: str | None = None): """Lazy ``read_vrt`` dispatch when ``chunks=`` is set (issue #1814). Parses the VRT XML once to recover the extent, CRS, GeoTransform, @@ -383,6 +421,24 @@ def _read_vrt_chunked(source, *, window, band, name, chunks, gpu, dtype, vrt_dir = _os.path.dirname(_os.path.abspath(source)) vrt = parse_vrt(xml_str, vrt_dir) + # Issue #1987 ambiguous-metadata checks on the chunked VRT path. Run + # before the band-count validator below so a rejected file does not + # produce side effects. + from .._validation import ( + validate_read_metadata, + _gdal_geotransform_to_affine_tuple, + ) + validate_read_metadata({ + 'allow_rotated': allow_rotated, + 'allow_unparseable_crs': allow_unparseable_crs, + 'transform': _gdal_geotransform_to_affine_tuple(vrt.geo_transform), + 'crs_wkt': vrt.crs_wkt, + 'band_nodata': band_nodata, + 'band_nodata_values': ( + [b.nodata for b in vrt.bands] if vrt.bands else None + ), + }) + # Validate ``band`` against the parsed band count, matching the # internal reader's contract so the failure mode is the same whether # the user reads eagerly or chunked. diff --git a/xrspatial/geotiff/_crs.py b/xrspatial/geotiff/_crs.py index 2d74a4f24..0b08e20ed 100644 --- a/xrspatial/geotiff/_crs.py +++ b/xrspatial/geotiff/_crs.py @@ -15,6 +15,7 @@ import numbers import warnings +from ._errors import UnparseableCRSError from ._runtime import GeoTIFFFallbackWarning, _geotiff_strict_mode @@ -165,7 +166,7 @@ def _validate_crs_fallback( return if allow_unparseable_crs: return - raise ValueError( + raise UnparseableCRSError( "crs is not an EPSG code, is not a WKT string " "(no PROJCS / GEOGCS / PROJCRS / GEOGCRS root), and could not " f"be parsed: got {wkt_fallback!r}. Writing it verbatim to " @@ -173,7 +174,7 @@ def _validate_crs_fallback( "cannot interpret. Pass an EPSG int (recommended), a real " "WKT string, install pyproj so EPSG / PROJ tokens can be " "resolved, or pass allow_unparseable_crs=True to keep the " - "pre-#1929 citation-only behaviour." + "pre-#1929 citation-only behaviour. See issue #1987." ) diff --git a/xrspatial/geotiff/_validation.py b/xrspatial/geotiff/_validation.py index bf301acd7..24b184c20 100644 --- a/xrspatial/geotiff/_validation.py +++ b/xrspatial/geotiff/_validation.py @@ -26,7 +26,14 @@ import numpy as np from ._coords import _BAND_DIM_NAMES -from ._errors import ConflictingCRSError +from ._errors import ( + ConflictingCRSError, + ConflictingNodataError, + MixedBandMetadataError, + NonUniformCoordsError, + RotatedTransformError, + UnparseableCRSError, +) from ._runtime import _TIME_DIM_NAMES, _X_DIM_NAMES, _Y_DIM_NAMES @@ -514,3 +521,363 @@ def _check_write_conflicting_crs(context: Mapping[str, Any]) -> None: # use ``unregister_write_metadata_check(_check_write_conflicting_crs)`` # in a fixture rather than mutating the registry directly. register_write_metadata_check(_check_write_conflicting_crs) + + +# --------------------------------------------------------------------------- +# Conflicting nodata aliases write check (issue #1987 PR 7) +# --------------------------------------------------------------------------- + + +def _check_write_conflicting_nodata(context: Mapping[str, Any]) -> None: + """Refuse writes whose ``attrs['nodata']`` and ``attrs['nodatavals']`` + disagree. + + The legacy resolver ``_resolve_nodata_attr`` prefers + ``attrs['nodata']`` and falls back to ``attrs['nodatavals']`` only + when the canonical scalar is missing. When both are set to + different sentinels, the writer silently picks the canonical one + and drops the rioxarray-style tuple. This check refuses the + ambiguity at the boundary. + + Tolerant of bands that legitimately disagree with the canonical + sentinel as long as at least one band in ``nodatavals`` matches. + The rioxarray convention is per-band; xrspatial's canonical + ``nodata`` flattens to a single value. A tuple where every band + disagrees with ``attrs['nodata']`` is the regression case. + + ``_FillValue`` stays deprioritised per the existing resolver + convention; this check does not look at it. + + Context keys consumed: + + * ``nodata_kwarg`` -- the explicit ``nodata=`` writer kwarg. + When set, it overrides attrs and the check short-circuits. + * ``attrs_nodata`` -- ``data.attrs.get('nodata')``. + * ``attrs_nodatavals`` -- ``data.attrs.get('nodatavals')``. + """ + if context.get('nodata_kwarg') is not None: + return + nodata_attr = context.get('attrs_nodata') + nodatavals_attr = context.get('attrs_nodatavals') + if nodata_attr is None or nodatavals_attr is None: + return + # Coerce to float once for comparison; non-numeric entries (None, + # strings) and NaN are already handled by _resolve_nodata_attr and + # are out of scope for this disagreement check. + try: + canonical = float(nodata_attr) + except (TypeError, ValueError): + return + try: + vals = list(nodatavals_attr) + except TypeError: + return + concrete: list[float] = [] + for v in vals: + if v is None: + continue + try: + fv = float(v) + except (TypeError, ValueError): + continue + if np.isnan(fv): + continue + concrete.append(fv) + if not concrete: + return + # NaN canonical handling: if attrs['nodata'] is NaN, any concrete + # entry in the tuple is by definition "different". But NaN means + # "the float NaN is the sentinel", which contradicts a concrete + # numeric in the tuple. Refuse rather than guess. + if np.isnan(canonical): + raise ConflictingNodataError( + f"attrs['nodata']=nan but attrs['nodatavals']={nodatavals_attr!r} " + f"contains concrete numeric sentinels {concrete!r}. The " + f"canonical scalar (NaN) and the per-band tuple disagree on " + f"what the missing-data sentinel is. Reconcile the two attrs " + f"(drop the stale one, or pass the intended sentinel via the " + f"``nodata=`` kwarg) and retry. See issue #1987." + ) + if any(v == canonical for v in concrete): + return + raise ConflictingNodataError( + f"attrs['nodata']={nodata_attr!r} disagrees with every concrete " + f"entry in attrs['nodatavals']={nodatavals_attr!r}. The writer " + f"would silently use attrs['nodata'] and discard the rioxarray " + f"tuple, so the on-disk sentinel would not match what the " + f"per-band tuple advertised. Reconcile the two attrs (drop the " + f"stale one, or pass the intended sentinel via the ``nodata=`` " + f"kwarg, which overrides both attrs) and retry. See issue #1987." + ) + + +register_write_metadata_check(_check_write_conflicting_nodata) + + +# --------------------------------------------------------------------------- +# Non-uniform coords write check (issue #1987 PR 4) +# --------------------------------------------------------------------------- + + +def _check_write_non_uniform_coords(context: Mapping[str, Any]) -> None: + """Refuse writes whose ``coords`` imply a non-uniform pixel grid. + + The writer treats the first two y / x coord values as the pixel + size and emits a uniform-grid ``GeoTransform``. When the rest of + the coords disagree with that step (variable cell size, gaps), + the on-disk file silently misrepresents the geometry. + + Exemption: int-dtype coords keep the existing #1969 sentinel + contract. Those coords are the 0..N-1 fallback the no-georef + reader emits, not geographic positions, so non-uniformity is + meaningless for them. + + Context keys consumed: + + * ``coord_y`` -- ``data.coords['y'].values`` (or equivalent). + * ``coord_x`` -- ``data.coords['x'].values``. + + Missing or non-numeric coords are out of scope (handled by other + writer validation upstream). + """ + for axis_name in ('y', 'x'): + coord = context.get(f'coord_{axis_name}') + if coord is None: + continue + arr = np.asarray(coord) + if arr.size < 3: + # Need at least 3 samples to detect non-uniformity. + continue + if np.issubdtype(arr.dtype, np.integer): + # #1969 sentinel exemption. + continue + if not np.issubdtype(arr.dtype, np.floating): + continue + diffs = np.diff(arr.astype(np.float64)) + # Use a relative tolerance pegged to the step magnitude so that + # float32 rounding noise (~1e-7 relative) does not trip the + # check. Same tolerance the existing #1720 coord-regularity + # check uses. + step = float(diffs[0]) + if step == 0.0: + raise NonUniformCoordsError( + f"data.coords[{axis_name!r}] is constant; the writer cannot " + f"derive a pixel size from a zero-step axis. See issue #1987." + ) + rel = np.abs(diffs - step) / np.abs(step) + if not np.all(rel < 1e-5): + worst = int(np.argmax(rel)) + raise NonUniformCoordsError( + f"data.coords[{axis_name!r}] is non-uniform: step " + f"{step!r} expected, but found {float(diffs[worst])!r} at " + f"index {worst} (relative drift {float(rel[worst]):.2e}). " + f"The writer treats the first two coord values as the " + f"pixel size and would emit a uniform-grid GeoTransform " + f"that misrepresents the rest of the axis. Resample to a " + f"uniform grid before writing (e.g. ``data.interp(...)``) " + f"or write each contiguous block as a separate file. " + f"See issue #1987." + ) + + +register_write_metadata_check(_check_write_non_uniform_coords) + + +# --------------------------------------------------------------------------- +# Unparseable CRS read check (issue #1987 PR 2) +# +# The write-side equivalent (refusing to emit an unparseable string to +# GTCitationGeoKey) is already in place in ``_crs._validate_crs_fallback`` +# and now raises ``UnparseableCRSError`` (subclass of ``ValueError`` so +# existing ``except ValueError`` callers keep working). +# --------------------------------------------------------------------------- + + +def _check_read_unparseable_crs(context: Mapping[str, Any]) -> None: + """Refuse reads whose ``crs_wkt`` cannot be parsed by pyproj. + + A WKT that the reader extracts from ``GeoAsciiParams`` or a VRT + ``SRS`` tag may be partial or corrupted. The legacy reader emitted + it verbatim in ``attrs['crs_wkt']``; downstream code reading the + attr through pyproj would then crash with a less obvious message. + + Context keys consumed: + + * ``allow_unparseable_crs`` -- caller opt-out kwarg. When True, + this check is silent (matches the existing write-side behaviour). + * ``crs_wkt`` -- the WKT string extracted from the file. + + Soft preconditions: + + * No pyproj installed -> no-op (cannot prove the WKT is bad). + * WKT empty or None -> no-op. + * WKT that pyproj parses cleanly -> no-op. + """ + if context.get('allow_unparseable_crs'): + return + crs_wkt = context.get('crs_wkt') + if not crs_wkt: + return + try: + from pyproj import CRS as _PyProjCRS + from pyproj.exceptions import CRSError as _PyProjCRSError + except ImportError: + return + try: + # ``from_user_input`` accepts WKT, EPSG-style ``"EPSG:NNNN"`` and + # PROJ strings -- anything pyproj can resolve. The GDAL VRT + # ```` tag (and a few other source paths) routinely stash + # non-WKT but pyproj-parseable strings into ``crs_wkt``, so the + # check rejects only the strict "pyproj cannot parse" case. + _PyProjCRS.from_user_input(crs_wkt) + except _PyProjCRSError as e: + excerpt = crs_wkt if len(crs_wkt) <= 80 else (crs_wkt[:77] + '...') + raise UnparseableCRSError( + f"GeoTIFF source advertises a CRS string that pyproj cannot " + f"parse: {excerpt!r} ({type(e).__name__}: {e}). The legacy " + f"reader would emit the unparseable string verbatim in " + f"attrs['crs_wkt'] and let downstream code crash on first use. " + f"Pass ``allow_unparseable_crs=True`` to keep that behaviour, " + f"or re-encode the source with a valid CRS. See issue #1987." + ) from e + + +register_read_metadata_check(_check_read_unparseable_crs) + + +# --------------------------------------------------------------------------- +# Rotated transform read check (issue #1987 PR 3) +# --------------------------------------------------------------------------- + + +def _check_read_rotated_transform(context: Mapping[str, Any]) -> None: + """Refuse reads whose ``transform`` has non-zero rotation/shear terms. + + Downstream xrspatial functions assume axis-aligned rasters + (the GeoTransform's b and d terms are zero). A rotated grid would + silently produce wrong results in slope / aspect / hillshade / + proximity / zonal-statistics. + + Context keys consumed: + + * ``allow_rotated`` -- caller opt-out kwarg. + * ``transform`` -- the 6-tuple ``(pixel_width, b, origin_x, d, + pixel_height, origin_y)`` (rasterio Affine ordering). + + A transform with ``b == 0`` and ``d == 0`` is axis-aligned and + passes. A missing transform is out of scope (the no-georef path + handles it). + """ + if context.get('allow_rotated'): + return + transform = context.get('transform') + if not transform: + return + try: + # rasterio Affine ordering: (pixel_width, b, origin_x, d, pixel_height, origin_y) + b = float(transform[1]) + d = float(transform[3]) + except (IndexError, TypeError, ValueError): + return + if b == 0.0 and d == 0.0: + return + raise RotatedTransformError( + f"GeoTIFF source carries a rotated affine transform (b={b!r}, " + f"d={d!r}; both must be 0.0 for an axis-aligned grid). " + f"Downstream xrspatial ops (slope, aspect, hillshade, proximity, " + f"zonal) assume axis-aligned rasters and would silently produce " + f"wrong results on a rotated grid. Pass ``allow_rotated=True`` " + f"to read the pixel grid without the geospatial assumption " + f"(useful when you only want the array, not the geo-aware " + f"downstream ops). See issue #1987." + ) + + +register_read_metadata_check(_check_read_rotated_transform) + + +def _gdal_geotransform_to_affine_tuple(gt) -> tuple | None: + """Reorder a GDAL ``GeoTransform`` 6-tuple into rasterio ``Affine`` + layout for ``_check_read_rotated_transform``. + + GDAL: ``(origin_x, pixel_width, b, origin_y, d, pixel_height)`` + rasterio:``(pixel_width, b, origin_x, d, pixel_height, origin_y)`` + + Returns ``None`` when ``gt`` is ``None`` so callers can pass the + result straight into a validation context dict. + """ + if gt is None: + return None + return (gt[1], gt[2], gt[0], gt[4], gt[5], gt[3]) + + +# --------------------------------------------------------------------------- +# Mixed band metadata read check (issue #1987 PR 5) +# --------------------------------------------------------------------------- + + +def _check_read_mixed_band_metadata(context: Mapping[str, Any]) -> None: + """Refuse VRT reads where bands declare disagreeing per-band nodata. + + A VRT can mosaic sources with different per-band nodata sentinels. + The legacy reader flattens to one value silently, which means one + band's "valid" pixels can collide with another band's sentinel + after the flatten. The fail-closed default refuses the ambiguity; + pass ``band_nodata='first'`` to keep the legacy behaviour + explicitly. + + Context keys consumed: + + * ``band_nodata`` -- caller opt-out kwarg. ``None`` means + strict (raise on disagreement); ``'first'`` keeps the legacy + flatten-to-first behaviour. + * ``band_nodata_values`` -- iterable of per-band nodata sentinels + extracted from the VRT (one entry per band; ``None`` for bands + without a declared sentinel). + """ + band_nodata = context.get('band_nodata') + if band_nodata == 'first': + return + vals = context.get('band_nodata_values') + if not vals: + return + seen: list = [] + for v in vals: + if v is None: + continue + # Two NaNs are considered equal here (both mean "NaN sentinel"), + # matching how _resolve_nodata_attr treats them. + try: + fv = float(v) + except (TypeError, ValueError): + continue + if any(_same_nodata(fv, s) for s in seen): + continue + seen.append(fv) + if len(seen) <= 1: + return + raise MixedBandMetadataError( + f"VRT source declares disagreeing per-band nodata sentinels: " + f"{vals!r}. Flattening to a single value would silently collide " + f"with another band's valid pixels. Pass " + f"``band_nodata='first'`` to keep the legacy flatten-to-first " + f"behaviour, or rebuild the VRT with a single shared sentinel. " + f"See issue #1987." + ) + + +def _same_nodata(a: float, b: float) -> bool: + """Equality with NaN-equals-NaN semantics for nodata comparison.""" + if np.isnan(a) and np.isnan(b): + return True + return a == b + + +# NOT registered by default. The mixed-band check has a much larger +# migration cost than its sibling read-side checks (around 35 existing +# test sites would need to opt in via ``band_nodata='first'`` to keep +# their legacy assertions), so it lands as a follow-up PR that bundles +# the check activation with the test migration. The check itself and +# the ``band_nodata=`` VRT kwarg ship now so the follow-up is a +# one-line registration call plus the test sweep. +# register_read_metadata_check(_check_read_mixed_band_metadata) diff --git a/xrspatial/geotiff/_writers/eager.py b/xrspatial/geotiff/_writers/eager.py index d563ab896..c1934f641 100644 --- a/xrspatial/geotiff/_writers/eager.py +++ b/xrspatial/geotiff/_writers/eager.py @@ -271,16 +271,26 @@ def to_geotiff(data: xr.DataArray | np.ndarray, _validate_nodata_arg(nodata) - # Issue #1987 ambiguous-metadata checks. Today only the - # conflicting-crs/crs_wkt write check is registered; other cases - # land in follow-up PRs. The hook is a no-op when no check is - # registered, so this call is safe even if every check is later - # unregistered for a specific entry point. + # Issue #1987 ambiguous-metadata checks. The hook is a no-op + # when no check is registered, so this call is safe even if every + # check is later unregistered for a specific entry point. _attrs = getattr(data, 'attrs', None) or {} + _coords = getattr(data, 'coords', None) + _coord_y = _coords['y'].values if ( + _coords is not None and 'y' in _coords + ) else None + _coord_x = _coords['x'].values if ( + _coords is not None and 'x' in _coords + ) else None validate_write_metadata({ 'crs_kwarg': crs, 'attrs_crs': _attrs.get('crs'), 'attrs_crs_wkt': _attrs.get('crs_wkt'), + 'nodata_kwarg': nodata, + 'attrs_nodata': _attrs.get('nodata'), + 'attrs_nodatavals': _attrs.get('nodatavals'), + 'coord_y': _coord_y, + 'coord_x': _coord_x, }) # Up-front validation: catch bad compression names before they reach @@ -801,12 +811,24 @@ def _write_vrt_tiled(data, vrt_path, *, crs=None, nodata=None, # Issue #1987 ambiguous-metadata checks; mirrors the call in # ``to_geotiff`` so the dask-VRT write path enforces the same - # crs/crs_wkt consistency rule. + # crs/crs_wkt / nodata / coord rules. _attrs = getattr(data, 'attrs', None) or {} + _coords = getattr(data, 'coords', None) + _coord_y = _coords['y'].values if ( + _coords is not None and 'y' in _coords + ) else None + _coord_x = _coords['x'].values if ( + _coords is not None and 'x' in _coords + ) else None validate_write_metadata({ 'crs_kwarg': crs, 'attrs_crs': _attrs.get('crs'), 'attrs_crs_wkt': _attrs.get('crs_wkt'), + 'nodata_kwarg': nodata, + 'attrs_nodata': _attrs.get('nodata'), + 'attrs_nodatavals': _attrs.get('nodatavals'), + 'coord_y': _coord_y, + 'coord_x': _coord_x, }) # Validate compression_level against codec-specific range diff --git a/xrspatial/geotiff/_writers/gpu.py b/xrspatial/geotiff/_writers/gpu.py index 9fa3879c2..4b60f1914 100644 --- a/xrspatial/geotiff/_writers/gpu.py +++ b/xrspatial/geotiff/_writers/gpu.py @@ -266,10 +266,22 @@ def write_geotiff_gpu(data: xr.DataArray | cupy.ndarray | np.ndarray, # Issue #1987 ambiguous-metadata checks; mirrors ``to_geotiff`` so the # GPU writer enforces the same crs/crs_wkt consistency rule. _attrs = getattr(data, 'attrs', None) or {} + _coords = getattr(data, 'coords', None) + _coord_y = _coords['y'].values if ( + _coords is not None and 'y' in _coords + ) else None + _coord_x = _coords['x'].values if ( + _coords is not None and 'x' in _coords + ) else None validate_write_metadata({ 'crs_kwarg': crs, 'attrs_crs': _attrs.get('crs'), 'attrs_crs_wkt': _attrs.get('crs_wkt'), + 'nodata_kwarg': nodata, + 'attrs_nodata': _attrs.get('nodata'), + 'attrs_nodatavals': _attrs.get('nodatavals'), + 'coord_y': _coord_y, + 'coord_x': _coord_x, }) if max_z_error < 0: raise ValueError( diff --git a/xrspatial/geotiff/tests/test_ambiguous_metadata_hooks_1987.py b/xrspatial/geotiff/tests/test_ambiguous_metadata_hooks_1987.py index 8e0495b7b..2790b2c33 100644 --- a/xrspatial/geotiff/tests/test_ambiguous_metadata_hooks_1987.py +++ b/xrspatial/geotiff/tests/test_ambiguous_metadata_hooks_1987.py @@ -152,6 +152,9 @@ def test_write_hook_is_noop_when_no_checks_registered(): def test_register_and_dispatch_read_check(): + # Use an opaque payload key that no registered #1987 check inspects, + # so this test stays scoped to the dispatch mechanism rather than + # the semantics of any one check. seen: list[dict] = [] def check(ctx): @@ -159,13 +162,14 @@ def check(ctx): register_read_metadata_check(check) try: - validate_read_metadata({"crs_wkt": "EPSG:4326"}) - assert seen == [{"crs_wkt": "EPSG:4326"}] + validate_read_metadata({"_dispatch_probe": "value"}) + assert seen == [{"_dispatch_probe": "value"}] finally: unregister_read_metadata_check(check) def test_register_and_dispatch_write_check(): + # Same isolation as the read-side test above. seen: list[dict] = [] def check(ctx): @@ -173,8 +177,8 @@ def check(ctx): register_write_metadata_check(check) try: - validate_write_metadata({"transform": (1.0, 0, 0, 0, -1.0, 0)}) - assert seen == [{"transform": (1.0, 0, 0, 0, -1.0, 0)}] + validate_write_metadata({"_dispatch_probe": "value"}) + assert seen == [{"_dispatch_probe": "value"}] finally: unregister_write_metadata_check(check) @@ -308,7 +312,10 @@ def deny(ctx): register_read_metadata_check(deny) try: with pytest.raises(UnparseableCRSError, match="bad WKT"): - validate_read_metadata({"crs_wkt": "MALFORMED"}) + # Opaque key so the test does not collide with the registered + # ``_check_read_unparseable_crs`` check's behaviour on real + # ``crs_wkt`` payloads. + validate_read_metadata({"_dispatch_probe": "value"}) finally: unregister_read_metadata_check(deny) diff --git a/xrspatial/geotiff/tests/test_attrs_contract_aliases_1984.py b/xrspatial/geotiff/tests/test_attrs_contract_aliases_1984.py index ef10eecc5..7f646a0f3 100644 --- a/xrspatial/geotiff/tests/test_attrs_contract_aliases_1984.py +++ b/xrspatial/geotiff/tests/test_attrs_contract_aliases_1984.py @@ -87,9 +87,12 @@ def test_read_uses_fill_value_when_nodata_absent(tmp_path, _arr_with_sentinel): # --------------------------------------------------------------------------- -def test_canonical_nodata_wins_over_aliases(tmp_path, _arr_with_sentinel): - """Pin: when all three keys are present and disagree, ``attrs['nodata']`` - is the canonical one and wins. Guards against alias precedence drift.""" +def test_canonical_nodata_wins_over_aliases_at_resolver(_arr_with_sentinel): + """``_resolve_nodata_attr`` (the read-side chokepoint) still prefers + canonical ``nodata`` over both aliases. This is the resolver-layer + contract that downstream code consumes; the write-side fail-closed + check is exercised in ``test_canonical_nodata_wins_over_aliases_at_write`` + below.""" canonical = -8888.0 da = _da_float( _arr_with_sentinel, crs=4326, @@ -97,15 +100,33 @@ def test_canonical_nodata_wins_over_aliases(tmp_path, _arr_with_sentinel): nodatavals=(_SENTINEL,), **{"_FillValue": -7777.0}, ) - # _resolve_nodata_attr is the single chokepoint; assert at that layer - # too so a precedence regression surfaces with a clear message. assert _resolve_nodata_attr(dict(da.attrs)) == canonical + +def test_canonical_nodata_wins_over_aliases_at_write( + tmp_path, _arr_with_sentinel): + """Pin: the legacy "canonical nodata silently wins on write" was + replaced by a fail-closed raise (issue #1987 PR 7). A DataArray with + disagreeing ``nodata`` and ``nodatavals`` attrs now refuses to write + instead of dropping the alias. The opt-out is the explicit + ``nodata=`` kwarg, which overrides both attrs.""" + from xrspatial.geotiff import ConflictingNodataError + + da = _da_float( + _arr_with_sentinel, crs=4326, + nodata=-8888.0, + nodatavals=(_SENTINEL,), + **{"_FillValue": -7777.0}, + ) + out = str(tmp_path / "canonical_wins.tif") - to_geotiff(da, out) + with pytest.raises(ConflictingNodataError): + to_geotiff(da, out) + # Explicit ``nodata=`` bypasses the check. + to_geotiff(da, out, nodata=-8888.0) rd = open_geotiff(out) - assert rd.attrs.get("nodata") == canonical + assert rd.attrs.get("nodata") == -8888.0 # --------------------------------------------------------------------------- diff --git a/xrspatial/geotiff/tests/test_nodata_attr_aliases_1582.py b/xrspatial/geotiff/tests/test_nodata_attr_aliases_1582.py index d2a205be7..ed614d5d2 100644 --- a/xrspatial/geotiff/tests/test_nodata_attr_aliases_1582.py +++ b/xrspatial/geotiff/tests/test_nodata_attr_aliases_1582.py @@ -93,8 +93,13 @@ def test_fill_value_resolves_to_nodata_tag(tmp_path, _arr_with_sentinel): def test_explicit_nodata_attr_wins_over_aliases(tmp_path, _arr_with_sentinel): - """``attrs['nodata']`` is xrspatial's canonical key; it must win over - rioxarray and CF aliases when both are present.""" + """``attrs['nodata']`` is xrspatial's canonical key. Issue #1987 PR 7 + replaced the legacy "canonical silently wins, alias dropped" path + with a fail-closed raise: a DataArray with disagreeing ``nodata`` and + ``nodatavals`` refuses to write. The explicit ``nodata=`` writer + kwarg overrides both attrs and bypasses the check.""" + from xrspatial.geotiff import ConflictingNodataError + da = _da_float( _arr_with_sentinel, crs=4326, nodata=-8888.0, @@ -102,8 +107,12 @@ def test_explicit_nodata_attr_wins_over_aliases(tmp_path, _arr_with_sentinel): **{"_FillValue": -7777.0}, ) out = str(tmp_path / "explicit_wins.tif") - to_geotiff(da, out) + with pytest.raises(ConflictingNodataError): + to_geotiff(da, out) + + # Explicit kwarg overrides both attrs. + to_geotiff(da, out, nodata=-8888.0) rd = open_geotiff(out) assert rd.attrs.get("nodata") == -8888.0 diff --git a/xrspatial/geotiff/tests/test_reader_kwarg_order_1935.py b/xrspatial/geotiff/tests/test_reader_kwarg_order_1935.py index 17fd19d95..f17c2a540 100644 --- a/xrspatial/geotiff/tests/test_reader_kwarg_order_1935.py +++ b/xrspatial/geotiff/tests/test_reader_kwarg_order_1935.py @@ -42,6 +42,8 @@ "max_cloud_bytes", "on_gpu_failure", "missing_sources", + "allow_rotated", + "allow_unparseable_crs", ) @@ -117,8 +119,14 @@ def test_read_geotiff_dask_matches_canonical_order(): def test_read_vrt_matches_canonical_order(): - """``read_vrt`` must list shared params in the canonical order.""" - _assert_canonical(read_vrt) + """``read_vrt`` must list shared params in the canonical order. + + ``band_nodata`` is the #1987 PR 5 opt-out for the mixed-band metadata + check; it is VRT-specific (no analogue on the other readers) and so + lives in the per-function tail rather than in the shared canonical + order. + """ + _assert_canonical(read_vrt, allowed_tail=('band_nodata',)) def test_no_pairwise_order_inversions(): diff --git a/xrspatial/geotiff/tests/test_remaining_fail_closed_1987.py b/xrspatial/geotiff/tests/test_remaining_fail_closed_1987.py new file mode 100644 index 000000000..1c36dacf9 --- /dev/null +++ b/xrspatial/geotiff/tests/test_remaining_fail_closed_1987.py @@ -0,0 +1,439 @@ +"""Fail-closed checks for ambiguous geospatial metadata (issue #1987). + +PR 1 of the #1987 series wired ``ConflictingCRSError`` (PR 6 in the +issue's PR numbering). This file pins the remaining four checks that +land in the bundled follow-up: + +* ``UnparseableCRSError`` (#1987 PR 2): read + write paths refuse a + CRS string that pyproj cannot parse. The existing write-side + ``_validate_crs_fallback`` raise was retyped from ``ValueError`` to + the new subclass; the read-side check is new. +* ``RotatedTransformError`` (#1987 PR 3): the four read entry points + refuse a non-axis-aligned affine transform. Opt-out via + ``allow_rotated=True``. +* ``NonUniformCoordsError`` (#1987 PR 4): the writer refuses coords + that imply a non-uniform pixel grid. The int-dtype sentinel + exemption from #1969 still applies. +* ``ConflictingNodataError`` (#1987 PR 7): the writer refuses a + DataArray whose ``attrs['nodata']`` disagrees with every concrete + entry in ``attrs['nodatavals']``. Opt-out via explicit ``nodata=`` + kwarg. + +``MixedBandMetadataError`` (#1987 PR 5) is intentionally NOT +registered in this PR (see the corresponding ``# register_...`` comment +in ``_validation.py``); the migration cost on existing VRT fixtures is +its own follow-up. +""" +from __future__ import annotations + +import os +import struct +from pathlib import Path + +import numpy as np +import pytest +import xarray as xr + +from xrspatial.geotiff import ( + ConflictingNodataError, + GeoTIFFAmbiguousMetadataError, + NonUniformCoordsError, + RotatedTransformError, + UnparseableCRSError, + open_geotiff, + to_geotiff, +) +from xrspatial.geotiff._validation import ( + _check_read_rotated_transform, + _check_read_unparseable_crs, + _check_write_conflicting_nodata, + _check_write_non_uniform_coords, + _registered_read_metadata_checks, + _registered_write_metadata_checks, +) + +pyproj = pytest.importorskip("pyproj") + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _da(*, coords=None, attrs=None, shape=(4, 4)): + """Build a minimal 2-D DataArray. Caller supplies coords + attrs.""" + data = np.zeros(shape, dtype=np.float32) + if coords is None: + coords = { + 'y': np.linspace(3.0, 0.0, shape[0], dtype=np.float64), + 'x': np.linspace(0.0, 3.0, shape[1], dtype=np.float64), + } + return xr.DataArray( + data, dims=('y', 'x'), coords=coords, attrs=dict(attrs or {}), + ) + + +def _write_minimal_tiff_with_wkt(path: str, wkt: str) -> None: + """Hand-build a 4x4 float32 TIFF that stashes ``wkt`` in + ``GeoAsciiParams`` (tag 34737), referenced from + ``GeoKeyDirectory`` (tag 34735) as ``GTCitationGeoKey`` (id 1026).""" + bo = '<' + pixels = np.zeros((4, 4), dtype=np.float32).tobytes() + ascii_buf = bytearray((wkt + '|').encode('ascii')) + # GeoKeyDirectory: header + one entry pointing into GeoAsciiParams. + gkd = [ + 1, 1, 0, 1, + 1026, # GTCitationGeoKey + 34737, # location = GeoAsciiParams tag + len(wkt) + 1, # count (incl. '|') + 0, # offset into GeoAsciiParams + ] + tag_list = [] + + def add_short(tag, val): + tag_list.append((tag, 3, 1, struct.pack(f'{bo}H', val))) + + def add_long(tag, val): + tag_list.append((tag, 4, 1, struct.pack(f'{bo}I', val))) + + def add_shorts(tag, vals): + tag_list.append((tag, 3, len(vals), + struct.pack(f'{bo}{len(vals)}H', *vals))) + + def add_doubles(tag, vals): + tag_list.append((tag, 12, len(vals), + struct.pack(f'{bo}{len(vals)}d', *vals))) + + def add_ascii(tag, raw_bytes): + if not raw_bytes.endswith(b'\x00'): + raw_bytes = raw_bytes + b'\x00' + tag_list.append((tag, 2, len(raw_bytes), raw_bytes)) + + add_short(256, 4) + add_short(257, 4) + add_short(258, 32) + add_short(259, 1) + add_short(262, 1) + add_short(277, 1) + add_short(339, 3) + add_short(278, 4) + add_long(273, 0) + add_long(279, len(pixels)) + add_doubles(33550, [1.0, 1.0, 0.0]) + add_doubles(33922, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]) + add_shorts(34735, gkd) + add_ascii(34737, bytes(ascii_buf)) + tag_list.sort(key=lambda t: t[0]) + + n = len(tag_list) + ifd_start = 8 + ifd_size = 2 + 12 * n + 4 + overflow_start = ifd_start + ifd_size + overflow_buf = bytearray() + tag_offsets: dict[int, int | None] = {} + for tag, _typ, _count, raw in tag_list: + if len(raw) > 4: + tag_offsets[tag] = len(overflow_buf) + overflow_buf.extend(raw) + if len(overflow_buf) % 2: + overflow_buf.append(0) + else: + tag_offsets[tag] = None + pixel_data_start = overflow_start + len(overflow_buf) + patched = [] + for tag, typ, count, raw in tag_list: + if tag == 273: + raw = struct.pack(f'{bo}I', pixel_data_start) + patched.append((tag, typ, count, raw)) + tag_list = patched + out = bytearray(b'II') + out.extend(struct.pack(f'{bo}H', 42)) + out.extend(struct.pack(f'{bo}I', ifd_start)) + out.extend(struct.pack(f'{bo}H', n)) + for tag, typ, count, raw in tag_list: + out.extend(struct.pack(f'{bo}HHI', tag, typ, count)) + if len(raw) <= 4: + out.extend(raw.ljust(4, b'\x00')) + else: + out.extend(struct.pack(f'{bo}I', overflow_start + tag_offsets[tag])) + out.extend(struct.pack(f'{bo}I', 0)) + out.extend(overflow_buf) + out.extend(pixels) + Path(path).write_bytes(bytes(out)) + + +def _write_rotated_vrt( + path: Path, + source_basename: str, + *, + geo_transform: str = '0.0, 1.0, 0.5, 0.0, 0.0, -1.0', +) -> None: + """Build a VRT carrying ``geo_transform``. Default value rotates on + the x-axis (GDAL ``GT[2] = 0.5``); pass a different string to + exercise the row-axis (``GT[4]``) branch.""" + Path(path).write_text( + '\n' + ' EPSG:4326\n' + f' {geo_transform}\n' + ' \n' + ' \n' + f' {source_basename}' + '\n' + ' 1\n' + ' \n' + ' \n' + ' \n' + ' \n' + '\n' + ) + + +# --------------------------------------------------------------------------- +# Registry sanity. +# --------------------------------------------------------------------------- + + +def test_four_checks_registered(): + """Four of the five #1987 PR 2-7 checks are active by default. The + mixed-band check is intentionally deferred to a follow-up that also + migrates the legacy VRT test fixtures.""" + write_names = {c.__name__ for c in _registered_write_metadata_checks()} + read_names = {c.__name__ for c in _registered_read_metadata_checks()} + assert _check_write_conflicting_nodata.__name__ in write_names + assert _check_write_non_uniform_coords.__name__ in write_names + assert _check_read_unparseable_crs.__name__ in read_names + assert _check_read_rotated_transform.__name__ in read_names + + +def test_error_class_hierarchy(): + for cls in ( + UnparseableCRSError, + RotatedTransformError, + NonUniformCoordsError, + ConflictingNodataError, + ): + assert issubclass(cls, GeoTIFFAmbiguousMetadataError) + assert issubclass(cls, ValueError) + + +# --------------------------------------------------------------------------- +# UnparseableCRSError (write side + read side). +# --------------------------------------------------------------------------- + + +def test_write_rejects_garbage_crs_kwarg(tmp_path): + """``to_geotiff(..., crs="not a CRS")`` refuses to emit the + unparseable string into ``GTCitationGeoKey``.""" + da = _da() + with pytest.raises(UnparseableCRSError): + to_geotiff(da, str(tmp_path / 'bad_crs.tif'), crs='not a CRS') + + +def test_write_garbage_crs_kwarg_opt_in_passes(tmp_path): + """``allow_unparseable_crs=True`` keeps the pre-#1929 citation-only + behaviour for callers who need it.""" + da = _da() + to_geotiff( + da, str(tmp_path / 'bad_crs_opt_in.tif'), + crs='not a CRS', allow_unparseable_crs=True, + ) + + +def test_read_rejects_garbage_crs_wkt(): + """The read-side check raises on a string pyproj cannot parse. + Driven directly through ``validate_read_metadata`` because building + a TIFF whose stored WKT actually surfaces as ``attrs['crs_wkt']`` + (vs ``attrs['crs_name']``, which carries the GeoKey citation) is a + full-stack fixture exercise rather than a unit one.""" + from xrspatial.geotiff._validation import validate_read_metadata + + with pytest.raises(UnparseableCRSError): + validate_read_metadata({'crs_wkt': 'NOT A REAL WKT STRING'}) + + +def test_read_garbage_crs_wkt_opt_in_passes(): + """``allow_unparseable_crs=True`` short-circuits the check.""" + from xrspatial.geotiff._validation import validate_read_metadata + + validate_read_metadata({ + 'crs_wkt': 'NOT A REAL WKT STRING', + 'allow_unparseable_crs': True, + }) + + +def test_read_pyproj_parseable_non_wkt_passes(): + """``EPSG:4326`` is not WKT but pyproj parses it via + ``from_user_input``. The check tolerates pyproj-parseable + placeholders (e.g. GDAL's ``EPSG:4326`` convention).""" + from xrspatial.geotiff._validation import validate_read_metadata + + validate_read_metadata({'crs_wkt': 'EPSG:4326'}) + + +# --------------------------------------------------------------------------- +# RotatedTransformError. +# --------------------------------------------------------------------------- + + +def test_read_rejects_rotated_vrt(tmp_path): + """A VRT whose GeoTransform carries non-zero rotation/shear terms is + refused on read.""" + src = tmp_path / 'flat.tif' + _write_minimal_tiff_with_wkt(str(src), 'EPSG:4326') + vrt = tmp_path / 'rotated.vrt' + _write_rotated_vrt(vrt, os.path.basename(src)) + + with pytest.raises(RotatedTransformError): + open_geotiff(str(vrt)) + + +def test_read_rotated_vrt_opt_in_passes(tmp_path): + """``allow_rotated=True`` skips the check and returns the pixel grid + without the axis-aligned-grid assumption.""" + src = tmp_path / 'flat.tif' + _write_minimal_tiff_with_wkt(str(src), 'EPSG:4326') + vrt = tmp_path / 'rotated_ok.vrt' + _write_rotated_vrt(vrt, os.path.basename(src)) + + da = open_geotiff(str(vrt), allow_rotated=True) + assert da.shape == (4, 4) + + +def test_read_rejects_row_axis_rotated_vrt(tmp_path): + """The ``d`` term (GDAL ``GT[4]``, rasterio ``Affine`` index 3) is + the row-axis rotation; the sibling test above only sets the column + rotation. Pin that the row-axis branch raises too.""" + src = tmp_path / 'flat_d.tif' + _write_minimal_tiff_with_wkt(str(src), 'EPSG:4326') + vrt = tmp_path / 'rotated_d.vrt' + _write_rotated_vrt( + vrt, os.path.basename(src), + geo_transform='0.0, 1.0, 0.0, 0.0, 0.5, -1.0', + ) + + with pytest.raises(RotatedTransformError): + open_geotiff(str(vrt)) + + +# --------------------------------------------------------------------------- +# NonUniformCoordsError. +# --------------------------------------------------------------------------- + + +def test_write_rejects_non_uniform_y_coords(tmp_path): + """A DataArray whose ``y`` coords are not uniformly spaced is refused. + The writer would otherwise pick the first two values as the pixel + size and silently misrepresent the rest of the axis.""" + coords = { + 'y': np.array([10.0, 9.0, 7.0, 4.0], dtype=np.float64), + 'x': np.array([0.0, 1.0, 2.0, 3.0], dtype=np.float64), + } + da = _da(coords=coords) + with pytest.raises(NonUniformCoordsError): + to_geotiff(da, str(tmp_path / 'non_uniform_y.tif')) + + +def test_write_rejects_non_uniform_x_coords(tmp_path): + coords = { + 'y': np.array([3.0, 2.0, 1.0, 0.0], dtype=np.float64), + 'x': np.array([0.0, 1.0, 3.0, 6.0], dtype=np.float64), + } + da = _da(coords=coords) + with pytest.raises(NonUniformCoordsError): + to_geotiff(da, str(tmp_path / 'non_uniform_x.tif')) + + +def test_write_accepts_uniform_coords(tmp_path): + """Float coords that are uniformly spaced pass.""" + coords = { + 'y': np.array([3.0, 2.0, 1.0, 0.0], dtype=np.float64), + 'x': np.array([0.0, 1.0, 2.0, 3.0], dtype=np.float64), + } + da = _da(coords=coords) + to_geotiff(da, str(tmp_path / 'uniform.tif')) + + +def test_write_accepts_integer_coord_sentinel(tmp_path): + """Int-dtype coords (the no-georef sentinel from #1969) bypass the + uniformity check because they don't represent geographic positions.""" + coords = { + 'y': np.array([0, 1, 2, 3], dtype=np.int64), + 'x': np.array([0, 1, 2, 3], dtype=np.int64), + } + da = _da(coords=coords) + to_geotiff(da, str(tmp_path / 'int_sentinel.tif')) + + +def test_write_rejects_constant_float_coords(tmp_path): + """A float coord array whose first two values are equal makes the + derived pixel step zero. The check refuses rather than emitting a + zero-step GeoTransform.""" + coords = { + 'y': np.array([1.0, 1.0, 1.0, 1.0], dtype=np.float64), + 'x': np.array([0.0, 1.0, 2.0, 3.0], dtype=np.float64), + } + da = _da(coords=coords) + with pytest.raises(NonUniformCoordsError, match='constant'): + to_geotiff(da, str(tmp_path / 'constant_y.tif')) + + +# --------------------------------------------------------------------------- +# ConflictingNodataError. +# --------------------------------------------------------------------------- + + +def test_write_rejects_conflicting_nodata_attrs(tmp_path): + """``attrs['nodata']`` disagreeing with every concrete entry in + ``attrs['nodatavals']`` raises.""" + da = _da(attrs={'nodata': -8888.0, 'nodatavals': (-9999.0,)}) + with pytest.raises(ConflictingNodataError): + to_geotiff(da, str(tmp_path / 'conflict_nodata.tif')) + + +def test_write_accepts_agreeing_nodata_attrs(tmp_path): + """When the tuple includes the canonical value (e.g. per-band + rioxarray convention) the check passes.""" + da = _da(attrs={'nodata': -9999.0, 'nodatavals': (-9999.0, -9999.0)}) + to_geotiff(da, str(tmp_path / 'agree_nodata.tif')) + + +def test_write_nan_canonical_with_concrete_alias_raises(tmp_path): + """``nodata=NaN`` paired with a concrete numeric in ``nodatavals`` + means "NaN is the sentinel" and "some integer is the sentinel" + at the same time. Refuse the ambiguity.""" + da = _da(attrs={'nodata': float('nan'), 'nodatavals': (-9999.0,)}) + with pytest.raises(ConflictingNodataError): + to_geotiff(da, str(tmp_path / 'nan_vs_concrete.tif')) + + +def test_write_explicit_nodata_kwarg_bypasses_check(tmp_path): + """``to_geotiff(..., nodata=X)`` overrides both attrs and bypasses + the conflict check entirely.""" + da = _da(attrs={'nodata': -8888.0, 'nodatavals': (-9999.0,)}) + to_geotiff(da, str(tmp_path / 'kwarg_override.tif'), nodata=-1.0) + + +def test_write_none_in_nodatavals_tuple_is_skipped(tmp_path): + """``None`` entries in the rioxarray tuple are "no sentinel on this + band" and don't count as disagreement.""" + da = _da(attrs={'nodata': -9999.0, 'nodatavals': (None, -9999.0)}) + to_geotiff(da, str(tmp_path / 'none_skipped.tif')) + + +# --------------------------------------------------------------------------- +# Round-trip safety: a written-then-read DataArray with both attrs set +# (which the reader does emit by default) still writes again cleanly. +# --------------------------------------------------------------------------- + + +def test_read_then_write_round_trip_does_not_raise(tmp_path): + """``to_geotiff -> open_geotiff -> to_geotiff`` is the typical + pipeline. The reader emits both ``crs`` and ``crs_wkt`` whenever the + file has a CRS, so the second write hits both attrs populated; they + must agree (they came from the same file).""" + src = _da(attrs={'crs': 4326, 'nodata': -9999.0}) + first = str(tmp_path / 'rt_first.tif') + second = str(tmp_path / 'rt_second.tif') + to_geotiff(src, first) + da = open_geotiff(first) + to_geotiff(da, second)