diff --git a/CHANGELOG.md b/CHANGELOG.md index 095d4de52..6fe2c450e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ - Decode TIFF predictor=3 un-transpose by file byte order so big-endian floating-point TIFFs read back exactly - Default internal `_vrt.read_vrt` `missing_sources` to `'raise'` so an unreadable VRT source no longer produces a silent zero-fill hole on integer rasters; pass `missing_sources='warn'` to opt back into the previous lenient behaviour (#1843) - Deprecate read-side emission of matplotlib colormap-derived attrs (cmap, colormap_rgba) on palette TIFFs; the writer cannot set Photometric=3 so they do not round-trip. Construct ListedColormap from attrs['colormap'] in caller code. These attrs still emit for now but trigger a DeprecationWarning. Removal planned for a future release. (#1984) +- Reject writes whose `attrs['crs']` and `attrs['crs_wkt']` resolve to different CRSes (after pyproj canonicalisation) instead of silently emitting the EPSG and dropping the WKT. The new `ConflictingCRSError` (subclass of `ValueError` and `GeoTIFFAmbiguousMetadataError`) names the offending attrs; pass `crs=` explicitly to override both attrs and bypass the check. Read-back DataArrays carrying both attrs continue to round-trip because the reader's two attrs derive from the same on-disk CRS. (#1987) ### Version 0.9.9 - 2026-05-05 diff --git a/xrspatial/geotiff/_validation.py b/xrspatial/geotiff/_validation.py index f299bbbe1..bf301acd7 100644 --- a/xrspatial/geotiff/_validation.py +++ b/xrspatial/geotiff/_validation.py @@ -26,6 +26,7 @@ import numpy as np from ._coords import _BAND_DIM_NAMES +from ._errors import ConflictingCRSError from ._runtime import _TIME_DIM_NAMES, _X_DIM_NAMES, _Y_DIM_NAMES @@ -433,3 +434,83 @@ def validate_write_metadata(context: Mapping[str, Any] | None = None) -> None: # Snapshot for the same reason as the read hook above. for check in tuple(_WRITE_METADATA_CHECKS): check(ctx) + + +# --------------------------------------------------------------------------- +# Conflicting crs / crs_wkt write check (issue #1987 PR 1) +# --------------------------------------------------------------------------- + + +def _check_write_conflicting_crs(context: Mapping[str, Any]) -> None: + """Refuse writes whose ``attrs['crs']`` and ``attrs['crs_wkt']`` disagree. + + The legacy writer consults both attrs (``crs`` takes priority; the + WKT is a fallback). When the two encode different CRSes, the writer + silently emits the EPSG and drops the WKT, producing a TIFF whose + on-disk CRS does not match what the caller's DataArray advertised. + + This check runs whenever both attrs are populated. It canonicalises + each via :mod:`pyproj` and raises :class:`ConflictingCRSError` if + they do not match. A read-back DataArray that legitimately carries + both attrs (the reader emits both when the file has a CRS) passes + because the two derive from the same on-disk CRS. + + Soft preconditions kept lenient: + + * ``pyproj`` not installed -> no-op (downstream paths handle the + missing dependency). + * Either attr unparseable -> no-op (a sibling check in this issue's + series, :class:`UnparseableCRSError`, will refuse those on its + own PR). + + Context keys consumed: + + * ``crs_kwarg`` -- the value of the explicit ``crs=`` writer kwarg. + When this is not ``None`` the kwarg overrides both attrs, so the + attrs disagreement does not affect this write and the check + short-circuits. + * ``attrs_crs`` -- the value of ``data.attrs.get('crs')``. + * ``attrs_crs_wkt`` -- the value of ``data.attrs.get('crs_wkt')``. + + All keys are optional; absence is treated as "nothing to compare". + """ + if context.get('crs_kwarg') is not None: + return + crs_attr = context.get('attrs_crs') + crs_wkt_attr = context.get('attrs_crs_wkt') + if crs_attr is None or not crs_wkt_attr: + return + try: + from pyproj import CRS as _PyProjCRS + except ImportError: + return + try: + crs_from_attr = _PyProjCRS.from_user_input(crs_attr) + crs_from_wkt = _PyProjCRS.from_wkt(crs_wkt_attr) + except Exception: + return + if crs_from_attr.equals(crs_from_wkt): + return + # Truncate the WKT in the message; full WKTs are 1-3 kB and would + # make the error unreadable. The user's attrs still carry the full + # string so they can introspect it directly. + wkt_excerpt = crs_wkt_attr if len(crs_wkt_attr) <= 60 else ( + crs_wkt_attr[:57] + '...' + ) + raise ConflictingCRSError( + f"attrs['crs']={crs_attr!r} and attrs['crs_wkt']={wkt_excerpt!r} " + f"resolve to different CRSes after pyproj canonicalisation. The " + f"writer would silently pick attrs['crs'] and drop the WKT, so " + f"the on-disk CRS would not match what the DataArray advertises. " + f"Reconcile the two attrs (drop the stale one, or pass the " + f"intended CRS explicitly via the ``crs=`` kwarg, which overrides " + f"both attrs) and retry. See issue #1987." + ) + + +# Active by default: the conflicting-CRS write check is registered at +# import time so every entry point that calls ``validate_write_metadata`` +# enforces the rule. Tests that need to scope around the check should +# use ``unregister_write_metadata_check(_check_write_conflicting_crs)`` +# in a fixture rather than mutating the registry directly. +register_write_metadata_check(_check_write_conflicting_crs) diff --git a/xrspatial/geotiff/_writers/eager.py b/xrspatial/geotiff/_writers/eager.py index 21046f4ee..d563ab896 100644 --- a/xrspatial/geotiff/_writers/eager.py +++ b/xrspatial/geotiff/_writers/eager.py @@ -44,6 +44,7 @@ _validate_3d_writer_dims, _validate_nodata_arg, _validate_tile_size_arg, + validate_write_metadata, ) from .._writer import write from .gpu import write_geotiff_gpu @@ -270,6 +271,18 @@ def to_geotiff(data: xr.DataArray | np.ndarray, _validate_nodata_arg(nodata) + # Issue #1987 ambiguous-metadata checks. Today only the + # conflicting-crs/crs_wkt write check is registered; other cases + # land in follow-up PRs. The hook is a no-op when no check is + # registered, so this call is safe even if every check is later + # unregistered for a specific entry point. + _attrs = getattr(data, 'attrs', None) or {} + validate_write_metadata({ + 'crs_kwarg': crs, + 'attrs_crs': _attrs.get('crs'), + 'attrs_crs_wkt': _attrs.get('crs_wkt'), + }) + # Up-front validation: catch bad compression names before they reach # any of the deeper write paths (streaming, GPU, VRT, COG) where the # error surfaces from _compression_tag with a less obvious traceback. @@ -785,6 +798,17 @@ def _write_vrt_tiled(data, vrt_path, *, crs=None, nodata=None, full array in RAM. """ _validate_nodata_arg(nodata) + + # Issue #1987 ambiguous-metadata checks; mirrors the call in + # ``to_geotiff`` so the dask-VRT write path enforces the same + # crs/crs_wkt consistency rule. + _attrs = getattr(data, 'attrs', None) or {} + validate_write_metadata({ + 'crs_kwarg': crs, + 'attrs_crs': _attrs.get('crs'), + 'attrs_crs_wkt': _attrs.get('crs_wkt'), + }) + # Validate compression_level against codec-specific range if compression_level is not None: level_range = _LEVEL_RANGES.get(compression.lower()) diff --git a/xrspatial/geotiff/_writers/gpu.py b/xrspatial/geotiff/_writers/gpu.py index ba8d4f929..9fa3879c2 100644 --- a/xrspatial/geotiff/_writers/gpu.py +++ b/xrspatial/geotiff/_writers/gpu.py @@ -29,6 +29,7 @@ _validate_3d_writer_dims, _validate_nodata_arg, _validate_tile_size_arg, + validate_write_metadata, ) @@ -261,6 +262,15 @@ def write_geotiff_gpu(data: xr.DataArray | cupy.ndarray | np.ndarray, # keep parity with the public to_geotiff entry point. _validate_tile_size_arg(tile_size) _validate_nodata_arg(nodata) + + # Issue #1987 ambiguous-metadata checks; mirrors ``to_geotiff`` so the + # GPU writer enforces the same crs/crs_wkt consistency rule. + _attrs = getattr(data, 'attrs', None) or {} + validate_write_metadata({ + 'crs_kwarg': crs, + 'attrs_crs': _attrs.get('crs'), + 'attrs_crs_wkt': _attrs.get('crs_wkt'), + }) if max_z_error < 0: raise ValueError( f"max_z_error must be >= 0, got {max_z_error}") diff --git a/xrspatial/geotiff/tests/test_conflicting_crs_write_1987.py b/xrspatial/geotiff/tests/test_conflicting_crs_write_1987.py new file mode 100644 index 000000000..7a30ad3df --- /dev/null +++ b/xrspatial/geotiff/tests/test_conflicting_crs_write_1987.py @@ -0,0 +1,265 @@ +"""Conflicting ``attrs['crs']`` and ``attrs['crs_wkt']`` write check. + +Issue #1987, PR 1 of the ambiguous-metadata fail-closed series. + +The writer historically consulted both ``attrs['crs']`` and +``attrs['crs_wkt']`` and gave priority to the EPSG-style ``crs`` attr. +When the two encoded different CRSes, the writer silently emitted the +EPSG and dropped the WKT -- the on-disk CRS no longer matched what the +DataArray advertised. PR 1 wires +``xrspatial.geotiff._validation._check_write_conflicting_crs`` into the +write entry points so the disagreement now raises +``ConflictingCRSError`` (subclass of ``ValueError``) at the boundary. + +This file pins: + +* The check fires on disagreeing attrs (``test_disagreeing_*``). +* The check is silent on consistent attrs, even when both are set -- + this matches the read-back state where the reader always emits both + (``test_matching_*``). +* The check is silent when only one attr is set (``test_only_*``). +* The check is bypassed when the caller passes ``crs=`` explicitly -- + the kwarg overrides both attrs (``test_explicit_crs_kwarg_*``). +* Subclass relationships hold: ``ConflictingCRSError`` is a + ``GeoTIFFAmbiguousMetadataError`` and a ``ValueError`` + (``test_error_class_hierarchy``). +* The check is registered in the write-side registry at module load + (``test_check_registered_at_import``). +* Unparseable inputs fall through without firing this check -- they + are the subject of a sibling PR's ``UnparseableCRSError`` + (``test_unparseable_attrs_dont_raise_conflicting``). +""" +from __future__ import annotations + +import numpy as np +import pytest +import xarray as xr + +from xrspatial.geotiff import ( + ConflictingCRSError, + GeoTIFFAmbiguousMetadataError, + to_geotiff, +) +from xrspatial.geotiff._validation import ( + _check_write_conflicting_crs, + _registered_write_metadata_checks, +) + +pyproj = pytest.importorskip("pyproj") + + +def _da(*, attrs): + """Build a minimal 2-D georeferenced DataArray with the given attrs.""" + data = np.zeros((2, 2), dtype=np.float32) + return xr.DataArray( + data, + dims=('y', 'x'), + coords={ + 'y': np.array([1.0, 0.0], dtype=np.float64), + 'x': np.array([0.0, 1.0], dtype=np.float64), + }, + attrs=dict(attrs), + ) + + +_WKT_4326 = pyproj.CRS.from_epsg(4326).to_wkt() +_WKT_32633 = pyproj.CRS.from_epsg(32633).to_wkt() + + +# --------------------------------------------------------------------------- +# Registry sanity: the check is wired in by import-time registration. +# --------------------------------------------------------------------------- + + +def test_check_registered_at_import(): + """``_check_write_conflicting_crs`` is in the write-side registry. + + Confirms the registration call at the bottom of ``_validation.py`` + fired on first import. A regression here would mean the check is + silently inert and every write path stopped enforcing the rule. + """ + registered = _registered_write_metadata_checks() + assert _check_write_conflicting_crs in registered, ( + f"_check_write_conflicting_crs missing from registry; " + f"registered checks: {[c.__name__ for c in registered]}" + ) + + +def test_error_class_hierarchy(): + """``ConflictingCRSError`` subclasses both the ambiguous-metadata + family and ``ValueError``. + + Catching ``GeoTIFFAmbiguousMetadataError`` handles the whole #1987 + family. Catching ``ValueError`` keeps legacy ``except ValueError`` + blocks working through the deprecation of guess-and-continue. + """ + assert issubclass(ConflictingCRSError, GeoTIFFAmbiguousMetadataError) + assert issubclass(ConflictingCRSError, ValueError) + + +# --------------------------------------------------------------------------- +# Disagreeing attrs: the check raises. +# --------------------------------------------------------------------------- + + +def test_disagreeing_crs_and_crs_wkt_raises(tmp_path): + """``attrs['crs']=4326`` plus ``attrs['crs_wkt']=`` + raises ``ConflictingCRSError`` at write.""" + da = _da(attrs={'crs': 4326, 'crs_wkt': _WKT_32633}) + out = str(tmp_path / 'disagree.tif') + + with pytest.raises(ConflictingCRSError) as excinfo: + to_geotiff(da, out) + + msg = str(excinfo.value) + assert "attrs['crs']" in msg and "attrs['crs_wkt']" in msg, msg + assert '#1987' in msg, msg + + +def test_disagreeing_attrs_raise_value_error_too(tmp_path): + """Legacy ``except ValueError`` still catches the conflict because + ``ConflictingCRSError`` is a ``ValueError`` subclass.""" + da = _da(attrs={'crs': 4326, 'crs_wkt': _WKT_32633}) + out = str(tmp_path / 'disagree_value_error.tif') + + with pytest.raises(ValueError): + to_geotiff(da, out) + + +def test_disagreeing_attrs_raise_ambiguous_metadata_error(tmp_path): + """The whole #1987 family is catchable via the base class.""" + da = _da(attrs={'crs': 4326, 'crs_wkt': _WKT_32633}) + out = str(tmp_path / 'disagree_base.tif') + + with pytest.raises(GeoTIFFAmbiguousMetadataError): + to_geotiff(da, out) + + +# --------------------------------------------------------------------------- +# Agreeing attrs: the check is silent. This is the round-trip read-back +# state, which always carries both keys. +# --------------------------------------------------------------------------- + + +def test_matching_crs_int_and_crs_wkt_passes(tmp_path): + """``attrs['crs']=4326`` plus ``attrs['crs_wkt']=`` + writes without error -- this is the normal read-back state.""" + da = _da(attrs={'crs': 4326, 'crs_wkt': _WKT_4326}) + out = str(tmp_path / 'matching_int.tif') + + to_geotiff(da, out) + + +def test_matching_crs_string_and_crs_wkt_passes(tmp_path): + """``attrs['crs']='EPSG:4326'`` plus the matching WKT canonicalises + to the same CRS via pyproj and writes without error.""" + da = _da(attrs={'crs': 'EPSG:4326', 'crs_wkt': _WKT_4326}) + out = str(tmp_path / 'matching_str.tif') + + to_geotiff(da, out) + + +# --------------------------------------------------------------------------- +# Only one attr set: the check has nothing to compare against. +# --------------------------------------------------------------------------- + + +def test_only_crs_passes(tmp_path): + da = _da(attrs={'crs': 4326}) + to_geotiff(da, str(tmp_path / 'only_crs.tif')) + + +def test_only_crs_wkt_passes(tmp_path): + da = _da(attrs={'crs_wkt': _WKT_4326}) + to_geotiff(da, str(tmp_path / 'only_wkt.tif')) + + +def test_neither_attr_set_passes(tmp_path): + da = _da(attrs={}) + to_geotiff(da, str(tmp_path / 'neither.tif')) + + +# --------------------------------------------------------------------------- +# Explicit ``crs=`` kwarg overrides both attrs, so the check is bypassed. +# --------------------------------------------------------------------------- + + +def test_explicit_crs_kwarg_bypasses_check(tmp_path): + """``to_geotiff(da, ..., crs=4326)`` overrides attrs entirely, + so the disagreement in attrs does not affect this write.""" + da = _da(attrs={'crs': 4326, 'crs_wkt': _WKT_32633}) + + to_geotiff(da, str(tmp_path / 'explicit_kwarg.tif'), crs=4326) + + +def test_explicit_crs_kwarg_zero_does_not_bypass(tmp_path): + """``crs=0`` is not a valid EPSG and the writer's own kwarg + validation rejects it; the conflicting-CRS check fires too. + + The bypass branch checks ``is not None``, so any non-None value + triggers it. ``0`` would slip past the bypass and reach the + downstream EPSG validator (issue #1971). This test pins that the + conflicting-CRS check does NOT mask the downstream rejection in + this corner: pass ``crs=0`` plus disagreeing attrs and ensure + *some* ``ValueError`` fires (whichever sibling check raises + first). + """ + da = _da(attrs={'crs': 4326, 'crs_wkt': _WKT_32633}) + + with pytest.raises(ValueError): + to_geotiff(da, str(tmp_path / 'kwarg_zero.tif'), crs=0) + + +# --------------------------------------------------------------------------- +# Unparseable inputs: this check defers to the sibling +# ``UnparseableCRSError`` PR, which is not yet wired. +# --------------------------------------------------------------------------- + + +def test_unparseable_crs_wkt_does_not_fire_conflicting(tmp_path): + """A ``crs_wkt`` that pyproj cannot parse should not surface as + ``ConflictingCRSError``. The sibling ``UnparseableCRSError`` PR + will handle it; until then the writer's existing WKT-string fallback + decides what happens. This test pins that *our* check stays out of + the way.""" + da = _da(attrs={'crs': 4326, 'crs_wkt': 'NOT A VALID WKT STRING'}) + out = str(tmp_path / 'unparseable_wkt.tif') + + try: + to_geotiff(da, out) + except ConflictingCRSError as e: + pytest.fail( + "Unparseable WKT should not surface as ConflictingCRSError; " + f"that case is owned by UnparseableCRSError. Got: {e}" + ) + except Exception: + # Any other exception from the downstream WKT-fallback path is + # acceptable -- this test only asserts that we do not falsely + # claim a "conflict" for a value that pyproj cannot parse. + pass + + +# --------------------------------------------------------------------------- +# Round-trip safety: the typical read-back DataArray (both attrs set, +# agreeing) writes cleanly without firing the check. +# --------------------------------------------------------------------------- + + +def test_read_back_roundtrip_does_not_raise(tmp_path): + """End-to-end: write a 4326 file, open it (reader emits both attrs), + write again. The second write must not trip the conflict check.""" + from xrspatial.geotiff import open_geotiff + + first = str(tmp_path / 'roundtrip_first.tif') + second = str(tmp_path / 'roundtrip_second.tif') + + src = _da(attrs={'crs': 4326}) + to_geotiff(src, first) + + da = open_geotiff(first) + assert 'crs' in da.attrs and 'crs_wkt' in da.attrs, ( + "reader should emit both crs and crs_wkt on a CRS-bearing file; " + f"got attrs={sorted(da.attrs)}" + ) + + to_geotiff(da, second)