diff --git a/xrspatial/geotiff/_attrs.py b/xrspatial/geotiff/_attrs.py index 9ca929439..30f22e447 100644 --- a/xrspatial/geotiff/_attrs.py +++ b/xrspatial/geotiff/_attrs.py @@ -140,6 +140,8 @@ from __future__ import annotations import warnings +from dataclasses import dataclass, replace +from typing import Any import numpy as np @@ -249,6 +251,294 @@ _RESOLUTION_UNIT_IDS = {'none': 1, 'inch': 2, 'centimeter': 3} +# Reverse map of ``_RESOLUTION_UNIT_IDS``: TIFF ResolutionUnit tag ids back to +# the xrspatial string identifiers. Used by ``geo_info_to_metadata`` so the +# read side stores the same string label the writer expects on the way out. +# Derived from ``_RESOLUTION_UNIT_IDS`` so the forward and reverse maps cannot +# drift if a new unit is added. +_RESOLUTION_UNIT_NAMES = {v: k for k, v in _RESOLUTION_UNIT_IDS.items()} + + +@dataclass(frozen=True) +class GeoTIFFMetadata: + """Typed internal record for GeoTIFF read/write metadata (issue #2139). + + Mirrors the public attrs contract documented in this module's + docstring field-for-field. Read paths build a ``GeoTIFFMetadata`` + once and call :func:`metadata_to_attrs` at the DataArray-construction + boundary; write paths call :func:`attrs_to_metadata` once at entry + and read fields off the record instead of re-resolving from + ``data.attrs``. + + The public attrs surface is unchanged. The dataclass exists so the + four read paths (eager numpy, dask+numpy, GPU, VRT) and three writers + stop building / parsing the same dict by hand. + """ + + # Spatial reference + transform: tuple | None = None + crs_epsg: int | None = None + crs_wkt: str | None = None + raster_type: str = 'area' + has_georef: bool = True + + # NoData semantics (issue #1988 / #2092) + nodata: Any = None + masked_nodata: bool | None = None + + # Pass-through TIFF tags + extra_tags: list | None = None + image_description: str | None = None + extra_samples: tuple | None = None + colormap: tuple | None = None + + # GDAL_METADATA + gdal_metadata: dict | None = None + gdal_metadata_xml: str | None = None + + # Resolution tags + x_resolution: float | None = None + y_resolution: float | None = None + resolution_unit: str | None = None + + # VRT-only + vrt_holes: list | None = None + + # Contract version stamped on read + contract_version: int = _ATTRS_CONTRACT_VERSION + + def with_nodata(self, nodata, *, masked: bool) -> 'GeoTIFFMetadata': + """Return a copy with ``nodata`` and ``masked_nodata`` set. + + Mirrors :func:`_set_nodata_attrs`: when ``nodata is None`` the + record is returned unchanged so absence keeps signalling "no + declared sentinel"; otherwise ``masked`` is coerced to ``bool`` + and stored as ``masked_nodata``. + """ + if nodata is None: + return self + return replace(self, nodata=nodata, masked_nodata=bool(masked)) + + +def geo_info_to_metadata(geo_info, *, window=None) -> GeoTIFFMetadata: + """Build a :class:`GeoTIFFMetadata` from a reader's ``GeoInfo``. + + Centralises the field-by-field copy previously done inline in + :func:`_populate_attrs_from_geo_info`. ``window`` carries the + ``(r0, c0, r1, c1)`` tuple for windowed reads and is applied to the + emitted transform. + """ + has_georef = getattr(geo_info, 'has_georef', True) + src_t = geo_info.transform + + transform = None + if src_t is not None and has_georef: + transform = _transform_tuple_from_pixel_geometry( + src_t.origin_x, src_t.origin_y, + src_t.pixel_width, src_t.pixel_height, + window=window, + ) + + # ``allow_rotated=True`` opt-in path (#2115): the parser returns a + # GeoTransform with ``rotated_affine`` set and ``has_georef=False``. + # The rotated 6-tuple cannot be expressed as an axis-aligned + # rasterio transform, so the writer cannot round-trip it via + # ``attrs['transform']``. Per the documented contract on + # ``open_geotiff(allow_rotated=True)``, CRS attrs are dropped on + # this path too -- otherwise downstream code that gates on + # ``"crs" in da.attrs`` treats the array as spatially meaningful + # while the actual mapping is gone (#2126). + rotated_optin = ( + src_t is not None + and getattr(src_t, 'rotated_affine', None) is not None + and not has_georef + ) + crs_epsg = None if rotated_optin else geo_info.crs_epsg + crs_wkt = None if rotated_optin else geo_info.crs_wkt + + raster_type = ( + 'point' if geo_info.raster_type == RASTER_PIXEL_IS_POINT else 'area') + + resolution_unit = None + if geo_info.resolution_unit is not None: + resolution_unit = _RESOLUTION_UNIT_NAMES.get( + geo_info.resolution_unit, str(geo_info.resolution_unit)) + + colormap = None + if geo_info.extra_tags is not None: + for _tag_id, _tt, _tc, _tv in geo_info.extra_tags: + if _tag_id == 320: # TAG_COLORMAP + colormap = _tv + break + + return GeoTIFFMetadata( + transform=transform, + crs_epsg=crs_epsg, + crs_wkt=crs_wkt, + raster_type=raster_type, + has_georef=bool(has_georef and src_t is not None), + extra_tags=geo_info.extra_tags, + image_description=geo_info.image_description, + extra_samples=geo_info.extra_samples, + colormap=colormap, + gdal_metadata=geo_info.gdal_metadata, + gdal_metadata_xml=geo_info.gdal_metadata_xml, + x_resolution=geo_info.x_resolution, + y_resolution=geo_info.y_resolution, + resolution_unit=resolution_unit, + contract_version=_ATTRS_CONTRACT_VERSION, + ) + + +def metadata_to_attrs(md: GeoTIFFMetadata) -> dict: + """Build the public attrs dict from a :class:`GeoTIFFMetadata` record. + + The output is identical to what + :func:`_populate_attrs_from_geo_info` writes today, so the read-path + swap is a behaviour no-op when the source record comes from + :func:`geo_info_to_metadata`. + """ + attrs: dict = {'_xrspatial_geotiff_contract': md.contract_version} + + if md.crs_epsg is not None: + attrs['crs'] = md.crs_epsg + if md.crs_wkt is not None: + attrs['crs_wkt'] = md.crs_wkt + if md.raster_type == 'point': + attrs['raster_type'] = 'point' + + # Three states on the (transform, has_georef) pair: + # + # * transform set + has_georef -> emit ``attrs['transform']``. + # * not has_georef -> emit ``attrs[_NO_GEOREF_KEY] = True``. + # * transform None + has_georef -> emit neither. + # + # The third state is the eager VRT path's contract: the VRT builder + # constructs the record with ``has_georef=True`` but leaves + # ``transform=None`` because the inline VRT code stamps the + # rasterio-ordered transform tuple onto the dict a few lines later + # (after the GPU transfer / dtype cast / nodata mask steps). Removing + # this branch would re-introduce a duplicate transform write or, worse, + # would emit ``_NO_GEOREF_KEY`` on a georef'd VRT array. See #2139. + if md.transform is not None and md.has_georef: + attrs['transform'] = md.transform + elif not md.has_georef: + attrs[_NO_GEOREF_KEY] = True + + if md.nodata is not None: + attrs['nodata'] = md.nodata + attrs['masked_nodata'] = bool(md.masked_nodata) + + if md.gdal_metadata is not None: + attrs['gdal_metadata'] = md.gdal_metadata + if md.gdal_metadata_xml is not None: + attrs['gdal_metadata_xml'] = md.gdal_metadata_xml + + if md.extra_tags is not None: + attrs['extra_tags'] = md.extra_tags + if md.image_description is not None: + attrs['image_description'] = md.image_description + if md.extra_samples is not None: + attrs['extra_samples'] = md.extra_samples + + if md.x_resolution is not None: + attrs['x_resolution'] = md.x_resolution + if md.y_resolution is not None: + attrs['y_resolution'] = md.y_resolution + if md.resolution_unit is not None: + attrs['resolution_unit'] = md.resolution_unit + + if md.colormap is not None: + attrs['colormap'] = md.colormap + + if md.vrt_holes: + attrs['vrt_holes'] = md.vrt_holes + + return attrs + + +def attrs_to_metadata(attrs) -> GeoTIFFMetadata: + """Parse a (possibly user-supplied) attrs dict into a metadata record. + + Honours the alias resolution :func:`_resolve_nodata_attr` already + implements. ``attrs`` may be a plain dict, an + ``xarray.core.utils.Frozen``, or ``None``. + + Boundary contract -- this function parses leniently and lets the + writer enforce strict validation against the parsed record: + + * ``attrs['crs']=True`` lands as ``crs_epsg=None`` rather than + raising. ``_validate_crs_arg`` (called from the writers) is the + validator that should reject bool values; the boundary parser + only needs to keep the bad value out of the record so the writer + sees ``crs_epsg=None`` and falls through to ``crs_wkt``. See + ``test_crs_arg_validation_1971.py``. + * ``transform`` is coerced via ``tuple(...)`` with no length or + numeric-type check. ``_transform_from_attr`` is the canonical + validator and runs inside the writer. + """ + if attrs is None: + attrs = {} + + raster_type = 'point' if attrs.get('raster_type') == 'point' else 'area' + + transform = attrs.get('transform') + no_georef = bool(attrs.get(_NO_GEOREF_KEY)) + has_georef = (transform is not None) and not no_georef + + nodata = _resolve_nodata_attr(attrs) + masked_attr = attrs.get('masked_nodata') + masked_nodata: bool | None + if nodata is None: + masked_nodata = None + elif masked_attr is None: + masked_nodata = None + else: + masked_nodata = bool(masked_attr) + + crs_epsg = None + crs_wkt = attrs.get('crs_wkt') if isinstance(attrs.get('crs_wkt'), str) else None + crs_attr = attrs.get('crs') + if isinstance(crs_attr, str): + # ``attrs['crs']`` carries a WKT string under some pipelines; fold + # it into ``crs_wkt`` here so the writer's resolve step does not + # have to re-branch on type. + if crs_wkt is None: + crs_wkt = crs_attr + elif crs_attr is not None and not isinstance(crs_attr, bool): + # ``isinstance(True, int)`` is True; reject bool here so the + # writer's _validate_crs_arg gate is not bypassed at the + # boundary. + try: + crs_epsg = int(crs_attr) + except (TypeError, ValueError): + crs_epsg = None + + contract_version = attrs.get( + '_xrspatial_geotiff_contract', _ATTRS_CONTRACT_VERSION) + + return GeoTIFFMetadata( + transform=tuple(transform) if transform is not None else None, + crs_epsg=crs_epsg, + crs_wkt=crs_wkt, + raster_type=raster_type, + has_georef=has_georef, + nodata=nodata, + masked_nodata=masked_nodata, + extra_tags=attrs.get('extra_tags'), + image_description=attrs.get('image_description'), + extra_samples=attrs.get('extra_samples'), + colormap=attrs.get('colormap'), + gdal_metadata=attrs.get('gdal_metadata'), + gdal_metadata_xml=attrs.get('gdal_metadata_xml'), + x_resolution=attrs.get('x_resolution'), + y_resolution=attrs.get('y_resolution'), + resolution_unit=attrs.get('resolution_unit'), + vrt_holes=attrs.get('vrt_holes'), + contract_version=contract_version, + ) + + def _extent_to_window(transform, file_height, file_width, y_min, y_max, x_min, x_max): """Convert geographic extent to pixel window (row_start, col_start, row_stop, col_stop). @@ -461,106 +751,15 @@ def _populate_attrs_from_geo_info(attrs: dict, geo_info, *, window=None) -> None overwritten with the current ``_ATTRS_CONTRACT_VERSION``; callers pass freshly built dicts, so this is the intended behaviour. """ - # Stamp the contract version first so every read path that funnels - # through this helper carries the marker. The VRT backends build - # their attrs dict directly and stamp the version there (see - # ``_backends/vrt.py``); keep both sites in sync via the constant - # rather than the bare literal. - attrs['_xrspatial_geotiff_contract'] = _ATTRS_CONTRACT_VERSION - - src_t = geo_info.transform - has_georef = getattr(geo_info, 'has_georef', True) - # ``allow_rotated=True`` opt-in path (#2115): the parser returns a - # GeoTransform with ``rotated_affine`` set and ``has_georef=False``. - # The rotated 6-tuple cannot be expressed as an axis-aligned - # rasterio transform, so the writer cannot round-trip it via - # ``attrs['transform']``. Per the documented contract on - # ``open_geotiff(allow_rotated=True)``, CRS attrs are dropped on - # this path too -- otherwise downstream code that gates on - # ``"crs" in da.attrs`` treats the array as spatially meaningful - # while the actual mapping is gone (#2122 / #2126). - rotated_optin = ( - src_t is not None - and getattr(src_t, 'rotated_affine', None) is not None - and not has_georef - ) - - if not rotated_optin: - if geo_info.crs_epsg is not None: - attrs['crs'] = geo_info.crs_epsg - if geo_info.crs_wkt is not None: - attrs['crs_wkt'] = geo_info.crs_wkt - if geo_info.raster_type == RASTER_PIXEL_IS_POINT: - attrs['raster_type'] = 'point' - - # Skip the transform attr for files where no GeoTIFF transform tags - # (ModelTransformation, ModelPixelScale, or ModelTiepoint) are - # present, signalled by ``has_georef=False``. GeoKeys / CRS metadata - # can still be present in that case (and are kept for plain - # no-georef files; the rotated opt-in path drops them above). The - # default unit ``GeoTransform`` is a struct placeholder, not real - # georef -- emitting it leaks an identity transform into attrs and - # confuses downstream code that expects ``'transform' in attrs`` to - # mean "this raster has a georef transform" (#1710). - if src_t is not None and has_georef: - attrs['transform'] = _transform_tuple_from_pixel_geometry( - src_t.origin_x, src_t.origin_y, - src_t.pixel_width, src_t.pixel_height, - window=window, - ) - else: - # Stamp the no-georef marker so the writer can tell our - # placeholder int64 step-1 coords apart from a user-authored - # integer-coord grid (#2120). Pre-#2120 the writer detected the - # placeholder by shape alone, which silently stripped georef - # from any user array whose coords matched the same arange - # pattern. - attrs[_NO_GEOREF_KEY] = True - - # Contract v2 (issue #2016) removed the 13 secondary GeoKey-derived - # attrs that v1 emitted under a ``DeprecationWarning`` (``crs_name``, - # ``geog_citation``, ``datum_code``, ``angular_units``, - # ``semi_major_axis``, ``inv_flattening``, ``linear_units``, - # ``projection_code``, ``vertical_crs``, ``vertical_citation``, - # ``vertical_units``). The underlying ``GeoInfo`` fields are still - # populated by the GeoKey parser because ``_synthesize_user_defined_wkt`` - # consumes ``geog_citation`` (and siblings) to fill ``crs_wkt`` for - # user-defined CRSes; the reader no longer surfaces them as - # separate user-visible attrs. - - if geo_info.gdal_metadata is not None: - attrs['gdal_metadata'] = geo_info.gdal_metadata - if geo_info.gdal_metadata_xml is not None: - attrs['gdal_metadata_xml'] = geo_info.gdal_metadata_xml - - if geo_info.extra_tags is not None: - attrs['extra_tags'] = geo_info.extra_tags - if geo_info.image_description is not None: - attrs['image_description'] = geo_info.image_description - if geo_info.extra_samples is not None: - attrs['extra_samples'] = geo_info.extra_samples - - if geo_info.x_resolution is not None: - attrs['x_resolution'] = geo_info.x_resolution - if geo_info.y_resolution is not None: - attrs['y_resolution'] = geo_info.y_resolution - if geo_info.resolution_unit is not None: - _unit_names = {1: 'none', 2: 'inch', 3: 'centimeter'} - attrs['resolution_unit'] = _unit_names.get( - geo_info.resolution_unit, str(geo_info.resolution_unit)) - - # Contract v2 (issue #2016) removed ``attrs['cmap']`` and - # ``attrs['colormap_rgba']``. The canonical ``attrs['colormap']`` - # (raw uint16 RGB triples from TIFF tag 320) is still emitted below - # via the ``extra_tags`` scan; callers that need an RGBA palette or - # a :class:`matplotlib.colors.ListedColormap` should build one from - # ``attrs['colormap']`` directly. - - if geo_info.extra_tags is not None: - for _tag_id, _tt, _tc, _tv in geo_info.extra_tags: - if _tag_id == 320: # TAG_COLORMAP - attrs['colormap'] = _tv - break + # Compatibility shim: build a typed :class:`GeoTIFFMetadata` once + # and fold it into the caller's dict. The two routes + # (:func:`metadata_to_attrs` and the legacy field-by-field writes) + # produce the same attrs surface; centralising on the record lets + # the VRT path emit the same field set without copying this block. + # The ``allow_rotated=True`` opt-in CRS-drop (#2126) is handled + # inside ``geo_info_to_metadata``. See issue #2139 / ``metadata_to_attrs``. + md = geo_info_to_metadata(geo_info, window=window) + attrs.update(metadata_to_attrs(md)) def _resolve_nodata_attr(attrs: dict): diff --git a/xrspatial/geotiff/_backends/vrt.py b/xrspatial/geotiff/_backends/vrt.py index ba38bff84..70948532a 100644 --- a/xrspatial/geotiff/_backends/vrt.py +++ b/xrspatial/geotiff/_backends/vrt.py @@ -13,12 +13,15 @@ import numpy as np import xarray as xr -from .._attrs import _ATTRS_CONTRACT_VERSION, _set_nodata_attrs +from .._attrs import ( + GeoTIFFMetadata, + _set_nodata_attrs, + metadata_to_attrs, +) from .._coords import ( coords_from_pixel_geometry as _coords_from_pixel_geometry, transform_tuple_from_pixel_geometry as _transform_tuple_from_pixel_geometry, ) -from .._geotags import _NO_GEOREF_KEY from .._crs import _wkt_to_epsg from .._validation import _validate_chunks_arg, _validate_dtype_cast @@ -298,36 +301,37 @@ def read_vrt(source: str, *, else: coords = {} - # VRT builds its attrs dict inline rather than going through - # ``_populate_attrs_from_geo_info``; stamp the contract version here - # so both code paths emit the same marker. - attrs = {'_xrspatial_geotiff_contract': _ATTRS_CONTRACT_VERSION} - if gt is None or _vrt_is_rotated: - # Mirror the eager non-VRT no-georef path: stamp the no-georef - # marker whenever the read produced placeholder int64 coords. - # Covers both "no transform tags at all" and rotated reads under - # ``allow_rotated=True``. Without the rotated arm, the writer - # round-trip on a rotated read mistook the int64 coords for a - # user-authored integer-coord grid and stripped georef from the - # output (#2120 / #2122 follow-up). - attrs[_NO_GEOREF_KEY] = True - if vrt.crs_wkt and not _vrt_is_rotated: - epsg = _wkt_to_epsg(vrt.crs_wkt) - if epsg is not None: - attrs['crs'] = epsg - attrs['crs_wkt'] = vrt.crs_wkt - if vrt.raster_type == 'point': - attrs['raster_type'] = 'point' - # Surface skipped-source records as ``attrs['vrt_holes']`` so - # callers can detect a partial mosaic by attribute lookup. Under - # lenient mode (the default), the underlying ``_vrt.read_vrt`` - # already warned per skipped source but the warning is easy to - # miss in a pipeline; the attr lets downstream code branch on - # ``"vrt_holes" in da.attrs`` instead of monitoring the warnings - # stream. Empty list is omitted so the attr only appears when - # there is actually a hole. See issue #1734. - if vrt.holes: - attrs['vrt_holes'] = list(vrt.holes) + # Build the VRT attrs surface via :class:`GeoTIFFMetadata` so the + # eager VRT, chunked VRT, and non-VRT read paths share one + # field-set-to-attrs marshalling step. The VRT path intentionally + # populates only the subset of fields it owns (CRS, transform, + # raster_type, no-georef marker, vrt_holes); ``extra_tags``, + # ``gdal_metadata`` and resolution tags continue to be omitted on + # this path until the migration's VRT-tag-coverage follow-up lands. + # See issue #2139. + # + # Rotated VRT reads (``allow_rotated=True`` opt-in, #2122) drop both + # the transform AND the CRS attrs because the writer cannot + # round-trip a rotated 6-tuple as ``attrs['transform']``. Modelled by + # setting ``has_georef=False`` (so ``metadata_to_attrs`` emits the + # no-georef marker) and clearing the CRS fields below. + # + # ``vrt.crs_wkt`` carries an empty string when the VRT XML has a + # ```` element but no recognised CRS body; treat empty as + # absent so ``metadata_to_attrs`` does not emit ``attrs['crs_wkt']=''``. + _vrt_keep_crs = bool(vrt.crs_wkt) and not _vrt_is_rotated + _vrt_epsg = _wkt_to_epsg(vrt.crs_wkt) if _vrt_keep_crs else None + _vrt_md = GeoTIFFMetadata( + crs_epsg=_vrt_epsg, + crs_wkt=vrt.crs_wkt if _vrt_keep_crs else None, + raster_type='point' if vrt.raster_type == 'point' else 'area', + has_georef=gt is not None and not _vrt_is_rotated, + # Surface skipped-source records as ``attrs['vrt_holes']`` so + # callers can detect a partial mosaic by attribute lookup. See + # issue #1734. + vrt_holes=list(vrt.holes) if vrt.holes else None, + ) + attrs = metadata_to_attrs(_vrt_md) # When a specific band is selected, source its nodata from that # band's instead of band 0's. Otherwise multi-band # VRTs with per-band sentinels would mis-mask the read: attrs would @@ -768,10 +772,9 @@ def _read_vrt_chunked(source, *, window, band, name, chunks, gpu, dtype, # eager reads share the same x/y arrays. gt = vrt.geo_transform coords = {} - # Mirrors the eager VRT branch: this code path bypasses - # ``_populate_attrs_from_geo_info``, so the contract version is - # stamped inline using the shared constant to stay in lockstep. - attrs = {'_xrspatial_geotiff_contract': _ATTRS_CONTRACT_VERSION} + # Build attrs via :class:`GeoTIFFMetadata` to share the marshalling + # step with the eager VRT branch. See issue #2139. + # # ``_vrt_is_rotated`` matches the eager VRT branch's gate (#2122): # rotated VRTs read with ``allow_rotated=True`` opt out of the # georef attrs (``crs`` / ``crs_wkt`` / ``transform``) so downstream @@ -796,28 +799,34 @@ def _read_vrt_chunked(source, *, window, band, name, chunks, gpu, dtype, window=coord_window, has_georef=not _vrt_is_rotated, ) - if not _vrt_is_rotated: - attrs['transform'] = _transform_tuple_from_pixel_geometry( + # Rotated VRTs (#2122) drop the transform because it cannot + # round-trip as ``attrs['transform']``; ``_vrt_md.has_georef`` + # gates this below so ``metadata_to_attrs`` emits the no-georef + # marker on the rotated path. + _vrt_transform = ( + None if _vrt_is_rotated + else _transform_tuple_from_pixel_geometry( origin_x, origin_y, res_x, res_y, window=(win_r0, win_c0, 0, 0), ) - else: - # Mirrors the no-transform arm: rotated reads emit - # placeholder int64 coords, so stamp the marker that lets - # the writer tell them apart from a user-authored integer - # grid (#2120 / #2122 follow-up). - attrs[_NO_GEOREF_KEY] = True + ) else: - # Defensive marker (issue #2120). See the eager VRT branch. - attrs[_NO_GEOREF_KEY] = True - - if vrt.crs_wkt and not _vrt_is_rotated: - epsg = _wkt_to_epsg(vrt.crs_wkt) - if epsg is not None: - attrs['crs'] = epsg - attrs['crs_wkt'] = vrt.crs_wkt - if vrt.raster_type == 'point': - attrs['raster_type'] = 'point' + _vrt_transform = None + + # ``vrt.crs_wkt`` carries an empty string when the VRT XML has a + # ```` element but no recognised CRS body; treat empty as + # absent so ``metadata_to_attrs`` does not emit ``attrs['crs_wkt']=''``. + # Rotated VRTs drop CRS attrs alongside the transform (#2122). + _vrt_keep_crs = bool(vrt.crs_wkt) and not _vrt_is_rotated + _vrt_epsg = _wkt_to_epsg(vrt.crs_wkt) if _vrt_keep_crs else None + _vrt_md = GeoTIFFMetadata( + transform=_vrt_transform, + crs_epsg=_vrt_epsg, + crs_wkt=vrt.crs_wkt if _vrt_keep_crs else None, + raster_type='point' if vrt.raster_type == 'point' else 'area', + has_georef=gt is not None and not _vrt_is_rotated, + ) + attrs = metadata_to_attrs(_vrt_md) # Surface the nodata sentinel for the selected band. The chunked # path declares ``float64`` up front whenever any selected band has diff --git a/xrspatial/geotiff/tests/test_geotiff_metadata_2139.py b/xrspatial/geotiff/tests/test_geotiff_metadata_2139.py new file mode 100644 index 000000000..b64e3fb8e --- /dev/null +++ b/xrspatial/geotiff/tests/test_geotiff_metadata_2139.py @@ -0,0 +1,376 @@ +"""Round-trip tests for the internal ``GeoTIFFMetadata`` dataclass. + +Issue #2139. + +The dataclass and the two boundary functions +(``metadata_to_attrs`` and ``attrs_to_metadata``) replace the manual +``attrs[...] = ...`` blocks scattered across the four read paths and +three writers. The public attrs surface does not change; what changes +is that every backend now goes through one marshalling step. + +These tests pin two invariants: + +* ``metadata_to_attrs(geo_info_to_metadata(...))`` emits the same key + set the legacy ``_populate_attrs_from_geo_info`` writes. +* ``metadata_to_attrs(attrs_to_metadata(x))`` is a stable round trip + for representative attrs dicts (eager numpy, dask, GPU, VRT, + no-georef, point raster, user-defined CRS WKT, with-nodata). +""" +from __future__ import annotations + +import numpy as np +import pytest + +from xrspatial.geotiff._attrs import ( + GeoTIFFMetadata, + _ATTRS_CONTRACT_VERSION, + attrs_to_metadata, + geo_info_to_metadata, + metadata_to_attrs, +) +from xrspatial.geotiff._geotags import _NO_GEOREF_KEY + + +# --------------------------------------------------------------------------- +# Builders +# --------------------------------------------------------------------------- + + +class _FakeTransform: + """Stand-in for ``GeoInfo.transform`` used by the reader.""" + + def __init__(self, origin_x=0.0, origin_y=0.0, + pixel_width=1.0, pixel_height=-1.0): + self.origin_x = origin_x + self.origin_y = origin_y + self.pixel_width = pixel_width + self.pixel_height = pixel_height + + +class _FakeGeoInfo: + """Minimal ``GeoInfo`` stand-in covering the fields the reader emits.""" + + def __init__( + self, + *, + transform=None, + crs_epsg=None, + crs_wkt=None, + raster_type=1, # RASTER_PIXEL_IS_AREA + has_georef=True, + nodata=None, + extra_tags=None, + image_description=None, + extra_samples=None, + gdal_metadata=None, + gdal_metadata_xml=None, + x_resolution=None, + y_resolution=None, + resolution_unit=None, + ): + self.transform = transform + self.crs_epsg = crs_epsg + self.crs_wkt = crs_wkt + self.raster_type = raster_type + self.has_georef = has_georef + self.nodata = nodata + self.extra_tags = extra_tags + self.image_description = image_description + self.extra_samples = extra_samples + self.gdal_metadata = gdal_metadata + self.gdal_metadata_xml = gdal_metadata_xml + self.x_resolution = x_resolution + self.y_resolution = y_resolution + self.resolution_unit = resolution_unit + + +# --------------------------------------------------------------------------- +# geo_info_to_metadata -> metadata_to_attrs +# --------------------------------------------------------------------------- + + +def test_geo_info_to_metadata_stamps_contract_version(): + md = geo_info_to_metadata(_FakeGeoInfo()) + assert md.contract_version == _ATTRS_CONTRACT_VERSION + attrs = metadata_to_attrs(md) + assert attrs['_xrspatial_geotiff_contract'] == _ATTRS_CONTRACT_VERSION + + +def test_geo_info_to_metadata_transform_present(): + gi = _FakeGeoInfo( + transform=_FakeTransform(origin_x=1.0, origin_y=10.0, + pixel_width=0.5, pixel_height=-0.5), + crs_epsg=4326, crs_wkt='WKT-CRS', + ) + md = geo_info_to_metadata(gi) + assert md.has_georef is True + assert md.transform is not None + assert md.crs_epsg == 4326 + assert md.crs_wkt == 'WKT-CRS' + + attrs = metadata_to_attrs(md) + assert 'transform' in attrs + assert attrs['crs'] == 4326 + assert attrs['crs_wkt'] == 'WKT-CRS' + assert _NO_GEOREF_KEY not in attrs + + +def test_geo_info_to_metadata_no_georef_marker(): + # No transform tags -> has_georef=False on the GeoInfo + gi = _FakeGeoInfo(transform=None, has_georef=False) + md = geo_info_to_metadata(gi) + assert md.has_georef is False + assert md.transform is None + + attrs = metadata_to_attrs(md) + assert 'transform' not in attrs + assert attrs[_NO_GEOREF_KEY] is True + + +def test_geo_info_to_metadata_point_raster(): + gi = _FakeGeoInfo(transform=_FakeTransform(), raster_type=2) # POINT + md = geo_info_to_metadata(gi) + assert md.raster_type == 'point' + + attrs = metadata_to_attrs(md) + assert attrs['raster_type'] == 'point' + + +def test_geo_info_to_metadata_resolution_unit_label(): + gi = _FakeGeoInfo( + transform=_FakeTransform(), + x_resolution=300.0, y_resolution=300.0, resolution_unit=2, + ) + md = geo_info_to_metadata(gi) + assert md.resolution_unit == 'inch' + + attrs = metadata_to_attrs(md) + assert attrs['resolution_unit'] == 'inch' + assert attrs['x_resolution'] == 300.0 + assert attrs['y_resolution'] == 300.0 + + +def test_geo_info_to_metadata_colormap_from_extra_tags(): + extra_tags = [(320, 3, 6, (10, 20, 30, 40, 50, 60))] + gi = _FakeGeoInfo(transform=_FakeTransform(), extra_tags=extra_tags) + md = geo_info_to_metadata(gi) + assert md.colormap == (10, 20, 30, 40, 50, 60) + attrs = metadata_to_attrs(md) + assert attrs['colormap'] == (10, 20, 30, 40, 50, 60) + + +# --------------------------------------------------------------------------- +# attrs_to_metadata: parse user/reader attrs back to a record +# --------------------------------------------------------------------------- + + +def test_attrs_to_metadata_none_attrs_returns_default_record(): + md = attrs_to_metadata(None) + assert isinstance(md, GeoTIFFMetadata) + assert md.transform is None + assert md.crs_epsg is None + assert md.crs_wkt is None + assert md.nodata is None + + +def test_attrs_to_metadata_empty_dict(): + md = attrs_to_metadata({}) + assert md.transform is None + assert md.has_georef is False # no transform -> not georeferenced + assert md.raster_type == 'area' + + +def test_attrs_to_metadata_crs_int_epsg(): + md = attrs_to_metadata({'crs': 4326, 'crs_wkt': 'WKT-X'}) + assert md.crs_epsg == 4326 + assert md.crs_wkt == 'WKT-X' + + +def test_attrs_to_metadata_crs_wkt_string_in_crs_field(): + # Some upstream pipelines stash a WKT string in attrs['crs'] + md = attrs_to_metadata({'crs': 'PROJCS["X"...]'}) + assert md.crs_epsg is None + assert md.crs_wkt == 'PROJCS["X"...]' + + +def test_attrs_to_metadata_crs_bool_rejected_at_boundary(): + # attrs={'crs': True} previously round-tripped as EPSG=1 in some + # writers; the boundary parser refuses to coerce booleans here. + md = attrs_to_metadata({'crs': True}) + assert md.crs_epsg is None + + +def test_attrs_to_metadata_nodata_canonical(): + md = attrs_to_metadata({'nodata': -9999, 'masked_nodata': True}) + assert md.nodata == -9999 + assert md.masked_nodata is True + + +def test_attrs_to_metadata_nodata_alias_nodatavals(): + md = attrs_to_metadata({'nodatavals': (-9999,)}) + assert md.nodata == -9999 + # No masked_nodata flag on the input -> None on the record + assert md.masked_nodata is None + + +def test_attrs_to_metadata_nodata_alias_fill_value(): + md = attrs_to_metadata({'_FillValue': -1}) + assert md.nodata == -1 + + +def test_attrs_to_metadata_no_georef_marker_clears_has_georef(): + md = attrs_to_metadata({_NO_GEOREF_KEY: True}) + assert md.has_georef is False + + +def test_attrs_to_metadata_transform_present_implies_georef(): + md = attrs_to_metadata({'transform': (1.0, 0.0, 0.0, 0.0, -1.0, 10.0)}) + assert md.has_georef is True + assert md.transform == (1.0, 0.0, 0.0, 0.0, -1.0, 10.0) + + +# --------------------------------------------------------------------------- +# Round-trip metadata_to_attrs <-> attrs_to_metadata +# --------------------------------------------------------------------------- + + +def _representative_attrs_dicts(): + """Yield representative attrs dicts that the round trip must preserve. + + Each entry mirrors a real backend's emit set: + + * eager numpy file (CRS + transform + nodata + masked) + * point raster + * no-georef file + * user-defined CRS (WKT only, no EPSG) + * VRT with holes + """ + yield { + '_xrspatial_geotiff_contract': _ATTRS_CONTRACT_VERSION, + 'crs': 4326, + 'crs_wkt': 'WKT-4326', + 'transform': (1.0, 0.0, 0.0, 0.0, -1.0, 10.0), + 'nodata': -9999, + 'masked_nodata': True, + } + yield { + '_xrspatial_geotiff_contract': _ATTRS_CONTRACT_VERSION, + 'crs': 4326, + 'crs_wkt': 'WKT-4326', + 'transform': (1.0, 0.0, 0.0, 0.0, -1.0, 10.0), + 'raster_type': 'point', + } + yield { + '_xrspatial_geotiff_contract': _ATTRS_CONTRACT_VERSION, + _NO_GEOREF_KEY: True, + } + yield { + '_xrspatial_geotiff_contract': _ATTRS_CONTRACT_VERSION, + 'crs_wkt': 'PROJCS["user defined"...]', + 'transform': (1.0, 0.0, 0.0, 0.0, -1.0, 10.0), + } + yield { + '_xrspatial_geotiff_contract': _ATTRS_CONTRACT_VERSION, + 'crs': 32610, + 'crs_wkt': 'WKT-32610', + 'transform': (10.0, 0.0, 500000.0, 0.0, -10.0, 4000000.0), + 'vrt_holes': [{'source': '/tmp/missing.tif', 'band': 1}], + } + + +@pytest.mark.parametrize( + 'attrs', + list(_representative_attrs_dicts()), + ids=['eager', 'point', 'no_georef', 'user_wkt', 'vrt'], +) +def test_round_trip_attrs_to_metadata_to_attrs(attrs): + """``metadata_to_attrs(attrs_to_metadata(x))`` preserves every key in x.""" + md = attrs_to_metadata(attrs) + round_tripped = metadata_to_attrs(md) + for key, value in attrs.items(): + assert key in round_tripped, f"key {key!r} dropped on round trip" + assert round_tripped[key] == value, ( + f"value at {key!r} changed: {value!r} -> {round_tripped[key]!r}" + ) + + +@pytest.mark.parametrize( + 'attrs', + list(_representative_attrs_dicts()), + ids=['eager', 'point', 'no_georef', 'user_wkt', 'vrt'], +) +def test_round_trip_emits_no_unexpected_keys(attrs): + """Round trip emits the input keys and the contract version, nothing else. + + Locks the marshalling step so a future field added to + ``metadata_to_attrs`` without an attr-contract update fails this + test rather than silently appearing on every read. + """ + md = attrs_to_metadata(attrs) + round_tripped = metadata_to_attrs(md) + expected_keys = set(attrs.keys()) | {'_xrspatial_geotiff_contract'} + extra = set(round_tripped) - expected_keys + assert not extra, ( + f"metadata_to_attrs emitted unexpected keys {extra!r} not present in input " + f"{set(attrs)!r}; either add them to the attrs contract or fix " + f"the marshalling step." + ) + + +def test_round_trip_metadata_to_attrs_to_metadata(): + """Building a record, marshalling, parsing back recovers all fields.""" + md = GeoTIFFMetadata( + transform=(1.0, 0.0, 0.0, 0.0, -1.0, 10.0), + crs_epsg=4326, + crs_wkt='WKT-4326', + raster_type='point', + has_georef=True, + nodata=-9999, + masked_nodata=True, + x_resolution=300.0, y_resolution=300.0, resolution_unit='inch', + ) + attrs = metadata_to_attrs(md) + md2 = attrs_to_metadata(attrs) + + assert md2.transform == md.transform + assert md2.crs_epsg == md.crs_epsg + assert md2.crs_wkt == md.crs_wkt + assert md2.raster_type == md.raster_type + assert md2.has_georef == md.has_georef + assert md2.nodata == md.nodata + assert md2.masked_nodata == md.masked_nodata + assert md2.x_resolution == md.x_resolution + assert md2.y_resolution == md.y_resolution + assert md2.resolution_unit == md.resolution_unit + + +# --------------------------------------------------------------------------- +# with_nodata builder +# --------------------------------------------------------------------------- + + +def test_with_nodata_sets_pair(): + md = GeoTIFFMetadata() + md2 = md.with_nodata(-9999, masked=True) + assert md2.nodata == -9999 + assert md2.masked_nodata is True + # frozen dataclass returns a new instance + assert md.nodata is None + assert md is not md2 + + +def test_with_nodata_none_returns_unchanged(): + md = GeoTIFFMetadata(crs_epsg=4326) + md2 = md.with_nodata(None, masked=True) + # ``None`` mirrors ``_set_nodata_attrs`` no-op contract; record + # comes back equal so absence keeps signalling "no declared + # sentinel." + assert md2.nodata is None + assert md2.masked_nodata is None + assert md2.crs_epsg == 4326 + + +def test_with_nodata_masked_false_records_false(): + md = GeoTIFFMetadata() + md2 = md.with_nodata(-9999, masked=False) + assert md2.masked_nodata is False