From 487c84555750b91b4eaad90518fc98ffa7877ace Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Tue, 19 May 2026 11:52:14 -0700 Subject: [PATCH 1/2] geotiff: add internal GeoTIFFMetadata dataclass and migrate readers (#2139) Introduce a frozen GeoTIFFMetadata dataclass with two boundary functions (metadata_to_attrs, attrs_to_metadata) plus a geo_info_to_metadata builder so the four read paths and three writers stop building / parsing the same attrs dict by hand. The public attrs surface is unchanged. Migrated in this PR (reader-first slice from the issue's plan): - _populate_attrs_from_geo_info becomes a thin wrapper that builds a GeoTIFFMetadata and folds metadata_to_attrs back into the caller's dict. Field-for-field equivalent to the prior implementation, so the three non-VRT read paths see no change. - The two VRT inline attrs blocks (eager + chunked) now build a GeoTIFFMetadata and run it through metadata_to_attrs, removing the duplicated 'stamp contract version / set crs / set crs_wkt / set raster_type / mark no-georef / surface vrt_holes' sequences. The VRT path keeps emitting the same key subset it always did (no extra_tags / gdal_metadata / resolution tags); aligning that surface with the non-VRT readers is a separate follow-up. Writers stay on the dict for now to keep the diff small and avoid overlap with the in-flight attrs-contract work on the same files (#2133 / #2135 / #2136). The dataclass already exposes the fields the writers need (with_nodata builder, attrs_to_metadata) so the writer-side migration is the natural next PR. Adds 25 tests in test_geotiff_metadata_2139.py covering: - geo_info_to_metadata field-by-field copy for transform / point / no-georef / resolution-unit / colormap cases. - attrs_to_metadata for nodata aliases, CRS-as-WKT-in-crs-field, and the bool-EPSG guard. - Round-trip stability for representative attrs dicts (eager / point / no-georef / user-WKT / VRT with holes). - with_nodata builder semantics. All 4066 geotiff tests pass. --- xrspatial/geotiff/_attrs.py | 336 +++++++++++++---- xrspatial/geotiff/_backends/vrt.py | 85 ++--- .../tests/test_geotiff_metadata_2139.py | 353 ++++++++++++++++++ 3 files changed, 645 insertions(+), 129 deletions(-) create mode 100644 xrspatial/geotiff/tests/test_geotiff_metadata_2139.py diff --git a/xrspatial/geotiff/_attrs.py b/xrspatial/geotiff/_attrs.py index 9f60c31d2..50a6a84d7 100644 --- a/xrspatial/geotiff/_attrs.py +++ b/xrspatial/geotiff/_attrs.py @@ -123,6 +123,8 @@ from __future__ import annotations import warnings +from dataclasses import dataclass, replace +from typing import Any import numpy as np @@ -177,6 +179,249 @@ _RESOLUTION_UNIT_IDS = {'none': 1, 'inch': 2, 'centimeter': 3} +# Reverse map of ``_RESOLUTION_UNIT_IDS``: TIFF ResolutionUnit tag ids back to +# the xrspatial string identifiers. Used by ``geo_info_to_metadata`` so the +# read side stores the same string label the writer expects on the way out. +_RESOLUTION_UNIT_NAMES = {1: 'none', 2: 'inch', 3: 'centimeter'} + + +@dataclass(frozen=True) +class GeoTIFFMetadata: + """Typed internal record for GeoTIFF read/write metadata (issue #2139). + + Mirrors the public attrs contract documented in this module's + docstring field-for-field. Read paths build a ``GeoTIFFMetadata`` + once and call :func:`metadata_to_attrs` at the DataArray-construction + boundary; write paths call :func:`attrs_to_metadata` once at entry + and read fields off the record instead of re-resolving from + ``data.attrs``. + + The public attrs surface is unchanged. The dataclass exists so the + four read paths (eager numpy, dask+numpy, GPU, VRT) and three writers + stop building / parsing the same dict by hand. + """ + + # Spatial reference + transform: tuple | None = None + crs_epsg: int | None = None + crs_wkt: str | None = None + raster_type: str = 'area' + has_georef: bool = True + + # NoData semantics (issue #1988 / #2092) + nodata: Any = None + masked_nodata: bool | None = None + + # Pass-through TIFF tags + extra_tags: list | None = None + image_description: str | None = None + extra_samples: tuple | None = None + colormap: tuple | None = None + + # GDAL_METADATA + gdal_metadata: dict | None = None + gdal_metadata_xml: str | None = None + + # Resolution tags + x_resolution: float | None = None + y_resolution: float | None = None + resolution_unit: str | None = None + + # VRT-only + vrt_holes: list | None = None + + # Contract version stamped on read + contract_version: int = _ATTRS_CONTRACT_VERSION + + def with_nodata(self, nodata, *, masked: bool) -> 'GeoTIFFMetadata': + """Return a copy with ``nodata`` and ``masked_nodata`` set. + + Mirrors :func:`_set_nodata_attrs`: when ``nodata is None`` the + record is returned unchanged so absence keeps signalling "no + declared sentinel"; otherwise ``masked`` is coerced to ``bool`` + and stored as ``masked_nodata``. + """ + if nodata is None: + return self + return replace(self, nodata=nodata, masked_nodata=bool(masked)) + + +def geo_info_to_metadata(geo_info, *, window=None) -> GeoTIFFMetadata: + """Build a :class:`GeoTIFFMetadata` from a reader's ``GeoInfo``. + + Centralises the field-by-field copy previously done inline in + :func:`_populate_attrs_from_geo_info`. ``window`` carries the + ``(r0, c0, r1, c1)`` tuple for windowed reads and is applied to the + emitted transform. + """ + has_georef = getattr(geo_info, 'has_georef', True) + src_t = geo_info.transform + + transform = None + if src_t is not None and has_georef: + transform = _transform_tuple_from_pixel_geometry( + src_t.origin_x, src_t.origin_y, + src_t.pixel_width, src_t.pixel_height, + window=window, + ) + + raster_type = ( + 'point' if geo_info.raster_type == RASTER_PIXEL_IS_POINT else 'area') + + resolution_unit = None + if geo_info.resolution_unit is not None: + resolution_unit = _RESOLUTION_UNIT_NAMES.get( + geo_info.resolution_unit, str(geo_info.resolution_unit)) + + colormap = None + if geo_info.extra_tags is not None: + for _tag_id, _tt, _tc, _tv in geo_info.extra_tags: + if _tag_id == 320: # TAG_COLORMAP + colormap = _tv + break + + return GeoTIFFMetadata( + transform=transform, + crs_epsg=geo_info.crs_epsg, + crs_wkt=geo_info.crs_wkt, + raster_type=raster_type, + has_georef=bool(has_georef and src_t is not None), + extra_tags=geo_info.extra_tags, + image_description=geo_info.image_description, + extra_samples=geo_info.extra_samples, + colormap=colormap, + gdal_metadata=geo_info.gdal_metadata, + gdal_metadata_xml=geo_info.gdal_metadata_xml, + x_resolution=geo_info.x_resolution, + y_resolution=geo_info.y_resolution, + resolution_unit=resolution_unit, + contract_version=_ATTRS_CONTRACT_VERSION, + ) + + +def metadata_to_attrs(md: GeoTIFFMetadata) -> dict: + """Build the public attrs dict from a :class:`GeoTIFFMetadata` record. + + The output is identical to what + :func:`_populate_attrs_from_geo_info` writes today, so the read-path + swap is a behaviour no-op when the source record comes from + :func:`geo_info_to_metadata`. + """ + attrs: dict = {'_xrspatial_geotiff_contract': md.contract_version} + + if md.crs_epsg is not None: + attrs['crs'] = md.crs_epsg + if md.crs_wkt is not None: + attrs['crs_wkt'] = md.crs_wkt + if md.raster_type == 'point': + attrs['raster_type'] = 'point' + + if md.transform is not None and md.has_georef: + attrs['transform'] = md.transform + elif not md.has_georef: + attrs[_NO_GEOREF_KEY] = True + + if md.nodata is not None: + attrs['nodata'] = md.nodata + attrs['masked_nodata'] = bool(md.masked_nodata) + + if md.gdal_metadata is not None: + attrs['gdal_metadata'] = md.gdal_metadata + if md.gdal_metadata_xml is not None: + attrs['gdal_metadata_xml'] = md.gdal_metadata_xml + + if md.extra_tags is not None: + attrs['extra_tags'] = md.extra_tags + if md.image_description is not None: + attrs['image_description'] = md.image_description + if md.extra_samples is not None: + attrs['extra_samples'] = md.extra_samples + + if md.x_resolution is not None: + attrs['x_resolution'] = md.x_resolution + if md.y_resolution is not None: + attrs['y_resolution'] = md.y_resolution + if md.resolution_unit is not None: + attrs['resolution_unit'] = md.resolution_unit + + if md.colormap is not None: + attrs['colormap'] = md.colormap + + if md.vrt_holes: + attrs['vrt_holes'] = md.vrt_holes + + return attrs + + +def attrs_to_metadata(attrs) -> GeoTIFFMetadata: + """Parse a (possibly user-supplied) attrs dict into a metadata record. + + Honours the alias resolution :func:`_resolve_nodata_attr` already + implements. ``attrs`` may be a plain dict, an + ``xarray.core.utils.Frozen``, or ``None``. + """ + if attrs is None: + attrs = {} + + raster_type = 'point' if attrs.get('raster_type') == 'point' else 'area' + + transform = attrs.get('transform') + no_georef = bool(attrs.get(_NO_GEOREF_KEY)) + has_georef = (transform is not None) and not no_georef + + nodata = _resolve_nodata_attr(attrs) + masked_attr = attrs.get('masked_nodata') + masked_nodata: bool | None + if nodata is None: + masked_nodata = None + elif masked_attr is None: + masked_nodata = None + else: + masked_nodata = bool(masked_attr) + + crs_epsg = None + crs_wkt = attrs.get('crs_wkt') if isinstance(attrs.get('crs_wkt'), str) else None + crs_attr = attrs.get('crs') + if isinstance(crs_attr, str): + # ``attrs['crs']`` carries a WKT string under some pipelines; fold + # it into ``crs_wkt`` here so the writer's resolve step does not + # have to re-branch on type. + if crs_wkt is None: + crs_wkt = crs_attr + elif crs_attr is not None and not isinstance(crs_attr, bool): + # ``isinstance(True, int)`` is True; reject bool here so the + # writer's _validate_crs_arg gate is not bypassed at the + # boundary. + try: + crs_epsg = int(crs_attr) + except (TypeError, ValueError): + crs_epsg = None + + contract_version = attrs.get( + '_xrspatial_geotiff_contract', _ATTRS_CONTRACT_VERSION) + + return GeoTIFFMetadata( + transform=tuple(transform) if transform is not None else None, + crs_epsg=crs_epsg, + crs_wkt=crs_wkt, + raster_type=raster_type, + has_georef=has_georef, + nodata=nodata, + masked_nodata=masked_nodata, + extra_tags=attrs.get('extra_tags'), + image_description=attrs.get('image_description'), + extra_samples=attrs.get('extra_samples'), + colormap=attrs.get('colormap'), + gdal_metadata=attrs.get('gdal_metadata'), + gdal_metadata_xml=attrs.get('gdal_metadata_xml'), + x_resolution=attrs.get('x_resolution'), + y_resolution=attrs.get('y_resolution'), + resolution_unit=attrs.get('resolution_unit'), + vrt_holes=attrs.get('vrt_holes'), + contract_version=contract_version, + ) + + def _extent_to_window(transform, file_height, file_width, y_min, y_max, x_min, x_max): """Convert geographic extent to pixel window (row_start, col_start, row_stop, col_stop). @@ -354,89 +599,14 @@ def _populate_attrs_from_geo_info(attrs: dict, geo_info, *, window=None) -> None overwritten with the current ``_ATTRS_CONTRACT_VERSION``; callers pass freshly built dicts, so this is the intended behaviour. """ - # Stamp the contract version first so every read path that funnels - # through this helper carries the marker. The VRT backends build - # their attrs dict directly and stamp the version there (see - # ``_backends/vrt.py``); keep both sites in sync via the constant - # rather than the bare literal. - attrs['_xrspatial_geotiff_contract'] = _ATTRS_CONTRACT_VERSION - - if geo_info.crs_epsg is not None: - attrs['crs'] = geo_info.crs_epsg - if geo_info.crs_wkt is not None: - attrs['crs_wkt'] = geo_info.crs_wkt - if geo_info.raster_type == RASTER_PIXEL_IS_POINT: - attrs['raster_type'] = 'point' - - src_t = geo_info.transform - # Skip the transform attr for files where no GeoTIFF transform tags - # (ModelTransformation, ModelPixelScale, or ModelTiepoint) are - # present, signalled by ``has_georef=False``. GeoKeys / CRS metadata - # can still be present in that case. The default unit - # ``GeoTransform`` is a struct placeholder, not real georef -- - # emitting it leaks an identity transform into attrs and confuses - # downstream code that expects ``'transform' in attrs`` to mean - # "this raster has a georef transform" (#1710). - has_georef = getattr(geo_info, 'has_georef', True) - if src_t is not None and has_georef: - attrs['transform'] = _transform_tuple_from_pixel_geometry( - src_t.origin_x, src_t.origin_y, - src_t.pixel_width, src_t.pixel_height, - window=window, - ) - else: - # Stamp the no-georef marker so the writer can tell our - # placeholder int64 step-1 coords apart from a user-authored - # integer-coord grid (#2120). Pre-#2120 the writer detected the - # placeholder by shape alone, which silently stripped georef - # from any user array whose coords matched the same arange - # pattern. - attrs[_NO_GEOREF_KEY] = True - - # Contract v2 (issue #2016) removed the 13 secondary GeoKey-derived - # attrs that v1 emitted under a ``DeprecationWarning`` (``crs_name``, - # ``geog_citation``, ``datum_code``, ``angular_units``, - # ``semi_major_axis``, ``inv_flattening``, ``linear_units``, - # ``projection_code``, ``vertical_crs``, ``vertical_citation``, - # ``vertical_units``). The underlying ``GeoInfo`` fields are still - # populated by the GeoKey parser because ``_synthesize_user_defined_wkt`` - # consumes ``geog_citation`` (and siblings) to fill ``crs_wkt`` for - # user-defined CRSes; the reader no longer surfaces them as - # separate user-visible attrs. - - if geo_info.gdal_metadata is not None: - attrs['gdal_metadata'] = geo_info.gdal_metadata - if geo_info.gdal_metadata_xml is not None: - attrs['gdal_metadata_xml'] = geo_info.gdal_metadata_xml - - if geo_info.extra_tags is not None: - attrs['extra_tags'] = geo_info.extra_tags - if geo_info.image_description is not None: - attrs['image_description'] = geo_info.image_description - if geo_info.extra_samples is not None: - attrs['extra_samples'] = geo_info.extra_samples - - if geo_info.x_resolution is not None: - attrs['x_resolution'] = geo_info.x_resolution - if geo_info.y_resolution is not None: - attrs['y_resolution'] = geo_info.y_resolution - if geo_info.resolution_unit is not None: - _unit_names = {1: 'none', 2: 'inch', 3: 'centimeter'} - attrs['resolution_unit'] = _unit_names.get( - geo_info.resolution_unit, str(geo_info.resolution_unit)) - - # Contract v2 (issue #2016) removed ``attrs['cmap']`` and - # ``attrs['colormap_rgba']``. The canonical ``attrs['colormap']`` - # (raw uint16 RGB triples from TIFF tag 320) is still emitted below - # via the ``extra_tags`` scan; callers that need an RGBA palette or - # a :class:`matplotlib.colors.ListedColormap` should build one from - # ``attrs['colormap']`` directly. - - if geo_info.extra_tags is not None: - for _tag_id, _tt, _tc, _tv in geo_info.extra_tags: - if _tag_id == 320: # TAG_COLORMAP - attrs['colormap'] = _tv - break + # Compatibility shim: build a typed :class:`GeoTIFFMetadata` once + # and fold it into the caller's dict. The two routes + # (:func:`metadata_to_attrs` and the legacy field-by-field writes) + # produce the same attrs surface; centralising on the record lets + # the VRT path emit the same field set without copying this block. + # See issue #2139 / ``metadata_to_attrs``. + md = geo_info_to_metadata(geo_info, window=window) + attrs.update(metadata_to_attrs(md)) def _resolve_nodata_attr(attrs: dict): diff --git a/xrspatial/geotiff/_backends/vrt.py b/xrspatial/geotiff/_backends/vrt.py index 0156ba8f0..9802917ac 100644 --- a/xrspatial/geotiff/_backends/vrt.py +++ b/xrspatial/geotiff/_backends/vrt.py @@ -13,12 +13,15 @@ import numpy as np import xarray as xr -from .._attrs import _ATTRS_CONTRACT_VERSION, _set_nodata_attrs +from .._attrs import ( + GeoTIFFMetadata, + _set_nodata_attrs, + metadata_to_attrs, +) from .._coords import ( coords_from_pixel_geometry as _coords_from_pixel_geometry, transform_tuple_from_pixel_geometry as _transform_tuple_from_pixel_geometry, ) -from .._geotags import _NO_GEOREF_KEY from .._crs import _wkt_to_epsg from .._validation import _validate_chunks_arg, _validate_dtype_cast @@ -269,35 +272,26 @@ def read_vrt(source: str, *, else: coords = {} - # VRT builds its attrs dict inline rather than going through - # ``_populate_attrs_from_geo_info``; stamp the contract version here - # so both code paths emit the same marker. - attrs = {'_xrspatial_geotiff_contract': _ATTRS_CONTRACT_VERSION} - if gt is None: - # Mirror the eager non-VRT no-georef path: stamp the no-georef - # marker whenever the source carries no transform. The current - # VRT no-transform branch emits empty coords so the writer has - # nothing to misinterpret, but stamping defensively keeps the - # contract consistent if a future change adds placeholder - # coords here. See issue #2120. - attrs[_NO_GEOREF_KEY] = True - if vrt.crs_wkt: - epsg = _wkt_to_epsg(vrt.crs_wkt) - if epsg is not None: - attrs['crs'] = epsg - attrs['crs_wkt'] = vrt.crs_wkt - if vrt.raster_type == 'point': - attrs['raster_type'] = 'point' - # Surface skipped-source records as ``attrs['vrt_holes']`` so - # callers can detect a partial mosaic by attribute lookup. Under - # lenient mode (the default), the underlying ``_vrt.read_vrt`` - # already warned per skipped source but the warning is easy to - # miss in a pipeline; the attr lets downstream code branch on - # ``"vrt_holes" in da.attrs`` instead of monitoring the warnings - # stream. Empty list is omitted so the attr only appears when - # there is actually a hole. See issue #1734. - if vrt.holes: - attrs['vrt_holes'] = list(vrt.holes) + # Build the VRT attrs surface via :class:`GeoTIFFMetadata` so the + # eager VRT, chunked VRT, and non-VRT read paths share one + # field-set-to-attrs marshalling step. The VRT path intentionally + # populates only the subset of fields it owns (CRS, transform, + # raster_type, no-georef marker, vrt_holes); ``extra_tags``, + # ``gdal_metadata`` and resolution tags continue to be omitted on + # this path until the migration's VRT-tag-coverage follow-up lands. + # See issue #2139. + _vrt_epsg = _wkt_to_epsg(vrt.crs_wkt) if vrt.crs_wkt else None + _vrt_md = GeoTIFFMetadata( + crs_epsg=_vrt_epsg, + crs_wkt=vrt.crs_wkt or None, + raster_type='point' if vrt.raster_type == 'point' else 'area', + has_georef=gt is not None, + # Surface skipped-source records as ``attrs['vrt_holes']`` so + # callers can detect a partial mosaic by attribute lookup. See + # issue #1734. + vrt_holes=list(vrt.holes) if vrt.holes else None, + ) + attrs = metadata_to_attrs(_vrt_md) # When a specific band is selected, source its nodata from that # band's instead of band 0's. Otherwise multi-band # VRTs with per-band sentinels would mis-mask the read: attrs would @@ -691,10 +685,8 @@ def _read_vrt_chunked(source, *, window, band, name, chunks, gpu, dtype, # eager reads share the same x/y arrays. gt = vrt.geo_transform coords = {} - # Mirrors the eager VRT branch: this code path bypasses - # ``_populate_attrs_from_geo_info``, so the contract version is - # stamped inline using the shared constant to stay in lockstep. - attrs = {'_xrspatial_geotiff_contract': _ATTRS_CONTRACT_VERSION} + # Build attrs via :class:`GeoTIFFMetadata` to share the marshalling + # step with the eager VRT branch. See issue #2139. if gt is not None: origin_x, res_x, _, origin_y, _, res_y = gt coord_window = (win_r0, win_c0, win_r0 + full_h, win_c0 + full_w) @@ -703,21 +695,22 @@ def _read_vrt_chunked(source, *, window, band, name, chunks, gpu, dtype, is_point=vrt.raster_type == 'point', window=coord_window, ) - attrs['transform'] = _transform_tuple_from_pixel_geometry( + _vrt_transform = _transform_tuple_from_pixel_geometry( origin_x, origin_y, res_x, res_y, window=(win_r0, win_c0, 0, 0), ) else: - # Defensive marker (issue #2120). See the eager VRT branch. - attrs[_NO_GEOREF_KEY] = True - - if vrt.crs_wkt: - epsg = _wkt_to_epsg(vrt.crs_wkt) - if epsg is not None: - attrs['crs'] = epsg - attrs['crs_wkt'] = vrt.crs_wkt - if vrt.raster_type == 'point': - attrs['raster_type'] = 'point' + _vrt_transform = None + + _vrt_epsg = _wkt_to_epsg(vrt.crs_wkt) if vrt.crs_wkt else None + _vrt_md = GeoTIFFMetadata( + transform=_vrt_transform, + crs_epsg=_vrt_epsg, + crs_wkt=vrt.crs_wkt or None, + raster_type='point' if vrt.raster_type == 'point' else 'area', + has_georef=gt is not None, + ) + attrs = metadata_to_attrs(_vrt_md) # Surface the nodata sentinel for the selected band. The chunked # path declares ``float64`` up front whenever any selected band has diff --git a/xrspatial/geotiff/tests/test_geotiff_metadata_2139.py b/xrspatial/geotiff/tests/test_geotiff_metadata_2139.py new file mode 100644 index 000000000..495b65f6a --- /dev/null +++ b/xrspatial/geotiff/tests/test_geotiff_metadata_2139.py @@ -0,0 +1,353 @@ +"""Round-trip tests for the internal ``GeoTIFFMetadata`` dataclass. + +Issue #2139. + +The dataclass and the two boundary functions +(``metadata_to_attrs`` and ``attrs_to_metadata``) replace the manual +``attrs[...] = ...`` blocks scattered across the four read paths and +three writers. The public attrs surface does not change; what changes +is that every backend now goes through one marshalling step. + +These tests pin two invariants: + +* ``metadata_to_attrs(geo_info_to_metadata(...))`` emits the same key + set the legacy ``_populate_attrs_from_geo_info`` writes. +* ``metadata_to_attrs(attrs_to_metadata(x))`` is a stable round trip + for representative attrs dicts (eager numpy, dask, GPU, VRT, + no-georef, point raster, user-defined CRS WKT, with-nodata). +""" +from __future__ import annotations + +import numpy as np +import pytest + +from xrspatial.geotiff._attrs import ( + GeoTIFFMetadata, + _ATTRS_CONTRACT_VERSION, + attrs_to_metadata, + geo_info_to_metadata, + metadata_to_attrs, +) +from xrspatial.geotiff._geotags import _NO_GEOREF_KEY + + +# --------------------------------------------------------------------------- +# Builders +# --------------------------------------------------------------------------- + + +class _FakeTransform: + """Stand-in for ``GeoInfo.transform`` used by the reader.""" + + def __init__(self, origin_x=0.0, origin_y=0.0, + pixel_width=1.0, pixel_height=-1.0): + self.origin_x = origin_x + self.origin_y = origin_y + self.pixel_width = pixel_width + self.pixel_height = pixel_height + + +class _FakeGeoInfo: + """Minimal ``GeoInfo`` stand-in covering the fields the reader emits.""" + + def __init__( + self, + *, + transform=None, + crs_epsg=None, + crs_wkt=None, + raster_type=1, # RASTER_PIXEL_IS_AREA + has_georef=True, + nodata=None, + extra_tags=None, + image_description=None, + extra_samples=None, + gdal_metadata=None, + gdal_metadata_xml=None, + x_resolution=None, + y_resolution=None, + resolution_unit=None, + ): + self.transform = transform + self.crs_epsg = crs_epsg + self.crs_wkt = crs_wkt + self.raster_type = raster_type + self.has_georef = has_georef + self.nodata = nodata + self.extra_tags = extra_tags + self.image_description = image_description + self.extra_samples = extra_samples + self.gdal_metadata = gdal_metadata + self.gdal_metadata_xml = gdal_metadata_xml + self.x_resolution = x_resolution + self.y_resolution = y_resolution + self.resolution_unit = resolution_unit + + +# --------------------------------------------------------------------------- +# geo_info_to_metadata -> metadata_to_attrs +# --------------------------------------------------------------------------- + + +def test_geo_info_to_metadata_stamps_contract_version(): + md = geo_info_to_metadata(_FakeGeoInfo()) + assert md.contract_version == _ATTRS_CONTRACT_VERSION + attrs = metadata_to_attrs(md) + assert attrs['_xrspatial_geotiff_contract'] == _ATTRS_CONTRACT_VERSION + + +def test_geo_info_to_metadata_transform_present(): + gi = _FakeGeoInfo( + transform=_FakeTransform(origin_x=1.0, origin_y=10.0, + pixel_width=0.5, pixel_height=-0.5), + crs_epsg=4326, crs_wkt='WKT-CRS', + ) + md = geo_info_to_metadata(gi) + assert md.has_georef is True + assert md.transform is not None + assert md.crs_epsg == 4326 + assert md.crs_wkt == 'WKT-CRS' + + attrs = metadata_to_attrs(md) + assert 'transform' in attrs + assert attrs['crs'] == 4326 + assert attrs['crs_wkt'] == 'WKT-CRS' + assert _NO_GEOREF_KEY not in attrs + + +def test_geo_info_to_metadata_no_georef_marker(): + # No transform tags -> has_georef=False on the GeoInfo + gi = _FakeGeoInfo(transform=None, has_georef=False) + md = geo_info_to_metadata(gi) + assert md.has_georef is False + assert md.transform is None + + attrs = metadata_to_attrs(md) + assert 'transform' not in attrs + assert attrs[_NO_GEOREF_KEY] is True + + +def test_geo_info_to_metadata_point_raster(): + gi = _FakeGeoInfo(transform=_FakeTransform(), raster_type=2) # POINT + md = geo_info_to_metadata(gi) + assert md.raster_type == 'point' + + attrs = metadata_to_attrs(md) + assert attrs['raster_type'] == 'point' + + +def test_geo_info_to_metadata_resolution_unit_label(): + gi = _FakeGeoInfo( + transform=_FakeTransform(), + x_resolution=300.0, y_resolution=300.0, resolution_unit=2, + ) + md = geo_info_to_metadata(gi) + assert md.resolution_unit == 'inch' + + attrs = metadata_to_attrs(md) + assert attrs['resolution_unit'] == 'inch' + assert attrs['x_resolution'] == 300.0 + assert attrs['y_resolution'] == 300.0 + + +def test_geo_info_to_metadata_colormap_from_extra_tags(): + extra_tags = [(320, 3, 6, (10, 20, 30, 40, 50, 60))] + gi = _FakeGeoInfo(transform=_FakeTransform(), extra_tags=extra_tags) + md = geo_info_to_metadata(gi) + assert md.colormap == (10, 20, 30, 40, 50, 60) + attrs = metadata_to_attrs(md) + assert attrs['colormap'] == (10, 20, 30, 40, 50, 60) + + +# --------------------------------------------------------------------------- +# attrs_to_metadata: parse user/reader attrs back to a record +# --------------------------------------------------------------------------- + + +def test_attrs_to_metadata_none_attrs_returns_default_record(): + md = attrs_to_metadata(None) + assert isinstance(md, GeoTIFFMetadata) + assert md.transform is None + assert md.crs_epsg is None + assert md.crs_wkt is None + assert md.nodata is None + + +def test_attrs_to_metadata_empty_dict(): + md = attrs_to_metadata({}) + assert md.transform is None + assert md.has_georef is False # no transform -> not georeferenced + assert md.raster_type == 'area' + + +def test_attrs_to_metadata_crs_int_epsg(): + md = attrs_to_metadata({'crs': 4326, 'crs_wkt': 'WKT-X'}) + assert md.crs_epsg == 4326 + assert md.crs_wkt == 'WKT-X' + + +def test_attrs_to_metadata_crs_wkt_string_in_crs_field(): + # Some upstream pipelines stash a WKT string in attrs['crs'] + md = attrs_to_metadata({'crs': 'PROJCS["X"...]'}) + assert md.crs_epsg is None + assert md.crs_wkt == 'PROJCS["X"...]' + + +def test_attrs_to_metadata_crs_bool_rejected_at_boundary(): + # attrs={'crs': True} previously round-tripped as EPSG=1 in some + # writers; the boundary parser refuses to coerce booleans here. + md = attrs_to_metadata({'crs': True}) + assert md.crs_epsg is None + + +def test_attrs_to_metadata_nodata_canonical(): + md = attrs_to_metadata({'nodata': -9999, 'masked_nodata': True}) + assert md.nodata == -9999 + assert md.masked_nodata is True + + +def test_attrs_to_metadata_nodata_alias_nodatavals(): + md = attrs_to_metadata({'nodatavals': (-9999,)}) + assert md.nodata == -9999 + # No masked_nodata flag on the input -> None on the record + assert md.masked_nodata is None + + +def test_attrs_to_metadata_nodata_alias_fill_value(): + md = attrs_to_metadata({'_FillValue': -1}) + assert md.nodata == -1 + + +def test_attrs_to_metadata_no_georef_marker_clears_has_georef(): + md = attrs_to_metadata({_NO_GEOREF_KEY: True}) + assert md.has_georef is False + + +def test_attrs_to_metadata_transform_present_implies_georef(): + md = attrs_to_metadata({'transform': (1.0, 0.0, 0.0, 0.0, -1.0, 10.0)}) + assert md.has_georef is True + assert md.transform == (1.0, 0.0, 0.0, 0.0, -1.0, 10.0) + + +# --------------------------------------------------------------------------- +# Round-trip metadata_to_attrs <-> attrs_to_metadata +# --------------------------------------------------------------------------- + + +def _representative_attrs_dicts(): + """Yield representative attrs dicts that the round trip must preserve. + + Each entry mirrors a real backend's emit set: + + * eager numpy file (CRS + transform + nodata + masked) + * point raster + * no-georef file + * user-defined CRS (WKT only, no EPSG) + * VRT with holes + """ + yield { + '_xrspatial_geotiff_contract': _ATTRS_CONTRACT_VERSION, + 'crs': 4326, + 'crs_wkt': 'WKT-4326', + 'transform': (1.0, 0.0, 0.0, 0.0, -1.0, 10.0), + 'nodata': -9999, + 'masked_nodata': True, + } + yield { + '_xrspatial_geotiff_contract': _ATTRS_CONTRACT_VERSION, + 'crs': 4326, + 'crs_wkt': 'WKT-4326', + 'transform': (1.0, 0.0, 0.0, 0.0, -1.0, 10.0), + 'raster_type': 'point', + } + yield { + '_xrspatial_geotiff_contract': _ATTRS_CONTRACT_VERSION, + _NO_GEOREF_KEY: True, + } + yield { + '_xrspatial_geotiff_contract': _ATTRS_CONTRACT_VERSION, + 'crs_wkt': 'PROJCS["user defined"...]', + 'transform': (1.0, 0.0, 0.0, 0.0, -1.0, 10.0), + } + yield { + '_xrspatial_geotiff_contract': _ATTRS_CONTRACT_VERSION, + 'crs': 32610, + 'crs_wkt': 'WKT-32610', + 'transform': (10.0, 0.0, 500000.0, 0.0, -10.0, 4000000.0), + 'vrt_holes': [{'source': '/tmp/missing.tif', 'band': 1}], + } + + +@pytest.mark.parametrize( + 'attrs', + list(_representative_attrs_dicts()), + ids=['eager', 'point', 'no_georef', 'user_wkt', 'vrt'], +) +def test_round_trip_attrs_to_metadata_to_attrs(attrs): + """``metadata_to_attrs(attrs_to_metadata(x))`` preserves every key in x.""" + md = attrs_to_metadata(attrs) + round_tripped = metadata_to_attrs(md) + for key, value in attrs.items(): + assert key in round_tripped, f"key {key!r} dropped on round trip" + assert round_tripped[key] == value, ( + f"value at {key!r} changed: {value!r} -> {round_tripped[key]!r}" + ) + + +def test_round_trip_metadata_to_attrs_to_metadata(): + """Building a record, marshalling, parsing back recovers all fields.""" + md = GeoTIFFMetadata( + transform=(1.0, 0.0, 0.0, 0.0, -1.0, 10.0), + crs_epsg=4326, + crs_wkt='WKT-4326', + raster_type='point', + has_georef=True, + nodata=-9999, + masked_nodata=True, + x_resolution=300.0, y_resolution=300.0, resolution_unit='inch', + ) + attrs = metadata_to_attrs(md) + md2 = attrs_to_metadata(attrs) + + assert md2.transform == md.transform + assert md2.crs_epsg == md.crs_epsg + assert md2.crs_wkt == md.crs_wkt + assert md2.raster_type == md.raster_type + assert md2.has_georef == md.has_georef + assert md2.nodata == md.nodata + assert md2.masked_nodata == md.masked_nodata + assert md2.x_resolution == md.x_resolution + assert md2.y_resolution == md.y_resolution + assert md2.resolution_unit == md.resolution_unit + + +# --------------------------------------------------------------------------- +# with_nodata builder +# --------------------------------------------------------------------------- + + +def test_with_nodata_sets_pair(): + md = GeoTIFFMetadata() + md2 = md.with_nodata(-9999, masked=True) + assert md2.nodata == -9999 + assert md2.masked_nodata is True + # frozen dataclass returns a new instance + assert md.nodata is None + assert md is not md2 + + +def test_with_nodata_none_returns_unchanged(): + md = GeoTIFFMetadata(crs_epsg=4326) + md2 = md.with_nodata(None, masked=True) + # ``None`` mirrors ``_set_nodata_attrs`` no-op contract; record + # comes back equal so absence keeps signalling "no declared + # sentinel." + assert md2.nodata is None + assert md2.masked_nodata is None + assert md2.crs_epsg == 4326 + + +def test_with_nodata_masked_false_records_false(): + md = GeoTIFFMetadata() + md2 = md.with_nodata(-9999, masked=False) + assert md2.masked_nodata is False From 5161385d3f96d9f34b91d93fabfb62c758d759c1 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Tue, 19 May 2026 11:58:41 -0700 Subject: [PATCH 2/2] Address review nits and suggestions (#2139) * Document the lenient-parse / writer-validates contract on attrs_to_metadata so the dropped-bool-EPSG and unchecked-tuple transform are obvious as intentional boundary behaviour. * Annotate the three-state branch in metadata_to_attrs so the transform-less-but-georef'd case (eager VRT path) reads as intentional rather than a missed branch. * Comment the empty-string crs_wkt normalisation in the VRT builder. * Derive _RESOLUTION_UNIT_NAMES from _RESOLUTION_UNIT_IDS so the forward and reverse maps cannot drift. * Add test_round_trip_emits_no_unexpected_keys to lock the marshalling step against silent field additions slipping in unannounced. All 4071 geotiff tests pass. --- xrspatial/geotiff/_attrs.py | 30 ++++++++++++++++++- xrspatial/geotiff/_backends/vrt.py | 6 ++++ .../tests/test_geotiff_metadata_2139.py | 23 ++++++++++++++ 3 files changed, 58 insertions(+), 1 deletion(-) diff --git a/xrspatial/geotiff/_attrs.py b/xrspatial/geotiff/_attrs.py index 50a6a84d7..701c32087 100644 --- a/xrspatial/geotiff/_attrs.py +++ b/xrspatial/geotiff/_attrs.py @@ -182,7 +182,9 @@ # Reverse map of ``_RESOLUTION_UNIT_IDS``: TIFF ResolutionUnit tag ids back to # the xrspatial string identifiers. Used by ``geo_info_to_metadata`` so the # read side stores the same string label the writer expects on the way out. -_RESOLUTION_UNIT_NAMES = {1: 'none', 2: 'inch', 3: 'centimeter'} +# Derived from ``_RESOLUTION_UNIT_IDS`` so the forward and reverse maps cannot +# drift if a new unit is added. +_RESOLUTION_UNIT_NAMES = {v: k for k, v in _RESOLUTION_UNIT_IDS.items()} @dataclass(frozen=True) @@ -316,6 +318,19 @@ def metadata_to_attrs(md: GeoTIFFMetadata) -> dict: if md.raster_type == 'point': attrs['raster_type'] = 'point' + # Three states on the (transform, has_georef) pair: + # + # * transform set + has_georef -> emit ``attrs['transform']``. + # * not has_georef -> emit ``attrs[_NO_GEOREF_KEY] = True``. + # * transform None + has_georef -> emit neither. + # + # The third state is the eager VRT path's contract: the VRT builder + # constructs the record with ``has_georef=True`` but leaves + # ``transform=None`` because the inline VRT code stamps the + # rasterio-ordered transform tuple onto the dict a few lines later + # (after the GPU transfer / dtype cast / nodata mask steps). Removing + # this branch would re-introduce a duplicate transform write or, worse, + # would emit ``_NO_GEOREF_KEY`` on a georef'd VRT array. See #2139. if md.transform is not None and md.has_georef: attrs['transform'] = md.transform elif not md.has_georef: @@ -359,6 +374,19 @@ def attrs_to_metadata(attrs) -> GeoTIFFMetadata: Honours the alias resolution :func:`_resolve_nodata_attr` already implements. ``attrs`` may be a plain dict, an ``xarray.core.utils.Frozen``, or ``None``. + + Boundary contract -- this function parses leniently and lets the + writer enforce strict validation against the parsed record: + + * ``attrs['crs']=True`` lands as ``crs_epsg=None`` rather than + raising. ``_validate_crs_arg`` (called from the writers) is the + validator that should reject bool values; the boundary parser + only needs to keep the bad value out of the record so the writer + sees ``crs_epsg=None`` and falls through to ``crs_wkt``. See + ``test_crs_arg_validation_1971.py``. + * ``transform`` is coerced via ``tuple(...)`` with no length or + numeric-type check. ``_transform_from_attr`` is the canonical + validator and runs inside the writer. """ if attrs is None: attrs = {} diff --git a/xrspatial/geotiff/_backends/vrt.py b/xrspatial/geotiff/_backends/vrt.py index 9802917ac..8f56f91b5 100644 --- a/xrspatial/geotiff/_backends/vrt.py +++ b/xrspatial/geotiff/_backends/vrt.py @@ -280,6 +280,9 @@ def read_vrt(source: str, *, # ``gdal_metadata`` and resolution tags continue to be omitted on # this path until the migration's VRT-tag-coverage follow-up lands. # See issue #2139. + # ``vrt.crs_wkt`` carries an empty string when the VRT XML has a + # ```` element but no recognised CRS body; treat empty as + # absent so ``metadata_to_attrs`` does not emit ``attrs['crs_wkt']=''``. _vrt_epsg = _wkt_to_epsg(vrt.crs_wkt) if vrt.crs_wkt else None _vrt_md = GeoTIFFMetadata( crs_epsg=_vrt_epsg, @@ -702,6 +705,9 @@ def _read_vrt_chunked(source, *, window, band, name, chunks, gpu, dtype, else: _vrt_transform = None + # ``vrt.crs_wkt`` carries an empty string when the VRT XML has a + # ```` element but no recognised CRS body; treat empty as + # absent so ``metadata_to_attrs`` does not emit ``attrs['crs_wkt']=''``. _vrt_epsg = _wkt_to_epsg(vrt.crs_wkt) if vrt.crs_wkt else None _vrt_md = GeoTIFFMetadata( transform=_vrt_transform, diff --git a/xrspatial/geotiff/tests/test_geotiff_metadata_2139.py b/xrspatial/geotiff/tests/test_geotiff_metadata_2139.py index 495b65f6a..b64e3fb8e 100644 --- a/xrspatial/geotiff/tests/test_geotiff_metadata_2139.py +++ b/xrspatial/geotiff/tests/test_geotiff_metadata_2139.py @@ -294,6 +294,29 @@ def test_round_trip_attrs_to_metadata_to_attrs(attrs): ) +@pytest.mark.parametrize( + 'attrs', + list(_representative_attrs_dicts()), + ids=['eager', 'point', 'no_georef', 'user_wkt', 'vrt'], +) +def test_round_trip_emits_no_unexpected_keys(attrs): + """Round trip emits the input keys and the contract version, nothing else. + + Locks the marshalling step so a future field added to + ``metadata_to_attrs`` without an attr-contract update fails this + test rather than silently appearing on every read. + """ + md = attrs_to_metadata(attrs) + round_tripped = metadata_to_attrs(md) + expected_keys = set(attrs.keys()) | {'_xrspatial_geotiff_contract'} + extra = set(round_tripped) - expected_keys + assert not extra, ( + f"metadata_to_attrs emitted unexpected keys {extra!r} not present in input " + f"{set(attrs)!r}; either add them to the attrs contract or fix " + f"the marshalling step." + ) + + def test_round_trip_metadata_to_attrs_to_metadata(): """Building a record, marshalling, parsing back recovers all fields.""" md = GeoTIFFMetadata(