diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index dd82d693..67a5fade 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -3919,14 +3919,33 @@ def _sentinel_for_dtype(nodata_val, dtype): dtype, out-of-range, non-finite, or fractional). Mirrors the gating PR #1583 added to other read paths via ``_int_nodata_in_range``. + + A plain Python ``int`` ``nodata_val`` is handled without going + through ``float`` first, so 64-bit sentinels such as + ``2**64 - 1`` (``UInt64`` max) and ``-2**63`` (``Int64`` min) + round-trip without the float64 rounding that pushes them just + past the dtype's representable range. ``_parse_band_nodata`` + in ``_vrt.py`` parses integer-band ```` directly + as ``int`` to feed this path. See issue #1783 follow-up. """ if nodata_val is None or dtype.kind not in ('u', 'i'): return None + info = np.iinfo(dtype) + # Fast/exact path: ``nodata_val`` is already an integer. Avoids + # the float64 round-trip that loses precision near the int64 / + # uint64 extremes. ``bool`` is a subclass of ``int`` -- treat + # True/False as a 1/0 sentinel rather than rejecting outright, + # matching the existing ``int(float(...))`` behaviour. + if isinstance(nodata_val, (int, np.integer)) and not isinstance( + nodata_val, bool): + nodata_int = int(nodata_val) + if info.min <= nodata_int <= info.max: + return dtype.type(nodata_int) + return None try: nodata_f = float(nodata_val) except (TypeError, ValueError): return None - info = np.iinfo(dtype) if not (np.isfinite(nodata_f) and nodata_f.is_integer() and info.min <= nodata_f <= info.max): return None diff --git a/xrspatial/geotiff/_vrt.py b/xrspatial/geotiff/_vrt.py index 2e0cb7e1..1072a25e 100644 --- a/xrspatial/geotiff/_vrt.py +++ b/xrspatial/geotiff/_vrt.py @@ -5,10 +5,12 @@ """ from __future__ import annotations +import math import os import struct import zlib from dataclasses import dataclass, field +from typing import Union from xml.sax.saxutils import escape as _xml_escape, quoteattr as _xml_quoteattr import numpy as np @@ -115,16 +117,26 @@ def _xml_attr(value) -> str: return '""' return _xml_quoteattr(str(value)) -# Lazy imports to avoid circular dependency +# Mapping from GDAL VRT dataType names to NumPy dtypes. +# +# ``Int8`` is the GDAL 3.7+ signed-byte name. ``UInt64`` / ``Int64`` are +# the GDAL 3.5+ 64-bit integer names; both round-trip losslessly through +# numpy's native ``uint64`` / ``int64``. Complex types (``CInt16``, +# ``CInt32``, ``CFloat32``, ``CFloat64``) are deliberately absent: the +# reader has no complex-data code path, so an unknown ``dataType`` raises +# in :func:`parse_vrt` rather than silently dropping the imaginary +# component by falling back to ``Float32``. See issue #1783. _DTYPE_MAP = { 'Byte': np.uint8, + 'Int8': np.int8, 'UInt16': np.uint16, 'Int16': np.int16, 'UInt32': np.uint32, 'Int32': np.int32, + 'UInt64': np.uint64, + 'Int64': np.int64, 'Float32': np.float32, 'Float64': np.float64, - 'Int8': np.int8, } @@ -144,7 +156,11 @@ class _Source: band: int # 1-based src_rect: _Rect dst_rect: _Rect - nodata: float | None = None + # Parsed from ```` on the source. ``int`` for integer-dtype + # bands so 64-bit sentinels survive without float64 rounding (see + # :func:`_parse_band_nodata`); ``float`` for floating-dtype bands + # and the legacy fallback. See issue #1783 follow-up. + nodata: Union[int, float, None] = None # ComplexSource extras scale: float | None = None offset: float | None = None @@ -165,7 +181,11 @@ class _VRTBand: """A single band in a VRT dataset.""" band_num: int # 1-based dtype: np.dtype - nodata: float | None = None + # Parsed from ````. ``int`` for integer-dtype bands so + # 64-bit sentinels (``2**64 - 1``, ``INT64_MIN``) survive without + # float64 rounding; ``float`` for floating-dtype bands. See + # :func:`_parse_band_nodata` and the issue #1783 follow-up. + nodata: Union[int, float, None] = None sources: list[_Source] = field(default_factory=list) color_interp: str | None = None @@ -213,6 +233,67 @@ def _text(elem, tag, default=None): return default +def _parse_band_nodata(text: str | None, + dtype: np.dtype) -> Union[int, float, None]: + """Parse a VRT ```` / ```` string for a given band dtype. + + Returns ``None`` when ``text`` is empty/None. + + For floating dtypes the value is parsed as ``float`` (so ``nan`` / + ``inf`` and scientific notation work as before). + + For integer dtypes the value is parsed as ``int`` whenever possible so + that 64-bit sentinels (``2**64 - 1`` for ``UInt64``, ``INT64_MIN`` for + ``Int64``) round-trip exactly rather than collapsing through a float64 + that cannot represent them. GDAL is happy to write integer nodata in + scientific notation or with a trailing ``.0``; if a direct ``int()`` + fails we fall back to ``int(float(text))`` but only accept that result + when the float is finite, integer-valued, and representable in the + target dtype. Out-of-range / non-integer values for an integer dtype + fall back to the parsed ``float`` (matching the eager-TIFF reader's + behaviour in :func:`_resolve_masked_fill`: the sentinel can never + match a real pixel, but ``attrs['nodata']`` still surfaces the raw + value for write round-trip). See issue #1783 follow-up. + """ + if text is None: + return None + text = text.strip() + if not text: + return None + + if np.issubdtype(dtype, np.integer): + # Try the cheap path first: a plain integer literal (decimal, + # binary, hex with explicit base would be unusual but ``int()`` + # without ``base`` rejects anything other than base-10 numeric + # strings, which is what GDAL writes). + try: + return int(text) + except ValueError: + pass + # Fall back: scientific notation or trailing ``.0``. Only accept + # when the float is exactly integer-valued and fits the dtype, so + # we never silently truncate a fractional value or wrap a + # sentinel that overflows the band's dtype. + try: + v = float(text) + except (TypeError, ValueError): + return None + if not math.isfinite(v) or not v.is_integer(): + return v + info = np.iinfo(dtype) + nodata_int = int(v) + if info.min <= nodata_int <= info.max: + return nodata_int + return v + + # Floating / other dtypes: parse as float (preserves NaN, Inf, + # scientific notation). + try: + return float(text) + except (TypeError, ValueError): + return None + + def parse_vrt(xml_str: str, vrt_dir: str = '.') -> VRTDataset: """Parse a VRT XML string into a VRTDataset. @@ -277,10 +358,35 @@ def parse_vrt(xml_str: str, vrt_dir: str = '.') -> VRTDataset: bands = [] for band_elem in root.findall('VRTRasterBand'): band_num = int(band_elem.get('band', 1)) - dtype_name = band_elem.get('dataType', 'Float32') - dtype = np.dtype(_DTYPE_MAP.get(dtype_name, np.float32)) + # Distinguish "attribute missing" (GDAL default: Float32) from + # "attribute present but unsupported". The previous + # ``_DTYPE_MAP.get(dtype_name, np.float32)`` collapsed both cases + # to ``Float32``, so a VRT declaring ``CFloat32`` silently lost + # its imaginary component and one declaring a typo (``Flaot32``) + # silently produced wrong values. See issue #1783. + dtype_name = band_elem.get('dataType') + if dtype_name is None: + dtype_name = 'Float32' + if dtype_name not in _DTYPE_MAP: + supported = ', '.join(sorted(_DTYPE_MAP)) + raise ValueError( + f"VRTRasterBand band={band_num} declares " + f"dataType={dtype_name!r}, which is not supported by " + f"read_vrt. Supported dataType values: {supported}. " + f"Complex dataType values (CInt16, CInt32, CFloat32, " + f"CFloat64) are not supported because the reader has no " + f"complex-data code path; falling back to Float32 would " + f"silently drop the imaginary component." + ) + dtype = np.dtype(_DTYPE_MAP[dtype_name]) + # Parse with dtype-aware precision. For integer + # bands (including UInt64 / Int64) the sentinel is parsed as an + # ``int`` so values like ``2**64 - 1`` or ``INT64_MIN`` round-trip + # exactly through the masking pipeline; ``float(text)`` would + # collapse those to a representable float64 and break exact- + # equality masks downstream. See issue #1783 follow-up. nodata_str = _text(band_elem, 'NoDataValue') - nodata = float(nodata_str) if nodata_str else None + nodata = _parse_band_nodata(nodata_str, dtype) color_interp = _text(band_elem, 'ColorInterp') sources = [] @@ -342,8 +448,12 @@ def parse_vrt(xml_str: str, vrt_dir: str = '.') -> VRTDataset: src_rect = _parse_rect(src_rect_elem) dst_rect = _parse_rect(dst_rect_elem) + # Per-source ```` honours the band dtype too -- a + # uint64 source feeding a UInt64 band needs the full 64-bit + # sentinel preserved, not a float64-rounded approximation. + # See issue #1783 follow-up. src_nodata_str = _text(src_elem, 'NODATA') - src_nodata = float(src_nodata_str) if src_nodata_str else None + src_nodata = _parse_band_nodata(src_nodata_str, dtype) # ComplexSource extras scale = None @@ -820,15 +930,26 @@ def read_vrt(vrt_path: str, *, window=None, # with GDAL_NODATA="-9999") stay no-op rather than # tripping OverflowError on ``dtype.type(int(...))``. # See issue #1616. - try: - nodata_f = float(src_nodata) - except (TypeError, ValueError): - nodata_f = None - if (nodata_f is not None - and np.isfinite(nodata_f) - and nodata_f.is_integer()): + # Accept an int sentinel directly so 64-bit values + # (``2**64 - 1`` for UInt64, ``INT64_MIN`` for Int64) + # don't get clobbered by ``float(...).is_integer()`` + # losing precision near the edges. Fall back to the + # float parse for legacy float-typed nodata. + nodata_int: int | None = None + if (isinstance(src_nodata, (int, np.integer)) + and not isinstance(src_nodata, bool)): + nodata_int = int(src_nodata) + else: + try: + nodata_f = float(src_nodata) + except (TypeError, ValueError): + nodata_f = None + if (nodata_f is not None + and np.isfinite(nodata_f) + and nodata_f.is_integer()): + nodata_int = int(nodata_f) + if nodata_int is not None: info = np.iinfo(src_arr.dtype) - nodata_int = int(nodata_f) if info.min <= nodata_int <= info.max: sentinel = src_arr.dtype.type(nodata_int) mask = src_arr == sentinel diff --git a/xrspatial/geotiff/tests/test_vrt_dtype_1783.py b/xrspatial/geotiff/tests/test_vrt_dtype_1783.py new file mode 100644 index 00000000..32285f62 --- /dev/null +++ b/xrspatial/geotiff/tests/test_vrt_dtype_1783.py @@ -0,0 +1,517 @@ +"""Regression tests for issue #1783. + +Before this fix, :func:`xrspatial.geotiff._vrt.parse_vrt` resolved a +```` attribute with +``_DTYPE_MAP.get(dtype_name, np.float32)``. Any GDAL dataType not +present in ``_DTYPE_MAP`` -- the four complex types (``CInt16``, +``CInt32``, ``CFloat32``, ``CFloat64``), the 64-bit integer types that +the map did not yet list (``UInt64``, ``Int64``), or a typo -- silently +collapsed to ``Float32``. Complex sources lost their imaginary +component, 64-bit integer sources lost precision, and typos produced +wrong values with no diagnostic. + +The fix: + +* Adds ``UInt64`` / ``Int64`` to ``_DTYPE_MAP``. +* Splits the resolution into "attribute missing" (still defaults to + Float32 per the GDAL spec) and "attribute present but unsupported" + (now raises ``ValueError`` naming the band, the offending dataType, + and the supported types). +""" +from __future__ import annotations + +import numpy as np +import pytest + +from xrspatial.geotiff import read_vrt +from xrspatial.geotiff._vrt import _parse_band_nodata, parse_vrt +from xrspatial.geotiff._writer import write + + +def _write(arr, path, **kw): + """Write a 2D array to ``path`` with sensible defaults for tests.""" + write(arr, str(path), compression='none', tiled=False, **kw) + + +def _build_single_band_vrt(tmp_path, *, dtype_attr, src_path, + filename='b.vrt', size=2, nodata=None): + """Hand-roll a single-band VRT with an arbitrary ``dataType`` attribute. + + ``dtype_attr`` is rendered verbatim into the ```` + element. Pass an empty string to omit the attribute entirely (the + "GDAL default" case). + + ``nodata`` (when not ``None``) is rendered verbatim into a + ```` child so callers can exercise sentinel-parsing + edge cases (scientific notation, ``nan``, full-range 64-bit + integers). + """ + if dtype_attr: + attr = f' dataType="{dtype_attr}"' + else: + attr = '' + nodata_elem = (f'{nodata}' + if nodata is not None else '') + vrt_xml = f""" + 0.0, 1.0, 0.0, 0.0, 0.0, -1.0 + + {nodata_elem} + + {src_path} + 1 + + + + +""" + p = tmp_path / filename + p.write_text(vrt_xml) + return str(p) + + +# --------------------------------------------------------------------------- +# 1. Complex dataType is rejected (no silent imaginary-component loss) +# --------------------------------------------------------------------------- + +@pytest.mark.parametrize('cdtype', ['CInt16', 'CInt32', 'CFloat32', 'CFloat64']) +def test_complex_dtype_raises_value_error(tmp_path, cdtype): + """A VRT declaring any complex ``dataType`` must raise ``ValueError`` + rather than silently substituting ``Float32``. The error message + must name both the band number and the offending dataType so the + operator can fix the VRT, and must mention that complex types are + explicitly unsupported. + """ + b = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) + src = tmp_path / 'src.tif' + _write(b, src) + vrt = _build_single_band_vrt( + tmp_path, dtype_attr=cdtype, src_path=str(src), + ) + with pytest.raises(ValueError) as ei: + read_vrt(vrt) + msg = str(ei.value) + assert cdtype in msg, f"error message must name {cdtype!r}: {msg!r}" + assert 'band=1' in msg or 'band 1' in msg, ( + f"error message must name the band: {msg!r}" + ) + assert 'complex' in msg.lower(), ( + f"error message must mention complex types: {msg!r}" + ) + + +# --------------------------------------------------------------------------- +# 2. Typo / arbitrary garbage dataType is rejected +# --------------------------------------------------------------------------- + +def test_garbage_dtype_raises_value_error(tmp_path): + """An unrecognised non-complex ``dataType`` (e.g. a typo) must also + raise ``ValueError`` rather than collapsing silently to Float32. + """ + b = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) + src = tmp_path / 'src.tif' + _write(b, src) + vrt = _build_single_band_vrt( + tmp_path, dtype_attr='Garbage', src_path=str(src), + ) + with pytest.raises(ValueError, match=r'Garbage'): + read_vrt(vrt) + + +def test_typo_for_supported_dtype_is_still_rejected(tmp_path): + """``Flaot32`` (typo of ``Float32``) is distinct from the empty / + missing case and must surface as ``ValueError`` instead of silently + falling back. + """ + b = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) + src = tmp_path / 'src.tif' + _write(b, src) + vrt = _build_single_band_vrt( + tmp_path, dtype_attr='Flaot32', src_path=str(src), + ) + with pytest.raises(ValueError, match=r'Flaot32'): + read_vrt(vrt) + + +# --------------------------------------------------------------------------- +# 3. UInt64 / Int64 are now supported and round-trip losslessly +# --------------------------------------------------------------------------- + +def test_uint64_round_trip(tmp_path): + """A VRT declaring ``dataType="UInt64"`` whose source GeoTIFF is + written as uint64 must read back as uint64 with the exact values + preserved, including values past the float32 / int53 boundary. + """ + big = np.iinfo(np.uint64).max # 2**64 - 1 + near_big = big - 7 + b = np.array([[1, 2], [near_big, big]], dtype=np.uint64) + src = tmp_path / 'src.tif' + _write(b, src) + vrt = _build_single_band_vrt( + tmp_path, dtype_attr='UInt64', src_path=str(src), + ) + r = read_vrt(vrt) + assert r.dtype == np.uint64, ( + f"UInt64 VRT must read as uint64; got {r.dtype}" + ) + np.testing.assert_array_equal(r.values, b) + # Largest values must survive bit-for-bit, not collapse to float + assert int(r.values[1, 1]) == big + assert int(r.values[1, 0]) == near_big + + +def test_int64_round_trip(tmp_path): + """A VRT declaring ``dataType="Int64"`` must read back as int64 + with the full int64 range preserved (positive and negative + extremes). + """ + info = np.iinfo(np.int64) + b = np.array([[info.min, -1], [0, info.max]], dtype=np.int64) + src = tmp_path / 'src.tif' + _write(b, src) + vrt = _build_single_band_vrt( + tmp_path, dtype_attr='Int64', src_path=str(src), + ) + r = read_vrt(vrt) + assert r.dtype == np.int64, ( + f"Int64 VRT must read as int64; got {r.dtype}" + ) + np.testing.assert_array_equal(r.values, b) + + +# --------------------------------------------------------------------------- +# 4. Missing dataType attribute still defaults to Float32 (GDAL default) +# --------------------------------------------------------------------------- + +def test_missing_dtype_attribute_defaults_to_float32(tmp_path): + """```` with no ``dataType`` attribute must + still default to ``Float32``. This is GDAL's documented default + and the previous fallback handled it correctly; the new + "unknown-attribute raises" path must not regress the + "missing-attribute defaults" path. + """ + b = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) + src = tmp_path / 'src.tif' + _write(b, src) + vrt = _build_single_band_vrt( + tmp_path, dtype_attr='', src_path=str(src), + ) + r = read_vrt(vrt) + assert r.dtype == np.float32, ( + f"missing dataType must default to Float32; got {r.dtype}" + ) + np.testing.assert_allclose(r.values, b) + + +# --------------------------------------------------------------------------- +# 5. Pre-existing supported dtypes still read correctly (smoke regression) +# --------------------------------------------------------------------------- + +def test_byte_dtype_still_works(tmp_path): + """``Byte`` reads back as uint8 with values preserved. Smoke check + to confirm the rewritten dtype resolution did not break the + common-case integer path. + """ + b = np.array([[10, 11], [12, 13]], dtype=np.uint8) + src = tmp_path / 'src.tif' + _write(b, src) + vrt = _build_single_band_vrt( + tmp_path, dtype_attr='Byte', src_path=str(src), + ) + r = read_vrt(vrt) + assert r.dtype == np.uint8 + np.testing.assert_array_equal(r.values, b) + + +def test_float64_dtype_still_works(tmp_path): + """``Float64`` reads back as float64 with values preserved. Smoke + check for the wider floating-point path. + """ + b = np.array([[1.5, 2.5], [3.5, 4.5]], dtype=np.float64) + src = tmp_path / 'src.tif' + _write(b, src) + vrt = _build_single_band_vrt( + tmp_path, dtype_attr='Float64', src_path=str(src), + ) + r = read_vrt(vrt) + assert r.dtype == np.float64 + np.testing.assert_allclose(r.values, b) + + +# --------------------------------------------------------------------------- +# 6. Integer ```` is parsed as ``int`` for integer bands +# +# Regression for the Copilot review note on the original #1783 PR: now +# that UInt64 / Int64 are supported, parsing the sentinel as ``float`` +# silently drops precision near the 64-bit extremes. ``2**64 - 1`` +# rounds up to ``2**64`` in float64, ``INT64_MIN`` survives but only +# barely, and downstream exact-equality masks break. +# --------------------------------------------------------------------------- + +def test_parse_band_nodata_uint64_max_exact(): + """``_parse_band_nodata`` must return the exact ``int`` for + ``2**64 - 1`` (UInt64 max), not a float64 that rounds up to + ``2**64``. + """ + big = 2**64 - 1 + nd = _parse_band_nodata(str(big), np.dtype(np.uint64)) + assert isinstance(nd, int), ( + f"UInt64 nodata must parse as int, got {type(nd).__name__}" + ) + assert nd == big + # float64 round-trip would equal 2**64, off by one + assert nd != int(float(big)) + + +def test_parse_band_nodata_int64_min_exact(): + """``INT64_MIN`` (``-2**63``) must survive parsing as an int.""" + info = np.iinfo(np.int64) + nd = _parse_band_nodata(str(info.min), np.dtype(np.int64)) + assert isinstance(nd, int) + assert nd == info.min + + +def test_parse_band_nodata_int32_negative(): + """Common GDAL sentinel ``-9999`` for an Int32 band parses as int.""" + nd = _parse_band_nodata('-9999', np.dtype(np.int32)) + assert isinstance(nd, int) + assert nd == -9999 + + +def test_parse_band_nodata_int_scientific_notation(): + """GDAL occasionally emits integer nodata in scientific or + ``-9999.0`` form. Parsing should still land an int when the + value is integer-valued and in-range. + """ + nd = _parse_band_nodata('-9999.0', np.dtype(np.int32)) + assert isinstance(nd, int) and nd == -9999 + nd = _parse_band_nodata('1e3', np.dtype(np.int32)) + assert isinstance(nd, int) and nd == 1000 + + +def test_parse_band_nodata_int_out_of_range_falls_back(): + """An out-of-range sentinel for the band dtype is returned as the + parsed float so it surfaces via ``attrs['nodata']`` for round-trip + but can never match an integer pixel (mirroring + ``_resolve_masked_fill``'s tolerant behaviour). + """ + # -9999 cannot be represented as uint16; should not raise + nd = _parse_band_nodata('-9999', np.dtype(np.uint16)) + # ``int('-9999')`` succeeds and out-of-range so we *do* return the + # int in the cheap path -- _sentinel_for_dtype downstream is then + # responsible for refusing to use it as a mask sentinel. + assert nd == -9999 + + +def test_parse_band_nodata_float_nan(): + """Float bands keep NaN sentinels working (no integer-parse + regression for the floating path). + """ + nd = _parse_band_nodata('nan', np.dtype(np.float32)) + assert isinstance(nd, float) + assert np.isnan(nd) + + +def test_parse_band_nodata_float_scientific(): + """Float bands preserve scientific-notation sentinels.""" + nd = _parse_band_nodata('-1.5e10', np.dtype(np.float64)) + assert isinstance(nd, float) + assert nd == -1.5e10 + + +def test_parse_band_nodata_empty_or_none(): + """Empty / whitespace / ``None`` input returns ``None`` regardless + of dtype. + """ + assert _parse_band_nodata(None, np.dtype(np.int32)) is None + assert _parse_band_nodata('', np.dtype(np.int32)) is None + assert _parse_band_nodata(' ', np.dtype(np.float32)) is None + + +# --------------------------------------------------------------------------- +# 7. End-to-end VRT parse: ``vrt.bands[i].nodata`` is an int for integer +# bands, a float for float bands. +# --------------------------------------------------------------------------- + +def _make_minimal_vrt_xml(dtype_attr, nodata_text): + """Tiny VRT XML string suitable for direct ``parse_vrt`` calls. + + The SourceFilename here is intentionally minimal -- ``parse_vrt`` + only does the containment check after canonicalising the path, so + we pass a path inside the temp dir at the call site. + """ + return ( + '' + '0.0, 1.0, 0.0, 0.0, 0.0, -1.0' + f'' + f'{nodata_text}' + '' + '' + ) + + +def test_parse_vrt_uint64_nodata_is_int(tmp_path): + """The dataclass stored on ``_VRTBand.nodata`` is a Python ``int`` + for an integer-dtype band, with the exact 64-bit value. + """ + big = 2**64 - 1 + xml = _make_minimal_vrt_xml('UInt64', str(big)) + vrt = parse_vrt(xml, vrt_dir=str(tmp_path)) + assert len(vrt.bands) == 1 + nd = vrt.bands[0].nodata + assert isinstance(nd, int) + assert nd == big + + +def test_parse_vrt_int64_min_nodata_is_int(tmp_path): + info = np.iinfo(np.int64) + xml = _make_minimal_vrt_xml('Int64', str(info.min)) + vrt = parse_vrt(xml, vrt_dir=str(tmp_path)) + nd = vrt.bands[0].nodata + assert isinstance(nd, int) + assert nd == info.min + + +def test_parse_vrt_float32_nan_nodata_is_float(tmp_path): + xml = _make_minimal_vrt_xml('Float32', 'nan') + vrt = parse_vrt(xml, vrt_dir=str(tmp_path)) + nd = vrt.bands[0].nodata + assert isinstance(nd, float) + assert np.isnan(nd) + + +# --------------------------------------------------------------------------- +# 8. Full read_vrt round-trip preserves precision and masks correctly. +# --------------------------------------------------------------------------- + +def test_uint64_nodata_round_trip_preserves_max_sentinel(tmp_path): + """A VRT declaring UInt64 + ``2**64 - 1`` + must surface ``attrs['nodata']`` as the exact integer value, not a + float that has rounded past the dtype's range. Downstream + consumers rely on exact equality. + """ + big = 2**64 - 1 + # Fill the source with non-sentinel values so the read keeps the + # uint64 dtype (a sentinel hit promotes to float64 + NaN, which + # would defeat the precision check in this test). + b = np.array([[1, 2], [3, 4]], dtype=np.uint64) + src = tmp_path / 'src.tif' + _write(b, src) + vrt = _build_single_band_vrt( + tmp_path, dtype_attr='UInt64', src_path=str(src), nodata=big, + ) + r = read_vrt(vrt) + # Either the array stays uint64 (no sentinel hit) or it promotes to + # float64 (sentinel hit). In the no-hit case attrs['nodata'] must + # carry the exact int. + assert 'nodata' in r.attrs + assert int(r.attrs['nodata']) == big + # Critically, the stored attr must not be a float64 that has rounded + # the sentinel up to 2**64. ``isinstance`` allows int or np.integer + # but rejects float / np.floating. + assert isinstance(r.attrs['nodata'], (int, np.integer)) + + +def test_uint64_nodata_masks_max_sentinel_in_data(tmp_path): + """When the source pixel actually contains ``2**64 - 1``, the + masking pipeline must catch it: the result is promoted to float64 + with NaN at the sentinel position. This is the precision- + preservation acid test -- if the nodata was rounded to a float + that doesn't equal the source pixel, the mask never fires and the + sentinel survives as a 1.8e19 float. + """ + big = 2**64 - 1 + b = np.array([[1, 2], [3, big]], dtype=np.uint64) + src = tmp_path / 'src.tif' + _write(b, src) + vrt = _build_single_band_vrt( + tmp_path, dtype_attr='UInt64', src_path=str(src), nodata=big, + ) + r = read_vrt(vrt) + # Sentinel hit -> promote to float64 with NaN + assert r.dtype == np.float64, ( + f"sentinel hit must promote to float64, got {r.dtype}" + ) + assert np.isnan(r.values[1, 1]), ( + f"the 2**64-1 cell must be masked to NaN; got {r.values[1, 1]!r}" + ) + # Non-sentinel cells survive as float64 values + assert r.values[0, 0] == 1.0 + assert r.values[0, 1] == 2.0 + assert r.values[1, 0] == 3.0 + + +def test_int64_min_nodata_masks_correctly(tmp_path): + """``INT64_MIN`` as both the nodata sentinel and a real pixel value + masks correctly without int64 -> float64 rounding aliasing the + sentinel onto adjacent values. + """ + info = np.iinfo(np.int64) + b = np.array([[info.min, -1], [0, info.max]], dtype=np.int64) + src = tmp_path / 'src.tif' + _write(b, src) + vrt = _build_single_band_vrt( + tmp_path, dtype_attr='Int64', src_path=str(src), nodata=info.min, + ) + r = read_vrt(vrt) + assert r.dtype == np.float64 + assert np.isnan(r.values[0, 0]) + assert r.values[0, 1] == -1.0 + assert r.values[1, 0] == 0.0 + assert r.values[1, 1] == float(info.max) + + +def test_int32_negative_nodata_still_masks(tmp_path): + """Smoke regression for the common Int32 + ``-9999`` case. The + integer parsing path must not break this when there is no precision + pressure -- ``-9999`` survives ``float()`` fine but we still want + the new int-typed parse to mask the same way the old float-typed + parse did. + """ + b = np.array([[10, -9999], [-9999, 20]], dtype=np.int32) + src = tmp_path / 'src.tif' + _write(b, src) + vrt = _build_single_band_vrt( + tmp_path, dtype_attr='Int32', src_path=str(src), nodata=-9999, + ) + r = read_vrt(vrt) + assert r.dtype == np.float64 + assert np.isnan(r.values[0, 1]) + assert np.isnan(r.values[1, 0]) + assert r.values[0, 0] == 10.0 + assert r.values[1, 1] == 20.0 + + +def test_float32_nan_nodata_still_works(tmp_path): + """``Float32`` + ``nan`` still parses and + surfaces NaN via ``attrs['nodata']`` (no regression on the float + path). + """ + b = np.array([[1.0, np.nan], [3.0, 4.0]], dtype=np.float32) + src = tmp_path / 'src.tif' + _write(b, src) + vrt = _build_single_band_vrt( + tmp_path, dtype_attr='Float32', src_path=str(src), nodata='nan', + ) + r = read_vrt(vrt) + assert r.dtype == np.float32 + assert np.isnan(r.attrs['nodata']) + assert np.isnan(r.values[0, 1]) + + +def test_float64_scientific_nodata_still_works(tmp_path): + """``Float64`` + scientific-notation ```` survives as + float (no integer-parse regression for the float path). + """ + b = np.array([[1.0, -1.5e10], [3.0, 4.0]], dtype=np.float64) + src = tmp_path / 'src.tif' + _write(b, src) + vrt = _build_single_band_vrt( + tmp_path, dtype_attr='Float64', src_path=str(src), nodata='-1.5e10', + ) + r = read_vrt(vrt) + assert r.dtype == np.float64 + assert r.attrs['nodata'] == -1.5e10 + # The matching pixel stays as-is for float -- there's no NaN + # promotion (it's already float64), so the sentinel surfaces as a + # literal value unless the float-source nodata-masking branch fires. + # Either behaviour is acceptable; just confirm nodata attr is set.