From 3e77947f4684a22881f574777f3423abbe4a9d18 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Mon, 18 May 2026 21:40:11 -0700 Subject: [PATCH 1/3] geotiff: read external .tif.ovr sidecar overviews (#2112) GDAL/rasterio writes overview pyramids to a sibling .tif.ovr file when the source is not a COG. The reader walked only the in-file IFD chain, so any overview_level >= 1 on such a file raised "out of range" even though the data lived right next to the source. Add a _sidecar module that discovers the sibling file (local paths only -- cloud / HTTP / file-like sources fall back to base-only) and parses its IFDs as a continuation of the base file's pyramid. Each sidecar IFD is tagged with its (data, header) so the strip / tile reader resolves byte offsets against the right buffer. Both the eager read_to_array path and the metadata helper used by the dask graph builder pick up the merged pyramid, so dask + numpy reads of sidecar levels work for free. GPU and dask+GPU paths still walk the base IFD chain only -- threading sidecar bytes through nvCOMP is follow-up work. The corpus-level skip for the eager parity check is now empty; the dask+GPU skip stays. --- xrspatial/geotiff/__init__.py | 22 +- xrspatial/geotiff/_reader.py | 39 +++- xrspatial/geotiff/_sidecar.py | 101 +++++++++ .../test_golden_corpus_eager_numpy_1930.py | 19 +- .../geotiff/tests/test_sidecar_ovr_2112.py | 200 ++++++++++++++++++ 5 files changed, 362 insertions(+), 19 deletions(-) create mode 100644 xrspatial/geotiff/_sidecar.py create mode 100644 xrspatial/geotiff/tests/test_sidecar_ovr_2112.py diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index 215fa354d..e87b5a4eb 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -215,14 +215,29 @@ def _read_geo_info(source, *, overview_level: int | None = None): raise TypeError( "source must be a str path or binary file-like, " f"got {type(source).__name__}") + sidecar = None try: header = parse_header(data) ifds = parse_all_ifds(data, header) if not ifds: raise ValueError("No IFDs found in TIFF file") + # Append sibling `.tif.ovr` sidecar IFDs onto the pyramid list + # so ``overview_level`` indexes both internal and external + # overviews (issue #2112). Local file paths only. + from ._sidecar import ( + attach_sidecar_origin, find_sidecar, load_sidecar, + ) + sidecar_path = find_sidecar(source) + if sidecar_path is not None: + sidecar = load_sidecar(sidecar_path) + attach_sidecar_origin( + sidecar.ifds, sidecar.data, sidecar.header) + ifds = ifds + sidecar.ifds ifd = select_overview_ifd(ifds, overview_level) # Inherit georef from the level-0 IFD when the overview itself - # has no geokeys (issue #1640). Pass-through for level 0. + # has no geokeys (issue #1640). Pass-through for level 0. The + # sidecar IFDs typically lack geokeys so the inheritance pulls + # from the base file's full-resolution IFD as GDAL does. geo_info = extract_geo_info_with_overview_inheritance( ifd, ifds, data, header.byte_order) bps = resolve_bits_per_sample(ifd.bits_per_sample) @@ -238,6 +253,11 @@ def _read_geo_info(source, *, overview_level: int | None = None): finally: if close_data: data.close() + if sidecar is not None: + try: + sidecar.data.close() + except Exception: + pass def open_geotiff(source: str | BinaryIO, *, diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py index b8fa54a53..e8ed5b7b1 100644 --- a/xrspatial/geotiff/_reader.py +++ b/xrspatial/geotiff/_reader.py @@ -3254,6 +3254,7 @@ def read_to_array(source, *, window=None, overview_level: int | None = None, src = _FileSource(source) data = src.read_all() + sidecar = None try: header = parse_header(data) ifds = parse_all_ifds(data, header) @@ -3261,15 +3262,42 @@ def read_to_array(source, *, window=None, overview_level: int | None = None, if len(ifds) == 0: raise ValueError("No IFDs found in TIFF file") + # External `.tif.ovr` sidecar (issue #2112). GDAL/rasterio write + # overview pyramids to a sibling file when the source is not a + # COG; the sidecar's IFDs are the continuation of the base + # file's pyramid. Discovery only fires for local file paths; + # cloud / HTTP / file-like sources skip the lookup and keep the + # base-file-only behaviour. The sidecar must be loaded before + # IFD selection so ``overview_level`` can index into a unified + # pyramid list. + from ._sidecar import ( + attach_sidecar_origin, find_sidecar, load_sidecar, + ) + sidecar_path = find_sidecar(source) + if sidecar_path is not None: + sidecar = load_sidecar(sidecar_path) + attach_sidecar_origin( + sidecar.ifds, sidecar.data, sidecar.header) + ifds = ifds + sidecar.ifds + # Select IFD, skipping any mask IFDs ifd = select_overview_ifd(ifds, overview_level) + # If the selected IFD came from the sidecar, swap the data / + # header used for strip / tile reads below so byte offsets + # resolve against the right buffer. + ifd_data = getattr(ifd, '_source_data', None) or data + ifd_header = getattr(ifd, '_source_header', None) or header + bps = resolve_bits_per_sample(ifd.bits_per_sample) dtype = tiff_dtype_to_numpy(bps, ifd.sample_format) # Inherit georef from level 0 when an overview IFD lacks its own # geokeys (issue #1640). For overview_level=0 (or None) this is a # no-op: the helper short-circuits when the IFD is not a - # NewSubfileType=overview entry. + # NewSubfileType=overview entry. Sidecar IFDs always lack + # geokeys, so the inheritance pulls from the base file's + # level-0 IFD (kept first in the merged list) which is the + # GDAL convention. geo_info = extract_geo_info_with_overview_inheritance( ifd, ifds, data, header.byte_order) @@ -3349,10 +3377,10 @@ def read_to_array(source, *, window=None, overview_level: int | None = None, f"band={band} out of range for {ifd_samples}-band file.") if ifd.is_tiled: - arr = _read_tiles(data, ifd, header, dtype, window, + arr = _read_tiles(ifd_data, ifd, ifd_header, dtype, window, max_pixels=max_pixels) else: - arr = _read_strips(data, ifd, header, dtype, window, + arr = _read_strips(ifd_data, ifd, ifd_header, dtype, window, max_pixels=max_pixels) # Extract the requested band before reorienting so we work on a @@ -3379,5 +3407,10 @@ def read_to_array(source, *, window=None, overview_level: int | None = None, geo_info._mask_nodata = inverted_nodata finally: src.close() + if sidecar is not None: + try: + sidecar.data.close() + except Exception: + pass return arr, geo_info diff --git a/xrspatial/geotiff/_sidecar.py b/xrspatial/geotiff/_sidecar.py new file mode 100644 index 000000000..b359a456c --- /dev/null +++ b/xrspatial/geotiff/_sidecar.py @@ -0,0 +1,101 @@ +"""External `.tif.ovr` sidecar discovery for overview reading. + +GDAL and rasterio write overview pyramids to a sibling ``.ovr`` +file by default (`gdaladdo -ro` and `rasterio` `--overview-resampling` +when the source itself is not a COG). The sidecar is a standalone TIFF +whose IFDs are the continuation of the base file's pyramid: base IFD 0 +is overview level 0, sidecar IFD 0 is level 1, sidecar IFD 1 is level +2, and so on. + +This module discovers the sidecar next to a local-file source and +parses its IFDs, returning the inputs needed for the reader to switch +to the sidecar's bytes/header when the selected overview level lives +there. Cloud / HTTP / file-like sources are not in scope for the first +cut (see issue #2112): they would need sidecar URLs to be discoverable +in their respective namespaces, which is not a TIFF concern. Callers +that hit a non-local source skip the sidecar discovery entirely and +fall back to base-file-only behaviour. +""" +from __future__ import annotations + +import mmap +import os +from typing import NamedTuple + +from ._header import IFD, TIFFHeader, parse_all_ifds, parse_header + + +class SidecarOverviews(NamedTuple): + """Bytes, header, and IFDs from a sibling ``.tif.ovr`` sidecar.""" + + data: object # bytes-like (mmap or bytes) + header: TIFFHeader + ifds: list[IFD] + path: str + + +def find_sidecar(source) -> str | None: + """Return the path to a sibling ``.ovr`` sidecar if one exists. + + Only local file paths are considered. File-like buffers and fsspec + / HTTP URIs are out of scope (their sidecar conventions differ and + discovery is non-trivial); they return ``None`` so callers fall + back to base-file-only behaviour. + + The sidecar lives at ``.ovr`` per the GDAL convention. The + function returns the absolute path on success and ``None`` if the + sidecar does not exist or the source is not a local file path. + """ + if not isinstance(source, str): + return None + # fsspec URIs (s3://, gs://, az://, memory://, http://, https://) + # share the str type with local paths but should not be probed via + # ``os.path.exists``: the latter would either succeed against a + # local file that happens to share the name or fail and we cannot + # safely fetch a remote sidecar from this layer. + if "://" in source: + return None + sidecar = source + ".ovr" + if not os.path.isfile(sidecar): + return None + return sidecar + + +def load_sidecar(path: str) -> SidecarOverviews: + """Open and parse a sidecar ``.ovr`` file. + + The returned ``data`` is an ``mmap`` object the caller must close + (or wrap in a ``try / finally`` next to its own data close). The + IFD list is the sidecar's full IFD chain in file order; the reader + treats them as overview levels (the first sidecar IFD is level 1 + when the base file holds only a full-resolution IFD, level 2 when + the base file already carries one internal overview, and so on). + """ + f = open(path, "rb") + try: + # ``mmap`` over the whole file matches the eager reader's + # ``read_to_array`` path for local sources; the lifetime is + # owned by the caller. + data = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) + finally: + f.close() + header = parse_header(data) + ifds = parse_all_ifds(data, header) + return SidecarOverviews(data=data, header=header, ifds=ifds, path=path) + + +def attach_sidecar_origin(ifds: list[IFD], + data: object, + header: TIFFHeader) -> None: + """Tag each IFD in ``ifds`` with its source bytes and header. + + The reader checks ``ifd._source_data`` / ``ifd._source_header`` to + decide which buffer to slice for strip/tile reads. Untagged IFDs + fall through to the base-file ``data`` / ``header`` the caller + already has in scope. + """ + for ifd in ifds: + # Use ``object.__setattr__`` so this works whether IFD is a + # plain dataclass or a frozen one in a future revision. + object.__setattr__(ifd, "_source_data", data) + object.__setattr__(ifd, "_source_header", header) diff --git a/xrspatial/geotiff/tests/test_golden_corpus_eager_numpy_1930.py b/xrspatial/geotiff/tests/test_golden_corpus_eager_numpy_1930.py index 9fe2d44e3..2699d666b 100644 --- a/xrspatial/geotiff/tests/test_golden_corpus_eager_numpy_1930.py +++ b/xrspatial/geotiff/tests/test_golden_corpus_eager_numpy_1930.py @@ -86,21 +86,10 @@ _PARITY_GAPS: dict[str, str] = {} # Fixtures whose overview-IFD code path is not yet wired into the -# xrspatial reader. The base-IFD comparison still runs and passes; the -# overview-level loop driven by ``candidate_factory`` is skipped for -# these ids. Today this lists external sidecar overviews -# (``.tif.ovr``): the reader walks only the in-file IFD chain and would -# raise "overview_level out of range" for ``overview_level >= 1``. The -# internal-IFD and COG fixtures exercise the overview code path on -# every backend. -_OVERVIEW_READER_GAPS: dict[str, str] = { - "overview_external_ovr_uint16": ( - "External .ovr sidecar reader is not implemented in xrspatial. " - "The base IFD still parity-checks; the overview levels live in " - "a sibling .tif.ovr that the reader does not open. Track in a " - "follow-up issue if the corpus needs sidecar coverage." - ), -} +# xrspatial reader. Empty as of issue #2112, which taught the eager +# numpy reader to discover and parse sibling ``.tif.ovr`` files. New +# entries should include a follow-up issue link so the gap stays visible. +_OVERVIEW_READER_GAPS: dict[str, str] = {} _INTENTIONAL_SKIPS: dict[str, str] = { "nodata_miniswhite_uint8": ( diff --git a/xrspatial/geotiff/tests/test_sidecar_ovr_2112.py b/xrspatial/geotiff/tests/test_sidecar_ovr_2112.py new file mode 100644 index 000000000..6093c3fd5 --- /dev/null +++ b/xrspatial/geotiff/tests/test_sidecar_ovr_2112.py @@ -0,0 +1,200 @@ +"""External `.tif.ovr` sidecar overview reader (issue #2112). + +Before the fix, opening a GDAL/rasterio file whose overview pyramid +lives in a sibling ``.tif.ovr`` worked at the base level but raised an +``overview_level out of range`` error for any non-zero level. The +reader only walked the in-file IFD chain. + +These tests pin the new contract: + +* The base IFD continues to read at level 0 with no behaviour change. +* The reader appends sidecar IFDs onto the pyramid list so that + ``overview_level=1`` (and onwards) reads from the sidecar. +* Byte parity holds against rasterio for the bundled fixture, which + exercises a uint16 raster with two sidecar levels at factors 2 and 4. +* Discovery is local-file-only: file-like sources, HTTP / fsspec URIs, + and missing sidecars all fall back to base-only behaviour. +* Sidecar levels reach all eager-read entry points: ``open_geotiff``, + ``read_to_array``, and the dask graph builder that goes through + ``_read_to_array_metadata_only``. +""" +from __future__ import annotations + +import io +import pathlib +import shutil + +import numpy as np +import pytest + +from xrspatial.geotiff import open_geotiff +from xrspatial.geotiff._reader import read_to_array +from xrspatial.geotiff._sidecar import find_sidecar, load_sidecar + + +_FIXTURE = ( + pathlib.Path(__file__).resolve().parent + / "golden_corpus" + / "fixtures" + / "overview_external_ovr_uint16.tif" +) + + +def _fixture_or_skip(): + if not _FIXTURE.exists(): + pytest.skip("sidecar fixture not present") + if not (_FIXTURE.parent / "overview_external_ovr_uint16.tif.ovr").exists(): + pytest.skip("sidecar .ovr file not present") + return _FIXTURE + + +# --------------------------------------------------------------------------- +# Discovery helper: local file paths only. +# --------------------------------------------------------------------------- +def test_find_sidecar_returns_path_for_local_file_with_sidecar(): + src = str(_fixture_or_skip()) + assert find_sidecar(src) == src + ".ovr" + + +def test_find_sidecar_returns_none_when_sidecar_missing(tmp_path): + src = tmp_path / "no_sidecar_2112.tif" + src.write_bytes(b"\x00") + assert find_sidecar(str(src)) is None + + +def test_find_sidecar_returns_none_for_file_like_object(): + assert find_sidecar(io.BytesIO(b"")) is None + + +@pytest.mark.parametrize( + "uri", + [ + "s3://bucket/path.tif", + "gs://bucket/path.tif", + "az://container/path.tif", + "http://example.com/x.tif", + "https://example.com/x.tif", + "memory://buffer.tif", + ], +) +def test_find_sidecar_returns_none_for_remote_uri(uri): + assert find_sidecar(uri) is None + + +# --------------------------------------------------------------------------- +# Sidecar parser: returns a usable IFD list. +# --------------------------------------------------------------------------- +def test_load_sidecar_returns_two_ifds(): + src = _fixture_or_skip() + sidecar = load_sidecar(str(src) + ".ovr") + try: + # The bundled fixture was written with two overview factors (2, 4). + assert len(sidecar.ifds) == 2 + # First sidecar IFD is the 32x32 level, second is 16x16. + assert (sidecar.ifds[0].width, sidecar.ifds[0].height) == (32, 32) + assert (sidecar.ifds[1].width, sidecar.ifds[1].height) == (16, 16) + finally: + sidecar.data.close() + + +# --------------------------------------------------------------------------- +# End-to-end: each sidecar level reads through open_geotiff. +# --------------------------------------------------------------------------- +def test_open_geotiff_base_level_unchanged(): + src = _fixture_or_skip() + da = open_geotiff(str(src)) + assert da.shape == (64, 64) + assert da.dtype == np.uint16 + + +def test_open_geotiff_sidecar_level_1(): + src = _fixture_or_skip() + da = open_geotiff(str(src), overview_level=1) + assert da.shape == (32, 32) + assert da.dtype == np.uint16 + + +def test_open_geotiff_sidecar_level_2(): + src = _fixture_or_skip() + da = open_geotiff(str(src), overview_level=2) + assert da.shape == (16, 16) + assert da.dtype == np.uint16 + + +def test_open_geotiff_out_of_range_after_sidecar_appended(): + src = _fixture_or_skip() + with pytest.raises(ValueError, match="overview_level=3 is out of range"): + open_geotiff(str(src), overview_level=3) + + +# --------------------------------------------------------------------------- +# Byte parity with rasterio: the production reference for sidecar overviews. +# --------------------------------------------------------------------------- +def test_sidecar_level_1_matches_rasterio(): + rasterio = pytest.importorskip("rasterio") + src = _fixture_or_skip() + with rasterio.open(str(src)) as ds: + factor = ds.overviews(1)[0] + out_shape = (ds.height // factor, ds.width // factor) + rio_arr = ds.read(1, out_shape=out_shape) + xr_arr = open_geotiff(str(src), overview_level=1).values + np.testing.assert_array_equal(xr_arr, rio_arr) + + +def test_sidecar_level_2_matches_rasterio(): + rasterio = pytest.importorskip("rasterio") + src = _fixture_or_skip() + with rasterio.open(str(src)) as ds: + factor = ds.overviews(1)[1] + out_shape = (ds.height // factor, ds.width // factor) + rio_arr = ds.read(1, out_shape=out_shape) + xr_arr = open_geotiff(str(src), overview_level=2).values + np.testing.assert_array_equal(xr_arr, rio_arr) + + +# --------------------------------------------------------------------------- +# Sidecar reaches the reader entry point and the metadata-only helper. +# --------------------------------------------------------------------------- +def test_read_to_array_sidecar_level_1(tmp_path): + src = _fixture_or_skip() + arr, geo = read_to_array(str(src), overview_level=1) + assert arr.shape == (32, 32) + assert arr.dtype == np.uint16 + + +def test_metadata_only_includes_sidecar_levels(tmp_path): + src = _fixture_or_skip() + from xrspatial.geotiff import _read_geo_info + + geo, h, w, dtype, _ = _read_geo_info( + str(src), overview_level=2) + assert (h, w) == (16, 16) + # GeoTIFF tags inherit from the base IFD, so the metadata is non-empty. + assert geo is not None + + +# --------------------------------------------------------------------------- +# Missing sidecar gracefully falls back to base-only behaviour. +# --------------------------------------------------------------------------- +def test_missing_sidecar_raises_overview_out_of_range(tmp_path): + src = _fixture_or_skip() + # Copy only the base file; the sidecar stays behind. + copy_path = tmp_path / "no_sidecar_2112.tif" + shutil.copy(src, copy_path) + # Base level still works. + assert open_geotiff(str(copy_path)).shape == (64, 64) + # Asking for an overview level now fails the same way it did before + # sidecar support was added. + with pytest.raises(ValueError, match="out of range"): + open_geotiff(str(copy_path), overview_level=1) + + +# --------------------------------------------------------------------------- +# File-like buffer source still works for the base level (no sidecar lookup). +# --------------------------------------------------------------------------- +def test_file_like_source_reads_base_without_sidecar(): + src = _fixture_or_skip() + with open(src, "rb") as f: + buf = io.BytesIO(f.read()) + da = open_geotiff(buf) + assert da.shape == (64, 64) From 55fb85c21ee1c2bac17e45ffc72ec8938b7eea84 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Tue, 19 May 2026 05:49:41 -0700 Subject: [PATCH 2/3] geotiff: extend .ovr sidecar reader to GPU eager + HTTP + fsspec Self-review on #2114 deferred three items. Address them. - HTTP sidecar discovery: ``find_sidecar`` now issues a HEAD on ``.ovr`` for http:// / https:// sources and ``load_sidecar`` downloads the bytes via urllib. The HEAD probe is bounded by a 10s timeout and silently returns None on any error so a missing or unreachable sidecar falls back to base-only behaviour. - fsspec sidecar discovery: ``find_sidecar`` falls through to ``fsspec.AbstractFileSystem.exists`` for s3:// / gs:// / az:// / memory:// / file:// URIs and ``load_sidecar`` reads via fsspec. - GPU eager reader: ``read_geotiff_gpu`` parses the sidecar IFD chain alongside the base file and swaps the in-scope ``data`` / ``header`` to the sidecar's buffers when the selected overview level lives there. The GDS fast path (kvikio direct read from the base file path) is skipped for sidecar IFDs since their byte offsets index the ``.ovr``, not the base; the CPU-mmap GPU decode path slices the already-loaded sidecar buffer directly. Verified byte parity with the CPU eager path on both sidecar levels of the bundled fixture. Added ``close_sidecar`` helper to share the data-close convention across the three call sites. The GPU path relies on Python GC to reap the sidecar mmap since the GPU decode happens after the metadata try/finally exits. Chunked GPU + sidecar (``read_geotiff_gpu`` with chunks=...) still walks the base IFD only because the chunked GDS path needs separate plumbing for a different file path; dask+GPU corpus skip stays. --- xrspatial/geotiff/__init__.py | 7 +- xrspatial/geotiff/_backends/gpu.py | 99 +++++++++--- xrspatial/geotiff/_reader.py | 7 +- xrspatial/geotiff/_sidecar.py | 153 +++++++++++++----- .../geotiff/tests/test_sidecar_ovr_2112.py | 112 +++++++++++++ 5 files changed, 304 insertions(+), 74 deletions(-) diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index e87b5a4eb..7fb1e8086 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -253,11 +253,8 @@ def _read_geo_info(source, *, overview_level: int | None = None): finally: if close_data: data.close() - if sidecar is not None: - try: - sidecar.data.close() - except Exception: - pass + from ._sidecar import close_sidecar + close_sidecar(sidecar) def open_geotiff(source: str | BinaryIO, *, diff --git a/xrspatial/geotiff/_backends/gpu.py b/xrspatial/geotiff/_backends/gpu.py index 4333a00e6..23287c1d1 100644 --- a/xrspatial/geotiff/_backends/gpu.py +++ b/xrspatial/geotiff/_backends/gpu.py @@ -286,6 +286,8 @@ def read_geotiff_gpu(source: str, *, src = _FileSource(source) data = src.read_all() + sidecar = None + sidecar_owned_ifd = False try: header = parse_header(data) ifds = parse_all_ifds(data, header) @@ -293,9 +295,36 @@ def read_geotiff_gpu(source: str, *, if len(ifds) == 0: raise ValueError("No IFDs found in TIFF file") + # Append sibling `.tif.ovr` sidecar IFDs onto the pyramid list so + # ``overview_level`` indexes both internal and external overviews + # (issue #2112). When the selected IFD comes from the sidecar, + # we swap ``data`` / ``header`` to the sidecar's buffers below + # and skip the GDS fast path -- GDS reads the source file path, + # which would point at the base file rather than the sidecar. + from .._sidecar import ( + attach_sidecar_origin, close_sidecar, find_sidecar, + load_sidecar, + ) + sidecar_path = find_sidecar(source) + if sidecar_path is not None: + sidecar = load_sidecar(sidecar_path) + attach_sidecar_origin( + sidecar.ifds, sidecar.data, sidecar.header) + ifds = ifds + sidecar.ifds + # Skip mask IFDs (NewSubfileType bit 2) ifd = select_overview_ifd(ifds, overview_level) + # Swap base data / header for the sidecar's buffers when the + # requested overview level lives in the sidecar. Subsequent + # tile / strip reads slice the right buffer. + sidecar_data = getattr(ifd, "_source_data", None) + sidecar_header = getattr(ifd, "_source_header", None) + if sidecar_data is not None and sidecar_header is not None: + data = sidecar_data + header = sidecar_header + sidecar_owned_ifd = True + bps = resolve_bits_per_sample(ifd.bits_per_sample) file_dtype = tiff_dtype_to_numpy(bps, ifd.sample_format) # Inherit georef from the level-0 IFD when the overview itself @@ -490,6 +519,11 @@ def read_geotiff_gpu(source: str, *, finally: src.close() + # ``sidecar`` (if any) holds an mmap that is sliced by the GPU + # decode below; do not close it here. Python's GC closes the + # underlying file descriptor when ``sidecar`` falls out of + # function scope at return, which is the same lifetime the + # base-file ``data`` already relies on after ``src.close()``. # GPU decode: try GDS (SSD→GPU direct) first, then CPU mmap path. # Sparse tiles (byte_count == 0) are unsupported on the GPU pipeline; @@ -606,36 +640,53 @@ def _read_once(): from .._gpu_decode import gpu_decode_tiles_from_file arr_gpu = None - try: - arr_gpu = gpu_decode_tiles_from_file( - source, offsets, byte_counts, - tw, th, width, height, - compression, predictor, file_dtype, samples, - byte_order=header.byte_order, - masked_fill=masked_fill, - ) - except Exception as e: - if gpu == 'strict' or _geotiff_strict_mode(): - raise - warnings.warn( - f"read_geotiff_gpu: GPU decode failed " - f"({type(e).__name__}: {e}); falling back to CPU.", - RuntimeWarning, - stacklevel=2, - ) + if sidecar_owned_ifd: + # The selected IFD lives in a sibling .tif.ovr sidecar; tile + # offsets index into the sidecar's bytes, not the base file + # path that ``gpu_decode_tiles_from_file`` would open via + # kvikio. Skip GDS and let the CPU-mmap fallback below slice + # the already-loaded sidecar buffer (issue #2112). arr_gpu = None + else: + try: + arr_gpu = gpu_decode_tiles_from_file( + source, offsets, byte_counts, + tw, th, width, height, + compression, predictor, file_dtype, samples, + byte_order=header.byte_order, + masked_fill=masked_fill, + ) + except Exception as e: + if gpu == 'strict' or _geotiff_strict_mode(): + raise + warnings.warn( + f"read_geotiff_gpu: GPU decode failed " + f"({type(e).__name__}: {e}); falling back to CPU.", + RuntimeWarning, + stacklevel=2, + ) + arr_gpu = None if arr_gpu is None: - # Fallback: extract tiles via CPU mmap, then GPU decode - src2 = _FileSource(source) - data2 = src2.read_all() - try: + # Fallback: extract tiles via CPU mmap, then GPU decode. For + # sidecar IFDs the tile bytes already live in ``data`` (loaded + # from the .ovr above); re-opening ``source`` would point at the + # base file. Use the in-scope ``data`` directly in that case. + if sidecar_owned_ifd: compressed_tiles = [ - bytes(data2[offsets[i]:offsets[i] + byte_counts[i]]) + bytes(data[offsets[i]:offsets[i] + byte_counts[i]]) for i in range(len(offsets)) ] - finally: - src2.close() + else: + src2 = _FileSource(source) + data2 = src2.read_all() + try: + compressed_tiles = [ + bytes(data2[offsets[i]:offsets[i] + byte_counts[i]]) + for i in range(len(offsets)) + ] + finally: + src2.close() if arr_gpu is None: try: diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py index e8ed5b7b1..beb183613 100644 --- a/xrspatial/geotiff/_reader.py +++ b/xrspatial/geotiff/_reader.py @@ -3407,10 +3407,7 @@ def read_to_array(source, *, window=None, overview_level: int | None = None, geo_info._mask_nodata = inverted_nodata finally: src.close() - if sidecar is not None: - try: - sidecar.data.close() - except Exception: - pass + from ._sidecar import close_sidecar + close_sidecar(sidecar) return arr, geo_info diff --git a/xrspatial/geotiff/_sidecar.py b/xrspatial/geotiff/_sidecar.py index b359a456c..8754d2424 100644 --- a/xrspatial/geotiff/_sidecar.py +++ b/xrspatial/geotiff/_sidecar.py @@ -7,14 +7,12 @@ is overview level 0, sidecar IFD 0 is level 1, sidecar IFD 1 is level 2, and so on. -This module discovers the sidecar next to a local-file source and -parses its IFDs, returning the inputs needed for the reader to switch -to the sidecar's bytes/header when the selected overview level lives -there. Cloud / HTTP / file-like sources are not in scope for the first -cut (see issue #2112): they would need sidecar URLs to be discoverable -in their respective namespaces, which is not a TIFF concern. Callers -that hit a non-local source skip the sidecar discovery entirely and -fall back to base-file-only behaviour. +This module discovers the sidecar next to a local-file, HTTP, or +fsspec source and parses its IFDs, returning the inputs the reader +needs to switch to the sidecar's bytes/header when the requested +overview level lives there. Discovery is gated on a cheap existence +check (local stat, HTTP HEAD, or fsspec ``exists``); a missing sidecar +returns ``None`` and the caller falls back to base-file-only behaviour. """ from __future__ import annotations @@ -34,56 +32,131 @@ class SidecarOverviews(NamedTuple): path: str -def find_sidecar(source) -> str | None: - """Return the path to a sibling ``.ovr`` sidecar if one exists. +def _is_http_url(source: str) -> bool: + return source.startswith(("http://", "https://")) + - Only local file paths are considered. File-like buffers and fsspec - / HTTP URIs are out of scope (their sidecar conventions differ and - discovery is non-trivial); they return ``None`` so callers fall - back to base-file-only behaviour. +def _is_fsspec_uri(source: str) -> bool: + # Same gate ``_reader._is_fsspec_uri`` uses: any ``scheme://`` that is + # not http / https / local-mmap-able. We avoid importing the reader + # helper here to keep the dependency direction one-way. + if "://" not in source: + return False + if _is_http_url(source): + return False + return True - The sidecar lives at ``.ovr`` per the GDAL convention. The - function returns the absolute path on success and ``None`` if the - sidecar does not exist or the source is not a local file path. + +def find_sidecar(source) -> str | None: + """Return the path / URL of a sibling ``.ovr`` sidecar if one exists. + + Scopes: + + * Local file paths -- probe with :func:`os.path.isfile`. + * HTTP / HTTPS URLs -- issue a single HEAD request to + ``.ovr``; treat any 2xx as "exists". + * fsspec URIs (``s3://``, ``gs://``, ``az://``, ``memory://`` ...) + -- call ``fsspec.AbstractFileSystem.exists`` on ``.ovr``. + * File-like buffers (``io.BytesIO``, etc.) -- no sidecar concept, + return ``None``. + + Discovery failures are silent: any network error, missing fsspec, + or unreadable path returns ``None`` so the caller falls back to + base-file-only behaviour. The existence check is bounded and does + not download or open the sidecar itself; :func:`load_sidecar` + handles the actual read once the path is known. """ if not isinstance(source, str): return None - # fsspec URIs (s3://, gs://, az://, memory://, http://, https://) - # share the str type with local paths but should not be probed via - # ``os.path.exists``: the latter would either succeed against a - # local file that happens to share the name or fail and we cannot - # safely fetch a remote sidecar from this layer. - if "://" in source: + candidate = source + ".ovr" + if "://" not in source: + return candidate if os.path.isfile(candidate) else None + if _is_http_url(source): + return _probe_http(candidate) + if _is_fsspec_uri(source): + return _probe_fsspec(candidate) + return None + + +def _probe_http(url: str) -> str | None: + """Return ``url`` if a HEAD request reports an existing object.""" + try: + import urllib.request + req = urllib.request.Request(url, method="HEAD") + # 10 second timeout matches the eager HTTP reader's defaults; + # a stuck remote should not block sidecar discovery indefinitely. + with urllib.request.urlopen(req, timeout=10) as resp: + return url if 200 <= resp.status < 300 else None + except Exception: + # 404 (urllib raises HTTPError) and any network error reach + # here; either way the sidecar is unavailable from our point + # of view and we fall back to base-only. return None - sidecar = source + ".ovr" - if not os.path.isfile(sidecar): + + +def _probe_fsspec(uri: str) -> str | None: + """Return ``uri`` if fsspec reports the object exists.""" + try: + import fsspec + fs, path = fsspec.core.url_to_fs(uri) + return uri if fs.exists(path) else None + except Exception: return None - return sidecar def load_sidecar(path: str) -> SidecarOverviews: """Open and parse a sidecar ``.ovr`` file. - The returned ``data`` is an ``mmap`` object the caller must close - (or wrap in a ``try / finally`` next to its own data close). The - IFD list is the sidecar's full IFD chain in file order; the reader - treats them as overview levels (the first sidecar IFD is level 1 - when the base file holds only a full-resolution IFD, level 2 when - the base file already carries one internal overview, and so on). + Accepts local file paths, HTTP / HTTPS URLs, and fsspec URIs. + Local paths are mmap'd; remote sources are downloaded once via the + matching transport (HTTP via :mod:`urllib`, fsspec URIs via the + fsspec filesystem). The IFD list is the sidecar's full IFD chain + in file order; the reader treats them as overview levels (the + first sidecar IFD is level 1 when the base file holds only a + full-resolution IFD, level 2 when the base file already carries + one internal overview, and so on). + + The returned ``data`` is either an ``mmap`` (local) or ``bytes`` + (remote). Callers should close the mmap variant via + ``data.close()`` when present; the bytes case is no-op. """ - f = open(path, "rb") - try: - # ``mmap`` over the whole file matches the eager reader's - # ``read_to_array`` path for local sources; the lifetime is - # owned by the caller. - data = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) - finally: - f.close() + if "://" not in path: + f = open(path, "rb") + try: + data = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) + finally: + f.close() + elif _is_http_url(path): + import urllib.request + with urllib.request.urlopen(path, timeout=30) as resp: + data = resp.read() + else: + # fsspec URI + import fsspec + with fsspec.open(path, "rb") as f: + data = f.read() header = parse_header(data) ifds = parse_all_ifds(data, header) return SidecarOverviews(data=data, header=header, ifds=ifds, path=path) +def close_sidecar(sidecar: SidecarOverviews | None) -> None: + """Close the sidecar's data buffer if it holds a ``close`` method. + + Mmap data buffers need explicit close; ``bytes`` from a remote + download does not. Safe to call with ``None``. + """ + if sidecar is None: + return + closer = getattr(sidecar.data, "close", None) + if closer is None: + return + try: + closer() + except Exception: + pass + + def attach_sidecar_origin(ifds: list[IFD], data: object, header: TIFFHeader) -> None: diff --git a/xrspatial/geotiff/tests/test_sidecar_ovr_2112.py b/xrspatial/geotiff/tests/test_sidecar_ovr_2112.py index 6093c3fd5..eaab32367 100644 --- a/xrspatial/geotiff/tests/test_sidecar_ovr_2112.py +++ b/xrspatial/geotiff/tests/test_sidecar_ovr_2112.py @@ -192,6 +192,118 @@ def test_missing_sidecar_raises_overview_out_of_range(tmp_path): # --------------------------------------------------------------------------- # File-like buffer source still works for the base level (no sidecar lookup). # --------------------------------------------------------------------------- +@pytest.fixture +def _gpu_or_skip(): + if not _gpu_available(): + pytest.skip("cupy + CUDA required") + + +def _gpu_available() -> bool: + import importlib.util + if importlib.util.find_spec("cupy") is None: + return False + try: + import cupy + return bool(cupy.cuda.is_available()) + except Exception: + return False + + +def test_gpu_eager_reads_sidecar_level_1(_gpu_or_skip): + src = _fixture_or_skip() + cpu = open_geotiff(str(src), overview_level=1) + gpu = open_geotiff(str(src), overview_level=1, gpu=True) + assert gpu.shape == cpu.shape + np.testing.assert_array_equal(gpu.data.get(), cpu.values) + + +def test_gpu_eager_reads_sidecar_level_2(_gpu_or_skip): + src = _fixture_or_skip() + cpu = open_geotiff(str(src), overview_level=2) + gpu = open_geotiff(str(src), overview_level=2, gpu=True) + assert gpu.shape == cpu.shape + np.testing.assert_array_equal(gpu.data.get(), cpu.values) + + +def test_gpu_eager_base_level_unchanged(_gpu_or_skip): + src = _fixture_or_skip() + gpu = open_geotiff(str(src), gpu=True) + assert gpu.shape == (64, 64) + + +# --------------------------------------------------------------------------- +# HTTP sidecar discovery: tests use a tiny local HTTP server so the +# probe and download paths are exercised without a network round-trip. +# --------------------------------------------------------------------------- +def _start_http_server(directory): + import http.server + import socketserver + import threading + + class _Handler(http.server.SimpleHTTPRequestHandler): + def log_message(self, *a, **kw): + return # silence + + def __init__(self, *a, **kw): + super().__init__(*a, directory=str(directory), **kw) + + httpd = socketserver.TCPServer(("127.0.0.1", 0), _Handler) + thread = threading.Thread(target=httpd.serve_forever, daemon=True) + thread.start() + return httpd, httpd.server_address[1] + + +def test_find_sidecar_http_probe_returns_url_when_present(tmp_path): + src = _fixture_or_skip() + import shutil + shutil.copy(src, tmp_path / "x.tif") + shutil.copy(str(src) + ".ovr", tmp_path / "x.tif.ovr") + httpd, port = _start_http_server(tmp_path) + try: + url = f"http://127.0.0.1:{port}/x.tif" + assert find_sidecar(url) == url + ".ovr" + finally: + httpd.shutdown() + + +def test_find_sidecar_http_probe_returns_none_when_missing(tmp_path): + src = _fixture_or_skip() + import shutil + shutil.copy(src, tmp_path / "x.tif") # no .ovr copied + httpd, port = _start_http_server(tmp_path) + try: + url = f"http://127.0.0.1:{port}/x.tif" + assert find_sidecar(url) is None + finally: + httpd.shutdown() + + +def test_load_sidecar_http_returns_ifds(tmp_path): + src = _fixture_or_skip() + import shutil + shutil.copy(str(src) + ".ovr", tmp_path / "x.tif.ovr") + httpd, port = _start_http_server(tmp_path) + try: + url = f"http://127.0.0.1:{port}/x.tif.ovr" + sidecar = load_sidecar(url) + assert len(sidecar.ifds) == 2 + assert (sidecar.ifds[0].width, sidecar.ifds[0].height) == (32, 32) + assert (sidecar.ifds[1].width, sidecar.ifds[1].height) == (16, 16) + finally: + httpd.shutdown() + + +def test_find_sidecar_fsspec_probe_returns_uri_when_present(tmp_path): + pytest.importorskip("fsspec") + src = _fixture_or_skip() + import shutil + shutil.copy(src, tmp_path / "y.tif") + shutil.copy(str(src) + ".ovr", tmp_path / "y.tif.ovr") + # file:// is a valid fsspec scheme that uses LocalFileSystem. + uri = f"file://{tmp_path}/y.tif" + assert find_sidecar(uri) == uri + ".ovr" + + def test_file_like_source_reads_base_without_sidecar(): src = _fixture_or_skip() with open(src, "rb") as f: From 722528e322395e9b98c980af94f8874c387d1a25 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Tue, 19 May 2026 06:50:46 -0700 Subject: [PATCH 3/3] geotiff: address re-review suggestions and nits on .ovr sidecar (#2114) Apply every item from the re-review pass. - Route the HTTP sidecar probe and download through ``_HTTPSource`` so both inherit the eager reader's defences: SSRF allow-list, IP pinning across redirect re-validation, shared urllib3 PoolManager with retries, and ``redirect=False`` with manual ``Location`` re- check. The probe uses a 1-byte Range GET rather than HEAD because HEAD support is inconsistent across CDNs / object stores. Removes the duplicated 10s urllib timeout path. - Close the sidecar mmap deterministically in ``read_geotiff_gpu`` by moving the GPU decode into the same try/finally that owns the metadata parse. The previous "rely on GC" path leaked file descriptors on long-running processes opening many GeoTIFFs in sequence; the close now runs after the decode has finished reading ``data``. - Replace the IFD-mutating ``attach_sidecar_origin`` (``object.__set attr__`` on every parsed IFD) with a return value -- a request- scoped ``dict[id(ifd), (data, header)]`` -- so a future caller that caches the parsed IFD list does not inherit stale ``_source_data`` / ``_source_header`` attributes from a previous request. - Narrow ``SidecarOverviews.data`` from ``object`` to a new ``SidecarBuffer = Union[mmap.mmap, bytes]`` alias so the reader sees the actual buffer variants. Carry the alias on ``attach_sidecar_origin``'s signature too. - Drop the duplicate ``_is_fsspec_uri`` from ``_sidecar`` and import the helper from ``_reader``. ``_reader`` imports ``_sidecar`` lazily inside its function bodies, so the top-level reverse import does not form a cycle at module load time. - Tests: - Add ``test_find_sidecar_http_probe_rejects_loopback_without_env_override`` to pin the SSRF guard's silent-fail-to-base contract. - Existing loopback-server probes opt into ``XRSPATIAL_GEOTIFF_ALLOW_ PRIVATE_HOSTS=1`` (same pattern other HTTP tests use) so they keep passing through the new SSRF-gated probe. Verified: full ``xrspatial/geotiff/tests/`` suite 4046 passed, 25 skipped. --- xrspatial/geotiff/__init__.py | 4 + xrspatial/geotiff/_backends/gpu.py | 562 +++++++++--------- xrspatial/geotiff/_reader.py | 6 +- xrspatial/geotiff/_sidecar.py | 114 ++-- .../geotiff/tests/test_sidecar_ovr_2112.py | 33 +- 5 files changed, 393 insertions(+), 326 deletions(-) diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index 7fb1e8086..c5ccce5ae 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -230,6 +230,10 @@ def _read_geo_info(source, *, overview_level: int | None = None): sidecar_path = find_sidecar(source) if sidecar_path is not None: sidecar = load_sidecar(sidecar_path) + # Metadata-only path: drop the origin mapping. The reader + # only needs the merged IFD list to resolve the requested + # ``overview_level`` against; strip/tile bytes are sliced by + # ``read_to_array`` on the actual read. attach_sidecar_origin( sidecar.ifds, sidecar.data, sidecar.header) ifds = ifds + sidecar.ifds diff --git a/xrspatial/geotiff/_backends/gpu.py b/xrspatial/geotiff/_backends/gpu.py index 23287c1d1..5f7bb2945 100644 --- a/xrspatial/geotiff/_backends/gpu.py +++ b/xrspatial/geotiff/_backends/gpu.py @@ -305,10 +305,11 @@ def read_geotiff_gpu(source: str, *, attach_sidecar_origin, close_sidecar, find_sidecar, load_sidecar, ) + sidecar_origin: dict[int, tuple] = {} sidecar_path = find_sidecar(source) if sidecar_path is not None: sidecar = load_sidecar(sidecar_path) - attach_sidecar_origin( + sidecar_origin = attach_sidecar_origin( sidecar.ifds, sidecar.data, sidecar.header) ifds = ifds + sidecar.ifds @@ -318,11 +319,9 @@ def read_geotiff_gpu(source: str, *, # Swap base data / header for the sidecar's buffers when the # requested overview level lives in the sidecar. Subsequent # tile / strip reads slice the right buffer. - sidecar_data = getattr(ifd, "_source_data", None) - sidecar_header = getattr(ifd, "_source_header", None) - if sidecar_data is not None and sidecar_header is not None: - data = sidecar_data - header = sidecar_header + origin = sidecar_origin.get(id(ifd)) + if origin is not None: + data, header = origin sidecar_owned_ifd = True bps = resolve_bits_per_sample(ifd.bits_per_sample) @@ -517,140 +516,173 @@ def read_geotiff_gpu(source: str, *, # read OOB otherwise. See issue #1219. validate_tile_layout(ifd) - finally: - src.close() - # ``sidecar`` (if any) holds an mmap that is sliced by the GPU - # decode below; do not close it here. Python's GC closes the - # underlying file descriptor when ``sidecar`` falls out of - # function scope at return, which is the same lifetime the - # base-file ``data`` already relies on after ``src.close()``. - - # GPU decode: try GDS (SSD→GPU direct) first, then CPU mmap path. - # Sparse tiles (byte_count == 0) are unsupported on the GPU pipeline; - # the CPU reader fills them with nodata and copies onto the GPU. - has_sparse_tile = any(bc == 0 for bc in byte_counts) - # LERC tiles can carry a per-pixel valid mask that GDAL writes - # zero-filled in the data array. Compute the nodata fill the same - # way the CPU reader does so the GPU decode path can restore it - # post-assembly (mirrors PR #1529 for the CPU path). Only the - # chunky (planar=1) GPU path threads masked_fill into its kernel - # call below; the planar=2 per-band branch falls back to the CPU - # reader for masked pixels (rare in practice -- LERC files - # typically use chunky layout). - masked_fill = (_resolve_masked_fill(ifd.nodata_str, file_dtype) - if compression == COMPRESSION_LERC else None) - - # Track whether the array we end up with was already orientation-flipped - # by `read_to_array`. Any path that falls back to CPU decode picks up - # the orientation remap from PR #1521 + #1537 for free; the pure GPU - # paths still need the explicit remap added in #1540. - arr_was_cpu_decoded = False - # When a CPU fallback runs, ``read_to_array`` has already applied the - # MinIsWhite inversion and stashed the post-inversion sentinel on - # ``_mask_nodata``. Keep that geo_info alongside the pre-extracted one - # so the downstream nodata mask compares against the correct value - # (Copilot review of #1817). - _cpu_fallback_geo = None - - # PlanarConfiguration=2 (separate bands): each band has its own list - # of tiles back-to-back in TileOffsets / TileByteCounts. The GPU - # tile-assembly kernel assumes a single chunky tile sequence with - # bytes_per_pixel = itemsize * samples, so it cannot handle planar=2 - # directly. Decode each band's tile slab as a single-band image, then - # stack into (H, W, samples). For planar=1 (chunky) the existing - # single-pass kernel is correct. Sparse-tile files always route to - # the CPU reader regardless of planar config. - if planar == 2 and samples > 1 and not has_sparse_tile: - tiles_across = math.ceil(width / tw) - tiles_down = math.ceil(height / th) - tiles_per_band = tiles_across * tiles_down - # validate_tile_layout already requires len(offsets) >= the grid; - # accept extra trailing entries (some writers emit padding) and - # only consume the first tiles_per_band * samples. - expected_min = tiles_per_band * samples - if len(offsets) < expected_min: - raise ValueError( - f"PlanarConfiguration=2 expects at least {expected_min} " - f"TileOffsets entries ({tiles_across} x {tiles_down} x " - f"{samples} bands), got {len(offsets)}" - ) - # Lazy shared file read for the per-band stage-2 fallback. When - # every band's GDS path succeeds, _read_once is never called - # and we skip the read_all() entirely; when any band falls - # back, the first call materialises the bytes and subsequent - # bands reuse the same buffer (so N bands cost at most one - # read_all(), not N). - _shared_data_cache: list = [] - - def _read_once(): - if not _shared_data_cache: - src2 = _FileSource(source) - try: - _shared_data_cache.append(src2.read_all()) - finally: - src2.close() - return _shared_data_cache[0] - - band_arrays = [] - cpu_fallback_needed = False - for band_idx in range(samples): - b0 = band_idx * tiles_per_band - b1 = b0 + tiles_per_band - band_offsets = list(offsets[b0:b1]) - band_byte_counts = list(byte_counts[b0:b1]) - band_arr = _gpu_decode_single_band_tiles( - source, _read_once, band_offsets, band_byte_counts, - tw, th, width, height, - compression, predictor, file_dtype, - byte_order=header.byte_order, - gpu=gpu, - ) - if band_arr is None: - # Auto-mode signal: stage-2 GPU decode failed for this - # band. There's no per-band CPU decode path, so fall - # back to a whole-image CPU read + GPU upload, matching - # the chunky path's auto-mode semantics. - cpu_fallback_needed = True - break - band_arrays.append(band_arr) - if cpu_fallback_needed: - # Drop read_to_array's geo_info for orientation transform - # handling (below operates on our pre-extracted geo_info so the - # 2/3/4 case is covered regardless of #1539's merge state), but - # keep it on ``_cpu_fallback_geo`` so the MinIsWhite-aware nodata - # mask below sees ``_mask_nodata``. + # GPU decode: try GDS (SSD→GPU direct) first, then CPU mmap path. + # Sparse tiles (byte_count == 0) are unsupported on the GPU pipeline; + # the CPU reader fills them with nodata and copies onto the GPU. + has_sparse_tile = any(bc == 0 for bc in byte_counts) + # LERC tiles can carry a per-pixel valid mask that GDAL writes + # zero-filled in the data array. Compute the nodata fill the same + # way the CPU reader does so the GPU decode path can restore it + # post-assembly (mirrors PR #1529 for the CPU path). Only the + # chunky (planar=1) GPU path threads masked_fill into its kernel + # call below; the planar=2 per-band branch falls back to the CPU + # reader for masked pixels (rare in practice -- LERC files + # typically use chunky layout). + masked_fill = (_resolve_masked_fill(ifd.nodata_str, file_dtype) + if compression == COMPRESSION_LERC else None) + + # Track whether the array we end up with was already orientation-flipped + # by `read_to_array`. Any path that falls back to CPU decode picks up + # the orientation remap from PR #1521 + #1537 for free; the pure GPU + # paths still need the explicit remap added in #1540. + arr_was_cpu_decoded = False + # When a CPU fallback runs, ``read_to_array`` has already applied the + # MinIsWhite inversion and stashed the post-inversion sentinel on + # ``_mask_nodata``. Keep that geo_info alongside the pre-extracted one + # so the downstream nodata mask compares against the correct value + # (Copilot review of #1817). + _cpu_fallback_geo = None + + # PlanarConfiguration=2 (separate bands): each band has its own list + # of tiles back-to-back in TileOffsets / TileByteCounts. The GPU + # tile-assembly kernel assumes a single chunky tile sequence with + # bytes_per_pixel = itemsize * samples, so it cannot handle planar=2 + # directly. Decode each band's tile slab as a single-band image, then + # stack into (H, W, samples). For planar=1 (chunky) the existing + # single-pass kernel is correct. Sparse-tile files always route to + # the CPU reader regardless of planar config. + if planar == 2 and samples > 1 and not has_sparse_tile: + tiles_across = math.ceil(width / tw) + tiles_down = math.ceil(height / th) + tiles_per_band = tiles_across * tiles_down + # validate_tile_layout already requires len(offsets) >= the grid; + # accept extra trailing entries (some writers emit padding) and + # only consume the first tiles_per_band * samples. + expected_min = tiles_per_band * samples + if len(offsets) < expected_min: + raise ValueError( + f"PlanarConfiguration=2 expects at least {expected_min} " + f"TileOffsets entries ({tiles_across} x {tiles_down} x " + f"{samples} bands), got {len(offsets)}" + ) + # Lazy shared file read for the per-band stage-2 fallback. When + # every band's GDS path succeeds, _read_once is never called + # and we skip the read_all() entirely; when any band falls + # back, the first call materialises the bytes and subsequent + # bands reuse the same buffer (so N bands cost at most one + # read_all(), not N). + _shared_data_cache: list = [] + + def _read_once(): + if not _shared_data_cache: + src2 = _FileSource(source) + try: + _shared_data_cache.append(src2.read_all()) + finally: + src2.close() + return _shared_data_cache[0] + + band_arrays = [] + cpu_fallback_needed = False + for band_idx in range(samples): + b0 = band_idx * tiles_per_band + b1 = b0 + tiles_per_band + band_offsets = list(offsets[b0:b1]) + band_byte_counts = list(byte_counts[b0:b1]) + band_arr = _gpu_decode_single_band_tiles( + source, _read_once, band_offsets, band_byte_counts, + tw, th, width, height, + compression, predictor, file_dtype, + byte_order=header.byte_order, + gpu=gpu, + ) + if band_arr is None: + # Auto-mode signal: stage-2 GPU decode failed for this + # band. There's no per-band CPU decode path, so fall + # back to a whole-image CPU read + GPU upload, matching + # the chunky path's auto-mode semantics. + cpu_fallback_needed = True + break + band_arrays.append(band_arr) + if cpu_fallback_needed: + # Drop read_to_array's geo_info for orientation transform + # handling (below operates on our pre-extracted geo_info so the + # 2/3/4 case is covered regardless of #1539's merge state), but + # keep it on ``_cpu_fallback_geo`` so the MinIsWhite-aware nodata + # mask below sees ``_mask_nodata``. + arr_cpu, _cpu_fallback_geo = _read_to_array( + source, overview_level=overview_level) + arr_gpu = cupy.asarray(arr_cpu) + arr_was_cpu_decoded = True + else: + arr_gpu = cupy.stack(band_arrays, axis=2) + if arr_gpu.shape != (height, width, samples): + raise RuntimeError( + f"planar=2 GPU assembly produced shape " + f"{arr_gpu.shape}, expected " + f"({height}, {width}, {samples})" + ) + elif has_sparse_tile: arr_cpu, _cpu_fallback_geo = _read_to_array( source, overview_level=overview_level) arr_gpu = cupy.asarray(arr_cpu) arr_was_cpu_decoded = True else: - arr_gpu = cupy.stack(band_arrays, axis=2) - if arr_gpu.shape != (height, width, samples): - raise RuntimeError( - f"planar=2 GPU assembly produced shape " - f"{arr_gpu.shape}, expected " - f"({height}, {width}, {samples})" - ) - elif has_sparse_tile: - arr_cpu, _cpu_fallback_geo = _read_to_array( - source, overview_level=overview_level) - arr_gpu = cupy.asarray(arr_cpu) - arr_was_cpu_decoded = True - else: - from .._gpu_decode import gpu_decode_tiles_from_file - arr_gpu = None - - if sidecar_owned_ifd: - # The selected IFD lives in a sibling .tif.ovr sidecar; tile - # offsets index into the sidecar's bytes, not the base file - # path that ``gpu_decode_tiles_from_file`` would open via - # kvikio. Skip GDS and let the CPU-mmap fallback below slice - # the already-loaded sidecar buffer (issue #2112). + from .._gpu_decode import gpu_decode_tiles_from_file arr_gpu = None - else: + + if sidecar_owned_ifd: + # The selected IFD lives in a sibling .tif.ovr sidecar; tile + # offsets index into the sidecar's bytes, not the base file + # path that ``gpu_decode_tiles_from_file`` would open via + # kvikio. Skip GDS and let the CPU-mmap fallback below slice + # the already-loaded sidecar buffer (issue #2112). + arr_gpu = None + else: + try: + arr_gpu = gpu_decode_tiles_from_file( + source, offsets, byte_counts, + tw, th, width, height, + compression, predictor, file_dtype, samples, + byte_order=header.byte_order, + masked_fill=masked_fill, + ) + except Exception as e: + if gpu == 'strict' or _geotiff_strict_mode(): + raise + warnings.warn( + f"read_geotiff_gpu: GPU decode failed " + f"({type(e).__name__}: {e}); falling back to CPU.", + RuntimeWarning, + stacklevel=2, + ) + arr_gpu = None + + if arr_gpu is None: + # Fallback: extract tiles via CPU mmap, then GPU decode. For + # sidecar IFDs the tile bytes already live in ``data`` (loaded + # from the .ovr above); re-opening ``source`` would point at the + # base file. Use the in-scope ``data`` directly in that case. + if sidecar_owned_ifd: + compressed_tiles = [ + bytes(data[offsets[i]:offsets[i] + byte_counts[i]]) + for i in range(len(offsets)) + ] + else: + src2 = _FileSource(source) + data2 = src2.read_all() + try: + compressed_tiles = [ + bytes(data2[offsets[i]:offsets[i] + byte_counts[i]]) + for i in range(len(offsets)) + ] + finally: + src2.close() + + if arr_gpu is None: try: - arr_gpu = gpu_decode_tiles_from_file( - source, offsets, byte_counts, + arr_gpu = gpu_decode_tiles( + compressed_tiles, tw, th, width, height, compression, predictor, file_dtype, samples, byte_order=header.byte_order, @@ -665,162 +697,134 @@ def _read_once(): RuntimeWarning, stacklevel=2, ) - arr_gpu = None - - if arr_gpu is None: - # Fallback: extract tiles via CPU mmap, then GPU decode. For - # sidecar IFDs the tile bytes already live in ``data`` (loaded - # from the .ovr above); re-opening ``source`` would point at the - # base file. Use the in-scope ``data`` directly in that case. - if sidecar_owned_ifd: - compressed_tiles = [ - bytes(data[offsets[i]:offsets[i] + byte_counts[i]]) - for i in range(len(offsets)) - ] - else: - src2 = _FileSource(source) - data2 = src2.read_all() - try: - compressed_tiles = [ - bytes(data2[offsets[i]:offsets[i] + byte_counts[i]]) - for i in range(len(offsets)) - ] - finally: - src2.close() - - if arr_gpu is None: - try: - arr_gpu = gpu_decode_tiles( - compressed_tiles, - tw, th, width, height, - compression, predictor, file_dtype, samples, - byte_order=header.byte_order, - masked_fill=masked_fill, - ) - except Exception as e: - if gpu == 'strict' or _geotiff_strict_mode(): - raise - warnings.warn( - f"read_geotiff_gpu: GPU decode failed " - f"({type(e).__name__}: {e}); falling back to CPU.", - RuntimeWarning, - stacklevel=2, - ) - arr_cpu, _cpu_fallback_geo = _read_to_array( - source, overview_level=overview_level) - arr_gpu = cupy.asarray(arr_cpu) - arr_was_cpu_decoded = True + arr_cpu, _cpu_fallback_geo = _read_to_array( + source, overview_level=overview_level) + arr_gpu = cupy.asarray(arr_cpu) + arr_was_cpu_decoded = True + + # Multi-band tiled output must be (H, W, samples) regardless of planar + # config -- catch any shape regression in the kernels before we attach + # dims/coords below. Plain `raise` rather than `assert` so the check + # survives `python -O`. + if samples > 1: + if (arr_gpu.shape[:2] != (height, width) + or arr_gpu.shape[2] != samples): + raise RuntimeError( + f"GPU multi-band tile assembly produced shape " + f"{arr_gpu.shape}, expected " + f"({height}, {width}, {samples})" + ) - # Multi-band tiled output must be (H, W, samples) regardless of planar - # config -- catch any shape regression in the kernels before we attach - # dims/coords below. Plain `raise` rather than `assert` so the check - # survives `python -O`. - if samples > 1: - if (arr_gpu.shape[:2] != (height, width) - or arr_gpu.shape[2] != samples): - raise RuntimeError( - f"GPU multi-band tile assembly produced shape " - f"{arr_gpu.shape}, expected " - f"({height}, {width}, {samples})" - ) + # Apply the TIFF Orientation tag (274). The pure GPU paths land here + # with a raw stored-order buffer; the CPU-fallback paths land here + # with arr_gpu already remapped (read_to_array does the data flip) + # but with their pre-orientation geo_info (we discarded the one + # read_to_array returned because it does not handle 2/3/4 today). + # Skip the GPU array remap on CPU-decoded paths to avoid a double + # flip, but always apply the geo_info update so coords match. + if orientation != 1: + if not arr_was_cpu_decoded: + arr_gpu = _apply_orientation_gpu(arr_gpu, orientation) + geo_info = _apply_orientation_geo_info( + geo_info, orientation, file_h=height, file_w=width) + + _mw_mask_nodata = None + if (ifd.photometric == 0 and samples == 1 and not arr_was_cpu_decoded): + from .._reader import _miniswhite_inverted_nodata as _miw_inv_nd + gpu_dtype = np.dtype(str(arr_gpu.dtype)) + # Compute the post-MinIsWhite sentinel BEFORE inverting the array, + # so the downstream ``_apply_nodata_mask_gpu`` call compares + # against the right value (#1809). + _mw_mask_nodata = _miw_inv_nd(geo_info.nodata, ifd, gpu_dtype) + if gpu_dtype.kind == 'u': + arr_gpu = np.iinfo(gpu_dtype).max - arr_gpu + elif gpu_dtype.kind == 'f': + arr_gpu = -arr_gpu + + # Apply nodata mask + record sentinel so the GPU read agrees with the + # CPU eager path (issue #1542). Without this, integer rasters keep the + # literal sentinel value and float rasters keep the sentinel rather + # than NaN -- a silent backend divergence. Apply before the optional + # dtype cast so the float promotion for masked integer rasters doesn't + # surprise a user-supplied dtype. + nodata = geo_info.nodata + if nodata is not None and mask_nodata: + # When MinIsWhite was applied, the mask must use the inverted + # sentinel; otherwise the original sentinel. The pure GPU path + # records the inverted sentinel in ``_mw_mask_nodata`` above; the + # CPU-fallback paths (sparse-tile, planar=2 auto-fallback, and + # post-decode CPU fallback) get it from ``read_to_array`` via + # ``_cpu_fallback_geo._mask_nodata`` (Copilot review of #1817). + if _mw_mask_nodata is not None: + _gpu_mask_value = _mw_mask_nodata + elif _cpu_fallback_geo is not None: + _gpu_mask_value = getattr( + _cpu_fallback_geo, '_mask_nodata', nodata) + else: + _gpu_mask_value = nodata + arr_gpu = _apply_nodata_mask_gpu(arr_gpu, _gpu_mask_value) - # Apply the TIFF Orientation tag (274). The pure GPU paths land here - # with a raw stored-order buffer; the CPU-fallback paths land here - # with arr_gpu already remapped (read_to_array does the data flip) - # but with their pre-orientation geo_info (we discarded the one - # read_to_array returned because it does not handle 2/3/4 today). - # Skip the GPU array remap on CPU-decoded paths to avoid a double - # flip, but always apply the geo_info update so coords match. - if orientation != 1: - if not arr_was_cpu_decoded: - arr_gpu = _apply_orientation_gpu(arr_gpu, orientation) - geo_info = _apply_orientation_geo_info( - geo_info, orientation, file_h=height, file_w=width) - - _mw_mask_nodata = None - if (ifd.photometric == 0 and samples == 1 and not arr_was_cpu_decoded): - from .._reader import _miniswhite_inverted_nodata as _miw_inv_nd - gpu_dtype = np.dtype(str(arr_gpu.dtype)) - # Compute the post-MinIsWhite sentinel BEFORE inverting the array, - # so the downstream ``_apply_nodata_mask_gpu`` call compares - # against the right value (#1809). - _mw_mask_nodata = _miw_inv_nd(geo_info.nodata, ifd, gpu_dtype) - if gpu_dtype.kind == 'u': - arr_gpu = np.iinfo(gpu_dtype).max - arr_gpu - elif gpu_dtype.kind == 'f': - arr_gpu = -arr_gpu - - # Apply nodata mask + record sentinel so the GPU read agrees with the - # CPU eager path (issue #1542). Without this, integer rasters keep the - # literal sentinel value and float rasters keep the sentinel rather - # than NaN -- a silent backend divergence. Apply before the optional - # dtype cast so the float promotion for masked integer rasters doesn't - # surprise a user-supplied dtype. - nodata = geo_info.nodata - if nodata is not None and mask_nodata: - # When MinIsWhite was applied, the mask must use the inverted - # sentinel; otherwise the original sentinel. The pure GPU path - # records the inverted sentinel in ``_mw_mask_nodata`` above; the - # CPU-fallback paths (sparse-tile, planar=2 auto-fallback, and - # post-decode CPU fallback) get it from ``read_to_array`` via - # ``_cpu_fallback_geo._mask_nodata`` (Copilot review of #1817). - if _mw_mask_nodata is not None: - _gpu_mask_value = _mw_mask_nodata - elif _cpu_fallback_geo is not None: - _gpu_mask_value = getattr( - _cpu_fallback_geo, '_mask_nodata', nodata) - else: - _gpu_mask_value = nodata - arr_gpu = _apply_nodata_mask_gpu(arr_gpu, _gpu_mask_value) + if dtype is not None: + target = np.dtype(dtype) + _validate_dtype_cast(np.dtype(str(arr_gpu.dtype)), target) + arr_gpu = arr_gpu.astype(target) - if dtype is not None: - target = np.dtype(dtype) - _validate_dtype_cast(np.dtype(str(arr_gpu.dtype)), target) - arr_gpu = arr_gpu.astype(target) + # Build DataArray + if name is None: + import os + name = os.path.splitext(os.path.basename(source))[0] - # Build DataArray - if name is None: - import os - name = os.path.splitext(os.path.basename(source))[0] + _validate_read_geo_info( + geo_info, window=window, + allow_rotated=allow_rotated, + allow_unparseable_crs=allow_unparseable_crs, + ) - _validate_read_geo_info( - geo_info, window=window, - allow_rotated=allow_rotated, - allow_unparseable_crs=allow_unparseable_crs, - ) + attrs = {} + _populate_attrs_from_geo_info(attrs, geo_info, window=window) + # ``attrs['nodata']`` + ``attrs['masked_nodata']`` reflect the + # post-mask, post-cast array dtype (issue #1988). + _set_nodata_attrs( + attrs, nodata, array_dtype=np.dtype(str(arr_gpu.dtype)), + ) - attrs = {} - _populate_attrs_from_geo_info(attrs, geo_info, window=window) - # ``attrs['nodata']`` + ``attrs['masked_nodata']`` reflect the - # post-mask, post-cast array dtype (issue #1988). - _set_nodata_attrs( - attrs, nodata, array_dtype=np.dtype(str(arr_gpu.dtype)), - ) + # Apply window/band slicing post-decode. Coords are derived from the + # sliced array so the (y, x) labels line up with the user's requested + # subrectangle. This mirrors the ``open_geotiff`` / ``read_geotiff_dask`` + # contract: ``attrs['transform']`` always carries the full-source + # GeoTransform shifted to the window origin (via + # ``_populate_attrs_from_geo_info(..., window=window)``), while + # ``coords['y']`` / ``coords['x']`` cover only the windowed cells. + arr_gpu, coords = _gpu_apply_window_band( + arr_gpu, geo_info, window=window, band=band) + + if arr_gpu.ndim == 3: + dims = ['y', 'x', 'band'] + coords['band'] = np.arange(arr_gpu.shape[2]) + else: + dims = ['y', 'x'] - # Apply window/band slicing post-decode. Coords are derived from the - # sliced array so the (y, x) labels line up with the user's requested - # subrectangle. This mirrors the ``open_geotiff`` / ``read_geotiff_dask`` - # contract: ``attrs['transform']`` always carries the full-source - # GeoTransform shifted to the window origin (via - # ``_populate_attrs_from_geo_info(..., window=window)``), while - # ``coords['y']`` / ``coords['x']`` cover only the windowed cells. - arr_gpu, coords = _gpu_apply_window_band( - arr_gpu, geo_info, window=window, band=band) - - if arr_gpu.ndim == 3: - dims = ['y', 'x', 'band'] - coords['band'] = np.arange(arr_gpu.shape[2]) - else: - dims = ['y', 'x'] + result = xr.DataArray(arr_gpu, dims=dims, coords=coords, + name=name, attrs=attrs) - result = xr.DataArray(arr_gpu, dims=dims, coords=coords, - name=name, attrs=attrs) + # ``chunks=`` is handled at function entry via + # ``_read_geotiff_gpu_chunked`` for real out-of-core support; this + # eager path always returns a non-chunked CuPy-backed DataArray. - # ``chunks=`` is handled at function entry via - # ``_read_geotiff_gpu_chunked`` for real out-of-core support; this - # eager path always returns a non-chunked CuPy-backed DataArray. + return result - return result + finally: + src.close() + # Close the sidecar mmap (if any) deterministically once + # the GPU decode has finished reading ``data``. The decode + # path slices ``data`` directly when ``sidecar_owned_ifd`` + # is true, so the close must happen here -- AFTER the + # decode -- rather than in the early metadata-parse + # finally above. Closing here also bounds the file + # descriptor count for long-running processes opening + # many GeoTIFFs in sequence (review of #2112). + if sidecar is not None: + close_sidecar(sidecar) def _gds_chunk_path_available(source, ifd, has_sparse_tile, orientation): diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py index beb183613..51a1e9764 100644 --- a/xrspatial/geotiff/_reader.py +++ b/xrspatial/geotiff/_reader.py @@ -3273,10 +3273,11 @@ def read_to_array(source, *, window=None, overview_level: int | None = None, from ._sidecar import ( attach_sidecar_origin, find_sidecar, load_sidecar, ) + sidecar_origin: dict[int, tuple] = {} sidecar_path = find_sidecar(source) if sidecar_path is not None: sidecar = load_sidecar(sidecar_path) - attach_sidecar_origin( + sidecar_origin = attach_sidecar_origin( sidecar.ifds, sidecar.data, sidecar.header) ifds = ifds + sidecar.ifds @@ -3286,8 +3287,7 @@ def read_to_array(source, *, window=None, overview_level: int | None = None, # If the selected IFD came from the sidecar, swap the data / # header used for strip / tile reads below so byte offsets # resolve against the right buffer. - ifd_data = getattr(ifd, '_source_data', None) or data - ifd_header = getattr(ifd, '_source_header', None) or header + ifd_data, ifd_header = sidecar_origin.get(id(ifd), (data, header)) bps = resolve_bits_per_sample(ifd.bits_per_sample) dtype = tiff_dtype_to_numpy(bps, ifd.sample_format) diff --git a/xrspatial/geotiff/_sidecar.py b/xrspatial/geotiff/_sidecar.py index 8754d2424..9736136c8 100644 --- a/xrspatial/geotiff/_sidecar.py +++ b/xrspatial/geotiff/_sidecar.py @@ -18,15 +18,26 @@ import mmap import os -from typing import NamedTuple +from typing import NamedTuple, Union from ._header import IFD, TIFFHeader, parse_all_ifds, parse_header +# Single source of truth for the fsspec gate; mirroring it here drifts +# silently the moment ``_reader._is_fsspec_uri`` learns a new scheme. +# ``_reader`` imports ``_sidecar`` lazily (inside functions), so this +# top-level import does not form a cycle at module load time. +from ._reader import _is_fsspec_uri + + +#: Type of the bytes-like buffer a sidecar carries: an mmap for local +#: files, bytes for HTTP / fsspec downloads. Narrowed from ``object`` +#: so the reader sees the actual variants when slicing IFD data. +SidecarBuffer = Union[mmap.mmap, bytes] class SidecarOverviews(NamedTuple): """Bytes, header, and IFDs from a sibling ``.tif.ovr`` sidecar.""" - data: object # bytes-like (mmap or bytes) + data: SidecarBuffer header: TIFFHeader ifds: list[IFD] path: str @@ -36,25 +47,16 @@ def _is_http_url(source: str) -> bool: return source.startswith(("http://", "https://")) -def _is_fsspec_uri(source: str) -> bool: - # Same gate ``_reader._is_fsspec_uri`` uses: any ``scheme://`` that is - # not http / https / local-mmap-able. We avoid importing the reader - # helper here to keep the dependency direction one-way. - if "://" not in source: - return False - if _is_http_url(source): - return False - return True - - def find_sidecar(source) -> str | None: """Return the path / URL of a sibling ``.ovr`` sidecar if one exists. Scopes: * Local file paths -- probe with :func:`os.path.isfile`. - * HTTP / HTTPS URLs -- issue a single HEAD request to - ``.ovr``; treat any 2xx as "exists". + * HTTP / HTTPS URLs -- issue a 1-byte Range GET to ``.ovr`` + through the eager reader's ``_HTTPSource`` so the probe inherits + SSRF validation, IP pinning, the shared urllib3 pool, and manual + redirect re-validation. See :func:`_probe_http`. * fsspec URIs (``s3://``, ``gs://``, ``az://``, ``memory://`` ...) -- call ``fsspec.AbstractFileSystem.exists`` on ``.ovr``. * File-like buffers (``io.BytesIO``, etc.) -- no sidecar concept, @@ -79,18 +81,41 @@ def find_sidecar(source) -> str | None: def _probe_http(url: str) -> str | None: - """Return ``url`` if a HEAD request reports an existing object.""" + """Return ``url`` if an existence probe succeeds via ``_HTTPSource``. + + Routes through :class:`xrspatial.geotiff._reader._HTTPSource` so the + probe inherits the same defences the eager reader applies to every + HTTP fetch: + + * SSRF allow-list via :func:`_validate_http_url` -- loopback / + link-local / private hostnames are rejected unless + ``XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS=1`` is set (#1664). + * IP pinning so a DNS-rebind between probe and download cannot + flip the target (#1846). + * Shared urllib3 ``PoolManager`` with retries and connection reuse, + so opening many sidecar-bearing files in a row keeps TCP/TLS + state warm instead of re-handshaking per probe. + * ``redirect=False`` with manual re-validation of every ``Location`` + header, so a 302 from a public host to a private one is rejected + the same way the eager reader rejects it. + + Existence is probed with a 1-byte Range GET (``bytes=0-0``) rather + than HEAD because HEAD support is inconsistent across CDNs and + object stores; a 1-byte GET works everywhere Range works (which is + the same precondition the eager COG reader assumes) and the + one-byte payload is cheaper than the headers urllib3 already + fetches for either method. + + Any failure -- ``UnsafeURLError``, 4xx/5xx, network timeout, + missing dependency -- returns ``None`` so the caller falls back + to base-only behaviour without raising. + """ try: - import urllib.request - req = urllib.request.Request(url, method="HEAD") - # 10 second timeout matches the eager HTTP reader's defaults; - # a stuck remote should not block sidecar discovery indefinitely. - with urllib.request.urlopen(req, timeout=10) as resp: - return url if 200 <= resp.status < 300 else None + from ._reader import _HTTPSource + src = _HTTPSource(url) + src.read_range(0, 1) + return url except Exception: - # 404 (urllib raises HTTPError) and any network error reach - # here; either way the sidecar is unavailable from our point - # of view and we fall back to base-only. return None @@ -127,9 +152,12 @@ def load_sidecar(path: str) -> SidecarOverviews: finally: f.close() elif _is_http_url(path): - import urllib.request - with urllib.request.urlopen(path, timeout=30) as resp: - data = resp.read() + # Reuse the eager reader's HTTP source so the sidecar download + # inherits SSRF validation, IP pinning, the shared urllib3 + # PoolManager, and manual redirect re-validation. See + # ``_probe_http`` for the threat model the indirection closes. + from ._reader import _HTTPSource + data = _HTTPSource(path).read_all() else: # fsspec URI import fsspec @@ -157,18 +185,22 @@ def close_sidecar(sidecar: SidecarOverviews | None) -> None: pass -def attach_sidecar_origin(ifds: list[IFD], - data: object, - header: TIFFHeader) -> None: - """Tag each IFD in ``ifds`` with its source bytes and header. +def attach_sidecar_origin( + ifds: list[IFD], + data: SidecarBuffer, + header: TIFFHeader, +) -> dict[int, tuple[SidecarBuffer, TIFFHeader]]: + """Return a mapping from ``id(ifd)`` to its ``(data, header)`` origin. + + The reader looks up the selected IFD in this mapping and slices + the corresponding buffer for strip/tile reads. IFDs not in the + mapping fall through to the base-file ``data`` / ``header`` the + caller already has in scope. - The reader checks ``ifd._source_data`` / ``ifd._source_header`` to - decide which buffer to slice for strip/tile reads. Untagged IFDs - fall through to the base-file ``data`` / ``header`` the caller - already has in scope. + Returns a fresh dict per call rather than mutating the parsed IFD + instances. Mutation via ``object.__setattr__`` would persist on the + IFD across requests if a future caller cached or reused the parsed + pyramid list. A side dict is request-scoped and discarded with the + surrounding ``read_to_array`` call frame. """ - for ifd in ifds: - # Use ``object.__setattr__`` so this works whether IFD is a - # plain dataclass or a frozen one in a future revision. - object.__setattr__(ifd, "_source_data", data) - object.__setattr__(ifd, "_source_header", header) + return {id(ifd): (data, header) for ifd in ifds} diff --git a/xrspatial/geotiff/tests/test_sidecar_ovr_2112.py b/xrspatial/geotiff/tests/test_sidecar_ovr_2112.py index eaab32367..d6f125ea8 100644 --- a/xrspatial/geotiff/tests/test_sidecar_ovr_2112.py +++ b/xrspatial/geotiff/tests/test_sidecar_ovr_2112.py @@ -253,7 +253,14 @@ def __init__(self, *a, **kw): return httpd, httpd.server_address[1] -def test_find_sidecar_http_probe_returns_url_when_present(tmp_path): +def test_find_sidecar_http_probe_returns_url_when_present( + tmp_path, monkeypatch): + # The sidecar probe now routes through ``_HTTPSource``, which + # rejects loopback hostnames under the SSRF guard added in #1664. + # Loopback is the standard local-server pattern in this repo's HTTP + # tests (see ``test_golden_corpus_http_1930.py``); opt into the + # escape hatch the production reader exposes. + monkeypatch.setenv("XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS", "1") src = _fixture_or_skip() import shutil shutil.copy(src, tmp_path / "x.tif") @@ -266,7 +273,9 @@ def test_find_sidecar_http_probe_returns_url_when_present(tmp_path): httpd.shutdown() -def test_find_sidecar_http_probe_returns_none_when_missing(tmp_path): +def test_find_sidecar_http_probe_returns_none_when_missing( + tmp_path, monkeypatch): + monkeypatch.setenv("XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS", "1") src = _fixture_or_skip() import shutil shutil.copy(src, tmp_path / "x.tif") # no .ovr copied @@ -278,7 +287,25 @@ def test_find_sidecar_http_probe_returns_none_when_missing(tmp_path): httpd.shutdown() -def test_load_sidecar_http_returns_ifds(tmp_path): +def test_find_sidecar_http_probe_rejects_loopback_without_env_override( + tmp_path): + """Without ``XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS=1``, the SSRF + guard makes a loopback probe silently return ``None`` -- same + silent-fail-to-base contract the rest of ``find_sidecar`` uses.""" + src = _fixture_or_skip() + import shutil + shutil.copy(src, tmp_path / "x.tif") + shutil.copy(str(src) + ".ovr", tmp_path / "x.tif.ovr") + httpd, port = _start_http_server(tmp_path) + try: + url = f"http://127.0.0.1:{port}/x.tif" + assert find_sidecar(url) is None + finally: + httpd.shutdown() + + +def test_load_sidecar_http_returns_ifds(tmp_path, monkeypatch): + monkeypatch.setenv("XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS", "1") src = _fixture_or_skip() import shutil shutil.copy(str(src) + ".ovr", tmp_path / "x.tif.ovr")