diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index 6fe74d17..679f670d 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -215,14 +215,33 @@ def _read_geo_info(source, *, overview_level: int | None = None): raise TypeError( "source must be a str path or binary file-like, " f"got {type(source).__name__}") + sidecar = None try: header = parse_header(data) ifds = parse_all_ifds(data, header) if not ifds: raise ValueError("No IFDs found in TIFF file") + # Append sibling `.tif.ovr` sidecar IFDs onto the pyramid list + # so ``overview_level`` indexes both internal and external + # overviews (issue #2112). Local file paths only. + from ._sidecar import ( + attach_sidecar_origin, find_sidecar, load_sidecar, + ) + sidecar_path = find_sidecar(source) + if sidecar_path is not None: + sidecar = load_sidecar(sidecar_path) + # Metadata-only path: drop the origin mapping. The reader + # only needs the merged IFD list to resolve the requested + # ``overview_level`` against; strip/tile bytes are sliced by + # ``read_to_array`` on the actual read. + attach_sidecar_origin( + sidecar.ifds, sidecar.data, sidecar.header) + ifds = ifds + sidecar.ifds ifd = select_overview_ifd(ifds, overview_level) # Inherit georef from the level-0 IFD when the overview itself - # has no geokeys (issue #1640). Pass-through for level 0. + # has no geokeys (issue #1640). Pass-through for level 0. The + # sidecar IFDs typically lack geokeys so the inheritance pulls + # from the base file's full-resolution IFD as GDAL does. geo_info = extract_geo_info_with_overview_inheritance( ifd, ifds, data, header.byte_order) bps = resolve_bits_per_sample(ifd.bits_per_sample) @@ -238,6 +257,8 @@ def _read_geo_info(source, *, overview_level: int | None = None): finally: if close_data: data.close() + from ._sidecar import close_sidecar + close_sidecar(sidecar) def open_geotiff(source: str | BinaryIO, *, diff --git a/xrspatial/geotiff/_backends/gpu.py b/xrspatial/geotiff/_backends/gpu.py index f1dbcfbb..8a0f5473 100644 --- a/xrspatial/geotiff/_backends/gpu.py +++ b/xrspatial/geotiff/_backends/gpu.py @@ -286,6 +286,8 @@ def read_geotiff_gpu(source: str, *, src = _FileSource(source) data = src.read_all() + sidecar = None + sidecar_owned_ifd = False try: header = parse_header(data) ifds = parse_all_ifds(data, header) @@ -293,9 +295,35 @@ def read_geotiff_gpu(source: str, *, if len(ifds) == 0: raise ValueError("No IFDs found in TIFF file") + # Append sibling `.tif.ovr` sidecar IFDs onto the pyramid list so + # ``overview_level`` indexes both internal and external overviews + # (issue #2112). When the selected IFD comes from the sidecar, + # we swap ``data`` / ``header`` to the sidecar's buffers below + # and skip the GDS fast path -- GDS reads the source file path, + # which would point at the base file rather than the sidecar. + from .._sidecar import ( + attach_sidecar_origin, close_sidecar, find_sidecar, + load_sidecar, + ) + sidecar_origin: dict[int, tuple] = {} + sidecar_path = find_sidecar(source) + if sidecar_path is not None: + sidecar = load_sidecar(sidecar_path) + sidecar_origin = attach_sidecar_origin( + sidecar.ifds, sidecar.data, sidecar.header) + ifds = ifds + sidecar.ifds + # Skip mask IFDs (NewSubfileType bit 2) ifd = select_overview_ifd(ifds, overview_level) + # Swap base data / header for the sidecar's buffers when the + # requested overview level lives in the sidecar. Subsequent + # tile / strip reads slice the right buffer. + origin = sidecar_origin.get(id(ifd)) + if origin is not None: + data, header = origin + sidecar_owned_ifd = True + bps = resolve_bits_per_sample(ifd.bits_per_sample) file_dtype = tiff_dtype_to_numpy(bps, ifd.sample_format) # Inherit georef from the level-0 IFD when the overview itself @@ -515,288 +543,314 @@ def read_geotiff_gpu(source: str, *, f"legitimate." ) - finally: - src.close() - - # GPU decode: try GDS (SSD→GPU direct) first, then CPU mmap path. - # Sparse tiles (byte_count == 0) are unsupported on the GPU pipeline; - # the CPU reader fills them with nodata and copies onto the GPU. - has_sparse_tile = any(bc == 0 for bc in byte_counts) - # LERC tiles can carry a per-pixel valid mask that GDAL writes - # zero-filled in the data array. Compute the nodata fill the same - # way the CPU reader does so the GPU decode path can restore it - # post-assembly (mirrors PR #1529 for the CPU path). Only the - # chunky (planar=1) GPU path threads masked_fill into its kernel - # call below; the planar=2 per-band branch falls back to the CPU - # reader for masked pixels (rare in practice -- LERC files - # typically use chunky layout). - masked_fill = (_resolve_masked_fill(ifd.nodata_str, file_dtype) - if compression == COMPRESSION_LERC else None) - - # Track whether the array we end up with was already orientation-flipped - # by `read_to_array`. Any path that falls back to CPU decode picks up - # the orientation remap from PR #1521 + #1537 for free; the pure GPU - # paths still need the explicit remap added in #1540. - arr_was_cpu_decoded = False - # When a CPU fallback runs, ``read_to_array`` has already applied the - # MinIsWhite inversion and stashed the post-inversion sentinel on - # ``_mask_nodata``. Keep that geo_info alongside the pre-extracted one - # so the downstream nodata mask compares against the correct value - # (Copilot review of #1817). - _cpu_fallback_geo = None - - # PlanarConfiguration=2 (separate bands): each band has its own list - # of tiles back-to-back in TileOffsets / TileByteCounts. The GPU - # tile-assembly kernel assumes a single chunky tile sequence with - # bytes_per_pixel = itemsize * samples, so it cannot handle planar=2 - # directly. Decode each band's tile slab as a single-band image, then - # stack into (H, W, samples). For planar=1 (chunky) the existing - # single-pass kernel is correct. Sparse-tile files always route to - # the CPU reader regardless of planar config. - if planar == 2 and samples > 1 and not has_sparse_tile: - tiles_across = math.ceil(width / tw) - tiles_down = math.ceil(height / th) - tiles_per_band = tiles_across * tiles_down - # validate_tile_layout already requires len(offsets) >= the grid; - # accept extra trailing entries (some writers emit padding) and - # only consume the first tiles_per_band * samples. - expected_min = tiles_per_band * samples - if len(offsets) < expected_min: - raise ValueError( - f"PlanarConfiguration=2 expects at least {expected_min} " - f"TileOffsets entries ({tiles_across} x {tiles_down} x " - f"{samples} bands), got {len(offsets)}" - ) - # Lazy shared file read for the per-band stage-2 fallback. When - # every band's GDS path succeeds, _read_once is never called - # and we skip the read_all() entirely; when any band falls - # back, the first call materialises the bytes and subsequent - # bands reuse the same buffer (so N bands cost at most one - # read_all(), not N). - _shared_data_cache: list = [] - - def _read_once(): - if not _shared_data_cache: - src2 = _FileSource(source) - try: - _shared_data_cache.append(src2.read_all()) - finally: - src2.close() - return _shared_data_cache[0] - - band_arrays = [] - cpu_fallback_needed = False - for band_idx in range(samples): - b0 = band_idx * tiles_per_band - b1 = b0 + tiles_per_band - band_offsets = list(offsets[b0:b1]) - band_byte_counts = list(byte_counts[b0:b1]) - band_arr = _gpu_decode_single_band_tiles( - source, _read_once, band_offsets, band_byte_counts, - tw, th, width, height, - compression, predictor, file_dtype, - byte_order=header.byte_order, - gpu=gpu, - ) - if band_arr is None: - # Auto-mode signal: stage-2 GPU decode failed for this - # band. There's no per-band CPU decode path, so fall - # back to a whole-image CPU read + GPU upload, matching - # the chunky path's auto-mode semantics. - cpu_fallback_needed = True - break - band_arrays.append(band_arr) - if cpu_fallback_needed: - # Drop read_to_array's geo_info for orientation transform - # handling (below operates on our pre-extracted geo_info so the - # 2/3/4 case is covered regardless of #1539's merge state), but - # keep it on ``_cpu_fallback_geo`` so the MinIsWhite-aware nodata - # mask below sees ``_mask_nodata``. + # GPU decode: try GDS (SSD→GPU direct) first, then CPU mmap path. + # Sparse tiles (byte_count == 0) are unsupported on the GPU pipeline; + # the CPU reader fills them with nodata and copies onto the GPU. + has_sparse_tile = any(bc == 0 for bc in byte_counts) + # LERC tiles can carry a per-pixel valid mask that GDAL writes + # zero-filled in the data array. Compute the nodata fill the same + # way the CPU reader does so the GPU decode path can restore it + # post-assembly (mirrors PR #1529 for the CPU path). Only the + # chunky (planar=1) GPU path threads masked_fill into its kernel + # call below; the planar=2 per-band branch falls back to the CPU + # reader for masked pixels (rare in practice -- LERC files + # typically use chunky layout). + masked_fill = (_resolve_masked_fill(ifd.nodata_str, file_dtype) + if compression == COMPRESSION_LERC else None) + + # Track whether the array we end up with was already orientation-flipped + # by `read_to_array`. Any path that falls back to CPU decode picks up + # the orientation remap from PR #1521 + #1537 for free; the pure GPU + # paths still need the explicit remap added in #1540. + arr_was_cpu_decoded = False + # When a CPU fallback runs, ``read_to_array`` has already applied the + # MinIsWhite inversion and stashed the post-inversion sentinel on + # ``_mask_nodata``. Keep that geo_info alongside the pre-extracted one + # so the downstream nodata mask compares against the correct value + # (Copilot review of #1817). + _cpu_fallback_geo = None + + # PlanarConfiguration=2 (separate bands): each band has its own list + # of tiles back-to-back in TileOffsets / TileByteCounts. The GPU + # tile-assembly kernel assumes a single chunky tile sequence with + # bytes_per_pixel = itemsize * samples, so it cannot handle planar=2 + # directly. Decode each band's tile slab as a single-band image, then + # stack into (H, W, samples). For planar=1 (chunky) the existing + # single-pass kernel is correct. Sparse-tile files always route to + # the CPU reader regardless of planar config. + if planar == 2 and samples > 1 and not has_sparse_tile: + tiles_across = math.ceil(width / tw) + tiles_down = math.ceil(height / th) + tiles_per_band = tiles_across * tiles_down + # validate_tile_layout already requires len(offsets) >= the grid; + # accept extra trailing entries (some writers emit padding) and + # only consume the first tiles_per_band * samples. + expected_min = tiles_per_band * samples + if len(offsets) < expected_min: + raise ValueError( + f"PlanarConfiguration=2 expects at least {expected_min} " + f"TileOffsets entries ({tiles_across} x {tiles_down} x " + f"{samples} bands), got {len(offsets)}" + ) + # Lazy shared file read for the per-band stage-2 fallback. When + # every band's GDS path succeeds, _read_once is never called + # and we skip the read_all() entirely; when any band falls + # back, the first call materialises the bytes and subsequent + # bands reuse the same buffer (so N bands cost at most one + # read_all(), not N). + _shared_data_cache: list = [] + + def _read_once(): + if not _shared_data_cache: + src2 = _FileSource(source) + try: + _shared_data_cache.append(src2.read_all()) + finally: + src2.close() + return _shared_data_cache[0] + + band_arrays = [] + cpu_fallback_needed = False + for band_idx in range(samples): + b0 = band_idx * tiles_per_band + b1 = b0 + tiles_per_band + band_offsets = list(offsets[b0:b1]) + band_byte_counts = list(byte_counts[b0:b1]) + band_arr = _gpu_decode_single_band_tiles( + source, _read_once, band_offsets, band_byte_counts, + tw, th, width, height, + compression, predictor, file_dtype, + byte_order=header.byte_order, + gpu=gpu, + ) + if band_arr is None: + # Auto-mode signal: stage-2 GPU decode failed for this + # band. There's no per-band CPU decode path, so fall + # back to a whole-image CPU read + GPU upload, matching + # the chunky path's auto-mode semantics. + cpu_fallback_needed = True + break + band_arrays.append(band_arr) + if cpu_fallback_needed: + # Drop read_to_array's geo_info for orientation transform + # handling (below operates on our pre-extracted geo_info so the + # 2/3/4 case is covered regardless of #1539's merge state), but + # keep it on ``_cpu_fallback_geo`` so the MinIsWhite-aware nodata + # mask below sees ``_mask_nodata``. + arr_cpu, _cpu_fallback_geo = _read_to_array( + source, overview_level=overview_level) + arr_gpu = cupy.asarray(arr_cpu) + arr_was_cpu_decoded = True + else: + arr_gpu = cupy.stack(band_arrays, axis=2) + if arr_gpu.shape != (height, width, samples): + raise RuntimeError( + f"planar=2 GPU assembly produced shape " + f"{arr_gpu.shape}, expected " + f"({height}, {width}, {samples})" + ) + elif has_sparse_tile: arr_cpu, _cpu_fallback_geo = _read_to_array( source, overview_level=overview_level) arr_gpu = cupy.asarray(arr_cpu) arr_was_cpu_decoded = True else: - arr_gpu = cupy.stack(band_arrays, axis=2) - if arr_gpu.shape != (height, width, samples): + from .._gpu_decode import gpu_decode_tiles_from_file + arr_gpu = None + + if sidecar_owned_ifd: + # The selected IFD lives in a sibling .tif.ovr sidecar; tile + # offsets index into the sidecar's bytes, not the base file + # path that ``gpu_decode_tiles_from_file`` would open via + # kvikio. Skip GDS and let the CPU-mmap fallback below slice + # the already-loaded sidecar buffer (issue #2112). + arr_gpu = None + else: + try: + arr_gpu = gpu_decode_tiles_from_file( + source, offsets, byte_counts, + tw, th, width, height, + compression, predictor, file_dtype, samples, + byte_order=header.byte_order, + masked_fill=masked_fill, + ) + except Exception as e: + if gpu == 'strict' or _geotiff_strict_mode(): + raise + warnings.warn( + f"read_geotiff_gpu: GPU decode failed " + f"({type(e).__name__}: {e}); falling back to CPU.", + RuntimeWarning, + stacklevel=2, + ) + arr_gpu = None + + if arr_gpu is None: + # Fallback: extract tiles via CPU mmap, then GPU decode. For + # sidecar IFDs the tile bytes already live in ``data`` (loaded + # from the .ovr above); re-opening ``source`` would point at the + # base file. Use the in-scope ``data`` directly in that case. + if sidecar_owned_ifd: + compressed_tiles = [ + bytes(data[offsets[i]:offsets[i] + byte_counts[i]]) + for i in range(len(offsets)) + ] + else: + src2 = _FileSource(source) + data2 = src2.read_all() + try: + compressed_tiles = [ + bytes(data2[offsets[i]:offsets[i] + byte_counts[i]]) + for i in range(len(offsets)) + ] + finally: + src2.close() + + if arr_gpu is None: + try: + arr_gpu = gpu_decode_tiles( + compressed_tiles, + tw, th, width, height, + compression, predictor, file_dtype, samples, + byte_order=header.byte_order, + masked_fill=masked_fill, + ) + except Exception as e: + if gpu == 'strict' or _geotiff_strict_mode(): + raise + warnings.warn( + f"read_geotiff_gpu: GPU decode failed " + f"({type(e).__name__}: {e}); falling back to CPU.", + RuntimeWarning, + stacklevel=2, + ) + arr_cpu, _cpu_fallback_geo = _read_to_array( + source, overview_level=overview_level) + arr_gpu = cupy.asarray(arr_cpu) + arr_was_cpu_decoded = True + + # Multi-band tiled output must be (H, W, samples) regardless of planar + # config -- catch any shape regression in the kernels before we attach + # dims/coords below. Plain `raise` rather than `assert` so the check + # survives `python -O`. + if samples > 1: + if (arr_gpu.shape[:2] != (height, width) + or arr_gpu.shape[2] != samples): raise RuntimeError( - f"planar=2 GPU assembly produced shape " + f"GPU multi-band tile assembly produced shape " f"{arr_gpu.shape}, expected " f"({height}, {width}, {samples})" ) - elif has_sparse_tile: - arr_cpu, _cpu_fallback_geo = _read_to_array( - source, overview_level=overview_level) - arr_gpu = cupy.asarray(arr_cpu) - arr_was_cpu_decoded = True - else: - from .._gpu_decode import gpu_decode_tiles_from_file - arr_gpu = None - try: - arr_gpu = gpu_decode_tiles_from_file( - source, offsets, byte_counts, - tw, th, width, height, - compression, predictor, file_dtype, samples, - byte_order=header.byte_order, - masked_fill=masked_fill, - ) - except Exception as e: - if gpu == 'strict' or _geotiff_strict_mode(): - raise - warnings.warn( - f"read_geotiff_gpu: GPU decode failed " - f"({type(e).__name__}: {e}); falling back to CPU.", - RuntimeWarning, - stacklevel=2, - ) - arr_gpu = None - - if arr_gpu is None: - # Fallback: extract tiles via CPU mmap, then GPU decode - src2 = _FileSource(source) - data2 = src2.read_all() - try: - compressed_tiles = [ - bytes(data2[offsets[i]:offsets[i] + byte_counts[i]]) - for i in range(len(offsets)) - ] - finally: - src2.close() - - if arr_gpu is None: - try: - arr_gpu = gpu_decode_tiles( - compressed_tiles, - tw, th, width, height, - compression, predictor, file_dtype, samples, - byte_order=header.byte_order, - masked_fill=masked_fill, - ) - except Exception as e: - if gpu == 'strict' or _geotiff_strict_mode(): - raise - warnings.warn( - f"read_geotiff_gpu: GPU decode failed " - f"({type(e).__name__}: {e}); falling back to CPU.", - RuntimeWarning, - stacklevel=2, - ) - arr_cpu, _cpu_fallback_geo = _read_to_array( - source, overview_level=overview_level) - arr_gpu = cupy.asarray(arr_cpu) - arr_was_cpu_decoded = True - - # Multi-band tiled output must be (H, W, samples) regardless of planar - # config -- catch any shape regression in the kernels before we attach - # dims/coords below. Plain `raise` rather than `assert` so the check - # survives `python -O`. - if samples > 1: - if (arr_gpu.shape[:2] != (height, width) - or arr_gpu.shape[2] != samples): - raise RuntimeError( - f"GPU multi-band tile assembly produced shape " - f"{arr_gpu.shape}, expected " - f"({height}, {width}, {samples})" - ) + # Apply the TIFF Orientation tag (274). The pure GPU paths land here + # with a raw stored-order buffer; the CPU-fallback paths land here + # with arr_gpu already remapped (read_to_array does the data flip) + # but with their pre-orientation geo_info (we discarded the one + # read_to_array returned because it does not handle 2/3/4 today). + # Skip the GPU array remap on CPU-decoded paths to avoid a double + # flip, but always apply the geo_info update so coords match. + if orientation != 1: + if not arr_was_cpu_decoded: + arr_gpu = _apply_orientation_gpu(arr_gpu, orientation) + geo_info = _apply_orientation_geo_info( + geo_info, orientation, file_h=height, file_w=width) + + _mw_mask_nodata = None + if (ifd.photometric == 0 and samples == 1 and not arr_was_cpu_decoded): + from .._reader import _miniswhite_inverted_nodata as _miw_inv_nd + gpu_dtype = np.dtype(str(arr_gpu.dtype)) + # Compute the post-MinIsWhite sentinel BEFORE inverting the array, + # so the downstream ``_apply_nodata_mask_gpu`` call compares + # against the right value (#1809). + _mw_mask_nodata = _miw_inv_nd(geo_info.nodata, ifd, gpu_dtype) + if gpu_dtype.kind == 'u': + arr_gpu = np.iinfo(gpu_dtype).max - arr_gpu + elif gpu_dtype.kind == 'f': + arr_gpu = -arr_gpu + + # Apply nodata mask + record sentinel so the GPU read agrees with the + # CPU eager path (issue #1542). Without this, integer rasters keep the + # literal sentinel value and float rasters keep the sentinel rather + # than NaN -- a silent backend divergence. Apply before the optional + # dtype cast so the float promotion for masked integer rasters doesn't + # surprise a user-supplied dtype. + nodata = geo_info.nodata + if nodata is not None and mask_nodata: + # When MinIsWhite was applied, the mask must use the inverted + # sentinel; otherwise the original sentinel. The pure GPU path + # records the inverted sentinel in ``_mw_mask_nodata`` above; the + # CPU-fallback paths (sparse-tile, planar=2 auto-fallback, and + # post-decode CPU fallback) get it from ``read_to_array`` via + # ``_cpu_fallback_geo._mask_nodata`` (Copilot review of #1817). + if _mw_mask_nodata is not None: + _gpu_mask_value = _mw_mask_nodata + elif _cpu_fallback_geo is not None: + _gpu_mask_value = getattr( + _cpu_fallback_geo, '_mask_nodata', nodata) + else: + _gpu_mask_value = nodata + arr_gpu = _apply_nodata_mask_gpu(arr_gpu, _gpu_mask_value) - # Apply the TIFF Orientation tag (274). The pure GPU paths land here - # with a raw stored-order buffer; the CPU-fallback paths land here - # with arr_gpu already remapped (read_to_array does the data flip) - # but with their pre-orientation geo_info (we discarded the one - # read_to_array returned because it does not handle 2/3/4 today). - # Skip the GPU array remap on CPU-decoded paths to avoid a double - # flip, but always apply the geo_info update so coords match. - if orientation != 1: - if not arr_was_cpu_decoded: - arr_gpu = _apply_orientation_gpu(arr_gpu, orientation) - geo_info = _apply_orientation_geo_info( - geo_info, orientation, file_h=height, file_w=width) - - _mw_mask_nodata = None - if (ifd.photometric == 0 and samples == 1 and not arr_was_cpu_decoded): - from .._reader import _miniswhite_inverted_nodata as _miw_inv_nd - gpu_dtype = np.dtype(str(arr_gpu.dtype)) - # Compute the post-MinIsWhite sentinel BEFORE inverting the array, - # so the downstream ``_apply_nodata_mask_gpu`` call compares - # against the right value (#1809). - _mw_mask_nodata = _miw_inv_nd(geo_info.nodata, ifd, gpu_dtype) - if gpu_dtype.kind == 'u': - arr_gpu = np.iinfo(gpu_dtype).max - arr_gpu - elif gpu_dtype.kind == 'f': - arr_gpu = -arr_gpu - - # Apply nodata mask + record sentinel so the GPU read agrees with the - # CPU eager path (issue #1542). Without this, integer rasters keep the - # literal sentinel value and float rasters keep the sentinel rather - # than NaN -- a silent backend divergence. Apply before the optional - # dtype cast so the float promotion for masked integer rasters doesn't - # surprise a user-supplied dtype. - nodata = geo_info.nodata - if nodata is not None and mask_nodata: - # When MinIsWhite was applied, the mask must use the inverted - # sentinel; otherwise the original sentinel. The pure GPU path - # records the inverted sentinel in ``_mw_mask_nodata`` above; the - # CPU-fallback paths (sparse-tile, planar=2 auto-fallback, and - # post-decode CPU fallback) get it from ``read_to_array`` via - # ``_cpu_fallback_geo._mask_nodata`` (Copilot review of #1817). - if _mw_mask_nodata is not None: - _gpu_mask_value = _mw_mask_nodata - elif _cpu_fallback_geo is not None: - _gpu_mask_value = getattr( - _cpu_fallback_geo, '_mask_nodata', nodata) - else: - _gpu_mask_value = nodata - arr_gpu = _apply_nodata_mask_gpu(arr_gpu, _gpu_mask_value) + if dtype is not None: + target = np.dtype(dtype) + _validate_dtype_cast(np.dtype(str(arr_gpu.dtype)), target) + arr_gpu = arr_gpu.astype(target) - if dtype is not None: - target = np.dtype(dtype) - _validate_dtype_cast(np.dtype(str(arr_gpu.dtype)), target) - arr_gpu = arr_gpu.astype(target) + # Build DataArray + if name is None: + import os + name = os.path.splitext(os.path.basename(source))[0] - # Build DataArray - if name is None: - import os - name = os.path.splitext(os.path.basename(source))[0] + _validate_read_geo_info( + geo_info, window=window, + allow_rotated=allow_rotated, + allow_unparseable_crs=allow_unparseable_crs, + ) - _validate_read_geo_info( - geo_info, window=window, - allow_rotated=allow_rotated, - allow_unparseable_crs=allow_unparseable_crs, - ) + attrs = {} + _populate_attrs_from_geo_info(attrs, geo_info, window=window) + # ``attrs['nodata']`` + ``attrs['masked_nodata']`` reflect the + # post-mask, post-cast array dtype (issue #1988). + _set_nodata_attrs( + attrs, nodata, array_dtype=np.dtype(str(arr_gpu.dtype)), + ) - attrs = {} - _populate_attrs_from_geo_info(attrs, geo_info, window=window) - # ``attrs['nodata']`` + ``attrs['masked_nodata']`` reflect the - # post-mask, post-cast array dtype (issue #1988). - _set_nodata_attrs( - attrs, nodata, array_dtype=np.dtype(str(arr_gpu.dtype)), - ) + # Apply window/band slicing post-decode. Coords are derived from the + # sliced array so the (y, x) labels line up with the user's requested + # subrectangle. This mirrors the ``open_geotiff`` / ``read_geotiff_dask`` + # contract: ``attrs['transform']`` always carries the full-source + # GeoTransform shifted to the window origin (via + # ``_populate_attrs_from_geo_info(..., window=window)``), while + # ``coords['y']`` / ``coords['x']`` cover only the windowed cells. + arr_gpu, coords = _gpu_apply_window_band( + arr_gpu, geo_info, window=window, band=band) + + if arr_gpu.ndim == 3: + dims = ['y', 'x', 'band'] + coords['band'] = np.arange(arr_gpu.shape[2]) + else: + dims = ['y', 'x'] - # Apply window/band slicing post-decode. Coords are derived from the - # sliced array so the (y, x) labels line up with the user's requested - # subrectangle. This mirrors the ``open_geotiff`` / ``read_geotiff_dask`` - # contract: ``attrs['transform']`` always carries the full-source - # GeoTransform shifted to the window origin (via - # ``_populate_attrs_from_geo_info(..., window=window)``), while - # ``coords['y']`` / ``coords['x']`` cover only the windowed cells. - arr_gpu, coords = _gpu_apply_window_band( - arr_gpu, geo_info, window=window, band=band) - - if arr_gpu.ndim == 3: - dims = ['y', 'x', 'band'] - coords['band'] = np.arange(arr_gpu.shape[2]) - else: - dims = ['y', 'x'] + result = xr.DataArray(arr_gpu, dims=dims, coords=coords, + name=name, attrs=attrs) - result = xr.DataArray(arr_gpu, dims=dims, coords=coords, - name=name, attrs=attrs) + # ``chunks=`` is handled at function entry via + # ``_read_geotiff_gpu_chunked`` for real out-of-core support; this + # eager path always returns a non-chunked CuPy-backed DataArray. - # ``chunks=`` is handled at function entry via - # ``_read_geotiff_gpu_chunked`` for real out-of-core support; this - # eager path always returns a non-chunked CuPy-backed DataArray. + return result - return result + finally: + # Single close site for the file source and any sidecar mmap. + # The sidecar can only be released here, AFTER the GPU decode + # has finished reading ``data`` -- the decode slices + # ``sidecar.data`` directly when ``sidecar_owned_ifd`` is true. + # Deferring to Python GC instead would leak file descriptors + # in long-running processes opening many GeoTIFFs in sequence + # (review of #2112). + src.close() + if sidecar is not None: + close_sidecar(sidecar) def _gds_chunk_path_available(source, ifd, has_sparse_tile, orientation): diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py index b8fa54a5..51a1e976 100644 --- a/xrspatial/geotiff/_reader.py +++ b/xrspatial/geotiff/_reader.py @@ -3254,6 +3254,7 @@ def read_to_array(source, *, window=None, overview_level: int | None = None, src = _FileSource(source) data = src.read_all() + sidecar = None try: header = parse_header(data) ifds = parse_all_ifds(data, header) @@ -3261,15 +3262,42 @@ def read_to_array(source, *, window=None, overview_level: int | None = None, if len(ifds) == 0: raise ValueError("No IFDs found in TIFF file") + # External `.tif.ovr` sidecar (issue #2112). GDAL/rasterio write + # overview pyramids to a sibling file when the source is not a + # COG; the sidecar's IFDs are the continuation of the base + # file's pyramid. Discovery only fires for local file paths; + # cloud / HTTP / file-like sources skip the lookup and keep the + # base-file-only behaviour. The sidecar must be loaded before + # IFD selection so ``overview_level`` can index into a unified + # pyramid list. + from ._sidecar import ( + attach_sidecar_origin, find_sidecar, load_sidecar, + ) + sidecar_origin: dict[int, tuple] = {} + sidecar_path = find_sidecar(source) + if sidecar_path is not None: + sidecar = load_sidecar(sidecar_path) + sidecar_origin = attach_sidecar_origin( + sidecar.ifds, sidecar.data, sidecar.header) + ifds = ifds + sidecar.ifds + # Select IFD, skipping any mask IFDs ifd = select_overview_ifd(ifds, overview_level) + # If the selected IFD came from the sidecar, swap the data / + # header used for strip / tile reads below so byte offsets + # resolve against the right buffer. + ifd_data, ifd_header = sidecar_origin.get(id(ifd), (data, header)) + bps = resolve_bits_per_sample(ifd.bits_per_sample) dtype = tiff_dtype_to_numpy(bps, ifd.sample_format) # Inherit georef from level 0 when an overview IFD lacks its own # geokeys (issue #1640). For overview_level=0 (or None) this is a # no-op: the helper short-circuits when the IFD is not a - # NewSubfileType=overview entry. + # NewSubfileType=overview entry. Sidecar IFDs always lack + # geokeys, so the inheritance pulls from the base file's + # level-0 IFD (kept first in the merged list) which is the + # GDAL convention. geo_info = extract_geo_info_with_overview_inheritance( ifd, ifds, data, header.byte_order) @@ -3349,10 +3377,10 @@ def read_to_array(source, *, window=None, overview_level: int | None = None, f"band={band} out of range for {ifd_samples}-band file.") if ifd.is_tiled: - arr = _read_tiles(data, ifd, header, dtype, window, + arr = _read_tiles(ifd_data, ifd, ifd_header, dtype, window, max_pixels=max_pixels) else: - arr = _read_strips(data, ifd, header, dtype, window, + arr = _read_strips(ifd_data, ifd, ifd_header, dtype, window, max_pixels=max_pixels) # Extract the requested band before reorienting so we work on a @@ -3379,5 +3407,7 @@ def read_to_array(source, *, window=None, overview_level: int | None = None, geo_info._mask_nodata = inverted_nodata finally: src.close() + from ._sidecar import close_sidecar + close_sidecar(sidecar) return arr, geo_info diff --git a/xrspatial/geotiff/_sidecar.py b/xrspatial/geotiff/_sidecar.py new file mode 100644 index 00000000..9736136c --- /dev/null +++ b/xrspatial/geotiff/_sidecar.py @@ -0,0 +1,206 @@ +"""External `.tif.ovr` sidecar discovery for overview reading. + +GDAL and rasterio write overview pyramids to a sibling ``.ovr`` +file by default (`gdaladdo -ro` and `rasterio` `--overview-resampling` +when the source itself is not a COG). The sidecar is a standalone TIFF +whose IFDs are the continuation of the base file's pyramid: base IFD 0 +is overview level 0, sidecar IFD 0 is level 1, sidecar IFD 1 is level +2, and so on. + +This module discovers the sidecar next to a local-file, HTTP, or +fsspec source and parses its IFDs, returning the inputs the reader +needs to switch to the sidecar's bytes/header when the requested +overview level lives there. Discovery is gated on a cheap existence +check (local stat, HTTP HEAD, or fsspec ``exists``); a missing sidecar +returns ``None`` and the caller falls back to base-file-only behaviour. +""" +from __future__ import annotations + +import mmap +import os +from typing import NamedTuple, Union + +from ._header import IFD, TIFFHeader, parse_all_ifds, parse_header +# Single source of truth for the fsspec gate; mirroring it here drifts +# silently the moment ``_reader._is_fsspec_uri`` learns a new scheme. +# ``_reader`` imports ``_sidecar`` lazily (inside functions), so this +# top-level import does not form a cycle at module load time. +from ._reader import _is_fsspec_uri + + +#: Type of the bytes-like buffer a sidecar carries: an mmap for local +#: files, bytes for HTTP / fsspec downloads. Narrowed from ``object`` +#: so the reader sees the actual variants when slicing IFD data. +SidecarBuffer = Union[mmap.mmap, bytes] + + +class SidecarOverviews(NamedTuple): + """Bytes, header, and IFDs from a sibling ``.tif.ovr`` sidecar.""" + + data: SidecarBuffer + header: TIFFHeader + ifds: list[IFD] + path: str + + +def _is_http_url(source: str) -> bool: + return source.startswith(("http://", "https://")) + + +def find_sidecar(source) -> str | None: + """Return the path / URL of a sibling ``.ovr`` sidecar if one exists. + + Scopes: + + * Local file paths -- probe with :func:`os.path.isfile`. + * HTTP / HTTPS URLs -- issue a 1-byte Range GET to ``.ovr`` + through the eager reader's ``_HTTPSource`` so the probe inherits + SSRF validation, IP pinning, the shared urllib3 pool, and manual + redirect re-validation. See :func:`_probe_http`. + * fsspec URIs (``s3://``, ``gs://``, ``az://``, ``memory://`` ...) + -- call ``fsspec.AbstractFileSystem.exists`` on ``.ovr``. + * File-like buffers (``io.BytesIO``, etc.) -- no sidecar concept, + return ``None``. + + Discovery failures are silent: any network error, missing fsspec, + or unreadable path returns ``None`` so the caller falls back to + base-file-only behaviour. The existence check is bounded and does + not download or open the sidecar itself; :func:`load_sidecar` + handles the actual read once the path is known. + """ + if not isinstance(source, str): + return None + candidate = source + ".ovr" + if "://" not in source: + return candidate if os.path.isfile(candidate) else None + if _is_http_url(source): + return _probe_http(candidate) + if _is_fsspec_uri(source): + return _probe_fsspec(candidate) + return None + + +def _probe_http(url: str) -> str | None: + """Return ``url`` if an existence probe succeeds via ``_HTTPSource``. + + Routes through :class:`xrspatial.geotiff._reader._HTTPSource` so the + probe inherits the same defences the eager reader applies to every + HTTP fetch: + + * SSRF allow-list via :func:`_validate_http_url` -- loopback / + link-local / private hostnames are rejected unless + ``XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS=1`` is set (#1664). + * IP pinning so a DNS-rebind between probe and download cannot + flip the target (#1846). + * Shared urllib3 ``PoolManager`` with retries and connection reuse, + so opening many sidecar-bearing files in a row keeps TCP/TLS + state warm instead of re-handshaking per probe. + * ``redirect=False`` with manual re-validation of every ``Location`` + header, so a 302 from a public host to a private one is rejected + the same way the eager reader rejects it. + + Existence is probed with a 1-byte Range GET (``bytes=0-0``) rather + than HEAD because HEAD support is inconsistent across CDNs and + object stores; a 1-byte GET works everywhere Range works (which is + the same precondition the eager COG reader assumes) and the + one-byte payload is cheaper than the headers urllib3 already + fetches for either method. + + Any failure -- ``UnsafeURLError``, 4xx/5xx, network timeout, + missing dependency -- returns ``None`` so the caller falls back + to base-only behaviour without raising. + """ + try: + from ._reader import _HTTPSource + src = _HTTPSource(url) + src.read_range(0, 1) + return url + except Exception: + return None + + +def _probe_fsspec(uri: str) -> str | None: + """Return ``uri`` if fsspec reports the object exists.""" + try: + import fsspec + fs, path = fsspec.core.url_to_fs(uri) + return uri if fs.exists(path) else None + except Exception: + return None + + +def load_sidecar(path: str) -> SidecarOverviews: + """Open and parse a sidecar ``.ovr`` file. + + Accepts local file paths, HTTP / HTTPS URLs, and fsspec URIs. + Local paths are mmap'd; remote sources are downloaded once via the + matching transport (HTTP via :mod:`urllib`, fsspec URIs via the + fsspec filesystem). The IFD list is the sidecar's full IFD chain + in file order; the reader treats them as overview levels (the + first sidecar IFD is level 1 when the base file holds only a + full-resolution IFD, level 2 when the base file already carries + one internal overview, and so on). + + The returned ``data`` is either an ``mmap`` (local) or ``bytes`` + (remote). Callers should close the mmap variant via + ``data.close()`` when present; the bytes case is no-op. + """ + if "://" not in path: + f = open(path, "rb") + try: + data = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) + finally: + f.close() + elif _is_http_url(path): + # Reuse the eager reader's HTTP source so the sidecar download + # inherits SSRF validation, IP pinning, the shared urllib3 + # PoolManager, and manual redirect re-validation. See + # ``_probe_http`` for the threat model the indirection closes. + from ._reader import _HTTPSource + data = _HTTPSource(path).read_all() + else: + # fsspec URI + import fsspec + with fsspec.open(path, "rb") as f: + data = f.read() + header = parse_header(data) + ifds = parse_all_ifds(data, header) + return SidecarOverviews(data=data, header=header, ifds=ifds, path=path) + + +def close_sidecar(sidecar: SidecarOverviews | None) -> None: + """Close the sidecar's data buffer if it holds a ``close`` method. + + Mmap data buffers need explicit close; ``bytes`` from a remote + download does not. Safe to call with ``None``. + """ + if sidecar is None: + return + closer = getattr(sidecar.data, "close", None) + if closer is None: + return + try: + closer() + except Exception: + pass + + +def attach_sidecar_origin( + ifds: list[IFD], + data: SidecarBuffer, + header: TIFFHeader, +) -> dict[int, tuple[SidecarBuffer, TIFFHeader]]: + """Return a mapping from ``id(ifd)`` to its ``(data, header)`` origin. + + The reader looks up the selected IFD in this mapping and slices + the corresponding buffer for strip/tile reads. IFDs not in the + mapping fall through to the base-file ``data`` / ``header`` the + caller already has in scope. + + Returns a fresh dict per call rather than mutating the parsed IFD + instances. Mutation via ``object.__setattr__`` would persist on the + IFD across requests if a future caller cached or reused the parsed + pyramid list. A side dict is request-scoped and discarded with the + surrounding ``read_to_array`` call frame. + """ + return {id(ifd): (data, header) for ifd in ifds} diff --git a/xrspatial/geotiff/tests/test_golden_corpus_eager_numpy_1930.py b/xrspatial/geotiff/tests/test_golden_corpus_eager_numpy_1930.py index 9fe2d44e..2699d666 100644 --- a/xrspatial/geotiff/tests/test_golden_corpus_eager_numpy_1930.py +++ b/xrspatial/geotiff/tests/test_golden_corpus_eager_numpy_1930.py @@ -86,21 +86,10 @@ _PARITY_GAPS: dict[str, str] = {} # Fixtures whose overview-IFD code path is not yet wired into the -# xrspatial reader. The base-IFD comparison still runs and passes; the -# overview-level loop driven by ``candidate_factory`` is skipped for -# these ids. Today this lists external sidecar overviews -# (``.tif.ovr``): the reader walks only the in-file IFD chain and would -# raise "overview_level out of range" for ``overview_level >= 1``. The -# internal-IFD and COG fixtures exercise the overview code path on -# every backend. -_OVERVIEW_READER_GAPS: dict[str, str] = { - "overview_external_ovr_uint16": ( - "External .ovr sidecar reader is not implemented in xrspatial. " - "The base IFD still parity-checks; the overview levels live in " - "a sibling .tif.ovr that the reader does not open. Track in a " - "follow-up issue if the corpus needs sidecar coverage." - ), -} +# xrspatial reader. Empty as of issue #2112, which taught the eager +# numpy reader to discover and parse sibling ``.tif.ovr`` files. New +# entries should include a follow-up issue link so the gap stays visible. +_OVERVIEW_READER_GAPS: dict[str, str] = {} _INTENTIONAL_SKIPS: dict[str, str] = { "nodata_miniswhite_uint8": ( diff --git a/xrspatial/geotiff/tests/test_sidecar_ovr_2112.py b/xrspatial/geotiff/tests/test_sidecar_ovr_2112.py new file mode 100644 index 00000000..d6f125ea --- /dev/null +++ b/xrspatial/geotiff/tests/test_sidecar_ovr_2112.py @@ -0,0 +1,339 @@ +"""External `.tif.ovr` sidecar overview reader (issue #2112). + +Before the fix, opening a GDAL/rasterio file whose overview pyramid +lives in a sibling ``.tif.ovr`` worked at the base level but raised an +``overview_level out of range`` error for any non-zero level. The +reader only walked the in-file IFD chain. + +These tests pin the new contract: + +* The base IFD continues to read at level 0 with no behaviour change. +* The reader appends sidecar IFDs onto the pyramid list so that + ``overview_level=1`` (and onwards) reads from the sidecar. +* Byte parity holds against rasterio for the bundled fixture, which + exercises a uint16 raster with two sidecar levels at factors 2 and 4. +* Discovery is local-file-only: file-like sources, HTTP / fsspec URIs, + and missing sidecars all fall back to base-only behaviour. +* Sidecar levels reach all eager-read entry points: ``open_geotiff``, + ``read_to_array``, and the dask graph builder that goes through + ``_read_to_array_metadata_only``. +""" +from __future__ import annotations + +import io +import pathlib +import shutil + +import numpy as np +import pytest + +from xrspatial.geotiff import open_geotiff +from xrspatial.geotiff._reader import read_to_array +from xrspatial.geotiff._sidecar import find_sidecar, load_sidecar + + +_FIXTURE = ( + pathlib.Path(__file__).resolve().parent + / "golden_corpus" + / "fixtures" + / "overview_external_ovr_uint16.tif" +) + + +def _fixture_or_skip(): + if not _FIXTURE.exists(): + pytest.skip("sidecar fixture not present") + if not (_FIXTURE.parent / "overview_external_ovr_uint16.tif.ovr").exists(): + pytest.skip("sidecar .ovr file not present") + return _FIXTURE + + +# --------------------------------------------------------------------------- +# Discovery helper: local file paths only. +# --------------------------------------------------------------------------- +def test_find_sidecar_returns_path_for_local_file_with_sidecar(): + src = str(_fixture_or_skip()) + assert find_sidecar(src) == src + ".ovr" + + +def test_find_sidecar_returns_none_when_sidecar_missing(tmp_path): + src = tmp_path / "no_sidecar_2112.tif" + src.write_bytes(b"\x00") + assert find_sidecar(str(src)) is None + + +def test_find_sidecar_returns_none_for_file_like_object(): + assert find_sidecar(io.BytesIO(b"")) is None + + +@pytest.mark.parametrize( + "uri", + [ + "s3://bucket/path.tif", + "gs://bucket/path.tif", + "az://container/path.tif", + "http://example.com/x.tif", + "https://example.com/x.tif", + "memory://buffer.tif", + ], +) +def test_find_sidecar_returns_none_for_remote_uri(uri): + assert find_sidecar(uri) is None + + +# --------------------------------------------------------------------------- +# Sidecar parser: returns a usable IFD list. +# --------------------------------------------------------------------------- +def test_load_sidecar_returns_two_ifds(): + src = _fixture_or_skip() + sidecar = load_sidecar(str(src) + ".ovr") + try: + # The bundled fixture was written with two overview factors (2, 4). + assert len(sidecar.ifds) == 2 + # First sidecar IFD is the 32x32 level, second is 16x16. + assert (sidecar.ifds[0].width, sidecar.ifds[0].height) == (32, 32) + assert (sidecar.ifds[1].width, sidecar.ifds[1].height) == (16, 16) + finally: + sidecar.data.close() + + +# --------------------------------------------------------------------------- +# End-to-end: each sidecar level reads through open_geotiff. +# --------------------------------------------------------------------------- +def test_open_geotiff_base_level_unchanged(): + src = _fixture_or_skip() + da = open_geotiff(str(src)) + assert da.shape == (64, 64) + assert da.dtype == np.uint16 + + +def test_open_geotiff_sidecar_level_1(): + src = _fixture_or_skip() + da = open_geotiff(str(src), overview_level=1) + assert da.shape == (32, 32) + assert da.dtype == np.uint16 + + +def test_open_geotiff_sidecar_level_2(): + src = _fixture_or_skip() + da = open_geotiff(str(src), overview_level=2) + assert da.shape == (16, 16) + assert da.dtype == np.uint16 + + +def test_open_geotiff_out_of_range_after_sidecar_appended(): + src = _fixture_or_skip() + with pytest.raises(ValueError, match="overview_level=3 is out of range"): + open_geotiff(str(src), overview_level=3) + + +# --------------------------------------------------------------------------- +# Byte parity with rasterio: the production reference for sidecar overviews. +# --------------------------------------------------------------------------- +def test_sidecar_level_1_matches_rasterio(): + rasterio = pytest.importorskip("rasterio") + src = _fixture_or_skip() + with rasterio.open(str(src)) as ds: + factor = ds.overviews(1)[0] + out_shape = (ds.height // factor, ds.width // factor) + rio_arr = ds.read(1, out_shape=out_shape) + xr_arr = open_geotiff(str(src), overview_level=1).values + np.testing.assert_array_equal(xr_arr, rio_arr) + + +def test_sidecar_level_2_matches_rasterio(): + rasterio = pytest.importorskip("rasterio") + src = _fixture_or_skip() + with rasterio.open(str(src)) as ds: + factor = ds.overviews(1)[1] + out_shape = (ds.height // factor, ds.width // factor) + rio_arr = ds.read(1, out_shape=out_shape) + xr_arr = open_geotiff(str(src), overview_level=2).values + np.testing.assert_array_equal(xr_arr, rio_arr) + + +# --------------------------------------------------------------------------- +# Sidecar reaches the reader entry point and the metadata-only helper. +# --------------------------------------------------------------------------- +def test_read_to_array_sidecar_level_1(tmp_path): + src = _fixture_or_skip() + arr, geo = read_to_array(str(src), overview_level=1) + assert arr.shape == (32, 32) + assert arr.dtype == np.uint16 + + +def test_metadata_only_includes_sidecar_levels(tmp_path): + src = _fixture_or_skip() + from xrspatial.geotiff import _read_geo_info + + geo, h, w, dtype, _ = _read_geo_info( + str(src), overview_level=2) + assert (h, w) == (16, 16) + # GeoTIFF tags inherit from the base IFD, so the metadata is non-empty. + assert geo is not None + + +# --------------------------------------------------------------------------- +# Missing sidecar gracefully falls back to base-only behaviour. +# --------------------------------------------------------------------------- +def test_missing_sidecar_raises_overview_out_of_range(tmp_path): + src = _fixture_or_skip() + # Copy only the base file; the sidecar stays behind. + copy_path = tmp_path / "no_sidecar_2112.tif" + shutil.copy(src, copy_path) + # Base level still works. + assert open_geotiff(str(copy_path)).shape == (64, 64) + # Asking for an overview level now fails the same way it did before + # sidecar support was added. + with pytest.raises(ValueError, match="out of range"): + open_geotiff(str(copy_path), overview_level=1) + + +# --------------------------------------------------------------------------- +# File-like buffer source still works for the base level (no sidecar lookup). +# --------------------------------------------------------------------------- +@pytest.fixture +def _gpu_or_skip(): + if not _gpu_available(): + pytest.skip("cupy + CUDA required") + + +def _gpu_available() -> bool: + import importlib.util + if importlib.util.find_spec("cupy") is None: + return False + try: + import cupy + return bool(cupy.cuda.is_available()) + except Exception: + return False + + +def test_gpu_eager_reads_sidecar_level_1(_gpu_or_skip): + src = _fixture_or_skip() + cpu = open_geotiff(str(src), overview_level=1) + gpu = open_geotiff(str(src), overview_level=1, gpu=True) + assert gpu.shape == cpu.shape + np.testing.assert_array_equal(gpu.data.get(), cpu.values) + + +def test_gpu_eager_reads_sidecar_level_2(_gpu_or_skip): + src = _fixture_or_skip() + cpu = open_geotiff(str(src), overview_level=2) + gpu = open_geotiff(str(src), overview_level=2, gpu=True) + assert gpu.shape == cpu.shape + np.testing.assert_array_equal(gpu.data.get(), cpu.values) + + +def test_gpu_eager_base_level_unchanged(_gpu_or_skip): + src = _fixture_or_skip() + gpu = open_geotiff(str(src), gpu=True) + assert gpu.shape == (64, 64) + + +# --------------------------------------------------------------------------- +# HTTP sidecar discovery: tests use a tiny local HTTP server so the +# probe and download paths are exercised without a network round-trip. +# --------------------------------------------------------------------------- +def _start_http_server(directory): + import http.server + import socketserver + import threading + + class _Handler(http.server.SimpleHTTPRequestHandler): + def log_message(self, *a, **kw): + return # silence + + def __init__(self, *a, **kw): + super().__init__(*a, directory=str(directory), **kw) + + httpd = socketserver.TCPServer(("127.0.0.1", 0), _Handler) + thread = threading.Thread(target=httpd.serve_forever, daemon=True) + thread.start() + return httpd, httpd.server_address[1] + + +def test_find_sidecar_http_probe_returns_url_when_present( + tmp_path, monkeypatch): + # The sidecar probe now routes through ``_HTTPSource``, which + # rejects loopback hostnames under the SSRF guard added in #1664. + # Loopback is the standard local-server pattern in this repo's HTTP + # tests (see ``test_golden_corpus_http_1930.py``); opt into the + # escape hatch the production reader exposes. + monkeypatch.setenv("XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS", "1") + src = _fixture_or_skip() + import shutil + shutil.copy(src, tmp_path / "x.tif") + shutil.copy(str(src) + ".ovr", tmp_path / "x.tif.ovr") + httpd, port = _start_http_server(tmp_path) + try: + url = f"http://127.0.0.1:{port}/x.tif" + assert find_sidecar(url) == url + ".ovr" + finally: + httpd.shutdown() + + +def test_find_sidecar_http_probe_returns_none_when_missing( + tmp_path, monkeypatch): + monkeypatch.setenv("XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS", "1") + src = _fixture_or_skip() + import shutil + shutil.copy(src, tmp_path / "x.tif") # no .ovr copied + httpd, port = _start_http_server(tmp_path) + try: + url = f"http://127.0.0.1:{port}/x.tif" + assert find_sidecar(url) is None + finally: + httpd.shutdown() + + +def test_find_sidecar_http_probe_rejects_loopback_without_env_override( + tmp_path): + """Without ``XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS=1``, the SSRF + guard makes a loopback probe silently return ``None`` -- same + silent-fail-to-base contract the rest of ``find_sidecar`` uses.""" + src = _fixture_or_skip() + import shutil + shutil.copy(src, tmp_path / "x.tif") + shutil.copy(str(src) + ".ovr", tmp_path / "x.tif.ovr") + httpd, port = _start_http_server(tmp_path) + try: + url = f"http://127.0.0.1:{port}/x.tif" + assert find_sidecar(url) is None + finally: + httpd.shutdown() + + +def test_load_sidecar_http_returns_ifds(tmp_path, monkeypatch): + monkeypatch.setenv("XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS", "1") + src = _fixture_or_skip() + import shutil + shutil.copy(str(src) + ".ovr", tmp_path / "x.tif.ovr") + httpd, port = _start_http_server(tmp_path) + try: + url = f"http://127.0.0.1:{port}/x.tif.ovr" + sidecar = load_sidecar(url) + assert len(sidecar.ifds) == 2 + assert (sidecar.ifds[0].width, sidecar.ifds[0].height) == (32, 32) + assert (sidecar.ifds[1].width, sidecar.ifds[1].height) == (16, 16) + finally: + httpd.shutdown() + + +def test_find_sidecar_fsspec_probe_returns_uri_when_present(tmp_path): + pytest.importorskip("fsspec") + src = _fixture_or_skip() + import shutil + shutil.copy(src, tmp_path / "y.tif") + shutil.copy(str(src) + ".ovr", tmp_path / "y.tif.ovr") + # file:// is a valid fsspec scheme that uses LocalFileSystem. + uri = f"file://{tmp_path}/y.tif" + assert find_sidecar(uri) == uri + ".ovr" + + +def test_file_like_source_reads_base_without_sidecar(): + src = _fixture_or_skip() + with open(src, "rb") as f: + buf = io.BytesIO(f.read()) + da = open_geotiff(buf) + assert da.shape == (64, 64)