diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py index d057f07f..5842e9e6 100644 --- a/xrspatial/geotiff/_reader.py +++ b/xrspatial/geotiff/_reader.py @@ -1872,102 +1872,111 @@ def _read_cog_http(url: str, overview_level: int | None = None, (array, geo_info) tuple """ source = _HTTPSource(url) - header, ifd, geo_info, header_bytes = _parse_cog_http_meta( - source, overview_level=overview_level) - - # Mirror the local-path orientation guard in ``read_to_array``: a - # windowed read against a non-default Orientation tag (274) has - # ambiguous semantics (does the window refer to file pixels or to - # display pixels?) and the HTTP path does not yet implement - # ``_apply_orientation``. Reject the combination here so HTTP and - # local reads agree on the contract for oriented TIFFs instead of - # silently returning a different region or pixel order. See PR - # #1680 review feedback on issue #1669. - if ifd.orientation != 1 and window is not None: - source.close() - raise ValueError( - f"Orientation tag (274) is {ifd.orientation}; windowed reads " - f"(window=...) and dask-chunked reads (chunks=...) are not " - f"supported for non-default orientation. Read the full " - f"array first, then slice." - ) - - # Validate ``window`` against the selected IFD's extent before the - # tile fetch is built. Without this, the helper silently clamps an - # out-of-bounds window and returns a smaller array, mismatching - # ``open_geotiff``'s caller-built coord arrays. Mirrors the - # local-path validator in ``read_to_array`` (#1634). - if window is not None: - w_r0, w_c0, w_r1, w_c1 = window - if (w_r0 < 0 or w_c0 < 0 - or w_r1 > ifd.height or w_c1 > ifd.width - or w_r0 >= w_r1 or w_c0 >= w_c1): - source.close() - raise ValueError( - f"window={window} is outside the source extent " - f"({ifd.height}x{ifd.width}) or has non-positive size.") - - # Validate ``band`` against the selected IFD's sample count before - # the tile fetch. Without this, ``band=-1`` silently picks the last - # channel via numpy negative indexing and ``band>=samples_per_pixel`` - # leaks a raw numpy ``IndexError``; on a single-band file ``band=N`` - # (N != 0) is dropped on the floor because the post-decode slice - # below is gated on ``arr.ndim == 3 and samples_per_pixel > 1``. - # Mirrors the local-path validator in ``read_to_array`` so all - # backends agree on the contract: 0-based non-negative index only. - # ``source.close()`` is called for symmetry with the success-path - # teardown below; it is a no-op on ``_HTTPSource`` today (the - # urllib3 ``PoolManager`` is shared module-level, not per-source) - # but a future resource-holding source will need it. See issue #1695. - if band is not None: - # Reject ``bool`` (and ``np.bool_``) up front; ``isinstance(True, int)`` - # is True in Python so ``True < samples_per_pixel`` evaluates without - # raising and silently reads band 1. ``np.bool_`` is not a subclass of - # ``bool`` so it needs its own check to match the VRT path's - # rejection. See #1786. - if isinstance(band, (bool, np.bool_)): + # Issue #1816: wrap everything after the ``_HTTPSource`` construction + # in try/finally so ``source.close()`` runs even when header parsing, + # validation, fetch/decode, or orientation/photometric post-processing + # raises. ``_HTTPSource.close()`` is a no-op today, but a future + # resource-holding source would leak on the error path without this. + # ``close()`` is idempotent, so the explicit pre-raise ``source.close()`` + # calls in the validation blocks below stay as-is. + try: + header, ifd, geo_info, header_bytes = _parse_cog_http_meta( + source, overview_level=overview_level) + + # Mirror the local-path orientation guard in ``read_to_array``: a + # windowed read against a non-default Orientation tag (274) has + # ambiguous semantics (does the window refer to file pixels or to + # display pixels?) and the HTTP path does not yet implement + # ``_apply_orientation``. Reject the combination here so HTTP and + # local reads agree on the contract for oriented TIFFs instead of + # silently returning a different region or pixel order. See PR + # #1680 review feedback on issue #1669. + if ifd.orientation != 1 and window is not None: source.close() raise ValueError( - f"band must be a non-negative int, got {band!r}") - if ifd.samples_per_pixel <= 1: - if band != 0: + f"Orientation tag (274) is {ifd.orientation}; windowed reads " + f"(window=...) and dask-chunked reads (chunks=...) are not " + f"supported for non-default orientation. Read the full " + f"array first, then slice." + ) + + # Validate ``window`` against the selected IFD's extent before the + # tile fetch is built. Without this, the helper silently clamps an + # out-of-bounds window and returns a smaller array, mismatching + # ``open_geotiff``'s caller-built coord arrays. Mirrors the + # local-path validator in ``read_to_array`` (#1634). + if window is not None: + w_r0, w_c0, w_r1, w_c1 = window + if (w_r0 < 0 or w_c0 < 0 + or w_r1 > ifd.height or w_c1 > ifd.width + or w_r0 >= w_r1 or w_c0 >= w_c1): + source.close() + raise ValueError( + f"window={window} is outside the source extent " + f"({ifd.height}x{ifd.width}) or has non-positive size.") + + # Validate ``band`` against the selected IFD's sample count before + # the tile fetch. Without this, ``band=-1`` silently picks the last + # channel via numpy negative indexing and ``band>=samples_per_pixel`` + # leaks a raw numpy ``IndexError``; on a single-band file ``band=N`` + # (N != 0) is dropped on the floor because the post-decode slice + # below is gated on ``arr.ndim == 3 and samples_per_pixel > 1``. + # Mirrors the local-path validator in ``read_to_array`` so all + # backends agree on the contract: 0-based non-negative index only. + # ``source.close()`` is called for symmetry with the success-path + # teardown below; it is a no-op on ``_HTTPSource`` today (the + # urllib3 ``PoolManager`` is shared module-level, not per-source) + # but a future resource-holding source will need it. See issue #1695. + if band is not None: + # Reject ``bool`` (and ``np.bool_``) up front; ``isinstance(True, int)`` + # is True in Python so ``True < samples_per_pixel`` evaluates without + # raising and silently reads band 1. ``np.bool_`` is not a subclass of + # ``bool`` so it needs its own check to match the VRT path's + # rejection. See #1786. + if isinstance(band, (bool, np.bool_)): + source.close() + raise ValueError( + f"band must be a non-negative int, got {band!r}") + if ifd.samples_per_pixel <= 1: + if band != 0: + source.close() + raise IndexError( + f"band={band} requested on a single-band file.") + elif not 0 <= band < ifd.samples_per_pixel: source.close() raise IndexError( - f"band={band} requested on a single-band file.") - elif not 0 <= band < ifd.samples_per_pixel: - source.close() - raise IndexError( - f"band={band} out of range for " - f"{ifd.samples_per_pixel}-band file.") - - arr = _fetch_decode_cog_http_tiles( - source, header, ifd, max_pixels=max_pixels, window=window) - source.close() - - # Mirror the local-path band selection in ``read_to_array``: extract - # the requested band only after the array is materialised so the - # multi-band tile decode can populate every plane first. ``band`` - # outside the valid range raises ``IndexError`` the same as numpy. - if arr.ndim == 3 and ifd.samples_per_pixel > 1 and band is not None: - arr = arr[:, :, band] - - # Apply Orientation tag (274) so HTTP reads return the same pixel - # order and transform as the local-file path. Only the full-read - # branch reaches here; the windowed-read branch is rejected above - # for non-default orientation. See issue #1717. - if ifd.orientation != 1: - arr, geo_info = _apply_orientation_with_geo( - arr, geo_info, ifd.orientation) - - if ifd.photometric == 0 and ifd.samples_per_pixel == 1: - # Stash the inverted sentinel on geo_info so the caller's - # sentinel-to-NaN mask runs against the post-MinIsWhite value - # while ``attrs['nodata']`` keeps the original sentinel for - # round-trip on write (issue #1809). - inverted_nodata = _miniswhite_inverted_nodata( - geo_info.nodata, ifd, arr.dtype) + f"band={band} out of range for " + f"{ifd.samples_per_pixel}-band file.") + + arr = _fetch_decode_cog_http_tiles( + source, header, ifd, max_pixels=max_pixels, window=window) + + # Mirror the local-path band selection in ``read_to_array``: extract + # the requested band only after the array is materialised so the + # multi-band tile decode can populate every plane first. ``band`` + # outside the valid range raises ``IndexError`` the same as numpy. + if arr.ndim == 3 and ifd.samples_per_pixel > 1 and band is not None: + arr = arr[:, :, band] + + # Apply Orientation tag (274) so HTTP reads return the same pixel + # order and transform as the local-file path. Only the full-read + # branch reaches here; the windowed-read branch is rejected above + # for non-default orientation. See issue #1717. + if ifd.orientation != 1: + arr, geo_info = _apply_orientation_with_geo( + arr, geo_info, ifd.orientation) + + if ifd.photometric == 0 and ifd.samples_per_pixel == 1: + # Stash the inverted sentinel on geo_info so the caller's + # sentinel-to-NaN mask runs against the post-MinIsWhite value + # while ``attrs['nodata']`` keeps the original sentinel for + # round-trip on write (issue #1809). + inverted_nodata = _miniswhite_inverted_nodata( + geo_info.nodata, ifd, arr.dtype) + geo_info._mask_nodata = inverted_nodata arr = _apply_photometric_miniswhite(arr, ifd) - geo_info._mask_nodata = inverted_nodata + finally: + source.close() return arr, geo_info diff --git a/xrspatial/geotiff/tests/test_cog_http_close_on_error_1816.py b/xrspatial/geotiff/tests/test_cog_http_close_on_error_1816.py new file mode 100644 index 00000000..cd00779a --- /dev/null +++ b/xrspatial/geotiff/tests/test_cog_http_close_on_error_1816.py @@ -0,0 +1,229 @@ +"""Regression tests for issue #1816. + +``_read_cog_http`` called ``source.close()`` only on the success path: +the fetch/decode step plus the post-processing block (band slice, +orientation, photometric inversion) ran outside any ``try/finally``, +so a raise from any of them skipped the close. ``_HTTPSource.close()`` +is currently a no-op (a module-level urllib3 pool, not a per-source +resource), so the leak is latent rather than active, but the structure +needs the guard so a future resource-holding source does not silently +leak. + +These tests pin the close-on-error contract by injecting a wrapper +around ``_HTTPSource`` that records every ``close()`` call, then +exercising both the happy path and a tile-fetch failure path. +""" +from __future__ import annotations + +import http.server +import socketserver +import threading + +import numpy as np +import pytest + +from xrspatial.geotiff import _reader as reader_mod +from xrspatial.geotiff._reader import _read_cog_http +from xrspatial.geotiff._writer import write + + +# --------------------------------------------------------------------------- +# Loopback HTTP server with Range support (mirrors #1695 pattern). +# --------------------------------------------------------------------------- + +class _RangeHandler(http.server.BaseHTTPRequestHandler): + payload: bytes = b'' + + def do_GET(self): # noqa: N802 + rng = self.headers.get('Range') + if rng and rng.startswith('bytes='): + spec = rng[len('bytes='):] + start_s, _, end_s = spec.partition('-') + start = int(start_s) + end = int(end_s) if end_s else len(self.payload) - 1 + chunk = self.payload[start:end + 1] + self.send_response(206) + self.send_header('Content-Type', 'application/octet-stream') + self.send_header( + 'Content-Range', + f'bytes {start}-{start + len(chunk) - 1}/{len(self.payload)}', + ) + self.send_header('Content-Length', str(len(chunk))) + self.end_headers() + self.wfile.write(chunk) + return + self.send_response(200) + self.send_header('Content-Type', 'application/octet-stream') + self.send_header('Content-Length', str(len(self.payload))) + self.end_headers() + self.wfile.write(self.payload) + + def log_message(self, *_args, **_kwargs): + pass + + +def _serve(payload: bytes): + handler_cls = type( + 'RangeHandler1816', (_RangeHandler,), {'payload': payload} + ) + httpd = socketserver.TCPServer(('127.0.0.1', 0), handler_cls) + port = httpd.server_address[1] + thread = threading.Thread(target=httpd.serve_forever, daemon=True) + thread.start() + return f'http://127.0.0.1:{port}/cog.tif', httpd, thread + + +def _stop(httpd): + httpd.shutdown() + httpd.server_close() + + +@pytest.fixture(autouse=True) +def _allow_loopback(monkeypatch): + """The HTTP source rejects loopback by default after #1664.""" + monkeypatch.setenv('XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS', '1') + + +# --------------------------------------------------------------------------- +# Close-tracking wrapper installed via monkeypatch on the _HTTPSource +# constructor used inside _read_cog_http. +# --------------------------------------------------------------------------- + +class _CloseTracker: + """Delegates every attribute to a real ``_HTTPSource`` while + recording ``close()`` calls. Used to verify that ``_read_cog_http`` + closes the source on both the success and the failure path. + """ + + def __init__(self, real): + self._real = real + self.close_count = 0 + + def __getattr__(self, name): + return getattr(self._real, name) + + def close(self): + self.close_count += 1 + return self._real.close() + + +def _install_tracker(monkeypatch): + """Replace ``_HTTPSource`` in ``_reader`` with a factory that wraps + each instance in a ``_CloseTracker`` and stashes the trackers on a + list so the test can inspect them afterwards. + """ + trackers: list[_CloseTracker] = [] + real_cls = reader_mod._HTTPSource + + def factory(url, *args, **kwargs): + tracker = _CloseTracker(real_cls(url, *args, **kwargs)) + trackers.append(tracker) + return tracker + + monkeypatch.setattr(reader_mod, '_HTTPSource', factory) + return trackers + + +# --------------------------------------------------------------------------- +# Fixture: a real single-band COG served over loopback. +# --------------------------------------------------------------------------- + +@pytest.fixture +def single_band_cog(tmp_path): + arr = np.arange(32 * 32, dtype=np.float32).reshape(32, 32) + path = str(tmp_path / 'tmp_1816_single.tif') + write(arr, path, compression='deflate', tiled=True, tile_size=16, + cog=True) + with open(path, 'rb') as f: + payload = f.read() + return path, payload, arr + + +# --------------------------------------------------------------------------- +# Happy path: close called exactly once after full post-processing. +# --------------------------------------------------------------------------- + +def test_http_source_closed_on_success(single_band_cog, monkeypatch): + """A successful ``_read_cog_http`` closes the source exactly once. + + Establishes the baseline so the failure-path test below isn't just + catching an unrelated regression in the success path. + """ + _path, payload, expected = single_band_cog + trackers = _install_tracker(monkeypatch) + url, httpd, _ = _serve(payload) + try: + arr, _geo = _read_cog_http(url) + np.testing.assert_array_equal(arr, expected) + finally: + _stop(httpd) + + assert len(trackers) == 1, ( + f"expected one _HTTPSource construction, got {len(trackers)}") + assert trackers[0].close_count == 1, ( + f"expected close() called once, got {trackers[0].close_count}") + + +# --------------------------------------------------------------------------- +# Failure path: tile fetch raises, source still closed. +# --------------------------------------------------------------------------- + +def test_http_source_closed_when_tile_fetch_raises( + single_band_cog, monkeypatch, +): + """When ``_fetch_decode_cog_http_tiles`` raises, ``_read_cog_http`` + still closes the source. Before the fix, ``source.close()`` ran + only on the success path, so any exception in the fetch/decode + bypassed the close. + """ + _path, payload, _expected = single_band_cog + trackers = _install_tracker(monkeypatch) + + def boom(*_args, **_kwargs): + raise OSError("simulated tile fetch failure") + + monkeypatch.setattr( + reader_mod, '_fetch_decode_cog_http_tiles', boom) + + url, httpd, _ = _serve(payload) + try: + with pytest.raises(OSError, match="simulated tile fetch failure"): + _read_cog_http(url) + finally: + _stop(httpd) + + assert len(trackers) == 1 + assert trackers[0].close_count == 1, ( + "source.close() was not called on the exception path; " + "the try/finally guard in _read_cog_http is missing or broken") + + +# --------------------------------------------------------------------------- +# Failure path: post-processing (orientation) raises, source still closed. +# --------------------------------------------------------------------------- + +def test_http_source_closed_when_post_processing_raises( + single_band_cog, monkeypatch, +): + """An exception from the orientation/photometric step also runs + through ``finally``. Guards against a future regression that moves + ``source.close()`` back between the fetch and the post-processing. + """ + _path, payload, _expected = single_band_cog + trackers = _install_tracker(monkeypatch) + + def boom(*_args, **_kwargs): + raise RuntimeError("simulated photometric failure") + + monkeypatch.setattr( + reader_mod, '_apply_photometric_miniswhite', boom) + + url, httpd, _ = _serve(payload) + try: + with pytest.raises(RuntimeError, match="simulated photometric"): + _read_cog_http(url) + finally: + _stop(httpd) + + assert len(trackers) == 1 + assert trackers[0].close_count == 1