diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py index b8d289b60..ca05daef7 100644 --- a/xrspatial/geotiff/_reader.py +++ b/xrspatial/geotiff/_reader.py @@ -935,7 +935,8 @@ def _get_pinned_pool(self, scheme: str, host: str, port: int | None, self._pinned_pools[key] = pool return pool - def _request(self, headers: dict | None = None): + def _request(self, headers: dict | None = None, + preload_content: bool = True): """Issue a GET with manual, validated redirect following. urllib3's built-in redirect follower has no validation hook, so @@ -949,6 +950,13 @@ def _request(self, headers: dict | None = None): that exists between ``getaddrinfo`` in the validator and the second ``getaddrinfo`` urllib3 would otherwise do at connect time. Issue #1846. + + ``preload_content=False`` returns a streaming response: the body + is not buffered into ``resp.data`` and the caller must drain it + via ``resp.stream(...)``. Used by :meth:`read_all` when a + ``max_bytes`` budget is in play, so the body is bounded + on-the-wire instead of being fully allocated before the cap is + checked. Issue #2051. """ from urllib.parse import urljoin timeout = self._urllib3_timeout() @@ -961,11 +969,31 @@ def _request(self, headers: dict | None = None): headers=headers, timeout=timeout, redirect=False, + preload_content=preload_content, ) if 300 <= resp.status < 400 and resp.status != 304: location = resp.headers.get('Location') if not location: return resp + # Release the redirect response's connection back to + # the pool. ``preload_content=True`` (the default) drains + # the body for us, but the streaming path + # (``preload_content=False``, used by ``read_all`` with a + # byte budget) leaves the connection borrowed -- if we + # do not release it here, subsequent hops will allocate + # fresh connections every time. Drain first so urllib3 + # can return the connection to the pool instead of + # closing it; a 3xx body is bounded by Content-Length so + # the drain is cheap. + if not preload_content: + try: + resp.drain_conn() + except Exception: # noqa: BLE001 + pass + try: + resp.release_conn() + except Exception: # noqa: BLE001 + pass # Resolve relative ``Location`` against the URL we just # requested, not against ``self._url``: chained # redirects can land us on a different origin. @@ -1184,8 +1212,99 @@ def read_ranges_coalesced( merged_bytes = self.read_ranges(merged, max_workers=max_workers) return split_coalesced_bytes(merged_bytes, mapping) - def read_all(self) -> bytes: - return self._request().data + def read_all(self, max_bytes: int | None = None) -> bytes: + """Fetch the full body, optionally bounded by ``max_bytes``. + + ``max_bytes`` caps both the advertised ``Content-Length`` (rejected + up front before any bytes are read into memory) and the actual + body size (streamed and aborted once ``max_bytes + 1`` bytes have + arrived). The ``+ 1`` is the over-shoot detector: a body that + exactly matches the cap passes, but a server that ignores or + lies about ``Content-Length`` and streams more bytes is caught + as soon as the first extra byte lands. + + Without a cap, a tiny TIFF header (e.g. 100x100) that survives + :func:`_check_dimensions` can still be served as a multi-gigabyte + HTTP body and the whole body is allocated before TIFF parsing + gets a chance to reject it. Issue #2051. + + ``max_bytes=None`` preserves the legacy unbounded behaviour for + callers that already gate the read upstream (e.g. cloud reads + gated by :data:`max_cloud_bytes`). + """ + if max_bytes is None: + return self._request().data + # Stream the body so the cap is enforced before the bytes land + # in memory. ``preload_content=False`` makes urllib3 hand us + # the response without buffering ``resp.data``. + resp = self._request(preload_content=False) + try: + self._check_content_length(resp.headers, max_bytes) + return self._read_capped(resp, max_bytes) + finally: + try: + resp.release_conn() + except Exception: # noqa: BLE001 + pass + + @staticmethod + def _check_content_length(headers, max_bytes: int) -> None: + """Reject a response whose advertised ``Content-Length`` exceeds the cap. + + This is the cheap pre-flight check; we still cap the actual read + below in case the server omits the header or lies about it. + + Missing or unparseable ``Content-Length`` returns silently -- + the streaming cap in :meth:`_read_capped_urllib3` / + :meth:`_read_capped_stdlib` is the real defence and will catch + an over-sized body whether the header was honest, dishonest, or + absent. + """ + raw = None + try: + raw = headers.get('Content-Length') + except AttributeError: + return + if raw is None: + return + try: + declared = int(raw) + except (TypeError, ValueError): + return + if declared > max_bytes: + raise OSError( + f"HTTP response declares Content-Length={declared:,} " + f"bytes, which exceeds the byte budget of " + f"{max_bytes:,} bytes computed from the TIFF strip " + f"table. The file is malformed or attempting " + f"denial-of-service. Issue #2051." + ) + + @staticmethod + def _read_capped(resp, max_bytes: int) -> bytes: + """Stream-read a urllib3 response, aborting past ``max_bytes``. + + Read at most ``max_bytes + 1`` bytes. The extra byte is the + over-shoot probe: if it arrives the server lied or omitted + ``Content-Length`` and tried to send a larger body. Raise + :class:`OSError` so callers that already handle network failures + also handle this. + """ + chunks: list[bytes] = [] + received = 0 + for chunk in resp.stream(amt=65536, decode_content=True): + if not chunk: + continue + chunks.append(chunk) + received += len(chunk) + if received > max_bytes: + raise OSError( + f"HTTP response body exceeded the byte budget of " + f"{max_bytes:,} bytes (received {received:,} bytes " + f"before abort). The server likely ignored or lied " + f"about Content-Length. Issue #2051." + ) + return b''.join(chunks) @property def size(self) -> int | None: @@ -1640,6 +1759,49 @@ def _has_sparse(byte_counts) -> bool: return False +#: Slack added to the strip-table byte budget for the TIFF header, +#: trailing IFD chain, ExifIFD, GeoKey directory, GDAL_METADATA, and any +#: ICC profile or XMP packet. 4 MiB is comfortable for real-world COGs +#: (the prefetch path already tolerates up to ``MAX_HTTP_HEADER_BYTES`` +#: of header bytes) while still bounding the body away from gigabyte +#: scale. Issue #2051. +_FULL_IMAGE_BUDGET_HEADER_SLACK = 4 * 1024 * 1024 + + +def _compute_full_image_byte_budget(offsets, byte_counts) -> int: + """Compute an upper bound on the legitimate HTTP body size for a stripped TIFF. + + A stripped TIFF body is laid out as: [TIFF header + IFDs + tag value + arrays] followed by strip payloads at the offsets listed in + ``StripOffsets``. The largest byte index any strip references is + ``max(offset + byte_count)`` across the strip table; the body cannot + legitimately extend past that point plus a small tail for trailing + metadata. We add :data:`_FULL_IMAGE_BUDGET_HEADER_SLACK` to cover the + header prologue (which lives at offset 0) and any tags that follow + the last strip. The cap is loose by design -- it exists to reject + bodies that are orders of magnitude larger than the file claims to + be, not to second-guess legitimate layouts. + + If the strip table is missing or empty (sparse-only, malformed), + fall back to the per-strip safety cap so the read is still bounded. + Issue #2051. + """ + fallback = _max_tile_bytes_from_env() + _FULL_IMAGE_BUDGET_HEADER_SLACK + if not offsets or not byte_counts: + return fallback + max_end = 0 + for off, bc in zip(offsets, byte_counts): + try: + end = int(off) + int(bc) + except (TypeError, ValueError): + continue + if end > max_end: + max_end = end + if max_end <= 0: + return fallback + return max_end + _FULL_IMAGE_BUDGET_HEADER_SLACK + + # --------------------------------------------------------------------------- # Strip reader # --------------------------------------------------------------------------- @@ -2391,7 +2553,15 @@ def _fetch_decode_cog_http_strips( # so the dim check uses their cap instead of the default 1B. if window is None: _check_dimensions(width, height, samples, max_pixels) - all_data = source.read_all() + # Bound the HTTP body to the byte size implied by the TIFF strip + # table. Without this cap, a tiny declared raster (which sails + # past ``_check_dimensions``) can still pull a multi-gigabyte + # body off the wire and into memory before ``_read_strips`` + # gets a chance to reject anything. The strip table tells us + # the maximum legitimate byte offset; anything beyond that is + # either a malformed file or a hostile server. Issue #2051. + max_bytes = _compute_full_image_byte_budget(offsets, byte_counts) + all_data = source.read_all(max_bytes=max_bytes) return _read_strips(all_data, ifd, header, dtype, max_pixels=max_pixels) diff --git a/xrspatial/geotiff/tests/test_http_read_all_bounded_2051.py b/xrspatial/geotiff/tests/test_http_read_all_bounded_2051.py new file mode 100644 index 000000000..379bc8d82 --- /dev/null +++ b/xrspatial/geotiff/tests/test_http_read_all_bounded_2051.py @@ -0,0 +1,330 @@ +"""Regression tests for issue #2051. + +``_HTTPSource.read_all()`` used to pull the full HTTP body unconditionally: +no ``Content-Length`` check, no streaming cap. A TIFF whose header +declares a tiny raster (which sails past ``_check_dimensions``) could +still be served as a multi-gigabyte body and the whole thing landed in +memory before TIFF parsing got a chance to reject anything. + +These tests stand up tiny loopback HTTP servers that misbehave in three +ways: + +- declared ``Content-Length`` exceeds the byte budget, +- ``Content-Length`` lies (says small, sends big), +- ``Content-Length`` is omitted entirely (chunked transfer encoding). + +Plus a positive test that legitimate full-image reads still work, and +unit tests for the ``_compute_full_image_byte_budget`` helper. +""" +from __future__ import annotations + +import http.server +import socketserver +import threading + +import numpy as np +import pytest + +from xrspatial.geotiff._reader import ( + _HTTPSource, + _compute_full_image_byte_budget, + _FULL_IMAGE_BUDGET_HEADER_SLACK, + _read_cog_http, +) +from xrspatial.geotiff._writer import write + + +# --------------------------------------------------------------------------- +# Server helpers +# --------------------------------------------------------------------------- + +class _BaseHandler(http.server.BaseHTTPRequestHandler): + payload: bytes = b'' + # Subclasses override these to fake misbehaviour. + lie_content_length: int | None = None + drop_content_length: bool = False + truncated_payload: bytes | None = None + + def log_message(self, *_args, **_kwargs): + pass + + +def _serve(handler_cls): + httpd = socketserver.TCPServer(('127.0.0.1', 0), handler_cls) + port = httpd.server_address[1] + thread = threading.Thread(target=httpd.serve_forever, daemon=True) + thread.start() + return f'http://127.0.0.1:{port}/cog.tif', httpd, thread + + +def _stop(httpd): + httpd.shutdown() + httpd.server_close() + + +@pytest.fixture(autouse=True) +def _allow_loopback(monkeypatch): + # Loopback addresses are blocked by the SSRF allow-list; the escape + # hatch lets the test reach 127.0.0.1. + monkeypatch.setenv('XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS', '1') + + +# --------------------------------------------------------------------------- +# Unit tests for the budget helper +# --------------------------------------------------------------------------- + +def test_budget_uses_max_strip_end_plus_slack(): + """Budget is ``max(offset + byte_count) + slack`` over the strip table.""" + offsets = [1024, 5000, 100_000] + byte_counts = [512, 1024, 4096] + budget = _compute_full_image_byte_budget(offsets, byte_counts) + # Largest end is 100_000 + 4096 = 104_096 + assert budget == 104_096 + _FULL_IMAGE_BUDGET_HEADER_SLACK + + +def test_budget_empty_strip_table_falls_back_to_per_strip_cap(): + """Empty / missing strip table falls back to the per-strip safety cap.""" + budget = _compute_full_image_byte_budget(None, None) + assert budget > 0 + budget_empty = _compute_full_image_byte_budget([], []) + assert budget_empty > 0 + + +def test_budget_all_sparse_falls_back_to_per_strip_cap(): + """A strip table where every strip is sparse (byte_count=0 and + offset=0) is degenerate; the helper falls back rather than picking + a useless cap of zero.""" + offsets = [0, 0, 0] + byte_counts = [0, 0, 0] + budget = _compute_full_image_byte_budget(offsets, byte_counts) + # Falls back to per-strip cap + slack, not 0. + assert budget > _FULL_IMAGE_BUDGET_HEADER_SLACK + + +# --------------------------------------------------------------------------- +# read_all with a byte budget +# --------------------------------------------------------------------------- + +def test_read_all_no_budget_returns_full_body(): + """Without ``max_bytes`` the legacy unbounded behaviour is preserved.""" + + class _Handler(_BaseHandler): + payload = b'A' * 1024 + + def do_GET(self): # noqa: N802 + self.send_response(200) + self.send_header('Content-Length', str(len(self.payload))) + self.end_headers() + self.wfile.write(self.payload) + + url, httpd, _ = _serve(_Handler) + try: + src = _HTTPSource(url) + data = src.read_all() + assert data == b'A' * 1024 + finally: + _stop(httpd) + + +def test_read_all_rejects_oversized_content_length(): + """Server advertises a Content-Length larger than the budget -- + rejected up front via OSError before any body is read.""" + + class _Handler(_BaseHandler): + payload = b'B' * 2048 + + def do_GET(self): # noqa: N802 + self.send_response(200) + self.send_header('Content-Length', str(len(self.payload))) + self.end_headers() + self.wfile.write(self.payload) + + url, httpd, _ = _serve(_Handler) + try: + src = _HTTPSource(url) + with pytest.raises(OSError, match="Content-Length"): + src.read_all(max_bytes=1024) + finally: + _stop(httpd) + + +def test_read_all_truncates_when_server_lies_about_content_length_small(): + """Server lies low: advertises a small Content-Length but sends a + much larger body. urllib3 trusts the advertised length and truncates + at the byte count the server declared, so the client is already + protected -- the extra bytes never reach Python memory. The cap is + irrelevant on this path because the body the caller sees never + exceeds the (truthful or lying) Content-Length. Lock in the + truncation behaviour so a future urllib3 / stdlib change does not + quietly turn this back into a vector.""" + + class _Handler(_BaseHandler): + # 100 KiB body, but advertised as 100 bytes. + big_body = b'L' * (100 * 1024) + lied_length = 100 + + def do_GET(self): # noqa: N802 + self.send_response(200) + self.send_header('Content-Length', str(self.lied_length)) + self.end_headers() + self.wfile.write(self.big_body) + + url, httpd, _ = _serve(_Handler) + try: + src = _HTTPSource(url) + # Budget is 1024 bytes, server says 100 -> pre-flight passes. + # The body returned is the 100 bytes the server claimed, not the + # 100 KiB it tried to send. + data = src.read_all(max_bytes=1024) + assert len(data) <= 100, ( + f"Got {len(data)} bytes from a server that advertised 100; " + f"the HTTP client failed to truncate at Content-Length and " + f"the byte budget did not catch the over-shoot." + ) + finally: + _stop(httpd) + + +def test_read_all_catches_missing_content_length(): + """Server omits Content-Length and uses chunked transfer encoding. + The pre-flight check has nothing to look at; the streaming cap must + still catch the over-sized body.""" + + class _Handler(_BaseHandler): + def do_GET(self): # noqa: N802 + body = b'C' * (100 * 1024) + self.send_response(200) + # No Content-Length header at all. + self.send_header('Transfer-Encoding', 'chunked') + self.end_headers() + # Send as a single chunk. + self.wfile.write(f'{len(body):x}\r\n'.encode('ascii')) + self.wfile.write(body) + self.wfile.write(b'\r\n0\r\n\r\n') + + url, httpd, _ = _serve(_Handler) + try: + src = _HTTPSource(url) + with pytest.raises(OSError, match="exceeded the byte budget"): + src.read_all(max_bytes=1024) + finally: + _stop(httpd) + + +def test_read_all_passes_when_body_fits_budget(): + """Legitimate path: body equals the budget exactly, returns cleanly.""" + + class _Handler(_BaseHandler): + payload = b'D' * 1024 + + def do_GET(self): # noqa: N802 + self.send_response(200) + self.send_header('Content-Length', str(len(self.payload))) + self.end_headers() + self.wfile.write(self.payload) + + url, httpd, _ = _serve(_Handler) + try: + src = _HTTPSource(url) + data = src.read_all(max_bytes=2048) + assert data == b'D' * 1024 + finally: + _stop(httpd) + + +# --------------------------------------------------------------------------- +# Stdlib fallback (urllib3 unavailable) +# --------------------------------------------------------------------------- + +# The stdlib ``urllib.request`` fallback path was removed in #2050 / +# #2055 (urllib3 is now a hard dependency). The three tests that +# previously covered the fallback's byte-budget enforcement no longer +# have a code path to exercise; the urllib3-only equivalents above +# (test_read_all_rejects_oversized_content_length, +# test_read_all_catches_missing_content_length, +# test_read_all_passes_when_body_fits_budget) keep the contract +# covered. + + +# --------------------------------------------------------------------------- +# End-to-end COG read +# --------------------------------------------------------------------------- + +class _RangeHandler(_BaseHandler): + """Honours Range requests; serves the full body on a no-Range GET.""" + + def do_GET(self): # noqa: N802 + rng = self.headers.get('Range') + if rng and rng.startswith('bytes='): + spec = rng[len('bytes='):] + start_s, _, end_s = spec.partition('-') + start = int(start_s) + end = int(end_s) if end_s else len(self.payload) - 1 + chunk = self.payload[start:end + 1] + self.send_response(206) + self.send_header( + 'Content-Range', + f'bytes {start}-{start + len(chunk) - 1}/{len(self.payload)}', + ) + self.send_header('Content-Length', str(len(chunk))) + self.end_headers() + self.wfile.write(chunk) + return + self.send_response(200) + self.send_header('Content-Length', str(len(self.payload))) + self.end_headers() + self.wfile.write(self.payload) + + +def _serve_payload(payload: bytes): + handler_cls = type( + 'BoundRangeHandler', (_RangeHandler,), {'payload': payload} + ) + return _serve(handler_cls) + + +def test_full_image_http_read_still_works_for_legitimate_cog(tmp_path): + """Sanity: with the cap in place, a normal stripped COG still reads + cleanly end-to-end. The strip-table-derived budget is loose enough + to cover the real on-wire body.""" + arr = np.arange(64 * 64, dtype=np.float32).reshape(64, 64) + path = str(tmp_path / 'legit_2051.tif') + # Stripped (not tiled) to exercise the strips path. ``cog=True`` + # writes COG-friendly tag ordering but stripped layout is the + # default for non-tiled writes. + write(arr, path, compression='deflate', tiled=False) + + with open(path, 'rb') as f: + payload = f.read() + + url, httpd, _ = _serve_payload(payload) + try: + result, _geo = _read_cog_http(url) + np.testing.assert_array_equal(result, arr) + finally: + _stop(httpd) + + +def test_full_image_http_read_rejects_padded_body(tmp_path): + """Attack scenario: a legitimate TIFF header is followed by extra + garbage past what the strip table accounts for. The + strip-table-derived budget rejects the body before it is fully + buffered into memory.""" + arr = np.arange(32 * 32, dtype=np.float32).reshape(32, 32) + path = str(tmp_path / 'padded_2051.tif') + write(arr, path, compression='deflate', tiled=False) + + with open(path, 'rb') as f: + legit_payload = f.read() + + # Append 64 MiB of zeros to the body. The strip table only covers + # the first len(legit_payload) bytes; anything past max(offset + + # byte_count) + slack is over-budget. + bloated = legit_payload + (b'\x00' * (64 * 1024 * 1024)) + + url, httpd, _ = _serve_payload(bloated) + try: + with pytest.raises(OSError, match="Content-Length|byte budget"): + _read_cog_http(url) + finally: + _stop(httpd)