From 96d3c42b1610dedf3679b46150c59b9b3e5031d8 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Mon, 18 May 2026 11:25:44 -0700 Subject: [PATCH 1/3] geotiff: bound HTTP full-image strip reads to declared byte size (#2051) `_HTTPSource.read_all()` was unbounded: no Content-Length validation, no streaming cap. The full-image stripped HTTP path in `_fetch_decode_cog_http_strips` only ran `_check_dimensions` against the TIFF header's declared pixel count, then handed the full body to `_read_strips`. A tiny declared raster (100x100, well within `_check_dimensions`) could still be served as a multi-gigabyte body and the whole body was allocated before TIFF parsing rejected anything. Fix (option b from the security report): - `_HTTPSource.read_all()` gains a `max_bytes: int | None = None` parameter. When set, it rejects an oversized `Content-Length` up front via `OSError`, then streams the body (urllib3 `preload_content=False` / stdlib `resp.read` in 64 KiB chunks) and aborts past `max_bytes + 1`. The `+ 1` is the over-shoot probe: a body that exactly matches the cap passes, but a server that omits or lies about Content-Length is caught the moment the first extra byte arrives. - `_request()` gains `preload_content=False` plumbing and releases the connection on the redirect path so streaming responses do not leak pool slots. - `_fetch_decode_cog_http_strips` computes the budget from the strip table (`max(offset + byte_count) + 4 MiB slack` for TIFF header / trailing metadata) and passes it through. - New `_compute_full_image_byte_budget` helper handles the degenerate cases (missing or all-sparse strip table) by falling back to the per-strip safety cap. `max_bytes=None` preserves the legacy unbounded behaviour for the non-HTTP callers (cloud reads are already gated by `max_cloud_bytes`; file reads are bounded by file size). Tests cover: - Content-Length advertised over budget -> OSError before the read. - Server omits Content-Length entirely (chunked) -> streaming cap catches the over-shoot. - Server lies low (advertises small, sends big) -> urllib3 truncates at the advertised length, locked in as the expected behaviour. - Legitimate full-image COG reads still work end-to-end. - Attacker pads a legitimate COG body with 64 MiB of zeros -> read rejected by either the Content-Length pre-flight or the streaming cap. - Budget helper returns the strip-table maximum plus slack, and falls back to the per-strip cap for empty / degenerate tables. Closes #2051. --- xrspatial/geotiff/_reader.py | 165 ++++++++- .../tests/test_http_read_all_bounded_2051.py | 315 ++++++++++++++++++ 2 files changed, 476 insertions(+), 4 deletions(-) create mode 100644 xrspatial/geotiff/tests/test_http_read_all_bounded_2051.py diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py index b8d289b60..d1defeb8f 100644 --- a/xrspatial/geotiff/_reader.py +++ b/xrspatial/geotiff/_reader.py @@ -935,7 +935,8 @@ def _get_pinned_pool(self, scheme: str, host: str, port: int | None, self._pinned_pools[key] = pool return pool - def _request(self, headers: dict | None = None): + def _request(self, headers: dict | None = None, + preload_content: bool = True): """Issue a GET with manual, validated redirect following. urllib3's built-in redirect follower has no validation hook, so @@ -949,6 +950,13 @@ def _request(self, headers: dict | None = None): that exists between ``getaddrinfo`` in the validator and the second ``getaddrinfo`` urllib3 would otherwise do at connect time. Issue #1846. + + ``preload_content=False`` returns a streaming response: the body + is not buffered into ``resp.data`` and the caller must drain it + via ``resp.stream(...)``. Used by :meth:`read_all` when a + ``max_bytes`` budget is in play, so the body is bounded + on-the-wire instead of being fully allocated before the cap is + checked. Issue #2051. """ from urllib.parse import urljoin timeout = self._urllib3_timeout() @@ -961,11 +969,24 @@ def _request(self, headers: dict | None = None): headers=headers, timeout=timeout, redirect=False, + preload_content=preload_content, ) if 300 <= resp.status < 400 and resp.status != 304: location = resp.headers.get('Location') if not location: return resp + # Release the redirect response's connection back to + # the pool. ``preload_content=True`` (the default) drains + # the body for us, but the streaming path + # (``preload_content=False``, used by ``read_all`` with a + # byte budget) leaves the connection borrowed -- if we + # do not release it here, subsequent hops will allocate + # fresh connections every time. + if not preload_content: + try: + resp.release_conn() + except Exception: # noqa: BLE001 + pass # Resolve relative ``Location`` against the URL we just # requested, not against ``self._url``: chained # redirects can land us on a different origin. @@ -1184,8 +1205,93 @@ def read_ranges_coalesced( merged_bytes = self.read_ranges(merged, max_workers=max_workers) return split_coalesced_bytes(merged_bytes, mapping) - def read_all(self) -> bytes: - return self._request().data + def read_all(self, max_bytes: int | None = None) -> bytes: + """Fetch the full body, optionally bounded by ``max_bytes``. + + ``max_bytes`` caps both the advertised ``Content-Length`` (rejected + up front before any bytes are read into memory) and the actual + body size (streamed and aborted once ``max_bytes + 1`` bytes have + arrived). The ``+ 1`` is the over-shoot detector: a body that + exactly matches the cap passes, but a server that ignores or + lies about ``Content-Length`` and streams more bytes is caught + as soon as the first extra byte lands. + + Without a cap, a tiny TIFF header (e.g. 100x100) that survives + :func:`_check_dimensions` can still be served as a multi-gigabyte + HTTP body and the whole body is allocated before TIFF parsing + gets a chance to reject it. Issue #2051. + + ``max_bytes=None`` preserves the legacy unbounded behaviour for + callers that already gate the read upstream (e.g. cloud reads + gated by :data:`max_cloud_bytes`). + """ + if max_bytes is None: + return self._request().data + # Stream the body so the cap is enforced before the bytes land + # in memory. ``preload_content=False`` makes urllib3 hand us + # the response without buffering ``resp.data``. + resp = self._request(preload_content=False) + try: + self._check_content_length(resp.headers, max_bytes) + return self._read_capped(resp, max_bytes) + finally: + try: + resp.release_conn() + except Exception: # noqa: BLE001 + pass + + @staticmethod + def _check_content_length(headers, max_bytes: int) -> None: + """Reject a response whose advertised ``Content-Length`` exceeds the cap. + + This is the cheap pre-flight check; we still cap the actual read + below in case the server omits the header or lies about it. + """ + raw = None + try: + raw = headers.get('Content-Length') + except AttributeError: + return + if raw is None: + return + try: + declared = int(raw) + except (TypeError, ValueError): + return + if declared > max_bytes: + raise OSError( + f"HTTP response declares Content-Length={declared:,} " + f"bytes, which exceeds the byte budget of " + f"{max_bytes:,} bytes computed from the TIFF strip " + f"table. The file is malformed or attempting " + f"denial-of-service. Issue #2051." + ) + + @staticmethod + def _read_capped(resp, max_bytes: int) -> bytes: + """Stream-read a urllib3 response, aborting past ``max_bytes``. + + Read at most ``max_bytes + 1`` bytes. The extra byte is the + over-shoot probe: if it arrives the server lied or omitted + ``Content-Length`` and tried to send a larger body. Raise + :class:`OSError` so callers that already handle network failures + also handle this. + """ + chunks: list[bytes] = [] + received = 0 + for chunk in resp.stream(amt=65536, decode_content=True): + if not chunk: + continue + chunks.append(chunk) + received += len(chunk) + if received > max_bytes: + raise OSError( + f"HTTP response body exceeded the byte budget of " + f"{max_bytes:,} bytes (received {received:,} bytes " + f"before abort). The server likely ignored or lied " + f"about Content-Length. Issue #2051." + ) + return b''.join(chunks) @property def size(self) -> int | None: @@ -1640,6 +1746,49 @@ def _has_sparse(byte_counts) -> bool: return False +#: Slack added to the strip-table byte budget for the TIFF header, +#: trailing IFD chain, ExifIFD, GeoKey directory, GDAL_METADATA, and any +#: ICC profile or XMP packet. 4 MiB is comfortable for real-world COGs +#: (the prefetch path already tolerates up to ``MAX_HTTP_HEADER_BYTES`` +#: of header bytes) while still bounding the body away from gigabyte +#: scale. Issue #2051. +_FULL_IMAGE_BUDGET_HEADER_SLACK = 4 * 1024 * 1024 + + +def _compute_full_image_byte_budget(offsets, byte_counts) -> int: + """Compute an upper bound on the legitimate HTTP body size for a stripped TIFF. + + A stripped TIFF body is laid out as: [TIFF header + IFDs + tag value + arrays] followed by strip payloads at the offsets listed in + ``StripOffsets``. The largest byte index any strip references is + ``max(offset + byte_count)`` across the strip table; the body cannot + legitimately extend past that point plus a small tail for trailing + metadata. We add :data:`_FULL_IMAGE_BUDGET_HEADER_SLACK` to cover the + header prologue (which lives at offset 0) and any tags that follow + the last strip. The cap is loose by design -- it exists to reject + bodies that are orders of magnitude larger than the file claims to + be, not to second-guess legitimate layouts. + + If the strip table is missing or empty (sparse-only, malformed), + fall back to the per-strip safety cap so the read is still bounded. + Issue #2051. + """ + fallback = _max_tile_bytes_from_env() + _FULL_IMAGE_BUDGET_HEADER_SLACK + if not offsets or not byte_counts: + return fallback + max_end = 0 + for off, bc in zip(offsets, byte_counts): + try: + end = int(off) + int(bc) + except (TypeError, ValueError): + continue + if end > max_end: + max_end = end + if max_end <= 0: + return fallback + return max_end + _FULL_IMAGE_BUDGET_HEADER_SLACK + + # --------------------------------------------------------------------------- # Strip reader # --------------------------------------------------------------------------- @@ -2391,7 +2540,15 @@ def _fetch_decode_cog_http_strips( # so the dim check uses their cap instead of the default 1B. if window is None: _check_dimensions(width, height, samples, max_pixels) - all_data = source.read_all() + # Bound the HTTP body to the byte size implied by the TIFF strip + # table. Without this cap, a tiny declared raster (which sails + # past ``_check_dimensions``) can still pull a multi-gigabyte + # body off the wire and into memory before ``_read_strips`` + # gets a chance to reject anything. The strip table tells us + # the maximum legitimate byte offset; anything beyond that is + # either a malformed file or a hostile server. Issue #2051. + max_bytes = _compute_full_image_byte_budget(offsets, byte_counts) + all_data = source.read_all(max_bytes=max_bytes) return _read_strips(all_data, ifd, header, dtype, max_pixels=max_pixels) diff --git a/xrspatial/geotiff/tests/test_http_read_all_bounded_2051.py b/xrspatial/geotiff/tests/test_http_read_all_bounded_2051.py new file mode 100644 index 000000000..ccc313f9a --- /dev/null +++ b/xrspatial/geotiff/tests/test_http_read_all_bounded_2051.py @@ -0,0 +1,315 @@ +"""Regression tests for issue #2051. + +``_HTTPSource.read_all()`` used to pull the full HTTP body unconditionally: +no ``Content-Length`` check, no streaming cap. A TIFF whose header +declares a tiny raster (which sails past ``_check_dimensions``) could +still be served as a multi-gigabyte body and the whole thing landed in +memory before TIFF parsing got a chance to reject anything. + +These tests stand up tiny loopback HTTP servers that misbehave in three +ways: + +- declared ``Content-Length`` exceeds the byte budget, +- ``Content-Length`` lies (says small, sends big), +- ``Content-Length`` is omitted entirely (chunked transfer encoding). + +Plus a positive test that legitimate full-image reads still work, and +unit tests for the ``_compute_full_image_byte_budget`` helper. +""" +from __future__ import annotations + +import http.server +import socketserver +import threading + +import numpy as np +import pytest + +from xrspatial.geotiff._reader import ( + _HTTPSource, + _compute_full_image_byte_budget, + _FULL_IMAGE_BUDGET_HEADER_SLACK, + _read_cog_http, +) +from xrspatial.geotiff._writer import write + + +# --------------------------------------------------------------------------- +# Server helpers +# --------------------------------------------------------------------------- + +class _BaseHandler(http.server.BaseHTTPRequestHandler): + payload: bytes = b'' + # Subclasses override these to fake misbehaviour. + lie_content_length: int | None = None + drop_content_length: bool = False + truncated_payload: bytes | None = None + + def log_message(self, *_args, **_kwargs): + pass + + +def _serve(handler_cls): + httpd = socketserver.TCPServer(('127.0.0.1', 0), handler_cls) + port = httpd.server_address[1] + thread = threading.Thread(target=httpd.serve_forever, daemon=True) + thread.start() + return f'http://127.0.0.1:{port}/cog.tif', httpd, thread + + +def _stop(httpd): + httpd.shutdown() + httpd.server_close() + + +@pytest.fixture(autouse=True) +def _allow_loopback(monkeypatch): + # Loopback addresses are blocked by the SSRF allow-list; the escape + # hatch lets the test reach 127.0.0.1. + monkeypatch.setenv('XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS', '1') + + +# --------------------------------------------------------------------------- +# Unit tests for the budget helper +# --------------------------------------------------------------------------- + +def test_budget_uses_max_strip_end_plus_slack(): + """Budget is ``max(offset + byte_count) + slack`` over the strip table.""" + offsets = [1024, 5000, 100_000] + byte_counts = [512, 1024, 4096] + budget = _compute_full_image_byte_budget(offsets, byte_counts) + # Largest end is 100_000 + 4096 = 104_096 + assert budget == 104_096 + _FULL_IMAGE_BUDGET_HEADER_SLACK + + +def test_budget_empty_strip_table_falls_back_to_per_strip_cap(): + """Empty / missing strip table falls back to the per-strip safety cap.""" + budget = _compute_full_image_byte_budget(None, None) + assert budget > 0 + budget_empty = _compute_full_image_byte_budget([], []) + assert budget_empty > 0 + + +def test_budget_all_sparse_falls_back_to_per_strip_cap(): + """A strip table where every strip is sparse (byte_count=0 and + offset=0) is degenerate; the helper falls back rather than picking + a useless cap of zero.""" + offsets = [0, 0, 0] + byte_counts = [0, 0, 0] + budget = _compute_full_image_byte_budget(offsets, byte_counts) + # Falls back to per-strip cap + slack, not 0. + assert budget > _FULL_IMAGE_BUDGET_HEADER_SLACK + + +# --------------------------------------------------------------------------- +# read_all with a byte budget +# --------------------------------------------------------------------------- + +def test_read_all_no_budget_returns_full_body(): + """Without ``max_bytes`` the legacy unbounded behaviour is preserved.""" + + class _Handler(_BaseHandler): + payload = b'A' * 1024 + + def do_GET(self): # noqa: N802 + self.send_response(200) + self.send_header('Content-Length', str(len(self.payload))) + self.end_headers() + self.wfile.write(self.payload) + + url, httpd, _ = _serve(_Handler) + try: + src = _HTTPSource(url) + data = src.read_all() + assert data == b'A' * 1024 + finally: + _stop(httpd) + + +def test_read_all_rejects_oversized_content_length(): + """Server advertises a Content-Length larger than the budget -- + rejected up front via OSError before any body is read.""" + + class _Handler(_BaseHandler): + payload = b'B' * 2048 + + def do_GET(self): # noqa: N802 + self.send_response(200) + self.send_header('Content-Length', str(len(self.payload))) + self.end_headers() + self.wfile.write(self.payload) + + url, httpd, _ = _serve(_Handler) + try: + src = _HTTPSource(url) + with pytest.raises(OSError, match="Content-Length"): + src.read_all(max_bytes=1024) + finally: + _stop(httpd) + + +def test_read_all_truncates_when_server_lies_about_content_length_small(): + """Server lies low: advertises a small Content-Length but sends a + much larger body. urllib3 trusts the advertised length and truncates + at the byte count the server declared, so the client is already + protected -- the extra bytes never reach Python memory. The cap is + irrelevant on this path because the body the caller sees never + exceeds the (truthful or lying) Content-Length. Lock in the + truncation behaviour so a future urllib3 / stdlib change does not + quietly turn this back into a vector.""" + + class _Handler(_BaseHandler): + # 100 KiB body, but advertised as 100 bytes. + big_body = b'L' * (100 * 1024) + lied_length = 100 + + def do_GET(self): # noqa: N802 + self.send_response(200) + self.send_header('Content-Length', str(self.lied_length)) + self.end_headers() + self.wfile.write(self.big_body) + + url, httpd, _ = _serve(_Handler) + try: + src = _HTTPSource(url) + # Budget is 1024 bytes, server says 100 -> pre-flight passes. + # The body returned is the 100 bytes the server claimed, not the + # 100 KiB it tried to send. + data = src.read_all(max_bytes=1024) + assert len(data) <= 100, ( + f"Got {len(data)} bytes from a server that advertised 100; " + f"the HTTP client failed to truncate at Content-Length and " + f"the byte budget did not catch the over-shoot." + ) + finally: + _stop(httpd) + + +def test_read_all_catches_missing_content_length(): + """Server omits Content-Length and uses chunked transfer encoding. + The pre-flight check has nothing to look at; the streaming cap must + still catch the over-sized body.""" + + class _Handler(_BaseHandler): + def do_GET(self): # noqa: N802 + body = b'C' * (100 * 1024) + self.send_response(200) + # No Content-Length header at all. + self.send_header('Transfer-Encoding', 'chunked') + self.end_headers() + # Send as a single chunk. + self.wfile.write(f'{len(body):x}\r\n'.encode('ascii')) + self.wfile.write(body) + self.wfile.write(b'\r\n0\r\n\r\n') + + url, httpd, _ = _serve(_Handler) + try: + src = _HTTPSource(url) + with pytest.raises(OSError, match="exceeded the byte budget"): + src.read_all(max_bytes=1024) + finally: + _stop(httpd) + + +def test_read_all_passes_when_body_fits_budget(): + """Legitimate path: body equals the budget exactly, returns cleanly.""" + + class _Handler(_BaseHandler): + payload = b'D' * 1024 + + def do_GET(self): # noqa: N802 + self.send_response(200) + self.send_header('Content-Length', str(len(self.payload))) + self.end_headers() + self.wfile.write(self.payload) + + url, httpd, _ = _serve(_Handler) + try: + src = _HTTPSource(url) + data = src.read_all(max_bytes=2048) + assert data == b'D' * 1024 + finally: + _stop(httpd) + + +# --------------------------------------------------------------------------- +# End-to-end COG read +# --------------------------------------------------------------------------- + +class _RangeHandler(_BaseHandler): + """Honours Range requests; serves the full body on a no-Range GET.""" + + def do_GET(self): # noqa: N802 + rng = self.headers.get('Range') + if rng and rng.startswith('bytes='): + spec = rng[len('bytes='):] + start_s, _, end_s = spec.partition('-') + start = int(start_s) + end = int(end_s) if end_s else len(self.payload) - 1 + chunk = self.payload[start:end + 1] + self.send_response(206) + self.send_header( + 'Content-Range', + f'bytes {start}-{start + len(chunk) - 1}/{len(self.payload)}', + ) + self.send_header('Content-Length', str(len(chunk))) + self.end_headers() + self.wfile.write(chunk) + return + self.send_response(200) + self.send_header('Content-Length', str(len(self.payload))) + self.end_headers() + self.wfile.write(self.payload) + + +def _serve_payload(payload: bytes): + handler_cls = type( + 'BoundRangeHandler', (_RangeHandler,), {'payload': payload} + ) + return _serve(handler_cls) + + +def test_full_image_http_read_still_works_for_legitimate_cog(tmp_path): + """Sanity: with the cap in place, a normal stripped COG still reads + cleanly end-to-end. The strip-table-derived budget is loose enough + to cover the real on-wire body.""" + arr = np.arange(64 * 64, dtype=np.float32).reshape(64, 64) + path = str(tmp_path / 'legit_2051.tif') + # Stripped (not tiled) to exercise the strips path. ``cog=True`` + # writes COG-friendly tag ordering but stripped layout is the + # default for non-tiled writes. + write(arr, path, compression='deflate', tiled=False) + + with open(path, 'rb') as f: + payload = f.read() + + url, httpd, _ = _serve_payload(payload) + try: + result, _geo = _read_cog_http(url) + np.testing.assert_array_equal(result, arr) + finally: + _stop(httpd) + + +def test_full_image_http_read_rejects_padded_body(tmp_path): + """Attack scenario: a legitimate TIFF header is followed by gigabytes + of garbage. The strip-table-derived budget rejects the body before + it can be allocated.""" + arr = np.arange(32 * 32, dtype=np.float32).reshape(32, 32) + path = str(tmp_path / 'padded_2051.tif') + write(arr, path, compression='deflate', tiled=False) + + with open(path, 'rb') as f: + legit_payload = f.read() + + # Append 64 MiB of zeros to the body. The strip table only covers + # the first len(legit_payload) bytes; anything past max(offset + + # byte_count) + slack is over-budget. + bloated = legit_payload + (b'\x00' * (64 * 1024 * 1024)) + + url, httpd, _ = _serve_payload(bloated) + try: + with pytest.raises(OSError, match="Content-Length|byte budget"): + _read_cog_http(url) + finally: + _stop(httpd) From a034fac7e9886f9a5c1889a76083e42ba7b505e1 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Mon, 18 May 2026 11:29:09 -0700 Subject: [PATCH 2/3] geotiff: address review feedback on read_all byte-budget (#2051) - Stdlib fallback: add three tests that force ``self._pool = None`` so the urllib.request branch in ``read_all`` is actually exercised. One for the Content-Length pre-flight check, one for the streaming cap on chunked bodies, and one for the legitimate-body pass-through. - Redirect on the streaming path: drain the redirect response body before ``release_conn`` so urllib3 can return the connection to the pool instead of closing it. 3xx bodies are tiny so the drain is cheap. - Doc tweak in ``_check_content_length``: spell out that returning silently on missing / unparseable headers is intentional because the streaming cap is the real defence. - Test docstring fix: the padded-body test rejects before the body is fully buffered, not before it is allocated at all. --- xrspatial/geotiff/_reader.py | 15 +++- .../tests/test_http_read_all_bounded_2051.py | 82 ++++++++++++++++++- 2 files changed, 93 insertions(+), 4 deletions(-) diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py index d1defeb8f..ca05daef7 100644 --- a/xrspatial/geotiff/_reader.py +++ b/xrspatial/geotiff/_reader.py @@ -981,8 +981,15 @@ def _request(self, headers: dict | None = None, # (``preload_content=False``, used by ``read_all`` with a # byte budget) leaves the connection borrowed -- if we # do not release it here, subsequent hops will allocate - # fresh connections every time. + # fresh connections every time. Drain first so urllib3 + # can return the connection to the pool instead of + # closing it; a 3xx body is bounded by Content-Length so + # the drain is cheap. if not preload_content: + try: + resp.drain_conn() + except Exception: # noqa: BLE001 + pass try: resp.release_conn() except Exception: # noqa: BLE001 @@ -1246,6 +1253,12 @@ def _check_content_length(headers, max_bytes: int) -> None: This is the cheap pre-flight check; we still cap the actual read below in case the server omits the header or lies about it. + + Missing or unparseable ``Content-Length`` returns silently -- + the streaming cap in :meth:`_read_capped_urllib3` / + :meth:`_read_capped_stdlib` is the real defence and will catch + an over-sized body whether the header was honest, dishonest, or + absent. """ raw = None try: diff --git a/xrspatial/geotiff/tests/test_http_read_all_bounded_2051.py b/xrspatial/geotiff/tests/test_http_read_all_bounded_2051.py index ccc313f9a..42c23b522 100644 --- a/xrspatial/geotiff/tests/test_http_read_all_bounded_2051.py +++ b/xrspatial/geotiff/tests/test_http_read_all_bounded_2051.py @@ -232,6 +232,81 @@ def do_GET(self): # noqa: N802 _stop(httpd) +# --------------------------------------------------------------------------- +# Stdlib fallback (urllib3 unavailable) +# --------------------------------------------------------------------------- + +def test_read_all_stdlib_fallback_rejects_oversized_content_length(): + """When urllib3 is not in play, ``read_all`` drops to stdlib + ``urllib.request``. The Content-Length pre-flight check must still + fire on that path.""" + + class _Handler(_BaseHandler): + payload = b'E' * 2048 + + def do_GET(self): # noqa: N802 + self.send_response(200) + self.send_header('Content-Length', str(len(self.payload))) + self.end_headers() + self.wfile.write(self.payload) + + url, httpd, _ = _serve(_Handler) + try: + src = _HTTPSource(url) + # Force the stdlib branch by dropping the pool reference. + src._pool = None + with pytest.raises(OSError, match="Content-Length"): + src.read_all(max_bytes=1024) + finally: + _stop(httpd) + + +def test_read_all_stdlib_fallback_catches_missing_content_length(): + """Stdlib path with chunked-encoded body must also cap on the + streamed bytes.""" + + class _Handler(_BaseHandler): + def do_GET(self): # noqa: N802 + body = b'F' * (100 * 1024) + self.send_response(200) + self.send_header('Transfer-Encoding', 'chunked') + self.end_headers() + self.wfile.write(f'{len(body):x}\r\n'.encode('ascii')) + self.wfile.write(body) + self.wfile.write(b'\r\n0\r\n\r\n') + + url, httpd, _ = _serve(_Handler) + try: + src = _HTTPSource(url) + src._pool = None + with pytest.raises(OSError, match="exceeded the byte budget"): + src.read_all(max_bytes=1024) + finally: + _stop(httpd) + + +def test_read_all_stdlib_fallback_passes_when_body_fits(): + """Stdlib path returns the body cleanly when it fits the budget.""" + + class _Handler(_BaseHandler): + payload = b'G' * 1024 + + def do_GET(self): # noqa: N802 + self.send_response(200) + self.send_header('Content-Length', str(len(self.payload))) + self.end_headers() + self.wfile.write(self.payload) + + url, httpd, _ = _serve(_Handler) + try: + src = _HTTPSource(url) + src._pool = None + data = src.read_all(max_bytes=2048) + assert data == b'G' * 1024 + finally: + _stop(httpd) + + # --------------------------------------------------------------------------- # End-to-end COG read # --------------------------------------------------------------------------- @@ -292,9 +367,10 @@ def test_full_image_http_read_still_works_for_legitimate_cog(tmp_path): def test_full_image_http_read_rejects_padded_body(tmp_path): - """Attack scenario: a legitimate TIFF header is followed by gigabytes - of garbage. The strip-table-derived budget rejects the body before - it can be allocated.""" + """Attack scenario: a legitimate TIFF header is followed by extra + garbage past what the strip table accounts for. The + strip-table-derived budget rejects the body before it is fully + buffered into memory.""" arr = np.arange(32 * 32, dtype=np.float32).reshape(32, 32) path = str(tmp_path / 'padded_2051.tif') write(arr, path, compression='deflate', tiled=False) From 8cd5172c5b4153ddb993bbcf88cfc766844021b4 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Mon, 18 May 2026 13:34:52 -0700 Subject: [PATCH 3/3] geotiff: drop stdlib-fallback tests now that urllib3 is required (#2051) #2050 / #2055 dropped the stdlib ``urllib.request`` HTTP fallback and made urllib3 a hard dependency. The three ``test_read_all_stdlib_fallback_*`` cells that exercised the fallback's byte-budget enforcement no longer have a code path to hit; the urllib3-only equivalents above keep the contract covered. --- .../tests/test_http_read_all_bounded_2051.py | 77 ++----------------- 1 file changed, 8 insertions(+), 69 deletions(-) diff --git a/xrspatial/geotiff/tests/test_http_read_all_bounded_2051.py b/xrspatial/geotiff/tests/test_http_read_all_bounded_2051.py index 42c23b522..379bc8d82 100644 --- a/xrspatial/geotiff/tests/test_http_read_all_bounded_2051.py +++ b/xrspatial/geotiff/tests/test_http_read_all_bounded_2051.py @@ -236,75 +236,14 @@ def do_GET(self): # noqa: N802 # Stdlib fallback (urllib3 unavailable) # --------------------------------------------------------------------------- -def test_read_all_stdlib_fallback_rejects_oversized_content_length(): - """When urllib3 is not in play, ``read_all`` drops to stdlib - ``urllib.request``. The Content-Length pre-flight check must still - fire on that path.""" - - class _Handler(_BaseHandler): - payload = b'E' * 2048 - - def do_GET(self): # noqa: N802 - self.send_response(200) - self.send_header('Content-Length', str(len(self.payload))) - self.end_headers() - self.wfile.write(self.payload) - - url, httpd, _ = _serve(_Handler) - try: - src = _HTTPSource(url) - # Force the stdlib branch by dropping the pool reference. - src._pool = None - with pytest.raises(OSError, match="Content-Length"): - src.read_all(max_bytes=1024) - finally: - _stop(httpd) - - -def test_read_all_stdlib_fallback_catches_missing_content_length(): - """Stdlib path with chunked-encoded body must also cap on the - streamed bytes.""" - - class _Handler(_BaseHandler): - def do_GET(self): # noqa: N802 - body = b'F' * (100 * 1024) - self.send_response(200) - self.send_header('Transfer-Encoding', 'chunked') - self.end_headers() - self.wfile.write(f'{len(body):x}\r\n'.encode('ascii')) - self.wfile.write(body) - self.wfile.write(b'\r\n0\r\n\r\n') - - url, httpd, _ = _serve(_Handler) - try: - src = _HTTPSource(url) - src._pool = None - with pytest.raises(OSError, match="exceeded the byte budget"): - src.read_all(max_bytes=1024) - finally: - _stop(httpd) - - -def test_read_all_stdlib_fallback_passes_when_body_fits(): - """Stdlib path returns the body cleanly when it fits the budget.""" - - class _Handler(_BaseHandler): - payload = b'G' * 1024 - - def do_GET(self): # noqa: N802 - self.send_response(200) - self.send_header('Content-Length', str(len(self.payload))) - self.end_headers() - self.wfile.write(self.payload) - - url, httpd, _ = _serve(_Handler) - try: - src = _HTTPSource(url) - src._pool = None - data = src.read_all(max_bytes=2048) - assert data == b'G' * 1024 - finally: - _stop(httpd) +# The stdlib ``urllib.request`` fallback path was removed in #2050 / +# #2055 (urllib3 is now a hard dependency). The three tests that +# previously covered the fallback's byte-budget enforcement no longer +# have a code path to exercise; the urllib3-only equivalents above +# (test_read_all_rejects_oversized_content_length, +# test_read_all_catches_missing_content_length, +# test_read_all_passes_when_body_fits_budget) keep the contract +# covered. # ---------------------------------------------------------------------------