From 96763a5df3bc16759355dd27d456b9c61de5ba85 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Tue, 12 May 2026 05:59:30 -0700 Subject: [PATCH 1/2] Apply tile/strip byte cap to local TIFF reads + add SSRF defenses on HTTP source (#1664) Local-file reads now check ``TileByteCounts`` and ``StripByteCounts`` against the same ``XRSPATIAL_COG_MAX_TILE_BYTES`` cap that HTTP uses. A small compressed slice can still expand into gigabytes under deflate/zstd/lzw, so mmap-bounded slicing alone is not enough. ``_HTTPSource`` now validates URLs at construction: - Scheme allow-list (http, https). Widen via ``XRSPATIAL_GEOTIFF_ALLOWED_SCHEMES``. - Hostnames resolved through ``socket.getaddrinfo``; any loopback, link-local, RFC1918 private, multicast, or reserved IP is rejected. Override via ``XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS=1``. Rejects on any resolved IP being unsafe so DNS-rebind tricks fail closed. - Explicit connect (10 s) and read (30 s) timeouts. Override via ``XRSPATIAL_GEOTIFF_HTTP_CONNECT_TIMEOUT`` and ``XRSPATIAL_GEOTIFF_HTTP_READ_TIMEOUT``. - Redirect cap of 5 set on the urllib3 Retry config. New ``UnsafeURLError`` (a ``ValueError`` subclass) is re-exported on ``xrspatial.geotiff``. Existing handlers that catch ``ValueError`` keep working. The local-server test in ``test_cog_http_concurrent.py`` and the old ``test_local_path_unaffected_by_cap`` test are updated to reflect the new behavior; the former sets ``XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS=1`` for its 127.0.0.1 loop, the latter no longer pins the broken cap-bypass. --- xrspatial/geotiff/__init__.py | 3 +- xrspatial/geotiff/_reader.py | 276 +++++++++++++++++- .../geotiff/tests/test_cog_http_concurrent.py | 11 +- xrspatial/geotiff/tests/test_features.py | 1 + xrspatial/geotiff/tests/test_security.py | 14 +- 5 files changed, 280 insertions(+), 25 deletions(-) diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index 41d89fa2..6dbe990e 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -38,13 +38,14 @@ import xarray as xr from ._geotags import GeoTransform, RASTER_PIXEL_IS_AREA, RASTER_PIXEL_IS_POINT -from ._reader import read_to_array +from ._reader import UnsafeURLError, read_to_array from ._writer import write # All names below are part of the supported public API. ``plot_geotiff`` # is intentionally omitted: it is deprecated in favour of ``da.xrs.plot()`` # and emits a ``DeprecationWarning`` when called. __all__ = [ + 'UnsafeURLError', 'open_geotiff', 'read_geotiff_gpu', 'read_geotiff_dask', diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py index 49235e95..e847e36c 100644 --- a/xrspatial/geotiff/_reader.py +++ b/xrspatial/geotiff/_reader.py @@ -59,13 +59,16 @@ def _check_dimensions(width, height, samples, max_pixels): ) -#: Default per-tile compressed-byte cap for HTTP COG reads. A crafted -#: ``TileByteCounts`` entry can declare arbitrarily many bytes, and the -#: HTTP path then tries to fetch and buffer that many bytes from the -#: server before it ever decompresses. 256 MiB tolerates legitimate -#: large tiles (RGB JPEG2000 at very high resolution can land in the -#: tens of MB) while keeping the fetch bounded. Override via the -#: ``XRSPATIAL_COG_MAX_TILE_BYTES`` environment variable. +#: Default per-tile (or per-strip) compressed-byte cap. A crafted +#: ``TileByteCounts`` / ``StripByteCounts`` entry can declare arbitrarily +#: many bytes. On HTTP, the reader would issue a Range GET sized by the +#: attacker's value; on local files, mmap slicing is bounded by the file +#: size but a small compressed slice can still decompress (deflate/zstd/ +#: lzw) into hundreds of MiB. 256 MiB tolerates legitimate large tiles +#: (RGB JPEG2000 at very high resolution can land in the tens of MB) +#: while keeping the fetch / decode bounded. Override via the +#: ``XRSPATIAL_COG_MAX_TILE_BYTES`` environment variable. Issues #1536 +#: (HTTP) and #1664 (local). MAX_TILE_BYTES_DEFAULT = 256 << 20 # 256 MiB @@ -290,7 +293,16 @@ def _get_http_pool(): _http_pool = urllib3.PoolManager( num_pools=10, maxsize=10, - retries=urllib3.Retry(total=2, backoff_factor=0.1), + retries=urllib3.Retry( + total=2, + backoff_factor=0.1, + # Cap redirects explicitly; without this the urllib3 + # default (30 in older versions, fewer in newer) leaves + # a long redirect chain reachable to an attacker who + # controls a server in the allow-listed range. Issue + # #1664. + redirect=_HTTP_MAX_REDIRECTS, + ), ) return _http_pool except ImportError: @@ -300,6 +312,179 @@ def _get_http_pool(): _http_pool = None +# --------------------------------------------------------------------------- +# SSRF defenses for _HTTPSource (issue #1664) +# --------------------------------------------------------------------------- + +#: Maximum number of redirects to follow when fetching a TIFF over HTTP. +_HTTP_MAX_REDIRECTS = 5 + +#: Default connect / read timeouts (seconds) for HTTP TIFF fetches. +_HTTP_CONNECT_TIMEOUT_DEFAULT = 10.0 +_HTTP_READ_TIMEOUT_DEFAULT = 30.0 + +#: Default URL schemes that ``_HTTPSource`` will accept. +_HTTP_ALLOWED_SCHEMES_DEFAULT = ('http', 'https') + + +def _http_allowed_schemes() -> tuple[str, ...]: + """Return the set of URL schemes ``_HTTPSource`` accepts. + + ``XRSPATIAL_GEOTIFF_ALLOWED_SCHEMES`` (comma-separated) overrides the + default. The env var only widens the allow-list; ``http`` and ``https`` + stay implicit. + """ + raw = _os_module.environ.get('XRSPATIAL_GEOTIFF_ALLOWED_SCHEMES') + if not raw: + return _HTTP_ALLOWED_SCHEMES_DEFAULT + extras = tuple( + s.strip().lower() for s in raw.split(',') if s.strip() + ) + return _HTTP_ALLOWED_SCHEMES_DEFAULT + extras + + +def _http_allow_private_hosts() -> bool: + """Return True if loopback / link-local / private IPs are allowed.""" + raw = _os_module.environ.get('XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS') + if raw is None: + return False + return raw.strip().lower() in ('1', 'true', 'yes', 'on') + + +def _http_timeout_from_env(var_name: str, default: float) -> float: + """Parse a positive-float timeout from the named env var, or fall back.""" + raw = _os_module.environ.get(var_name) + if raw is None: + return default + try: + val = float(raw) + except (TypeError, ValueError): + return default + return val if val > 0 else default + + +def _http_connect_timeout() -> float: + return _http_timeout_from_env( + 'XRSPATIAL_GEOTIFF_HTTP_CONNECT_TIMEOUT', + _HTTP_CONNECT_TIMEOUT_DEFAULT, + ) + + +def _http_read_timeout() -> float: + return _http_timeout_from_env( + 'XRSPATIAL_GEOTIFF_HTTP_READ_TIMEOUT', + _HTTP_READ_TIMEOUT_DEFAULT, + ) + + +class UnsafeURLError(ValueError): + """Raised when an HTTP URL fails the SSRF allow-list check. + + Subclasses ``ValueError`` so existing callers that catch ``ValueError`` + on bad input keep working. Carries the offending URL on ``.url`` for + structured logging. + """ + + def __init__(self, msg: str, url: str | None = None): + super().__init__(msg) + self.url = url + + +def _ip_is_private(ip_str: str) -> bool: + """Return True if *ip_str* is a loopback, link-local, or private IP. + + Covers both IPv4 and IPv6. Multicast and unspecified addresses are + treated as unsafe (no legitimate reason to GET a TIFF from them, and + cloud metadata sometimes lives behind link-local IPv6). + """ + import ipaddress + try: + ip = ipaddress.ip_address(ip_str) + except ValueError: + # Not a literal IP -- caller must resolve it first. + return False + # ``is_private`` is True for RFC1918 (10/8, 172.16/12, 192.168/16), + # the IPv6 ULAs (fc00::/7), and -- in stdlib >= 3.4 -- also for + # loopback / link-local. Stay explicit so we don't depend on subtle + # behaviour across Python versions. + return ( + ip.is_loopback + or ip.is_link_local + or ip.is_private + or ip.is_multicast + or ip.is_unspecified + or ip.is_reserved + ) + + +def _validate_http_url(url: str) -> None: + """Reject URLs that would let ``_HTTPSource`` reach unsafe destinations. + + Enforces: + + * scheme in ``_http_allowed_schemes()`` (default http / https) + * hostname resolves to at least one non-loopback, non-link-local, + non-private IP (override via ``XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS``) + * hostname is non-empty + + Raises :class:`UnsafeURLError` (a ``ValueError`` subclass) on any of + the above. Issue #1664. + """ + import socket + from urllib.parse import urlparse + + if not isinstance(url, str) or not url: + raise UnsafeURLError( + "HTTP source requires a non-empty URL string", url=url) + + parsed = urlparse(url) + scheme = (parsed.scheme or '').lower() + allowed = _http_allowed_schemes() + if scheme not in allowed: + raise UnsafeURLError( + f"URL scheme {scheme!r} is not in the allow-list " + f"{allowed}. Set XRSPATIAL_GEOTIFF_ALLOWED_SCHEMES to " + f"widen it. URL: {url!r}", + url=url, + ) + + host = parsed.hostname + if not host: + raise UnsafeURLError( + f"URL {url!r} has no hostname", url=url) + + if _http_allow_private_hosts(): + return + + # Resolve and reject if any resolved IP is in a private/loopback/link- + # local/multicast range. Rejecting on *any* match (rather than all) + # prevents DNS-rebind tricks that return both a public and a private + # IP for the same name. socket.getaddrinfo handles IPv4, IPv6, and + # literal IP strings uniformly. + try: + infos = socket.getaddrinfo(host, parsed.port, type=socket.SOCK_STREAM) + except socket.gaierror as e: + raise UnsafeURLError( + f"could not resolve host {host!r}: {e}", url=url) from e + + for info in infos: + sockaddr = info[4] + # sockaddr is (ip, port) for AF_INET and (ip, port, flow, scope) + # for AF_INET6 -- the IP is always index 0. + ip_str = sockaddr[0] + # IPv6 scoped addresses come back as 'fe80::1%eth0' -- strip the + # zone id before passing to ipaddress. + if '%' in ip_str: + ip_str = ip_str.split('%', 1)[0] + if _ip_is_private(ip_str): + raise UnsafeURLError( + f"host {host!r} resolves to {ip_str!r}, which is in a " + f"loopback / link-local / private range. Set " + f"XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS=1 to allow.", + url=url, + ) + + # --------------------------------------------------------------------------- # HTTP range coalescing # --------------------------------------------------------------------------- @@ -412,9 +597,28 @@ class _HTTPSource: """ def __init__(self, url: str): + # SSRF defense (issue #1664): validate scheme / host *before* + # any network call. UnsafeURLError subclasses ValueError so + # callers that already catch ValueError keep working. The check + # is best-effort -- DNS results can change between validate + # time and connect time, but rejecting at construction blocks + # the vast majority of static SSRF payloads. + _validate_http_url(url) self._url = url self._size = None self._pool = _get_http_pool() + self._connect_timeout = _http_connect_timeout() + self._read_timeout = _http_read_timeout() + + def _urllib3_timeout(self): + """Build a urllib3 Timeout object lazily. + + Imported here so that the module-level import of urllib3 stays + optional (we fall back to stdlib if urllib3 is missing). + """ + import urllib3 + return urllib3.Timeout( + connect=self._connect_timeout, read=self._read_timeout) def read_range(self, start: int, length: int) -> bytes: end = start + length - 1 @@ -422,14 +626,18 @@ def read_range(self, start: int, length: int) -> bytes: resp = self._pool.request( 'GET', self._url, headers={'Range': f'bytes={start}-{end}'}, + timeout=self._urllib3_timeout(), + redirect=_HTTP_MAX_REDIRECTS, ) return resp.data - # Fallback: stdlib + # Fallback: stdlib. urlopen's ``timeout`` is a single value, so + # use the more conservative read timeout; the connect timeout + # isn't separately controllable on stdlib urllib. req = urllib.request.Request( self._url, headers={'Range': f'bytes={start}-{end}'}, ) - with urllib.request.urlopen(req) as resp: + with urllib.request.urlopen(req, timeout=self._read_timeout) as resp: return resp.read() def read_ranges( @@ -490,9 +698,14 @@ def read_ranges_coalesced( def read_all(self) -> bytes: if self._pool is not None: - resp = self._pool.request('GET', self._url) + resp = self._pool.request( + 'GET', self._url, + timeout=self._urllib3_timeout(), + redirect=_HTTP_MAX_REDIRECTS, + ) return resp.data - with urllib.request.urlopen(self._url) as resp: + with urllib.request.urlopen( + self._url, timeout=self._read_timeout) as resp: return resp.read() @property @@ -923,6 +1136,25 @@ def _read_strips(data: bytes, ifd: IFD, header: TIFFHeader, if offsets is None or byte_counts is None: raise ValueError("Missing strip offsets or byte counts") + # Per-strip compressed-byte cap. Mirrors the HTTP path: a crafted + # ``StripByteCounts`` can declare a huge value and even though mmap + # slicing on the local path is bounded by the file size, the slice + # is still passed into the decompressor which can expand a few KiB + # of crafted deflate/zstd into gigabytes of decoded output. Issue + # #1664. Override via ``XRSPATIAL_COG_MAX_TILE_BYTES`` (the env var + # is shared with the tile path because the budget is the same). + max_tile_bytes = _max_tile_bytes_from_env() + for _strip_idx, _bc in enumerate(byte_counts): + if _bc > max_tile_bytes: + raise ValueError( + f"TIFF strip {_strip_idx} declares " + f"StripByteCount={_bc:,} bytes, which exceeds the " + f"per-strip safety cap of {max_tile_bytes:,} bytes. " + f"The file is malformed or attempting denial-of-service. " + f"Override via XRSPATIAL_COG_MAX_TILE_BYTES if this file " + f"is legitimate." + ) + planar = ifd.planar_config # 1=chunky (interleaved), 2=planar (separate) # Determine output region @@ -1074,6 +1306,22 @@ def _read_tiles(data: bytes, ifd: IFD, header: TIFFHeader, # A single tile's decoded bytes must also fit under the pixel budget. _check_dimensions(tw, th, samples, max_pixels) + # Per-tile compressed-byte cap (issue #1664). Same env var as the + # HTTP path. mmap slicing is bounded by the file size, but the slice + # gets handed to the decompressor, and a small slice can balloon + # into gigabytes through deflate / zstd / lzw / lerc. + max_tile_bytes = _max_tile_bytes_from_env() + for _tile_idx, _bc in enumerate(byte_counts): + if _bc > max_tile_bytes: + raise ValueError( + f"TIFF tile {_tile_idx} declares " + f"TileByteCount={_bc:,} bytes, which exceeds the " + f"per-tile safety cap of {max_tile_bytes:,} bytes. " + f"The file is malformed or attempting denial-of-service. " + f"Override via XRSPATIAL_COG_MAX_TILE_BYTES if this file " + f"is legitimate." + ) + planar = ifd.planar_config tiles_across = math.ceil(width / tw) tiles_down = math.ceil(height / th) @@ -1391,8 +1639,8 @@ def _fetch_decode_cog_http_tiles( # before the fetch list is built. A crafted COG can claim arbitrarily # large TileByteCounts; without this guard the HTTP layer would issue # a Range request sized by the attacker's value (issue #1536). The cap - # is overridable via XRSPATIAL_COG_MAX_TILE_BYTES; the local-mmap path - # is naturally bounded by file size and does not need this check. + # is overridable via XRSPATIAL_COG_MAX_TILE_BYTES. The local-mmap path + # applies the same cap in _read_tiles / _read_strips (issue #1664). max_tile_bytes = _max_tile_bytes_from_env() fetch_ranges: list[tuple[int, int]] = [] placements: list[tuple[int, int]] = [] # (tr, tc) per fetched tile diff --git a/xrspatial/geotiff/tests/test_cog_http_concurrent.py b/xrspatial/geotiff/tests/test_cog_http_concurrent.py index 991fc516..4cfcf91b 100644 --- a/xrspatial/geotiff/tests/test_cog_http_concurrent.py +++ b/xrspatial/geotiff/tests/test_cog_http_concurrent.py @@ -143,8 +143,15 @@ def log_message(self, *_args, **_kwargs): @pytest.fixture -def cog_http_server(tmp_path): - """Spin up a local http.server serving a tiled COG, yield (url, arr).""" +def cog_http_server(tmp_path, monkeypatch): + """Spin up a local http.server serving a tiled COG, yield (url, arr). + + Sets ``XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS=1`` for the duration of + the test because ``_HTTPSource`` blocks 127.0.0.1 by default after + issue #1664. The escape hatch is the documented way to keep loopback + test servers working. + """ + monkeypatch.setenv('XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS', '1') arr = np.arange(64 * 64, dtype=np.float32).reshape(64, 64) path = str(tmp_path / 'tmp_1480_cog.tif') write(arr, path, compression='deflate', tiled=True, tile_size=16, diff --git a/xrspatial/geotiff/tests/test_features.py b/xrspatial/geotiff/tests/test_features.py index 11d5abe5..7c165633 100644 --- a/xrspatial/geotiff/tests/test_features.py +++ b/xrspatial/geotiff/tests/test_features.py @@ -2642,6 +2642,7 @@ def test_all_lists_supported_functions(self): # public API. If any of these gets removed or renamed, that is a # breaking change and should go through a deprecation cycle. expected = { + 'UnsafeURLError', # issue #1664 -- SSRF reject for HTTP reads 'open_geotiff', 'read_geotiff_gpu', 'read_geotiff_dask', diff --git a/xrspatial/geotiff/tests/test_security.py b/xrspatial/geotiff/tests/test_security.py index b9929865..67954537 100644 --- a/xrspatial/geotiff/tests/test_security.py +++ b/xrspatial/geotiff/tests/test_security.py @@ -707,23 +707,21 @@ def test_env_override_lifts_cap(self, tmp_path, monkeypatch): result, _ = _reader_mod._read_cog_http('http://mock/normal_env.tif') np.testing.assert_array_equal(result, arr) - def test_local_path_unaffected_by_cap(self, tmp_path, monkeypatch): - """The local-mmap reader bypasses the HTTP cap. + def test_local_path_respects_default_cap(self, tmp_path): + """Legitimate local reads stay well under the default cap. - A patched on-disk file with huge byte_counts decodes via mmap - slicing, which silently truncates at EOF. The cap is HTTP-only, - so a tight env cap should not break legitimate local reads. + Before #1664 the local path bypassed the cap entirely. Now the + cap is shared, so we just confirm the default (256 MiB) leaves + plenty of headroom for a normal small tiled COG. """ from xrspatial.geotiff import open_geotiff, to_geotiff import xarray as xr arr = np.arange(64 * 64, dtype=np.float32).reshape(64, 64) da = xr.DataArray(arr, dims=['y', 'x']) - path = str(tmp_path / "local_normal_1536.tif") + path = str(tmp_path / "local_normal_1664.tif") to_geotiff(da, path, tile_size=32, compression='deflate') - # Set the HTTP cap to 1 byte; local reads must still succeed - monkeypatch.setenv('XRSPATIAL_COG_MAX_TILE_BYTES', '1') result = open_geotiff(path) np.testing.assert_array_equal(result.values, arr) From f47ab84445222d5bf7b699db087bdfa7510739f1 Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Tue, 12 May 2026 05:59:42 -0700 Subject: [PATCH 2/2] Tests + docs for local tile cap and HTTP SSRF defenses (#1664) - ``test_local_tile_byte_cap_1664.py``: tiled and stripped TIFFs with fabricated huge ``TileByteCounts`` / ``StripByteCounts`` raise the cap. Env override and default cap headroom both exercised. - ``test_ssrf_hardening_1664.py``: scheme allow-list, IPv4 and IPv6 private/loopback/link-local rejection, env-var escape hatch, DNS-rebind partial-private rejection, timeout env-var parsing, and ``_HTTPSource.__init__`` integration. ``socket.getaddrinfo`` is monkeypatched per-test so no real DNS or HTTP is involved. - ``docs/source/reference/geotiff.rst``: new section documenting the cap, the SSRF defenses, every env-var override, and the ``UnsafeURLError`` exception type. Notes the ``XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS`` escape hatch for local test servers. --- docs/source/reference/geotiff.rst | 47 +++ .../tests/test_local_tile_byte_cap_1664.py | 197 ++++++++++++ .../geotiff/tests/test_ssrf_hardening_1664.py | 287 ++++++++++++++++++ 3 files changed, 531 insertions(+) create mode 100644 xrspatial/geotiff/tests/test_local_tile_byte_cap_1664.py create mode 100644 xrspatial/geotiff/tests/test_ssrf_hardening_1664.py diff --git a/docs/source/reference/geotiff.rst b/docs/source/reference/geotiff.rst index 134ae61c..bba46dc6 100644 --- a/docs/source/reference/geotiff.rst +++ b/docs/source/reference/geotiff.rst @@ -20,3 +20,50 @@ Writing xrspatial.geotiff.to_geotiff xrspatial.geotiff.write_geotiff_gpu xrspatial.geotiff.write_vrt + +Security and I/O limits +======================= + +``open_geotiff`` and the underlying reader enforce several limits to +keep crafted or hostile inputs from exhausting memory or reaching +internal network targets. All limits have safe defaults; advanced users +can override them via environment variables. + +Per-tile / per-strip compressed-byte cap +---------------------------------------- + +A crafted TIFF can declare arbitrarily large ``TileByteCounts`` or +``StripByteCounts``. Both the HTTP fetcher (which would issue a Range +GET sized by the attacker's value) and the local-file decoder (where a +small compressed slice can balloon under deflate / zstd / lzw) reject +any tile or strip whose declared size exceeds the cap. + +* Default: 256 MiB +* Override: ``XRSPATIAL_COG_MAX_TILE_BYTES`` (positive integer, bytes) +* Exception: ``ValueError`` ("safety cap") + +HTTP SSRF defenses +------------------ + +When ``open_geotiff`` is given an ``http(s)://`` URL, the reader rejects +URLs that would let a service-side caller probe internal infrastructure. + +* Scheme allow-list: ``http`` and ``https`` only. Widen via + ``XRSPATIAL_GEOTIFF_ALLOWED_SCHEMES`` (comma-separated list, e.g. + ``"ftp,gopher"``). +* Host filtering: hostnames that resolve to a loopback (``127.0.0.0/8``, + ``::1``), link-local (``169.254.0.0/16``, ``fe80::/10``), or RFC1918 + private range are rejected. Override via + ``XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS=1``. The check rejects on + *any* resolved IP being unsafe, which also blocks DNS-rebind tricks. +* Redirect cap: at most 5 redirects per request. +* Timeouts: 10 s connect, 30 s read by default. Override via + ``XRSPATIAL_GEOTIFF_HTTP_CONNECT_TIMEOUT`` and + ``XRSPATIAL_GEOTIFF_HTTP_READ_TIMEOUT`` (positive float, seconds). +* Exception: :class:`xrspatial.geotiff.UnsafeURLError` (a + ``ValueError`` subclass). + +If you run an integration test against a local HTTP server (e.g. +``http.server`` bound to ``127.0.0.1``), set +``XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS=1`` for the duration of the +test. diff --git a/xrspatial/geotiff/tests/test_local_tile_byte_cap_1664.py b/xrspatial/geotiff/tests/test_local_tile_byte_cap_1664.py new file mode 100644 index 00000000..bcad0ba5 --- /dev/null +++ b/xrspatial/geotiff/tests/test_local_tile_byte_cap_1664.py @@ -0,0 +1,197 @@ +"""Local-file tile/strip byte-count cap (issue #1664). + +Before #1664, ``XRSPATIAL_COG_MAX_TILE_BYTES`` only fired in the HTTP +fetch path. A crafted local TIFF with a huge ``TileByteCounts`` / +``StripByteCounts`` could still feed an enormous slice into the +decompressor, which can balloon into gigabytes of decoded output even +when the underlying mmap slice is bounded by the file size. + +These tests fabricate small COGs / strip-TIFFs, rewrite their byte +counts to oversized values, and check that the cap raises before the +decoder runs. +""" +from __future__ import annotations + +import struct + +import numpy as np +import pytest +import xarray as xr + +from xrspatial.geotiff import open_geotiff, to_geotiff +from xrspatial.geotiff import _reader as _reader_mod + + +# --------------------------------------------------------------------------- +# Helpers -- patch in-place IFD entries for tile / strip byte counts +# --------------------------------------------------------------------------- + + +def _patch_byte_counts(data: bytearray, tag: int, value: int) -> None: + """Rewrite every entry for *tag* (325=TileByteCounts, 279=StripByteCounts).""" + from xrspatial.geotiff._header import parse_header + header = parse_header(bytes(data)) + bo = header.byte_order + ifd_offset = header.first_ifd_offset + num_entries = struct.unpack_from(f'{bo}H', data, ifd_offset)[0] + entry_offset = ifd_offset + 2 + + for i in range(num_entries): + eo = entry_offset + i * 12 + cur_tag = struct.unpack_from(f'{bo}H', data, eo)[0] + if cur_tag != tag: + continue + type_id = struct.unpack_from(f'{bo}H', data, eo + 2)[0] + count = struct.unpack_from(f'{bo}I', data, eo + 4)[0] + if type_id == 4: # LONG + total = count * 4 + if total <= 4: + for k in range(count): + struct.pack_into(f'{bo}I', data, eo + 8 + k * 4, value) + else: + ptr = struct.unpack_from(f'{bo}I', data, eo + 8)[0] + for k in range(count): + struct.pack_into(f'{bo}I', data, ptr + k * 4, value) + elif type_id == 3: # SHORT + clipped = min(value, 0xFFFF) + total = count * 2 + if total <= 4: + for k in range(count): + struct.pack_into( + f'{bo}H', data, eo + 8 + k * 2, clipped) + else: + ptr = struct.unpack_from(f'{bo}I', data, eo + 8)[0] + for k in range(count): + struct.pack_into( + f'{bo}H', data, ptr + k * 2, clipped) + return + raise AssertionError(f"tag {tag} not found in IFD") + + +def _build_forged_tiled_cog(tmp_path, byte_count_value: int) -> str: + """Write a real tiled COG, patch every TileByteCounts entry, return path.""" + arr = np.arange(64 * 64, dtype=np.float32).reshape(64, 64) + da = xr.DataArray(arr, dims=['y', 'x']) + path = str(tmp_path / "forged_local_tiles_1664.tif") + to_geotiff(da, path, tile_size=32, compression='deflate') + with open(path, 'rb') as f: + data = bytearray(f.read()) + _patch_byte_counts(data, 325, byte_count_value) # 325 = TileByteCounts + with open(path, 'wb') as f: + f.write(data) + return path + + +def _build_forged_stripped_tif(tmp_path, byte_count_value: int) -> str: + """Write a strip-organized TIFF, patch every StripByteCounts entry.""" + arr = np.arange(64 * 64, dtype=np.float32).reshape(64, 64) + da = xr.DataArray(arr, dims=['y', 'x']) + path = str(tmp_path / "forged_local_strips_1664.tif") + # tiled=False forces strip layout; deflate gets the decompressor on + # the hot path so a huge declared size matters. + to_geotiff(da, path, tiled=False, compression='deflate') + with open(path, 'rb') as f: + data = bytearray(f.read()) + _patch_byte_counts(data, 279, byte_count_value) # 279 = StripByteCounts + with open(path, 'wb') as f: + f.write(data) + return path + + +# --------------------------------------------------------------------------- +# Tiled local reads +# --------------------------------------------------------------------------- + + +class TestLocalTileByteCap: + def test_huge_tile_byte_count_rejected(self, tmp_path, monkeypatch): + """A local tile with a huge TileByteCount raises before decode.""" + # 100 MB > the 1 MB cap we set below. + path = _build_forged_tiled_cog(tmp_path, 100 * 1024 * 1024) + monkeypatch.setenv('XRSPATIAL_COG_MAX_TILE_BYTES', str(1024 * 1024)) + + with pytest.raises(ValueError, match="TileByteCount"): + open_geotiff(path) + + def test_error_message_names_value_and_cap(self, tmp_path, monkeypatch): + path = _build_forged_tiled_cog(tmp_path, 50 * 1024 * 1024) + monkeypatch.setenv('XRSPATIAL_COG_MAX_TILE_BYTES', str(1024)) + + with pytest.raises(ValueError) as excinfo: + open_geotiff(path) + msg = str(excinfo.value) + # The forged value (52,428,800) and the cap (1,024) both appear. + assert "52,428,800" in msg or "52428800" in msg + assert "1,024" in msg or "1024" in msg + assert "denial-of-service" in msg.lower() or "malformed" in msg + + def test_normal_local_cog_under_default_cap(self, tmp_path): + """Legitimate local reads with the default cap still succeed.""" + arr = np.arange(64 * 64, dtype=np.float32).reshape(64, 64) + da = xr.DataArray(arr, dims=['y', 'x']) + path = str(tmp_path / "normal_local_1664.tif") + to_geotiff(da, path, tile_size=32, compression='deflate') + + result = open_geotiff(path) + np.testing.assert_array_equal(result.values, arr) + + def test_env_override_lifts_cap(self, tmp_path, monkeypatch): + """A user with legitimate large tiles can lift the cap via env.""" + # 50 MB declared. With cap=64 MB the read succeeds even though + # the underlying compressed slice is smaller (mmap truncates at + # EOF). + path = _build_forged_tiled_cog(tmp_path, 50 * 1024 * 1024) + monkeypatch.setenv( + 'XRSPATIAL_COG_MAX_TILE_BYTES', str(64 * 1024 * 1024)) + + # Read may raise inside the decompressor (the truncated mmap + # slice is garbage to deflate) but it must NOT raise the cap + # error. The thing we care about is that the cap check passes. + try: + open_geotiff(path) + except ValueError as e: + assert "exceeds the per-tile safety cap" not in str(e) + + +# --------------------------------------------------------------------------- +# Strip-organized local reads +# --------------------------------------------------------------------------- + + +class TestLocalStripByteCap: + def test_huge_strip_byte_count_rejected(self, tmp_path, monkeypatch): + path = _build_forged_stripped_tif(tmp_path, 100 * 1024 * 1024) + monkeypatch.setenv('XRSPATIAL_COG_MAX_TILE_BYTES', str(1024 * 1024)) + + with pytest.raises(ValueError, match="StripByteCount"): + open_geotiff(path) + + def test_strip_error_message_mentions_strip(self, tmp_path, monkeypatch): + path = _build_forged_stripped_tif(tmp_path, 50 * 1024 * 1024) + monkeypatch.setenv('XRSPATIAL_COG_MAX_TILE_BYTES', str(2048)) + + with pytest.raises(ValueError) as excinfo: + open_geotiff(path) + msg = str(excinfo.value) + assert "strip" in msg.lower() + assert "safety cap" in msg.lower() + + +# --------------------------------------------------------------------------- +# Cap helper directly +# --------------------------------------------------------------------------- + + +def test_max_tile_bytes_env_negative_falls_back(monkeypatch): + """Negative env value parses to the minimum of 1 (not the default).""" + monkeypatch.setenv('XRSPATIAL_COG_MAX_TILE_BYTES', '-5') + # _max_tile_bytes_from_env clamps to max(1, val); a negative becomes 1. + assert _reader_mod._max_tile_bytes_from_env() == 1 + + +def test_max_tile_bytes_env_garbage_falls_back(monkeypatch): + monkeypatch.setenv('XRSPATIAL_COG_MAX_TILE_BYTES', 'not-a-number') + assert ( + _reader_mod._max_tile_bytes_from_env() + == _reader_mod.MAX_TILE_BYTES_DEFAULT + ) diff --git a/xrspatial/geotiff/tests/test_ssrf_hardening_1664.py b/xrspatial/geotiff/tests/test_ssrf_hardening_1664.py new file mode 100644 index 00000000..2ee87f57 --- /dev/null +++ b/xrspatial/geotiff/tests/test_ssrf_hardening_1664.py @@ -0,0 +1,287 @@ +"""SSRF defenses on ``_HTTPSource`` (issue #1664). + +Before #1664, ``open_geotiff(url=...)`` accepted any URL: ``file://``, +``http://127.0.0.1:6379/``, ``http://169.254.169.254/...`` (cloud +metadata). It also had no explicit timeouts and no explicit redirect +cap. + +These tests cover the validator in isolation -- they do NOT make real +HTTP calls. ``socket.getaddrinfo`` is monkeypatched per-test to control +what the validator sees. +""" +from __future__ import annotations + +import socket + +import pytest + +from xrspatial.geotiff import UnsafeURLError +from xrspatial.geotiff import _reader as _reader_mod + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _fake_getaddrinfo(ip: str): + """Return a getaddrinfo replacement that always resolves to *ip*. + + Mirrors the real return tuple layout: each element is + ``(family, type, proto, canonname, sockaddr)``. The validator only + looks at index 4 (sockaddr) so the rest is filler. + """ + def _resolver(host, port, *args, **kwargs): + if ':' in ip: + return [(socket.AF_INET6, socket.SOCK_STREAM, 0, '', + (ip, port or 0, 0, 0))] + return [(socket.AF_INET, socket.SOCK_STREAM, 0, '', + (ip, port or 0))] + return _resolver + + +# --------------------------------------------------------------------------- +# Scheme allow-list +# --------------------------------------------------------------------------- + + +class TestSchemeAllowList: + def test_https_accepted(self, monkeypatch): + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('93.184.216.34')) + # example.com -> a public IP -- should pass. + _reader_mod._validate_http_url('https://example.com/cog.tif') + + def test_http_accepted(self, monkeypatch): + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('93.184.216.34')) + _reader_mod._validate_http_url('http://example.com/cog.tif') + + def test_file_scheme_rejected(self): + with pytest.raises(UnsafeURLError) as excinfo: + _reader_mod._validate_http_url('file:///etc/passwd') + msg = str(excinfo.value).lower() + assert "scheme" in msg + assert "'file'" in str(excinfo.value).lower() + + def test_gopher_scheme_rejected(self): + with pytest.raises(UnsafeURLError): + _reader_mod._validate_http_url('gopher://example.com/') + + def test_ftp_scheme_rejected(self): + with pytest.raises(UnsafeURLError): + _reader_mod._validate_http_url('ftp://example.com/x.tif') + + def test_empty_url_rejected(self): + with pytest.raises(UnsafeURLError): + _reader_mod._validate_http_url('') + + def test_non_string_rejected(self): + with pytest.raises(UnsafeURLError): + _reader_mod._validate_http_url(None) # type: ignore[arg-type] + + def test_env_var_widens_allow_list(self, monkeypatch): + monkeypatch.setenv( + 'XRSPATIAL_GEOTIFF_ALLOWED_SCHEMES', 'ftp,gopher') + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('93.184.216.34')) + # Now ftp:// should be accepted (host still validated). + _reader_mod._validate_http_url('ftp://example.com/x.tif') + + +# --------------------------------------------------------------------------- +# Host filtering -- private / loopback / link-local +# --------------------------------------------------------------------------- + + +class TestPrivateHostBlocking: + @pytest.mark.parametrize("ip", [ + '127.0.0.1', + '127.0.0.5', + '10.0.0.1', + '172.16.5.5', + '192.168.1.1', + '169.254.169.254', # cloud metadata + '0.0.0.0', + ]) + def test_ipv4_private_rejected(self, monkeypatch, ip): + monkeypatch.setattr(socket, 'getaddrinfo', _fake_getaddrinfo(ip)) + with pytest.raises(UnsafeURLError) as excinfo: + _reader_mod._validate_http_url('http://attacker.test/x.tif') + msg = str(excinfo.value).lower() + assert "private" in msg or "loopback" in msg or "link-local" in msg + + @pytest.mark.parametrize("ip", [ + '::1', # IPv6 loopback + 'fe80::1', # IPv6 link-local + 'fc00::1', # IPv6 unique-local + ]) + def test_ipv6_private_rejected(self, monkeypatch, ip): + monkeypatch.setattr(socket, 'getaddrinfo', _fake_getaddrinfo(ip)) + with pytest.raises(UnsafeURLError): + _reader_mod._validate_http_url('http://attacker.test/x.tif') + + def test_localhost_literal_rejected(self, monkeypatch): + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('127.0.0.1')) + with pytest.raises(UnsafeURLError): + _reader_mod._validate_http_url('http://localhost:8080/x.tif') + + def test_public_ip_accepted(self, monkeypatch): + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('8.8.8.8')) + _reader_mod._validate_http_url('http://example.com/x.tif') + + def test_env_override_allows_private(self, monkeypatch): + monkeypatch.setenv('XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS', '1') + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('127.0.0.1')) + # No exception expected. + _reader_mod._validate_http_url('http://127.0.0.1:8080/cog.tif') + + def test_env_override_truthy_values(self, monkeypatch): + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('127.0.0.1')) + for v in ('1', 'true', 'TRUE', 'yes', 'on'): + monkeypatch.setenv('XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS', v) + _reader_mod._validate_http_url('http://127.0.0.1/x.tif') + + def test_dns_rebind_partial_private_rejected(self, monkeypatch): + """If ANY resolved IP is private, the URL is rejected. + + This blocks DNS-rebinding tricks where a hostile DNS server + returns both a public and a private IP for the same name. + """ + def _resolver(host, port, *args, **kwargs): + return [ + (socket.AF_INET, socket.SOCK_STREAM, 0, '', + ('8.8.8.8', port or 0)), + (socket.AF_INET, socket.SOCK_STREAM, 0, '', + ('127.0.0.1', port or 0)), + ] + monkeypatch.setattr(socket, 'getaddrinfo', _resolver) + with pytest.raises(UnsafeURLError): + _reader_mod._validate_http_url('http://attacker.test/x.tif') + + def test_unresolvable_host_rejected(self, monkeypatch): + def _broken(host, port, *args, **kwargs): + raise socket.gaierror(-2, 'Name or service not known') + monkeypatch.setattr(socket, 'getaddrinfo', _broken) + with pytest.raises(UnsafeURLError) as excinfo: + _reader_mod._validate_http_url('http://nope.example.invalid/x.tif') + assert "resolve" in str(excinfo.value).lower() + + +# --------------------------------------------------------------------------- +# Timeout configuration +# --------------------------------------------------------------------------- + + +class TestHTTPTimeouts: + def test_default_connect_timeout(self, monkeypatch): + monkeypatch.delenv( + 'XRSPATIAL_GEOTIFF_HTTP_CONNECT_TIMEOUT', raising=False) + assert _reader_mod._http_connect_timeout() == 10.0 + + def test_default_read_timeout(self, monkeypatch): + monkeypatch.delenv( + 'XRSPATIAL_GEOTIFF_HTTP_READ_TIMEOUT', raising=False) + assert _reader_mod._http_read_timeout() == 30.0 + + def test_env_override_connect(self, monkeypatch): + monkeypatch.setenv('XRSPATIAL_GEOTIFF_HTTP_CONNECT_TIMEOUT', '2.5') + assert _reader_mod._http_connect_timeout() == 2.5 + + def test_env_override_read(self, monkeypatch): + monkeypatch.setenv('XRSPATIAL_GEOTIFF_HTTP_READ_TIMEOUT', '7') + assert _reader_mod._http_read_timeout() == 7.0 + + def test_env_garbage_falls_back(self, monkeypatch): + monkeypatch.setenv( + 'XRSPATIAL_GEOTIFF_HTTP_CONNECT_TIMEOUT', 'not-a-float') + assert _reader_mod._http_connect_timeout() == 10.0 + + def test_env_zero_falls_back(self, monkeypatch): + # Zero is not a useful timeout; treat as missing. + monkeypatch.setenv('XRSPATIAL_GEOTIFF_HTTP_READ_TIMEOUT', '0') + assert _reader_mod._http_read_timeout() == 30.0 + + +# --------------------------------------------------------------------------- +# Redirect cap +# --------------------------------------------------------------------------- + + +def test_redirect_cap_is_set(): + """The module-level constant is what the urllib3 pool gets.""" + assert _reader_mod._HTTP_MAX_REDIRECTS == 5 + + +# --------------------------------------------------------------------------- +# Integration: _HTTPSource.__init__ runs the validator +# --------------------------------------------------------------------------- + + +class TestHTTPSourceConstructor: + def test_file_url_rejected_at_init(self): + with pytest.raises(UnsafeURLError): + _reader_mod._HTTPSource('file:///etc/passwd') + + def test_localhost_url_rejected_at_init(self, monkeypatch): + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('127.0.0.1')) + with pytest.raises(UnsafeURLError): + _reader_mod._HTTPSource('http://127.0.0.1:6379/probe.tif') + + def test_metadata_url_rejected_at_init(self, monkeypatch): + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('169.254.169.254')) + with pytest.raises(UnsafeURLError): + _reader_mod._HTTPSource( + 'http://169.254.169.254/latest/meta-data/') + + def test_escape_hatch_allows_localhost(self, monkeypatch): + monkeypatch.setenv('XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS', '1') + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('127.0.0.1')) + src = _reader_mod._HTTPSource('http://127.0.0.1:8080/cog.tif') + # Timeout values are pulled from the env-aware helpers. + assert src._connect_timeout == 10.0 + assert src._read_timeout == 30.0 + + def test_unsafe_url_error_is_value_error(self): + """``UnsafeURLError`` is a ``ValueError`` so existing handlers work.""" + with pytest.raises(ValueError): + _reader_mod._HTTPSource('file:///etc/passwd') + + def test_unsafe_url_error_carries_url(self): + try: + _reader_mod._HTTPSource('file:///etc/passwd') + except UnsafeURLError as e: + assert e.url == 'file:///etc/passwd' + else: + pytest.fail("UnsafeURLError not raised") + + +# --------------------------------------------------------------------------- +# read_to_array dispatcher honours the SSRF check +# --------------------------------------------------------------------------- + + +def test_read_to_array_rejects_file_url(): + """The top-level dispatcher refuses file:// URLs via _HTTPSource.""" + # ``file://`` does not match the http(s) prefix in _open_source, so + # it does NOT hit _HTTPSource at all -- it gets routed via the path + # branch which interprets the URL literally and fails to find the + # file. The relevant guarantee is just: arbitrary local file access + # via ``file://`` URL does not succeed quietly. + from xrspatial.geotiff._reader import read_to_array + with pytest.raises((ValueError, FileNotFoundError, OSError)): + read_to_array('file:///etc/passwd') + + +def test_open_source_rejects_loopback_http(monkeypatch): + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('127.0.0.1')) + with pytest.raises(UnsafeURLError): + _reader_mod._open_source('http://127.0.0.1:8080/x.tif')