diff --git a/docs/source/reference/geotiff.rst b/docs/source/reference/geotiff.rst index 08ffb242..93d82685 100644 --- a/docs/source/reference/geotiff.rst +++ b/docs/source/reference/geotiff.rst @@ -21,6 +21,59 @@ Writing xrspatial.geotiff.write_geotiff_gpu xrspatial.geotiff.write_vrt +Security and I/O limits +======================= + +``open_geotiff`` and the underlying reader enforce several limits to +keep crafted or hostile inputs from exhausting memory or reaching +internal network targets. All limits have safe defaults; advanced users +can override them via environment variables. + +Per-tile / per-strip compressed-byte cap +---------------------------------------- + +A crafted TIFF can declare arbitrarily large ``TileByteCounts`` or +``StripByteCounts``. Both the HTTP fetcher (which would issue a Range +GET sized by the attacker's value) and the local-file decoder (where a +small compressed slice can balloon under deflate / zstd / lzw) reject +any tile or strip whose declared size exceeds the cap. + +* Default: 256 MiB +* Override: ``XRSPATIAL_COG_MAX_TILE_BYTES`` (positive integer, bytes). + Non-integer, empty, zero, or negative values are ignored and fall back + to the default. Set above your largest legitimate tile or strip size. +* Exception: ``ValueError`` ("safety cap") + +HTTP SSRF defenses +------------------ + +When ``open_geotiff`` is given an ``http://`` or ``https://`` URL, the +reader rejects URLs that would let a service-side caller probe internal +infrastructure. Other ``scheme://`` strings are dispatched through +fsspec and are not covered by these checks. + +* Scheme allow-list: ``http`` and ``https`` only. +* Host filtering: hostnames that resolve to a loopback (``127.0.0.0/8``, + ``::1``), link-local (``169.254.0.0/16``, ``fe80::/10``), or RFC1918 + private range are rejected. Override via + ``XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS=1``. The check rejects on + *any* resolved IP being unsafe, which also blocks DNS-rebind tricks. 
+* Redirect handling: at most 5 redirects per request. Each ``Location`` + is re-validated against the same scheme and host filter, so a public + URL cannot 3xx-redirect into private space. Requires ``urllib3``; on + the stdlib fallback the same cap and re-validation are enforced via + a custom redirect handler. +* Timeouts: 10 s connect, 30 s read by default. Override via + ``XRSPATIAL_GEOTIFF_HTTP_CONNECT_TIMEOUT`` and + ``XRSPATIAL_GEOTIFF_HTTP_READ_TIMEOUT`` (positive float, seconds). +* Exception: :class:`xrspatial.geotiff.UnsafeURLError` (a + ``ValueError`` subclass). + +If you run an integration test against a local HTTP server (e.g. +``http.server`` bound to ``127.0.0.1``), set +``XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS=1`` for the duration of the +test. + Strict mode (``XRSPATIAL_GEOTIFF_STRICT``) ========================================== diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index 24ddbe7e..4d72479c 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -43,7 +43,7 @@ from typing import BinaryIO from ._geotags import GeoTransform, RASTER_PIXEL_IS_AREA, RASTER_PIXEL_IS_POINT -from ._reader import read_to_array +from ._reader import UnsafeURLError, read_to_array from ._writer import write # All names below are part of the supported public API. ``plot_geotiff`` @@ -51,6 +51,7 @@ # and emits a ``DeprecationWarning`` when called. __all__ = [ 'GeoTIFFFallbackWarning', + 'UnsafeURLError', 'open_geotiff', 'read_geotiff_gpu', 'read_geotiff_dask', diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py index cfc39a18..3dc79409 100644 --- a/xrspatial/geotiff/_reader.py +++ b/xrspatial/geotiff/_reader.py @@ -59,18 +59,27 @@ def _check_dimensions(width, height, samples, max_pixels): ) -#: Default per-tile compressed-byte cap for HTTP COG reads. 
A crafted -#: ``TileByteCounts`` entry can declare arbitrarily many bytes, and the -#: HTTP path then tries to fetch and buffer that many bytes from the -#: server before it ever decompresses. 256 MiB tolerates legitimate -#: large tiles (RGB JPEG2000 at very high resolution can land in the -#: tens of MB) while keeping the fetch bounded. Override via the -#: ``XRSPATIAL_COG_MAX_TILE_BYTES`` environment variable. +#: Default per-tile (or per-strip) compressed-byte cap. A crafted +#: ``TileByteCounts`` / ``StripByteCounts`` entry can declare arbitrarily +#: many bytes. On HTTP, the reader would issue a Range GET sized by the +#: attacker's value; on local files, mmap slicing is bounded by the file +#: size but a small compressed slice can still decompress (deflate/zstd/ +#: lzw) into hundreds of MiB. 256 MiB tolerates legitimate large tiles +#: (RGB JPEG2000 at very high resolution can land in the tens of MB) +#: while keeping the fetch / decode bounded. Override via the +#: ``XRSPATIAL_COG_MAX_TILE_BYTES`` environment variable. Issues #1536 +#: (HTTP) and #1664 (local). MAX_TILE_BYTES_DEFAULT = 256 << 20 # 256 MiB def _max_tile_bytes_from_env() -> int: - """Read the per-tile byte cap from the environment, or fall back to the default.""" + """Read the per-tile byte cap from the environment, or fall back to the default. + + Non-integer, empty, zero, or negative values all fall back to + ``MAX_TILE_BYTES_DEFAULT``. Matches the policy used by the HTTP + timeout helpers so callers don't accidentally set an unreachable + 1-byte cap with ``XRSPATIAL_COG_MAX_TILE_BYTES=-1``. 
+ """ raw = _os_module.environ.get('XRSPATIAL_COG_MAX_TILE_BYTES') if raw is None: return MAX_TILE_BYTES_DEFAULT @@ -78,7 +87,7 @@ def _max_tile_bytes_from_env() -> int: val = int(raw) except (TypeError, ValueError): return MAX_TILE_BYTES_DEFAULT - return max(1, val) + return val if val > 0 else MAX_TILE_BYTES_DEFAULT # --------------------------------------------------------------------------- @@ -290,7 +299,15 @@ def _get_http_pool(): _http_pool = urllib3.PoolManager( num_pools=10, maxsize=10, - retries=urllib3.Retry(total=2, backoff_factor=0.1), + retries=urllib3.Retry( + total=2, + backoff_factor=0.1, + # Redirects are *not* delegated to urllib3 -- they're + # followed manually in ``_HTTPSource._request`` so each + # ``Location`` runs through ``_validate_http_url`` before + # the next GET. Issue #1664. + redirect=False, + ), ) return _http_pool except ImportError: @@ -300,6 +317,166 @@ def _get_http_pool(): _http_pool = None +# --------------------------------------------------------------------------- +# SSRF defenses for _HTTPSource (issue #1664) +# --------------------------------------------------------------------------- + +#: Maximum number of redirects to follow when fetching a TIFF over HTTP. +_HTTP_MAX_REDIRECTS = 5 + +#: Default connect / read timeouts (seconds) for HTTP TIFF fetches. +_HTTP_CONNECT_TIMEOUT_DEFAULT = 10.0 +_HTTP_READ_TIMEOUT_DEFAULT = 30.0 + +#: URL schemes that ``_HTTPSource`` accepts. The HTTP source is a Range +#: GET implementation backed by urllib3 / urllib, both of which only speak +#: ``http`` and ``https`` -- widening here would just push the failure to +#: connect time. fsspec handles every other ``scheme://`` and is routed +#: separately by :func:`_open_source`. 
+_HTTP_ALLOWED_SCHEMES = ('http', 'https') + + +def _http_allow_private_hosts() -> bool: + """Return True if loopback / link-local / private IPs are allowed.""" + raw = _os_module.environ.get('XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS') + if raw is None: + return False + return raw.strip().lower() in ('1', 'true', 'yes', 'on') + + +def _http_timeout_from_env(var_name: str, default: float) -> float: + """Parse a positive-float timeout from the named env var, or fall back.""" + raw = _os_module.environ.get(var_name) + if raw is None: + return default + try: + val = float(raw) + except (TypeError, ValueError): + return default + return val if val > 0 else default + + +def _http_connect_timeout() -> float: + return _http_timeout_from_env( + 'XRSPATIAL_GEOTIFF_HTTP_CONNECT_TIMEOUT', + _HTTP_CONNECT_TIMEOUT_DEFAULT, + ) + + +def _http_read_timeout() -> float: + return _http_timeout_from_env( + 'XRSPATIAL_GEOTIFF_HTTP_READ_TIMEOUT', + _HTTP_READ_TIMEOUT_DEFAULT, + ) + + +class UnsafeURLError(ValueError): + """Raised when an HTTP URL fails the SSRF allow-list check. + + Subclasses ``ValueError`` so existing callers that catch ``ValueError`` + on bad input keep working. Carries the offending URL on ``.url`` for + structured logging. + """ + + def __init__(self, msg: str, url: str | None = None): + super().__init__(msg) + self.url = url + + +def _ip_is_private(ip_str: str) -> bool: + """Return True if *ip_str* is a loopback, link-local, or private IP. + + Covers both IPv4 and IPv6. Multicast and unspecified addresses are + treated as unsafe (no legitimate reason to GET a TIFF from them, and + cloud metadata sometimes lives behind link-local IPv6). + """ + import ipaddress + try: + ip = ipaddress.ip_address(ip_str) + except ValueError: + # Not a literal IP -- caller must resolve it first. + return False + # ``is_private`` is True for RFC1918 (10/8, 172.16/12, 192.168/16), + # the IPv6 ULAs (fc00::/7), and -- in stdlib >= 3.4 -- also for + # loopback / link-local. 
Stay explicit so we don't depend on subtle + # behaviour across Python versions. + return ( + ip.is_loopback + or ip.is_link_local + or ip.is_private + or ip.is_multicast + or ip.is_unspecified + or ip.is_reserved + ) + + +def _validate_http_url(url: str) -> None: + """Reject URLs that would let ``_HTTPSource`` reach unsafe destinations. + + Enforces: + + * scheme in ``_HTTP_ALLOWED_SCHEMES`` (http / https) + * hostname resolves to at least one non-loopback, non-link-local, + non-private IP (override via ``XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS``) + * hostname is non-empty + + Raises :class:`UnsafeURLError` (a ``ValueError`` subclass) on any of + the above. Issue #1664. + """ + import socket + from urllib.parse import urlparse + + if not isinstance(url, str) or not url: + raise UnsafeURLError( + "HTTP source requires a non-empty URL string", url=url) + + parsed = urlparse(url) + scheme = (parsed.scheme or '').lower() + if scheme not in _HTTP_ALLOWED_SCHEMES: + raise UnsafeURLError( + f"URL scheme {scheme!r} is not in the allow-list " + f"{_HTTP_ALLOWED_SCHEMES}. Only HTTP(S) is supported; other " + f"schemes are dispatched via fsspec. URL: {url!r}", + url=url, + ) + + host = parsed.hostname + if not host: + raise UnsafeURLError( + f"URL {url!r} has no hostname", url=url) + + if _http_allow_private_hosts(): + return + + # Resolve and reject if any resolved IP is in a private/loopback/link- + # local/multicast range. Rejecting on *any* match (rather than all) + # prevents DNS-rebind tricks that return both a public and a private + # IP for the same name. socket.getaddrinfo handles IPv4, IPv6, and + # literal IP strings uniformly. 
+ try: + infos = socket.getaddrinfo(host, parsed.port, type=socket.SOCK_STREAM) + except socket.gaierror as e: + raise UnsafeURLError( + f"could not resolve host {host!r}: {e}", url=url) from e + + for info in infos: + sockaddr = info[4] + # sockaddr is (ip, port) for AF_INET and (ip, port, flow, scope) + # for AF_INET6 -- the IP is always index 0. + ip_str = sockaddr[0] + # IPv6 scoped addresses come back as 'fe80::1%eth0' -- strip the + # zone id before passing to ipaddress. + if '%' in ip_str: + ip_str = ip_str.split('%', 1)[0] + if _ip_is_private(ip_str): + raise UnsafeURLError( + f"host {host!r} resolves to {ip_str!r}, which is in a " + f"loopback / link-local / private range. Set " + f"XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS=1 to allow.", + url=url, + ) + + # --------------------------------------------------------------------------- # HTTP range coalescing # --------------------------------------------------------------------------- @@ -403,6 +580,35 @@ def split_coalesced_bytes( return out +class _ValidatingRedirectHandler(urllib.request.HTTPRedirectHandler): + """Stdlib redirect handler that re-validates each ``Location``. + + The default ``HTTPRedirectHandler`` follows 3xx responses with no + awareness of the SSRF allow-list, so a public URL could 302 into a + loopback or private IP. This subclass calls :func:`_validate_http_url` + on every redirect target before building the follow-up request, and + caps the chain at :data:`_HTTP_MAX_REDIRECTS`. Issue #1664. 
+ """ + + max_redirections = _HTTP_MAX_REDIRECTS + + def redirect_request(self, req, fp, code, msg, headers, newurl): + _validate_http_url(newurl) + return super().redirect_request(req, fp, code, msg, headers, newurl) + + +_stdlib_opener = None + + +def _get_stdlib_opener(): + """Return a stdlib opener with the validating redirect handler installed.""" + global _stdlib_opener + if _stdlib_opener is None: + _stdlib_opener = urllib.request.build_opener( + _ValidatingRedirectHandler()) + return _stdlib_opener + + class _HTTPSource: """HTTP data source using range requests with connection reuse. @@ -412,24 +618,79 @@ class _HTTPSource: """ def __init__(self, url: str): + # SSRF defense (issue #1664): validate scheme / host *before* + # any network call. UnsafeURLError subclasses ValueError so + # callers that already catch ValueError keep working. The check + # is best-effort -- DNS results can change between validate + # time and connect time, but rejecting at construction blocks + # the vast majority of static SSRF payloads. + _validate_http_url(url) self._url = url self._size = None self._pool = _get_http_pool() + self._connect_timeout = _http_connect_timeout() + self._read_timeout = _http_read_timeout() + + def _urllib3_timeout(self): + """Build a urllib3 Timeout object lazily. + + Imported here so that the module-level import of urllib3 stays + optional (we fall back to stdlib if urllib3 is missing). + """ + import urllib3 + return urllib3.Timeout( + connect=self._connect_timeout, read=self._read_timeout) + + def _request(self, headers: dict | None = None): + """Issue a GET with manual, validated redirect following. + + urllib3's built-in redirect follower has no validation hook, so + we set ``redirect=False`` and walk the chain ourselves. Each + ``Location`` runs through :func:`_validate_http_url` before the + next GET, defeating a public-to-private 3xx bounce. Cap at + :data:`_HTTP_MAX_REDIRECTS` hops. Issue #1664. 
+ """ + from urllib.parse import urljoin + pool = self._pool + timeout = self._urllib3_timeout() + current_url = self._url + for _ in range(_HTTP_MAX_REDIRECTS + 1): + resp = pool.request( + 'GET', current_url, + headers=headers, + timeout=timeout, + redirect=False, + ) + if 300 <= resp.status < 400 and resp.status != 304: + location = resp.headers.get('Location') + if not location: + return resp + # Resolve relative ``Location`` against the URL we just + # requested, not against ``self._url``: chained + # redirects can land us on a different origin. + next_url = urljoin(current_url, location) + _validate_http_url(next_url) + current_url = next_url + continue + return resp + raise UnsafeURLError( + f"More than {_HTTP_MAX_REDIRECTS} HTTP redirects " + f"starting from {self._url!r}", + url=self._url, + ) def read_range(self, start: int, length: int) -> bytes: end = start + length - 1 + headers = {'Range': f'bytes={start}-{end}'} if self._pool is not None: - resp = self._pool.request( - 'GET', self._url, - headers={'Range': f'bytes={start}-{end}'}, - ) - return resp.data - # Fallback: stdlib - req = urllib.request.Request( - self._url, - headers={'Range': f'bytes={start}-{end}'}, - ) - with urllib.request.urlopen(req) as resp: + return self._request(headers=headers).data + # Fallback: stdlib. urlopen's ``timeout`` is a single value, so + # use the more conservative read timeout; the connect timeout + # isn't separately controllable on stdlib urllib. The opener + # carries ``_ValidatingRedirectHandler`` so 3xx hops are re- + # validated and capped at ``_HTTP_MAX_REDIRECTS``. 
+ req = urllib.request.Request(self._url, headers=headers) + with _get_stdlib_opener().open(req, timeout=self._read_timeout) as resp: return resp.read() def read_ranges( @@ -490,9 +751,9 @@ def read_ranges_coalesced( def read_all(self) -> bytes: if self._pool is not None: - resp = self._pool.request('GET', self._url) - return resp.data - with urllib.request.urlopen(self._url) as resp: + return self._request().data + req = urllib.request.Request(self._url) + with _get_stdlib_opener().open(req, timeout=self._read_timeout) as resp: return resp.read() @property @@ -923,6 +1184,25 @@ def _read_strips(data: bytes, ifd: IFD, header: TIFFHeader, if offsets is None or byte_counts is None: raise ValueError("Missing strip offsets or byte counts") + # Per-strip compressed-byte cap (issue #1664). Mirrors the HTTP path: + # a crafted ``StripByteCounts`` can declare a huge value and even + # though mmap slicing on the local path is bounded by the file size, + # the slice is still passed into the decompressor which can expand + # a few KiB of crafted deflate/zstd into gigabytes of decoded output. + # Override via ``XRSPATIAL_COG_MAX_TILE_BYTES`` (the env var is shared + # with the tile path because the budget is the same). + max_tile_bytes = _max_tile_bytes_from_env() + for _strip_idx, _bc in enumerate(byte_counts): + if _bc > max_tile_bytes: + raise ValueError( + f"TIFF strip {_strip_idx} declares " + f"StripByteCount={_bc:,} bytes, which exceeds the " + f"per-strip safety cap of {max_tile_bytes:,} bytes. " + f"The file is malformed or attempting denial-of-service. " + f"Override via XRSPATIAL_COG_MAX_TILE_BYTES if this file " + f"is legitimate." + ) + # A corrupt header can report RowsPerStrip=0, which would divide by zero # below. Reject it as a typed parse error rather than letting the # ZeroDivisionError leak out to the caller. 
@@ -1091,6 +1371,22 @@ def _read_tiles(data: bytes, ifd: IFD, header: TIFFHeader, # A single tile's decoded bytes must also fit under the pixel budget. _check_dimensions(tw, th, samples, max_pixels) + # Per-tile compressed-byte cap (issue #1664). Same env var as the + # HTTP path. mmap slicing is bounded by the file size, but the slice + # gets handed to the decompressor, and a small slice can balloon + # into gigabytes through deflate / zstd / lzw / lerc. + max_tile_bytes = _max_tile_bytes_from_env() + for _tile_idx, _bc in enumerate(byte_counts): + if _bc > max_tile_bytes: + raise ValueError( + f"TIFF tile {_tile_idx} declares " + f"TileByteCount={_bc:,} bytes, which exceeds the " + f"per-tile safety cap of {max_tile_bytes:,} bytes. " + f"The file is malformed or attempting denial-of-service. " + f"Override via XRSPATIAL_COG_MAX_TILE_BYTES if this file " + f"is legitimate." + ) + planar = ifd.planar_config tiles_across = math.ceil(width / tw) tiles_down = math.ceil(height / th) @@ -1408,8 +1704,8 @@ def _fetch_decode_cog_http_tiles( # before the fetch list is built. A crafted COG can claim arbitrarily # large TileByteCounts; without this guard the HTTP layer would issue # a Range request sized by the attacker's value (issue #1536). The cap - # is overridable via XRSPATIAL_COG_MAX_TILE_BYTES; the local-mmap path - # is naturally bounded by file size and does not need this check. + # is overridable via XRSPATIAL_COG_MAX_TILE_BYTES. The local-mmap path + # applies the same cap in _read_tiles / _read_strips (issue #1664). 
max_tile_bytes = _max_tile_bytes_from_env() fetch_ranges: list[tuple[int, int]] = [] placements: list[tuple[int, int]] = [] # (tr, tc) per fetched tile diff --git a/xrspatial/geotiff/tests/test_cog_http_concurrent.py b/xrspatial/geotiff/tests/test_cog_http_concurrent.py index 991fc516..4cfcf91b 100644 --- a/xrspatial/geotiff/tests/test_cog_http_concurrent.py +++ b/xrspatial/geotiff/tests/test_cog_http_concurrent.py @@ -143,8 +143,15 @@ def log_message(self, *_args, **_kwargs): @pytest.fixture -def cog_http_server(tmp_path): - """Spin up a local http.server serving a tiled COG, yield (url, arr).""" +def cog_http_server(tmp_path, monkeypatch): + """Spin up a local http.server serving a tiled COG, yield (url, arr). + + Sets ``XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS=1`` for the duration of + the test because ``_HTTPSource`` blocks 127.0.0.1 by default after + issue #1664. The escape hatch is the documented way to keep loopback + test servers working. + """ + monkeypatch.setenv('XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS', '1') arr = np.arange(64 * 64, dtype=np.float32).reshape(64, 64) path = str(tmp_path / 'tmp_1480_cog.tif') write(arr, path, compression='deflate', tiled=True, tile_size=16, diff --git a/xrspatial/geotiff/tests/test_features.py b/xrspatial/geotiff/tests/test_features.py index 0e719a4f..66c7c6c2 100644 --- a/xrspatial/geotiff/tests/test_features.py +++ b/xrspatial/geotiff/tests/test_features.py @@ -2643,6 +2643,7 @@ def test_all_lists_supported_functions(self): # breaking change and should go through a deprecation cycle. expected = { 'GeoTIFFFallbackWarning', + 'UnsafeURLError', 'open_geotiff', 'read_geotiff_gpu', 'read_geotiff_dask', diff --git a/xrspatial/geotiff/tests/test_local_tile_byte_cap_1664.py b/xrspatial/geotiff/tests/test_local_tile_byte_cap_1664.py new file mode 100644 index 00000000..6437d808 --- /dev/null +++ b/xrspatial/geotiff/tests/test_local_tile_byte_cap_1664.py @@ -0,0 +1,214 @@ +"""Local-file tile/strip byte-count cap (issue #1664). 
+ +Before #1664, ``XRSPATIAL_COG_MAX_TILE_BYTES`` only fired in the HTTP +fetch path. A crafted local TIFF with a huge ``TileByteCounts`` / +``StripByteCounts`` could still feed an enormous slice into the +decompressor, which can balloon into gigabytes of decoded output even +when the underlying mmap slice is bounded by the file size. + +These tests fabricate small COGs / strip-TIFFs, rewrite their byte +counts to oversized values, and check that the cap raises before the +decoder runs. +""" +from __future__ import annotations + +import struct + +import numpy as np +import pytest +import xarray as xr + +from xrspatial.geotiff import open_geotiff, to_geotiff +from xrspatial.geotiff import _reader as _reader_mod + + +# --------------------------------------------------------------------------- +# Helpers -- patch in-place IFD entries for tile / strip byte counts +# --------------------------------------------------------------------------- + + +def _patch_byte_counts(data: bytearray, tag: int, value: int) -> None: + """Rewrite every entry for *tag* (325=TileByteCounts, 279=StripByteCounts).""" + from xrspatial.geotiff._header import parse_header + header = parse_header(bytes(data)) + bo = header.byte_order + ifd_offset = header.first_ifd_offset + num_entries = struct.unpack_from(f'{bo}H', data, ifd_offset)[0] + entry_offset = ifd_offset + 2 + + for i in range(num_entries): + eo = entry_offset + i * 12 + cur_tag = struct.unpack_from(f'{bo}H', data, eo)[0] + if cur_tag != tag: + continue + type_id = struct.unpack_from(f'{bo}H', data, eo + 2)[0] + count = struct.unpack_from(f'{bo}I', data, eo + 4)[0] + if type_id == 4: # LONG + total = count * 4 + if total <= 4: + for k in range(count): + struct.pack_into(f'{bo}I', data, eo + 8 + k * 4, value) + else: + ptr = struct.unpack_from(f'{bo}I', data, eo + 8)[0] + for k in range(count): + struct.pack_into(f'{bo}I', data, ptr + k * 4, value) + elif type_id == 3: # SHORT + clipped = min(value, 0xFFFF) + total = count * 2 + if 
total <= 4: + for k in range(count): + struct.pack_into( + f'{bo}H', data, eo + 8 + k * 2, clipped) + else: + ptr = struct.unpack_from(f'{bo}I', data, eo + 8)[0] + for k in range(count): + struct.pack_into( + f'{bo}H', data, ptr + k * 2, clipped) + return + raise AssertionError(f"tag {tag} not found in IFD") + + +def _build_forged_tiled_cog(tmp_path, byte_count_value: int) -> str: + """Write a real tiled COG, patch every TileByteCounts entry, return path.""" + arr = np.arange(64 * 64, dtype=np.float32).reshape(64, 64) + da = xr.DataArray(arr, dims=['y', 'x']) + path = str(tmp_path / "forged_local_tiles_1664.tif") + to_geotiff(da, path, tile_size=32, compression='deflate') + with open(path, 'rb') as f: + data = bytearray(f.read()) + _patch_byte_counts(data, 325, byte_count_value) # 325 = TileByteCounts + with open(path, 'wb') as f: + f.write(data) + return path + + +def _build_forged_stripped_tif(tmp_path, byte_count_value: int) -> str: + """Write a strip-organized TIFF, patch every StripByteCounts entry.""" + arr = np.arange(64 * 64, dtype=np.float32).reshape(64, 64) + da = xr.DataArray(arr, dims=['y', 'x']) + path = str(tmp_path / "forged_local_strips_1664.tif") + # tiled=False forces strip layout; deflate gets the decompressor on + # the hot path so a huge declared size matters. + to_geotiff(da, path, tiled=False, compression='deflate') + with open(path, 'rb') as f: + data = bytearray(f.read()) + _patch_byte_counts(data, 279, byte_count_value) # 279 = StripByteCounts + with open(path, 'wb') as f: + f.write(data) + return path + + +# --------------------------------------------------------------------------- +# Tiled local reads +# --------------------------------------------------------------------------- + + +class TestLocalTileByteCap: + def test_huge_tile_byte_count_rejected(self, tmp_path, monkeypatch): + """A local tile with a huge TileByteCount raises before decode.""" + # 100 MB > the 1 MB cap we set below. 
+ path = _build_forged_tiled_cog(tmp_path, 100 * 1024 * 1024) + monkeypatch.setenv('XRSPATIAL_COG_MAX_TILE_BYTES', str(1024 * 1024)) + + with pytest.raises(ValueError, match="TileByteCount"): + open_geotiff(path) + + def test_error_message_names_value_and_cap(self, tmp_path, monkeypatch): + path = _build_forged_tiled_cog(tmp_path, 50 * 1024 * 1024) + monkeypatch.setenv('XRSPATIAL_COG_MAX_TILE_BYTES', str(1024)) + + with pytest.raises(ValueError) as excinfo: + open_geotiff(path) + msg = str(excinfo.value) + # The forged value (52,428,800) and the cap (1,024) both appear. + assert "52,428,800" in msg or "52428800" in msg + assert "1,024" in msg or "1024" in msg + assert "denial-of-service" in msg.lower() or "malformed" in msg + + def test_normal_local_cog_under_default_cap(self, tmp_path): + """Legitimate local reads with the default cap still succeed.""" + arr = np.arange(64 * 64, dtype=np.float32).reshape(64, 64) + da = xr.DataArray(arr, dims=['y', 'x']) + path = str(tmp_path / "normal_local_1664.tif") + to_geotiff(da, path, tile_size=32, compression='deflate') + + result = open_geotiff(path) + np.testing.assert_array_equal(result.values, arr) + + def test_env_override_lifts_cap(self, tmp_path, monkeypatch): + """A user with legitimate large tiles can lift the cap via env.""" + # 50 MB declared. With cap=64 MB the read succeeds even though + # the underlying compressed slice is smaller (mmap truncates at + # EOF). + path = _build_forged_tiled_cog(tmp_path, 50 * 1024 * 1024) + monkeypatch.setenv( + 'XRSPATIAL_COG_MAX_TILE_BYTES', str(64 * 1024 * 1024)) + + # Read may raise inside the decompressor (the truncated mmap + # slice is garbage to deflate) but it must NOT raise the cap + # error. The thing we care about is that the cap check passes. 
+ try: + open_geotiff(path) + except ValueError as e: + assert "exceeds the per-tile safety cap" not in str(e) + + +# --------------------------------------------------------------------------- +# Strip-organized local reads +# --------------------------------------------------------------------------- + + +class TestLocalStripByteCap: + def test_huge_strip_byte_count_rejected(self, tmp_path, monkeypatch): + path = _build_forged_stripped_tif(tmp_path, 100 * 1024 * 1024) + monkeypatch.setenv('XRSPATIAL_COG_MAX_TILE_BYTES', str(1024 * 1024)) + + with pytest.raises(ValueError, match="StripByteCount"): + open_geotiff(path) + + def test_strip_error_message_mentions_strip(self, tmp_path, monkeypatch): + path = _build_forged_stripped_tif(tmp_path, 50 * 1024 * 1024) + monkeypatch.setenv('XRSPATIAL_COG_MAX_TILE_BYTES', str(2048)) + + with pytest.raises(ValueError) as excinfo: + open_geotiff(path) + msg = str(excinfo.value) + assert "strip" in msg.lower() + assert "safety cap" in msg.lower() + + +# --------------------------------------------------------------------------- +# Cap helper directly +# --------------------------------------------------------------------------- + + +def test_max_tile_bytes_env_negative_falls_back(monkeypatch): + """Negative env value falls back to the default, not a 1-byte cap. + + Earlier drafts clamped to ``max(1, val)`` which made a typo + (``XRSPATIAL_COG_MAX_TILE_BYTES=-1``) silently reject every tile. + The current policy matches ``_http_timeout_from_env``: any non- + positive integer is ignored. 
+ """ + monkeypatch.setenv('XRSPATIAL_COG_MAX_TILE_BYTES', '-5') + assert ( + _reader_mod._max_tile_bytes_from_env() + == _reader_mod.MAX_TILE_BYTES_DEFAULT + ) + + +def test_max_tile_bytes_env_zero_falls_back(monkeypatch): + """Zero env value falls back to the default for the same reason.""" + monkeypatch.setenv('XRSPATIAL_COG_MAX_TILE_BYTES', '0') + assert ( + _reader_mod._max_tile_bytes_from_env() + == _reader_mod.MAX_TILE_BYTES_DEFAULT + ) + + +def test_max_tile_bytes_env_garbage_falls_back(monkeypatch): + monkeypatch.setenv('XRSPATIAL_COG_MAX_TILE_BYTES', 'not-a-number') + assert ( + _reader_mod._max_tile_bytes_from_env() + == _reader_mod.MAX_TILE_BYTES_DEFAULT + ) diff --git a/xrspatial/geotiff/tests/test_security.py b/xrspatial/geotiff/tests/test_security.py index b9929865..67954537 100644 --- a/xrspatial/geotiff/tests/test_security.py +++ b/xrspatial/geotiff/tests/test_security.py @@ -707,23 +707,21 @@ def test_env_override_lifts_cap(self, tmp_path, monkeypatch): result, _ = _reader_mod._read_cog_http('http://mock/normal_env.tif') np.testing.assert_array_equal(result, arr) - def test_local_path_unaffected_by_cap(self, tmp_path, monkeypatch): - """The local-mmap reader bypasses the HTTP cap. + def test_local_path_respects_default_cap(self, tmp_path): + """Legitimate local reads stay well under the default cap. - A patched on-disk file with huge byte_counts decodes via mmap - slicing, which silently truncates at EOF. The cap is HTTP-only, - so a tight env cap should not break legitimate local reads. + Before #1664 the local path bypassed the cap entirely. Now the + cap is shared, so we just confirm the default (256 MiB) leaves + plenty of headroom for a normal small tiled COG. 
""" from xrspatial.geotiff import open_geotiff, to_geotiff import xarray as xr arr = np.arange(64 * 64, dtype=np.float32).reshape(64, 64) da = xr.DataArray(arr, dims=['y', 'x']) - path = str(tmp_path / "local_normal_1536.tif") + path = str(tmp_path / "local_normal_1664.tif") to_geotiff(da, path, tile_size=32, compression='deflate') - # Set the HTTP cap to 1 byte; local reads must still succeed - monkeypatch.setenv('XRSPATIAL_COG_MAX_TILE_BYTES', '1') result = open_geotiff(path) np.testing.assert_array_equal(result.values, arr) diff --git a/xrspatial/geotiff/tests/test_ssrf_hardening_1664.py b/xrspatial/geotiff/tests/test_ssrf_hardening_1664.py new file mode 100644 index 00000000..9c45fefd --- /dev/null +++ b/xrspatial/geotiff/tests/test_ssrf_hardening_1664.py @@ -0,0 +1,455 @@ +"""SSRF defenses on ``_HTTPSource`` (issue #1664). + +Before #1664, ``open_geotiff(url=...)`` accepted any URL: ``file://``, +``http://127.0.0.1:6379/``, ``http://169.254.169.254/...`` (cloud +metadata). It also had no explicit timeouts and no explicit redirect +cap. + +These tests cover the validator in isolation -- they do NOT make real +HTTP calls. ``socket.getaddrinfo`` is monkeypatched per-test to control +what the validator sees. +""" +from __future__ import annotations + +import socket + +import pytest + +from xrspatial.geotiff import UnsafeURLError +from xrspatial.geotiff import _reader as _reader_mod + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _fake_getaddrinfo(ip: str): + """Return a getaddrinfo replacement that always resolves to *ip*. + + Mirrors the real return tuple layout: each element is + ``(family, type, proto, canonname, sockaddr)``. The validator only + looks at index 4 (sockaddr) so the rest is filler. 
+ """ + def _resolver(host, port, *args, **kwargs): + if ':' in ip: + return [(socket.AF_INET6, socket.SOCK_STREAM, 0, '', + (ip, port or 0, 0, 0))] + return [(socket.AF_INET, socket.SOCK_STREAM, 0, '', + (ip, port or 0))] + return _resolver + + +# --------------------------------------------------------------------------- +# Scheme allow-list +# --------------------------------------------------------------------------- + + +class TestSchemeAllowList: + def test_https_accepted(self, monkeypatch): + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('93.184.216.34')) + # example.com -> a public IP -- should pass. + _reader_mod._validate_http_url('https://example.com/cog.tif') + + def test_http_accepted(self, monkeypatch): + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('93.184.216.34')) + _reader_mod._validate_http_url('http://example.com/cog.tif') + + def test_file_scheme_rejected(self): + with pytest.raises(UnsafeURLError) as excinfo: + _reader_mod._validate_http_url('file:///etc/passwd') + msg = str(excinfo.value).lower() + assert "scheme" in msg + assert "'file'" in str(excinfo.value).lower() + + def test_gopher_scheme_rejected(self): + with pytest.raises(UnsafeURLError): + _reader_mod._validate_http_url('gopher://example.com/') + + def test_ftp_scheme_rejected(self): + with pytest.raises(UnsafeURLError): + _reader_mod._validate_http_url('ftp://example.com/x.tif') + + def test_empty_url_rejected(self): + with pytest.raises(UnsafeURLError): + _reader_mod._validate_http_url('') + + def test_non_string_rejected(self): + with pytest.raises(UnsafeURLError): + _reader_mod._validate_http_url(None) # type: ignore[arg-type] + + def test_env_var_does_not_widen_allow_list(self, monkeypatch): + """The scheme allow-list is fixed at http/https. 
+ + Earlier drafts of #1664 exposed ``XRSPATIAL_GEOTIFF_ALLOWED_SCHEMES`` + as an escape hatch, but ``_HTTPSource`` is a urllib3 / urllib + Range-GET implementation that only speaks http(s); widening the + validator without widening the source just moves the failure to + connect time. fsspec handles every other ``scheme://``. + """ + monkeypatch.setenv( + 'XRSPATIAL_GEOTIFF_ALLOWED_SCHEMES', 'ftp,gopher') + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('93.184.216.34')) + with pytest.raises(UnsafeURLError): + _reader_mod._validate_http_url('ftp://example.com/x.tif') + + +# --------------------------------------------------------------------------- +# Host filtering -- private / loopback / link-local +# --------------------------------------------------------------------------- + + +class TestPrivateHostBlocking: + @pytest.mark.parametrize("ip", [ + '127.0.0.1', + '127.0.0.5', + '10.0.0.1', + '172.16.5.5', + '192.168.1.1', + '169.254.169.254', # cloud metadata + '0.0.0.0', + ]) + def test_ipv4_private_rejected(self, monkeypatch, ip): + monkeypatch.setattr(socket, 'getaddrinfo', _fake_getaddrinfo(ip)) + with pytest.raises(UnsafeURLError) as excinfo: + _reader_mod._validate_http_url('http://attacker.test/x.tif') + msg = str(excinfo.value).lower() + assert "private" in msg or "loopback" in msg or "link-local" in msg + + @pytest.mark.parametrize("ip", [ + '::1', # IPv6 loopback + 'fe80::1', # IPv6 link-local + 'fc00::1', # IPv6 unique-local + ]) + def test_ipv6_private_rejected(self, monkeypatch, ip): + monkeypatch.setattr(socket, 'getaddrinfo', _fake_getaddrinfo(ip)) + with pytest.raises(UnsafeURLError): + _reader_mod._validate_http_url('http://attacker.test/x.tif') + + def test_localhost_literal_rejected(self, monkeypatch): + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('127.0.0.1')) + with pytest.raises(UnsafeURLError): + _reader_mod._validate_http_url('http://localhost:8080/x.tif') + + def test_public_ip_accepted(self, 
monkeypatch): + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('8.8.8.8')) + _reader_mod._validate_http_url('http://example.com/x.tif') + + def test_env_override_allows_private(self, monkeypatch): + monkeypatch.setenv('XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS', '1') + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('127.0.0.1')) + # No exception expected. + _reader_mod._validate_http_url('http://127.0.0.1:8080/cog.tif') + + def test_env_override_truthy_values(self, monkeypatch): + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('127.0.0.1')) + for v in ('1', 'true', 'TRUE', 'yes', 'on'): + monkeypatch.setenv('XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS', v) + _reader_mod._validate_http_url('http://127.0.0.1/x.tif') + + def test_dns_rebind_partial_private_rejected(self, monkeypatch): + """If ANY resolved IP is private, the URL is rejected. + + This blocks DNS-rebinding tricks where a hostile DNS server + returns both a public and a private IP for the same name. 
+ """ + def _resolver(host, port, *args, **kwargs): + return [ + (socket.AF_INET, socket.SOCK_STREAM, 0, '', + ('8.8.8.8', port or 0)), + (socket.AF_INET, socket.SOCK_STREAM, 0, '', + ('127.0.0.1', port or 0)), + ] + monkeypatch.setattr(socket, 'getaddrinfo', _resolver) + with pytest.raises(UnsafeURLError): + _reader_mod._validate_http_url('http://attacker.test/x.tif') + + def test_unresolvable_host_rejected(self, monkeypatch): + def _broken(host, port, *args, **kwargs): + raise socket.gaierror(-2, 'Name or service not known') + monkeypatch.setattr(socket, 'getaddrinfo', _broken) + with pytest.raises(UnsafeURLError) as excinfo: + _reader_mod._validate_http_url('http://nope.example.invalid/x.tif') + assert "resolve" in str(excinfo.value).lower() + + +# --------------------------------------------------------------------------- +# Timeout configuration +# --------------------------------------------------------------------------- + + +class TestHTTPTimeouts: + def test_default_connect_timeout(self, monkeypatch): + monkeypatch.delenv( + 'XRSPATIAL_GEOTIFF_HTTP_CONNECT_TIMEOUT', raising=False) + assert _reader_mod._http_connect_timeout() == 10.0 + + def test_default_read_timeout(self, monkeypatch): + monkeypatch.delenv( + 'XRSPATIAL_GEOTIFF_HTTP_READ_TIMEOUT', raising=False) + assert _reader_mod._http_read_timeout() == 30.0 + + def test_env_override_connect(self, monkeypatch): + monkeypatch.setenv('XRSPATIAL_GEOTIFF_HTTP_CONNECT_TIMEOUT', '2.5') + assert _reader_mod._http_connect_timeout() == 2.5 + + def test_env_override_read(self, monkeypatch): + monkeypatch.setenv('XRSPATIAL_GEOTIFF_HTTP_READ_TIMEOUT', '7') + assert _reader_mod._http_read_timeout() == 7.0 + + def test_env_garbage_falls_back(self, monkeypatch): + monkeypatch.setenv( + 'XRSPATIAL_GEOTIFF_HTTP_CONNECT_TIMEOUT', 'not-a-float') + assert _reader_mod._http_connect_timeout() == 10.0 + + def test_env_zero_falls_back(self, monkeypatch): + # Zero is not a useful timeout; treat as missing. 
+    monkeypatch.setenv('XRSPATIAL_GEOTIFF_HTTP_READ_TIMEOUT', '0')
+    assert _reader_mod._http_read_timeout() == 30.0
+
+
+# ---------------------------------------------------------------------------
+# Redirect cap
+# ---------------------------------------------------------------------------
+
+
+def test_redirect_cap_is_set():
+    """The module-level constant is what both transports honour."""
+    assert _reader_mod._HTTP_MAX_REDIRECTS == 5
+
+
+# ---------------------------------------------------------------------------
+# Redirect re-validation -- the initial-URL allow-list is not enough on its
+# own because a public URL can 3xx-redirect to a private/loopback IP. Each
+# hop has to be re-validated. Issue #1664 review.
+# ---------------------------------------------------------------------------
+
+
+class _MockPoolResponse:
+    """Stand-in for ``urllib3.HTTPResponse`` -- only the bits we touch."""
+
+    def __init__(self, status: int, location: str | None = None,
+                 data: bytes = b''):
+        self.status = status
+        self.headers = {'Location': location} if location else {}
+        self.data = data
+
+
+class _MockPool:
+    """Records requests, returns scripted responses in order.
+
+    Each scripted response is a ``_MockPoolResponse``. After the script is
+    exhausted, returns a 200 with a stub ``b'OK'`` body so loops terminate.
+    """
+
+    def __init__(self, script: list[_MockPoolResponse]):
+        self.script = list(script)
+        self.calls: list[str] = []
+
+    def request(self, method, url, **kwargs):
+        self.calls.append(url)
+        # Caller must explicitly opt out of urllib3's built-in redirect
+        # follower; if it doesn't, our hop-by-hop validation is dead code. 
+ assert kwargs.get('redirect') is False, ( + f"_HTTPSource must request with redirect=False so each hop " + f"is validated by Python; got {kwargs.get('redirect')!r}" + ) + if self.script: + return self.script.pop(0) + return _MockPoolResponse(200, data=b'OK') + + +class TestRedirectRevalidation: + def test_urllib3_redirect_to_private_rejected(self, monkeypatch): + """Public host that 302-redirects to loopback must be rejected.""" + # Initial validator pass: example.com resolves to a public IP. + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('93.184.216.34')) + src = _reader_mod._HTTPSource('https://example.com/cog.tif') + + # Now the redirect target resolves to loopback. The validator on + # the *Location* hop must catch this even though the initial URL + # was clean. + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('127.0.0.1')) + src._pool = _MockPool([ + _MockPoolResponse(302, location='http://attacker.test/inner.tif'), + ]) + + with pytest.raises(UnsafeURLError) as excinfo: + src.read_range(0, 100) + assert 'attacker.test' in str(excinfo.value) or \ + '127.0.0.1' in str(excinfo.value) + + def test_urllib3_redirect_to_public_followed(self, monkeypatch): + """Public -> public redirect is followed; validator passes each hop.""" + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('93.184.216.34')) + src = _reader_mod._HTTPSource('https://example.com/cog.tif') + src._pool = _MockPool([ + _MockPoolResponse(302, location='https://cdn.example.com/x.tif'), + _MockPoolResponse(200, data=b'tiff-bytes'), + ]) + data = src.read_range(0, 100) + assert data == b'tiff-bytes' + # Two GETs were issued: original + redirected target. 
+ assert len(src._pool.calls) == 2 + assert src._pool.calls[1] == 'https://cdn.example.com/x.tif' + + def test_urllib3_redirect_chain_capped(self, monkeypatch): + """More than _HTTP_MAX_REDIRECTS hops raises rather than looping.""" + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('93.184.216.34')) + src = _reader_mod._HTTPSource('https://example.com/cog.tif') + # Script: every response is a redirect (one more than the cap). + n_redirects = _reader_mod._HTTP_MAX_REDIRECTS + 1 + src._pool = _MockPool([ + _MockPoolResponse( + 302, location=f'https://example.com/hop{i}.tif') + for i in range(n_redirects) + ]) + with pytest.raises(_reader_mod.UnsafeURLError) as excinfo: + src.read_range(0, 100) + msg = str(excinfo.value).lower() + assert 'redirect' in msg + + def test_urllib3_relative_location_resolved(self, monkeypatch): + """Relative Location like ``/other.tif`` resolves against the source.""" + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('93.184.216.34')) + src = _reader_mod._HTTPSource('https://example.com/dir/cog.tif') + src._pool = _MockPool([ + _MockPoolResponse(302, location='/other.tif'), + _MockPoolResponse(200, data=b'after-relative'), + ]) + data = src.read_range(0, 100) + assert data == b'after-relative' + assert src._pool.calls[1] == 'https://example.com/other.tif' + + def test_stdlib_redirect_handler_rejects_private(self): + """The stdlib redirect handler re-validates each ``Location``. + + Direct test of :class:`_ValidatingRedirectHandler` -- we don't go + through a real HTTP request, we just confirm that + ``redirect_request`` runs the URL through ``_validate_http_url`` + before allowing the follow-up. Loopback IP literal (``127.0.0.1``) + is rejected without needing DNS, so the test stays hermetic. 
+ """ + import http.client + import io + from urllib.request import Request + + handler = _reader_mod._ValidatingRedirectHandler() + original = Request('https://example.com/cog.tif') + new_url = 'http://127.0.0.1:8000/inner.tif' + # ``http_error_30x`` is what urllib calls on a redirect; it + # delegates to ``redirect_request`` for policy. The new URL has + # to be re-validated *before* the next request is built. + headers = http.client.HTTPMessage() + with pytest.raises(UnsafeURLError): + handler.redirect_request( + original, io.BytesIO(b''), 302, 'Found', headers, new_url) + + def test_stdlib_redirect_handler_allows_public(self, monkeypatch): + """Public redirect target passes through unchanged.""" + import http.client + import io + from urllib.request import Request + + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('93.184.216.34')) + + handler = _reader_mod._ValidatingRedirectHandler() + original = Request('https://example.com/cog.tif') + new_url = 'https://cdn.example.com/x.tif' + headers = http.client.HTTPMessage() + # Should not raise; returns a Request for the new URL (parent + # class behaviour) so urllib can follow it. 
+ result = handler.redirect_request( + original, io.BytesIO(b''), 302, 'Found', headers, new_url) + assert result is None or result.full_url == new_url + + def test_stdlib_redirect_handler_caps_chain(self): + """The handler's max_redirections matches _HTTP_MAX_REDIRECTS.""" + handler = _reader_mod._ValidatingRedirectHandler() + assert handler.max_redirections == _reader_mod._HTTP_MAX_REDIRECTS + + +# --------------------------------------------------------------------------- +# Integration: _HTTPSource.__init__ runs the validator +# --------------------------------------------------------------------------- + + +class TestHTTPSourceConstructor: + def test_file_url_rejected_at_init(self): + with pytest.raises(UnsafeURLError): + _reader_mod._HTTPSource('file:///etc/passwd') + + def test_localhost_url_rejected_at_init(self, monkeypatch): + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('127.0.0.1')) + with pytest.raises(UnsafeURLError): + _reader_mod._HTTPSource('http://127.0.0.1:6379/probe.tif') + + def test_metadata_url_rejected_at_init(self, monkeypatch): + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('169.254.169.254')) + with pytest.raises(UnsafeURLError): + _reader_mod._HTTPSource( + 'http://169.254.169.254/latest/meta-data/') + + def test_escape_hatch_allows_localhost(self, monkeypatch): + monkeypatch.setenv('XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS', '1') + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('127.0.0.1')) + src = _reader_mod._HTTPSource('http://127.0.0.1:8080/cog.tif') + # Timeout values are pulled from the env-aware helpers. 
+ assert src._connect_timeout == 10.0 + assert src._read_timeout == 30.0 + + def test_unsafe_url_error_is_value_error(self): + """``UnsafeURLError`` is a ``ValueError`` so existing handlers work.""" + with pytest.raises(ValueError): + _reader_mod._HTTPSource('file:///etc/passwd') + + def test_unsafe_url_error_carries_url(self): + try: + _reader_mod._HTTPSource('file:///etc/passwd') + except UnsafeURLError as e: + assert e.url == 'file:///etc/passwd' + else: + pytest.fail("UnsafeURLError not raised") + + +# --------------------------------------------------------------------------- +# read_to_array dispatcher honours the SSRF check +# --------------------------------------------------------------------------- + + +def test_read_to_array_rejects_file_url(): + """The top-level dispatcher refuses ``file://`` URLs. + + ``_open_source()`` routes any non-http(s) ``scheme://`` string to the + fsspec ``_CloudSource`` branch, so ``file://`` never reaches + ``_HTTPSource`` and the SSRF allow-list never sees it. The relevant + guarantee here is just: arbitrary local file access via a + ``file://`` URL does not silently succeed. fsspec raises (or + ``ImportError`` if fsspec isn't installed). + """ + from xrspatial.geotiff._reader import read_to_array + with pytest.raises((ValueError, FileNotFoundError, OSError, ImportError)): + read_to_array('file:///etc/passwd') + + +def test_open_source_rejects_loopback_http(monkeypatch): + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('127.0.0.1')) + with pytest.raises(UnsafeURLError): + _reader_mod._open_source('http://127.0.0.1:8080/x.tif')