diff --git a/setup.cfg b/setup.cfg index 224ebf11b..a59670f37 100644 --- a/setup.cfg +++ b/setup.cfg @@ -24,6 +24,7 @@ install_requires = xarray numpy matplotlib + urllib3 zstandard packages = find: python_requires = >=3.12 diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py index 5a3cea119..d393ca285 100644 --- a/xrspatial/geotiff/_reader.py +++ b/xrspatial/geotiff/_reader.py @@ -5,11 +5,11 @@ import mmap import os as _os_module import threading -import urllib.request from collections import OrderedDict from concurrent.futures import ThreadPoolExecutor import numpy as np +import urllib3 from ._compression import ( COMPRESSION_LERC, @@ -342,28 +342,24 @@ def close(self): def _get_http_pool(): - """Return a module-level urllib3 PoolManager, or None if unavailable.""" + """Return the module-level urllib3 PoolManager, building it on first call.""" global _http_pool if _http_pool is not None: return _http_pool - try: - import urllib3 - _http_pool = urllib3.PoolManager( - num_pools=10, - maxsize=10, - retries=urllib3.Retry( - total=2, - backoff_factor=0.1, - # Redirects are *not* delegated to urllib3 -- they're - # followed manually in ``_HTTPSource._request`` so each - # ``Location`` runs through ``_validate_http_url`` before - # the next GET. Issue #1664. - redirect=False, - ), - ) - return _http_pool - except ImportError: - return None + _http_pool = urllib3.PoolManager( + num_pools=10, + maxsize=10, + retries=urllib3.Retry( + total=2, + backoff_factor=0.1, + # Redirects are *not* delegated to urllib3 -- they're + # followed manually in ``_HTTPSource._request`` so each + # ``Location`` runs through ``_validate_http_url`` before + # the next GET. Issue #1664. + redirect=False, + ), + ) + return _http_pool _http_pool = None @@ -381,10 +377,10 @@ def _get_http_pool(): _HTTP_READ_TIMEOUT_DEFAULT = 30.0 #: URL schemes that ``_HTTPSource`` accepts. The HTTP source is a Range -#: GET implementation backed by urllib3 / urllib, both of which only speak -#: ``http`` and ``https`` -- widening here would just push the failure to -#: connect time. fsspec handles every other ``scheme://`` and is routed -#: separately by :func:`_open_source`. +#: GET implementation backed by urllib3, which only speaks ``http`` and +#: ``https`` -- widening here would just push the failure to connect time. +#: fsspec handles every other ``scheme://`` and is routed separately by +#: :func:`_open_source`. _HTTP_ALLOWED_SCHEMES = ('http', 'https') @@ -663,23 +659,6 @@ def split_coalesced_bytes( return out -class _ValidatingRedirectHandler(urllib.request.HTTPRedirectHandler): - """Stdlib redirect handler that re-validates each ``Location``. - - The default ``HTTPRedirectHandler`` follows 3xx responses with no - awareness of the SSRF allow-list, so a public URL could 302 into a - loopback or private IP. This subclass calls :func:`_validate_http_url` - on every redirect target before building the follow-up request, and - caps the chain at :data:`_HTTP_MAX_REDIRECTS`. Issue #1664. - """ - - max_redirections = _HTTP_MAX_REDIRECTS - - def redirect_request(self, req, fp, code, msg, headers, newurl): - _validate_http_url(newurl) - return super().redirect_request(req, fp, code, msg, headers, newurl) - - # --------------------------------------------------------------------------- # Pinned-IP urllib3 connection (issue #1846) # --------------------------------------------------------------------------- @@ -714,8 +693,9 @@ def redirect_request(self, req, fp, code, msg, headers, newurl): def _build_pinned_connection_classes(): """Build pinned ``HTTPConnection`` / ``HTTPSConnection`` subclasses. - Done lazily so urllib3 stays an optional import. The subclasses - override ``_new_conn`` to dial the validated IP directly. + Built lazily on first use so the urllib3 connection submodules are + only imported when ``_HTTPSource`` is actually exercised. The + subclasses override ``_new_conn`` to dial the validated IP directly. """ import socket as _socket from urllib3.connection import HTTPConnection, HTTPSConnection @@ -872,24 +852,16 @@ class _Conn(HTTPConn): return pool -_stdlib_opener = None - - -def _get_stdlib_opener(): - """Return a stdlib opener with the validating redirect handler installed.""" - global _stdlib_opener - if _stdlib_opener is None: - _stdlib_opener = urllib.request.build_opener( - _ValidatingRedirectHandler()) - return _stdlib_opener - - class _HTTPSource: """HTTP data source using range requests with connection reuse. - Uses urllib3.PoolManager when available (reuses TCP connections and - TLS sessions across range requests to the same host). Falls back to - stdlib urllib.request if urllib3 is not installed. + Uses :class:`urllib3.PoolManager` for the unpinned escape-hatch path + and a per-hop pinned ``HTTP[S]ConnectionPool`` for the default path, + so TCP and TLS state is reused across range requests to the same host. + urllib3 is a hard install dependency; there is no stdlib fallback. + The stdlib ``urllib.request`` path was removed in #2050 because it + re-resolved the hostname at request time, defeating the IP pin that + closes the DNS-rebinding TOCTOU from #1846. """ def __init__(self, url: str): @@ -918,12 +890,7 @@ def __init__(self, url: str): self._read_timeout = _http_read_timeout() def _urllib3_timeout(self): - """Build a urllib3 Timeout object lazily. - - Imported here so that the module-level import of urllib3 stays - optional (we fall back to stdlib if urllib3 is missing). - """ - import urllib3 + """Build a :class:`urllib3.Timeout` for this source.""" return urllib3.Timeout( connect=self._connect_timeout, read=self._read_timeout) @@ -1030,34 +997,14 @@ def read_range(self, start: int, length: int) -> bytes: return b'' end = start + length - 1 headers = {'Range': f'bytes={start}-{end}'} - if self._pool is not None: - resp = self._request(headers=headers) - data = resp.data - return self._validate_range_response( - status=resp.status, - content_range=resp.headers.get('Content-Range'), - data=data, - start=start, - length=length, - ) - # Fallback: stdlib. urlopen's ``timeout`` is a single value, so - # use the more conservative read timeout; the connect timeout - # isn't separately controllable on stdlib urllib. The opener - # carries ``_ValidatingRedirectHandler`` so 3xx hops are re- - # validated and capped at ``_HTTP_MAX_REDIRECTS``. - req = urllib.request.Request(self._url, headers=headers) - with _get_stdlib_opener().open(req, timeout=self._read_timeout) as resp: - data = resp.read() - # stdlib raises HTTPError for 4xx/5xx automatically; we still - # need to catch the "server ignored Range and returned 200 - # with the whole object" case, plus any short body. - return self._validate_range_response( - status=getattr(resp, 'status', None) or resp.getcode(), - content_range=resp.headers.get('Content-Range'), - data=data, - start=start, - length=length, - ) + resp = self._request(headers=headers) + return self._validate_range_response( + status=resp.status, + content_range=resp.headers.get('Content-Range'), + data=resp.data, + start=start, + length=length, + ) @staticmethod def _validate_range_response(*, status, content_range, data, @@ -1217,11 +1164,7 @@ def read_ranges_coalesced( return split_coalesced_bytes(merged_bytes, mapping) def read_all(self) -> bytes: - if self._pool is not None: - return self._request().data - req = urllib.request.Request(self._url) - with _get_stdlib_opener().open(req, timeout=self._read_timeout) as resp: - return resp.read() + return self._request().data @property def size(self) -> int | None: diff --git a/xrspatial/geotiff/tests/test_dns_rebinding_pin_issue_1846.py b/xrspatial/geotiff/tests/test_dns_rebinding_pin_issue_1846.py index 84d7fb804..14aebe45a 100644 --- a/xrspatial/geotiff/tests/test_dns_rebinding_pin_issue_1846.py +++ b/xrspatial/geotiff/tests/test_dns_rebinding_pin_issue_1846.py @@ -123,7 +123,6 @@ def _resolver(host, port, *args, **kwargs): class TestPinnedConnectionTarget: def test_init_records_pinned_ip(self, monkeypatch): - pytest.importorskip("urllib3") monkeypatch.setattr( socket, 'getaddrinfo', _ip_resolver('93.184.216.34')) src = _reader_mod._HTTPSource('https://example.com/cog.tif') @@ -142,8 +141,6 @@ def test_rebind_does_not_reach_private_ip(self, monkeypatch): will fail when the mock returns no data; we only care that the connection was attempted against the pinned IP. """ - pytest.importorskip("urllib3") - # First getaddrinfo call (validation) returns public IP. Every # subsequent call returns the rebound private IP. monkeypatch.setattr( @@ -193,7 +190,6 @@ def test_host_header_and_sni_preserved(self, monkeypatch): hostname, not the IP literal. Required for HTTP virtual hosting and TLS certificate verification. """ - pytest.importorskip("urllib3") monkeypatch.setattr( socket, 'getaddrinfo', _ip_resolver('93.184.216.34')) src = _reader_mod._HTTPSource('https://example.com/cog.tif') @@ -256,7 +252,6 @@ def test_redirect_to_safe_host_revalidates(self, monkeypatch): """A redirect from safe-host -> also-safe re-runs validation on the new hostname and pins the new IP. """ - pytest.importorskip("urllib3") monkeypatch.setattr( socket, 'getaddrinfo', _ip_resolver('93.184.216.34')) src = _reader_mod._HTTPSource('https://safe-host.example.com/a.tif') @@ -313,7 +308,6 @@ def _stub_pool_for_request(url, pinned_ip): def test_redirect_to_private_still_rejected(self, monkeypatch): """Pinning doesn't weaken the existing redirect-to-private guard.""" - pytest.importorskip("urllib3") monkeypatch.setattr( socket, 'getaddrinfo', _ip_resolver('93.184.216.34')) src = _reader_mod._HTTPSource('https://example.com/cog.tif') diff --git a/xrspatial/geotiff/tests/test_http_no_stdlib_fallback_2050.py b/xrspatial/geotiff/tests/test_http_no_stdlib_fallback_2050.py new file mode 100644 index 000000000..03c8ca801 --- /dev/null +++ b/xrspatial/geotiff/tests/test_http_no_stdlib_fallback_2050.py @@ -0,0 +1,213 @@ +"""urllib3 is the only HTTP transport for ``_HTTPSource`` (issue #2050). + +Before #2050, ``_HTTPSource.read_range`` and ``_HTTPSource.read_all`` had +two code paths: a urllib3 path that pinned the TCP connection to the IP +returned by ``_validate_http_url``, and a stdlib ``urllib.request`` +fallback that re-resolved the hostname at request time. With urllib3 +missing from ``install_requires``, a production install could land on +the stdlib path and silently lose the DNS-rebinding IP pin from #1846. + +#2050 makes ``urllib3`` a hard install dependency and removes the stdlib +fallback. These tests lock in that contract. +""" +from __future__ import annotations + +import inspect +import socket + +import pytest + +from xrspatial.geotiff import _reader as _reader_mod + + +# --------------------------------------------------------------------------- +# urllib3 is a hard runtime dependency +# --------------------------------------------------------------------------- + + +def test_urllib3_is_importable(): + """urllib3 is in install_requires; importing the module must work.""" + import urllib3 # noqa: F401 + + +def test_reader_imports_urllib3_at_module_level(): + """The reader keeps a module-level reference to urllib3. + + Module-level rather than deferred-import makes it impossible to ship + a build where urllib3 is missing and the code silently degrades to + a different transport. + """ + assert hasattr(_reader_mod, 'urllib3') + + +def test_get_http_pool_returns_a_pool_manager(): + """``_get_http_pool`` is no longer allowed to return None. + + Pre-#2050 it returned ``None`` when urllib3 was missing, which is + what routed callers into the stdlib fallback. + """ + import urllib3 + pool = _reader_mod._get_http_pool() + assert pool is not None + assert isinstance(pool, urllib3.PoolManager) + + +# --------------------------------------------------------------------------- +# The stdlib fallback symbols are gone +# --------------------------------------------------------------------------- + + +def test_stdlib_opener_helper_is_removed(): + """``_get_stdlib_opener`` was the entry point for the unpinned path.""" + assert not hasattr(_reader_mod, '_get_stdlib_opener') + assert not hasattr(_reader_mod, '_stdlib_opener') + + +def test_validating_redirect_handler_is_removed(): + """The stdlib redirect handler is gone with the stdlib transport.""" + assert not hasattr(_reader_mod, '_ValidatingRedirectHandler') + + +def test_reader_does_not_import_urllib_request(): + """``urllib.request`` is no longer needed once the stdlib path is gone. + + A residual ``import urllib.request`` at module scope would be a + smell -- the only legitimate consumer was the deleted opener. + """ + src = inspect.getsource(_reader_mod) + # The token has to appear in *executable* form, not just inside a + # comment or docstring. Strip comment lines and check the rest. + code_lines = [ + line for line in src.splitlines() + if not line.lstrip().startswith('#') + ] + code = '\n'.join(code_lines) + assert 'import urllib.request' not in code, ( + "urllib.request should not be imported now that the stdlib " + "HTTP fallback is removed (#2050)." + ) + + +def test_read_range_source_has_no_stdlib_branch(): + """``read_range`` body must not reference ``urllib.request``.""" + src = inspect.getsource(_reader_mod._HTTPSource.read_range) + assert 'urllib.request' not in src + assert 'stdlib_opener' not in src + + +def test_read_all_source_has_no_stdlib_branch(): + """``read_all`` body must not reference ``urllib.request``.""" + src = inspect.getsource(_reader_mod._HTTPSource.read_all) + assert 'urllib.request' not in src + assert 'stdlib_opener' not in src + + +# --------------------------------------------------------------------------- +# urllib3 path still works -- mock the pool and exercise read_range / read_all +# --------------------------------------------------------------------------- + + +def _fake_getaddrinfo(ip: str): + def _resolver(host, port, *args, **kwargs): + return [(socket.AF_INET, socket.SOCK_STREAM, 0, '', + (ip, port or 0))] + return _resolver + + +class _MockResp: + def __init__(self, status, data=b'', content_range=None): + self.status = status + self.data = data + self.headers = {} + if content_range is not None: + self.headers['Content-Range'] = content_range + + +class _MockPool: + def __init__(self, resp): + self._resp = resp + self.calls = [] + + def request(self, method, url, **kwargs): + self.calls.append((method, url, kwargs)) + return self._resp + + +def test_read_range_uses_urllib3_pool(monkeypatch): + """Sanity check: a successful 206 round-trips through ``_request``.""" + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('93.184.216.34')) + src = _reader_mod._HTTPSource('https://example.com/cog.tif') + body = b'A' * 100 + pool = _MockPool(_MockResp( + status=206, data=body, content_range='bytes 0-99/1000')) + src._pool = pool + + data = src.read_range(0, 100) + assert data == body + assert len(pool.calls) == 1 + method, url, kwargs = pool.calls[0] + assert method == 'GET' + assert kwargs.get('redirect') is False + assert kwargs.get('headers', {}).get('Range') == 'bytes=0-99' + + +def test_read_all_uses_urllib3_pool(monkeypatch): + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('93.184.216.34')) + src = _reader_mod._HTTPSource('https://example.com/cog.tif') + body = b'tiff-bytes' + pool = _MockPool(_MockResp(status=200, data=body)) + src._pool = pool + + data = src.read_all() + assert data == body + assert len(pool.calls) == 1 + + +def test_read_range_short_circuits_zero_length(monkeypatch): + """No HTTP traffic for length<=0 -- behaviour preserved from pre-#2050.""" + monkeypatch.setattr( + socket, 'getaddrinfo', _fake_getaddrinfo('93.184.216.34')) + src = _reader_mod._HTTPSource('https://example.com/cog.tif') + pool = _MockPool(_MockResp(status=206, data=b'')) + src._pool = pool + + assert src.read_range(0, 0) == b'' + assert src.read_range(10, -5) == b'' + assert pool.calls == [] + + +# --------------------------------------------------------------------------- +# install_requires advertises urllib3 +# --------------------------------------------------------------------------- + + +def test_install_requires_lists_urllib3(): + """setup.cfg must list urllib3 so deployed installs get it. + + Without this, a wheel built today would let pip resolve the install + without urllib3, and the import at top of _reader would fail at + open_geotiff() time rather than at install time. + """ + import pathlib + setup_cfg = ( + pathlib.Path(_reader_mod.__file__).resolve() + .parent.parent.parent / 'setup.cfg' + ) + if not setup_cfg.exists(): # pragma: no cover + pytest.skip(f"setup.cfg not found at {setup_cfg}") + text = setup_cfg.read_text() + # Locate the install_requires block and confirm urllib3 appears in it. + head, _, tail = text.partition('install_requires =') + assert tail, "install_requires section not found in setup.cfg" + # The block ends at the next top-level key (lines that start in + # column 0). Walk until we see one. + block_lines = [] + for line in tail.splitlines()[1:]: + if line and not line.startswith((' ', '\t')): + break + block_lines.append(line.strip()) + assert 'urllib3' in block_lines, ( + f"urllib3 must be in install_requires; found: {block_lines}" + ) diff --git a/xrspatial/geotiff/tests/test_ssrf_hardening_1664.py b/xrspatial/geotiff/tests/test_ssrf_hardening_1664.py index f1a12f026..78d9e2ff5 100644 --- a/xrspatial/geotiff/tests/test_ssrf_hardening_1664.py +++ b/xrspatial/geotiff/tests/test_ssrf_hardening_1664.py @@ -267,17 +267,13 @@ def request(self, method, url, **kwargs): class TestRedirectRevalidation: - # The test_urllib3_* tests exercise the urllib3 transport path: they mock - # urllib3.PoolManager and call read_range(), which internally builds a - # urllib3.Timeout via _urllib3_timeout(). urllib3 is an optional runtime - # dependency (_HTTPSource falls back to stdlib urllib.request when it's - # missing -- see _reader.py:615-617), so each urllib3-using test starts - # with pytest.importorskip("urllib3"). The test_stdlib_* tests below - # exercise the stdlib redirect handler directly and run regardless. + # urllib3 is a hard install dependency (see setup.cfg install_requires), + # so the urllib3 transport path is the only path. The stdlib fallback + # was removed in #2050 along with ``_ValidatingRedirectHandler``: it + # bypassed the IP pin that closes the #1846 DNS-rebinding TOCTOU. def test_urllib3_redirect_to_private_rejected(self, monkeypatch): """Public host that 302-redirects to loopback must be rejected.""" - pytest.importorskip("urllib3") # Initial validator pass: example.com resolves to a public IP. monkeypatch.setattr( socket, 'getaddrinfo', _fake_getaddrinfo('93.184.216.34')) @@ -299,7 +295,6 @@ def test_urllib3_redirect_to_private_rejected(self, monkeypatch): def test_urllib3_redirect_to_public_followed(self, monkeypatch): """Public -> public redirect is followed; validator passes each hop.""" - pytest.importorskip("urllib3") monkeypatch.setattr( socket, 'getaddrinfo', _fake_getaddrinfo('93.184.216.34')) src = _reader_mod._HTTPSource('https://example.com/cog.tif') @@ -315,7 +310,6 @@ def test_urllib3_redirect_to_public_followed(self, monkeypatch): def test_urllib3_redirect_chain_capped(self, monkeypatch): """More than _HTTP_MAX_REDIRECTS hops raises rather than looping.""" - pytest.importorskip("urllib3") monkeypatch.setattr( socket, 'getaddrinfo', _fake_getaddrinfo('93.184.216.34')) src = _reader_mod._HTTPSource('https://example.com/cog.tif') @@ -333,7 +327,6 @@ def test_urllib3_redirect_chain_capped(self, monkeypatch): def test_urllib3_relative_location_resolved(self, monkeypatch): """Relative Location like ``/other.tif`` resolves against the source.""" - pytest.importorskip("urllib3") monkeypatch.setattr( socket, 'getaddrinfo', _fake_getaddrinfo('93.184.216.34')) src = _reader_mod._HTTPSource('https://example.com/dir/cog.tif') @@ -345,55 +338,6 @@ def test_urllib3_relative_location_resolved(self, monkeypatch): assert data == b'after-relative' assert src._pool.calls[1] == 'https://example.com/other.tif' - def test_stdlib_redirect_handler_rejects_private(self): - """The stdlib redirect handler re-validates each ``Location``. - - Direct test of :class:`_ValidatingRedirectHandler` -- we don't go - through a real HTTP request, we just confirm that - ``redirect_request`` runs the URL through ``_validate_http_url`` - before allowing the follow-up. Loopback IP literal (``127.0.0.1``) - is rejected without needing DNS, so the test stays hermetic. - """ - import http.client - import io - from urllib.request import Request - - handler = _reader_mod._ValidatingRedirectHandler() - original = Request('https://example.com/cog.tif') - new_url = 'http://127.0.0.1:8000/inner.tif' - # ``http_error_30x`` is what urllib calls on a redirect; it - # delegates to ``redirect_request`` for policy. The new URL has - # to be re-validated *before* the next request is built. - headers = http.client.HTTPMessage() - with pytest.raises(UnsafeURLError): - handler.redirect_request( - original, io.BytesIO(b''), 302, 'Found', headers, new_url) - - def test_stdlib_redirect_handler_allows_public(self, monkeypatch): - """Public redirect target passes through unchanged.""" - import http.client - import io - from urllib.request import Request - - monkeypatch.setattr( - socket, 'getaddrinfo', _fake_getaddrinfo('93.184.216.34')) - - handler = _reader_mod._ValidatingRedirectHandler() - original = Request('https://example.com/cog.tif') - new_url = 'https://cdn.example.com/x.tif' - headers = http.client.HTTPMessage() - # Should not raise; returns a Request for the new URL (parent - # class behaviour) so urllib can follow it. - result = handler.redirect_request( - original, io.BytesIO(b''), 302, 'Found', headers, new_url) - assert result is None or result.full_url == new_url - - def test_stdlib_redirect_handler_caps_chain(self): - """The handler's max_redirections matches _HTTP_MAX_REDIRECTS.""" - handler = _reader_mod._ValidatingRedirectHandler() - assert handler.max_redirections == _reader_mod._HTTP_MAX_REDIRECTS - - # --------------------------------------------------------------------------- # Integration: _HTTPSource.__init__ runs the validator # ---------------------------------------------------------------------------