Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ install_requires =
xarray
numpy
matplotlib
urllib3
zstandard
packages = find:
python_requires = >=3.12
Expand Down
137 changes: 40 additions & 97 deletions xrspatial/geotiff/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
import mmap
import os as _os_module
import threading
import urllib.request
from collections import OrderedDict
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import urllib3

from ._compression import (
COMPRESSION_LERC,
Expand Down Expand Up @@ -342,28 +342,24 @@ def close(self):


def _get_http_pool():
"""Return a module-level urllib3 PoolManager, or None if unavailable."""
"""Return the module-level urllib3 PoolManager, building it on first call."""
global _http_pool
if _http_pool is not None:
return _http_pool
try:
import urllib3
_http_pool = urllib3.PoolManager(
num_pools=10,
maxsize=10,
retries=urllib3.Retry(
total=2,
backoff_factor=0.1,
# Redirects are *not* delegated to urllib3 -- they're
# followed manually in ``_HTTPSource._request`` so each
# ``Location`` runs through ``_validate_http_url`` before
# the next GET. Issue #1664.
redirect=False,
),
)
return _http_pool
except ImportError:
return None
_http_pool = urllib3.PoolManager(
num_pools=10,
maxsize=10,
retries=urllib3.Retry(
total=2,
backoff_factor=0.1,
# Redirects are *not* delegated to urllib3 -- they're
# followed manually in ``_HTTPSource._request`` so each
# ``Location`` runs through ``_validate_http_url`` before
# the next GET. Issue #1664.
redirect=False,
),
)
return _http_pool


_http_pool = None
Expand All @@ -381,10 +377,10 @@ def _get_http_pool():
_HTTP_READ_TIMEOUT_DEFAULT = 30.0

#: URL schemes that ``_HTTPSource`` accepts. The HTTP source is a Range
#: GET implementation backed by urllib3 / urllib, both of which only speak
#: ``http`` and ``https`` -- widening here would just push the failure to
#: connect time. fsspec handles every other ``scheme://`` and is routed
#: separately by :func:`_open_source`.
#: GET implementation backed by urllib3, which only speaks ``http`` and
#: ``https`` -- widening here would just push the failure to connect time.
#: fsspec handles every other ``scheme://`` and is routed separately by
#: :func:`_open_source`.
_HTTP_ALLOWED_SCHEMES = ('http', 'https')


Expand Down Expand Up @@ -663,23 +659,6 @@ def split_coalesced_bytes(
return out


class _ValidatingRedirectHandler(urllib.request.HTTPRedirectHandler):
"""Stdlib redirect handler that re-validates each ``Location``.

The default ``HTTPRedirectHandler`` follows 3xx responses with no
awareness of the SSRF allow-list, so a public URL could 302 into a
loopback or private IP. This subclass calls :func:`_validate_http_url`
on every redirect target before building the follow-up request, and
caps the chain at :data:`_HTTP_MAX_REDIRECTS`. Issue #1664.
"""

max_redirections = _HTTP_MAX_REDIRECTS

def redirect_request(self, req, fp, code, msg, headers, newurl):
_validate_http_url(newurl)
return super().redirect_request(req, fp, code, msg, headers, newurl)


# ---------------------------------------------------------------------------
# Pinned-IP urllib3 connection (issue #1846)
# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -714,8 +693,9 @@ def redirect_request(self, req, fp, code, msg, headers, newurl):
def _build_pinned_connection_classes():
"""Build pinned ``HTTPConnection`` / ``HTTPSConnection`` subclasses.

Done lazily so urllib3 stays an optional import. The subclasses
override ``_new_conn`` to dial the validated IP directly.
Built lazily on first use so the urllib3 connection submodules are
only imported when ``_HTTPSource`` is actually exercised. The
subclasses override ``_new_conn`` to dial the validated IP directly.
"""
import socket as _socket
from urllib3.connection import HTTPConnection, HTTPSConnection
Expand Down Expand Up @@ -872,24 +852,16 @@ class _Conn(HTTPConn):
return pool


_stdlib_opener = None


def _get_stdlib_opener():
"""Return a stdlib opener with the validating redirect handler installed."""
global _stdlib_opener
if _stdlib_opener is None:
_stdlib_opener = urllib.request.build_opener(
_ValidatingRedirectHandler())
return _stdlib_opener


class _HTTPSource:
"""HTTP data source using range requests with connection reuse.

Uses urllib3.PoolManager when available (reuses TCP connections and
TLS sessions across range requests to the same host). Falls back to
stdlib urllib.request if urllib3 is not installed.
Uses :class:`urllib3.PoolManager` for the unpinned escape-hatch path
and a per-hop pinned ``HTTP[S]ConnectionPool`` for the default path,
so TCP and TLS state is reused across range requests to the same host.
urllib3 is a hard install dependency; there is no stdlib fallback.
The stdlib ``urllib.request`` path was removed in #2050 because it
re-resolved the hostname at request time, defeating the IP pin that
closes the DNS-rebinding TOCTOU from #1846.
"""

def __init__(self, url: str):
Expand Down Expand Up @@ -918,12 +890,7 @@ def __init__(self, url: str):
self._read_timeout = _http_read_timeout()

def _urllib3_timeout(self):
"""Build a urllib3 Timeout object lazily.

Imported here so that the module-level import of urllib3 stays
optional (we fall back to stdlib if urllib3 is missing).
"""
import urllib3
"""Build a :class:`urllib3.Timeout` for this source."""
return urllib3.Timeout(
connect=self._connect_timeout, read=self._read_timeout)

Expand Down Expand Up @@ -1030,34 +997,14 @@ def read_range(self, start: int, length: int) -> bytes:
return b''
end = start + length - 1
headers = {'Range': f'bytes={start}-{end}'}
if self._pool is not None:
resp = self._request(headers=headers)
data = resp.data
return self._validate_range_response(
status=resp.status,
content_range=resp.headers.get('Content-Range'),
data=data,
start=start,
length=length,
)
# Fallback: stdlib. urlopen's ``timeout`` is a single value, so
# use the more conservative read timeout; the connect timeout
# isn't separately controllable on stdlib urllib. The opener
# carries ``_ValidatingRedirectHandler`` so 3xx hops are re-
# validated and capped at ``_HTTP_MAX_REDIRECTS``.
req = urllib.request.Request(self._url, headers=headers)
with _get_stdlib_opener().open(req, timeout=self._read_timeout) as resp:
data = resp.read()
# stdlib raises HTTPError for 4xx/5xx automatically; we still
# need to catch the "server ignored Range and returned 200
# with the whole object" case, plus any short body.
return self._validate_range_response(
status=getattr(resp, 'status', None) or resp.getcode(),
content_range=resp.headers.get('Content-Range'),
data=data,
start=start,
length=length,
)
resp = self._request(headers=headers)
return self._validate_range_response(
status=resp.status,
content_range=resp.headers.get('Content-Range'),
data=resp.data,
start=start,
length=length,
)

@staticmethod
def _validate_range_response(*, status, content_range, data,
Expand Down Expand Up @@ -1217,11 +1164,7 @@ def read_ranges_coalesced(
return split_coalesced_bytes(merged_bytes, mapping)

def read_all(self) -> bytes:
if self._pool is not None:
return self._request().data
req = urllib.request.Request(self._url)
with _get_stdlib_opener().open(req, timeout=self._read_timeout) as resp:
return resp.read()
return self._request().data

@property
def size(self) -> int | None:
Expand Down
6 changes: 0 additions & 6 deletions xrspatial/geotiff/tests/test_dns_rebinding_pin_issue_1846.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,6 @@ def _resolver(host, port, *args, **kwargs):

class TestPinnedConnectionTarget:
def test_init_records_pinned_ip(self, monkeypatch):
pytest.importorskip("urllib3")
monkeypatch.setattr(
socket, 'getaddrinfo', _ip_resolver('93.184.216.34'))
src = _reader_mod._HTTPSource('https://example.com/cog.tif')
Expand All @@ -142,8 +141,6 @@ def test_rebind_does_not_reach_private_ip(self, monkeypatch):
will fail when the mock returns no data; we only care that the
connection was attempted against the pinned IP.
"""
pytest.importorskip("urllib3")

# First getaddrinfo call (validation) returns public IP. Every
# subsequent call returns the rebound private IP.
monkeypatch.setattr(
Expand Down Expand Up @@ -193,7 +190,6 @@ def test_host_header_and_sni_preserved(self, monkeypatch):
hostname, not the IP literal. Required for HTTP virtual hosting
and TLS certificate verification.
"""
pytest.importorskip("urllib3")
monkeypatch.setattr(
socket, 'getaddrinfo', _ip_resolver('93.184.216.34'))
src = _reader_mod._HTTPSource('https://example.com/cog.tif')
Expand Down Expand Up @@ -256,7 +252,6 @@ def test_redirect_to_safe_host_revalidates(self, monkeypatch):
"""A redirect from safe-host -> also-safe re-runs validation on
the new hostname and pins the new IP.
"""
pytest.importorskip("urllib3")
monkeypatch.setattr(
socket, 'getaddrinfo', _ip_resolver('93.184.216.34'))
src = _reader_mod._HTTPSource('https://safe-host.example.com/a.tif')
Expand Down Expand Up @@ -313,7 +308,6 @@ def _stub_pool_for_request(url, pinned_ip):

def test_redirect_to_private_still_rejected(self, monkeypatch):
"""Pinning doesn't weaken the existing redirect-to-private guard."""
pytest.importorskip("urllib3")
monkeypatch.setattr(
socket, 'getaddrinfo', _ip_resolver('93.184.216.34'))
src = _reader_mod._HTTPSource('https://example.com/cog.tif')
Expand Down
Loading
Loading