From c9b3519d1f270134b049798f3e2f568ee15d9234 Mon Sep 17 00:00:00 2001
From: Brendan Collins
Date: Fri, 15 May 2026 07:31:10 -0700
Subject: [PATCH 1/2] geotiff: bound eager fsspec reads with max_cloud_bytes
 (#1928)

Eager reads from cloud sources used to call _CloudSource.read_all()
unconditionally, pulling the entire object into memory before the TIFF
header was parsed or the max_pixels guard ran. A crafted s3://, gs://,
or memory:// object could exhaust memory and bandwidth before any
dimension check fired.

Add a max_cloud_bytes budget (default 256 MiB, env override
XRSPATIAL_GEOTIFF_MAX_CLOUD_BYTES, per-call kwarg on read_to_array and
open_geotiff). _CloudSource already knows the object size from
fsspec.size() at construction, so the check runs before any bytes are
fetched. Pass max_cloud_bytes=None to opt out and restore pre-#1928
behaviour. The HTTP path already reads only what it needs via range
requests and is unaffected.

When the cloud object exceeds the budget, the reader raises a new
CloudSizeLimitError (a ValueError subclass) whose message points at the
kwarg, the env var, and the dask windowed-read alternative.
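A usage sketch of the new knobs (the bucket and object names below are
illustrative, not real fixtures):

    from xrspatial.geotiff import open_geotiff

    # Large but legitimate COG: raise the per-call budget to 512 MiB.
    da = open_geotiff("s3://bucket/big_cog.tif",
                      max_cloud_bytes=512 * 1024 * 1024)

    # Opt out entirely (pre-#1928 eager behaviour).
    da = open_geotiff("s3://bucket/huge.tif", max_cloud_bytes=None)

    # Or skip the eager path and let dask read windows lazily.
    da = open_geotiff("s3://bucket/huge.tif", chunks=1024)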
---
 xrspatial/geotiff/__init__.py                |  18 +-
 xrspatial/geotiff/_reader.py                 |  83 ++++++++
 .../tests/test_cloud_read_byte_limit_1928.py | 197 ++++++++++++++++++
 3 files changed, 297 insertions(+), 1 deletion(-)
 create mode 100644 xrspatial/geotiff/tests/test_cloud_read_byte_limit_1928.py

diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py
index 7f61e514..319fec2f 100644
--- a/xrspatial/geotiff/__init__.py
+++ b/xrspatial/geotiff/__init__.py
@@ -75,7 +75,10 @@
 from ._backends.gpu import read_geotiff_gpu
 from ._backends.vrt import read_vrt
 from ._crs import _resolve_crs_to_wkt, _wkt_to_epsg
-from ._reader import read_to_array as _read_to_array
+from ._reader import (
+    _MAX_CLOUD_BYTES_SENTINEL,
+    read_to_array as _read_to_array,
+)
 from ._runtime import (
     GeoTIFFFallbackWarning,
     _CRS_WKT_DEPRECATED_SENTINEL,
@@ -220,6 +223,7 @@ def open_geotiff(source: str | BinaryIO, *,
                  chunks: int | tuple | None = None,
                  gpu: bool = False,
                  max_pixels: int | None = None,
+                 max_cloud_bytes=_MAX_CLOUD_BYTES_SENTINEL,
                  on_gpu_failure: str = _ON_GPU_FAILURE_SENTINEL,
                  missing_sources: str = _MISSING_SOURCES_SENTINEL,
                  ) -> xr.DataArray:
@@ -260,6 +264,16 @@ def open_geotiff(source: str | BinaryIO, *,
         Maximum allowed pixel count (width * height * samples). None
         uses the default (~1 billion). Raise to read legitimately
         large files.
+    max_cloud_bytes : int or None, optional
+        Byte ceiling for eager reads from fsspec sources (``s3://``,
+        ``gs://``, ``az://``, ``abfs://``, ``memory://``, ...). The
+        compressed object size is checked against this budget before
+        any bytes are downloaded. Default is 256 MiB, overridable via
+        the ``XRSPATIAL_GEOTIFF_MAX_CLOUD_BYTES`` env var. Pass
+        ``None`` to skip the check entirely. The HTTP path already
+        reads only what it needs via range requests and is not subject
+        to this limit. Has no effect on local file or file-like
+        sources. See issue #1928.
     on_gpu_failure : {'auto', 'strict'}, optional
         Forwarded to ``read_geotiff_gpu`` when ``gpu=True``.
         Controls whether GPU decode failures fall back to CPU (``'auto'``,
@@ -408,6 +422,8 @@ def open_geotiff(source: str | BinaryIO, *,
     kwargs = {}
     if max_pixels is not None:
         kwargs['max_pixels'] = max_pixels
+    if max_cloud_bytes is not _MAX_CLOUD_BYTES_SENTINEL:
+        kwargs['max_cloud_bytes'] = max_cloud_bytes
 
     # ``read_to_array`` validates ``window`` against the selected IFD's
     # extent and raises ``ValueError`` for out-of-bounds windows with
diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py
index cd27338a..ab3d7e87 100644
--- a/xrspatial/geotiff/_reader.py
+++ b/xrspatial/geotiff/_reader.py
@@ -46,11 +46,58 @@
 #: Override per-call via the ``max_pixels`` keyword argument.
 MAX_PIXELS_DEFAULT = 1_000_000_000
 
+#: Default byte ceiling for eager reads from fsspec cloud sources
+#: (``s3://``, ``gs://``, ``az://``, ``abfs://``, ``memory://``, ...).
+#: ``_CloudSource`` knows the object size up front via ``fsspec.size()``,
+#: so checking against this budget runs before any data is downloaded.
+#: 256 MiB is comfortable for typical demo COGs while bounding the blast
+#: radius of a crafted or oversized remote object. Override per call
+#: with the ``max_cloud_bytes`` kwarg, or process-wide with the
+#: ``XRSPATIAL_GEOTIFF_MAX_CLOUD_BYTES`` env var. Pass
+#: ``max_cloud_bytes=None`` to skip the check entirely (the pre-#1928
+#: behaviour). See issue #1928.
+MAX_CLOUD_BYTES_DEFAULT = 256 * 1024 * 1024
+
+#: Sentinel for "caller did not pass ``max_cloud_bytes``". Distinguishes
+#: that case from ``max_cloud_bytes=None`` (caller explicitly disabling
+#: the check) so the env-var override has somewhere to land.
+_MAX_CLOUD_BYTES_SENTINEL = object()
+
+
+def _resolve_max_cloud_bytes(max_cloud_bytes):
+    """Return the effective cloud byte budget.
+
+    Precedence:
+      1. Explicit kwarg (including ``None`` to disable) wins.
+      2. ``XRSPATIAL_GEOTIFF_MAX_CLOUD_BYTES`` env var, if set to a
+         positive int.
+      3. :data:`MAX_CLOUD_BYTES_DEFAULT`.
+    """
+    if max_cloud_bytes is not _MAX_CLOUD_BYTES_SENTINEL:
+        return max_cloud_bytes
+    env = _os_module.environ.get('XRSPATIAL_GEOTIFF_MAX_CLOUD_BYTES')
+    if env:
+        try:
+            v = int(env)
+        except ValueError:
+            return MAX_CLOUD_BYTES_DEFAULT
+        if v > 0:
+            return v
+    return MAX_CLOUD_BYTES_DEFAULT
+
 
 class PixelSafetyLimitError(ValueError):
     """Raised when a requested TIFF allocation exceeds max_pixels."""
 
 
+class CloudSizeLimitError(ValueError):
+    """Raised when an eager fsspec read would exceed ``max_cloud_bytes``.
+
+    Distinct from :class:`PixelSafetyLimitError` because the cloud check
+    runs against the compressed object size before any TIFF header parse,
+    so the pixel-count message would be misleading.
+    """
+
+
 def _check_dimensions(width, height, samples, max_pixels):
     """Raise PixelSafetyLimitError if the request exceeds *max_pixels*."""
     total = width * height * samples
@@ -2895,6 +2942,7 @@ def _miniswhite_inverted_nodata(nodata, ifd: IFD, dtype: np.dtype):
 
 def read_to_array(source, *, window=None, overview_level: int | None = None,
                   band: int | None = None,
                   max_pixels: int = MAX_PIXELS_DEFAULT,
+                  max_cloud_bytes=_MAX_CLOUD_BYTES_SENTINEL,
                   ) -> tuple[np.ndarray, GeoInfo]:
     """Read a GeoTIFF/COG to a numpy array.
@@ -2912,6 +2960,16 @@
         Maximum allowed total pixel count (width * height * samples).
         Prevents memory exhaustion from crafted TIFF headers. Default
         is 1 billion (~4 GB for float32 single-band).
+    max_cloud_bytes : int or None, optional
+        Byte ceiling for eager reads from fsspec sources (``s3://``,
+        ``gs://``, ``az://``, ``abfs://``, ``memory://``, ...). The
+        compressed object size is checked against this budget before any
+        bytes are downloaded. Default is :data:`MAX_CLOUD_BYTES_DEFAULT`
+        (256 MiB), overridable via the
+        ``XRSPATIAL_GEOTIFF_MAX_CLOUD_BYTES`` env var. Pass ``None`` to
+        skip the check entirely (pre-#1928 behaviour). The HTTP path
+        already reads only what it needs via range requests and is not
+        subject to this limit. See issue #1928.
 
     Returns
     -------
@@ -2927,6 +2985,31 @@
         src = _BytesIOSource(source)
     elif _is_fsspec_uri(source):
         src = _CloudSource(source)
+        # Check the compressed object size before any bytes are
+        # downloaded. ``_CloudSource.__init__`` already fetched the size
+        # via ``fsspec.size()``, so this is free. See issue #1928.
+        cloud_budget = _resolve_max_cloud_bytes(max_cloud_bytes)
+        if cloud_budget is not None:
+            size = src.size
+            if size is None:
+                src.close()
+                raise CloudSizeLimitError(
+                    f"Cloud source {source!r} reports unknown size; "
+                    f"refusing to download to avoid an unbounded read. "
+                    f"Pass max_cloud_bytes=None to override, or raise "
+                    f"the limit via the XRSPATIAL_GEOTIFF_MAX_CLOUD_BYTES "
+                    f"environment variable.")
+            if size > cloud_budget:
+                src.close()
+                raise CloudSizeLimitError(
+                    f"Cloud source {source!r} is {size:,} bytes, which "
+                    f"exceeds max_cloud_bytes={cloud_budget:,}. Eager "
+                    f"reads pull the full object before any TIFF header "
+                    f"parse; raise max_cloud_bytes (or set "
+                    f"XRSPATIAL_GEOTIFF_MAX_CLOUD_BYTES) if the file is "
+                    f"legitimate, pass max_cloud_bytes=None to disable "
+                    f"the check, or use chunks=... for a windowed dask "
+                    f"read.")
     else:
         src = _FileSource(source)
     data = src.read_all()
diff --git a/xrspatial/geotiff/tests/test_cloud_read_byte_limit_1928.py b/xrspatial/geotiff/tests/test_cloud_read_byte_limit_1928.py
new file mode 100644
index 00000000..e3d9ddb2
--- /dev/null
+++ b/xrspatial/geotiff/tests/test_cloud_read_byte_limit_1928.py
@@ -0,0 +1,197 @@
+"""Regression tests for issue #1928.
+
+Eager reads from fsspec sources used to call ``_CloudSource.read_all()``
+unconditionally, downloading the entire object before any TIFF header
+parse or ``max_pixels`` guard could fire. A crafted ``s3://`` / ``gs://``
+/ ``memory://`` object could exhaust memory or bandwidth before the
+dimensions were checked.
+
+The fix adds a ``max_cloud_bytes`` budget (default 256 MiB, env override
+``XRSPATIAL_GEOTIFF_MAX_CLOUD_BYTES``) that runs against the compressed
+object size before any bytes are fetched. ``_CloudSource`` already
+fetches the size from fsspec at construction, so the check is free.
+""" +from __future__ import annotations + +import importlib.util +import os + +import numpy as np +import pytest + + +fsspec = pytest.importorskip("fsspec") + +from xrspatial.geotiff import open_geotiff, to_geotiff # noqa: E402 +from xrspatial.geotiff._reader import ( # noqa: E402 + MAX_CLOUD_BYTES_DEFAULT, + CloudSizeLimitError, + _MAX_CLOUD_BYTES_SENTINEL, + _resolve_max_cloud_bytes, + read_to_array, +) + + +def _put_in_memory_fs(path: str, payload: bytes) -> None: + fs = fsspec.filesystem("memory") + fs.pipe(path, payload) + + +def _drop_from_memory_fs(path: str) -> None: + fs = fsspec.filesystem("memory") + try: + fs.rm(path) + except FileNotFoundError: + pass + + +def _make_small_tif_bytes(tmp_path) -> bytes: + """Build a small valid TIFF via the public writer.""" + arr = np.arange(16, dtype=np.float32).reshape(4, 4) + local = str(tmp_path / "src_1928.tif") + to_geotiff(arr, local, compression="none") + with open(local, "rb") as f: + return f.read() + + +class TestResolveMaxCloudBytes: + """``_resolve_max_cloud_bytes`` precedence: kwarg > env > default.""" + + def test_sentinel_returns_default(self): + assert _resolve_max_cloud_bytes( + _MAX_CLOUD_BYTES_SENTINEL + ) == MAX_CLOUD_BYTES_DEFAULT + + def test_none_disables_check(self): + assert _resolve_max_cloud_bytes(None) is None + + def test_int_kwarg_wins(self): + assert _resolve_max_cloud_bytes(42) == 42 + + def test_env_override(self, monkeypatch): + monkeypatch.setenv("XRSPATIAL_GEOTIFF_MAX_CLOUD_BYTES", "9999") + assert _resolve_max_cloud_bytes(_MAX_CLOUD_BYTES_SENTINEL) == 9999 + + def test_kwarg_overrides_env(self, monkeypatch): + monkeypatch.setenv("XRSPATIAL_GEOTIFF_MAX_CLOUD_BYTES", "9999") + assert _resolve_max_cloud_bytes(123) == 123 + assert _resolve_max_cloud_bytes(None) is None + + def test_invalid_env_falls_back_to_default(self, monkeypatch): + monkeypatch.setenv( + "XRSPATIAL_GEOTIFF_MAX_CLOUD_BYTES", "not-an-int" + ) + assert _resolve_max_cloud_bytes( + _MAX_CLOUD_BYTES_SENTINEL + ) == MAX_CLOUD_BYTES_DEFAULT + + def test_zero_or_negative_env_falls_back(self, monkeypatch): + monkeypatch.setenv("XRSPATIAL_GEOTIFF_MAX_CLOUD_BYTES", "0") + assert _resolve_max_cloud_bytes( + _MAX_CLOUD_BYTES_SENTINEL + ) == MAX_CLOUD_BYTES_DEFAULT + monkeypatch.setenv("XRSPATIAL_GEOTIFF_MAX_CLOUD_BYTES", "-1") + assert _resolve_max_cloud_bytes( + _MAX_CLOUD_BYTES_SENTINEL + ) == MAX_CLOUD_BYTES_DEFAULT + + +class TestCloudByteLimit: + """End-to-end through ``read_to_array`` / ``open_geotiff``.""" + + def test_small_cloud_object_under_budget_reads(self, tmp_path): + """Default budget (256 MiB) does not block normal-sized files.""" + payload = _make_small_tif_bytes(tmp_path) + path = "/under_budget_1928.tif" + _put_in_memory_fs(path, payload) + try: + arr, _ = read_to_array(f"memory://{path}") + assert arr.shape == (4, 4) + finally: + _drop_from_memory_fs(path) + + def test_oversized_cloud_object_rejected_before_read(self, tmp_path): + """A file larger than ``max_cloud_bytes`` raises without reading. + + The TIFF itself is valid and small, but the explicit per-call + ``max_cloud_bytes`` is set below the object size to force the + guard to fire. 
+ """ + payload = _make_small_tif_bytes(tmp_path) + path = "/over_budget_1928.tif" + _put_in_memory_fs(path, payload) + try: + with pytest.raises( + CloudSizeLimitError, match="exceeds max_cloud_bytes" + ): + read_to_array(f"memory://{path}", max_cloud_bytes=10) + finally: + _drop_from_memory_fs(path) + + def test_none_disables_limit(self, tmp_path): + """``max_cloud_bytes=None`` restores pre-#1928 behaviour.""" + payload = _make_small_tif_bytes(tmp_path) + path = "/disabled_check_1928.tif" + _put_in_memory_fs(path, payload) + try: + arr, _ = read_to_array( + f"memory://{path}", max_cloud_bytes=None + ) + assert arr.shape == (4, 4) + finally: + _drop_from_memory_fs(path) + + def test_env_var_threshold_applied(self, tmp_path, monkeypatch): + """Env override threads through when the kwarg is unspecified.""" + payload = _make_small_tif_bytes(tmp_path) + path = "/env_budget_1928.tif" + _put_in_memory_fs(path, payload) + monkeypatch.setenv("XRSPATIAL_GEOTIFF_MAX_CLOUD_BYTES", "10") + try: + with pytest.raises(CloudSizeLimitError): + read_to_array(f"memory://{path}") + finally: + _drop_from_memory_fs(path) + + def test_open_geotiff_plumbs_max_cloud_bytes(self, tmp_path): + """The kwarg is reachable from the public ``open_geotiff`` entry + point and reaches the eager path. Without it, the read succeeds; + a tight limit rejects.""" + payload = _make_small_tif_bytes(tmp_path) + path = "/open_geotiff_kwarg_1928.tif" + _put_in_memory_fs(path, payload) + try: + da = open_geotiff(f"memory://{path}") + assert da.shape == (4, 4) + with pytest.raises(CloudSizeLimitError): + open_geotiff(f"memory://{path}", max_cloud_bytes=8) + finally: + _drop_from_memory_fs(path) + + def test_local_file_unaffected(self, tmp_path): + """The limit only applies to fsspec URIs. A local file with a + tight ``max_cloud_bytes`` still reads (the kwarg is ignored). + """ + arr = np.arange(16, dtype=np.float32).reshape(4, 4) + local = str(tmp_path / "local_1928.tif") + to_geotiff(arr, local, compression="none") + # Tight limit must not fire on a local path. + out, _ = read_to_array(local, max_cloud_bytes=1) + np.testing.assert_array_equal(out, arr) + + def test_http_path_unaffected(self): + """The HTTP path uses range requests, not ``read_all``, so the + budget does not run there. We only check that the kwarg does not + change the dispatch (no ``CloudSizeLimitError`` for http URLs). + The HTTP code path is exercised by the loopback tests; here we + just confirm dispatch. + """ + # A clearly bogus HTTP URL should fail with a connection / DNS + # style error, not a CloudSizeLimitError, since the cloud-byte + # guard is not on the HTTP path. + with pytest.raises(Exception) as exc_info: + read_to_array( + "http://127.0.0.1:1/nonexistent.tif", + max_cloud_bytes=1, + ) + assert not isinstance(exc_info.value, CloudSizeLimitError) From 6643a7a0cf665443219b26ac5abb1a40d7c0412d Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Fri, 15 May 2026 07:42:56 -0700 Subject: [PATCH 2/2] geotiff: address copilot review on #1932 - Drop unused ``importlib.util`` and ``os`` imports from the cloud-read byte-limit test module (flake8 F401). - Tighten the unknown-size CloudSizeLimitError message: raising the byte budget does not unblock a source whose size is unknown, so the only working override is ``max_cloud_bytes=None``. Mention that explicitly. 
---
 xrspatial/geotiff/_reader.py                               | 6 +++---
 xrspatial/geotiff/tests/test_cloud_read_byte_limit_1928.py | 3 ---
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py
index ab3d7e87..bc07c9ce 100644
--- a/xrspatial/geotiff/_reader.py
+++ b/xrspatial/geotiff/_reader.py
@@ -2996,9 +2996,9 @@
                 raise CloudSizeLimitError(
                     f"Cloud source {source!r} reports unknown size; "
                     f"refusing to download to avoid an unbounded read. "
-                    f"Pass max_cloud_bytes=None to override, or raise "
-                    f"the limit via the XRSPATIAL_GEOTIFF_MAX_CLOUD_BYTES "
-                    f"environment variable.")
+                    f"Pass max_cloud_bytes=None to disable the size "
+                    f"check for this source. Raising the byte limit "
+                    f"does not help when the source size is unknown.")
             if size > cloud_budget:
                 src.close()
                 raise CloudSizeLimitError(
diff --git a/xrspatial/geotiff/tests/test_cloud_read_byte_limit_1928.py b/xrspatial/geotiff/tests/test_cloud_read_byte_limit_1928.py
index e3d9ddb2..ddae777b 100644
--- a/xrspatial/geotiff/tests/test_cloud_read_byte_limit_1928.py
+++ b/xrspatial/geotiff/tests/test_cloud_read_byte_limit_1928.py
@@ -13,9 +13,6 @@
 """
 from __future__ import annotations
 
-import importlib.util
-import os
-
 import numpy as np
 import pytest
 