18 changes: 17 additions & 1 deletion xrspatial/geotiff/__init__.py
@@ -75,7 +75,10 @@
from ._backends.gpu import read_geotiff_gpu
from ._backends.vrt import read_vrt
from ._crs import _resolve_crs_to_wkt, _wkt_to_epsg
from ._reader import read_to_array as _read_to_array
from ._reader import (
_MAX_CLOUD_BYTES_SENTINEL,
read_to_array as _read_to_array,
)
from ._runtime import (
GeoTIFFFallbackWarning,
_CRS_WKT_DEPRECATED_SENTINEL,
@@ -220,6 +223,7 @@ def open_geotiff(source: str | BinaryIO, *,
chunks: int | tuple | None = None,
gpu: bool = False,
max_pixels: int | None = None,
max_cloud_bytes=_MAX_CLOUD_BYTES_SENTINEL,
on_gpu_failure: str = _ON_GPU_FAILURE_SENTINEL,
missing_sources: str = _MISSING_SOURCES_SENTINEL,
) -> xr.DataArray:
@@ -260,6 +264,16 @@
Maximum allowed pixel count (width * height * samples). None
uses the default (~1 billion). Raise to read legitimately
large files.
max_cloud_bytes : int or None, optional
Byte ceiling for eager reads from fsspec sources (``s3://``,
``gs://``, ``az://``, ``abfs://``, ``memory://``, ...). The
compressed object size is checked against this budget before
any bytes are downloaded. Default is 256 MiB, overridable via
the ``XRSPATIAL_GEOTIFF_MAX_CLOUD_BYTES`` env var. Pass
``None`` to skip the check entirely. The HTTP path already
reads only what it needs via range requests and is not subject
to this limit. Has no effect on local file or file-like
sources. See issue #1928.
on_gpu_failure : {'auto', 'strict'}, optional
Forwarded to ``read_geotiff_gpu`` when ``gpu=True``. Controls
whether GPU decode failures fall back to CPU (``'auto'``,
@@ -408,6 +422,8 @@ def open_geotiff(source: str | BinaryIO, *,
kwargs = {}
if max_pixels is not None:
kwargs['max_pixels'] = max_pixels
if max_cloud_bytes is not _MAX_CLOUD_BYTES_SENTINEL:
kwargs['max_cloud_bytes'] = max_cloud_bytes

# ``read_to_array`` validates ``window`` against the selected IFD's
# extent and raises ``ValueError`` for out-of-bounds windows with
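From the caller's side, the new kwarg behaves as sketched below — a minimal, illustrative snippet assuming a writable working directory and fsspec's in-memory filesystem (the staging mirrors the regression-test fixtures further down; `open_geotiff`, `to_geotiff`, and `CloudSizeLimitError` are the names this PR touches, everything else is hypothetical):

```python
import fsspec
import numpy as np

from xrspatial.geotiff import open_geotiff, to_geotiff
from xrspatial.geotiff._reader import CloudSizeLimitError

# Stage a tiny GeoTIFF in fsspec's in-memory filesystem (illustrative
# path names; mirrors the test fixtures in this PR).
arr = np.arange(16, dtype=np.float32).reshape(4, 4)
to_geotiff(arr, "demo_1928.tif", compression="none")
with open("demo_1928.tif", "rb") as f:
    fsspec.filesystem("memory").pipe("/demo_1928.tif", f.read())

da = open_geotiff("memory:///demo_1928.tif")                # default 256 MiB budget
da = open_geotiff("memory:///demo_1928.tif",
                  max_cloud_bytes=None)                     # check disabled
try:
    open_geotiff("memory:///demo_1928.tif", max_cloud_bytes=8)  # guard fires
except CloudSizeLimitError as exc:
    print(exc)  # "... bytes, which exceeds max_cloud_bytes=8 ..."
```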
83 changes: 83 additions & 0 deletions xrspatial/geotiff/_reader.py
@@ -46,11 +46,58 @@
#: Override per-call via the ``max_pixels`` keyword argument.
MAX_PIXELS_DEFAULT = 1_000_000_000

#: Default byte ceiling for eager reads from fsspec cloud sources
#: (``s3://``, ``gs://``, ``az://``, ``abfs://``, ``memory://``, ...).
#: ``_CloudSource`` knows the object size up front via ``fsspec.size()``,
#: so checking against this budget runs before any data is downloaded.
#: 256 MiB is comfortable for typical demo COGs while bounding the blast
#: radius of a crafted or oversized remote object. Override per call
#: with the ``max_cloud_bytes`` kwarg, or env-wide with
#: ``XRSPATIAL_GEOTIFF_MAX_CLOUD_BYTES``. Pass ``max_cloud_bytes=None``
#: to skip the check entirely (the pre-#1928 behaviour). See issue #1928.
MAX_CLOUD_BYTES_DEFAULT = 256 * 1024 * 1024

#: Sentinel for "caller did not pass ``max_cloud_bytes``". Distinguishes
#: that case from ``max_cloud_bytes=None`` (caller explicitly disabling
#: the check) so the env-var override has somewhere to land.
_MAX_CLOUD_BYTES_SENTINEL = object()


def _resolve_max_cloud_bytes(max_cloud_bytes):
"""Return the effective cloud byte budget.

Precedence:
1. Explicit kwarg (including ``None`` to disable) wins.
2. ``XRSPATIAL_GEOTIFF_MAX_CLOUD_BYTES`` env var, if set to a
positive int.
3. :data:`MAX_CLOUD_BYTES_DEFAULT`.
"""
if max_cloud_bytes is not _MAX_CLOUD_BYTES_SENTINEL:
return max_cloud_bytes
env = _os_module.environ.get('XRSPATIAL_GEOTIFF_MAX_CLOUD_BYTES')
if env:
try:
v = int(env)
except ValueError:
return MAX_CLOUD_BYTES_DEFAULT
if v > 0:
return v
return MAX_CLOUD_BYTES_DEFAULT


class PixelSafetyLimitError(ValueError):
"""Raised when a requested TIFF allocation exceeds max_pixels."""


class CloudSizeLimitError(ValueError):
"""Raised when an eager fsspec read would exceed ``max_cloud_bytes``.

Distinct from :class:`PixelSafetyLimitError` because the cloud check
runs against the compressed object size before any TIFF header parse,
so the pixel-count message would be misleading.
"""


def _check_dimensions(width, height, samples, max_pixels):
"""Raise PixelSafetyLimitError if the request exceeds *max_pixels*."""
total = width * height * samples
@@ -2895,6 +2942,7 @@ def _miniswhite_inverted_nodata(nodata, ifd: IFD, dtype: np.dtype):
def read_to_array(source, *, window=None, overview_level: int | None = None,
band: int | None = None,
max_pixels: int = MAX_PIXELS_DEFAULT,
max_cloud_bytes=_MAX_CLOUD_BYTES_SENTINEL,
) -> tuple[np.ndarray, GeoInfo]:
"""Read a GeoTIFF/COG to a numpy array.

@@ -2912,6 +2960,16 @@ def read_to_array(source, *, window=None, overview_level: int | None = None,
Maximum allowed total pixel count (width * height * samples).
Prevents memory exhaustion from crafted TIFF headers.
Default is 1 billion (~4 GB for float32 single-band).
max_cloud_bytes : int or None, optional
Byte ceiling for eager reads from fsspec sources (``s3://``,
``gs://``, ``az://``, ``abfs://``, ``memory://``, ...). The
compressed object size is checked against this budget before any
bytes are downloaded. Default is :data:`MAX_CLOUD_BYTES_DEFAULT`
(256 MiB), overridable via the
``XRSPATIAL_GEOTIFF_MAX_CLOUD_BYTES`` env var. Pass ``None`` to
skip the check entirely (pre-#1928 behaviour). The HTTP path
already reads only what it needs via range requests and is not
subject to this limit. See issue #1928.

Returns
-------
Expand All @@ -2927,6 +2985,31 @@ def read_to_array(source, *, window=None, overview_level: int | None = None,
src = _BytesIOSource(source)
elif _is_fsspec_uri(source):
src = _CloudSource(source)
# Check the compressed object size before any bytes are
# downloaded. ``_CloudSource.__init__`` already fetched the size
# via ``fsspec.size()``, so this is free. See issue #1928.
cloud_budget = _resolve_max_cloud_bytes(max_cloud_bytes)
if cloud_budget is not None:
size = src.size
if size is None:
src.close()
raise CloudSizeLimitError(
f"Cloud source {source!r} reports unknown size; "
f"refusing to download to avoid an unbounded read. "
f"Pass max_cloud_bytes=None to disable the size "
f"check for this source. Raising the byte limit "
f"does not help when the source size is unknown.")
if size > cloud_budget:
src.close()
raise CloudSizeLimitError(
f"Cloud source {source!r} is {size:,} bytes, which "
f"exceeds max_cloud_bytes={cloud_budget:,}. Eager "
f"reads pull the full object before any TIFF header "
f"parse; raise max_cloud_bytes (or set "
f"XRSPATIAL_GEOTIFF_MAX_CLOUD_BYTES) if the file is "
f"legitimate, pass max_cloud_bytes=None to disable "
f"the check, or use chunks=... for a windowed dask "
f"read.")
else:
src = _FileSource(source)
data = src.read_all()
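The new error message itself points at one recovery pattern: catch `CloudSizeLimitError` from the eager read and retry with `chunks=...`, which the message describes as a windowed dask read rather than a full-object pull. A hypothetical caller-side helper, not part of the PR (`open_remote`, the budget, and the chunk size are all illustrative):

```python
from xrspatial.geotiff import open_geotiff
from xrspatial.geotiff._reader import CloudSizeLimitError


def open_remote(uri, budget=64 * 1024 * 1024, chunk=2048):
    """Hypothetical helper: eager read under *budget* bytes, falling
    back to the windowed dask read that ``chunks=`` provides."""
    try:
        # Compressed object size is checked before any bytes download.
        return open_geotiff(uri, max_cloud_bytes=budget)
    except CloudSizeLimitError:
        # Windowed path reads ranges on demand instead of the whole object.
        return open_geotiff(uri, chunks=chunk)
```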
194 changes: 194 additions & 0 deletions xrspatial/geotiff/tests/test_cloud_read_byte_limit_1928.py
@@ -0,0 +1,194 @@
"""Regression tests for issue #1928.

Eager reads from fsspec sources used to call ``_CloudSource.read_all()``
unconditionally, downloading the entire object before any TIFF header
parse or ``max_pixels`` guard could fire. A crafted ``s3://`` / ``gs://``
/ ``memory://`` object could exhaust memory or bandwidth before the
dimensions were checked.

The fix adds a ``max_cloud_bytes`` budget (default 256 MiB, env override
``XRSPATIAL_GEOTIFF_MAX_CLOUD_BYTES``) that runs against the compressed
object size before any bytes are fetched. ``_CloudSource`` already
fetches the size from fsspec at construction, so the check is free.
"""
from __future__ import annotations

import numpy as np
import pytest


fsspec = pytest.importorskip("fsspec")

from xrspatial.geotiff import open_geotiff, to_geotiff # noqa: E402
from xrspatial.geotiff._reader import ( # noqa: E402
MAX_CLOUD_BYTES_DEFAULT,
CloudSizeLimitError,
_MAX_CLOUD_BYTES_SENTINEL,
_resolve_max_cloud_bytes,
read_to_array,
)


def _put_in_memory_fs(path: str, payload: bytes) -> None:
fs = fsspec.filesystem("memory")
fs.pipe(path, payload)


def _drop_from_memory_fs(path: str) -> None:
fs = fsspec.filesystem("memory")
try:
fs.rm(path)
except FileNotFoundError:
pass


def _make_small_tif_bytes(tmp_path) -> bytes:
"""Build a small valid TIFF via the public writer."""
arr = np.arange(16, dtype=np.float32).reshape(4, 4)
local = str(tmp_path / "src_1928.tif")
to_geotiff(arr, local, compression="none")
with open(local, "rb") as f:
return f.read()


class TestResolveMaxCloudBytes:
"""``_resolve_max_cloud_bytes`` precedence: kwarg > env > default."""

def test_sentinel_returns_default(self):
assert _resolve_max_cloud_bytes(
_MAX_CLOUD_BYTES_SENTINEL
) == MAX_CLOUD_BYTES_DEFAULT

def test_none_disables_check(self):
assert _resolve_max_cloud_bytes(None) is None

def test_int_kwarg_wins(self):
assert _resolve_max_cloud_bytes(42) == 42

def test_env_override(self, monkeypatch):
monkeypatch.setenv("XRSPATIAL_GEOTIFF_MAX_CLOUD_BYTES", "9999")
assert _resolve_max_cloud_bytes(_MAX_CLOUD_BYTES_SENTINEL) == 9999

def test_kwarg_overrides_env(self, monkeypatch):
monkeypatch.setenv("XRSPATIAL_GEOTIFF_MAX_CLOUD_BYTES", "9999")
assert _resolve_max_cloud_bytes(123) == 123
assert _resolve_max_cloud_bytes(None) is None

def test_invalid_env_falls_back_to_default(self, monkeypatch):
monkeypatch.setenv(
"XRSPATIAL_GEOTIFF_MAX_CLOUD_BYTES", "not-an-int"
)
assert _resolve_max_cloud_bytes(
_MAX_CLOUD_BYTES_SENTINEL
) == MAX_CLOUD_BYTES_DEFAULT

def test_zero_or_negative_env_falls_back(self, monkeypatch):
monkeypatch.setenv("XRSPATIAL_GEOTIFF_MAX_CLOUD_BYTES", "0")
assert _resolve_max_cloud_bytes(
_MAX_CLOUD_BYTES_SENTINEL
) == MAX_CLOUD_BYTES_DEFAULT
monkeypatch.setenv("XRSPATIAL_GEOTIFF_MAX_CLOUD_BYTES", "-1")
assert _resolve_max_cloud_bytes(
_MAX_CLOUD_BYTES_SENTINEL
) == MAX_CLOUD_BYTES_DEFAULT


class TestCloudByteLimit:
"""End-to-end through ``read_to_array`` / ``open_geotiff``."""

def test_small_cloud_object_under_budget_reads(self, tmp_path):
"""Default budget (256 MiB) does not block normal-sized files."""
payload = _make_small_tif_bytes(tmp_path)
path = "/under_budget_1928.tif"
_put_in_memory_fs(path, payload)
try:
arr, _ = read_to_array(f"memory://{path}")
assert arr.shape == (4, 4)
finally:
_drop_from_memory_fs(path)

def test_oversized_cloud_object_rejected_before_read(self, tmp_path):
"""A file larger than ``max_cloud_bytes`` raises without reading.

The TIFF itself is valid and small, but the explicit per-call
``max_cloud_bytes`` is set below the object size to force the
guard to fire.
"""
payload = _make_small_tif_bytes(tmp_path)
path = "/over_budget_1928.tif"
_put_in_memory_fs(path, payload)
try:
with pytest.raises(
CloudSizeLimitError, match="exceeds max_cloud_bytes"
):
read_to_array(f"memory://{path}", max_cloud_bytes=10)
finally:
_drop_from_memory_fs(path)

def test_none_disables_limit(self, tmp_path):
"""``max_cloud_bytes=None`` restores pre-#1928 behaviour."""
payload = _make_small_tif_bytes(tmp_path)
path = "/disabled_check_1928.tif"
_put_in_memory_fs(path, payload)
try:
arr, _ = read_to_array(
f"memory://{path}", max_cloud_bytes=None
)
assert arr.shape == (4, 4)
finally:
_drop_from_memory_fs(path)

def test_env_var_threshold_applied(self, tmp_path, monkeypatch):
"""Env override threads through when the kwarg is unspecified."""
payload = _make_small_tif_bytes(tmp_path)
path = "/env_budget_1928.tif"
_put_in_memory_fs(path, payload)
monkeypatch.setenv("XRSPATIAL_GEOTIFF_MAX_CLOUD_BYTES", "10")
try:
with pytest.raises(CloudSizeLimitError):
read_to_array(f"memory://{path}")
finally:
_drop_from_memory_fs(path)

def test_open_geotiff_plumbs_max_cloud_bytes(self, tmp_path):
"""The kwarg is reachable from the public ``open_geotiff`` entry
point and reaches the eager path. Without it, the read succeeds;
a tight limit rejects."""
payload = _make_small_tif_bytes(tmp_path)
path = "/open_geotiff_kwarg_1928.tif"
_put_in_memory_fs(path, payload)
try:
da = open_geotiff(f"memory://{path}")
assert da.shape == (4, 4)
with pytest.raises(CloudSizeLimitError):
open_geotiff(f"memory://{path}", max_cloud_bytes=8)
finally:
_drop_from_memory_fs(path)

def test_local_file_unaffected(self, tmp_path):
"""The limit only applies to fsspec URIs. A local file with a
tight ``max_cloud_bytes`` still reads (the kwarg is ignored).
"""
arr = np.arange(16, dtype=np.float32).reshape(4, 4)
local = str(tmp_path / "local_1928.tif")
to_geotiff(arr, local, compression="none")
# Tight limit must not fire on a local path.
out, _ = read_to_array(local, max_cloud_bytes=1)
np.testing.assert_array_equal(out, arr)

def test_http_path_unaffected(self):
"""The HTTP path uses range requests, not ``read_all``, so the
budget does not run there. We only check that the kwarg does not
change the dispatch (no ``CloudSizeLimitError`` for http URLs).
The HTTP code path is exercised by the loopback tests; here we
just confirm dispatch.
"""
# A clearly bogus HTTP URL should fail with a connection / DNS
# style error, not a CloudSizeLimitError, since the cloud-byte
# guard is not on the HTTP path.
with pytest.raises(Exception) as exc_info:
read_to_array(
"http://127.0.0.1:1/nonexistent.tif",
max_cloud_bytes=1,
)
assert not isinstance(exc_info.value, CloudSizeLimitError)
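With fsspec installed, the new module runs standalone via `pytest xrspatial/geotiff/tests/test_cloud_read_byte_limit_1928.py`; without it, the `pytest.importorskip("fsspec")` guard skips the whole file.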