From ca97bb392268f93867bb220d267db8bb27d6e8d7 Mon Sep 17 00:00:00 2001
From: Brendan Collins
Date: Wed, 13 May 2026 07:38:21 -0700
Subject: [PATCH 1/4] Make VRT chunks read lazily (#1798)

---
 xrspatial/geotiff/__init__.py                 | 165 ++++++++++++++++++
 .../tests/test_read_vrt_lazy_chunks_1798.py   |  51 ++++++
 2 files changed, 216 insertions(+)
 create mode 100644 xrspatial/geotiff/tests/test_read_vrt_lazy_chunks_1798.py

diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py
index 67a5fade..1c693b05 100644
--- a/xrspatial/geotiff/__init__.py
+++ b/xrspatial/geotiff/__init__.py
@@ -3753,6 +3753,165 @@ def _gpu_compress_to_part(gpu_arr, w, h, spp):
     _write_bytes(file_bytes, path)
 
 
+def _vrt_effective_dtype(vrt, band):
+    """Return the dtype a VRT read is expected to materialize."""
+    selected = [vrt.bands[band]] if band is not None else vrt.bands
+    if not selected:
+        raise ValueError(
+            "VRT has no <VRTRasterBand> elements; cannot determine "
+            "output dtype"
+        )
+    effective = []
+    for vrt_band in selected:
+        dt = vrt_band.dtype
+        for src in vrt_band.sources:
+            scaled = src.scale is not None and src.scale != 1.0
+            offset = src.offset is not None and src.offset != 0.0
+            if scaled or offset:
+                dt = np.dtype(np.float64)
+                break
+        if dt.kind in ('u', 'i') and vrt_band.nodata is not None:
+            try:
+                if isinstance(vrt_band.nodata, (int, np.integer)):
+                    nd = int(vrt_band.nodata)
+                else:
+                    nf = float(vrt_band.nodata)
+                    nd = int(nf) if np.isfinite(nf) and nf.is_integer() else None
+                if nd is not None:
+                    info = np.iinfo(dt)
+                    if info.min <= nd <= info.max:
+                        dt = np.dtype(np.float64)
+            except (TypeError, ValueError):
+                pass
+        effective.append(dt)
+    return np.result_type(*effective)
+
+
+def _read_vrt_dask(source: str, *, dtype=None, window=None, band=None,
+                   name=None, chunks=None, max_pixels=None):
+    """Build a truly lazy dask-backed VRT DataArray from window tasks."""
+    import os
+    import dask
+    import dask.array as da
+    from ._reader import _check_dimensions, MAX_PIXELS_DEFAULT
+    from ._vrt import parse_vrt
+
+    with open(source, 'r') as f:
+        xml_str = f.read()
+    vrt_dir = os.path.dirname(os.path.abspath(source))
+    vrt = parse_vrt(xml_str, vrt_dir)
+
+    if band is not None:
+        if not isinstance(band, (int, np.integer)) or isinstance(band, bool):
+            raise ValueError(f"band must be a non-negative int, got {band!r}")
+        if band < 0 or band >= len(vrt.bands):
+            raise ValueError(
+                f"band index {band} out of range for VRT with "
+                f"{len(vrt.bands)} band(s)")
+
+    if window is not None:
+        win_r0, win_c0, win_r1, win_c1 = window
+        if (win_r0 < 0 or win_c0 < 0
+                or win_r1 > vrt.height or win_c1 > vrt.width
+                or win_r0 >= win_r1 or win_c0 >= win_c1):
+            raise ValueError(
+                f"window={window} is outside the VRT extent "
+                f"({vrt.height}x{vrt.width}) or has non-positive size.")
+    else:
+        win_r0, win_c0, win_r1, win_c1 = 0, 0, vrt.height, vrt.width
+
+    height = win_r1 - win_r0
+    width = win_c1 - win_c0
+    n_bands = len([vrt.bands[band]] if band is not None else vrt.bands)
+    if max_pixels is None:
+        max_pixels = MAX_PIXELS_DEFAULT
+    _check_dimensions(width, height, n_bands, max_pixels)
+
+    out_dtype = np.dtype(dtype) if dtype is not None else _vrt_effective_dtype(vrt, band)
+    if dtype is not None:
+        _validate_dtype_cast(_vrt_effective_dtype(vrt, band), out_dtype)
+
+    if isinstance(chunks, int):
+        ch_h = ch_w = chunks
+    else:
+        ch_h, ch_w = chunks
+
+    rows = list(range(0, height, ch_h))
+    cols = list(range(0, width, ch_w))
+    out_has_band_axis = band is None and n_bands > 1
+
+    @dask.delayed
+    def _read_chunk(chunk_window):
+        
chunk_da = read_vrt( + source, dtype=dtype, window=chunk_window, band=band, + chunks=None, gpu=False, max_pixels=max_pixels, + ) + arr = np.asarray(chunk_da.values) + if arr.dtype != out_dtype: + arr = arr.astype(out_dtype) + return arr + + dask_rows = [] + for r0 in rows: + r1 = min(r0 + ch_h, height) + dask_cols = [] + for c0 in cols: + c1 = min(c0 + ch_w, width) + chunk_window = (r0 + win_r0, c0 + win_c0, + r1 + win_r0, c1 + win_c0) + shape = ((r1 - r0, c1 - c0, n_bands) + if out_has_band_axis else (r1 - r0, c1 - c0)) + dask_cols.append(da.from_delayed( + _read_chunk(chunk_window), shape=shape, dtype=out_dtype)) + dask_rows.append(da.concatenate(dask_cols, axis=1)) + dask_arr = da.concatenate(dask_rows, axis=0) + + coords = {} + gt = vrt.geo_transform + if gt is not None: + origin_x, res_x, _, origin_y, _, res_y = gt + if vrt.raster_type == 'point': + x_shift = win_c0 * res_x + y_shift = win_r0 * res_y + else: + x_shift = (win_c0 + 0.5) * res_x + y_shift = (win_r0 + 0.5) * res_y + coords = { + 'x': np.arange(width, dtype=np.float64) * res_x + origin_x + x_shift, + 'y': np.arange(height, dtype=np.float64) * res_y + origin_y + y_shift, + } + + attrs = {} + if vrt.crs_wkt: + epsg = _wkt_to_epsg(vrt.crs_wkt) + if epsg is not None: + attrs['crs'] = epsg + attrs['crs_wkt'] = vrt.crs_wkt + if vrt.raster_type == 'point': + attrs['raster_type'] = 'point' + if vrt.bands: + band_idx_for_nodata = band if band is not None else 0 + nodata = vrt.bands[band_idx_for_nodata].nodata + if nodata is not None: + attrs['nodata'] = nodata + if gt is not None: + origin_x, res_x, _, origin_y, _, res_y = gt + attrs['transform'] = ( + float(res_x), 0.0, float(origin_x) + win_c0 * float(res_x), + 0.0, float(res_y), float(origin_y) + win_r0 * float(res_y), + ) + + if name is None: + name = os.path.splitext(os.path.basename(source))[0] + if out_has_band_axis: + dims = ['y', 'x', 'band'] + coords['band'] = np.arange(n_bands) + else: + dims = ['y', 'x'] + return xr.DataArray(dask_arr, dims=dims, coords=coords, + name=name, attrs=attrs) + + def read_vrt(source: str, *, dtype: str | np.dtype | None = None, window: tuple | None = None, @@ -3828,6 +3987,12 @@ def read_vrt(source: str, *, # default (eager read), so allow it through here. chunks = _validate_chunks_arg(chunks, allow_none=True) + if chunks is not None and not gpu: + return _read_vrt_dask( + source, dtype=dtype, window=window, band=band, name=name, + chunks=chunks, max_pixels=max_pixels, + ) + arr, vrt = _read_vrt_internal(source, window=window, band=band, max_pixels=max_pixels) diff --git a/xrspatial/geotiff/tests/test_read_vrt_lazy_chunks_1798.py b/xrspatial/geotiff/tests/test_read_vrt_lazy_chunks_1798.py new file mode 100644 index 00000000..85ecfd75 --- /dev/null +++ b/xrspatial/geotiff/tests/test_read_vrt_lazy_chunks_1798.py @@ -0,0 +1,51 @@ +"""read_vrt(chunks=...) 
should build lazy window tasks (#1798)."""
+from __future__ import annotations
+
+import os
+import warnings
+
+import numpy as np
+
+from xrspatial.geotiff import to_geotiff, read_vrt
+
+
+def _write_vrt(vrt_path, source_name):
+    vrt_path.write_text(
+        '<VRTDataset rasterXSize="6" rasterYSize="4">\n'
+        '    <VRTRasterBand dataType="Float32" band="1">\n'
+        '        <SimpleSource>\n'
+        f'            <SourceFilename relativeToVRT="1">{source_name}</SourceFilename>'
+        '\n'
+        '            <SourceBand>1</SourceBand>\n'
+        '            <SrcRect xOff="0" yOff="0" xSize="6" ySize="4"/>\n'
+        '            <DstRect xOff="0" yOff="0" xSize="6" ySize="4"/>\n'
+        '        </SimpleSource>\n'
+        '    </VRTRasterBand>\n'
+        '</VRTDataset>\n'
+    )
+
+
+def test_read_vrt_chunks_matches_eager_values(tmp_path):
+    arr = np.arange(24, dtype=np.float32).reshape(4, 6)
+    src = tmp_path / "tmp_1798_source.tif"
+    to_geotiff(arr, str(src), compression='none')
+    vrt = tmp_path / "tmp_1798_source.vrt"
+    _write_vrt(vrt, os.path.basename(src))
+
+    eager = read_vrt(str(vrt))
+    lazy = read_vrt(str(vrt), chunks=2)
+
+    assert lazy.data.chunks == ((2, 2), (2, 2, 2))
+    np.testing.assert_array_equal(lazy.compute().values, eager.values)
+
+
+def test_read_vrt_chunks_does_not_read_sources_during_construction(tmp_path):
+    vrt = tmp_path / "tmp_1798_missing_source.vrt"
+    _write_vrt(vrt, "missing.tif")
+
+    with warnings.catch_warnings(record=True) as caught:
+        lazy = read_vrt(str(vrt), chunks=2)
+
+    assert caught == []
+    assert hasattr(lazy.data, 'compute')
+

From b51dc2fc82a3c63664796aec3f832f551bf61fc0 Mon Sep 17 00:00:00 2001
From: Brendan Collins
Date: Wed, 13 May 2026 08:19:27 -0700
Subject: [PATCH 2/4] Cap lazy VRT dask task graphs

---
 xrspatial/geotiff/__init__.py                 | 18 ++++++++++++++++++
 .../tests/test_read_vrt_lazy_chunks_1798.py   | 12 ++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py
index 1c693b05..3c8e1443 100644
--- a/xrspatial/geotiff/__init__.py
+++ b/xrspatial/geotiff/__init__.py
@@ -3836,6 +3836,24 @@ def _read_vrt_dask(source: str, *, dtype=None, window=None, band=None,
     else:
         ch_h, ch_w = chunks
 
+    # Match read_geotiff_dask's graph-size guard. Each VRT chunk becomes a
+    # delayed task, so tiny chunks over very large VRT extents can OOM the
+    # driver during graph construction before any source read executes.
+    _MAX_DASK_CHUNKS = 50_000
+    n_chunks = ((height + ch_h - 1) // ch_h) * ((width + ch_w - 1) // ch_w)
+    if n_chunks > _MAX_DASK_CHUNKS:
+        import math
+        scale = math.sqrt(n_chunks / _MAX_DASK_CHUNKS)
+        suggested_h = int(math.ceil(ch_h * scale))
+        suggested_w = int(math.ceil(ch_w * scale))
+        raise ValueError(
+            f"read_vrt: chunks=({ch_h}, {ch_w}) on a {height}x{width} "
+            f"VRT window would produce {n_chunks:,} dask tasks, exceeding "
+            f"the {_MAX_DASK_CHUNKS:,}-task cap. Pass a larger chunks=... "
+            f"value explicitly (e.g. chunks=({suggested_h}, "
+            f"{suggested_w}) keeps the task count under the cap)."
+    )
+
     rows = list(range(0, height, ch_h))
     cols = list(range(0, width, ch_w))
     out_has_band_axis = band is None and n_bands > 1
diff --git a/xrspatial/geotiff/tests/test_read_vrt_lazy_chunks_1798.py b/xrspatial/geotiff/tests/test_read_vrt_lazy_chunks_1798.py
index 85ecfd75..d4cb89d8 100644
--- a/xrspatial/geotiff/tests/test_read_vrt_lazy_chunks_1798.py
+++ b/xrspatial/geotiff/tests/test_read_vrt_lazy_chunks_1798.py
@@ -5,6 +5,7 @@
 import warnings
 
 import numpy as np
+import pytest
 
 from xrspatial.geotiff import to_geotiff, read_vrt
 
@@ -49,3 +50,14 @@ def test_read_vrt_chunks_does_not_read_sources_during_construction(tmp_path):
 
     assert caught == []
     assert hasattr(lazy.data, 'compute')
+
+
+def test_read_vrt_chunks_rejects_excessive_task_count(tmp_path):
+    vrt = tmp_path / "tmp_1798_huge_extent.vrt"
+    vrt.write_text(
+        '<VRTDataset rasterXSize="200000" rasterYSize="100000">\n'
+        '    <VRTRasterBand dataType="Byte" band="1"/>\n'
+        '</VRTDataset>\n'
+    )
+
+    with pytest.raises(ValueError, match="task cap"):
+        read_vrt(str(vrt), chunks=1, max_pixels=20_000_000_000)

From 7329dd9fd8641f6c2f4487025e1288f219702b93 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 13 May 2026 15:31:27 +0000
Subject: [PATCH 3/4] Merge origin/main into issue-1798

Agent-Logs-Url: https://github.com/xarray-contrib/xarray-spatial/sessions/27f4131a-2907-4ca0-bf00-f303ab61f2e9
Co-authored-by: brendancol <433221+brendancol@users.noreply.github.com>
---
 .claude/sweep-security-state.csv              |   2 +-
 docs/source/reference/geotiff.rst             |  11 +
 xrspatial/geotiff/__init__.py                 |  63 +++-
 xrspatial/geotiff/_compression.py             | 177 ++++++++++-
 xrspatial/geotiff/_reader.py                  |  46 ++-
 xrspatial/geotiff/_vrt.py                     |  23 +-
 xrspatial/geotiff/_writer.py                  | 268 ++++++++++----
 .../geotiff/tests/test_decompression_caps.py  | 182 +++++++++++
 .../test_geotiff_band_bool_rejection_1786.py  | 290 ++++++++++++++++++
 .../tests/test_http_dask_orientation_1794.py  |  81 +++++
 .../test_miniswhite_backend_parity_1797.py    | 120 ++++++++
 .../tests/test_parallel_writer_1800.py        | 276 +++++++++++++++++
 .../test_read_geotiff_dask_vrt_kwargs_1795.py |  56 ++++
 .../test_vrt_missing_sources_policy_1799.py   |  51 +++
 .../tests/test_vrt_source_max_pixels_1796.py  |  35 +++
 ...eotiff_streaming_bigtiff_threshold_1785.py | 263 ++++++++++++++++
 16 files changed, 1870 insertions(+), 74 deletions(-)
 create mode 100644 xrspatial/geotiff/tests/test_geotiff_band_bool_rejection_1786.py
 create mode 100644 xrspatial/geotiff/tests/test_http_dask_orientation_1794.py
 create mode 100644 xrspatial/geotiff/tests/test_miniswhite_backend_parity_1797.py
 create mode 100644 xrspatial/geotiff/tests/test_parallel_writer_1800.py
 create mode 100644 xrspatial/geotiff/tests/test_read_geotiff_dask_vrt_kwargs_1795.py
 create mode 100644 xrspatial/geotiff/tests/test_vrt_missing_sources_policy_1799.py
 create mode 100644 xrspatial/geotiff/tests/test_vrt_source_max_pixels_1796.py
 create mode 100644 xrspatial/tests/test_geotiff_streaming_bigtiff_threshold_1785.py

diff --git a/.claude/sweep-security-state.csv b/.claude/sweep-security-state.csv
index bfe45a06..5f97061b 100644
--- a/.claude/sweep-security-state.csv
+++ b/.claude/sweep-security-state.csv
@@ -18,7 +18,7 @@ fire,2026-04-25,,,,,"Clean. Despite the module's size hint, fire.py is purely pe
 flood,2026-05-03,1437,MEDIUM,3,,Re-audit 2026-05-03. MEDIUM Cat 3 fixed in PR #1438 (travel_time and flood_depth_vegetation now validate mannings_n DataArray values are finite and strictly positive via _validate_mannings_n_dataarray helper). No remaining unfixed findings. 
Other categories clean: every allocation is same-shape as input; no flat index math; NaN propagation explicit in every backend; tan_slope clamped by _TAN_MIN; no CUDA kernels; no file I/O; every public API calls _validate_raster on DataArray inputs. focal,2026-04-27,1284,HIGH,1,,"HIGH (fixed PR #1286): apply(), focal_stats(), and hotspots() accepted unbounded user-supplied kernels via custom_kernel(), which only checks shape parity. The kernel-size guard from #1241 (_check_kernel_memory) only ran inside circle_kernel/annulus_kernel, so a (50001, 50001) custom kernel on a 10x10 raster allocated ~10 GB on the kernel itself plus a much larger padded raster before any work -- same shape as the bilateral DoS in #1236. Fixed by adding _check_kernel_vs_raster_memory in focal.py and wiring it into apply(), focal_stats(), and hotspots() after custom_kernel() validation. All 134 focal tests + 19 bilateral tests pass. No other findings: 10 CUDA kernels all have proper bounds + stencil guards; _validate_raster called on every public entry point; hotspots already raises ZeroDivisionError on constant-value rasters; _focal_variety_cuda uses a fixed-size local buffer (silent truncation but bounded); _focal_std_cuda/_focal_var_cuda clamp the catastrophic-cancellation case via if var < 0.0: var = 0.0; no file I/O." geodesic,2026-04-27,1283,HIGH,1,,"HIGH (fixed PR #1285): slope(method='geodesic') and aspect(method='geodesic') stack a (3, H, W) float64 array (data, lat, lon) before dispatch with no memory check. A large lat/lon-tagged raster passed to either function would OOM. Fixed by adding _check_geodesic_memory(rows, cols) in xrspatial/geodesic.py (mirrors morphology._check_kernel_memory): budgets 56 bytes/cell (24 stacked float64 + 4 float32 output + 24 padded copy + slack) and raises MemoryError when > 50% of available RAM; called from slope.py and aspect.py inside the geodesic branch before dispatch. No other findings: 6 CUDA kernels all have bounds guards (e.g. _run_gpu_geodesic_aspect at geodesic.py:395), custom 16x16 thread blocks avoid register spill, no shared memory, _validate_raster runs upstream in slope/aspect, all backends cast to float32, slope_mag < 1e-7 flat threshold prevents arctan2 NaN propagation, curvature correction uses hardcoded WGS84 R." -geotiff,2026-05-12,1737,HIGH,1,,"Re-audit pass 16 2026-05-12 (deep-sweep p3). NEW HIGH (Cat 1, fixed PR pending against #1737): VRT _resample_nearest allocated (dr.y_size, dr.x_size) before the clip was taken. A crafted can declare values orders of magnitude larger than the VRT's rasterXSize/rasterYSize; the output buffer was bounded by _check_dimensions but the resample intermediate was not. Tracing: a 10x10 source with DstRect 50000x50000 on a 100x100 VRT extent allocated ~2.5 GB inside np.repeat. Fixed by checking dr.x_size * dr.y_size against max_pixels in _vrt.py:read_vrt() before _resample_nearest runs. Mirrors the _check_dimensions pattern from _reader.py. Six new tests in test_vrt_dstrect_resample_cap_1737.py cover the huge-X, huge-Y, legitimate, max_pixels override, at-cap, and negative cases. All 201 existing VRT tests still pass. 
Other categories verified clean (no new findings): Cat 1 (allocations): _check_dimensions covers the public window / HTTP / dask paths; MAX_TILE_BYTES_DEFAULT (256 MiB) caps per-tile / per-strip compressed bytes locally and over HTTP (PR #1668); LERC blob-header pre-check, JP2K SIZ pre-check, deflate/zstd/lz4/packbits caps still in place; Cat 2 (overflow): _check_dimensions catches before alloc; int64 in CUDA tile offsets; Cat 3 (NaN logic): NaN-aware paths via #1597, #1630; Cat 4 (GPU bounds): all CUDA kernels (_byte_swap_lanes_kernel, _lzw_decode_tiles_kernel, _inflate_tiles_kernel, _predictor_decode_kernel_u8/u16/u32/u64, _fp_predictor_decode_kernel, _assemble_tiles_kernel, _extract_tiles_kernel, _predictor_encode_kernel_u8/u16/u32/u64, _fp_predictor_encode_kernel) have bounds guards; shared memory sizes fixed; Cat 5 (path): _MmapCache realpath, VRT path containment #1671 with XRSPATIAL_VRT_ALLOWED_ROOTS opt-in; tempfile.mkstemp + os.replace for writer; SSRF defenses for _HTTPSource via #1664; Cat 6 (dtype): _validate_dtype_cast in __init__.py; _NATIVE_ORDER byte-swap. XML payloads gated via _safe_xml.safe_fromstring with DOCTYPE rejection (#1579). _compression.py uses tempfile.mkstemp without dir= so the JP2K temp file lands in the system tmpdir (TMPDIR / TEMP), which is the documented safe default. No new findings beyond #1737." +geotiff,2026-05-13,1792,MEDIUM,1,,"Re-audit pass 17 2026-05-13 (deep-sweep s2). NEW MEDIUM (Cat 1): jpeg_decompress (_compression.py:1042-1066) hands attacker-controlled JPEG bytes to Pillow without consulting the declared tile width/height/samples; a tile-size mismatch lets a small JPEG payload allocate up to Pillow's MAX_IMAGE_PIXELS*2 (~178M pixels, ~500 MB RGB) before the downstream chunk.size != expected check fires. Asymmetric with the JP2K SIZ pre-check and LERC blob-info pre-check. Pillow's default DecompressionBombError is a partial guard so severity is MEDIUM. Other categories verified clean: Cat 2-6 same coverage as pass 16 audit; JPEG2000 / LERC / deflate / zstd / lz4 / packbits / LZW caps still in place; VRT _resample_nearest DstRect cap (#1737) merged; VRT path containment + DOCTYPE rejection in _safe_xml; CUDA kernels have bounds guards; mmap cache uses realpath; SSRF defenses on _HTTPSource." glcm,2026-04-24,1257,HIGH,1,,"HIGH (fixed #1257): glcm_texture() validated window_size only as >= 3 and distance only as >= 1, with no upper bound on either. _glcm_numba_kernel iterates range(r-half, r+half+1) for every pixel, so window_size=1_000_001 on a 10x10 raster ran ~10^14 loop iterations with all neighbors failing the interior bounds check (CPU DoS). On the dask backends depth = window_size // 2 + distance drove map_overlap padding, so a huge window also caused oversize per-chunk allocations (memory DoS). Fixed by adding max_val caps in the public entrypoint: window_size <= max(3, min(rows, cols)) and distance <= max(1, window_size // 2). One cap covers every backend because cupy and dask+cupy call through to the CPU kernel after cupy.asnumpy. No other HIGH findings: levels is already capped at 256 so the per-pixel np.zeros((levels, levels)) matrix in the kernel is bounded to 512 KB. No CUDA kernels. No file I/O. Quantization clips to [0, levels-1] before the kernel and NaN maps to -1 which the kernel filters with i_val >= 0. Entropy log(p) and correlation p / (std_i * std_j) are both guarded. All four backends use _validate_raster and cast to float64 before quantizing. 
MEDIUM (unfixed, Cat 1): the per-pixel np.zeros((levels, levels)) allocation inside the hot loop is a perf issue (levels=256 -> 512 KB alloc+free per pixel) but not a security issue because levels is bounded. Could be hoisted out of the loop or replaced with an in-place clear, but that is an efficiency concern, not security." gpu_rtx,2026-04-29,1308,HIGH,1,,"HIGH (fixed #1308 / PR #1310): hillshade_rtx (gpu_rtx/hillshade.py:184) and viewshed_gpu (gpu_rtx/viewshed.py:269) allocated cupy device buffers sized by raster shape with no memory check. create_triangulation (mesh_utils.py:23-24) adds verts (12 B/px) + triangles (24 B/px) = 36 B/px; hillshade_rtx adds d_rays(32) + d_hits(16) + d_aux(12) + d_output(4) = 64 B/px (100 B/px total); viewshed_gpu adds d_rays(32) + d_hits(16) + d_visgrid(4) + d_vsrays(32) = 84 B/px (120 B/px total). A 30000x30000 raster asked for 90-108 GB of VRAM before cupy surfaced an opaque allocator error. Fixed by adding gpu_rtx/_memory.py with _available_gpu_memory_bytes() and _check_gpu_memory(func_name, h, w) helpers (cost_distance #1262 / sky_view_factor #1299 pattern, 120 B/px budget covers worst case, raises MemoryError when required > 50% of free VRAM, skips silently when memGetInfo() unavailable). Wired into both entry points after the cupy.ndarray type check and before create_triangulation. 9 new tests in test_gpu_rtx_memory.py (5 helper-unit + 4 end-to-end gated on has_rtx). All 81 existing hillshade/viewshed tests still pass. Cat 4 clean: all CUDA kernels (hillshade.py:25/62/106, viewshed.py:32/74/116, mesh_utils.py:50) have bounds guards; no shared memory, no syncthreads needed. MEDIUM not fixed (Cat 6): hillshade_rtx and viewshed_gpu do not call _validate_raster directly but parent hillshade() (hillshade.py:252) and viewshed() (viewshed.py:1707) already validate, so input validation runs before the gpu_rtx entry point - defense-in-depth, not exploitable. MEDIUM not fixed (Cat 2): mesh_utils.py:64-68 cast mesh_map_index to int32 in the triangle index buffer; overflows at H*W > 2.1B vertices (~46341x46341+) but the new memory guard rejects rasters that large first - documentation/clarity item rather than exploitable. MEDIUM not fixed (Cat 3): mesh_utils.py:19 scale = maxDim / maxH divides by zero on an all-zero raster, propagating inf/NaN into mesh vertex z-coords; separate follow-up. LOW not fixed (Cat 5): mesh_utils.write() opens user-supplied path without canonicalization but its only call site (mesh_utils.py:38-39) sits behind if False: in create_triangulation, not reachable in production." hillshade,2026-04-27,,,,,"Clean. Cat 1: only allocation is the output np.empty(data.shape) at line 32 (cupy at line 165) and a _pad_array with hardcoded depth=1 (line 62) -- bounded by caller, no user-controlled amplifier. Azimuth/altitude are scalars and don't drive size. Cat 2: numba kernel uses range(1, rows-1) with simple (y, x) indexing; numba range loops promote to int64. Cat 3: math.sqrt(1.0 + xx_plus_yy) is always >= 1.0 (no neg sqrt, no div-by-zero); NaN elevation propagates correctly through dz_dx/dz_dy -> shaded -> output (the shaded < 0.0 / shaded > 1.0 clamps don't fire on NaN). Azimuth validated to [0, 360], altitude to [0, 90]. Cat 4: _gpu_calc_numba (line 107) guards both grid bounds and 3x3 stencil reads via i > 0 and i < shape[0]-1 and j > 0 and j < shape[1]-1; no shared memory. Cat 5: no file I/O. 
Cat 6: hillshade() calls _validate_raster (line 252) and _validate_scalar for both azimuth (253) and angle_altitude (254); all four backend paths cast to float32; tests parametrize int32/int64/float32/float64." diff --git a/docs/source/reference/geotiff.rst b/docs/source/reference/geotiff.rst index 93d82685..e58a6f16 100644 --- a/docs/source/reference/geotiff.rst +++ b/docs/source/reference/geotiff.rst @@ -94,3 +94,14 @@ silently falls back to CPU. XRSPATIAL_GEOTIFF_STRICT=1 pytest xrspatial/geotiff/tests/ See issue #1662 for the audit and the full list of affected call sites. + +VRT missing sources +=================== + +``read_vrt`` accepts ``missing_sources='warn'`` or ``'raise'``. The default +``'warn'`` preserves the historical behavior: unreadable source files emit +:class:`xrspatial.geotiff.GeoTIFFFallbackWarning`, the returned DataArray +contains ``attrs['vrt_holes']``, and the mosaic is returned with holes. +Use ``missing_sources='raise'`` when a partial mosaic should fail the +pipeline immediately. ``XRSPATIAL_GEOTIFF_STRICT=1`` still raises in +``'warn'`` mode so CI environments can enforce fail-fast behavior globally. diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index 3c8e1443..06edd26f 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -2154,7 +2154,10 @@ def read_geotiff_dask(source: str, *, # rather than letting the windowed-read path try to parse VRT XML as # TIFF bytes. ``read_vrt`` is the single source of truth for VRT. if isinstance(source, str) and source.lower().endswith('.vrt'): - return read_vrt(source, dtype=dtype, name=name, chunks=chunks) + return read_vrt( + source, dtype=dtype, window=window, band=band, name=name, + chunks=chunks, max_pixels=max_pixels, + ) # P5: HTTP COG sources used to fire one IFD/header GET per chunk # task. Parse metadata once here so every delayed task can reuse it. @@ -2186,6 +2189,13 @@ def read_geotiff_dask(source: str, *, finally: _src.close() http_meta = (http_header, http_ifd) + if http_ifd.orientation != 1: + raise ValueError( + f"Orientation tag (274) is {http_ifd.orientation}; " + f"dask-chunked reads (chunks=...) are not supported for " + f"non-default orientation on remote GeoTIFF sources. Read " + f"the full array first, then slice/chunk it." + ) # Wrap the parsed metadata in a single dask Delayed so every # window task takes it as a graph input, not a Python closure. # Without this, the (TIFFHeader, IFD) pair -- which can carry @@ -2282,6 +2292,14 @@ def read_geotiff_dask(source: str, *, coords = _geo_to_coords(geo_info, full_h, full_w) if band is not None: + # Reject ``bool`` and ``np.bool_`` up front; ``isinstance(True, int)`` + # is True in Python so ``True < n_bands`` evaluates without raising + # and silently reads band 1. ``np.bool_`` is not a subclass of + # ``bool`` so it needs its own check to match the VRT path's + # rejection. See #1786. 
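+        # (Concretely: ``read_geotiff_dask(path, band=True)`` used to
+        # return the second band, index 1, without any error.)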
+ if isinstance(band, (bool, np.bool_)): + raise ValueError( + f"band must be a non-negative int, got {band!r}") if n_bands == 0: if band != 0: raise IndexError( @@ -2435,6 +2453,7 @@ def _read(http_meta): from ._reader import ( _fetch_decode_cog_http_tiles, MAX_PIXELS_DEFAULT, + _apply_photometric_miniswhite, ) header, ifd = http_meta if _is_http_src: @@ -2454,6 +2473,7 @@ def _read(http_meta): if (arr.ndim == 3 and ifd.samples_per_pixel > 1 and band is not None): arr = arr[:, :, band] + arr = _apply_photometric_miniswhite(arr, ifd) else: _r2a_kwargs = {} if max_pixels is not None: @@ -2967,6 +2987,15 @@ def read_geotiff_gpu(source: str, *, # behaviour mirrors ``read_geotiff_dask``. ifd_samples = ifd.samples_per_pixel if band is not None: + # Reject ``bool`` and ``np.bool_`` up front; + # ``isinstance(True, int)`` is True in Python so + # ``True < ifd_samples`` evaluates without raising and silently + # reads band 1. ``np.bool_`` is not a subclass of ``bool`` so it + # needs its own check to match the VRT path's rejection. + # See #1786. + if isinstance(band, (bool, np.bool_)): + raise ValueError( + f"band must be a non-negative int, got {band!r}") if ifd_samples <= 1: if band != 0: raise IndexError( @@ -3299,6 +3328,13 @@ def _read_once(): geo_info = _apply_orientation_geo_info( geo_info, orientation, file_h=height, file_w=width) + if (ifd.photometric == 0 and samples == 1 and not arr_was_cpu_decoded): + gpu_dtype = np.dtype(str(arr_gpu.dtype)) + if gpu_dtype.kind == 'u': + arr_gpu = np.iinfo(gpu_dtype).max - arr_gpu + elif gpu_dtype.kind == 'f': + arr_gpu = -arr_gpu + # Apply nodata mask + record sentinel so the GPU read agrees with the # CPU eager path (issue #1542). Without this, integer rasters keep the # literal sentinel value and float rasters keep the sentinel rather @@ -3788,7 +3824,8 @@ def _vrt_effective_dtype(vrt, band): def _read_vrt_dask(source: str, *, dtype=None, window=None, band=None, - name=None, chunks=None, max_pixels=None): + name=None, chunks=None, max_pixels=None, + missing_sources='warn'): """Build a truly lazy dask-backed VRT DataArray from window tasks.""" import os import dask @@ -3863,6 +3900,7 @@ def _read_chunk(chunk_window): chunk_da = read_vrt( source, dtype=dtype, window=chunk_window, band=band, chunks=None, gpu=False, max_pixels=max_pixels, + missing_sources=missing_sources, ) arr = np.asarray(chunk_da.values) if arr.dtype != out_dtype: @@ -3937,7 +3975,8 @@ def read_vrt(source: str, *, name: str | None = None, chunks: int | tuple | None = None, gpu: bool = False, - max_pixels: int | None = None) -> xr.DataArray: + max_pixels: int | None = None, + missing_sources: str = 'warn') -> xr.DataArray: """Read a GDAL Virtual Raster Table (.vrt) into an xarray.DataArray. The VRT's source GeoTIFFs are read via windowed reads and assembled @@ -3966,6 +4005,12 @@ def read_vrt(source: str, *, assembled VRT region. None uses the reader default (~1 billion). Matches ``open_geotiff`` / ``read_geotiff_dask`` / ``read_geotiff_gpu``. + missing_sources : {'warn', 'raise'}, default 'warn' + Policy for unreadable source files referenced by the VRT. ``'warn'`` + preserves the historical behavior: emit ``GeoTIFFFallbackWarning``, + record ``attrs['vrt_holes']``, and return a partial mosaic. + ``'raise'`` fails immediately. ``XRSPATIAL_GEOTIFF_STRICT=1`` also + raises, even when ``missing_sources='warn'``. Returns ------- @@ -4005,14 +4050,22 @@ def read_vrt(source: str, *, # default (eager read), so allow it through here. 
chunks = _validate_chunks_arg(chunks, allow_none=True) + if missing_sources not in ('warn', 'raise'): + raise ValueError( + f"missing_sources must be 'warn' or 'raise', got " + f"{missing_sources!r}") + if chunks is not None and not gpu: return _read_vrt_dask( source, dtype=dtype, window=window, band=band, name=name, chunks=chunks, max_pixels=max_pixels, + missing_sources=missing_sources, ) - arr, vrt = _read_vrt_internal(source, window=window, band=band, - max_pixels=max_pixels) + arr, vrt = _read_vrt_internal( + source, window=window, band=band, max_pixels=max_pixels, + missing_sources=missing_sources, + ) if name is None: import os diff --git a/xrspatial/geotiff/_compression.py b/xrspatial/geotiff/_compression.py index 3be9400c..2912198f 100644 --- a/xrspatial/geotiff/_compression.py +++ b/xrspatial/geotiff/_compression.py @@ -1,12 +1,48 @@ """Compression codecs: deflate (zlib) and LZW (Numba), plus horizontal predictor.""" from __future__ import annotations +import threading import zlib import numpy as np from xrspatial.utils import ngjit +# -- Optional libdeflate backend -------------------------------------------- +# +# When the ``libdeflate`` package is installed, ``deflate_compress`` routes +# through it: libdeflate is typically 1.5-2x faster than ``zlib`` at the +# same compression level, and GDAL >= 3.7 already uses it when available +# so our installs match throughput. Output is wire-compatible (zlib +# format), so encoded streams round-trip through stdlib ``zlib.decompress`` +# unchanged. +# +# libdeflate's ``Compressor`` objects are not thread-safe, so we keep one +# per (thread, level) pair via ``threading.local``. The writer drives +# compression from a ``ThreadPoolExecutor``; per-thread caching avoids +# allocating a fresh compressor per strip/tile. +try: # pragma: no cover - exercised only when libdeflate is installed + import libdeflate as _libdeflate + _HAVE_LIBDEFLATE = True +except ImportError: + _libdeflate = None + _HAVE_LIBDEFLATE = False + +_libdeflate_thread_local = threading.local() + + +def _libdeflate_compressor(level: int): + """Return a thread-local libdeflate Compressor for *level*.""" + cache = getattr(_libdeflate_thread_local, 'cache', None) + if cache is None: + cache = {} + _libdeflate_thread_local.cache = cache + comp = cache.get(level) + if comp is None: + comp = _libdeflate.Compressor(level) + cache[level] = comp + return comp + # -- Decompression-bomb defenses --------------------------------------------- # # A malicious TIFF can declare a small strip/tile compressed payload that @@ -98,7 +134,14 @@ def deflate_decompress(data: bytes, expected_size: int = 0) -> bytes: def deflate_compress(data: bytes, level: int = 6) -> bytes: - """Compress data with deflate/zlib.""" + """Compress data with deflate/zlib. + + Uses ``libdeflate`` when installed (1.5-2x faster than ``zlib``) and + falls back to ``zlib.compress`` otherwise. Output is wire-compatible + either way: the stdlib ``zlib.decompress`` accepts both. + """ + if _HAVE_LIBDEFLATE: + return _libdeflate_compressor(level).compress(data, _libdeflate.Format.ZLIB) return zlib.compress(data, level) @@ -1039,6 +1082,125 @@ def _splice_jpeg_tables(tile_data: bytes, return tile_data[:2] + tables_body + tile_data[2:] +# JPEG Start-Of-Frame marker codes: 0xFFC0..0xFFCF *except* DHT (0xC4), +# JPG (0xC8), and DAC (0xCC). 
The payload format is the same across every +# SOF variant: 2-byte segment length, 1-byte sample precision, 2-byte +# image height (big-endian), 2-byte image width (big-endian), 1-byte +# component count. +_JPEG_SOF_CODES = frozenset( + {0xC0, 0xC1, 0xC2, 0xC3, 0xC5, 0xC6, 0xC7, + 0xC9, 0xCA, 0xCB, 0xCD, 0xCE, 0xCF} +) + + +def _read_jpeg_sof(data: bytes) -> tuple[int, int, int] | None: + """Return ``(height, width, components)`` from the first JPEG SOF marker. + + Scans *data* for the first Start-Of-Frame marker without decoding any + pixel data. Returns ``None`` when the buffer is too short, when the + SOI marker is missing, when a length-prefixed segment runs off the + end, or when no SOF is found before EOI/end-of-buffer. The caller + treats a ``None`` return as "could not determine declared size" and + falls back to Pillow's own guard. + + Used by :func:`jpeg_decompress` to enforce a pre-decode bomb cap + analogous to the JP2K SIZ pre-check and the LERC ``getLercBlobInfo`` + pre-check. + """ + n = len(data) + if n < 4 or data[0] != 0xFF or data[1] != 0xD8: + return None + i = 2 + while i + 3 < n: + if data[i] != 0xFF: + return None + # Skip JPEG fill bytes (0xFF padding). + marker = data[i + 1] + while marker == 0xFF and i + 2 < n: + i += 1 + marker = data[i + 1] + # Standalone markers without payload: SOI, EOI, RSTm, TEM. + # Encountering EOI before SOF means the stream is malformed for + # this purpose; caller falls back to Pillow. + if marker == 0xD9: # EOI + return None + if marker in (0xD8,) or 0xD0 <= marker <= 0xD7 or marker == 0x01: + i += 2 + continue + if marker in _JPEG_SOF_CODES: + # SOF is a length-prefixed segment; validate the declared + # segment length before reading the fixed fields so a + # truncated/malformed segment header is rejected as "unknown + # size" rather than silently reading past the segment into + # later bytes. The fields we need are at offsets 5..9 in the + # segment, which sits inside the declared seg_len bytes. + if i + 3 >= n: + return None + seg_len = (data[i + 2] << 8) | data[i + 3] + # SOF requires at least: 2 (length) + 1 (precision) + 2 (h) + # + 2 (w) + 1 (components) = 8 bytes; the segment must also + # fit inside the buffer. + if seg_len < 8 or i + 2 + seg_len > n: + return None + height = (data[i + 5] << 8) | data[i + 6] + width = (data[i + 7] << 8) | data[i + 8] + components = data[i + 9] + return height, width, components + # Length-prefixed segment: payload length includes the two length + # bytes themselves but not the marker. + if i + 3 >= n: + return None + seg_len = (data[i + 2] << 8) | data[i + 3] + if seg_len < 2: + return None + i += 2 + seg_len + return None + + +def _check_jpeg_bomb(data: bytes, expected_width: int, expected_height: int, + expected_samples: int) -> None: + """Reject JPEG blobs whose SOF dimensions exceed the expected tile size. + + The caller passes the TIFF-declared tile width/height/samples; we + parse the JPEG SOF marker to discover the JPEG's own declared + dimensions and refuse to decode when the projected output exceeds + the same ``expected * 1.05 + 1`` margin used by every other codec + wrapper. Skipping when any expected value is non-positive matches + the convention from the other codecs: callers that don't supply a + size fall back to Pillow's built-in ``DecompressionBombError`` + guard. 
+ + A return of ``None`` from :func:`_read_jpeg_sof` is treated as + "could not determine declared size"; in that case we defer to + Pillow rather than raising, so legitimate streams with unusual + framing still decode. + """ + if expected_width <= 0 or expected_height <= 0 or expected_samples <= 0: + return + info = _read_jpeg_sof(data) + if info is None: + return + declared_h, declared_w, declared_components = info + # JPEG components ship as 8-bit samples in every TIFF JPEG we + # support, so ``width * height * components`` equals the post-decode + # byte count exactly. JPEG-12 (12-bit precision) would round up to + # 2 bytes per sample, but tifffile/Pillow doesn't surface those on + # the read path so we treat samples as bytes here. The expected + # side uses what the *caller* declared (TIFF tile size and + # samples-per-pixel), so a JPEG claiming extra components is + # rejected as a bomb. + expected_bytes = expected_width * expected_height * expected_samples + declared_bytes = declared_w * declared_h * max(declared_components, 1) + cap = _max_output_with_margin(expected_bytes) + if declared_bytes > cap: + raise ValueError( + f"jpeg decode would exceed expected size: declared output is " + f"{declared_bytes} bytes ({declared_w}x{declared_h}x" + f"{declared_components}), cap is {cap} (expected " + f"{expected_bytes}). Likely a decompression bomb." + ) + + def jpeg_decompress(data: bytes, width: int = 0, height: int = 0, samples: int = 1, jpeg_tables: bytes | None = None) -> bytes: """Decompress JPEG tile/strip data. Requires Pillow. @@ -1048,6 +1210,13 @@ def jpeg_decompress(data: bytes, width: int = 0, height: int = 0, data : bytes Raw JPEG bytes from one TIFF strip or tile. May be a fragment when ``jpeg_tables`` is supplied (GDAL tiled JPEG). + width, height, samples : int, optional + TIFF-declared tile dimensions. When all three are positive the + wrapper inspects the JPEG SOF marker before decoding and raises + ``ValueError`` when the JPEG's declared output exceeds + ``width * height * samples * 1.05 + 1`` bytes + (decompression-bomb guard). Defaults of 0/0/1 disable the cap + for direct callers and round-trip tests. jpeg_tables : bytes, optional Contents of TIFF tag 347 (JPEGTables). If supplied, the shared DQT/DHT segments are spliced into ``data`` before decoding so @@ -1060,6 +1229,12 @@ def jpeg_decompress(data: bytes, width: int = 0, height: int = 0, import io if jpeg_tables: data = _splice_jpeg_tables(data, jpeg_tables) + # Pre-decode bomb cap (issue #1792). Pillow's built-in + # DecompressionBombError fires at ~178M pixels (~500 MB RGB) which + # is well above a typical tile's expected size; without this + # per-tile pre-check a single malicious tile can allocate hundreds + # of MB before the downstream chunk.size != expected check fires. + _check_jpeg_bomb(data, width, height, samples) img = Image.open(io.BytesIO(data)) # libjpeg already converts YCbCr->RGB during decode, so rely on the # mode Pillow returns. Calling .convert() unnecessarily would copy. 
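
For reference, a minimal sketch (not part of the patch) of the byte layout
``_read_jpeg_sof`` parses. Offsets follow the helper's own comments; the
byte values are illustrative:

    from xrspatial.geotiff._compression import _read_jpeg_sof

    # SOI, then a single SOF0 segment: length 17 (8 fixed bytes plus
    # 3 x 3 component specs), 8-bit precision, height 512, width 1024,
    # 3 components.
    blob = bytes([
        0xFF, 0xD8,              # SOI
        0xFF, 0xC0, 0x00, 0x11,  # SOF0 marker + segment length
        0x08,                    # sample precision
        0x02, 0x00,              # height 512 (big-endian)
        0x04, 0x00,              # width 1024 (big-endian)
        0x03,                    # component count
        0x01, 0x22, 0x00,        # component spec (id, sampling, q-table)
        0x02, 0x11, 0x01,
        0x03, 0x11, 0x01,
    ])
    assert _read_jpeg_sof(blob) == (512, 1024, 3)

Against an expected 32x32x3 tile (3072 bytes), this frame declares
512 * 1024 * 3 = 1,572,864 output bytes, so ``_check_jpeg_bomb`` would
refuse the decode at the ``3072 * 1.05 + 1`` cap before Pillow ever runs.
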
diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py index 19c2105f..b3da32bd 100644 --- a/xrspatial/geotiff/_reader.py +++ b/xrspatial/geotiff/_reader.py @@ -47,11 +47,15 @@ MAX_PIXELS_DEFAULT = 1_000_000_000 +class PixelSafetyLimitError(ValueError): + """Raised when a requested TIFF allocation exceeds max_pixels.""" + + def _check_dimensions(width, height, samples, max_pixels): - """Raise ValueError if the requested allocation exceeds *max_pixels*.""" + """Raise PixelSafetyLimitError if the request exceeds *max_pixels*.""" total = width * height * samples if total > max_pixels: - raise ValueError( + raise PixelSafetyLimitError( f"TIFF image dimensions ({width} x {height} x {samples} = " f"{total:,} pixels) exceed the safety limit of " f"{max_pixels:,} pixels. Pass a larger max_pixels value to " @@ -1911,6 +1915,15 @@ def _read_cog_http(url: str, overview_level: int | None = None, # urllib3 ``PoolManager`` is shared module-level, not per-source) # but a future resource-holding source will need it. See issue #1695. if band is not None: + # Reject ``bool`` (and ``np.bool_``) up front; ``isinstance(True, int)`` + # is True in Python so ``True < samples_per_pixel`` evaluates without + # raising and silently reads band 1. ``np.bool_`` is not a subclass of + # ``bool`` so it needs its own check to match the VRT path's + # rejection. See #1786. + if isinstance(band, (bool, np.bool_)): + source.close() + raise ValueError( + f"band must be a non-negative int, got {band!r}") if ifd.samples_per_pixel <= 1: if band != 0: source.close() @@ -1941,6 +1954,8 @@ def _read_cog_http(url: str, overview_level: int | None = None, arr, geo_info = _apply_orientation_with_geo( arr, geo_info, ifd.orientation) + arr = _apply_photometric_miniswhite(arr, ifd) + return arr, geo_info @@ -2290,6 +2305,16 @@ def _apply_orientation_with_geo( return arr, geo_info +def _apply_photometric_miniswhite(arr: np.ndarray, ifd: IFD) -> np.ndarray: + """Apply TIFF MinIsWhite inversion for single-band grayscale images.""" + if ifd.photometric == 0 and ifd.samples_per_pixel == 1: + if arr.dtype.kind == 'u': + return np.iinfo(arr.dtype).max - arr + if arr.dtype.kind == 'f': + return -arr + return arr + + def read_to_array(source, *, window=None, overview_level: int | None = None, band: int | None = None, max_pixels: int = MAX_PIXELS_DEFAULT, @@ -2396,6 +2421,16 @@ def read_to_array(source, *, window=None, overview_level: int | None = None, # index only. See issue #1673. ifd_samples = ifd.samples_per_pixel if band is not None: + # Reject ``bool`` and ``np.bool_`` before the range check. + # ``isinstance(True, int)`` is True in Python and + # ``True < ifd_samples`` evaluates as ``1``, so without this + # guard ``band=True`` silently reads band 1 and ``band=False`` + # reads band 0. ``np.bool_`` is not a subclass of ``bool`` so it + # needs its own check to match the VRT path's existing + # rejection. See #1786. 
+ if isinstance(band, (bool, np.bool_)): + raise ValueError( + f"band must be a non-negative int, got {band!r}") if ifd_samples <= 1: if band != 0: raise IndexError( @@ -2421,12 +2456,7 @@ def read_to_array(source, *, window=None, overview_level: int | None = None, arr, geo_info = _apply_orientation_with_geo( arr, geo_info, orientation) - # MinIsWhite (photometric=0): invert single-band grayscale values - if ifd.photometric == 0 and ifd.samples_per_pixel == 1: - if arr.dtype.kind == 'u': - arr = np.iinfo(arr.dtype).max - arr - elif arr.dtype.kind == 'f': - arr = -arr + arr = _apply_photometric_miniswhite(arr, ifd) finally: src.close() diff --git a/xrspatial/geotiff/_vrt.py b/xrspatial/geotiff/_vrt.py index 5a792ffa..8616f18e 100644 --- a/xrspatial/geotiff/_vrt.py +++ b/xrspatial/geotiff/_vrt.py @@ -610,7 +610,8 @@ def _resample_nearest(src_arr: np.ndarray, def read_vrt(vrt_path: str, *, window=None, band: int | None = None, - max_pixels: int | None = None) -> tuple[np.ndarray, VRTDataset]: + max_pixels: int | None = None, + missing_sources: str = 'warn') -> tuple[np.ndarray, VRTDataset]: """Read a VRT file by assembling pixel data from its source files. Parameters @@ -621,18 +622,30 @@ def read_vrt(vrt_path: str, *, window=None, (row_start, col_start, row_stop, col_stop) for windowed read. band : int or None Band index (0-based). None returns all bands. + max_pixels : int or None + Maximum allowed pixel count (width * height * samples) for the + assembled VRT region. None uses the reader default. + missing_sources : {'warn', 'raise'} + Policy for unreadable source files referenced by the VRT. + ``'warn'`` emits ``GeoTIFFFallbackWarning`` and records + ``vrt.holes`` unless ``XRSPATIAL_GEOTIFF_STRICT=1`` is set. + ``'raise'`` fails immediately. Returns ------- (np.ndarray, VRTDataset) tuple """ - from ._reader import read_to_array + from ._reader import PixelSafetyLimitError, read_to_array with open(vrt_path, 'r') as f: xml_str = f.read() vrt_dir = os.path.dirname(os.path.abspath(vrt_path)) vrt = parse_vrt(xml_str, vrt_dir) + if missing_sources not in ('warn', 'raise'): + raise ValueError( + f"missing_sources must be 'warn' or 'raise', got " + f"{missing_sources!r}") # Validate ``band`` against the parsed band count. Python list # indexing would silently accept negative values (``vrt.bands[-1]`` @@ -871,10 +884,13 @@ def read_vrt(vrt_path: str, *, window=None, src.filename, window=(read_r0, read_c0, read_r1, read_c1), band=src.band - 1, # convert 1-based to 0-based + max_pixels=max_pixels, ) except ( OSError, ValueError, struct.error, ) + _CODEC_DECODE_EXCEPTIONS as e: + if isinstance(e, PixelSafetyLimitError): + raise # Under XRSPATIAL_GEOTIFF_STRICT=1, surface the read failure # so partial mosaics are caught in CI. Default mode warns # once per missing source then continues, preserving the @@ -892,13 +908,14 @@ def read_vrt(vrt_path: str, *, window=None, # See issue #1734. import warnings from . import _geotiff_strict_mode, GeoTIFFFallbackWarning - if _geotiff_strict_mode(): + if missing_sources == 'raise' or _geotiff_strict_mode(): raise warnings.warn( f"VRT source {src.filename!r} could not be read " f"({type(e).__name__}: {e}); skipping. The output " f"mosaic will have a hole at this tile. 
Inspect " f"``DataArray.attrs['vrt_holes']`` or set " + f"missing_sources='raise' or " f"XRSPATIAL_GEOTIFF_STRICT=1 to raise instead.", GeoTIFFFallbackWarning, stacklevel=2, diff --git a/xrspatial/geotiff/_writer.py b/xrspatial/geotiff/_writer.py index abe22bf3..53590619 100644 --- a/xrspatial/geotiff/_writer.py +++ b/xrspatial/geotiff/_writer.py @@ -225,6 +225,14 @@ def _compression_tag(compression_name: str) -> int: #: override. _MAX_OVERVIEW_LEVELS = 8 +#: Total uncompressed payload (bytes) below which the strip and tile +#: writers stay sequential. The thread-pool startup cost dominates on +#: small rasters; above this size the per-block compression cost more +#: than pays for it. 4 MiB was chosen empirically on a 20-core box: +#: parallel becomes a net win around ~2 MiB, and the 4 MiB margin keeps +#: a few-tile / two-strip layout from incurring a slowdown. +_PARALLEL_MIN_BYTES = 4 * 1024 * 1024 + def _validate_overview_levels(overview_levels, height=None, width=None): """Validate and normalise an explicit ``overview_levels`` list. @@ -651,12 +659,50 @@ def _build_ifd(tags: list[tuple], overflow_base: int, # Strip writer # --------------------------------------------------------------------------- +def _prepare_strip(data, i, rows_per_strip, height, width, samples, dtype, + bytes_per_sample, predictor: int, compression, + compression_level=None, max_z_error: float = 0.0): + """Extract and compress a single strip. Thread-safe.""" + r0 = i * rows_per_strip + r1 = min(r0 + rows_per_strip, height) + strip_rows = r1 - r0 + + if compression == COMPRESSION_JPEG: + strip_data = np.ascontiguousarray(data[r0:r1]).tobytes() + return jpeg_compress(strip_data, width, strip_rows, samples) + if predictor != 1 and compression != COMPRESSION_NONE: + strip_arr = np.ascontiguousarray(data[r0:r1]) + buf = strip_arr.view(np.uint8).ravel().copy() + buf = _apply_predictor_encode( + buf, predictor, width, strip_rows, bytes_per_sample, samples) + strip_data = buf.tobytes() + else: + strip_data = np.ascontiguousarray(data[r0:r1]).tobytes() + + if compression == COMPRESSION_JPEG2000: + from ._compression import jpeg2000_compress + return jpeg2000_compress( + strip_data, width, strip_rows, samples=samples, dtype=dtype) + if compression == COMPRESSION_LERC: + from ._compression import lerc_compress + return lerc_compress( + strip_data, width, strip_rows, samples=samples, dtype=dtype, + max_z_error=max_z_error) + if compression_level is None: + return compress(strip_data, compression) + return compress(strip_data, compression, level=compression_level) + + def _write_stripped(data: np.ndarray, compression: int, predictor: int, rows_per_strip: int = 256, compression_level: int | None = None, max_z_error: float = 0.0) -> tuple[list, list, list]: """Compress data as strips. + For compressed formats (deflate, lzw, zstd, lz4, ...) strips are + compressed in parallel using a thread pool: zlib, zstandard, lz4, + and the Numba LZW kernel all release the GIL during compression. + Returns ------- (offsets_placeholder, byte_counts, compressed_chunks) @@ -668,53 +714,60 @@ def _write_stripped(data: np.ndarray, compression: int, predictor: int, dtype = data.dtype bytes_per_sample = dtype.itemsize - strips = [] + num_strips = math.ceil(height / rows_per_strip) + + total_bytes = int(data.nbytes) + + # Sequential path: uncompressed, few strips, or small payload. The + # threshold mirrors the tile writer so we don't pay thread-pool + # overhead on tiny rasters. 
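+    # Worked example with the defaults (rows_per_strip=256,
+    # _PARALLEL_MIN_BYTES=4 MiB): a 4096x4096 float32 band is 64 MiB
+    # across 16 strips, so it parallelizes; a 512x512 float32 band is
+    # 1 MiB across 2 strips, so it stays sequential.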
+ use_parallel = ( + compression != COMPRESSION_NONE + and num_strips > 2 + and total_bytes > _PARALLEL_MIN_BYTES + ) + + if not use_parallel: + strips = [] + rel_offsets = [] + byte_counts = [] + current_offset = 0 + for i in range(num_strips): + compressed = _prepare_strip( + data, i, rows_per_strip, height, width, samples, dtype, + bytes_per_sample, predictor, compression, + compression_level, max_z_error, + ) + rel_offsets.append(current_offset) + byte_counts.append(len(compressed)) + strips.append(compressed) + current_offset += len(compressed) + return rel_offsets, byte_counts, strips + + # Parallel strip compression -- zlib/zstd/lz4/LZW all release the GIL. + from concurrent.futures import ThreadPoolExecutor + import os + + n_workers = min(num_strips, os.cpu_count() or 4) + with ThreadPoolExecutor(max_workers=n_workers) as pool: + compressed_strips = list(pool.map( + lambda i: _prepare_strip( + data, i, rows_per_strip, height, width, samples, dtype, + bytes_per_sample, predictor, compression, + compression_level, max_z_error, + ), + range(num_strips), + )) + rel_offsets = [] byte_counts = [] current_offset = 0 - - num_strips = math.ceil(height / rows_per_strip) - for i in range(num_strips): - r0 = i * rows_per_strip - r1 = min(r0 + rows_per_strip, height) - strip_rows = r1 - r0 - - if compression == COMPRESSION_JPEG: - strip_data = np.ascontiguousarray(data[r0:r1]).tobytes() - compressed = jpeg_compress(strip_data, width, strip_rows, samples) - elif predictor != 1 and compression != COMPRESSION_NONE: - strip_arr = np.ascontiguousarray(data[r0:r1]) - buf = strip_arr.view(np.uint8).ravel().copy() - buf = _apply_predictor_encode( - buf, predictor, width, strip_rows, bytes_per_sample, samples) - strip_data = buf.tobytes() - if compression_level is None: - compressed = compress(strip_data, compression) - else: - compressed = compress(strip_data, compression, level=compression_level) - else: - strip_data = np.ascontiguousarray(data[r0:r1]).tobytes() - - if compression == COMPRESSION_JPEG2000: - from ._compression import jpeg2000_compress - compressed = jpeg2000_compress( - strip_data, width, strip_rows, samples=samples, dtype=dtype) - elif compression == COMPRESSION_LERC: - from ._compression import lerc_compress - compressed = lerc_compress( - strip_data, width, strip_rows, samples=samples, dtype=dtype, - max_z_error=max_z_error) - elif compression_level is None: - compressed = compress(strip_data, compression) - else: - compressed = compress(strip_data, compression, level=compression_level) - + for cs in compressed_strips: rel_offsets.append(current_offset) - byte_counts.append(len(compressed)) - strips.append(compressed) - current_offset += len(compressed) + byte_counts.append(len(cs)) + current_offset += len(cs) - return rel_offsets, byte_counts, strips + return rel_offsets, byte_counts, compressed_strips # --------------------------------------------------------------------------- @@ -841,8 +894,13 @@ def _write_tiled(data: np.ndarray, compression: int, predictor: int, return rel_offsets, byte_counts, tiles - if n_tiles <= 4: - # Very few tiles: sequential (thread pool overhead not worth it) + # Sequential path: very few tiles, or small total payload. A previous + # ``n_tiles <= 4`` cutoff sent ``tile_size=1024`` writes on a 2048x2048 + # image down the serial path (n_tiles=4) and made them ~8x slower than + # the parallel path. 
Switching to a bytes-based threshold lets + # large-tile writes parallelize while still skipping the pool on + # small rasters where its setup cost dominates. + if n_tiles <= 2 or int(data.nbytes) <= _PARALLEL_MIN_BYTES: tiles = [] rel_offsets = [] byte_counts = [] @@ -1562,6 +1620,77 @@ def _compress_block(arr, block_w, block_h, samples, dtype, bytes_per_sample, # Streaming writer (dask -> monolithic TIFF without full materialisation) # --------------------------------------------------------------------------- +def _compute_classic_ifd_overhead(tags: list) -> int: + """Return the on-disk size of the classic-TIFF IFD for ``tags``. + + Sums the fixed IFD block (entry count + 12 bytes per entry + next-IFD + pointer) and the variable overflow heap (values whose serialised size + exceeds the 4-byte inline limit, including ASCII strings such as + ``gdal_metadata`` and user-supplied ``extra_tags``). + + The heap size is recovered by building the IFD with + ``_build_ifd(tags, overflow_base=0, bigtiff=False)`` and measuring the + returned overflow buffer; this matches the bytes the streaming writer + will actually emit, with no fudge constant. + """ + num_tags = len(tags) + # classic IFD: 2-byte count + 12-byte entries + 4-byte next-IFD pointer + ifd_block_size = 2 + 12 * num_tags + 4 + _, overflow_bytes = _build_ifd(tags, overflow_base=0, bigtiff=False) + return ifd_block_size + len(overflow_bytes) + + +def _should_use_bigtiff_streaming(uncompressed_bytes: int, + n_entries: int, + ifd_overhead_bytes: int, + header_size_classic: int = 8) -> bool: + """Decide whether the streaming writer must emit BigTIFF. + + Classic TIFF stores offsets as uint32, so the file size addressable + via classic offsets is at most ``UINT32_MAX`` bytes (offsets run + ``0..UINT32_MAX - 1``). The streaming writer appends pixel data after + the header and IFD, so the final file size is + ``header + ifd + overflow + strip_table + uncompressed_bytes``. + + The comparison is ``> UINT32_MAX`` to match the eager + ``_assemble_tiff`` decision (``estimated_file_size > UINT32_MAX``): + a file that is exactly ``UINT32_MAX`` bytes still fits classic. + + See issue #1785 and the Copilot review on PR #1787: the previous + helper applied a 200-byte fudge for IFD overhead, which silently + underestimated when ``gdal_metadata_xml`` or large ``extra_tags`` + pushed the actual overflow heap well past that constant. + + Parameters + ---------- + uncompressed_bytes : int + Total pixel-data bytes that will be written after the IFD. + n_entries : int + Number of strip or tile entries; each contributes a LONG offset + (4 bytes) plus a LONG byte-count (4 bytes) to the overflow heap. + Pass ``0`` if ``ifd_overhead_bytes`` already covers the strip + table (the streaming-writer caller does this by passing the + actual tag list through ``_compute_classic_ifd_overhead``). + ifd_overhead_bytes : int + Classic-TIFF IFD size: fixed entry block plus variable overflow + heap (ASCII metadata, geo tags, strip/tile offset arrays, etc.). + Computed via ``_compute_classic_ifd_overhead(tags)``. + header_size_classic : int, optional + Classic-TIFF header size (8 bytes). + """ + # strip/tile-table overhead is 8 bytes per entry (LONG offset + LONG + # byte count). If the caller already accounted for the offset arrays + # inside ``ifd_overhead_bytes`` they should pass n_entries=0. 
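+    # Worked example: 65536x65536 uint8 pixels are 4_294_967_296 bytes
+    # (UINT32_MAX + 1), so BigTIFF is forced even with zero overhead;
+    # 65535x65535 (4_294_836_225 bytes) leaves 131_070 bytes of headroom,
+    # so classic offsets survive unless the IFD overhead exceeds that.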
+ strip_table_overhead = n_entries * 8 + reserved_overhead = ( + header_size_classic + ifd_overhead_bytes + strip_table_overhead + ) + UINT32_MAX = 0xFFFFFFFF + # ``> UINT32_MAX`` matches the eager path's + # ``estimated_file_size > UINT32_MAX`` check in ``_assemble_tiff``. + return uncompressed_bytes + reserved_overhead > UINT32_MAX + + def write_streaming(dask_data, path: str, *, geo_transform: 'GeoTransform | None' = None, crs_epsg: int | None = None, @@ -1649,17 +1778,18 @@ def write_streaming(dask_data, path: str, *, rows_per_strip = min(256, height) n_entries = math.ceil(height / rows_per_strip) - # BigTIFF detection (use uncompressed size as conservative estimate) + # BigTIFF detection has to wait until the full tag list is built so + # that variable-length payloads (gdal_metadata, geo tags, user + # extra_tags) feed into the IFD-overhead calculation. Build the tag + # list assuming classic offsets first, then decide BigTIFF, then + # promote the strip/tile offset arrays to LONG8 if needed. See + # issue #1785 and the Copilot review on PR #1787. uncompressed_bytes = height * width * bytes_per_sample * samples - UINT32_MAX = 0xFFFFFFFF - if bigtiff is not None: - use_bigtiff = bigtiff - else: - use_bigtiff = uncompressed_bytes > UINT32_MAX - - header_size = 16 if use_bigtiff else 8 # ---- Build tag list (mirrors _assemble_tiff for level 0) ---- + # Start with classic offset types; the offset arrays are promoted to + # LONG8 below once BigTIFF is chosen. + use_bigtiff = bool(bigtiff) if bigtiff is not None else False tags = [] tags.append((TAG_IMAGE_WIDTH, LONG, 1, width)) tags.append((TAG_IMAGE_LENGTH, LONG, 1, height)) @@ -1714,10 +1844,11 @@ def write_streaming(dask_data, path: str, *, if resolution_unit is not None: tags.append((TAG_RESOLUTION_UNIT, SHORT, 1, resolution_unit)) - # Layout tags with placeholder offsets / byte-counts. BigTIFF - # needs 64-bit offsets (LONG8) since strip/tile positions can - # exceed 4 GB; classic TIFF uses LONG (uint32). - offset_type = LONG8 if use_bigtiff else LONG + # Layout tags with placeholder offsets / byte-counts. Use classic + # LONG (uint32) here; if the auto-BigTIFF decision below promotes + # the file, ``_promote_offsets_to_long8`` retypes these to LONG8. + # A caller-forced ``bigtiff=True`` is also resolved at that point. + offset_type = LONG placeholder = [0] * n_entries if tiled: tags.append((TAG_TILE_WIDTH, SHORT, 1, tile_size)) @@ -1769,6 +1900,31 @@ def write_streaming(dask_data, path: str, *, and etag_id not in _DANGEROUS_EXTRA_TAG_IDS): tags.append((etag_id, etype_id, ecount, evalue)) + # ---- BigTIFF decision (auto path) ---- + # Compute the real classic-TIFF IFD overhead from the actual tag + # list, including overflow heap (gdal_metadata, geo ascii params, + # strip/tile offset arrays, user extra_tags). This replaces the + # 200-byte fudge constant the original PR used; with metadata-heavy + # writes that constant silently underestimated overhead and let + # sub-4 GiB rasters overflow classic offsets late in the write. + # See issue #1785 and the Copilot review on PR #1787. + if bigtiff is None: + ifd_overhead_bytes = _compute_classic_ifd_overhead(tags) + # n_entries=0 because the strip/tile offset arrays are already + # inside ``tags`` and therefore in ``ifd_overhead_bytes``. 
+ use_bigtiff = _should_use_bigtiff_streaming( + uncompressed_bytes, + n_entries=0, + ifd_overhead_bytes=ifd_overhead_bytes, + header_size_classic=8, + ) + + header_size = 16 if use_bigtiff else 8 + + # Promote the strip/tile offset arrays to LONG8 once BigTIFF is set. + if use_bigtiff: + tags = _promote_offsets_to_long8(tags) + # ---- Pre-compute IFD reservation size ---- sorted_tags = sorted(tags, key=lambda t: t[0]) entry_size = 20 if use_bigtiff else 12 diff --git a/xrspatial/geotiff/tests/test_decompression_caps.py b/xrspatial/geotiff/tests/test_decompression_caps.py index 09f49326..b3aae94e 100644 --- a/xrspatial/geotiff/tests/test_decompression_caps.py +++ b/xrspatial/geotiff/tests/test_decompression_caps.py @@ -49,6 +49,7 @@ def _module_available(name: str) -> bool: _HAS_LZ4 = _module_available("lz4.frame") _HAS_LERC = _module_available("lerc") _HAS_GLYMUR = _module_available("glymur") +_HAS_PILLOW = _module_available("PIL") # --------------------------------------------------------------------------- @@ -473,3 +474,184 @@ def __getitem__(self, _): _compression.jpeg2000_decompress( blob, width=64, height=64, samples=1, expected_size=arr.nbytes) + + +# --------------------------------------------------------------------------- +# JPEG (issue #1792) +# --------------------------------------------------------------------------- +# +# Pillow has its own DecompressionBombError, but it fires only at +# ~178M pixels (~500 MB RGB). A malicious TIFF can declare a small tile +# (e.g. 256x256 RGB, ~196 KiB expected) while shipping a JPEG payload +# whose SOF marker declares a much larger image; that lets ~500 MB +# allocate per tile before the downstream chunk.size != expected +# reshape check fires. ``jpeg_decompress`` now parses the JPEG SOF +# marker and raises before handing the blob to Pillow when the +# declared output exceeds ``expected * 1.05 + 1`` bytes. See +# https://github.com/xarray-contrib/xarray-spatial/issues/1792 . + +def _forge_jpeg_with_sof_dimensions(real_h: int, real_w: int, + real_c: int, + declared_h: int, + declared_w: int) -> bytes: + """Build a real JPEG and rewrite the SOF marker's H/W fields. + + Pillow encodes a small valid image so the bytestream is a complete + JPEG; we then overwrite the height/width fields in the SOF segment + so the SOF claims a much larger image than the payload can decode. + The decoder never gets the chance to fail on the mismatch because + the pre-decode cap fires first -- which is the property under test. + """ + from PIL import Image + import io + mode = 'RGB' if real_c == 3 else 'L' + img = Image.new(mode, (real_w, real_h), color=0) + buf = io.BytesIO() + img.save(buf, format='JPEG', quality=75) + data = bytearray(buf.getvalue()) + # Find SOF0..SOF3,SOF5..SOF7,SOF9..SOF11,SOF13..SOF15 marker. 
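+    # Marker-scan notes (standard JPEG structure): every segment starts
+    # with 0xFF + code. SOI/EOI, RSTn (0xD0-0xD7) and TEM (0x01) carry no
+    # length field, hence the bare ``i += 2`` skip below; every other
+    # segment embeds a big-endian 2-byte length that includes itself.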
+ sof_codes = { + 0xC0, 0xC1, 0xC2, 0xC3, 0xC5, 0xC6, 0xC7, + 0xC9, 0xCA, 0xCB, 0xCD, 0xCE, 0xCF, + } + i = 2 + n = len(data) + while i + 3 < n: + if data[i] != 0xFF: + raise AssertionError("forged JPEG lost marker alignment") + marker = data[i + 1] + if marker in sof_codes: + # SOF: 0xFF Cx | len(2) | precision(1) | H(2) | W(2) | components(1) + data[i + 5] = (declared_h >> 8) & 0xFF + data[i + 6] = declared_h & 0xFF + data[i + 7] = (declared_w >> 8) & 0xFF + data[i + 8] = declared_w & 0xFF + return bytes(data) + if marker == 0xD9 or 0xD0 <= marker <= 0xD7 or marker == 0x01: + i += 2 + continue + seg_len = (data[i + 2] << 8) | data[i + 3] + i += 2 + seg_len + raise AssertionError("forged JPEG has no SOF marker") + + +@pytest.mark.skipif(not _HAS_PILLOW, reason="Pillow not installed") +class TestJpegDirect: + def test_jpeg_bomb_raises(self): + """A JPEG whose SOF dimensions exceed the per-tile cap must raise. + + The forged JPEG payload itself is small (a real 16x16 image with + the SOF marker rewritten to declare 8000x8000x3). The wrapper + is given a per-tile expected size of 32x32x3 = 3072 bytes; the + declared output 8000*8000*3 = 192_000_000 bytes is well above + the ``3072 * 1.05 + 1`` cap and the decode must be refused. + """ + blob = _forge_jpeg_with_sof_dimensions( + real_h=16, real_w=16, real_c=3, + declared_h=8000, declared_w=8000, + ) + from xrspatial.geotiff._compression import jpeg_decompress + # Match the full diagnostic so a regression that swaps in a + # different error path (e.g. Pillow's own DecompressionBombError + # with a different wording, or a numeric overflow before the + # explicit guard) fails the test instead of silently passing. + with pytest.raises( + ValueError, + match=r"jpeg decode would exceed.*Likely a decompression bomb", + ): + jpeg_decompress(blob, width=32, height=32, samples=3) + + def test_jpeg_legitimate_passes(self): + """A JPEG whose SOF dimensions match the expected tile size passes.""" + from PIL import Image + import io + img = Image.new('RGB', (32, 32), color=(10, 20, 30)) + buf = io.BytesIO() + img.save(buf, format='JPEG', quality=90) + from xrspatial.geotiff._compression import jpeg_decompress + out = jpeg_decompress(buf.getvalue(), width=32, height=32, samples=3) + arr = np.frombuffer(out, dtype=np.uint8).reshape(32, 32, 3) + assert arr.shape == (32, 32, 3) + + def test_jpeg_no_cap_when_size_kwargs_default(self): + """Backward-compat: omitting size kwargs falls back to Pillow's guard. + + Direct callers and round-trip tests pass no dimensions; the + pre-check must be a no-op so those keep working. + """ + blob = _forge_jpeg_with_sof_dimensions( + real_h=16, real_w=16, real_c=3, + declared_h=64, declared_w=64, + ) + from xrspatial.geotiff._compression import jpeg_decompress + # With no dimension kwargs, the cap is disabled. The forged JPEG + # declares 64x64 but encodes only 16x16 of payload -- libjpeg + # raises on the truncation; the bomb cap is what we're checking + # is *not* the source of any exception here. Catch whatever + # Pillow raises and assert it isn't our bomb message. + from PIL import Image as _Img # noqa: F401 + try: + jpeg_decompress(blob) + except ValueError as exc: + assert "Likely a decompression bomb" not in str(exc) + except Exception: + # libjpeg/Pillow errors are acceptable -- we only care that + # the bomb cap did not fire. + pass + + def test_jpeg_malformed_falls_through_to_pillow(self): + """A JPEG without a parseable SOF defers to Pillow's own guard. 
+ + We don't want the pre-check to misclassify weird-but-valid + streams; if the helper can't read the SOF it should return + ``None`` and let Pillow raise its own error. + """ + # SOI followed by EOI -- a syntactically valid but empty stream + # with no SOF marker. + blob = bytes([0xFF, 0xD8, 0xFF, 0xD9]) + from xrspatial.geotiff._compression import jpeg_decompress + # No SOF -> bomb cap returns None -> Pillow raises on the empty + # stream. + with pytest.raises(Exception): + jpeg_decompress(blob, width=32, height=32, samples=3) + + def test_jpeg_sof_with_truncated_segment_length_returns_none(self): + """A SOF segment whose declared length runs past EOF returns None. + + Without segment-length validation, ``_read_jpeg_sof`` would happily + read height/width/components at fixed offsets even when those + offsets pointed past the segment. The pre-check now demands + ``seg_len >= 8`` and ``i + 2 + seg_len <= n`` before reading; + truncated SOFs are treated as "unknown size" and the bomb cap + defers to Pillow. + """ + from xrspatial.geotiff._compression import _read_jpeg_sof + + # SOI | SOF0 | seg_len=64 (advertises 64 bytes of segment, but + # the buffer ends after only 10 bytes of segment payload). + # Truncation -> _read_jpeg_sof must return None. + truncated = bytes([ + 0xFF, 0xD8, # SOI + 0xFF, 0xC0, # SOF0 + 0x00, 0x40, # seg_len = 64 (lies; buf is short) + 0x08, # precision = 8 + 0x10, 0x00, # height = 4096 + 0x10, 0x00, # width = 4096 + 0x03, # components = 3 + ]) + assert _read_jpeg_sof(truncated) is None + + def test_jpeg_sof_with_segment_length_below_minimum_returns_none(self): + """A SOF segment whose declared length is < 8 returns None.""" + from xrspatial.geotiff._compression import _read_jpeg_sof + + too_small = bytes([ + 0xFF, 0xD8, # SOI + 0xFF, 0xC0, # SOF0 + 0x00, 0x04, # seg_len = 4 (too small for SOF) + 0x08, + 0x10, 0x00, + 0x10, 0x00, + 0x03, + ]) + assert _read_jpeg_sof(too_small) is None diff --git a/xrspatial/geotiff/tests/test_geotiff_band_bool_rejection_1786.py b/xrspatial/geotiff/tests/test_geotiff_band_bool_rejection_1786.py new file mode 100644 index 00000000..dc2391fe --- /dev/null +++ b/xrspatial/geotiff/tests/test_geotiff_band_bool_rejection_1786.py @@ -0,0 +1,290 @@ +"""Regression tests for issue #1786. + +Every non-VRT read path range-checks ``band`` but does not reject +``bool``. Because ``isinstance(True, int)`` is True in Python and +``True < N`` evaluates as ``1 < N``, ``band=True`` silently reads +band 1 and ``band=False`` reads band 0. The VRT path +(``_vrt.read_vrt``) already rejects bools up front (#1673 follow-up) +so the API contract is inconsistent across read paths. + +These tests pin every read entry point -- ``read_to_array`` (local +and HTTP), ``open_geotiff``, ``read_geotiff_dask``, +``read_geotiff_gpu`` (when cupy is available), and ``read_vrt`` -- +to the same rejection so all four backends agree: ``band`` must be +a non-negative int, never a bool. 
+""" +from __future__ import annotations + +import importlib.util +import uuid + +import numpy as np +import pytest +import xarray as xr + + +def _gpu_available() -> bool: + if importlib.util.find_spec("cupy") is None: + return False + try: + import cupy + return bool(cupy.cuda.is_available()) + except Exception: + return False + + +_HAS_GPU = _gpu_available() +_gpu_only = pytest.mark.skipif(not _HAS_GPU, reason="cupy + CUDA required") + + +@pytest.fixture +def multiband_tiff_path(tmp_path): + """4x6 three-band tiled tiff for the bool-rejection tests.""" + from xrspatial.geotiff import to_geotiff + + arr = np.arange(72, dtype=np.float32).reshape(4, 6, 3) + da = xr.DataArray( + arr, + dims=['y', 'x', 'band'], + coords={ + 'y': np.array([0.5, 1.5, 2.5, 3.5]), + 'x': np.array([0.5, 1.5, 2.5, 3.5, 4.5, 5.5]), + 'band': [0, 1, 2], + }, + attrs={'crs': 4326}, + ) + p = tmp_path / 'mb_1786.tif' + to_geotiff(da, str(p), tile_size=16) + return str(p), arr + + +def _write_vrt_xml(vrt_path: str, source_filename: str, size_h: int, + size_w: int, n_bands: int) -> None: + bands_xml = "" + for b in range(1, n_bands + 1): + bands_xml += ( + f' \n' + ' \n' + f' {source_filename}' + '\n' + f' {b}\n' + f' \n' + f' \n' + ' \n' + ' \n' + ) + xml = ( + f'\n' + ' 0, 1, 0, 0, 0, -1\n' + f'{bands_xml}' + '\n' + ) + with open(vrt_path, 'w') as f: + f.write(xml) + + +@pytest.fixture +def multiband_vrt_path(tmp_path, multiband_tiff_path): + """A 3-band VRT wrapping the same multi-band TIFF used above.""" + src_tif, _ = multiband_tiff_path + d = tmp_path / f'vrt_1786_{uuid.uuid4().hex[:8]}' + d.mkdir() + # The VRT needs the source TIFF inside (or under an allowed root) + # for path-containment (#1671). Copy bytes rather than symlink so + # the test does not depend on the platform's symlink behaviour. 
+ import shutil + local_tif = d / 'data.tif' + shutil.copy(src_tif, local_tif) + vrt_path = d / 'mosaic.vrt' + _write_vrt_xml(str(vrt_path), 'data.tif', size_h=4, size_w=6, + n_bands=3) + return str(vrt_path) + + +# --------------------------------------------------------------------------- +# read_to_array (local eager path) +# --------------------------------------------------------------------------- + + +def test_read_to_array_band_true_rejected(multiband_tiff_path): + """``band=True`` no longer silently reads band 1.""" + from xrspatial.geotiff._reader import read_to_array + + path, _ = multiband_tiff_path + with pytest.raises(ValueError, match="band must be a non-negative int"): + read_to_array(path, band=True) + + +def test_read_to_array_band_false_rejected(multiband_tiff_path): + """``band=False`` no longer silently reads band 0.""" + from xrspatial.geotiff._reader import read_to_array + + path, _ = multiband_tiff_path + with pytest.raises(ValueError, match="band must be a non-negative int"): + read_to_array(path, band=False) + + +def test_read_to_array_band_zero_still_works(multiband_tiff_path): + """``band=0`` is a plain int and still selects band 0.""" + from xrspatial.geotiff._reader import read_to_array + + path, arr = multiband_tiff_path + out, _ = read_to_array(path, band=0) + np.testing.assert_array_equal(out, arr[:, :, 0]) + + +def test_read_to_array_band_one_still_works(multiband_tiff_path): + """``band=1`` is a plain int and still selects band 1.""" + from xrspatial.geotiff._reader import read_to_array + + path, arr = multiband_tiff_path + out, _ = read_to_array(path, band=1) + np.testing.assert_array_equal(out, arr[:, :, 1]) + + +# --------------------------------------------------------------------------- +# open_geotiff (public dispatcher) +# --------------------------------------------------------------------------- + + +def test_open_geotiff_band_true_rejected(multiband_tiff_path): + """The public ``open_geotiff`` entry point rejects ``band=True``.""" + from xrspatial.geotiff import open_geotiff + + path, _ = multiband_tiff_path + with pytest.raises(ValueError, match="band must be a non-negative int"): + open_geotiff(path, band=True) + + +def test_open_geotiff_band_false_rejected(multiband_tiff_path): + """``open_geotiff(..., band=False)`` is rejected the same way.""" + from xrspatial.geotiff import open_geotiff + + path, _ = multiband_tiff_path + with pytest.raises(ValueError, match="band must be a non-negative int"): + open_geotiff(path, band=False) + + +# --------------------------------------------------------------------------- +# read_geotiff_dask (dask CPU path) +# --------------------------------------------------------------------------- + + +def test_read_geotiff_dask_band_true_rejected(multiband_tiff_path): + """``read_geotiff_dask(..., band=True)`` is rejected before scheduling.""" + from xrspatial.geotiff import read_geotiff_dask + + path, _ = multiband_tiff_path + with pytest.raises(ValueError, match="band must be a non-negative int"): + read_geotiff_dask(path, chunks=4, band=True) + + +def test_read_geotiff_dask_band_false_rejected(multiband_tiff_path): + """``read_geotiff_dask(..., band=False)`` raises the same way.""" + from xrspatial.geotiff import read_geotiff_dask + + path, _ = multiband_tiff_path + with pytest.raises(ValueError, match="band must be a non-negative int"): + read_geotiff_dask(path, chunks=4, band=False) + + +# --------------------------------------------------------------------------- +# read_geotiff_gpu (GPU path) +# 
--------------------------------------------------------------------------- + + +@_gpu_only +def test_read_geotiff_gpu_band_true_rejected(multiband_tiff_path): + """``read_geotiff_gpu(..., band=True)`` is rejected (cupy required).""" + from xrspatial.geotiff import read_geotiff_gpu + + path, _ = multiband_tiff_path + with pytest.raises(ValueError, match="band must be a non-negative int"): + read_geotiff_gpu(path, band=True) + + +@_gpu_only +def test_read_geotiff_gpu_band_false_rejected(multiband_tiff_path): + """``read_geotiff_gpu(..., band=False)`` raises the same way.""" + from xrspatial.geotiff import read_geotiff_gpu + + path, _ = multiband_tiff_path + with pytest.raises(ValueError, match="band must be a non-negative int"): + read_geotiff_gpu(path, band=False) + + +# --------------------------------------------------------------------------- +# read_vrt (regression: was already rejecting bool; should keep doing so) +# --------------------------------------------------------------------------- + + +def test_read_vrt_band_true_still_rejected(multiband_vrt_path): + """VRT path's existing bool rejection remains in place.""" + from xrspatial.geotiff import read_vrt + + with pytest.raises(ValueError, match="band must be a non-negative int"): + read_vrt(multiband_vrt_path, band=True) + + +def test_read_vrt_band_false_still_rejected(multiband_vrt_path): + """VRT path rejects ``band=False`` as well.""" + from xrspatial.geotiff import read_vrt + + with pytest.raises(ValueError, match="band must be a non-negative int"): + read_vrt(multiband_vrt_path, band=False) + + +# --------------------------------------------------------------------------- +# np.bool_ parity: ``isinstance(np.bool_(True), bool)`` is False so it +# bypasses a plain ``isinstance(band, bool)`` guard and is then treated +# as 1/0 by the range check. The VRT path's +# ``not isinstance(band, (int, np.integer))`` clause already rejects it; +# every other read path must too so the four backends agree. 
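+#
+# A quick illustration of both traps (standard Python / NumPy semantics):
+#
+#     >>> isinstance(True, int), True < 3
+#     (True, True)
+#     >>> isinstance(np.bool_(True), bool), int(np.bool_(True))
+#     (False, 1)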
+# --------------------------------------------------------------------------- + + +def test_read_to_array_band_np_bool_rejected(multiband_tiff_path): + """Local file path rejects ``band=np.bool_(True)``.""" + from xrspatial.geotiff._reader import read_to_array + + path, _ = multiband_tiff_path + with pytest.raises(ValueError, match="band must be a non-negative int"): + read_to_array(path, band=np.bool_(True)) + + +def test_open_geotiff_band_np_bool_rejected(multiband_tiff_path): + """``open_geotiff`` rejects ``band=np.bool_(False)``.""" + from xrspatial.geotiff import open_geotiff + + path, _ = multiband_tiff_path + with pytest.raises(ValueError, match="band must be a non-negative int"): + open_geotiff(path, band=np.bool_(False)) + + +def test_read_geotiff_dask_band_np_bool_rejected(multiband_tiff_path): + """``read_geotiff_dask`` rejects ``band=np.bool_(True)``.""" + from xrspatial.geotiff import read_geotiff_dask + + path, _ = multiband_tiff_path + with pytest.raises(ValueError, match="band must be a non-negative int"): + read_geotiff_dask(path, band=np.bool_(True)) + + +@_gpu_only +def test_read_geotiff_gpu_band_np_bool_rejected(multiband_tiff_path): + """``read_geotiff_gpu`` rejects ``band=np.bool_(True)``.""" + from xrspatial.geotiff import read_geotiff_gpu + + path, _ = multiband_tiff_path + with pytest.raises(ValueError, match="band must be a non-negative int"): + read_geotiff_gpu(path, band=np.bool_(True)) + + +def test_read_vrt_band_np_bool_still_rejected(multiband_vrt_path): + """VRT path already rejects ``np.bool_`` via its integer-type check.""" + from xrspatial.geotiff import read_vrt + + with pytest.raises(ValueError, match="band must be a non-negative int"): + read_vrt(multiband_vrt_path, band=np.bool_(True)) diff --git a/xrspatial/geotiff/tests/test_http_dask_orientation_1794.py b/xrspatial/geotiff/tests/test_http_dask_orientation_1794.py new file mode 100644 index 00000000..45d353d6 --- /dev/null +++ b/xrspatial/geotiff/tests/test_http_dask_orientation_1794.py @@ -0,0 +1,81 @@ +"""Remote dask reads must not bypass TIFF Orientation handling (#1794).""" +from __future__ import annotations + +import http.server +import socketserver +import threading + +import numpy as np +import pytest + +from xrspatial.geotiff import open_geotiff + +tifffile = pytest.importorskip("tifffile") + + +def _write_with_orientation(path, arr, orientation): + tifffile.imwrite( + str(path), + arr, + extratags=[(274, 'H', 1, orientation, True)], + ) + + +class _RangeHandler(http.server.BaseHTTPRequestHandler): + payload: bytes = b'' + + def do_GET(self): # noqa: N802 + rng = self.headers.get('Range') + if rng and rng.startswith('bytes='): + spec = rng[len('bytes='):] + start_s, _, end_s = spec.partition('-') + start = int(start_s) + end = int(end_s) if end_s else len(self.payload) - 1 + chunk = self.payload[start:end + 1] + self.send_response(206) + self.send_header('Content-Type', 'application/octet-stream') + self.send_header( + 'Content-Range', + f'bytes {start}-{start + len(chunk) - 1}/{len(self.payload)}', + ) + self.send_header('Content-Length', str(len(chunk))) + self.end_headers() + self.wfile.write(chunk) + return + self.send_response(200) + self.send_header('Content-Type', 'application/octet-stream') + self.send_header('Content-Length', str(len(self.payload))) + self.end_headers() + self.wfile.write(self.payload) + + def log_message(self, *_args, **_kwargs): + pass + + +def _serve(payload: bytes): + handler_cls = type( + 'RangeHandler1794', (_RangeHandler,), {'payload': payload} + ) + httpd 
= socketserver.TCPServer(('127.0.0.1', 0), handler_cls) + port = httpd.server_address[1] + thread = threading.Thread(target=httpd.serve_forever, daemon=True) + thread.start() + return httpd, port + + +def test_http_dask_read_rejects_non_default_orientation(tmp_path, monkeypatch): + monkeypatch.setenv('XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS', '1') + arr = np.arange(64, dtype=np.uint8).reshape(8, 8) + path = tmp_path / "tmp_1794_orientation_2.tif" + _write_with_orientation(path, arr, 2) + + payload = path.read_bytes() + httpd, port = _serve(payload) + try: + url = f'http://127.0.0.1:{port}/tmp_1794_orientation_2.tif' + with pytest.raises(ValueError, match="Orientation tag"): + open_geotiff(url, chunks=4) + finally: + httpd.shutdown() + httpd.server_close() + diff --git a/xrspatial/geotiff/tests/test_miniswhite_backend_parity_1797.py b/xrspatial/geotiff/tests/test_miniswhite_backend_parity_1797.py new file mode 100644 index 00000000..6520ada6 --- /dev/null +++ b/xrspatial/geotiff/tests/test_miniswhite_backend_parity_1797.py @@ -0,0 +1,120 @@ +"""MinIsWhite photometric handling must be backend-consistent (#1797).""" +from __future__ import annotations + +import http.server +import importlib.util +import socketserver +import threading + +import numpy as np +import pytest + +from xrspatial.geotiff import open_geotiff + +tifffile = pytest.importorskip("tifffile") + + +def _gpu_available() -> bool: + """True if cupy is importable and CUDA is initialised.""" + if importlib.util.find_spec("cupy") is None: + return False + try: + import cupy + return bool(cupy.cuda.is_available()) + except Exception: + return False + + +_HAS_GPU = _gpu_available() +_gpu_only = pytest.mark.skipif( + not _HAS_GPU, + reason="cupy + CUDA required", +) + + +class _RangeHandler(http.server.BaseHTTPRequestHandler): + payload: bytes = b'' + + def do_GET(self): # noqa: N802 + rng = self.headers.get('Range') + if rng and rng.startswith('bytes='): + spec = rng[len('bytes='):] + start_s, _, end_s = spec.partition('-') + start = int(start_s) + end = int(end_s) if end_s else len(self.payload) - 1 + chunk = self.payload[start:end + 1] + self.send_response(206) + self.send_header('Content-Type', 'application/octet-stream') + self.send_header( + 'Content-Range', + f'bytes {start}-{start + len(chunk) - 1}/{len(self.payload)}', + ) + self.send_header('Content-Length', str(len(chunk))) + self.end_headers() + self.wfile.write(chunk) + return + self.send_response(200) + self.send_header('Content-Type', 'application/octet-stream') + self.send_header('Content-Length', str(len(self.payload))) + self.end_headers() + self.wfile.write(self.payload) + + def log_message(self, *_args, **_kwargs): + pass + + +def _serve(payload: bytes): + handler_cls = type( + 'RangeHandler1797', (_RangeHandler,), {'payload': payload} + ) + httpd = socketserver.TCPServer(('127.0.0.1', 0), handler_cls) + port = httpd.server_address[1] + thread = threading.Thread(target=httpd.serve_forever, daemon=True) + thread.start() + return httpd, port + + +@pytest.fixture +def miniswhite_http_url(tmp_path, monkeypatch): + monkeypatch.setenv('XRSPATIAL_GEOTIFF_ALLOW_PRIVATE_HOSTS', '1') + stored = np.array([[0, 1, 2], [10, 128, 255]], dtype=np.uint8) + path = tmp_path / "tmp_1797_miniswhite.tif" + tifffile.imwrite(str(path), stored, photometric='miniswhite') + httpd, port = _serve(path.read_bytes()) + try: + yield f'http://127.0.0.1:{port}/tmp_1797_miniswhite.tif', stored + finally: + httpd.shutdown() + httpd.server_close() + + +def 
test_http_miniswhite_matches_local_reader(miniswhite_http_url): + url, stored = miniswhite_http_url + + got = open_geotiff(url) + + np.testing.assert_array_equal(got.values, np.iinfo(stored.dtype).max - stored) + + +def test_http_dask_miniswhite_matches_local_reader(miniswhite_http_url): + url, stored = miniswhite_http_url + + got = open_geotiff(url, chunks=2).compute() + + np.testing.assert_array_equal(got.values, np.iinfo(stored.dtype).max - stored) + + +@_gpu_only +def test_gpu_miniswhite_matches_cpu_reader(tmp_path): + from xrspatial.geotiff._writer import write + + stored = np.array([[0, 1, 2], [10, 128, 255]], dtype=np.uint8) + path = str(tmp_path / "tmp_1797_miniswhite_gpu.tif") + write(stored, path, compression='deflate', tiled=True, tile_size=16, + photometric='miniswhite') + + cpu = open_geotiff(path) + gpu = open_geotiff(path, gpu=True) + + np.testing.assert_array_equal(cpu.values, np.iinfo(stored.dtype).max - stored) + np.testing.assert_array_equal(gpu.data.get(), cpu.values) diff --git a/xrspatial/geotiff/tests/test_parallel_writer_1800.py b/xrspatial/geotiff/tests/test_parallel_writer_1800.py new file mode 100644 index 00000000..39baead0 --- /dev/null +++ b/xrspatial/geotiff/tests/test_parallel_writer_1800.py @@ -0,0 +1,276 @@ +"""Round-trip and threshold tests for the parallel strip/tile writer (#1800).""" +from __future__ import annotations + +from concurrent.futures import ThreadPoolExecutor + +import numpy as np +import pytest + +from xrspatial.geotiff._writer import ( + _PARALLEL_MIN_BYTES, + _write_stripped, + _write_tiled, + write, +) +from xrspatial.geotiff._reader import read_to_array +from xrspatial.geotiff._compression import ( + COMPRESSION_DEFLATE, + COMPRESSION_NONE, + _HAVE_LIBDEFLATE, + deflate_compress, +) + + +# -- Strip writer parity -------------------------------------------------- + + +def _make_data(h, w, dtype=np.float32, pattern='gradient'): + """Reproducible array used across tests.""" + n = h * w + if pattern == 'gradient': + return np.arange(n, dtype=dtype).reshape(h, w) + rng = np.random.RandomState(1800) + arr = rng.rand(h, w) * 1000 + return arr.astype(dtype) + + +@pytest.mark.parametrize('compression', ['deflate', 'lzw', 'zstd']) +@pytest.mark.parametrize('predictor', [False, True]) +def test_strip_writer_round_trip_large(tmp_path, compression, predictor): + """Multi-strip writes round-trip bit-identically through the parallel path.""" + expected = _make_data(1024, 768, pattern='random') + path = str(tmp_path / f'parallel_strip_1800_{compression}_{predictor}.tif') + write(expected, path, compression=compression, tiled=False, + predictor=predictor) + arr, _ = read_to_array(path) + np.testing.assert_array_equal(arr, expected) + + +@pytest.mark.parametrize('dtype', [np.uint8, np.uint16, np.int16, np.int32, + np.float32, np.float64]) +def test_strip_writer_dtypes(tmp_path, dtype): + """Parallel strip path preserves every supported numeric dtype.""" + if np.issubdtype(dtype, np.floating): + expected = _make_data(800, 400, dtype=dtype, pattern='random') + else: + info = np.iinfo(dtype) + rng = np.random.RandomState(1800) + expected = rng.randint(info.min, info.max, + size=(800, 400), dtype=dtype) + path = str(tmp_path / f'parallel_strip_1800_dtype_{dtype.__name__}.tif') + write(expected, path, compression='deflate', tiled=False) + arr, _ = read_to_array(path) + np.testing.assert_array_equal(arr, expected) + + +def test_strip_writer_small_takes_sequential_path(tmp_path): + """Below the byte threshold the parallel strip path is skipped. 
+ + The sequential branch is functionally identical, so the round-trip + check just guards against the threshold logic accidentally breaking + the small-payload case. + """ + expected = _make_data(32, 64, pattern='gradient') + assert expected.nbytes < _PARALLEL_MIN_BYTES + path = str(tmp_path / 'small_seq_strip_1800.tif') + write(expected, path, compression='deflate', tiled=False) + arr, _ = read_to_array(path) + np.testing.assert_array_equal(arr, expected) + + +def test_strip_writer_thread_pool_used_when_large(monkeypatch): + """A multi-MiB strip write must dispatch through ThreadPoolExecutor.""" + expected = _make_data(2048, 2048, dtype=np.float32, pattern='random') + assert expected.nbytes > _PARALLEL_MIN_BYTES + + used = {'pool': False} + + import concurrent.futures as cf + + class _Probe(cf.ThreadPoolExecutor): + def __init__(self, *a, **kw): + used['pool'] = True + super().__init__(*a, **kw) + + # The writer does ``from concurrent.futures import ThreadPoolExecutor`` + # inside the function, so patching the module attribute is enough. + monkeypatch.setattr(cf, 'ThreadPoolExecutor', _Probe) + + rel, bc, blobs = _write_stripped( + expected, COMPRESSION_DEFLATE, predictor=1, rows_per_strip=256) + assert used['pool'], 'parallel strip writer should have used ThreadPoolExecutor' + # And the output should still round-trip + import zlib + decoded = b''.join(zlib.decompress(b) for b in blobs) + rt = np.frombuffer(decoded, dtype=np.float32).reshape(expected.shape) + np.testing.assert_array_equal(rt, expected) + + +def test_strip_writer_uncompressed_stays_sequential(monkeypatch): + """``compression='none'`` never dispatches to the thread pool.""" + expected = _make_data(2048, 2048, dtype=np.float32, pattern='gradient') + assert expected.nbytes > _PARALLEL_MIN_BYTES + + used = {'pool': False} + + import concurrent.futures as cf + + class _Probe(cf.ThreadPoolExecutor): + def __init__(self, *a, **kw): + used['pool'] = True + super().__init__(*a, **kw) + + monkeypatch.setattr(cf, 'ThreadPoolExecutor', _Probe) + _write_stripped(expected, COMPRESSION_NONE, predictor=1, rows_per_strip=256) + assert not used['pool'], 'uncompressed strip writer must stay sequential' + + +# -- Tile writer adaptive threshold --------------------------------------- + + +def test_tile_writer_large_tile_size_parallelizes(monkeypatch): + """A 2048x2048 deflate write with tile_size=1024 (n_tiles=4) must run + in parallel after the threshold fix. + + Pre-fix, ``n_tiles <= 4`` shoved this case onto the serial path even + though the payload was 16 MiB; that produced ~8x slower writes. 
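+    (16 MiB here is exact: 2048 * 2048 pixels * 4-byte float32 =
+    16_777_216 bytes, split into four 1024x1024 tiles.)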
+ """ + expected = _make_data(2048, 2048, dtype=np.float32, pattern='random') + assert expected.nbytes > _PARALLEL_MIN_BYTES + + used = {'pool': False} + + import concurrent.futures as cf + + class _Probe(cf.ThreadPoolExecutor): + def __init__(self, *a, **kw): + used['pool'] = True + super().__init__(*a, **kw) + + monkeypatch.setattr(cf, 'ThreadPoolExecutor', _Probe) + _write_tiled( + expected, COMPRESSION_DEFLATE, predictor=1, tile_size=1024) + assert used['pool'], ( + 'tile writer with tile_size=1024 on 2048x2048 (n_tiles=4, 16 MiB) ' + 'must parallelize after the adaptive-threshold change' + ) + + +def test_tile_writer_small_payload_stays_sequential(monkeypatch): + """A small raster keeps the sequential path even with n_tiles > 2.""" + expected = _make_data(128, 128, dtype=np.float32, pattern='gradient') + assert expected.nbytes < _PARALLEL_MIN_BYTES + + used = {'pool': False} + + import concurrent.futures as cf + + class _Probe(cf.ThreadPoolExecutor): + def __init__(self, *a, **kw): + used['pool'] = True + super().__init__(*a, **kw) + + monkeypatch.setattr(cf, 'ThreadPoolExecutor', _Probe) + _write_tiled( + expected, COMPRESSION_DEFLATE, predictor=1, tile_size=32) + assert not used['pool'] + + +# -- libdeflate backend ---------------------------------------------------- + + +def test_deflate_compress_zlib_wire_compatible(): + """Output is decompressible by stdlib zlib regardless of backend.""" + import zlib + raw = (np.arange(1024, dtype=np.uint8) % 251).tobytes() * 64 + compressed = deflate_compress(raw, level=6) + assert zlib.decompress(compressed) == raw + + +def test_deflate_compress_fallback_when_libdeflate_missing(monkeypatch): + """When libdeflate is absent we route through stdlib zlib unchanged.""" + import zlib + + import xrspatial.geotiff._compression as comp_mod + monkeypatch.setattr(comp_mod, '_HAVE_LIBDEFLATE', False) + monkeypatch.setattr(comp_mod, '_libdeflate', None) + + raw = b'1800-deflate-fallback' * 4096 + blob = comp_mod.deflate_compress(raw, level=6) + assert zlib.decompress(blob) == raw + # Exact byte equality to ``zlib.compress`` at the same level (the + # fallback path is a direct call). + assert blob == zlib.compress(raw, 6) + + +@pytest.mark.skipif(not _HAVE_LIBDEFLATE, + reason='libdeflate not installed') +def test_deflate_compress_uses_libdeflate_when_available(): + """When libdeflate is installed, deflate output stays wire-compatible.""" + import zlib + raw = (np.arange(8192, dtype=np.uint8) % 251).tobytes() * 16 + blob = deflate_compress(raw, level=6) + assert zlib.decompress(blob) == raw + + +def test_libdeflate_compressor_cache_is_thread_local(): + """The cache lives in threading.local, so two threads see distinct dicts. + + Uses a ``threading.Barrier`` to force both tasks to occupy a worker + at the same time. Without that, ``ThreadPoolExecutor(max_workers=2)`` + is free to run both submissions on the same thread (if the first + returns before the second is scheduled), and the test would + intermittently pass with only one observed cache id. + """ + import threading + + import xrspatial.geotiff._compression as comp_mod + + if not comp_mod._HAVE_LIBDEFLATE: + pytest.skip('libdeflate not installed') + + seen_caches: dict[int, int] = {} + barrier = threading.Barrier(2, timeout=10) + + def grab(_tag): + # Both workers must reach the barrier before either proceeds, so + # the pool is forced to use both threads. 
+        barrier.wait()
+        comp_mod._libdeflate_compressor(6)
+        tid = threading.get_ident()
+        seen_caches[tid] = id(comp_mod._libdeflate_thread_local.cache)
+
+    with ThreadPoolExecutor(max_workers=2) as pool:
+        list(pool.map(grab, ['a', 'b']))
+
+    # Two distinct threads ran and each populated its own threading.local
+    # cache, so we should see two thread ids and two cache ids.
+    assert len(seen_caches) == 2, f'expected 2 threads, saw {len(seen_caches)}'
+    assert len(set(seen_caches.values())) == 2, (
+        f'expected 2 distinct caches, saw {seen_caches}'
+    )
+
+
+# -- End-to-end via write() ------------------------------------------------
+
+
+def test_write_strip_deflate_round_trip_multi_strip(tmp_path):
+    """Drive the writer entrypoint with a multi-strip deflate payload.
+
+    The reader doesn't care which path produced the bytes; this guards
+    the full write pipeline (predictor on, multiple strips).
+    """
+    expected = _make_data(900, 700, dtype=np.float32, pattern='random')
+    path = str(tmp_path / 'e2e_strip_1800.tif')
+    write(expected, path, compression='deflate', tiled=False, predictor=True)
+    arr, _ = read_to_array(path)
+    np.testing.assert_array_equal(arr, expected)
+
+
+def test_write_tiled_deflate_large_tile_round_trip(tmp_path):
+    """tile_size=1024 on 2048x2048 must round-trip through the parallel path."""
+    expected = _make_data(2048, 2048, dtype=np.float32, pattern='random')
+    path = str(tmp_path / 'e2e_tile1024_1800.tif')
+    write(expected, path, compression='deflate', tiled=True, tile_size=1024)
+    arr, _ = read_to_array(path)
+    np.testing.assert_array_equal(arr, expected)
diff --git a/xrspatial/geotiff/tests/test_read_geotiff_dask_vrt_kwargs_1795.py b/xrspatial/geotiff/tests/test_read_geotiff_dask_vrt_kwargs_1795.py
new file mode 100644
index 00000000..eb148a7e
--- /dev/null
+++ b/xrspatial/geotiff/tests/test_read_geotiff_dask_vrt_kwargs_1795.py
@@ -0,0 +1,56 @@
+"""Direct read_geotiff_dask(.vrt) must forward VRT kwargs (#1795)."""
+from __future__ import annotations
+
+import os
+
+import numpy as np
+import pytest
+
+from xrspatial.geotiff import to_geotiff, read_geotiff_dask
+
+
+def _write_vrt(vrt_path, source_name, *, bands=1):
+    band_xml = []
+    for i in range(bands):
+        band_xml.append(
+            f'  <VRTRasterBand dataType="Float32" band="{i + 1}">\n'
+            '    <SimpleSource>\n'
+            f'      <SourceFilename relativeToVRT="1">{source_name}'
+            '</SourceFilename>\n'
+            f'      <SourceBand>{i + 1}</SourceBand>\n'
+            '      <SrcRect xOff="0" yOff="0" xSize="6" ySize="4"/>\n'
+            '      <DstRect xOff="0" yOff="0" xSize="6" ySize="4"/>\n'
+            '    </SimpleSource>\n'
+            '  </VRTRasterBand>\n'
+        )
+    vrt_path.write_text(
+        '<VRTDataset rasterXSize="6" rasterYSize="4">\n'
+        + ''.join(band_xml)
+        + '</VRTDataset>\n'
+    )
+
+
+def test_direct_read_geotiff_dask_vrt_forwards_window_and_band(tmp_path):
+    arr = np.arange(4 * 6 * 2, dtype=np.float32).reshape(4, 6, 2)
+    src = tmp_path / "tmp_1795_source.tif"
+    to_geotiff(arr, str(src), compression='none')
+    vrt = tmp_path / "tmp_1795_source.vrt"
+    _write_vrt(vrt, os.path.basename(src), bands=2)
+
+    got = read_geotiff_dask(
+        str(vrt), chunks=2, window=(1, 2, 4, 6), band=1,
+    )
+
+    assert got.shape == (3, 4)
+    np.testing.assert_array_equal(got.values, arr[1:4, 2:6, 1])
+
+
+def test_direct_read_geotiff_dask_vrt_forwards_max_pixels(tmp_path):
+    arr = np.arange(24, dtype=np.float32).reshape(4, 6)
+    src = tmp_path / "tmp_1795_source_cap.tif"
+    to_geotiff(arr, str(src), compression='none')
+    vrt = tmp_path / "tmp_1795_source_cap.vrt"
+    _write_vrt(vrt, os.path.basename(src))
+
+    with pytest.raises(ValueError, match="exceed"):
+        read_geotiff_dask(str(vrt), chunks=2, max_pixels=10)
diff --git a/xrspatial/geotiff/tests/test_vrt_missing_sources_policy_1799.py b/xrspatial/geotiff/tests/test_vrt_missing_sources_policy_1799.py
new file mode 100644
index 00000000..217e62a2
--- /dev/null
+++ b/xrspatial/geotiff/tests/test_vrt_missing_sources_policy_1799.py
@@ -0,0 +1,51 @@
+"""VRT missing-source handling has an explicit policy (#1799)."""
+from __future__ import annotations
+
+import pytest
+
+from xrspatial.geotiff import read_vrt
+from xrspatial.geotiff import GeoTIFFFallbackWarning
+
+
+def _write_missing_source_vrt(path):
+    path.write_text(
+        '<VRTDataset rasterXSize="4" rasterYSize="4">\n'
+        '  <VRTRasterBand dataType="Byte" band="1">\n'
+        '    <SimpleSource>\n'
+        '      <SourceFilename relativeToVRT="1">missing.tif'
+        '</SourceFilename>\n'
+        '      <SourceBand>1</SourceBand>\n'
+        '      <SrcRect xOff="0" yOff="0" xSize="4" ySize="4"/>\n'
+        '      <DstRect xOff="0" yOff="0" xSize="4" ySize="4"/>\n'
+        '    </SimpleSource>\n'
+        '  </VRTRasterBand>\n'
+        '</VRTDataset>\n'
+    )
+
+
+def test_read_vrt_missing_sources_warns_and_records_hole(tmp_path):
+    vrt = tmp_path / "tmp_1799_missing_warn.vrt"
+    _write_missing_source_vrt(vrt)
+
+    with pytest.warns(GeoTIFFFallbackWarning, match="could not be read"):
+        da = read_vrt(str(vrt), missing_sources='warn')
+
+    assert 'vrt_holes' in da.attrs
+    assert da.attrs['vrt_holes'][0]['source'].endswith('missing.tif')
+
+
+def test_read_vrt_missing_sources_raise_fails_fast(tmp_path):
+    vrt = tmp_path / "tmp_1799_missing_raise.vrt"
+    _write_missing_source_vrt(vrt)
+
+    with pytest.raises((OSError, ValueError)):
+        read_vrt(str(vrt), missing_sources='raise')
+
+
+def test_read_vrt_missing_sources_validates_policy(tmp_path):
+    vrt = tmp_path / "tmp_1799_missing_bad_policy.vrt"
+    _write_missing_source_vrt(vrt)
+
+    with pytest.raises(ValueError, match="missing_sources"):
+        read_vrt(str(vrt), missing_sources='ignore')
diff --git a/xrspatial/geotiff/tests/test_vrt_source_max_pixels_1796.py b/xrspatial/geotiff/tests/test_vrt_source_max_pixels_1796.py
new file mode 100644
index 00000000..7ce72288
--- /dev/null
+++ b/xrspatial/geotiff/tests/test_vrt_source_max_pixels_1796.py
@@ -0,0 +1,35 @@
+"""VRT source reads must honor the caller's max_pixels budget (#1796)."""
+from __future__ import annotations
+
+import os
+
+import numpy as np
+import pytest
+
+from xrspatial.geotiff import to_geotiff, read_vrt
+
+
+def test_vrt_source_read_forwards_max_pixels(tmp_path):
+    """A tiny VRT output cannot force an oversized source-window decode."""
+    src = tmp_path / "tmp_1796_source.tif"
+    data = np.arange(16, dtype=np.uint8).reshape(4, 4)
+    to_geotiff(data, str(src), compression='none')
+
+    vrt = tmp_path / "tmp_1796_source_cap.vrt"
+    vrt.write_text(
+        '<VRTDataset rasterXSize="1" rasterYSize="1">\n'
+        '  <VRTRasterBand dataType="Byte" band="1">\n'
+        '    <SimpleSource>\n'
+        f'      <SourceFilename relativeToVRT="1">{os.path.basename(src)}'
+        '</SourceFilename>\n'
+        '      <SourceBand>1</SourceBand>\n'
+        '      <SrcRect xOff="0" yOff="0" xSize="4" ySize="4"/>\n'
+        '      <DstRect xOff="0" yOff="0" xSize="1" ySize="1"/>\n'
+        '    </SimpleSource>\n'
+        '  </VRTRasterBand>\n'
+        '</VRTDataset>\n'
+    )
+
+    with pytest.raises(ValueError, match="exceed"):
+        read_vrt(str(vrt), max_pixels=1)
diff --git a/xrspatial/tests/test_geotiff_streaming_bigtiff_threshold_1785.py b/xrspatial/tests/test_geotiff_streaming_bigtiff_threshold_1785.py
new file mode 100644
index 00000000..d960d299
--- /dev/null
+++ b/xrspatial/tests/test_geotiff_streaming_bigtiff_threshold_1785.py
@@ -0,0 +1,263 @@
+"""Regression tests for issue #1785.
+
+The streaming writer's auto-BigTIFF decision used to compare only the
+uncompressed pixel-data size against ``UINT32_MAX``. For rasters just
+under 4 GiB the IFD plus the strip/tile-offset table pushed the actual
+file past the classic-TIFF uint32 offset ceiling, and the write failed
+late with ``struct.error``.
+
+These tests pin the corrected decision:
+
+* The helper takes an actual ``ifd_overhead_bytes`` value (computed from
+  the real tag list via ``_compute_classic_ifd_overhead``) rather than a
+  200-byte fudge constant; large ``gdal_metadata_xml`` or ``extra_tags``
+  payloads must not silently undercount overhead. See the Copilot review
+  on PR #1787.
+* The comparison is ``> UINT32_MAX``, matching the eager
+  ``_assemble_tiff`` decision (``estimated_file_size > UINT32_MAX``).
A + file that is exactly ``UINT32_MAX`` bytes still fits classic. +* The explicit ``bigtiff=True``/``False`` user override still wins. +""" +from __future__ import annotations + +import struct + +import dask.array as da +import numpy as np +import pytest +import xarray as xr + +from xrspatial.geotiff import to_geotiff +from xrspatial.geotiff._dtypes import ASCII, LONG, SHORT +from xrspatial.geotiff._header import ( + TAG_BITS_PER_SAMPLE, + TAG_COMPRESSION, + TAG_GDAL_METADATA, + TAG_IMAGE_LENGTH, + TAG_IMAGE_WIDTH, + TAG_PHOTOMETRIC, + TAG_SAMPLE_FORMAT, + TAG_SAMPLES_PER_PIXEL, + TAG_STRIP_BYTE_COUNTS, + TAG_STRIP_OFFSETS, +) +from xrspatial.geotiff._writer import ( + _compute_classic_ifd_overhead, + _should_use_bigtiff_streaming, +) + + +UINT32_MAX = 0xFFFFFFFF + + +def _minimal_tag_list(n_entries: int, gdal_metadata_size: int = 0) -> list: + """Build a representative classic-TIFF tag list for sizing. + + Mirrors the streaming writer's tag set for a stripped uint8 raster + so ``_compute_classic_ifd_overhead`` returns a realistic byte count. + ``gdal_metadata_size`` injects an ASCII overflow payload to model + metadata-heavy writes. + """ + tags = [ + (TAG_IMAGE_WIDTH, LONG, 1, 1), + (TAG_IMAGE_LENGTH, LONG, 1, 1), + (TAG_BITS_PER_SAMPLE, SHORT, 1, 8), + (TAG_COMPRESSION, SHORT, 1, 1), + (TAG_PHOTOMETRIC, SHORT, 1, 1), + (TAG_SAMPLES_PER_PIXEL, SHORT, 1, 1), + (TAG_SAMPLE_FORMAT, SHORT, 1, 1), + (TAG_STRIP_OFFSETS, LONG, n_entries, [0] * n_entries), + (TAG_STRIP_BYTE_COUNTS, LONG, n_entries, [0] * n_entries), + ] + if gdal_metadata_size > 0: + blob = 'x' * gdal_metadata_size + tags.append((TAG_GDAL_METADATA, ASCII, len(blob) + 1, blob)) + return tags + + +# -- Helper-level unit tests ------------------------------------------------ + +class TestShouldUseBigTIFFStreaming: + def test_just_under_uint32_max_promotes(self): + """uncompressed = UINT32_MAX - 50 with non-trivial overhead promotes. + + Even ~50 bytes of slack disappears once IFD + strip-table overhead + is added, so this case must promote to BigTIFF. + """ + # 1024 entries: strip table contributes 8 * 1024 = 8 KiB. + tags = _minimal_tag_list(n_entries=1024) + overhead = _compute_classic_ifd_overhead(tags) + assert _should_use_bigtiff_streaming( + uncompressed_bytes=UINT32_MAX - 50, + n_entries=0, + ifd_overhead_bytes=overhead, + ) is True + + def test_half_uint32_max_stays_classic(self): + """uncompressed = UINT32_MAX // 2 stays classic.""" + tags = _minimal_tag_list(n_entries=1024) + overhead = _compute_classic_ifd_overhead(tags) + assert _should_use_bigtiff_streaming( + uncompressed_bytes=UINT32_MAX // 2, + n_entries=0, + ifd_overhead_bytes=overhead, + ) is False + + def test_exactly_uint32_max_stays_classic(self): + """Boundary: total file size == UINT32_MAX bytes still fits classic. + + Eager ``_assemble_tiff`` uses ``estimated_file_size > UINT32_MAX``; + the streaming helper must match. A file of exactly ``UINT32_MAX`` + bytes has its last byte at offset ``UINT32_MAX - 1``, which is a + valid classic-TIFF offset. + """ + # Construct uncompressed_bytes so total = exactly UINT32_MAX. + tags = _minimal_tag_list(n_entries=1) + overhead = _compute_classic_ifd_overhead(tags) + header = 8 + uncompressed = UINT32_MAX - header - overhead + assert _should_use_bigtiff_streaming( + uncompressed_bytes=uncompressed, + n_entries=0, + ifd_overhead_bytes=overhead, + ) is False + # One byte over and we must promote. 
+ assert _should_use_bigtiff_streaming( + uncompressed_bytes=uncompressed + 1, + n_entries=0, + ifd_overhead_bytes=overhead, + ) is True + + def test_small_raster_no_overhead_stays_classic(self): + """Small rasters with one strip stay classic.""" + tags = _minimal_tag_list(n_entries=1) + overhead = _compute_classic_ifd_overhead(tags) + assert _should_use_bigtiff_streaming( + uncompressed_bytes=1024, + n_entries=0, + ifd_overhead_bytes=overhead, + ) is False + + def test_large_strip_table_alone_can_promote(self): + """Even a small pixel payload can need BigTIFF if n_entries is huge. + + Documents the strip-table contribution: ~536 M entries puts the + table itself near 4 GiB and forces BigTIFF with no pixel data. + Driven through the ``n_entries`` parameter (8 bytes per entry) + to avoid allocating a 536 M-element Python list at test time; + the ``ifd_overhead_bytes`` path is exercised by + ``test_overhead_pushes_just_under_threshold_over``. + """ + n_entries = (UINT32_MAX // 8) + 1 + assert _should_use_bigtiff_streaming( + uncompressed_bytes=0, + n_entries=n_entries, + ifd_overhead_bytes=0, + ) is True + + def test_overhead_pushes_just_under_threshold_over(self): + """Regression: a payload that fits classic by raw bytes but not + once header + IFD + strip table is added must promote. + """ + n_entries = 100_000 # ~800 KB strip table + tags = _minimal_tag_list(n_entries=n_entries) + overhead = _compute_classic_ifd_overhead(tags) + # Choose uncompressed so the total equals exactly UINT32_MAX + 1. + header = 8 + uncompressed = UINT32_MAX + 1 - header - overhead + assert _should_use_bigtiff_streaming( + uncompressed_bytes=uncompressed, + n_entries=0, + ifd_overhead_bytes=overhead, + ) is True + # One byte less and we stay classic (boundary is now ``>``). + assert _should_use_bigtiff_streaming( + uncompressed_bytes=uncompressed - 1, + n_entries=0, + ifd_overhead_bytes=overhead, + ) is False + + def test_large_gdal_metadata_flips_decision(self): + """A 5000-byte gdal_metadata blob must flip a borderline case. + + Under the old 200-byte fudge, ``uncompressed + 200 < UINT32_MAX`` + could stay classic even when a multi-KB gdal_metadata overflow + pushed real overhead well past 200 bytes. With the actual + overhead computed from the tag list, the decision flips. + """ + n_entries = 1024 + big_blob = 5000 # ASCII overflow heap entry + plain_tags = _minimal_tag_list(n_entries=n_entries) + meta_tags = _minimal_tag_list( + n_entries=n_entries, gdal_metadata_size=big_blob) + + plain_overhead = _compute_classic_ifd_overhead(plain_tags) + meta_overhead = _compute_classic_ifd_overhead(meta_tags) + # Metadata blob really does increase computed overhead. + assert meta_overhead - plain_overhead >= big_blob + + # Pick uncompressed so plain_overhead path stays classic but + # the metadata path tips over. + header = 8 + uncompressed = UINT32_MAX - header - plain_overhead + # Plain: total == UINT32_MAX -> classic. + assert _should_use_bigtiff_streaming( + uncompressed_bytes=uncompressed, + n_entries=0, + ifd_overhead_bytes=plain_overhead, + ) is False + # With the large metadata blob folded into the real overhead, + # the total now exceeds UINT32_MAX and we must promote. 
+        assert _should_use_bigtiff_streaming(
+            uncompressed_bytes=uncompressed,
+            n_entries=0,
+            ifd_overhead_bytes=meta_overhead,
+        ) is True


# -- Integration tests against the writer ------------------------------------

+def _read_tiff_magic(path: str) -> int:
+    """Return the TIFF version field: 42 (0x002A) classic, 43 (0x002B) BigTIFF."""
+    with open(path, 'rb') as f:
+        head = f.read(4)
+        byte_order = head[:2]
+        if byte_order == b'II':
+            fmt = '<H'
+        else:
+            fmt = '>H'
+        return struct.unpack(fmt, head[2:4])[0]

From: Brendan Collins
Date: Wed, 13 May 2026 10:13:49 -0700
Subject: [PATCH 4/4] geotiff: per-tile dim check uses default cap, not caller budget (#1823)

PR #1803 forwarded the caller's max_pixels to read_to_array inside
read_vrt's source loop so a tiny VRT output cannot force a huge source
decode (#1796). The output-window check at the source read enforces that
correctly. A separate per-tile dimension check at the same call sites
also consumed the caller's max_pixels, so a caller setting max_pixels as
an output budget (e.g. 10_000) failed the per-tile sanity check on any
normal source whose default tile size is 256x256 (= 65_536 pixels).

Use MAX_PIXELS_DEFAULT for the per-tile dim check at the two call sites
in _read_tiles (local) and _fetch_decode_cog_http_tiles (HTTP). The
output-window check in the same functions continues to enforce the
user-supplied max_pixels, preserving the #1796 protection.
---
 xrspatial/geotiff/_reader.py                  |  19 ++-
 .../tests/test_vrt_source_tile_check_1823.py  | 113 ++++++++++++++++++
 2 files changed, 127 insertions(+), 5 deletions(-)
 create mode 100644 xrspatial/geotiff/tests/test_vrt_source_tile_check_1823.py

diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py
index b3da32bd..c174c3fc 100644
--- a/xrspatial/geotiff/_reader.py
+++ b/xrspatial/geotiff/_reader.py
@@ -1560,9 +1560,14 @@ def _read_tiles(data: bytes, ifd: IFD, header: TIFFHeader,
         raise ValueError(
             f"Invalid tile dimensions: TileWidth={tw}, TileLength={th}")
 
-    # Reject crafted tile dims that would force huge per-tile allocations.
-    # A single tile's decoded bytes must also fit under the pixel budget.
-    _check_dimensions(tw, th, samples, max_pixels)
+    # Reject crafted tile dims (e.g. TileWidth = 2**31). This guards the
+    # TIFF header against malformed values; it is not the caller's output
+    # budget. The output-window check below uses ``max_pixels`` and is
+    # what enforces the user's per-call memory cap. The source-read path
+    # under ``read_vrt`` (#1796) relies on that output check to honour a
+    # small caller ``max_pixels`` against a normal-tile source; see
+    # #1823.
+    _check_dimensions(tw, th, samples, MAX_PIXELS_DEFAULT)
 
     # Per-tile compressed-byte cap (issue #1664). Same env var as the
     # HTTP path. mmap slicing is bounded by the file size, but the slice
@@ -2016,10 +2021,14 @@ def _fetch_decode_cog_http_tiles(
     # A windowed HTTP read of a multi-billion-pixel COG only allocates
     # the window, so capping the full image would reject legitimate
     # tiled reads. The full-image cap still applies for whole-file
-    # reads (window is None). The single-tile budget always applies.
+    # reads (window is None). The per-tile dim check below guards the
+    # TIFF header against absurd ``TileWidth`` / ``TileLength`` values
+    # (e.g. 2**31) and uses ``MAX_PIXELS_DEFAULT`` so a caller's small
+    # ``max_pixels`` -- intended as an output-window budget -- does not
+    # reject normal 256x256 tiles. See #1823.
     if window is None:
         _check_dimensions(width, height, samples, max_pixels)
-    _check_dimensions(tw, th, samples, max_pixels)
+    _check_dimensions(tw, th, samples, MAX_PIXELS_DEFAULT)
 
     # Reject malformed TIFFs whose declared tile grid exceeds the supplied
     # TileOffsets length. See issue #1219.
diff --git a/xrspatial/geotiff/tests/test_vrt_source_tile_check_1823.py b/xrspatial/geotiff/tests/test_vrt_source_tile_check_1823.py
new file mode 100644
index 00000000..b02dd2fb
--- /dev/null
+++ b/xrspatial/geotiff/tests/test_vrt_source_tile_check_1823.py
@@ -0,0 +1,113 @@
+"""Regression tests for #1823.
+
+PR #1803 forwarded the caller's ``max_pixels`` to ``read_to_array`` inside
+the VRT source loop so that a tiny VRT output could not force a huge
+source decode (#1796). The output-window check enforces that. A separate
+per-tile dimension check was incorrectly using the same ``max_pixels``
+value, so a caller setting ``max_pixels`` as an output budget (e.g.
+10,000) would also fail the per-tile sanity check on every normal source
+whose default tile size is 256x256 (= 65,536 pixels).
+
+The #1796 protection remains: the output-window check still catches a
+tiny VRT output that asks for a large source window.
+"""
+from __future__ import annotations
+
+import os
+import tempfile
+
+import numpy as np
+import pytest
+
+from xrspatial.geotiff import to_geotiff
+from xrspatial.geotiff._reader import PixelSafetyLimitError
+from xrspatial.geotiff._vrt import read_vrt
+
+
+def _write_normal_tile_source(td: str) -> str:
+    """10x10 uint8 source -- ``to_geotiff`` pads to a 256x256 tile."""
+    src = os.path.join(td, 'src.tif')
+    to_geotiff(np.zeros((10, 10), dtype=np.uint8), src, compression='none')
+    return src
+
+
+def _write_vrt(td: str, *, dst_x_size: int, dst_y_size: int,
+               raster_x: int = 100, raster_y: int = 100,
+               src_x_size: int = 10, src_y_size: int = 10) -> str:
+    vrt = os.path.join(td, 'mosaic.vrt')
+    xml = (
+        f'<VRTDataset rasterXSize="{raster_x}" rasterYSize="{raster_y}">\n'
+        f'  <VRTRasterBand dataType="Byte" band="1">\n'
+        f'    <SimpleSource>\n'
+        f'      <SourceFilename relativeToVRT="1">src.tif</SourceFilename>\n'
+        f'      <SourceBand>1</SourceBand>\n'
+        f'      <SrcRect xOff="0" yOff="0" xSize="{src_x_size}" ySize="{src_y_size}"/>\n'
+        f'      <DstRect xOff="0" yOff="0" xSize="{dst_x_size}" ySize="{dst_y_size}"/>\n'
+        f'    </SimpleSource>\n'
+        f'  </VRTRasterBand>\n'
+        f'</VRTDataset>\n'
+    )
+    with open(vrt, 'w') as f:
+        f.write(xml)
+    return vrt
+
+
+class TestPerTileCheckDoesNotUseCallerBudget:
+    """Per-tile dim sanity must not reject normal 256x256 source tiles
+    when the caller's ``max_pixels`` is a small output-budget value."""
+
+    def test_normal_tile_source_with_small_max_pixels(self):
+        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
+            _write_normal_tile_source(td)
+            vrt = _write_vrt(td, dst_x_size=100, dst_y_size=100)
+            arr, _ = read_vrt(vrt, max_pixels=10_000)
+            assert arr.shape == (100, 100)
+
+    def test_normal_tile_source_with_tiny_max_pixels(self):
+        """An output budget below a single tile must still succeed when
+        the requested output window itself fits."""
+        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
+            _write_normal_tile_source(td)
+            # Output 5x5 = 25 pixels; max_pixels = 100 fits 25 with room.
+ vrt = _write_vrt(td, dst_x_size=5, dst_y_size=5, + raster_x=5, raster_y=5) + arr, _ = read_vrt(vrt, max_pixels=100) + assert arr.shape == (5, 5) + + +class TestOutputWindowCheckStillEnforced: + """The output-window check at the source read still rejects an + over-budget read; the #1796 protection is preserved.""" + + def test_output_window_exceeds_max_pixels_still_rejected(self): + with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: + src = os.path.join(td, 'src.tif') + to_geotiff(np.arange(16, dtype=np.uint8).reshape(4, 4), + src, compression='none') + vrt = _write_vrt(td, dst_x_size=1, dst_y_size=1, + raster_x=1, raster_y=1, + src_x_size=4, src_y_size=4) + # SrcRect 4x4 = 16 pixels > max_pixels=1 → output check fires. + with pytest.raises(ValueError, match="exceed"): + read_vrt(vrt, max_pixels=1) + + +class TestPerTileCheckStillRejectsCraftedHeader: + """A pathological ``TileWidth``/``TileLength`` must still fail at + the per-tile sanity check, which uses ``MAX_PIXELS_DEFAULT``.""" + + def test_per_tile_check_caps_at_default(self, monkeypatch): + """Lower ``MAX_PIXELS_DEFAULT`` to verify the per-tile call site + is wired to it (rather than to the caller's budget).""" + from xrspatial.geotiff import _reader as reader_mod + + monkeypatch.setattr(reader_mod, "MAX_PIXELS_DEFAULT", 100) + with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td: + _write_normal_tile_source(td) + vrt = _write_vrt(td, dst_x_size=100, dst_y_size=100) + # 256x256 tile > patched MAX_PIXELS_DEFAULT=100 → per-tile + # check fires regardless of caller's max_pixels (1e9 here). + with pytest.raises(PixelSafetyLimitError, match="65,536"): + read_vrt(vrt, max_pixels=1_000_000_000)
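+
+
+# Minimal usage sketch (illustrative only, not executed by the suite): the
+# intended contract after #1823 is that ``max_pixels`` budgets the *output*
+# window while source tiles are sanity-checked against the library default:
+#
+#     arr, _ = read_vrt('mosaic.vrt', max_pixels=10_000)  # 100x100 output
+#     # ...succeeds even though every source stores default 256x256 tiles.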