diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index 3949d219..c98f55d1 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -308,11 +308,13 @@ def _populate_attrs_from_geo_info(attrs: dict, geo_info, *, window=None) -> None sets it next to its own nodata-masking step (the value's presence in attrs signals "this array has been NaN-masked"). - ``window`` is a ``(r0, c0, r1, c1)`` tuple for the eager windowed - read; when set, the emitted ``attrs['transform']`` shifts the origin - to the window's top-left. The dask and GPU paths do not use this -- - their windows are per-chunk inside the graph, not on the outer - DataArray. + ``window`` is a ``(r0, c0, r1, c1)`` tuple for windowed reads; when + set, the emitted ``attrs['transform']`` shifts the origin to the + window's top-left. The eager path and the dask path (since #1561, + which threads ``window=`` through ``read_geotiff_dask``) both pass + the outer window through this helper so the resulting DataArray + advertises the windowed transform. The GPU path does not currently + expose a windowed read, so it passes ``window=None``. """ if geo_info.crs_epsg is not None: attrs['crs'] = geo_info.crs_epsg @@ -502,7 +504,9 @@ def open_geotiff(source, *, dtype=None, window=None, # Dask path (CPU) if chunks is not None: return read_geotiff_dask(source, dtype=dtype, chunks=chunks, - overview_level=overview_level, name=name) + overview_level=overview_level, + window=window, band=band, + max_pixels=max_pixels, name=name) kwargs = {} if max_pixels is not None: @@ -964,15 +968,24 @@ def to_geotiff(data: xr.DataArray | np.ndarray, path, *, "max_z_error is not supported on the GPU writer " "(nvCOMP has no LERC backend). Use the CPU path " "(gpu=False) or omit max_z_error.") + # Strip output is not implemented on the GPU path; reject up + # front rather than silently producing a tiled file. + if not tiled: + raise ValueError( + "tiled=False is not supported on the GPU writer. " + "Pass gpu=False or omit tiled=False.") try: write_geotiff_gpu(data, path, crs=crs, nodata=nodata, compression=compression, compression_level=compression_level, + tiled=tiled, tile_size=tile_size, predictor=predictor, cog=cog, overview_levels=overview_levels, - overview_resampling=overview_resampling) + overview_resampling=overview_resampling, + bigtiff=bigtiff, + streaming_buffer_bytes=streaming_buffer_bytes) return except (ImportError, Exception): pass # fall through to CPU path @@ -1379,6 +1392,9 @@ def _write_vrt_tiled(data, vrt_path, *, crs=None, nodata=None, def read_geotiff_dask(source: str, *, dtype=None, chunks: int | tuple = 512, overview_level: int | None = None, + window: tuple | None = None, + band: int | None = None, + max_pixels: int | None = None, name: str | None = None) -> xr.DataArray: """Read a GeoTIFF as a dask-backed DataArray for out-of-core processing. @@ -1395,6 +1411,21 @@ def read_geotiff_dask(source: str, *, dtype=None, chunks: int | tuple = 512, Chunk size in pixels. Default 512. overview_level : int or None Overview level (0 = full resolution). + window : tuple or None + ``(row_start, col_start, row_stop, col_stop)`` to restrict + chunking to a sub-region of the file. Chunks are laid out + relative to the window origin. None reads the full raster. + band : int or None + Zero-based band index. None returns all bands (3D for + multi-band files, 2D for single-band). Selecting a single band + produces a 2D DataArray. + max_pixels : int or None + Maximum allowed pixel count (width * height * samples) for the + windowed region. None uses the reader default (~1 billion). + The cap is checked once up-front against the lazy region; each + chunk task also re-checks against ``max_pixels`` so windowed + reads stay bounded even when ``read_to_array`` is invoked + directly. name : str or None Name for the DataArray. @@ -1478,14 +1509,68 @@ def read_geotiff_dask(source: str, *, dtype=None, chunks: int | tuple = 512, else: target_dtype = effective_dtype - coords = _geo_to_coords(geo_info, full_h, full_w) + # Window clipping: restrict the lazy region to the requested + # sub-rectangle. ``read_to_array`` already accepts ``window=`` per + # chunk; we only need to remap the chunk grid so its origin moves to + # ``(win_r0, win_c0)`` and its extent shrinks to the window. + win_r0 = win_c0 = 0 + if window is not None: + win_r0, win_c0, win_r1, win_c1 = window + if (win_r0 < 0 or win_c0 < 0 + or win_r1 > full_h or win_c1 > full_w + or win_r0 >= win_r1 or win_c0 >= win_c1): + raise ValueError( + f"window={window} is outside the source extent " + f"({full_h}x{full_w}) or has non-positive size.") + # Mirror the eager-path windowed coord computation in open_geotiff. + t = geo_info.transform + if geo_info.raster_type == RASTER_PIXEL_IS_POINT: + win_x = (np.arange(win_c0, win_c1, dtype=np.float64) + * t.pixel_width + t.origin_x) + win_y = (np.arange(win_r0, win_r1, dtype=np.float64) + * t.pixel_height + t.origin_y) + else: + win_x = (np.arange(win_c0, win_c1, dtype=np.float64) + * t.pixel_width + t.origin_x + + t.pixel_width * 0.5) + win_y = (np.arange(win_r0, win_r1, dtype=np.float64) + * t.pixel_height + t.origin_y + + t.pixel_height * 0.5) + coords = {'y': win_y, 'x': win_x} + full_h = win_r1 - win_r0 + full_w = win_c1 - win_c0 + else: + coords = _geo_to_coords(geo_info, full_h, full_w) + + if band is not None: + if n_bands == 0: + if band != 0: + raise IndexError( + f"band={band} requested on a single-band file.") + elif not 0 <= band < n_bands: + raise IndexError( + f"band={band} out of range for {n_bands}-band file.") + + # Up-front pixel-count guard against the windowed extent. Chunk + # tasks re-check via read_to_array's own ``max_pixels`` (which we + # forward through ``_delayed_read_window``), but catching an + # oversized request before any task is scheduled saves the caller + # from a misleading "tile size exceeds max_pixels" error in a + # chunk that happens to align with the file's tile grid. + if max_pixels is not None: + eff_bands = (1 if band is not None + else (n_bands if n_bands > 0 else 1)) + if full_h * full_w * eff_bands > max_pixels: + raise ValueError( + f"Requested region {full_h}x{full_w}x{eff_bands} " + f"exceeds max_pixels={max_pixels:,}.") if name is None: import os name = os.path.splitext(os.path.basename(source))[0] attrs = {} - _populate_attrs_from_geo_info(attrs, geo_info) + _populate_attrs_from_geo_info(attrs, geo_info, window=window) if nodata is not None: attrs['nodata'] = nodata @@ -1522,7 +1607,12 @@ def read_geotiff_dask(source: str, *, dtype=None, chunks: int | tuple = 512, # For multi-band, each window read returns (h, w, bands); for single-band (h, w) # read_to_array with band=0 extracts a single band, band=None returns all - band_arg = None # return all bands (or 2D if single-band) + band_arg = band # None => all bands (or 2D for single-band file) + + # When ``band`` is set, each chunk reads a 2D slice -- collapse the + # output dims so the returned DataArray is 2D regardless of file band + # count. + out_has_band_axis = band is None and n_bands > 0 dask_rows = [] for r0 in rows: @@ -1530,16 +1620,22 @@ def read_geotiff_dask(source: str, *, dtype=None, chunks: int | tuple = 512, dask_cols = [] for c0 in cols: c1 = min(c0 + ch_w, full_w) - if n_bands > 0: + if out_has_band_axis: block_shape = (r1 - r0, c1 - c0, n_bands) else: block_shape = (r1 - r0, c1 - c0) + # Translate window-relative chunk coords back to file-relative + # coords for ``read_to_array``. ``win_r0`` / ``win_c0`` are 0 + # when no window was requested. block = da.from_delayed( - _delayed_read_window(source, r0, c0, r1, c1, + _delayed_read_window(source, + r0 + win_r0, c0 + win_c0, + r1 + win_r0, c1 + win_c0, overview_level, nodata, band_arg, target_dtype=target_dtype if dtype is not None else None, - http_meta_key=http_meta_key), + http_meta_key=http_meta_key, + max_pixels=max_pixels), shape=block_shape, dtype=target_dtype, ) @@ -1548,7 +1644,7 @@ def read_geotiff_dask(source: str, *, dtype=None, chunks: int | tuple = 512, dask_arr = da.concatenate(dask_rows, axis=0) - if n_bands > 0: + if out_has_band_axis: dims = ['y', 'x', 'band'] coords['band'] = np.arange(n_bands) else: @@ -1560,7 +1656,8 @@ def read_geotiff_dask(source: str, *, dtype=None, chunks: int | tuple = 512, def _delayed_read_window(source, r0, c0, r1, c1, overview_level, nodata, - band, *, target_dtype=None, http_meta_key=None): + band, *, target_dtype=None, http_meta_key=None, + max_pixels=None): """Dask-delayed function to read a single window. *http_meta_key* is an optional ``Delayed[(TIFFHeader, IFD)]`` parsed @@ -1576,21 +1673,30 @@ def _delayed_read_window(source, r0, c0, r1, c1, overview_level, nodata, def _read(http_meta): if http_meta is not None and isinstance(source, str) and \ source.startswith(('http://', 'https://')): - from ._reader import _HTTPSource, _fetch_decode_cog_http_tiles + from ._reader import ( + _HTTPSource, _fetch_decode_cog_http_tiles, + MAX_PIXELS_DEFAULT, + ) header, ifd = http_meta src = _HTTPSource(source) try: arr = _fetch_decode_cog_http_tiles( - src, header, ifd, window=(r0, c0, r1, c1)) + src, header, ifd, + max_pixels=(max_pixels if max_pixels is not None + else MAX_PIXELS_DEFAULT), + window=(r0, c0, r1, c1)) finally: src.close() if (arr.ndim == 3 and ifd.samples_per_pixel > 1 and band is not None): arr = arr[:, :, band] else: + _r2a_kwargs = {} + if max_pixels is not None: + _r2a_kwargs['max_pixels'] = max_pixels arr, _ = read_to_array(source, window=(r0, c0, r1, c1), overview_level=overview_level, - band=band) + band=band, **_r2a_kwargs) if nodata is not None: # ``arr`` was just decoded by ``_fetch_decode_cog_http_tiles`` # or ``read_to_array``; both return freshly-allocated buffers @@ -2226,11 +2332,15 @@ def write_geotiff_gpu(data, path: str, *, nodata=None, compression: str = 'zstd', compression_level: int | None = None, + tiled: bool = True, tile_size: int = 256, predictor: bool | int = False, cog: bool = False, overview_levels: list[int] | None = None, - overview_resampling: str = 'mean') -> None: + overview_resampling: str = 'mean', + bigtiff: bool | None = None, + max_z_error: float = 0.0, + streaming_buffer_bytes: int | None = None) -> None: """Write a CuPy-backed DataArray as a GeoTIFF with GPU compression. Tiles are extracted and compressed on the GPU via nvCOMP, then @@ -2260,6 +2370,12 @@ def write_geotiff_gpu(data, path: str, *, compression_level : int or None Compression effort level. Accepted for API compatibility but currently ignored -- nvCOMP does not expose level control. + tiled : bool + Must be True (default). The GPU writer is tiled-only because + nvCOMP batch compression operates on per-tile streams; passing + ``tiled=False`` raises ``ValueError`` rather than silently + producing a tiled file. Accepted for API parity with + ``to_geotiff``. tile_size : int Tile size in pixels (default 256). predictor : bool or int @@ -2275,7 +2391,37 @@ def write_geotiff_gpu(data, path: str, *, overview_resampling : str Resampling method for overviews: 'mean' (default), 'nearest', 'min', 'max', 'median', or 'mode'. + bigtiff : bool or None + Force BigTIFF (64-bit offsets). None auto-promotes when the + estimated file size would exceed the classic-TIFF 4 GB limit. + max_z_error : float + Per-pixel error budget for LERC compression. The GPU writer + does not implement LERC (nvCOMP has no LERC backend), so any + non-zero value raises ``ValueError``. Accepted at the signature + level for API parity with ``to_geotiff``. + streaming_buffer_bytes : int or None + Accepted for API parity with ``to_geotiff``. The GPU writer + materialises the entire array on device and has no streaming + concept, so this kwarg is a no-op. """ + if not tiled: + raise ValueError( + "write_geotiff_gpu requires tiled=True. nvCOMP batch " + "compression is tile-based; the strip layout is not " + "implemented on the GPU path. Use to_geotiff(..., gpu=False, " + "tiled=False) for strip output on CPU.") + if max_z_error < 0: + raise ValueError( + f"max_z_error must be >= 0, got {max_z_error}") + if max_z_error != 0: + raise ValueError( + "max_z_error is not supported on the GPU writer " + "(nvCOMP has no LERC backend). Use to_geotiff(..., gpu=False) " + "or omit max_z_error.") + # streaming_buffer_bytes is intentionally a no-op on the GPU path; + # the kwarg exists for API parity with to_geotiff so callers can pass + # the same kwargs to both entry points without filtering. + del streaming_buffer_bytes try: import cupy except ImportError: @@ -2444,6 +2590,7 @@ def _gpu_compress_to_part(gpu_arr, w, h, spp): x_resolution=x_res, y_resolution=y_res, resolution_unit=res_unit, + force_bigtiff=bigtiff, ) _write_bytes(file_bytes, path) diff --git a/xrspatial/geotiff/tests/test_backend_kwarg_parity_1561.py b/xrspatial/geotiff/tests/test_backend_kwarg_parity_1561.py new file mode 100644 index 00000000..730b08f2 --- /dev/null +++ b/xrspatial/geotiff/tests/test_backend_kwarg_parity_1561.py @@ -0,0 +1,216 @@ +"""Regression tests for issue #1561. + +``open_geotiff`` and ``to_geotiff`` route to backend-specific entry +points (``read_geotiff_dask``, ``write_geotiff_gpu``) whose kwarg sets +were narrower than the dispatcher's. The dispatcher silently dropped +the missing kwargs when it routed to the smaller-API backend. + +These tests pin the kwargs through to each backend so dispatcher calls +no longer lose them. +""" +from __future__ import annotations + +import importlib.util + +import numpy as np +import pytest +import xarray as xr + + +def _gpu_available() -> bool: + """True if cupy is importable and CUDA is initialized.""" + if importlib.util.find_spec("cupy") is None: + return False + try: + import cupy + return bool(cupy.cuda.is_available()) + except Exception: + return False + + +_HAS_GPU = _gpu_available() +_gpu_only = pytest.mark.skipif( + not _HAS_GPU, + reason="cupy + CUDA required", +) + + +@pytest.fixture +def small_tiff_path(tmp_path): + """4x6 single-band tiled tiff with a small CRS+transform.""" + from xrspatial.geotiff import to_geotiff + + arr = np.arange(24, dtype=np.float32).reshape(4, 6) + da = xr.DataArray( + arr, + dims=['y', 'x'], + coords={ + 'y': np.array([0.5, 1.5, 2.5, 3.5]), + 'x': np.array([0.5, 1.5, 2.5, 3.5, 4.5, 5.5]), + }, + attrs={'crs': 4326}, + ) + p = tmp_path / 'parity_1561_small.tif' + to_geotiff(da, str(p), tile_size=4) + return str(p), arr + + +@pytest.fixture +def small_multiband_tiff_path(tmp_path): + """4x6 three-band tiled tiff.""" + from xrspatial.geotiff import to_geotiff + + arr = np.arange(72, dtype=np.float32).reshape(4, 6, 3) + da = xr.DataArray( + arr, + dims=['y', 'x', 'band'], + coords={ + 'y': np.array([0.5, 1.5, 2.5, 3.5]), + 'x': np.array([0.5, 1.5, 2.5, 3.5, 4.5, 5.5]), + 'band': [0, 1, 2], + }, + attrs={'crs': 4326}, + ) + p = tmp_path / 'parity_1561_mb.tif' + to_geotiff(da, str(p), tile_size=4) + return str(p), arr + + +# -------------------------------------------------------------------- +# read_geotiff_dask: window / band / max_pixels now threaded through +# -------------------------------------------------------------------- + + +def test_read_geotiff_dask_window_clips_region(small_tiff_path): + """``window=`` restricts the lazy region; chunks span only the window.""" + from xrspatial.geotiff import read_geotiff_dask + + path, arr = small_tiff_path + da = read_geotiff_dask(path, chunks=2, window=(1, 2, 4, 6)) + assert da.shape == (3, 4) + np.testing.assert_array_equal(da.values, arr[1:4, 2:6]) + + +def test_read_geotiff_dask_window_via_dispatcher(small_tiff_path): + """``open_geotiff(window=..., chunks=...)`` now keeps the window.""" + from xrspatial.geotiff import open_geotiff + + path, arr = small_tiff_path + da = open_geotiff(path, window=(0, 1, 3, 4), chunks=2) + assert da.shape == (3, 3) + np.testing.assert_array_equal(da.values, arr[0:3, 1:4]) + + +def test_read_geotiff_dask_band_selects_single_band(small_multiband_tiff_path): + """``band=`` produces a 2D DataArray with the selected band.""" + from xrspatial.geotiff import read_geotiff_dask + + path, arr = small_multiband_tiff_path + da = read_geotiff_dask(path, chunks=4, band=1) + assert da.ndim == 2 + np.testing.assert_array_equal(da.values, arr[:, :, 1]) + + +def test_read_geotiff_dask_band_via_dispatcher(small_multiband_tiff_path): + """``open_geotiff(band=..., chunks=...)`` now keeps the band.""" + from xrspatial.geotiff import open_geotiff + + path, arr = small_multiband_tiff_path + da = open_geotiff(path, band=2, chunks=4) + assert da.ndim == 2 + np.testing.assert_array_equal(da.values, arr[:, :, 2]) + + +def test_read_geotiff_dask_max_pixels_rejects_oversized(small_tiff_path): + """``max_pixels=`` rejects the windowed region up front.""" + from xrspatial.geotiff import read_geotiff_dask + + path, _ = small_tiff_path + with pytest.raises(ValueError, match="exceeds max_pixels"): + read_geotiff_dask(path, chunks=2, max_pixels=10) + + +def test_read_geotiff_dask_window_band_combined(small_multiband_tiff_path): + """``window`` and ``band`` cooperate.""" + from xrspatial.geotiff import read_geotiff_dask + + path, arr = small_multiband_tiff_path + da = read_geotiff_dask(path, chunks=2, window=(1, 1, 4, 5), band=0) + assert da.shape == (3, 4) + np.testing.assert_array_equal(da.values, arr[1:4, 1:5, 0]) + + +def test_read_geotiff_dask_invalid_window_raises(small_tiff_path): + """Out-of-bounds windows fail loudly instead of silently clipping.""" + from xrspatial.geotiff import read_geotiff_dask + + path, _ = small_tiff_path + with pytest.raises(ValueError, match="window=.* is outside"): + read_geotiff_dask(path, chunks=2, window=(0, 0, 100, 100)) + + +def test_read_geotiff_dask_invalid_band_raises(small_multiband_tiff_path): + """Out-of-range band indexes fail with IndexError.""" + from xrspatial.geotiff import read_geotiff_dask + + path, _ = small_multiband_tiff_path + with pytest.raises(IndexError, match="band=5 out of range"): + read_geotiff_dask(path, chunks=4, band=5) + + +# -------------------------------------------------------------------- +# write_geotiff_gpu: bigtiff / tiled / max_z_error / streaming_buffer_bytes +# now accepted (with appropriate rejections where the GPU path can't +# implement them). +# -------------------------------------------------------------------- + + +def test_write_geotiff_gpu_rejects_tiled_false(tmp_path): + """The GPU writer is tiled-only; ``tiled=False`` must fail loudly.""" + from xrspatial.geotiff import write_geotiff_gpu + + dummy = np.zeros((2, 2), dtype=np.float32) + with pytest.raises(ValueError, match="tiled=True"): + # path is irrelevant -- validation fires before any file I/O. + write_geotiff_gpu(dummy, str(tmp_path / 'never.tif'), tiled=False) + + +def test_write_geotiff_gpu_rejects_nonzero_max_z_error(tmp_path): + """LERC budget is not implementable on the GPU path.""" + from xrspatial.geotiff import write_geotiff_gpu + + dummy = np.zeros((2, 2), dtype=np.float32) + with pytest.raises(ValueError, match="max_z_error is not supported"): + write_geotiff_gpu(dummy, str(tmp_path / 'never.tif'), max_z_error=1.0) + + +@_gpu_only +def test_write_geotiff_gpu_accepts_streaming_buffer_bytes_as_noop(tmp_path): + """``streaming_buffer_bytes`` is accepted for API parity (no-op).""" + import cupy + from xrspatial.geotiff import write_geotiff_gpu, open_geotiff + + arr = cupy.arange(16, dtype=cupy.float32).reshape(4, 4) + da = xr.DataArray(arr, dims=['y', 'x'], + coords={'y': np.arange(4, dtype=np.float64), + 'x': np.arange(4, dtype=np.float64)}) + p = tmp_path / 'parity_1561_streaming.tif' + # Argument is accepted; result must round-trip identically to a + # call without it. + write_geotiff_gpu(da, str(p), streaming_buffer_bytes=4096, tile_size=4) + rd = open_geotiff(str(p)) + np.testing.assert_array_equal(rd.values, arr.get()) + + +@_gpu_only +def test_to_geotiff_threads_tiled_false_into_gpu_dispatcher(tmp_path): + """``to_geotiff(..., gpu=True, tiled=False)`` rejects, not silently flips.""" + import cupy + from xrspatial.geotiff import to_geotiff + + arr = cupy.zeros((2, 2), dtype=cupy.float32) + da = xr.DataArray(arr, dims=['y', 'x'], + coords={'y': [0.0, 1.0], 'x': [0.0, 1.0]}) + with pytest.raises(ValueError, match="tiled=False"): + to_geotiff(da, str(tmp_path / 'never.tif'), + gpu=True, tiled=False)