diff --git a/.claude/sweep-metadata-state.csv b/.claude/sweep-metadata-state.csv index a454a620..dedc2336 100644 --- a/.claude/sweep-metadata-state.csv +++ b/.claude/sweep-metadata-state.csv @@ -1,3 +1,3 @@ module,last_inspected,issue,severity_max,categories_found,notes -geotiff,2026-05-11,1580;1582,HIGH,1;3;4;5,"write_geotiff_gpu silently mis-ordered (band, y, x) DataArrays (#1580). to_geotiff ignored rioxarray nodatavals and CF _FillValue attrs (#1582). Both fixed in same branch." +geotiff,2026-05-11,1597,HIGH,4;5,read_geotiff_dask dropped nodata mask when sentinel only hit non-first chunks (per-chunk dtype divergence; #1597). reproject,2026-05-10,1572;1573,HIGH,1;3;4,geoid_height_raster dropped input attrs and used dims[-2:] for 3D inputs (#1572). reproject/merge ignored nodatavals (rasterio convention) when rioxarray absent (#1573). Fixed in same branch. diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index 3e2a65b2..48a4d531 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -1641,13 +1641,23 @@ def read_geotiff_dask(source: str, *, dtype=None, chunks: int | tuple = 512, # Translate window-relative chunk coords back to file-relative # coords for ``read_to_array``. ``win_r0`` / ``win_c0`` are 0 # when no window was requested. + # Always thread ``target_dtype`` so each delayed chunk lands + # in the same dtype that the dask array declared. Without + # this, an integer raster with an in-range nodata sentinel + # would have ``effective_dtype=float64`` declared on the + # dask graph but per-chunk arrays only promoted when a + # chunk happened to contain a sentinel pixel. Dask + # concatenation then preallocates from the first chunk's + # actual dtype (uint16), silently casting later float64 + # chunks back to int and converting their NaNs to 0. See + # issue #1597. block = da.from_delayed( _delayed_read_window(source, r0 + win_r0, c0 + win_c0, r1 + win_r0, c1 + win_c0, overview_level, nodata, band_arg, - target_dtype=target_dtype if dtype is not None else None, + target_dtype=target_dtype, http_meta_key=http_meta_key, max_pixels=max_pixels), shape=block_shape, diff --git a/xrspatial/geotiff/tests/test_dask_int_nodata_chunks_1597.py b/xrspatial/geotiff/tests/test_dask_int_nodata_chunks_1597.py new file mode 100644 index 00000000..d86e1789 --- /dev/null +++ b/xrspatial/geotiff/tests/test_dask_int_nodata_chunks_1597.py @@ -0,0 +1,116 @@ +"""Regression tests for issue #1597. + +``read_geotiff_dask`` on an integer raster with an in-range nodata +sentinel used to silently lose the mask when the sentinel only appeared +in non-first chunks. Per-chunk dtype divergence (uint16 vs float64) +caused dask concatenation to preallocate from the first chunk's actual +dtype, casting float64 chunks back to int and converting NaN to 0. + +The fix threads the resolved ``target_dtype`` (the dask graph's +declared dtype) unconditionally through ``_delayed_read_window`` so +every chunk lands as float64 regardless of whether its mask hit. +""" +from __future__ import annotations + +import numpy as np +import pytest + +from xrspatial.geotiff import open_geotiff +from xrspatial.geotiff._writer import write + + +@pytest.fixture +def uint16_with_sentinel_only_in_corner(tmp_path): + """Write a uint16 8x8 TIFF whose nodata sentinel is in the + bottom-right 2x2 quadrant. With ``chunks=4`` the top-left chunk + never sees a sentinel and used to keep its uint16 dtype. + """ + arr = np.arange(64, dtype=np.uint16).reshape(8, 8) + 1 + arr[6:8, 6:8] = 65535 + path = str(tmp_path / 'uint16_corner_sentinel_1597.tif') + write(arr, path, nodata=65535, compression='none', tiled=False) + return path, arr + + +def test_eager_promotes_to_float64_and_masks(uint16_with_sentinel_only_in_corner): + """Baseline: the eager path produces float64 with 4 NaNs.""" + path, _ = uint16_with_sentinel_only_in_corner + eager = open_geotiff(path) + assert eager.dtype == np.float64 + assert np.isnan(eager.values).sum() == 4 + assert np.isnan(eager.values[6:8, 6:8]).all() + + +def test_dask_chunks_4_matches_eager(uint16_with_sentinel_only_in_corner): + """The dask compute result matches the eager path bit-for-bit. + + Before the fix this returned a uint16 array with 0s where the + sentinel had been, because dask coerced the late-arriving float64 + chunk back to uint16 at concat time. + """ + path, _ = uint16_with_sentinel_only_in_corner + eager = open_geotiff(path) + dk = open_geotiff(path, chunks=4) + assert dk.dtype == np.float64 + computed = dk.compute() + assert computed.dtype == np.float64 + np.testing.assert_array_equal(np.isnan(computed.values), + np.isnan(eager.values)) + finite = ~np.isnan(eager.values) + np.testing.assert_array_equal(computed.values[finite], + eager.values[finite]) + + +def test_dask_chunks_2_per_chunk_dtype_uniform( + uint16_with_sentinel_only_in_corner): + """Every dask chunk returns float64 regardless of mask hit. + + Iterates the delayed blocks and asserts each one computes to + float64; the regression had the first chunk's actual data come back + as uint16 because the mask never matched there. + """ + path, _ = uint16_with_sentinel_only_in_corner + dk = open_geotiff(path, chunks=2) + blocks = dk.data.to_delayed().flatten() + for i, block in enumerate(blocks): + chunk = block.compute() + assert chunk.dtype == np.float64, ( + f"chunk {i} computed as {chunk.dtype}, expected float64; " + f"per-chunk dtype divergence is the #1597 regression." + ) + + +def test_dask_keeps_dtype_for_out_of_range_sentinel(tmp_path): + """Out-of-range sentinels (uint16 + nodata=-9999) stay uint16. + + The fix should not regress #1581: when the sentinel cannot match + any pixel, no float64 promotion is needed and the dask path keeps + the file's native dtype. + """ + arr = np.array([[1, 2, 3, 4]] * 4, dtype=np.uint16) + path = str(tmp_path / 'uint16_out_of_range_1597.tif') + write(arr, path, nodata=-9999, compression='none', tiled=False) + + dk = open_geotiff(path, chunks=2) + assert dk.dtype == np.uint16 + result = dk.compute() + assert result.dtype == np.uint16 + np.testing.assert_array_equal(result.values, arr) + + +def test_dask_float_input_with_sentinel_in_one_chunk(tmp_path): + """Float rasters with sentinel in non-first chunk also stay float. + + The float path doesn't promote dtype, but it does in-place NaN + substitution. Verify the substitution holds for chunks with and + without the sentinel. + """ + arr = np.arange(64, dtype=np.float32).reshape(8, 8) + 1 + arr[6:8, 6:8] = -9999.0 + path = str(tmp_path / 'float_corner_sentinel_1597.tif') + write(arr, path, nodata=-9999, compression='none', tiled=False) + + eager = open_geotiff(path) + dk = open_geotiff(path, chunks=4).compute() + np.testing.assert_array_equal(np.isnan(dk.values), + np.isnan(eager.values))