From 02bd1c6f1486750b95724b1ee8054d51baff395e Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Sun, 10 May 2026 06:49:53 -0700 Subject: [PATCH 1/2] Drop defensive copies of freshly-allocated nodata-mask arrays (#1553) Two ``arr = arr.copy()`` calls on the read path duplicated a multi-MB array right before an in-place sentinel-to-NaN rewrite. The arrays were freshly allocated by ``_read_tiles`` / ``_read_strips`` / ``_fetch_decode_cog_http_tiles`` and uniquely owned at that point, so the copies were pure peak-memory waste. Removed: - ``_geotiff_to_xarray`` (was ~line 518): eager-read nodata path. - ``_delayed_read_window`` (was ~line 1448): dask-chunked read. Kept (with a comment explaining why): - ``to_geotiff`` (~line 990): ``arr`` may be a view of a caller-owned numpy buffer (``np.asarray(raw)`` or ``np.moveaxis``). - ``_write_single_tile`` (~line 1050): same shape; ``arr`` may alias caller data through ``np.asarray(chunk_data)``. Adds ``test_nodata_no_extra_copy_1553.py`` covering float32 strip, float32 tiled, uint16 tiled, dask-chunked float, dask-chunked uint16, repeat reads (independence), and a writer-no-mutate guard for the defensive copies kept on the write path. --- xrspatial/geotiff/__init__.py | 27 ++- .../tests/test_nodata_no_extra_copy_1553.py | 194 ++++++++++++++++++ 2 files changed, 217 insertions(+), 4 deletions(-) create mode 100644 xrspatial/geotiff/tests/test_nodata_no_extra_copy_1553.py diff --git a/xrspatial/geotiff/__init__.py b/xrspatial/geotiff/__init__.py index 4b27f245d..cd7a3ac4b 100644 --- a/xrspatial/geotiff/__init__.py +++ b/xrspatial/geotiff/__init__.py @@ -509,13 +509,18 @@ def open_geotiff(source, *, dtype=None, window=None, attrs['colormap'] = _tv break - # Apply nodata mask: replace nodata sentinel values with NaN + # Apply nodata mask: replace nodata sentinel values with NaN. + # ``arr`` came from ``read_to_array``, which returns a freshly + # allocated buffer from ``_read_tiles`` / ``_read_strips`` (and is + # ``np.ascontiguousarray``-wrapped if the orientation tag triggered + # a remap). Nothing else holds a reference here, so the in-place + # write is safe; an extra ``arr.copy()`` would just double peak + # memory for a multi-MB raster. nodata = geo_info.nodata if nodata is not None: attrs['nodata'] = nodata if arr.dtype.kind == 'f': if not np.isnan(nodata): - arr = arr.copy() arr[arr == arr.dtype.type(nodata)] = np.nan elif arr.dtype.kind in ('u', 'i'): # Integer arrays: convert to float to represent NaN @@ -984,6 +989,11 @@ def to_geotiff(data: xr.DataArray | np.ndarray, path, *, # Restore NaN pixels to the nodata sentinel value so the written file # has sentinel values matching the GDAL_NODATA tag. + # + # The defensive ``arr.copy()`` here is intentional: ``arr`` may be + # ``np.asarray(raw)`` of a caller-owned numpy DataArray (a view, + # not a copy) or ``np.moveaxis(arr, 0, -1)`` of one (also a view). + # Mutating without a copy would corrupt the user's input buffer. if nodata is not None and arr.dtype.kind == 'f' and not np.isnan(nodata): nan_mask = np.isnan(arr) if nan_mask.any(): @@ -1043,7 +1053,12 @@ def _write_single_tile(chunk_data, path, geo_transform, epsg, wkt, elif arr.dtype == np.bool_: arr = arr.astype(np.uint8) - # Restore NaN to nodata sentinel + # Restore NaN to nodata sentinel. + # + # The defensive ``arr.copy()`` here is intentional: ``arr`` came + # from ``np.asarray(chunk_data)`` where ``chunk_data`` may be a + # caller-owned numpy buffer. Mutating without a copy would corrupt + # the user's input. if nodata is not None and arr.dtype.kind == 'f' and not np.isnan(nodata): nan_mask = np.isnan(arr) if nan_mask.any(): @@ -1444,8 +1459,12 @@ def _read(http_meta): overview_level=overview_level, band=band) if nodata is not None: + # ``arr`` was just decoded by ``_fetch_decode_cog_http_tiles`` + # or ``read_to_array``; both return freshly-allocated buffers + # that nothing else references, so the in-place sentinel + # rewrite is safe. Skip the defensive ``arr.copy()`` to + # avoid a peak-memory doubler on every dask chunk. if arr.dtype.kind == 'f' and not np.isnan(nodata): - arr = arr.copy() arr[arr == arr.dtype.type(nodata)] = np.nan elif arr.dtype.kind in ('u', 'i'): mask = arr == arr.dtype.type(int(nodata)) diff --git a/xrspatial/geotiff/tests/test_nodata_no_extra_copy_1553.py b/xrspatial/geotiff/tests/test_nodata_no_extra_copy_1553.py new file mode 100644 index 000000000..08ebf64d2 --- /dev/null +++ b/xrspatial/geotiff/tests/test_nodata_no_extra_copy_1553.py @@ -0,0 +1,194 @@ +"""Regression tests for issue #1553. + +Two ``arr = arr.copy()`` calls on freshly-allocated buffers were +removed from the read path in ``xrspatial/geotiff/__init__.py`` +(``_geotiff_to_xarray`` and ``_delayed_read_window``). The copies +were unnecessary because the source array was uniquely owned at that +point, so dropping them halves peak memory for a nodata-bearing read +without changing observed values. + +These tests exercise the float and integer nodata sentinel paths on +both tiled and strip TIFFs, and on the dask-chunked read path that +flows through ``_delayed_read_window``. They confirm: + +- Sentinel pixels become NaN on read. +- Non-sentinel pixels round-trip exactly. +- The dropped copies do not alias any caller-visible buffer (a second + read of the same file returns the expected mask, proving we did not + smear NaNs into shared state across calls). +""" +from __future__ import annotations + +import numpy as np +import xarray as xr + +from xrspatial.geotiff import open_geotiff, to_geotiff + + +def _make_float_with_sentinel(h=24, w=24, sentinel=-9999.0, + dtype=np.float32): + """Float array with a few sentinel pixels at known positions.""" + rng = np.random.default_rng(seed=1553) + arr = rng.uniform(0.0, 100.0, size=(h, w)).astype(dtype) + arr[0, 0] = sentinel + arr[5, 7] = sentinel + arr[-1, -1] = sentinel + return arr + + +def _make_uint16_with_sentinel(h=24, w=24, sentinel=65535): + """uint16 array with a few sentinel pixels at known positions.""" + rng = np.random.default_rng(seed=1554) + arr = rng.integers(0, 1000, size=(h, w), dtype=np.uint16) + arr[0, 0] = sentinel + arr[5, 7] = sentinel + arr[-1, -1] = sentinel + return arr + + +def test_float_sentinel_strip_tiff_read(tmp_path): + """Strip-organized float32 TIFF with sentinel nodata reads correctly.""" + src = _make_float_with_sentinel() + path = str(tmp_path / 'issue_1553_float_strip.tif') + da = xr.DataArray(src, dims=('y', 'x'), + attrs={'crs': 4326, 'nodata': -9999.0}) + to_geotiff(da, path, nodata=-9999.0, tiled=False, + compression='deflate') + + out = open_geotiff(path) + expected_mask = (src == np.float32(-9999.0)) + np.testing.assert_array_equal(np.isnan(out.data), expected_mask) + finite = ~expected_mask + np.testing.assert_allclose(out.data[finite], src[finite]) + assert out.attrs.get('nodata') == -9999.0 + + +def test_float_sentinel_tiled_tiff_read(tmp_path): + """Tiled float32 TIFF with sentinel nodata reads correctly.""" + src = _make_float_with_sentinel(h=64, w=64) + path = str(tmp_path / 'issue_1553_float_tiled.tif') + da = xr.DataArray(src, dims=('y', 'x'), + attrs={'crs': 4326, 'nodata': -9999.0}) + to_geotiff(da, path, nodata=-9999.0, tiled=True, tile_size=16, + compression='deflate') + + out = open_geotiff(path) + expected_mask = (src == np.float32(-9999.0)) + np.testing.assert_array_equal(np.isnan(out.data), expected_mask) + finite = ~expected_mask + np.testing.assert_allclose(out.data[finite], src[finite]) + + +def test_uint16_sentinel_tiled_tiff_read(tmp_path): + """Tiled uint16 TIFF with sentinel nodata is promoted to float+NaN.""" + src = _make_uint16_with_sentinel(h=48, w=48) + path = str(tmp_path / 'issue_1553_uint16_tiled.tif') + da = xr.DataArray(src, dims=('y', 'x'), + attrs={'crs': 4326, 'nodata': 65535}) + to_geotiff(da, path, nodata=65535, tiled=True, tile_size=16, + compression='deflate') + + out = open_geotiff(path) + assert out.dtype.kind == 'f' + expected_mask = (src == 65535) + np.testing.assert_array_equal(np.isnan(out.data), expected_mask) + finite = ~expected_mask + np.testing.assert_array_equal(out.data[finite].astype(np.uint16), + src[finite]) + + +def test_repeat_reads_independent(tmp_path): + """Repeated reads return independent arrays with the correct mask. + + If the dropped ``.copy()`` had been protecting against shared + state across reads, the second read would either see corrupted + values (NaN bleeding into other slots) or share a buffer with the + first call. We mutate the first result and then re-read to verify + the file's contents are not affected. + """ + src = _make_float_with_sentinel() + path = str(tmp_path / 'issue_1553_repeat_reads.tif') + da = xr.DataArray(src, dims=('y', 'x'), + attrs={'crs': 4326, 'nodata': -9999.0}) + to_geotiff(da, path, nodata=-9999.0, compression='deflate') + + first = open_geotiff(path) + expected_mask = (src == np.float32(-9999.0)) + np.testing.assert_array_equal(np.isnan(first.data), expected_mask) + + # Mutate the first read in place. If subsequent reads share state + # with this buffer, the second read will look corrupted. + first.data[1, 1] = np.nan + first.data[2, 2] = 12345.0 + + second = open_geotiff(path) + np.testing.assert_array_equal(np.isnan(second.data), expected_mask) + finite = ~expected_mask + np.testing.assert_allclose(second.data[finite], src[finite]) + + +def test_dask_chunked_float_sentinel_read(tmp_path): + """Dask-chunked read of a float TIFF with sentinel nodata. + + Exercises ``_delayed_read_window`` — the second site where the + defensive copy was dropped. Each chunk runs the per-window nodata + rewrite path in a worker task. + """ + src = _make_float_with_sentinel(h=64, w=64) + path = str(tmp_path / 'issue_1553_dask_float.tif') + da = xr.DataArray(src, dims=('y', 'x'), + attrs={'crs': 4326, 'nodata': -9999.0}) + to_geotiff(da, path, nodata=-9999.0, tiled=True, tile_size=16, + compression='deflate') + + out = open_geotiff(path, chunks=16) + materialised = out.compute().data + expected_mask = (src == np.float32(-9999.0)) + np.testing.assert_array_equal(np.isnan(materialised), expected_mask) + finite = ~expected_mask + np.testing.assert_allclose(materialised[finite], src[finite]) + + +def test_dask_chunked_uint16_sentinel_read(tmp_path): + """Dask-chunked read of a uint16 TIFF promotes to float+NaN per chunk.""" + src = _make_uint16_with_sentinel(h=64, w=64) + path = str(tmp_path / 'issue_1553_dask_uint16.tif') + da = xr.DataArray(src, dims=('y', 'x'), + attrs={'crs': 4326, 'nodata': 65535}) + to_geotiff(da, path, nodata=65535, tiled=True, tile_size=16, + compression='deflate') + + out = open_geotiff(path, chunks=16) + materialised = out.compute().data + assert materialised.dtype.kind == 'f' + expected_mask = (src == 65535) + np.testing.assert_array_equal(np.isnan(materialised), expected_mask) + finite = ~expected_mask + np.testing.assert_array_equal(materialised[finite].astype(np.uint16), + src[finite]) + + +def test_writer_does_not_mutate_caller_input(tmp_path): + """``to_geotiff`` must not mutate the caller's input array. + + The defensive copies kept in ``to_geotiff`` and + ``_write_single_tile`` exist for this reason. This test guards + against accidentally dropping them in a future audit pass. + """ + src = np.array([ + [1.0, 2.0, np.nan], + [np.nan, 5.0, 6.0], + [7.0, 8.0, 9.0], + ], dtype=np.float32) + snapshot = src.copy() + path = str(tmp_path / 'issue_1553_writer_no_mutate.tif') + da = xr.DataArray(src, dims=('y', 'x'), + attrs={'crs': 4326, 'nodata': -9999.0}) + to_geotiff(da, path, nodata=-9999.0, compression='deflate') + + # Caller's buffer must still hold its original NaNs and finite + # values; the writer must not have stamped the sentinel value into + # the user's array in place. + np.testing.assert_array_equal(np.isnan(src), np.isnan(snapshot)) + finite = ~np.isnan(snapshot) + np.testing.assert_array_equal(src[finite], snapshot[finite]) From 3522899076a4e0c68228ce49ca01df6b3e99d9fd Mon Sep 17 00:00:00 2001 From: Brendan Collins Date: Sun, 10 May 2026 21:10:16 -0700 Subject: [PATCH 2/2] Address PR #1558 review Add test_write_single_tile_does_not_mutate_caller_input to cover the second kept defensive copy (the .vrt tiled-output path). Calls _write_single_tile directly with a numpy buffer + integer nodata so the NaN -> sentinel rewrite at line 1065 fires, then asserts the source array still has its NaNs in the original positions. Verified with a mutation test: dropping the .copy() at line 1065 makes the test fail loudly (sentinel stamped over the source's NaN cells), restoring it makes it pass. Real regression guard, not vacuous. Updated the test_writer_does_not_mutate_caller_input docstring to reflect what each test now covers. --- .../tests/test_nodata_no_extra_copy_1553.py | 38 +++++++++++++++++-- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/xrspatial/geotiff/tests/test_nodata_no_extra_copy_1553.py b/xrspatial/geotiff/tests/test_nodata_no_extra_copy_1553.py index 08ebf64d2..98d874094 100644 --- a/xrspatial/geotiff/tests/test_nodata_no_extra_copy_1553.py +++ b/xrspatial/geotiff/tests/test_nodata_no_extra_copy_1553.py @@ -171,9 +171,9 @@ def test_dask_chunked_uint16_sentinel_read(tmp_path): def test_writer_does_not_mutate_caller_input(tmp_path): """``to_geotiff`` must not mutate the caller's input array. - The defensive copies kept in ``to_geotiff`` and - ``_write_single_tile`` exist for this reason. This test guards - against accidentally dropping them in a future audit pass. + Covers the defensive copy at the ``to_geotiff`` entry; the sibling + test below covers the second kept copy in ``_write_single_tile`` + (the .vrt tiled-output path). """ src = np.array([ [1.0, 2.0, np.nan], @@ -192,3 +192,35 @@ def test_writer_does_not_mutate_caller_input(tmp_path): np.testing.assert_array_equal(np.isnan(src), np.isnan(snapshot)) finite = ~np.isnan(snapshot) np.testing.assert_array_equal(src[finite], snapshot[finite]) + + +def test_write_single_tile_does_not_mutate_caller_input(tmp_path): + """``_write_single_tile`` must not mutate the caller's array either. + + The .vrt tiled-output path goes through ``_write_single_tile`` per + chunk. That helper has its own defensive copy at the + NaN -> sentinel rewrite. Direct invocation pins it: pass a numpy + buffer with NaNs, request an integer nodata sentinel, then assert + the source still has NaNs in the same places. + """ + from xrspatial.geotiff import _write_single_tile + + src = np.array([ + [1.0, np.nan, 3.0], + [4.0, 5.0, np.nan], + [np.nan, 8.0, 9.0], + ], dtype=np.float32) + snapshot = src.copy() + + out_path = str(tmp_path / 'issue_1553_single_tile_no_mutate.tif') + _write_single_tile( + src, out_path, + geo_transform=None, epsg=4326, wkt=None, + nodata=-9999.0, compression='deflate', compression_level=None, + tile_size=16, predictor=False, bigtiff=False, + ) + + # Source must still hold its original NaNs and finite values. + np.testing.assert_array_equal(np.isnan(src), np.isnan(snapshot)) + finite = ~np.isnan(snapshot) + np.testing.assert_array_equal(src[finite], snapshot[finite])