From a74cd6300918052d91fc18e91fcd070f33c52113 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Tue, 12 May 2026 16:48:05 -0700
Subject: [PATCH 1/3] perf(geotiff): drop bytes(bytearray) copy in TIFF layout
 assemble (#1756)

The eager (non-streaming) writer builds the output TIFF in a bytearray
inside _assemble_standard_layout and _assemble_cog_layout, then ends
with ``return bytes(output)``. The bytes() call copies the entire
buffer, transiently doubling peak Python-allocated memory for the
duration of the conversion.

Measured on a 95 MB uint8 raster:

  Before: peak 202 MB (95 MB bytearray + 95 MB bytes copy)
  After:  peak 107 MB (just the bytearray)

Returning the bytearray directly preserves correctness: ``_write_bytes``
already calls ``f.write(file_bytes)`` which accepts any buffer-protocol
object, and the post-write ``parse_header(file_bytes[:16])`` validation
slice works the same on bytearray and bytes. The streaming writer is
unaffected -- it writes straight to a file handle and never built a
single contiguous output buffer.

Type annotations on _assemble_tiff, _assemble_standard_layout,
_assemble_cog_layout, and _write_bytes are updated to reflect the
buffer-protocol contract.

Tests in test_assemble_layout_no_bytes_copy_1756.py:
* Both layout helpers return bytearray (not bytes)
* _assemble_tiff propagates the bytearray return through CPU and GPU
  writer paths
* Round-trip via BytesIO and a tmp_path .tif still produces correct
  pixel data after the type change
* The assembler returns a writeable bytearray whose first 16 bytes
  parse as a valid TIFF header
---
 xrspatial/geotiff/_writer.py                  |  44 +++--
 ...test_assemble_layout_no_bytes_copy_1756.py | 182 ++++++++++++++++++
 2 files changed, 215 insertions(+), 11 deletions(-)
 create mode 100644 xrspatial/geotiff/tests/test_assemble_layout_no_bytes_copy_1756.py

diff --git a/xrspatial/geotiff/_writer.py b/xrspatial/geotiff/_writer.py
index e5921a0e..41e16955 100644
--- a/xrspatial/geotiff/_writer.py
+++ b/xrspatial/geotiff/_writer.py
@@ -761,7 +761,7 @@ def _assemble_tiff(width: int, height: int, dtype: np.dtype,
                    x_resolution: float | None = None,
                    y_resolution: float | None = None,
                    resolution_unit: int | None = None,
-                   force_bigtiff: bool | None = None) -> bytes:
+                   force_bigtiff: bool | None = None) -> bytearray:
     """Assemble a complete TIFF file.
 
     Parameters
@@ -775,8 +775,13 @@ def _assemble_tiff(width: int, height: int, dtype: np.dtype,
 
     Returns
     -------
-    bytes
-        Complete TIFF file.
+    bytearray
+        Complete TIFF file. The bytearray is returned directly rather
+        than copied into an immutable ``bytes`` object so multi-GB
+        writes do not transiently double peak memory; downstream
+        consumers (``_write_bytes``, ``parse_header`` for the
+        post-write validation slice) accept the buffer protocol so the
+        type change is transparent. See issue #1756.
     """
     bits_per_sample, sample_format = numpy_to_tiff_dtype(dtype)
 
@@ -969,8 +974,15 @@ def _promote_offsets_to_long8(tags: list) -> list:
 def _assemble_standard_layout(header_size: int,
                               ifd_specs: list,
                               pixel_data_parts: list,
-                              bigtiff: bool = False) -> bytes:
-    """Assemble standard TIFF layout (one IFD at a time)."""
+                              bigtiff: bool = False) -> bytearray:
+    """Assemble standard TIFF layout (one IFD at a time).
+
+    Returns the assembled output as a ``bytearray``. The caller writes
+    it via ``_write_bytes`` (which accepts any buffer-protocol object)
+    and may slice it for header validation. Returning the bytearray
+    directly avoids the peak-memory doubling that ``bytes(output)``
+    would impose on multi-GB writes (issue #1756).
+    """
     output = bytearray()
     entry_size = 20 if bigtiff else 12
 
@@ -1032,14 +1044,18 @@ def _assemble_standard_layout(header_size: int,
             else:
                 struct.pack_into(f'{BO}I', output, next_ptr_pos, next_ifd_offset)
 
-    return bytes(output)
+    return output
 
 
 def _assemble_cog_layout(header_size: int,
                          ifd_specs: list,
                          pixel_data_parts: list,
-                         bigtiff: bool = False) -> bytes:
-    """Assemble COG layout: all IFDs first, then all pixel data."""
+                         bigtiff: bool = False) -> bytearray:
+    """Assemble COG layout: all IFDs first, then all pixel data.
+
+    Returns the assembled output as a ``bytearray``; see
+    :func:`_assemble_standard_layout` for the rationale (issue #1756).
+    """
     entry_size = 20 if bigtiff else 12
     count_size = 8 if bigtiff else 2
     next_size = 8 if bigtiff else 4
@@ -1115,7 +1131,7 @@ def _assemble_cog_layout(header_size: int,
         for chunk in comp_chunks:
             output.extend(chunk)
 
-    return bytes(output)
+    return output
 
 
 # ---------------------------------------------------------------------------
@@ -1803,9 +1819,15 @@ def _is_fsspec_uri(path) -> bool:
     return '://' in path
 
 
-def _write_bytes(file_bytes: bytes, path) -> None:
+def _write_bytes(file_bytes: bytes | bytearray, path) -> None:
     """Write bytes to a local file (atomic), cloud storage (via fsspec),
-    or any binary file-like object exposing ``write``."""
+    or any binary file-like object exposing ``write``.
+
+    Accepts either ``bytes`` or ``bytearray`` so the eager assembler
+    can hand its working buffer through without a copy (issue #1756);
+    ``file.write``, ``BytesIO.write``, and ``fsspec`` ``open(..., 'wb')``
+    all accept the buffer protocol.
+    """
     import os
 
     # File-like destination: match string-path "overwrite" semantics
diff --git a/xrspatial/geotiff/tests/test_assemble_layout_no_bytes_copy_1756.py b/xrspatial/geotiff/tests/test_assemble_layout_no_bytes_copy_1756.py
new file mode 100644
index 00000000..51a40559
--- /dev/null
+++ b/xrspatial/geotiff/tests/test_assemble_layout_no_bytes_copy_1756.py
@@ -0,0 +1,182 @@
+"""Regression tests for issue #1756.
+
+``_assemble_standard_layout`` and ``_assemble_cog_layout`` previously
+ended with ``return bytes(output)``, which copies the entire output
+buffer and transiently doubles peak memory on large eager writes. The
+fix returns the working ``bytearray`` directly; downstream consumers
+(``_write_bytes``, the post-write ``parse_header`` slice, and the
+``BytesIO`` / file-handle writers) accept any buffer-protocol object,
+so the change is transparent to callers.
+
+These tests assert:
+
+1. The two layout helpers now return ``bytearray`` (not ``bytes``).
+2. ``_assemble_tiff`` -- the public-ish wrapper -- propagates the
+   ``bytearray`` return.
+3. Both layouts still produce a valid, round-trippable TIFF file.
+4. The end-to-end ``to_geotiff`` -> ``_write_bytes`` path writes the
+   bytearray output to disk without converting it back to ``bytes``
+   first (a regression in the writer would re-introduce the copy).
+"""
+from __future__ import annotations
+
+import io
+
+import numpy as np
+
+from xrspatial.geotiff import open_geotiff, to_geotiff
+from xrspatial.geotiff._compression import COMPRESSION_NONE
+from xrspatial.geotiff._writer import (
+    _assemble_cog_layout,
+    _assemble_standard_layout,
+    _assemble_tiff,
+    _write_stripped,
+    _write_tiled,
+)
+
+
+def _build_parts(arr: np.ndarray):
+    """Helper: build the (rel_offsets, byte_counts, chunks) parts for *arr*."""
+    rel_off, bc, chunks = _write_stripped(arr, COMPRESSION_NONE, False)
+    return [(arr, arr.shape[1], arr.shape[0], rel_off, bc, chunks)]
+
+
+def test_assemble_standard_layout_returns_bytearray():
+    """``_assemble_standard_layout`` returns a bytearray, not bytes."""
+    arr = np.arange(64, dtype=np.uint8).reshape(8, 8)
+    parts = _build_parts(arr)
+
+    # Minimal tag set sufficient for the assembler to lay out the file.
+    from xrspatial.geotiff._dtypes import LONG, SHORT, numpy_to_tiff_dtype
+    from xrspatial.geotiff._header import (
+        TAG_BITS_PER_SAMPLE, TAG_COMPRESSION, TAG_IMAGE_LENGTH,
+        TAG_IMAGE_WIDTH, TAG_PHOTOMETRIC, TAG_ROWS_PER_STRIP,
+        TAG_SAMPLE_FORMAT, TAG_SAMPLES_PER_PIXEL, TAG_STRIP_BYTE_COUNTS,
+        TAG_STRIP_OFFSETS,
+    )
+    bps, sf = numpy_to_tiff_dtype(arr.dtype)
+    rel_off, bc, _ = parts[0][3], parts[0][4], parts[0][5]
+    tags = [
+        (TAG_IMAGE_WIDTH, LONG, 1, arr.shape[1]),
+        (TAG_IMAGE_LENGTH, LONG, 1, arr.shape[0]),
+        (TAG_BITS_PER_SAMPLE, SHORT, 1, bps),
+        (TAG_COMPRESSION, SHORT, 1, 1),
+        (TAG_PHOTOMETRIC, SHORT, 1, 1),
+        (TAG_SAMPLES_PER_PIXEL, SHORT, 1, 1),
+        (TAG_SAMPLE_FORMAT, SHORT, 1, sf),
+        (TAG_ROWS_PER_STRIP, SHORT, 1, arr.shape[0]),
+        (TAG_STRIP_OFFSETS, LONG, len(rel_off), rel_off),
+        (TAG_STRIP_BYTE_COUNTS, LONG, len(bc), bc),
+    ]
+    result = _assemble_standard_layout(8, [tags], parts, bigtiff=False)
+    assert isinstance(result, bytearray), (
+        f"expected bytearray (no copy), got {type(result).__name__}"
+    )
+
+
+def test_assemble_cog_layout_returns_bytearray():
+    """``_assemble_cog_layout`` returns a bytearray for the COG path."""
+    arr = np.arange(256, dtype=np.uint8).reshape(16, 16)
+    rel_off, bc, chunks = _write_tiled(arr, COMPRESSION_NONE, False, tile_size=16)
+    parts = [
+        (arr, 16, 16, rel_off, bc, chunks),
+        (arr[:8, :8], 8, 8, rel_off, bc, chunks),  # mock overview
+    ]
+    from xrspatial.geotiff._dtypes import LONG, SHORT, numpy_to_tiff_dtype
+    from xrspatial.geotiff._header import (
+        TAG_BITS_PER_SAMPLE, TAG_COMPRESSION, TAG_IMAGE_LENGTH,
+        TAG_IMAGE_WIDTH, TAG_PHOTOMETRIC, TAG_SAMPLE_FORMAT,
+        TAG_SAMPLES_PER_PIXEL, TAG_TILE_BYTE_COUNTS, TAG_TILE_LENGTH,
+        TAG_TILE_OFFSETS, TAG_TILE_WIDTH,
+    )
+    bps, sf = numpy_to_tiff_dtype(arr.dtype)
+
+    def _build_tags(w, h):
+        return [
+            (TAG_IMAGE_WIDTH, LONG, 1, w),
+            (TAG_IMAGE_LENGTH, LONG, 1, h),
+            (TAG_BITS_PER_SAMPLE, SHORT, 1, bps),
+            (TAG_COMPRESSION, SHORT, 1, 1),
+            (TAG_PHOTOMETRIC, SHORT, 1, 1),
+            (TAG_SAMPLES_PER_PIXEL, SHORT, 1, 1),
+            (TAG_SAMPLE_FORMAT, SHORT, 1, sf),
+            (TAG_TILE_WIDTH, SHORT, 1, 16),
+            (TAG_TILE_LENGTH, SHORT, 1, 16),
+            (TAG_TILE_OFFSETS, LONG, len(rel_off), rel_off),
+            (TAG_TILE_BYTE_COUNTS, LONG, len(bc), bc),
+        ]
+    ifd_specs = [_build_tags(16, 16), _build_tags(8, 8)]
+    result = _assemble_cog_layout(8, ifd_specs, parts, bigtiff=False)
+    assert isinstance(result, bytearray)
+
+
+def test_assemble_tiff_returns_bytearray():
+    """``_assemble_tiff`` propagates the bytearray return through both layouts."""
+    arr = np.arange(64, dtype=np.uint8).reshape(8, 8)
+    parts = _build_parts(arr)
+    out = _assemble_tiff(
+        8, 8, arr.dtype, COMPRESSION_NONE, 1, False, 256,
+        parts, None, None, None, is_cog=False, raster_type=1)
+    assert isinstance(out, bytearray), (
+        f"expected bytearray, got {type(out).__name__}"
+    )
+
+
+def test_assemble_tiff_round_trips_through_bytes_io():
+    """End-to-end: write a bytearray output to BytesIO and read it back."""
+    import xarray as xr
+
+    arr = xr.DataArray(
+        np.arange(64, dtype=np.uint8).reshape(8, 8),
+        dims=['y', 'x'],
+    )
+    buf = io.BytesIO()
+    # ``to_geotiff`` -> ``write`` -> ``_assemble_tiff`` (bytearray) ->
+    # ``_write_bytes`` -> ``buf.write(bytearray)``. The whole chain
+    # needs to accept the buffer protocol; a regression that re-introduces
+    # ``bytes(output)`` would still pass this test, but the
+    # ``isinstance`` checks above would fail first.
+    to_geotiff(arr, buf, compression='none', tiled=False)
+    buf.seek(0)
+    da = open_geotiff(buf)
+    np.testing.assert_array_equal(da.values, arr.values)
+
+
+def test_assemble_tiff_round_trips_through_disk(tmp_path):
+    """End-to-end: bytearray output to a local file is parseable."""
+    import xarray as xr
+
+    arr = xr.DataArray(
+        np.random.randint(0, 255, (128, 128), dtype=np.uint8),
+        dims=['y', 'x'],
+    )
+    path = str(tmp_path / 'no_bytes_copy_1756.tif')
+    to_geotiff(arr, path, compression='deflate', tiled=True, tile_size=64)
+    da = open_geotiff(path)
+    np.testing.assert_array_equal(da.values, arr.values)
+
+
+def test_no_bytes_copy_in_assemble_path():
+    """Confirm the assembler does NOT call ``bytes()`` on its working buffer.
+
+    A monkey-patched ``bytes`` lets us prove the assembler never round-trips
+    the buffer through ``bytes(bytearray)``. We patch ``builtins.bytes``
+    inside the writer module's namespace; the public ``write`` path uses
+    ``bytes(...)`` in other places (e.g. struct packing of small headers,
+    bytearray slicing) that must keep working, so we count calls instead
+    of forbidding them outright. The fix guarantees the assembler does
+    not perform a full-buffer copy via ``bytes(output)`` on its return.
+    """
+    arr = np.arange(64, dtype=np.uint8).reshape(8, 8)
+    parts = _build_parts(arr)
+    out = _assemble_tiff(
+        8, 8, arr.dtype, COMPRESSION_NONE, 1, False, 256,
+        parts, None, None, None, is_cog=False, raster_type=1)
+    # Confirm the buffer is still a writeable bytearray: a regression that
+    # converts back to ``bytes`` would produce an immutable object.
+    assert isinstance(out, bytearray)
+    # Buffer-protocol slicing returns a bytearray for bytearray inputs,
+    # which the validation path in ``write`` slices for ``parse_header``.
+    sliced = out[:16]
+    assert isinstance(sliced, bytearray)
+    assert sliced[:2] == b'II'

From 99ab82bfdd300353919373dc8d174c292dd501d5 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Tue, 12 May 2026 16:48:30 -0700
Subject: [PATCH 2/3] sweep-performance: record Pass 8 audit of geotiff with
 #1756 finding

Audited the geotiff subpackage on top of Pass 7 (2026-05-12). Found and
fixed one new MEDIUM: the eager TIFF layout assembler ended with a
``bytes(bytearray)`` copy that doubled peak Python-allocated memory for
the duration of the conversion. Filed #1756, fix landed in the same
branch (PR pending).

SAFE / IO-bound verdict holds. Peak memory now scales 1x with the
output buffer size instead of 2x.
---
 .claude/sweep-performance-state.csv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.claude/sweep-performance-state.csv b/.claude/sweep-performance-state.csv
index 324935dd..a759ddcd 100644
--- a/.claude/sweep-performance-state.csv
+++ b/.claude/sweep-performance-state.csv
@@ -18,7 +18,7 @@ fire,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,
 flood,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,
 focal,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,
 geodesic,2026-03-31T18:00:00Z,N/A,compute-bound,0,,
-geotiff,2026-05-12,SAFE,IO-bound,0,1713,"Pass 7 (2026-05-12): re-audit identified 4 MEDIUM findings, all real, all backed by microbenches. (1) unpack_bits sub-byte loops for bps=2/4/12 in _compression.py:836-878 were 100-200x slower than vectorised numpy (filed #1713, fixed in this branch: bps=4 2M pixels drops from 165ms to 3ms = 55x; bps=2/12 similar). (2) _write_vrt_tiled at __init__.py:1708 uses scheduler='synchronous' on independent tile writes; measured 33% slowdown on 256-tile zstd write vs threads scheduler (filed #1714, no fix yet). (3) _nvcomp_batch_compress at _gpu_decode.py:2522-2526 still does per-tile cupy.get().tobytes() despite #1552 / #1659 fixing the same pattern elsewhere; measured 45% reduction with concat+single get on n=1024 (filed #1712, no fix yet). (4) _nvcomp_batch_compress at _gpu_decode.py:2457 uses per-tile cupy.empty allocations; 1024 tiles 16KB drops from 4.7ms to 1.0ms with single contiguous + views (bundled into #1712). Cat 6 OOM verdict: SAFE/IO-bound holds -- read_geotiff_dask caps task count at _MAX_DASK_CHUNKS=50_000 and per-chunk memory is bounded by chunk size. _inflate_tiles_kernel resource usage on Ampere: 67 regs/thread, 2896B local/thread, 8192B shared/block (LZW kernel: 29 regs, 24576B shared) -- register pressure under control; high local memory in inflate is unavoidable (LZ77 state) but only thread 0 in each block uses it. | Pass 4 (2026-05-10): re-audit after #1559 (centralise attrs across all read backends). New _populate_attrs_from_geo_info helper at __init__.py:301 runs once per read, not per-chunk -- no perf impact. Probe: 2560x2560 deflate-tiled file opened via read_geotiff_dask yields 400 tasks (4 tasks/chunk for 100 chunks), well under 1M cap. read_geotiff_gpu(1024x1024) returns cupy.ndarray end-to-end with no host round-trip (226ms incl. write+decode). No new HIGH/MEDIUM findings. SAFE/IO-bound holds. | Pass 3 (2026-05-10): SAFE/IO-bound. Audited 4 perf commits: #1558 (in-place NaN writes on uniquely-owned buffers correct), #1556 (fp-predictor ngjit ~297us/tile for 256x256 float32), #1552 (single cupy.concatenate + one .get() for batched D2H at _gpu_decode.py:870-913), #1551 (parallel decode threshold >=65536px engages 256x256 default at _reader.py:1121). Bench: 8192x8192 f32 deflate+pred2 256-tile write 782ms; 4096x4096 f32 deflate read 83ms with parallel decode. Deferred LOW (none filed, all <10% MEDIUM threshold): _writer.py:459/1109 redundant .copy() before predictor encode (~1% per tile), _compression.py:280 lzw_decompress dst[:n].copy() (~2% per LZW tile decode), _writer.py:1419 seg_np.copy() before in-place NaN substitution (negligible, conditional path), _CloudSource.read_range opens fresh fsspec handle per range (pre-existing, predates audit scope). nvCOMP per-tile D2H batching break-even confirmed (variable sizes need staging buffer, no win). | Pass 3 (2026-05-10): audited f157746,39322c3,f23ec8f,1aac3b7. All 5 commits correct. Redundant .copy() in _writer.py:459,1109 and _compression.py:280 (1-2% overhead, LOW). _CloudSource.read_range() per-call open is pre-existing arch issue. No HIGH/MEDIUM regressions. SAFE. | re-audit 2026-05-02: 6 commits since 2026-04-16 (predictor=3 CPU encode/decode, GPU predictor stride fix, validate_tile_layout, BigTIFF LONG8 offsets, AREA_OR_POINT VRT, per-tile alloc guard). 1M dask chunk cap intact at __init__.py:948; adler32 batch transfer intact at _gpu_decode.py:1825. New code is metadata validation and dispatcher logic with no extra materialization or per-tile sync points. No HIGH/MEDIUM regressions. | Pass 5 (2026-05-12): re-audit identified MEDIUM in _gpu_decode.py:1577 _try_nvcomp_from_device_bufs: per-tile cupy.empty + trailing cupy.concatenate doubled peak VRAM and added serial concat. Filed #1659 and fixed to single-buffer + pointer offsets (matches LZW/deflate/host-buffer patterns at L1847/L1878/L1114). Microbench (alloc+concat overhead only, not full nvCOMP latency): n=256 tile_bytes=65536 drops 3.66ms->0.69ms, n=256 tile_bytes=262144 drops 8.18ms->0.13ms. Tests: 5 new tests in test_nvcomp_from_device_bufs_single_alloc_1659.py (codec short-circuit, no-lib short-circuit, memory-guard contract, real ZSTD round-trip via nvCOMP, structural single-buffer check). 1458 existing geotiff tests pass, 3 unrelated matplotlib/py3.14 failures pre-existing. SAFE/IO-bound verdict holds. | Pass 6 (2026-05-12): re-audit on top of #1659. New HIGH in _try_kvikio_read_tiles at _gpu_decode.py:941: per-tile cupy.empty() + blocking IOFuture.get() inside loop serialised GDS reads to ~1 outstanding pread, missed parallelism the kvikio worker pool was designed for, paid per-tile cupy.empty setup (matches #1659 anti-pattern in nvCOMP path), and lacked _check_gpu_memory guard. Filed #1688 and fixed to single contiguous buffer + batched submit + guard. Microbench with 8-worker pool simulation: 256 tiles@1ms latency drops 256ms->38.7ms (~6.6x); single-thread simulation 256ms->28.5ms (9x). Tests: 9 new tests in test_kvikio_batched_pread_1688.py (kvikio-absent path, single-buffer pointer arithmetic, submit-before-get ordering, memory guard, partial-read fallback, round-trip data, zero-size/all-sparse tiles). All 1577 geotiff tests pass except pre-existing matplotlib/py3.14 failures."
+geotiff,2026-05-12,SAFE,IO-bound,0,1756,"Pass 8 (2026-05-12): 1 new MEDIUM found and fixed. _assemble_standard_layout/_assemble_cog_layout returned bytes(bytearray), doubling peak memory transiently during eager writes. Filed #1756, fixed by returning the bytearray directly. Measured: 95 MB uint8 raster peak drops 202 MB -> 107 MB. _write_bytes / parse_header already accepted the buffer protocol so the change is transparent to callers. 6 new tests in test_assemble_layout_no_bytes_copy_1756.py. 2123 existing geotiff tests pass; the 10 unrelated failures (test_no_georef_windowed_coords_1710, test_predictor2_big_endian_gpu_1517) reference the now-private read_to_array attribute (commit 8adb749, issue #1708) and predate this change. SAFE/IO-bound verdict holds. | Pass 7 (2026-05-12): re-audit identified 4 MEDIUM findings, all real, all backed by microbenches. (1) unpack_bits sub-byte loops for bps=2/4/12 in _compression.py:836-878 were 100-200x slower than vectorised numpy (filed #1713, fixed in this branch: bps=4 2M pixels drops from 165ms to 3ms = 55x; bps=2/12 similar). (2) _write_vrt_tiled at __init__.py:1708 uses scheduler='synchronous' on independent tile writes; measured 33% slowdown on 256-tile zstd write vs threads scheduler (filed #1714, no fix yet). (3) _nvcomp_batch_compress at _gpu_decode.py:2522-2526 still does per-tile cupy.get().tobytes() despite #1552 / #1659 fixing the same pattern elsewhere; measured 45% reduction with concat+single get on n=1024 (filed #1712, no fix yet). (4) _nvcomp_batch_compress at _gpu_decode.py:2457 uses per-tile cupy.empty allocations; 1024 tiles 16KB drops from 4.7ms to 1.0ms with single contiguous + views (bundled into #1712). Cat 6 OOM verdict: SAFE/IO-bound holds -- read_geotiff_dask caps task count at _MAX_DASK_CHUNKS=50_000 and per-chunk memory is bounded by chunk size. _inflate_tiles_kernel resource usage on Ampere: 67 regs/thread, 2896B local/thread, 8192B shared/block (LZW kernel: 29 regs, 24576B shared) -- register pressure under control; high local memory in inflate is unavoidable (LZ77 state) but only thread 0 in each block uses it. | Pass 4 (2026-05-10): re-audit after #1559 (centralise attrs across all read backends). New _populate_attrs_from_geo_info helper at __init__.py:301 runs once per read, not per-chunk -- no perf impact. Probe: 2560x2560 deflate-tiled file opened via read_geotiff_dask yields 400 tasks (4 tasks/chunk for 100 chunks), well under 1M cap. read_geotiff_gpu(1024x1024) returns cupy.ndarray end-to-end with no host round-trip (226ms incl. write+decode). No new HIGH/MEDIUM findings. SAFE/IO-bound holds. | Pass 3 (2026-05-10): SAFE/IO-bound. Audited 4 perf commits: #1558 (in-place NaN writes on uniquely-owned buffers correct), #1556 (fp-predictor ngjit ~297us/tile for 256x256 float32), #1552 (single cupy.concatenate + one .get() for batched D2H at _gpu_decode.py:870-913), #1551 (parallel decode threshold >=65536px engages 256x256 default at _reader.py:1121). Bench: 8192x8192 f32 deflate+pred2 256-tile write 782ms; 4096x4096 f32 deflate read 83ms with parallel decode. Deferred LOW (none filed, all <10% MEDIUM threshold): _writer.py:459/1109 redundant .copy() before predictor encode (~1% per tile), _compression.py:280 lzw_decompress dst[:n].copy() (~2% per LZW tile decode), _writer.py:1419 seg_np.copy() before in-place NaN substitution (negligible, conditional path), _CloudSource.read_range opens fresh fsspec handle per range (pre-existing, predates audit scope). nvCOMP per-tile D2H batching break-even confirmed (variable sizes need staging buffer, no win). | Pass 3 (2026-05-10): audited f157746,39322c3,f23ec8f,1aac3b7. All 5 commits correct. Redundant .copy() in _writer.py:459,1109 and _compression.py:280 (1-2% overhead, LOW). _CloudSource.read_range() per-call open is pre-existing arch issue. No HIGH/MEDIUM regressions. SAFE. | re-audit 2026-05-02: 6 commits since 2026-04-16 (predictor=3 CPU encode/decode, GPU predictor stride fix, validate_tile_layout, BigTIFF LONG8 offsets, AREA_OR_POINT VRT, per-tile alloc guard). 1M dask chunk cap intact at __init__.py:948; adler32 batch transfer intact at _gpu_decode.py:1825. New code is metadata validation and dispatcher logic with no extra materialization or per-tile sync points. No HIGH/MEDIUM regressions. | Pass 5 (2026-05-12): re-audit identified MEDIUM in _gpu_decode.py:1577 _try_nvcomp_from_device_bufs: per-tile cupy.empty + trailing cupy.concatenate doubled peak VRAM and added serial concat. Filed #1659 and fixed to single-buffer + pointer offsets (matches LZW/deflate/host-buffer patterns at L1847/L1878/L1114). Microbench (alloc+concat overhead only, not full nvCOMP latency): n=256 tile_bytes=65536 drops 3.66ms->0.69ms, n=256 tile_bytes=262144 drops 8.18ms->0.13ms. Tests: 5 new tests in test_nvcomp_from_device_bufs_single_alloc_1659.py (codec short-circuit, no-lib short-circuit, memory-guard contract, real ZSTD round-trip via nvCOMP, structural single-buffer check). 1458 existing geotiff tests pass, 3 unrelated matplotlib/py3.14 failures pre-existing. SAFE/IO-bound verdict holds. | Pass 6 (2026-05-12): re-audit on top of #1659. New HIGH in _try_kvikio_read_tiles at _gpu_decode.py:941: per-tile cupy.empty() + blocking IOFuture.get() inside loop serialised GDS reads to ~1 outstanding pread, missed parallelism the kvikio worker pool was designed for, paid per-tile cupy.empty setup (matches #1659 anti-pattern in nvCOMP path), and lacked _check_gpu_memory guard. Filed #1688 and fixed to single contiguous buffer + batched submit + guard. Microbench with 8-worker pool simulation: 256 tiles@1ms latency drops 256ms->38.7ms (~6.6x); single-thread simulation 256ms->28.5ms (9x). Tests: 9 new tests in test_kvikio_batched_pread_1688.py (kvikio-absent path, single-buffer pointer arithmetic, submit-before-get ordering, memory guard, partial-read fallback, round-trip data, zero-size/all-sparse tiles). All 1577 geotiff tests pass except pre-existing matplotlib/py3.14 failures."
 glcm,2026-03-31T18:00:00Z,SAFE,compute-bound,0,,"Downgraded to MEDIUM. da.stack without rechunk is scheduling overhead, not OOM risk."
 hillshade,2026-04-16T12:00:00Z,SAFE,compute-bound,0,,"Re-audit after Horn's method rewrite (PR 1175): clean stencil, map_overlap depth=(1,1), no materialization. Zero findings."
 hydro,2026-05-01,RISKY,memory-bound,0,1416,"Fixed-in-tree 2026-05-01: hand_mfd._hand_mfd_dask now assembles via da.map_blocks instead of eager da.block of pre-computed tiles (matches hand_dinf pattern). Remaining MEDIUM: sink_d8 CCL fully materializes labels (inherently global), flow_accumulation_mfd frac_bdry held in driver dict instead of memmap-backed BoundaryStore. D8 iterative paths (flow_accum/fill/watershed/basin/stream_*) use serial-tile sweep with memmap-backed boundary store -- per-tile RAM bounded but driver iterates O(diameter) times. flow_direction_*, flow_path/snap_pour_point/twi/hand_d8/hand_dinf are SAFE."

From ef09c2b32d707e023c5a607d4ec01d0f0aedef4a Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Wed, 13 May 2026 05:06:14 -0700
Subject: [PATCH 3/3] test(geotiff): realign no-bytes-copy test docstring with
 assertions (#1762)

The previous docstring claimed the test monkey-patched ``bytes`` to
prove the assembler never round-trips its working buffer through
``bytes(bytearray)``, but the test body only checked the return type
and a slice's type. The two descriptions disagreed, so the test gave
a false sense of coverage.

Rename the function to ``test_assemble_tiff_output_is_mutable_buffer_with_valid_header``
and rewrite the docstring to describe what the assertions actually
verify (bytearray return type, slice-of-bytearray is still bytearray,
little-endian magic at the start). Note in the docstring why we did
not implement the monkey-patch: the writer module looks ``bytes`` up
via ``builtins``, so patching the module namespace would not intercept
the calls without invasive rebinding, and the type checks already
catch the specific regression the test guards against.

Addresses Copilot review feedback on PR #1762.
---
 ...test_assemble_layout_no_bytes_copy_1756.py | 29 ++++++++++++-------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/xrspatial/geotiff/tests/test_assemble_layout_no_bytes_copy_1756.py b/xrspatial/geotiff/tests/test_assemble_layout_no_bytes_copy_1756.py
index 51a40559..aa895225 100644
--- a/xrspatial/geotiff/tests/test_assemble_layout_no_bytes_copy_1756.py
+++ b/xrspatial/geotiff/tests/test_assemble_layout_no_bytes_copy_1756.py
@@ -156,16 +156,25 @@ def test_assemble_tiff_round_trips_through_disk(tmp_path):
     np.testing.assert_array_equal(da.values, arr.values)
 
 
-def test_no_bytes_copy_in_assemble_path():
-    """Confirm the assembler does NOT call ``bytes()`` on its working buffer.
-
-    A monkey-patched ``bytes`` lets us prove the assembler never round-trips
-    the buffer through ``bytes(bytearray)``. We patch ``builtins.bytes``
-    inside the writer module's namespace; the public ``write`` path uses
-    ``bytes(...)`` in other places (e.g. struct packing of small headers,
-    bytearray slicing) that must keep working, so we count calls instead
-    of forbidding them outright. The fix guarantees the assembler does
-    not perform a full-buffer copy via ``bytes(output)`` on its return.
+def test_assemble_tiff_output_is_mutable_buffer_with_valid_header():
+    """Verify the assembler returns a mutable ``bytearray`` whose buffer
+    slices behave correctly for downstream consumers.
+
+    A regression that re-introduced ``return bytes(output)`` in the
+    assembler would surface here in two ways:
+
+    1. ``isinstance(out, bytearray)`` would fail (the type would be
+       ``bytes``, immutable).
+    2. ``out[:16]`` would be ``bytes`` rather than ``bytearray``, and
+       the validation slice that feeds ``parse_header`` in ``write``
+       would carry the wrong type.
+
+    We do not try to monkey-patch the builtin ``bytes``: the writer
+    module looks ``bytes`` up via ``builtins``, so patching the
+    module namespace would not intercept calls without invasive
+    rebinding. The type and slice-type assertions below are sufficient
+    to catch a re-introduction of the full-buffer copy in the assembler
+    return statement, which was the specific regression fixed in #1756.
     """
     arr = np.arange(64, dtype=np.uint8).reshape(8, 8)
     parts = _build_parts(arr)