From 268618d4c74b718a8a0cb0c851608fa7d66fe4ec Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Mon, 30 Mar 2026 06:42:08 -0700
Subject: [PATCH 1/3] Add design spec for geotiff performance and memory
 controls

Covers dtype on read, compression_level on write, and VRT tiled
output from to_geotiff for streaming dask writes.
---
 ...2026-03-30-geotiff-perf-controls-design.md | 147 ++++++++++++++++++
 1 file changed, 147 insertions(+)
 create mode 100644 docs/superpowers/specs/2026-03-30-geotiff-perf-controls-design.md

diff --git a/docs/superpowers/specs/2026-03-30-geotiff-perf-controls-design.md b/docs/superpowers/specs/2026-03-30-geotiff-perf-controls-design.md
new file mode 100644
index 00000000..761d5f55
--- /dev/null
+++ b/docs/superpowers/specs/2026-03-30-geotiff-perf-controls-design.md
@@ -0,0 +1,147 @@
+# GeoTIFF Performance and Memory Controls
+
+Adds three parameters to `open_geotiff` and `to_geotiff` that let callers
+control memory usage, compression speed, and large-raster write strategy.
+All three are opt-in; default behaviour is unchanged.
+
+## 1. `dtype` parameter on `open_geotiff`
+
+### API
+
+```python
+open_geotiff(source, *, dtype=None, ...)
+```
+
+`dtype` accepts any numpy dtype string or object (`np.float32`, `'float32'`,
+etc.). `None` preserves the file's native dtype (current behaviour).
+
+### Read paths
+
+| Path | Behaviour |
+|------|-----------|
+| Eager (numpy) | Output array allocated at target dtype. Each decoded tile/strip cast before copy-in. Peak overhead: one tile at native dtype. |
+| Dask | Each delayed chunk function casts after decode. Output chunks are target dtype. Same per-tile overhead. |
+| GPU (CuPy) | Cast on device after decode. |
+| Dask + CuPy | Combination of dask and GPU paths. |
+
+### Numba LZW fast path
+
+The LZW decoder is a numba JIT function that emits values one at a time into a
+byte buffer. A variant will decode each value and cast inline to the target
+dtype so the per-tile buffer is never allocated at native dtype. Other codecs
+(deflate, zstd) return byte buffers from C libraries where per-value
+interception isn't possible, so those fall back to the tile-level cast.
+
+### Validation
+
+- Narrowing float casts (float64 to float32): allowed.
+- Narrowing int casts (int64 to int16): allowed (user asked for it explicitly).
+- Widening casts (float32 to float64, uint8 to int32): allowed.
+- Float to int: `ValueError` (lossy in a way users often don't intend).
+- Unsupported casts (e.g. complex128 to uint8): `ValueError`.
+
+## 2. `compression_level` parameter on `to_geotiff`
+
+### API
+
+```python
+to_geotiff(data, path, *, compression='zstd', compression_level=None, ...)
+```
+
+`compression_level` is `int | None`. `None` uses the codec's existing default.
+
+### Ranges
+
+| Codec | Range | Default | Direction |
+|-------|-------|---------|-----------|
+| deflate | 1 -- 9 | 6 | 1 = fastest, 9 = smallest |
+| zstd | 1 -- 22 | 3 | 1 = fastest, 22 = smallest |
+| lz4 | 0 -- 16 | 0 | 0 = fastest |
+| lzw | n/a | n/a | No level support; ignored silently |
+| jpeg | n/a | n/a | Quality is a separate axis; ignored |
+| packbits | n/a | n/a | Ignored |
+| none | n/a | n/a | Ignored |
+
+### Plumbing
+
+`to_geotiff` passes `compression_level` to `write()`, which passes it to
+`compress()`. The internal `compress()` already accepts a `level` argument; we
+just thread it through the two intermediate call sites that currently hardcode
+it.
+
+### Validation
+
+- Out-of-range level for a codec that supports levels: `ValueError`.
+- Level set for a codec without level support: silently ignored.
+
+### GPU path
+
+`write_geotiff_gpu` also accepts and forwards the level to nvCOMP batch
+compression, which supports levels for zstd and deflate.
+
+## 3. VRT output from `to_geotiff` via `.vrt` extension
+
+### Trigger
+
+When `path` ends in `.vrt`, `to_geotiff` writes a tiled VRT instead of a
+monolithic TIFF. No new parameter needed.
+
+### Output layout
+
+```
+output.vrt
+output_tiles/
+  tile_0000_0000.tif   # row_col, zero-padded
+  tile_0000_0001.tif
+  ...
+```
+
+Directory name derived from the VRT stem (`foo.vrt` -> `foo_tiles/`).
+Zero-padding width scales to the grid dimensions.
+
+### Behaviour per input type
+
+| Input | Tiling strategy | Memory profile |
+|-------|----------------|----------------|
+| Dask DataArray | One tile per dask chunk. Each task computes its chunk and writes one `.tif`. | One chunk in RAM at a time (scheduler controlled). |
+| Dask + CuPy | Same, GPU compress per tile. | One chunk in GPU memory at a time. |
+| Numpy / ndarray | Slice into `tile_size`-sized pieces, write each. | Source array already in RAM; tile slices are views (no duplication). |
+| CuPy | Same as numpy but GPU compress. | Source on GPU; tiles are views. |
+
+### Per-tile properties
+
+- Same `compression`, `compression_level`, `predictor`, `nodata`, `crs` as the
+  parent call.
+- `tiled=True` with the caller's `tile_size` (internal TIFF tiling within each
+  chunk-file).
+- GeoTransform adjusted to each tile's spatial position (row/col offset from
+  the full raster origin).
+- No COG overviews on individual tiles.
+
+### VRT generation
+
+After all tiles are written, call `write_vrt()` with relative paths. The VRT
+XML references each tile by its spatial extent and band mapping.
+
+### Edge cases and validation
+
+- `cog=True` with a `.vrt` path: `ValueError` (mutually exclusive).
+- Tiles directory exists and is non-empty: `FileExistsError` to prevent silent
+  overwrites.
+- Tiles directory doesn't exist: created automatically.
+- `overview_levels` with `.vrt` path: `ValueError` (overviews don't apply).
+
+### Dask scheduling
+
+For dask inputs, all delayed tile-write tasks are submitted to
+`dask.compute()` at once. The scheduler manages parallelism and memory. Each
+task is: compute chunk, compress, write tile file. No coordination between
+tasks.
+
+## Out of scope
+
+- Streaming write of a monolithic `.tif` from dask input (tracked as a separate
+  issue). Users who need a single file from a large dask array can write to VRT
+  and convert externally, or ensure sufficient RAM.
+- JPEG quality parameter (separate concern from compression level).
+- Automatic chunk-size recommendation based on available memory.

From 95c79095a64a2b0a0f47b42205b3acc8400add2e Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Mon, 30 Mar 2026 10:59:14 -0700
Subject: [PATCH 2/3] Fix three accuracy bugs in reproject resampling kernels
 (#1086)

1. Nearest-neighbor: use floor(r + 0.5) instead of int(r + 0.5) to
   round correctly for negative fractional pixel coordinates.
   int() truncates toward zero, which maps [-1.0, -0.5) to pixel 0
   instead of out-of-bounds.

2. Cubic: check real neighbor indices against source bounds instead of
   clamping OOB indices to the edge pixel. Border replication masked
   the OOB condition, preventing the bilinear fallback that GDAL uses
   when the 4x4 stencil extends outside the source.

3. CuPy map_coordinates fallback: tighten the NaN contamination
   threshold from 0.1 to 1e-6 so that small NaN weights from distant
   cubic neighbors are caught, matching the native CUDA kernel behavior.
---
 xrspatial/reproject/_interpolate.py | 55 ++++++++++++-----------------
 xrspatial/tests/test_reproject.py   | 55 +++++++++++++++++++++++++++++
 2 files changed, 78 insertions(+), 32 deletions(-)

diff --git a/xrspatial/reproject/_interpolate.py b/xrspatial/reproject/_interpolate.py
index 7593007b..711cd467 100644
--- a/xrspatial/reproject/_interpolate.py
+++ b/xrspatial/reproject/_interpolate.py
@@ -46,16 +46,11 @@ def _resample_nearest_jit(src, row_coords, col_coords, nodata):
             if r < -1.0 or r > sh or c < -1.0 or c > sw:
                 out[i, j] = nodata
                 continue
-            ri = int(r + 0.5)
-            ci = int(c + 0.5)
-            if ri < 0:
-                ri = 0
-            if ri >= sh:
-                ri = sh - 1
-            if ci < 0:
-                ci = 0
-            if ci >= sw:
-                ci = sw - 1
+            ri = int(np.floor(r + 0.5))
+            ci = int(np.floor(c + 0.5))
+            if ri < 0 or ri >= sh or ci < 0 or ci >= sw:
+                out[i, j] = nodata
+                continue
             v = src[ri, ci]
             # NaN check: works for float64
             if v != v:
@@ -109,21 +104,17 @@ def _resample_cubic_jit(src, row_coords, col_coords, nodata):
             has_nan = False
             for di in range(4):
                 ri = r0 - 1 + di
-                ric = ri
-                if ric < 0:
-                    ric = 0
-                elif ric >= sh:
-                    ric = sh - 1
+                if ri < 0 or ri >= sh:
+                    has_nan = True
+                    break
                 # Interpolate along columns for this row
                 rv = 0.0
                 for dj in range(4):
                     cj = c0 - 1 + dj
-                    cjc = cj
-                    if cjc < 0:
-                        cjc = 0
-                    elif cjc >= sw:
-                        cjc = sw - 1
-                    sv = src[ric, cjc]
+                    if cj < 0 or cj >= sw:
+                        has_nan = True
+                        break
+                    sv = src[ri, cj]
                     if sv != sv:  # NaN check
                         has_nan = True
                         break
@@ -339,16 +330,11 @@ def _resample_nearest_cuda(src, row_coords, col_coords, out, nodata):
         if r < -1.0 or r > sh or c < -1.0 or c > sw:
             out[i, j] = nodata
             return
-        ri = int(r + 0.5)
-        ci = int(c + 0.5)
-        if ri < 0:
-            ri = 0
-        if ri >= sh:
-            ri = sh - 1
-        if ci < 0:
-            ci = 0
-        if ci >= sw:
-            ci = sw - 1
+        ri = int(math.floor(r + 0.5))
+        ci = int(math.floor(c + 0.5))
+        if ri < 0 or ri >= sh or ci < 0 or ci >= sw:
+            out[i, j] = nodata
+            return
         v = src[ri, ci]
         # NaN check
         if v != v:
@@ -453,6 +439,11 @@ def _resample_cubic_cuda(src, row_coords, col_coords, out, nodata):
         val = 0.0
         has_nan = False
 
+        # If any of the 4x4 stencil neighbors is outside source, fall
+        # back to bilinear (matches GDAL behavior for boundary pixels).
+        if r0 - 1 < 0 or r0 + 2 >= sh or c0 - 1 < 0 or c0 + 2 >= sw:
+            has_nan = True
+
         # Row 0
         ric = r0 - 1
         if ric < 0:
@@ -844,7 +835,7 @@ def _resample_cupy(source_window, src_row_coords, src_col_coords,
             nan_mask.astype(cp.float64), coords,
             order=order, mode='constant', cval=1.0
         ).reshape(src_row_coords.shape)
-        oob = oob | (nan_weight > 0.1)
+        oob = oob | (nan_weight > 1e-6)
 
     result[oob] = nodata
 
diff --git a/xrspatial/tests/test_reproject.py b/xrspatial/tests/test_reproject.py
index d883d9ef..a565b005 100644
--- a/xrspatial/tests/test_reproject.py
+++ b/xrspatial/tests/test_reproject.py
@@ -195,6 +195,61 @@ def test_resample_oob_fills_nodata(self):
         result = _resample_numpy(src, rows, cols, nodata=-999)
         assert result[0, 0] == -999
 
+    def test_nearest_negative_rounding(self):
+        """int(r + 0.5) must round toward -inf, not toward zero (#1086)."""
+        from xrspatial.reproject._interpolate import _resample_numpy
+        src = np.arange(1, 17, dtype=np.float64).reshape(4, 4)
+        # r = -0.6 is beyond the half-pixel boundary of pixel 0 -> nodata
+        rows = np.array([[-0.6]])
+        cols = np.array([[1.0]])
+        result = _resample_numpy(src, rows, cols, resampling='nearest', nodata=-999)
+        assert result[0, 0] == -999, (
+            f"r=-0.6 should be nodata, got {result[0, 0]}"
+        )
+        # r = -0.4 is within pixel 0's domain -> pixel 0
+        rows2 = np.array([[-0.4]])
+        result2 = _resample_numpy(src, rows2, cols, resampling='nearest', nodata=-999)
+        assert result2[0, 0] == src[0, 1], (
+            f"r=-0.4 should map to pixel 0, got {result2[0, 0]}"
+        )
+
+    def test_cubic_oob_fallback(self):
+        """Cubic must fall back to bilinear when stencil extends outside source (#1086)."""
+        from xrspatial.reproject._interpolate import _resample_numpy
+        # 6x6 source with a gradient
+        src = np.arange(36, dtype=np.float64).reshape(6, 6)
+        # Query at r=0.5, c=0.5: cubic stencil needs row -1, which is OOB.
+        # Should fall back to bilinear using pixels (0,0),(0,1),(1,0),(1,1).
+        rows = np.array([[0.5]])
+        cols = np.array([[0.5]])
+        cubic_result = _resample_numpy(src, rows, cols, resampling='cubic', nodata=-999)
+        bilinear_result = _resample_numpy(src, rows, cols, resampling='bilinear', nodata=-999)
+        # At the boundary, cubic should produce the same result as bilinear
+        np.testing.assert_allclose(
+            cubic_result, bilinear_result, atol=1e-10,
+            err_msg="Cubic near boundary should fall back to bilinear"
+        )
+        # Interior query at r=2.5, c=2.5: full stencil fits, cubic should differ from bilinear
+        rows_int = np.array([[2.5]])
+        cols_int = np.array([[2.5]])
+        cubic_int = _resample_numpy(src, rows_int, cols_int, resampling='cubic', nodata=-999)
+        bilinear_int = _resample_numpy(src, rows_int, cols_int, resampling='bilinear', nodata=-999)
+        # For a linear gradient, cubic and bilinear should agree closely
+        # but the point is the code path exercises the non-fallback branch
+        assert cubic_int[0, 0] != -999
+
+    def test_cubic_oob_bilinear_fallback_renormalizes(self):
+        """Cubic at (-0.8,-0.8): stencil OOB triggers bilinear, which
+        finds pixel (0,0) as the only valid neighbor and returns it (#1086)."""
+        from xrspatial.reproject._interpolate import _resample_numpy
+        src = np.arange(1, 17, dtype=np.float64).reshape(4, 4)
+        rows = np.array([[-0.8]])
+        cols = np.array([[-0.8]])
+        result = _resample_numpy(src, rows, cols, resampling='cubic', nodata=-999)
+        # bilinear fallback: r0=-1 (OOB), r1=0, c0=-1 (OOB), c1=0
+        # only (r1,c1)=(0,0) is valid -> returns src[0,0]=1.0
+        assert result[0, 0] == 1.0
+
     def test_invalid_resampling(self):
         from xrspatial.reproject._interpolate import _validate_resampling
         with pytest.raises(ValueError, match="resampling"):

From 9faf10636392e17b7d52483965a32d7097a1318d Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Mon, 30 Mar 2026 11:10:10 -0700
Subject: [PATCH 3/3] Address code review: edge-case tests and threshold
 comment (#1086)

- Test nearest-neighbor at exact boundary r=-0.5
- Test cubic fallback at far (bottom-right) edge
- Comment the 1e-6 NaN threshold rationale
---
 xrspatial/reproject/_interpolate.py |  3 +++
 xrspatial/tests/test_reproject.py   | 17 +++++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/xrspatial/reproject/_interpolate.py b/xrspatial/reproject/_interpolate.py
index 711cd467..6fe4ce6c 100644
--- a/xrspatial/reproject/_interpolate.py
+++ b/xrspatial/reproject/_interpolate.py
@@ -835,6 +835,9 @@ def _resample_cupy(source_window, src_row_coords, src_col_coords,
             nan_mask.astype(cp.float64), coords,
             order=order, mode='constant', cval=1.0
         ).reshape(src_row_coords.shape)
+        # Any nonzero weight means NaN pixels contributed to the output.
+        # Use 1e-6 instead of 0.0 to absorb floating-point interpolation
+        # noise from map_coordinates on the binary mask.
         oob = oob | (nan_weight > 1e-6)
 
     result[oob] = nodata
diff --git a/xrspatial/tests/test_reproject.py b/xrspatial/tests/test_reproject.py
index a565b005..db0c96f9 100644
--- a/xrspatial/tests/test_reproject.py
+++ b/xrspatial/tests/test_reproject.py
@@ -212,6 +212,12 @@ def test_nearest_negative_rounding(self):
         assert result2[0, 0] == src[0, 1], (
             f"r=-0.4 should map to pixel 0, got {result2[0, 0]}"
         )
+        # r = -0.5 is exactly on the half-pixel boundary: floor(-0.5+0.5)=0 -> pixel 0
+        rows3 = np.array([[-0.5]])
+        result3 = _resample_numpy(src, rows3, cols, resampling='nearest', nodata=-999)
+        assert result3[0, 0] == src[0, 1], (
+            f"r=-0.5 should map to pixel 0, got {result3[0, 0]}"
+        )
 
     def test_cubic_oob_fallback(self):
         """Cubic must fall back to bilinear when stencil extends outside source (#1086)."""
@@ -238,6 +244,17 @@ def test_cubic_oob_fallback(self):
         # but the point is the code path exercises the non-fallback branch
         assert cubic_int[0, 0] != -999
 
+    def test_cubic_oob_fallback_far_edge(self):
+        """Cubic at bottom-right boundary: stencil needs row sh, same fallback (#1086)."""
+        from xrspatial.reproject._interpolate import _resample_numpy
+        src = np.arange(36, dtype=np.float64).reshape(6, 6)
+        # r=4.5: cubic stencil needs row 6 (= sh), which is OOB
+        rows = np.array([[4.5]])
+        cols = np.array([[4.5]])
+        cubic = _resample_numpy(src, rows, cols, resampling='cubic', nodata=-999)
+        bilinear = _resample_numpy(src, rows, cols, resampling='bilinear', nodata=-999)
+        np.testing.assert_allclose(cubic, bilinear, atol=1e-10)
+
     def test_cubic_oob_bilinear_fallback_renormalizes(self):
         """Cubic at (-0.8,-0.8): stencil OOB triggers bilinear, which
         finds pixel (0,0) as the only valid neighbor and returns it (#1086)."""