From 42276f63041da4a9b6340037fcefe3ed122719b0 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Mon, 18 May 2026 21:33:37 -0700
Subject: [PATCH 1/3] geotiff: record security sweep pass 18 (MEDIUM Cat 1
 finding)

Re-audit of the geotiff subpackage on 2026-05-18 (deep-sweep p1).
NEW MEDIUM (Cat 1): the eager read_geotiff_gpu path skips the
per-tile byte cap that the CPU paths (_read_tiles,
_fetch_decode_cog_http_tiles) enforce via _max_tile_bytes_from_env().
A malformed local TIFF with TileByteCounts pointing into a large
file region can pass through GPU decode at sizes between 256 MiB
(CPU cap) and ~90% of free VRAM (GPU sum guard).

All other security categories verified clean: JPEG bomb cap (#1792),
HTTP read_all byte budget (#2057), VRT XML cap, DOCTYPE rejection,
path containment, SSRF defenses, dimension caps, IFD entry caps,
MAX_IFDS, MAX_PIXEL_ARRAY_COUNT, GPU bounds guards, atomic writes,
realpath canonicalization, dtype validation.
---
 .claude/sweep-security-state.csv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.claude/sweep-security-state.csv b/.claude/sweep-security-state.csv
index 68ebb65c8..720f3f28d 100644
--- a/.claude/sweep-security-state.csv
+++ b/.claude/sweep-security-state.csv
@@ -18,7 +18,7 @@ fire,2026-04-25,,,,,"Clean. Despite the module's size hint, fire.py is purely pe
 flood,2026-05-03,1437,MEDIUM,3,,Re-audit 2026-05-03. MEDIUM Cat 3 fixed in PR #1438 (travel_time and flood_depth_vegetation now validate mannings_n DataArray values are finite and strictly positive via _validate_mannings_n_dataarray helper). No remaining unfixed findings. Other categories clean: every allocation is same-shape as input; no flat index math; NaN propagation explicit in every backend; tan_slope clamped by _TAN_MIN; no CUDA kernels; no file I/O; every public API calls _validate_raster on DataArray inputs.
 focal,2026-04-27,1284,HIGH,1,,"HIGH (fixed PR #1286): apply(), focal_stats(), and hotspots() accepted unbounded user-supplied kernels via custom_kernel(), which only checks shape parity. The kernel-size guard from #1241 (_check_kernel_memory) only ran inside circle_kernel/annulus_kernel, so a (50001, 50001) custom kernel on a 10x10 raster allocated ~10 GB on the kernel itself plus a much larger padded raster before any work -- same shape as the bilateral DoS in #1236. Fixed by adding _check_kernel_vs_raster_memory in focal.py and wiring it into apply(), focal_stats(), and hotspots() after custom_kernel() validation. All 134 focal tests + 19 bilateral tests pass. No other findings: 10 CUDA kernels all have proper bounds + stencil guards; _validate_raster called on every public entry point; hotspots already raises ZeroDivisionError on constant-value rasters; _focal_variety_cuda uses a fixed-size local buffer (silent truncation but bounded); _focal_std_cuda/_focal_var_cuda clamp the catastrophic-cancellation case via if var < 0.0: var = 0.0; no file I/O."
 geodesic,2026-04-27,1283,HIGH,1,,"HIGH (fixed PR #1285): slope(method='geodesic') and aspect(method='geodesic') stack a (3, H, W) float64 array (data, lat, lon) before dispatch with no memory check. A large lat/lon-tagged raster passed to either function would OOM. Fixed by adding _check_geodesic_memory(rows, cols) in xrspatial/geodesic.py (mirrors morphology._check_kernel_memory): budgets 56 bytes/cell (24 stacked float64 + 4 float32 output + 24 padded copy + slack) and raises MemoryError when > 50% of available RAM; called from slope.py and aspect.py inside the geodesic branch before dispatch. No other findings: 6 CUDA kernels all have bounds guards (e.g. _run_gpu_geodesic_aspect at geodesic.py:395), custom 16x16 thread blocks avoid register spill, no shared memory, _validate_raster runs upstream in slope/aspect, all backends cast to float32, slope_mag < 1e-7 flat threshold prevents arctan2 NaN propagation, curvature correction uses hardcoded WGS84 R."
-geotiff,2026-05-13,1792,MEDIUM,1,,"Re-audit pass 17 2026-05-13 (deep-sweep s2). NEW MEDIUM (Cat 1): jpeg_decompress (_compression.py:1042-1066) hands attacker-controlled JPEG bytes to Pillow without consulting the declared tile width/height/samples; a tile-size mismatch lets a small JPEG payload allocate up to Pillow's MAX_IMAGE_PIXELS*2 (~178M pixels, ~500 MB RGB) before the downstream chunk.size != expected check fires. Asymmetric with the JP2K SIZ pre-check and LERC blob-info pre-check. Pillow's default DecompressionBombError is a partial guard so severity is MEDIUM. Other categories verified clean: Cat 2-6 same coverage as pass 16 audit; JPEG2000 / LERC / deflate / zstd / lz4 / packbits / LZW caps still in place; VRT _resample_nearest DstRect cap (#1737) merged; VRT path containment + DOCTYPE rejection in _safe_xml; CUDA kernels have bounds guards; mmap cache uses realpath; SSRF defenses on _HTTPSource."
+geotiff,2026-05-18,,MEDIUM,1,,"Re-audit pass 18 2026-05-18 (deep-sweep p1). NEW MEDIUM (Cat 1): read_geotiff_gpu eager path (_backends/gpu.py:466-489) skips the _max_tile_bytes_from_env() per-tile cap that _read_tiles (_reader.py:2084) and _fetch_decode_cog_http_tiles (_reader.py:2563) enforce. validate_tile_layout checks offsets count but not byte_counts entries. A malformed local TIFF with TileByteCounts pointing into a large file region can pass through to GPU decode, where _check_gpu_memory only catches the sum at ~90% of free VRAM, not the per-tile 256 MiB CPU limit. mmap bounds protect against bytes past EOF; sum guard catches extreme totals. Exploit surface is local files with byte_counts in the 256MB-1GB range. Other categories verified clean: JPEG bomb cap (#1792) merged; HTTP read_all byte budget (#2057) merged; VRT XML cap, DOCTYPE rejection, path containment, SSRF, _check_gpu_memory, validate_tile_layout, dimension caps, IFD entry caps, MAX_IFDS, MAX_PIXEL_ARRAY_COUNT all in place. Cat 2: pixel-array tag count caps prevent int32 overflow on indices. Cat 3: NaN handling consistent across backends. Cat 4: GPU kernels have bounds guards (_lzw_decode_tiles_kernel, _inflate_tiles_kernel, _predictor_decode_kernel_u*, _assemble_tiles_kernel); shared memory sized to constants. Cat 5: tempfile.mkstemp + os.replace atomic writes; realpath canonicalization in VRT path containment and mmap cache. Cat 6: dtype validation in resolve_bits_per_sample / resolve_sample_format + _validate_predictor_sample_format."
 glcm,2026-04-24,1257,HIGH,1,,"HIGH (fixed #1257): glcm_texture() validated window_size only as >= 3 and distance only as >= 1, with no upper bound on either. _glcm_numba_kernel iterates range(r-half, r+half+1) for every pixel, so window_size=1_000_001 on a 10x10 raster ran ~10^14 loop iterations with all neighbors failing the interior bounds check (CPU DoS). On the dask backends depth = window_size // 2 + distance drove map_overlap padding, so a huge window also caused oversize per-chunk allocations (memory DoS). Fixed by adding max_val caps in the public entrypoint: window_size <= max(3, min(rows, cols)) and distance <= max(1, window_size // 2). One cap covers every backend because cupy and dask+cupy call through to the CPU kernel after cupy.asnumpy. No other HIGH findings: levels is already capped at 256 so the per-pixel np.zeros((levels, levels)) matrix in the kernel is bounded to 512 KB. No CUDA kernels. No file I/O. Quantization clips to [0, levels-1] before the kernel and NaN maps to -1 which the kernel filters with i_val >= 0. Entropy log(p) and correlation p / (std_i * std_j) are both guarded. All four backends use _validate_raster and cast to float64 before quantizing. MEDIUM (unfixed, Cat 1): the per-pixel np.zeros((levels, levels)) allocation inside the hot loop is a perf issue (levels=256 -> 512 KB alloc+free per pixel) but not a security issue because levels is bounded. Could be hoisted out of the loop or replaced with an in-place clear, but that is an efficiency concern, not security."
 gpu_rtx,2026-04-29,1308,HIGH,1,,"HIGH (fixed #1308 / PR #1310): hillshade_rtx (gpu_rtx/hillshade.py:184) and viewshed_gpu (gpu_rtx/viewshed.py:269) allocated cupy device buffers sized by raster shape with no memory check. create_triangulation (mesh_utils.py:23-24) adds verts (12 B/px) + triangles (24 B/px) = 36 B/px; hillshade_rtx adds d_rays(32) + d_hits(16) + d_aux(12) + d_output(4) = 64 B/px (100 B/px total); viewshed_gpu adds d_rays(32) + d_hits(16) + d_visgrid(4) + d_vsrays(32) = 84 B/px (120 B/px total). A 30000x30000 raster asked for 90-108 GB of VRAM before cupy surfaced an opaque allocator error. Fixed by adding gpu_rtx/_memory.py with _available_gpu_memory_bytes() and _check_gpu_memory(func_name, h, w) helpers (cost_distance #1262 / sky_view_factor #1299 pattern, 120 B/px budget covers worst case, raises MemoryError when required > 50% of free VRAM, skips silently when memGetInfo() unavailable). Wired into both entry points after the cupy.ndarray type check and before create_triangulation. 9 new tests in test_gpu_rtx_memory.py (5 helper-unit + 4 end-to-end gated on has_rtx). All 81 existing hillshade/viewshed tests still pass. Cat 4 clean: all CUDA kernels (hillshade.py:25/62/106, viewshed.py:32/74/116, mesh_utils.py:50) have bounds guards; no shared memory, no syncthreads needed. MEDIUM not fixed (Cat 6): hillshade_rtx and viewshed_gpu do not call _validate_raster directly but parent hillshade() (hillshade.py:252) and viewshed() (viewshed.py:1707) already validate, so input validation runs before the gpu_rtx entry point - defense-in-depth, not exploitable. MEDIUM not fixed (Cat 2): mesh_utils.py:64-68 cast mesh_map_index to int32 in the triangle index buffer; overflows at H*W > 2.1B vertices (~46341x46341+) but the new memory guard rejects rasters that large first - documentation/clarity item rather than exploitable. 
MEDIUM not fixed (Cat 3): mesh_utils.py:19 scale = maxDim / maxH divides by zero on an all-zero raster, propagating inf/NaN into mesh vertex z-coords; separate follow-up. LOW not fixed (Cat 5): mesh_utils.write() opens user-supplied path without canonicalization but its only call site (mesh_utils.py:38-39) sits behind if False: in create_triangulation, not reachable in production."
 hillshade,2026-04-27,,,,,"Clean. Cat 1: only allocation is the output np.empty(data.shape) at line 32 (cupy at line 165) and a _pad_array with hardcoded depth=1 (line 62) -- bounded by caller, no user-controlled amplifier. Azimuth/altitude are scalars and don't drive size. Cat 2: numba kernel uses range(1, rows-1) with simple (y, x) indexing; numba range loops promote to int64. Cat 3: math.sqrt(1.0 + xx_plus_yy) is always >= 1.0 (no neg sqrt, no div-by-zero); NaN elevation propagates correctly through dz_dx/dz_dy -> shaded -> output (the shaded < 0.0 / shaded > 1.0 clamps don't fire on NaN). Azimuth validated to [0, 360], altitude to [0, 90]. Cat 4: _gpu_calc_numba (line 107) guards both grid bounds and 3x3 stencil reads via i > 0 and i < shape[0]-1 and j > 0 and j < shape[1]-1; no shared memory. Cat 5: no file I/O. Cat 6: hillshade() calls _validate_raster (line 252) and _validate_scalar for both azimuth (253) and angle_altitude (254); all four backend paths cast to float32; tests parametrize int32/int64/float32/float64."

From 1da7ec50b19043c8a5d17703dab5525a80bb4f30 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Mon, 18 May 2026 21:39:27 -0700
Subject: [PATCH 2/3] geotiff: apply per-tile byte cap on GPU read path (Cat 1
 MEDIUM)

The CPU readers ``_read_tiles`` and ``_fetch_decode_cog_http_tiles``
reject tiles whose declared ``TileByteCount`` exceeds the env-driven
``_max_tile_bytes_from_env()`` cap (default 256 MiB). The eager GPU
read path skipped this check; ``validate_tile_layout`` only bounds
the offsets array length, not the byte-count entries. A crafted
local TIFF with multi-hundred-MB ``TileByteCount`` values could
pass through to GPU decode where ``_check_gpu_memory`` catches only
the aggregate sum at ~90% of free VRAM, leaving the per-tile budget
asymmetric across the CPU and GPU contracts.

Add the same per-tile loop in ``read_geotiff_gpu`` (after
``validate_tile_layout``) so a single oversized tile is rejected
before any decode work runs. Mirrors the wording of the existing CPU
guards so callers see consistent error messages across backends.

Add ``test_gpu_tile_byte_cap_2026_05_18.py`` covering the rejection,
the wording (forged value + cap appear in the error), the env
override escape hatch, and the legit-read pass-through under the
default cap. The tests use the same forged-TIFF helpers as the CPU
companion suite ``test_local_tile_byte_cap_1664.py``.
---
 .claude/sweep-security-state.csv              |   2 +-
 xrspatial/geotiff/_backends/gpu.py            |  26 ++-
 .../test_gpu_tile_byte_cap_2026_05_18.py      | 172 ++++++++++++++++++
 3 files changed, 198 insertions(+), 2 deletions(-)
 create mode 100644 xrspatial/geotiff/tests/test_gpu_tile_byte_cap_2026_05_18.py

diff --git a/.claude/sweep-security-state.csv b/.claude/sweep-security-state.csv
index 720f3f28d..a58dab2a8 100644
--- a/.claude/sweep-security-state.csv
+++ b/.claude/sweep-security-state.csv
@@ -18,7 +18,7 @@ fire,2026-04-25,,,,,"Clean. Despite the module's size hint, fire.py is purely pe
 flood,2026-05-03,1437,MEDIUM,3,,Re-audit 2026-05-03. MEDIUM Cat 3 fixed in PR #1438 (travel_time and flood_depth_vegetation now validate mannings_n DataArray values are finite and strictly positive via _validate_mannings_n_dataarray helper). No remaining unfixed findings. Other categories clean: every allocation is same-shape as input; no flat index math; NaN propagation explicit in every backend; tan_slope clamped by _TAN_MIN; no CUDA kernels; no file I/O; every public API calls _validate_raster on DataArray inputs.
 focal,2026-04-27,1284,HIGH,1,,"HIGH (fixed PR #1286): apply(), focal_stats(), and hotspots() accepted unbounded user-supplied kernels via custom_kernel(), which only checks shape parity. The kernel-size guard from #1241 (_check_kernel_memory) only ran inside circle_kernel/annulus_kernel, so a (50001, 50001) custom kernel on a 10x10 raster allocated ~10 GB on the kernel itself plus a much larger padded raster before any work -- same shape as the bilateral DoS in #1236. Fixed by adding _check_kernel_vs_raster_memory in focal.py and wiring it into apply(), focal_stats(), and hotspots() after custom_kernel() validation. All 134 focal tests + 19 bilateral tests pass. No other findings: 10 CUDA kernels all have proper bounds + stencil guards; _validate_raster called on every public entry point; hotspots already raises ZeroDivisionError on constant-value rasters; _focal_variety_cuda uses a fixed-size local buffer (silent truncation but bounded); _focal_std_cuda/_focal_var_cuda clamp the catastrophic-cancellation case via if var < 0.0: var = 0.0; no file I/O."
 geodesic,2026-04-27,1283,HIGH,1,,"HIGH (fixed PR #1285): slope(method='geodesic') and aspect(method='geodesic') stack a (3, H, W) float64 array (data, lat, lon) before dispatch with no memory check. A large lat/lon-tagged raster passed to either function would OOM. Fixed by adding _check_geodesic_memory(rows, cols) in xrspatial/geodesic.py (mirrors morphology._check_kernel_memory): budgets 56 bytes/cell (24 stacked float64 + 4 float32 output + 24 padded copy + slack) and raises MemoryError when > 50% of available RAM; called from slope.py and aspect.py inside the geodesic branch before dispatch. No other findings: 6 CUDA kernels all have bounds guards (e.g. _run_gpu_geodesic_aspect at geodesic.py:395), custom 16x16 thread blocks avoid register spill, no shared memory, _validate_raster runs upstream in slope/aspect, all backends cast to float32, slope_mag < 1e-7 flat threshold prevents arctan2 NaN propagation, curvature correction uses hardcoded WGS84 R."
-geotiff,2026-05-18,,MEDIUM,1,,"Re-audit pass 18 2026-05-18 (deep-sweep p1). NEW MEDIUM (Cat 1): read_geotiff_gpu eager path (_backends/gpu.py:466-489) skips the _max_tile_bytes_from_env() per-tile cap that _read_tiles (_reader.py:2084) and _fetch_decode_cog_http_tiles (_reader.py:2563) enforce. validate_tile_layout checks offsets count but not byte_counts entries. A malformed local TIFF with TileByteCounts pointing into a large file region can pass through to GPU decode, where _check_gpu_memory only catches the sum at ~90% of free VRAM, not the per-tile 256 MiB CPU limit. mmap bounds protect against bytes past EOF; sum guard catches extreme totals. Exploit surface is local files with byte_counts in the 256MB-1GB range. Other categories verified clean: JPEG bomb cap (#1792) merged; HTTP read_all byte budget (#2057) merged; VRT XML cap, DOCTYPE rejection, path containment, SSRF, _check_gpu_memory, validate_tile_layout, dimension caps, IFD entry caps, MAX_IFDS, MAX_PIXEL_ARRAY_COUNT all in place. Cat 2: pixel-array tag count caps prevent int32 overflow on indices. Cat 3: NaN handling consistent across backends. Cat 4: GPU kernels have bounds guards (_lzw_decode_tiles_kernel, _inflate_tiles_kernel, _predictor_decode_kernel_u*, _assemble_tiles_kernel); shared memory sized to constants. Cat 5: tempfile.mkstemp + os.replace atomic writes; realpath canonicalization in VRT path containment and mmap cache. Cat 6: dtype validation in resolve_bits_per_sample / resolve_sample_format + _validate_predictor_sample_format."
+geotiff,2026-05-18,,MEDIUM,1,,"Re-audit pass 18 2026-05-18 (deep-sweep p1). MEDIUM Cat 1 fixed in deep-sweep-security-geotiff-2026-05-18-p1: read_geotiff_gpu eager path (_backends/gpu.py) now applies the same _max_tile_bytes_from_env() per-tile cap that _read_tiles and _fetch_decode_cog_http_tiles enforce. The CPU and GPU readers now agree on the per-tile budget; a malformed local TIFF with TileByteCounts pointing into a large file region is rejected before GPU decode rather than relying on _check_gpu_memory's aggregate-sum guard. Test: tests/test_gpu_tile_byte_cap_2026_05_18.py. Other categories verified clean: JPEG bomb cap (#1792), HTTP read_all byte budget (#2057), VRT XML cap, DOCTYPE rejection, path containment, SSRF, validate_tile_layout, dimension caps, IFD entry caps, MAX_IFDS, MAX_PIXEL_ARRAY_COUNT, GPU bounds guards, atomic writes, realpath canonicalization, dtype validation."
 glcm,2026-04-24,1257,HIGH,1,,"HIGH (fixed #1257): glcm_texture() validated window_size only as >= 3 and distance only as >= 1, with no upper bound on either. _glcm_numba_kernel iterates range(r-half, r+half+1) for every pixel, so window_size=1_000_001 on a 10x10 raster ran ~10^14 loop iterations with all neighbors failing the interior bounds check (CPU DoS). On the dask backends depth = window_size // 2 + distance drove map_overlap padding, so a huge window also caused oversize per-chunk allocations (memory DoS). Fixed by adding max_val caps in the public entrypoint: window_size <= max(3, min(rows, cols)) and distance <= max(1, window_size // 2). One cap covers every backend because cupy and dask+cupy call through to the CPU kernel after cupy.asnumpy. No other HIGH findings: levels is already capped at 256 so the per-pixel np.zeros((levels, levels)) matrix in the kernel is bounded to 512 KB. No CUDA kernels. No file I/O. Quantization clips to [0, levels-1] before the kernel and NaN maps to -1 which the kernel filters with i_val >= 0. Entropy log(p) and correlation p / (std_i * std_j) are both guarded. All four backends use _validate_raster and cast to float64 before quantizing. MEDIUM (unfixed, Cat 1): the per-pixel np.zeros((levels, levels)) allocation inside the hot loop is a perf issue (levels=256 -> 512 KB alloc+free per pixel) but not a security issue because levels is bounded. Could be hoisted out of the loop or replaced with an in-place clear, but that is an efficiency concern, not security."
 gpu_rtx,2026-04-29,1308,HIGH,1,,"HIGH (fixed #1308 / PR #1310): hillshade_rtx (gpu_rtx/hillshade.py:184) and viewshed_gpu (gpu_rtx/viewshed.py:269) allocated cupy device buffers sized by raster shape with no memory check. create_triangulation (mesh_utils.py:23-24) adds verts (12 B/px) + triangles (24 B/px) = 36 B/px; hillshade_rtx adds d_rays(32) + d_hits(16) + d_aux(12) + d_output(4) = 64 B/px (100 B/px total); viewshed_gpu adds d_rays(32) + d_hits(16) + d_visgrid(4) + d_vsrays(32) = 84 B/px (120 B/px total). A 30000x30000 raster asked for 90-108 GB of VRAM before cupy surfaced an opaque allocator error. Fixed by adding gpu_rtx/_memory.py with _available_gpu_memory_bytes() and _check_gpu_memory(func_name, h, w) helpers (cost_distance #1262 / sky_view_factor #1299 pattern, 120 B/px budget covers worst case, raises MemoryError when required > 50% of free VRAM, skips silently when memGetInfo() unavailable). Wired into both entry points after the cupy.ndarray type check and before create_triangulation. 9 new tests in test_gpu_rtx_memory.py (5 helper-unit + 4 end-to-end gated on has_rtx). All 81 existing hillshade/viewshed tests still pass. Cat 4 clean: all CUDA kernels (hillshade.py:25/62/106, viewshed.py:32/74/116, mesh_utils.py:50) have bounds guards; no shared memory, no syncthreads needed. MEDIUM not fixed (Cat 6): hillshade_rtx and viewshed_gpu do not call _validate_raster directly but parent hillshade() (hillshade.py:252) and viewshed() (viewshed.py:1707) already validate, so input validation runs before the gpu_rtx entry point - defense-in-depth, not exploitable. MEDIUM not fixed (Cat 2): mesh_utils.py:64-68 cast mesh_map_index to int32 in the triangle index buffer; overflows at H*W > 2.1B vertices (~46341x46341+) but the new memory guard rejects rasters that large first - documentation/clarity item rather than exploitable. 
MEDIUM not fixed (Cat 3): mesh_utils.py:19 scale = maxDim / maxH divides by zero on an all-zero raster, propagating inf/NaN into mesh vertex z-coords; separate follow-up. LOW not fixed (Cat 5): mesh_utils.write() opens user-supplied path without canonicalization but its only call site (mesh_utils.py:38-39) sits behind if False: in create_triangulation, not reachable in production."
 hillshade,2026-04-27,,,,,"Clean. Cat 1: only allocation is the output np.empty(data.shape) at line 32 (cupy at line 165) and a _pad_array with hardcoded depth=1 (line 62) -- bounded by caller, no user-controlled amplifier. Azimuth/altitude are scalars and don't drive size. Cat 2: numba kernel uses range(1, rows-1) with simple (y, x) indexing; numba range loops promote to int64. Cat 3: math.sqrt(1.0 + xx_plus_yy) is always >= 1.0 (no neg sqrt, no div-by-zero); NaN elevation propagates correctly through dz_dx/dz_dy -> shaded -> output (the shaded < 0.0 / shaded > 1.0 clamps don't fire on NaN). Azimuth validated to [0, 360], altitude to [0, 90]. Cat 4: _gpu_calc_numba (line 107) guards both grid bounds and 3x3 stencil reads via i > 0 and i < shape[0]-1 and j > 0 and j < shape[1]-1; no shared memory. Cat 5: no file I/O. Cat 6: hillshade() calls _validate_raster (line 252) and _validate_scalar for both azimuth (253) and angle_altitude (254); all four backend paths cast to float32; tests parametrize int32/int64/float32/float64."
diff --git a/xrspatial/geotiff/_backends/gpu.py b/xrspatial/geotiff/_backends/gpu.py
index 4333a00e6..44fd0f0f5 100644
--- a/xrspatial/geotiff/_backends/gpu.py
+++ b/xrspatial/geotiff/_backends/gpu.py
@@ -254,7 +254,7 @@ def read_geotiff_gpu(source: str, *,
 
     from .._reader import (
         _FileSource, _check_dimensions, MAX_PIXELS_DEFAULT, _coerce_path,
-        _resolve_masked_fill,
+        _max_tile_bytes_from_env, _resolve_masked_fill,
     )
     from .._compression import COMPRESSION_LERC
     from .._header import (
@@ -488,6 +488,30 @@ def read_geotiff_gpu(source: str, *,
         # read OOB otherwise. See issue #1219.
         validate_tile_layout(ifd)
 
+        # Per-tile compressed-byte cap, matching the CPU paths
+        # ``_read_tiles`` and ``_fetch_decode_cog_http_tiles`` apply
+        # via the same env var (issue #1664). ``validate_tile_layout``
+        # bounds the offsets array length but not the byte_counts
+        # entries; a crafted ``TileByteCounts`` value can still ask
+        # the GPU pipeline to fetch and decompress a multi-hundred-MB
+        # tile that the CPU paths would already refuse. The
+        # ``_check_gpu_memory`` guard in the downstream kvikio /
+        # nvCOMP paths runs against ``sum(byte_counts)`` so it only
+        # catches the extreme aggregate case; this loop closes the
+        # per-tile asymmetry between the CPU and GPU readers.
+        max_tile_bytes = _max_tile_bytes_from_env()
+        for _tile_idx, _bc in enumerate(byte_counts):
+            if _bc > max_tile_bytes:
+                raise ValueError(
+                    f"TIFF tile {_tile_idx} declares "
+                    f"TileByteCount={_bc:,} bytes, which exceeds the "
+                    f"per-tile safety cap of {max_tile_bytes:,} bytes. "
+                    f"The file is malformed or attempting "
+                    f"denial-of-service. Override via "
+                    f"XRSPATIAL_COG_MAX_TILE_BYTES if this file is "
+                    f"legitimate."
+                )
+
     finally:
         src.close()
 
diff --git a/xrspatial/geotiff/tests/test_gpu_tile_byte_cap_2026_05_18.py b/xrspatial/geotiff/tests/test_gpu_tile_byte_cap_2026_05_18.py
new file mode 100644
index 000000000..8ca776ad2
--- /dev/null
+++ b/xrspatial/geotiff/tests/test_gpu_tile_byte_cap_2026_05_18.py
@@ -0,0 +1,172 @@
+"""GPU read path per-tile byte cap (security sweep follow-up).
+
+The CPU readers ``_read_tiles`` (xrspatial/geotiff/_reader.py:2084) and
+``_fetch_decode_cog_http_tiles`` (xrspatial/geotiff/_reader.py:2563)
+reject a tile whose declared ``TileByteCount`` exceeds the env-driven
+``_max_tile_bytes_from_env()`` cap (default 256 MiB). The eager GPU
+read path in ``xrspatial.geotiff._backends.gpu.read_geotiff_gpu`` did
+not run the same check; ``validate_tile_layout`` bounds the offsets
+array length but not the byte-count entries. A crafted local TIFF with
+a multi-hundred-MB ``TileByteCount`` could then pass through to GPU
+decode, where ``_check_gpu_memory`` only catches the aggregate at
+~90% of free VRAM and not the per-tile asymmetry between the CPU and
+GPU paths.
+
+The GPU eager path now applies the same per-tile cap so the CPU and
+GPU contracts agree. These tests cover the rejection, the wording of
+the rejection message, the env-override escape hatch, and the legit-
+read pass-through under the default cap.
+
+Mirrors the structure of ``test_local_tile_byte_cap_1664.py`` for the
+CPU paths so a side-by-side comparison is easy.
+"""
+from __future__ import annotations
+
+import importlib.util
+import struct
+
+import numpy as np
+import pytest
+import xarray as xr
+
+from xrspatial.geotiff import read_geotiff_gpu, to_geotiff
+
+
+def _cupy_available() -> bool:
+    if importlib.util.find_spec("cupy") is None:
+        return False
+    try:
+        import cupy
+
+        return bool(cupy.cuda.is_available())
+    except Exception:
+        return False
+
+
+_HAS_GPU = _cupy_available()
+_gpu_only = pytest.mark.skipif(
+    not _HAS_GPU, reason="cupy + CUDA required for the GPU read path",
+)
+
+
+def _patch_byte_counts(data: bytearray, tag: int, value: int) -> None:
+    """Rewrite every entry for *tag* in the first IFD.
+
+    Mirrors the helper in ``test_local_tile_byte_cap_1664.py``: parses
+    the TIFF header, walks the IFD, and writes ``value`` over every
+    occurrence of the named tag's value array. ``tag=325`` is
+    ``TileByteCounts``; ``tag=279`` is ``StripByteCounts``.
+    """
+    from xrspatial.geotiff._header import parse_header
+
+    header = parse_header(bytes(data))
+    bo = header.byte_order
+    ifd_offset = header.first_ifd_offset
+    num_entries = struct.unpack_from(f"{bo}H", data, ifd_offset)[0]
+    entry_offset = ifd_offset + 2
+
+    for i in range(num_entries):
+        eo = entry_offset + i * 12
+        cur_tag = struct.unpack_from(f"{bo}H", data, eo)[0]
+        if cur_tag != tag:
+            continue
+        type_id = struct.unpack_from(f"{bo}H", data, eo + 2)[0]
+        count = struct.unpack_from(f"{bo}I", data, eo + 4)[0]
+        if type_id == 4:  # LONG
+            total = count * 4
+            if total <= 4:
+                for k in range(count):
+                    struct.pack_into(f"{bo}I", data, eo + 8 + k * 4, value)
+            else:
+                ptr = struct.unpack_from(f"{bo}I", data, eo + 8)[0]
+                for k in range(count):
+                    struct.pack_into(f"{bo}I", data, ptr + k * 4, value)
+        elif type_id == 3:  # SHORT
+            clipped = min(value, 0xFFFF)
+            total = count * 2
+            if total <= 4:
+                for k in range(count):
+                    struct.pack_into(
+                        f"{bo}H", data, eo + 8 + k * 2, clipped)
+            else:
+                ptr = struct.unpack_from(f"{bo}I", data, eo + 8)[0]
+                for k in range(count):
+                    struct.pack_into(
+                        f"{bo}H", data, ptr + k * 2, clipped)
+        return
+    raise AssertionError(f"tag {tag} not found in IFD")
+
+
+def _build_forged_tiled_cog(tmp_path, byte_count_value: int) -> str:
+    """Write a real tiled COG, patch every TileByteCounts entry, return path."""
+    arr = np.arange(64 * 64, dtype=np.float32).reshape(64, 64)
+    da = xr.DataArray(arr, dims=["y", "x"])
+    path = str(tmp_path / "forged_gpu_tiles_2026_05_18.tif")
+    to_geotiff(da, path, tile_size=32, compression="deflate")
+    with open(path, "rb") as f:
+        data = bytearray(f.read())
+    _patch_byte_counts(data, 325, byte_count_value)
+    with open(path, "wb") as f:
+        f.write(data)
+    return path
+
+
+# ---------------------------------------------------------------------------
+# GPU eager path: per-tile byte cap
+# ---------------------------------------------------------------------------
+
+
+class TestGpuTileByteCap:
+    @_gpu_only
+    def test_huge_tile_byte_count_rejected(self, tmp_path, monkeypatch):
+        """A local tile with a huge TileByteCount raises before GPU decode."""
+        path = _build_forged_tiled_cog(tmp_path, 100 * 1024 * 1024)
+        monkeypatch.setenv("XRSPATIAL_COG_MAX_TILE_BYTES", str(1024 * 1024))
+
+        with pytest.raises(ValueError, match="TileByteCount"):
+            read_geotiff_gpu(path)
+
+    @_gpu_only
+    def test_error_message_names_value_and_cap(self, tmp_path, monkeypatch):
+        path = _build_forged_tiled_cog(tmp_path, 50 * 1024 * 1024)
+        monkeypatch.setenv("XRSPATIAL_COG_MAX_TILE_BYTES", str(1024))
+
+        with pytest.raises(ValueError) as excinfo:
+            read_geotiff_gpu(path)
+        msg = str(excinfo.value)
+        # The forged value (52,428,800) and the cap (1,024) both appear.
+        assert "52,428,800" in msg or "52428800" in msg
+        assert "1,024" in msg or "1024" in msg
+        assert "denial-of-service" in msg.lower() or "malformed" in msg
+
+    @_gpu_only
+    def test_normal_gpu_read_under_default_cap(self, tmp_path):
+        """Legitimate GPU reads with the default cap still succeed."""
+        arr = np.arange(64 * 64, dtype=np.float32).reshape(64, 64)
+        da = xr.DataArray(arr, dims=["y", "x"])
+        path = str(tmp_path / "normal_gpu_2026_05_18.tif")
+        to_geotiff(da, path, tile_size=32, compression="deflate")
+
+        result = read_geotiff_gpu(path)
+        # CuPy -> numpy for comparison.
+        np.testing.assert_array_equal(result.data.get(), arr)
+
+    @_gpu_only
+    def test_env_override_lifts_cap(self, tmp_path, monkeypatch):
+        """A user with legitimate large tiles can lift the cap via env."""
+        path = _build_forged_tiled_cog(tmp_path, 50 * 1024 * 1024)
+        monkeypatch.setenv(
+            "XRSPATIAL_COG_MAX_TILE_BYTES", str(64 * 1024 * 1024))
+
+        # The decompressor may raise on the truncated mmap slice, but
+        # the per-tile cap error must not be the source. Match the
+        # behaviour pinned by ``test_env_override_lifts_cap`` in the
+        # CPU companion module.
+        try:
+            read_geotiff_gpu(path)
+        except ValueError as exc:
+            assert "exceeds the per-tile safety cap" not in str(exc)
+        except Exception:
+            # Codec failures on the truncated payload are acceptable;
+            # we only care that the cap check did not fire.
+            pass

From 6d68acd245e7ebea9e22d06da2dc0f99f52572e3 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Tue, 19 May 2026 06:37:00 -0700
Subject: [PATCH 3/3] geotiff: extend per-tile byte cap to dask + GPU paths
 (#2113 review)

Self-review caught several issues. Address them.

- Extend the per-tile cap to the dask + GPU chunked path. The eager
  GPU path's cap landed at the right spot but the chunked path
  (``_read_geotiff_gpu_chunked`` -> ``_read_geotiff_gpu_chunked_gds``
  or fall-through to ``read_geotiff_dask``) parsed IFDs and built a
  dask graph without applying the cap, so a forged file could slip
  past at graph-build time. The cap now fires at metadata parse time
  inside ``_read_geotiff_gpu_chunked`` before any qualification
  probe, and is also applied inside ``_read_geotiff_gpu_chunked_gds``
  for the case where that function is reached directly.

- Hoist the TIFF byte-surgery helper out of the per-test file into
  ``tests/_tiff_surgery.py``. The CPU companion test and the new GPU
  test were carrying near-identical copies of the same struct-packing
  helper. Drift between the two would silently desync the cap tests.

- Spell out the sparse-tile pass-through behaviour in the eager and
  chunked GPU comments so a future reader does not add a ``bc > 0``
  guard and accidentally reject sparse files. Sparse tiles
  (``byte_count == 0``) pass under any positive cap by design, mirroring
  the CPU path in ``_reader.py``.

- Tighten ``test_env_override_lifts_cap`` to assert the cap message
  is *not* in any exception fired, rather than swallowing all errors
  via a bare ``except``. A regression that re-fires the cap through a
  different error path now produces a focused failure.

- Drop the misleading underscore-prefix loop variables in the eager
  GPU cap loop; both names are read inside the body.

- Cover the chunked + GPU cap with a parallel regression test.
---
 xrspatial/geotiff/_backends/gpu.py            |  83 ++++++++++++--
 xrspatial/geotiff/tests/_tiff_surgery.py      |  75 +++++++++++++
 .../test_gpu_tile_byte_cap_2026_05_18.py      | 104 ++++++++----------
 .../tests/test_local_tile_byte_cap_1664.py    |  43 +-------
 4 files changed, 196 insertions(+), 109 deletions(-)
 create mode 100644 xrspatial/geotiff/tests/_tiff_surgery.py

diff --git a/xrspatial/geotiff/_backends/gpu.py b/xrspatial/geotiff/_backends/gpu.py
index 44fd0f0f5..f1dbcfbbb 100644
--- a/xrspatial/geotiff/_backends/gpu.py
+++ b/xrspatial/geotiff/_backends/gpu.py
@@ -498,13 +498,16 @@ def read_geotiff_gpu(source: str, *,
         # ``_check_gpu_memory`` guard in the downstream kvikio /
         # nvCOMP paths runs against ``sum(byte_counts)`` so it only
         # catches the extreme aggregate case; this loop closes the
-        # per-tile asymmetry between the CPU and GPU readers.
+        # per-tile asymmetry between the CPU and GPU readers. Sparse
+        # tiles (``byte_count == 0``) pass under any positive cap by
+        # design -- they carry no compressed bytes to decode and the
+        # CPU mirror at ``_reader.py`` does the same.
         max_tile_bytes = _max_tile_bytes_from_env()
-        for _tile_idx, _bc in enumerate(byte_counts):
-            if _bc > max_tile_bytes:
+        for tile_idx, bc in enumerate(byte_counts):
+            if bc > max_tile_bytes:
                 raise ValueError(
-                    f"TIFF tile {_tile_idx} declares "
-                    f"TileByteCount={_bc:,} bytes, which exceeds the "
+                    f"TIFF tile {tile_idx} declares "
+                    f"TileByteCount={bc:,} bytes, which exceeds the "
                     f"per-tile safety cap of {max_tile_bytes:,} bytes. "
                     f"The file is malformed or attempting "
                     f"denial-of-service. Override via "
@@ -959,12 +962,56 @@ def _read_geotiff_gpu_chunked(source, *, dtype, chunks, overview_level,
     """
     import cupy
 
-    from .._reader import _FileSource, _coerce_path
+    from .._reader import (
+        _FileSource, _coerce_path, _max_tile_bytes_from_env,
+    )
     from .._header import parse_header, parse_all_ifds, select_overview_ifd
     from .._geotags import extract_geo_info_with_overview_inheritance
 
     src_path = _coerce_path(source)
 
+    # Per-tile compressed-byte cap, mirroring the eager GPU path and
+    # the CPU readers (issue #1664 + the GPU eager fix in this PR).
+    # The chunked dask + GPU path either qualifies for the GDS fast
+    # path (handled in ``_read_geotiff_gpu_chunked_gds`` which runs
+    # the same cap on its own metadata parse) or falls through to
+    # ``read_geotiff_dask`` whose per-chunk ``read_to_array`` calls
+    # apply the cap inside the CPU reader. The check here closes the
+    # window between "qualification probe parses the IFDs" and "the
+    # dispatch decides which path to take" so a forged tile is
+    # rejected at graph-build time rather than at first ``.compute()``.
+    # Sparse tiles (``byte_count == 0``) pass under any positive cap
+    # by design.
+    if isinstance(src_path, str) and not src_path.startswith(
+            ('http://', 'https://')):
+        try:
+            _cap_fs = _FileSource(src_path)
+            try:
+                _cap_raw = _cap_fs.read_all()
+            finally:
+                _cap_fs.close()
+            _cap_header = parse_header(_cap_raw)
+            _cap_ifds = parse_all_ifds(_cap_raw, _cap_header)
+            _cap_ifd = select_overview_ifd(_cap_ifds, overview_level)
+            _cap_byte_counts = _cap_ifd.tile_byte_counts
+        except Exception:
+            # If metadata parse fails here, the downstream path will
+            # surface a clear error; do not double-report.
+            _cap_byte_counts = None
+        if _cap_byte_counts is not None:
+            _cap = _max_tile_bytes_from_env()
+            for _tile_idx, _bc in enumerate(_cap_byte_counts):
+                if _bc > _cap:
+                    raise ValueError(
+                        f"TIFF tile {_tile_idx} declares "
+                        f"TileByteCount={_bc:,} bytes, which exceeds "
+                        f"the per-tile safety cap of {_cap:,} bytes. "
+                        f"The file is malformed or attempting "
+                        f"denial-of-service. Override via "
+                        f"XRSPATIAL_COG_MAX_TILE_BYTES if this file "
+                        f"is legitimate."
+                    )
+
     # Try the disk->GPU path. Parse metadata once; if the file does not
     # qualify, fall through to the CPU-decode path. Any unexpected
     # exception during the qualification probe also falls through so we
@@ -1050,7 +1097,8 @@ def _read_geotiff_gpu_chunked_gds(source, ifd, geo_info, header, *,
     import dask.array as da_mod
 
     from .._reader import (
-        _check_dimensions, MAX_PIXELS_DEFAULT, _resolve_masked_fill,
+        _check_dimensions, MAX_PIXELS_DEFAULT,
+        _max_tile_bytes_from_env, _resolve_masked_fill,
     )
     from .._compression import COMPRESSION_LERC
     from .._header import validate_tile_layout
@@ -1077,6 +1125,27 @@ def _read_geotiff_gpu_chunked_gds(source, ifd, geo_info, header, *,
     _check_dimensions(tw, th, samples, max_pixels)
     validate_tile_layout(ifd)
 
+    # Per-tile compressed-byte cap, mirroring the eager GPU path's loop
+    # (issue #1664 + the original eager fix above). The chunked GDS
+    # graph fans tile reads out across dask tasks, so a forged
+    # ``TileByteCount`` would otherwise slip past every task's GDS
+    # request and the downstream ``_check_gpu_memory`` guard, which
+    # only catches the aggregate sum. Running the check here means the
+    # dask graph never builds for a hostile file. Sparse tiles
+    # (``byte_count == 0``) pass under any positive cap by design.
+    max_tile_bytes = _max_tile_bytes_from_env()
+    for tile_idx, bc in enumerate(byte_counts):
+        if bc > max_tile_bytes:
+            raise ValueError(
+                f"TIFF tile {tile_idx} declares "
+                f"TileByteCount={bc:,} bytes, which exceeds the "
+                f"per-tile safety cap of {max_tile_bytes:,} bytes. "
+                f"The file is malformed or attempting "
+                f"denial-of-service. Override via "
+                f"XRSPATIAL_COG_MAX_TILE_BYTES if this file is "
+                f"legitimate."
+            )
+
     # Window restricts the visible region; offsets are computed relative
     # to the windowed origin so chunks line up with the user's request.
     if window is not None:
diff --git a/xrspatial/geotiff/tests/_tiff_surgery.py b/xrspatial/geotiff/tests/_tiff_surgery.py
new file mode 100644
index 000000000..4f269c776
--- /dev/null
+++ b/xrspatial/geotiff/tests/_tiff_surgery.py
@@ -0,0 +1,75 @@
+"""In-place TIFF byte-surgery helpers shared by security-cap tests.
+
+The local strip / tile byte-cap tests and the GPU per-tile byte-cap
+test both need to forge a TIFF whose declared ``TileByteCounts`` (tag
+325) or ``StripByteCounts`` (tag 279) entries exceed the production
+cap. They each parse the leading IFD and rewrite every matching tag's
+value array in place. Keeping two near-identical copies of that
+surgery in two test files invited drift, so the helpers now live here.
+
+Not part of the public API; used only by the test suite.
+"""
+from __future__ import annotations
+
+import struct
+
+
+def patch_byte_counts(data: bytearray, tag: int, value: int) -> None:
+    """Rewrite every entry for *tag* in the first IFD of *data*.
+
+    Parameters
+    ----------
+    data : bytearray
+        Mutable TIFF file bytes (entire file). Mutated in place.
+    tag : int
+        ``325`` for ``TileByteCounts`` or ``279`` for ``StripByteCounts``.
+        Other tags work mechanically but the helper exists for those two.
+    value : int
+        New value to stamp into every byte-count entry. For ``SHORT``
+        (type 3) entries the value is clipped to ``0xFFFF`` because the
+        on-disk slot is 16-bit; tests that need a multi-MB value must
+        ensure the source file was written with a ``LONG`` (type 4) tag.
+
+    Raises
+    ------
+    AssertionError
+        When ``tag`` is not present in the first IFD.
+    """
+    from xrspatial.geotiff._header import parse_header
+
+    header = parse_header(bytes(data))
+    bo = header.byte_order
+    ifd_offset = header.first_ifd_offset
+    num_entries = struct.unpack_from(f"{bo}H", data, ifd_offset)[0]
+    entry_offset = ifd_offset + 2
+
+    for i in range(num_entries):
+        eo = entry_offset + i * 12
+        cur_tag = struct.unpack_from(f"{bo}H", data, eo)[0]
+        if cur_tag != tag:
+            continue
+        type_id = struct.unpack_from(f"{bo}H", data, eo + 2)[0]
+        count = struct.unpack_from(f"{bo}I", data, eo + 4)[0]
+        if type_id == 4:  # LONG
+            total = count * 4
+            if total <= 4:
+                for k in range(count):
+                    struct.pack_into(f"{bo}I", data, eo + 8 + k * 4, value)
+            else:
+                ptr = struct.unpack_from(f"{bo}I", data, eo + 8)[0]
+                for k in range(count):
+                    struct.pack_into(f"{bo}I", data, ptr + k * 4, value)
+        elif type_id == 3:  # SHORT
+            clipped = min(value, 0xFFFF)
+            total = count * 2
+            if total <= 4:
+                for k in range(count):
+                    struct.pack_into(
+                        f"{bo}H", data, eo + 8 + k * 2, clipped)
+            else:
+                ptr = struct.unpack_from(f"{bo}I", data, eo + 8)[0]
+                for k in range(count):
+                    struct.pack_into(
+                        f"{bo}H", data, ptr + k * 2, clipped)
+        return
+    raise AssertionError(f"tag {tag} not found in IFD")
diff --git a/xrspatial/geotiff/tests/test_gpu_tile_byte_cap_2026_05_18.py b/xrspatial/geotiff/tests/test_gpu_tile_byte_cap_2026_05_18.py
index 8ca776ad2..744fac6f3 100644
--- a/xrspatial/geotiff/tests/test_gpu_tile_byte_cap_2026_05_18.py
+++ b/xrspatial/geotiff/tests/test_gpu_tile_byte_cap_2026_05_18.py
@@ -23,7 +23,6 @@
 from __future__ import annotations
 
 import importlib.util
-import struct
 
 import numpy as np
 import pytest
@@ -31,6 +30,8 @@
 
 from xrspatial.geotiff import read_geotiff_gpu, to_geotiff
 
+from ._tiff_surgery import patch_byte_counts as _patch_byte_counts
+
 
 def _cupy_available() -> bool:
     if importlib.util.find_spec("cupy") is None:
@@ -49,54 +50,6 @@ def _cupy_available() -> bool:
 )
 
 
-def _patch_byte_counts(data: bytearray, tag: int, value: int) -> None:
-    """Rewrite every entry for *tag* in the first IFD.
-
-    Mirrors the helper in ``test_local_tile_byte_cap_1664.py``: parses
-    the TIFF header, walks the IFD, and writes ``value`` over every
-    occurrence of the named tag's value array. ``tag=325`` is
-    ``TileByteCounts``; ``tag=279`` is ``StripByteCounts``.
-    """
-    from xrspatial.geotiff._header import parse_header
-
-    header = parse_header(bytes(data))
-    bo = header.byte_order
-    ifd_offset = header.first_ifd_offset
-    num_entries = struct.unpack_from(f"{bo}H", data, ifd_offset)[0]
-    entry_offset = ifd_offset + 2
-
-    for i in range(num_entries):
-        eo = entry_offset + i * 12
-        cur_tag = struct.unpack_from(f"{bo}H", data, eo)[0]
-        if cur_tag != tag:
-            continue
-        type_id = struct.unpack_from(f"{bo}H", data, eo + 2)[0]
-        count = struct.unpack_from(f"{bo}I", data, eo + 4)[0]
-        if type_id == 4:  # LONG
-            total = count * 4
-            if total <= 4:
-                for k in range(count):
-                    struct.pack_into(f"{bo}I", data, eo + 8 + k * 4, value)
-            else:
-                ptr = struct.unpack_from(f"{bo}I", data, eo + 8)[0]
-                for k in range(count):
-                    struct.pack_into(f"{bo}I", data, ptr + k * 4, value)
-        elif type_id == 3:  # SHORT
-            clipped = min(value, 0xFFFF)
-            total = count * 2
-            if total <= 4:
-                for k in range(count):
-                    struct.pack_into(
-                        f"{bo}H", data, eo + 8 + k * 2, clipped)
-            else:
-                ptr = struct.unpack_from(f"{bo}I", data, eo + 8)[0]
-                for k in range(count):
-                    struct.pack_into(
-                        f"{bo}H", data, ptr + k * 2, clipped)
-        return
-    raise AssertionError(f"tag {tag} not found in IFD")
-
-
 def _build_forged_tiled_cog(tmp_path, byte_count_value: int) -> str:
     """Write a real tiled COG, patch every TileByteCounts entry, return path."""
     arr = np.arange(64 * 64, dtype=np.float32).reshape(64, 64)
@@ -153,20 +106,51 @@ def test_normal_gpu_read_under_default_cap(self, tmp_path):
 
     @_gpu_only
     def test_env_override_lifts_cap(self, tmp_path, monkeypatch):
-        """A user with legitimate large tiles can lift the cap via env."""
+        """A user with legitimate large tiles can lift the cap via env.
+
+        The truncated forged payload makes the downstream codec raise;
+        the assertion below asserts only that whatever error fires is
+        *not* the cap rejection. Catch the broad ``Exception`` so the
+        test stays focused on the cap-loop contract rather than
+        chasing every decoder failure mode, but still inspect the
+        message string to make sure a regression that re-fires the cap
+        through a different error path would be visible.
+        """
         path = _build_forged_tiled_cog(tmp_path, 50 * 1024 * 1024)
         monkeypatch.setenv(
             "XRSPATIAL_COG_MAX_TILE_BYTES", str(64 * 1024 * 1024))
 
-        # The decompressor may raise on the truncated mmap slice, but
-        # the per-tile cap error must not be the source. Match the
-        # behaviour pinned by ``test_env_override_lifts_cap`` in the
-        # CPU companion module.
         try:
             read_geotiff_gpu(path)
-        except ValueError as exc:
-            assert "exceeds the per-tile safety cap" not in str(exc)
-        except Exception:
-            # Codec failures on the truncated payload are acceptable;
-            # we only care that the cap check did not fire.
-            pass
+        except Exception as exc:
+            assert "exceeds the per-tile safety cap" not in str(exc), (
+                "cap loop fired despite the env override lifting the cap"
+            )
+
+
+# ---------------------------------------------------------------------------
+# Dask + GPU chunked path: same per-tile cap (added in the review pass)
+# ---------------------------------------------------------------------------
+
+
+class TestGpuChunkedTileByteCap:
+    @_gpu_only
+    def test_chunked_huge_tile_byte_count_rejected(
+            self, tmp_path, monkeypatch):
+        """Sibling check on the dask + GPU chunked path.
+
+        ``_read_geotiff_gpu_chunked_gds`` parses the IFDs and then fans
+        out per-chunk GDS reads. Without the cap, the chunked path
+        would build a graph that still pulls the forged tile per task;
+        the metadata-time check rejects the file before any graph is
+        built.
+        """
+        path = _build_forged_tiled_cog(tmp_path, 100 * 1024 * 1024)
+        monkeypatch.setenv(
+            "XRSPATIAL_COG_MAX_TILE_BYTES", str(1024 * 1024))
+
+        with pytest.raises(ValueError, match="TileByteCount"):
+            # ``chunks`` enables the dask + GPU pipeline; the read path
+            # internally routes through ``_read_geotiff_gpu_chunked_gds``
+            # when the file qualifies for the GDS chunked fast path.
+            read_geotiff_gpu(path, chunks=32)
diff --git a/xrspatial/geotiff/tests/test_local_tile_byte_cap_1664.py b/xrspatial/geotiff/tests/test_local_tile_byte_cap_1664.py
index 6437d808c..852616dc3 100644
--- a/xrspatial/geotiff/tests/test_local_tile_byte_cap_1664.py
+++ b/xrspatial/geotiff/tests/test_local_tile_byte_cap_1664.py
@@ -12,8 +12,6 @@
 """
 from __future__ import annotations
 
-import struct
-
 import numpy as np
 import pytest
 import xarray as xr
@@ -26,46 +24,7 @@
 # Helpers -- patch in-place IFD entries for tile / strip byte counts
 # ---------------------------------------------------------------------------
 
-
-def _patch_byte_counts(data: bytearray, tag: int, value: int) -> None:
-    """Rewrite every entry for *tag* (325=TileByteCounts, 279=StripByteCounts)."""
-    from xrspatial.geotiff._header import parse_header
-    header = parse_header(bytes(data))
-    bo = header.byte_order
-    ifd_offset = header.first_ifd_offset
-    num_entries = struct.unpack_from(f'{bo}H', data, ifd_offset)[0]
-    entry_offset = ifd_offset + 2
-
-    for i in range(num_entries):
-        eo = entry_offset + i * 12
-        cur_tag = struct.unpack_from(f'{bo}H', data, eo)[0]
-        if cur_tag != tag:
-            continue
-        type_id = struct.unpack_from(f'{bo}H', data, eo + 2)[0]
-        count = struct.unpack_from(f'{bo}I', data, eo + 4)[0]
-        if type_id == 4:  # LONG
-            total = count * 4
-            if total <= 4:
-                for k in range(count):
-                    struct.pack_into(f'{bo}I', data, eo + 8 + k * 4, value)
-            else:
-                ptr = struct.unpack_from(f'{bo}I', data, eo + 8)[0]
-                for k in range(count):
-                    struct.pack_into(f'{bo}I', data, ptr + k * 4, value)
-        elif type_id == 3:  # SHORT
-            clipped = min(value, 0xFFFF)
-            total = count * 2
-            if total <= 4:
-                for k in range(count):
-                    struct.pack_into(
-                        f'{bo}H', data, eo + 8 + k * 2, clipped)
-            else:
-                ptr = struct.unpack_from(f'{bo}I', data, eo + 8)[0]
-                for k in range(count):
-                    struct.pack_into(
-                        f'{bo}H', data, ptr + k * 2, clipped)
-        return
-    raise AssertionError(f"tag {tag} not found in IFD")
+from ._tiff_surgery import patch_byte_counts as _patch_byte_counts  # noqa: E402
 
 
 def _build_forged_tiled_cog(tmp_path, byte_count_value: int) -> str: