From 922cc8990a225f8e93e470e45c2789c041489a78 Mon Sep 17 00:00:00 2001
From: Brendan Collins <brendancol@gmail.com>
Date: Wed, 22 Apr 2026 13:21:37 -0700
Subject: [PATCH] Guard viewshed numpy path against oversize rasters (#1229)

`_viewshed_cpu` allocated ~500 bytes per pixel with no size check, so a
20000x20000 raster tried to grab ~200 GB before any validation and
OOM-killed the process.

Add the same style of guard `_viewshed_dask` already has: estimate peak
working memory (~500 * H * W) against `_available_memory_bytes()` and
raise `MemoryError` pointing users at `max_distance=` when the request
would eat more than 50% of RAM. `max_distance` routes through
`_viewshed_windowed`, which shrinks the raster before `_viewshed_cpu`
sees it, so the guard is not a dead end.

Two regression tests: one patches `_available_memory_bytes` to a small
value and asserts the guard fires with a message mentioning
`max_distance`; the other confirms `max_distance` bypasses it.
---
 .claude/sweep-security-state.json |  7 ++++++
 xrspatial/tests/test_viewshed.py  | 42 +++++++++++++++++++++++++++++++
 xrspatial/viewshed.py             | 16 ++++++++++++
 3 files changed, 65 insertions(+)

diff --git a/.claude/sweep-security-state.json b/.claude/sweep-security-state.json
index dfc45825..b49d6393 100644
--- a/.claude/sweep-security-state.json
+++ b/.claude/sweep-security-state.json
@@ -33,6 +33,13 @@
       "severity_max": "HIGH",
       "categories_found": [1, 2, 6],
       "notes": "HIGH (fixed #1227): _stats_cupy used `if nodata_values:` (truthy) so nodata_values=0 silently skipped the filter on the cupy backend, producing wrong stats vs every other backend. MEDIUM (unfixed): _strides uses np.int32 for stride indices — can wrap for arrays > ~2B elements in the numpy path. MEDIUM (unfixed): hypsometric_integral() skips _validate_raster on zones/values; _regions_numpy has no memory guard (numpy-only path, bounded by caller-allocated input). MEDIUM (unfixed): _stats_numpy return_type='xarray.DataArray' allocates np.full((n_stats, values.size)) with no guard."
+    },
+    "viewshed": {
+      "last_inspected": "2026-04-22",
+      "issue": 1229,
+      "severity_max": "HIGH",
+      "categories_found": [1],
+      "notes": "HIGH (fixed #1229): _viewshed_cpu allocated ~500 bytes/pixel of working memory (event_list 3*H*W*7*8 bytes + status_values/status_struct/idle + visibility_grid + lexsort temporary) with no guard. A 20000x20000 raster tried to allocate ~200 GB. Fixed by adding peak-memory guard mirroring the _viewshed_dask pattern (_available_memory_bytes() check, raises MemoryError with max_distance= hint). No other HIGH findings: dask path already guarded, _validate_raster is called, distance-sweep uses dtype=float64, _calc_dist_n_grad guards zero distance."
     }
   }
 }
diff --git a/xrspatial/tests/test_viewshed.py b/xrspatial/tests/test_viewshed.py
index 408e18fb..c821a90e 100644
--- a/xrspatial/tests/test_viewshed.py
+++ b/xrspatial/tests/test_viewshed.py
@@ -417,3 +417,45 @@ def test_viewshed_dask_distance_sweep_target_elev():
         np.testing.assert_allclose(
             result[vis_mask], v_np.values[vis_mask], atol=1.0,
             err_msg="Tier C angles diverge too far from numpy reference")
+
+
+def test_viewshed_cpu_memory_guard():
+    """_viewshed_cpu should refuse rasters that would blow past RAM.
+
+    Regression test for the DoS vulnerability where the numpy path
+    allocated ~500 bytes/pixel of working memory with no size check.
+    """
+    from unittest.mock import patch
+
+    ny, nx = 100, 100
+    terrain = np.zeros((ny, nx))
+    xs = np.arange(nx, dtype=float)
+    ys = np.arange(ny, dtype=float)
+    raster = xa.DataArray(terrain, coords=dict(x=xs, y=ys), dims=["y", "x"])
+
+    # 100x100 needs ~5 MB of working memory.  Report 1 MB available so
+    # the 50% threshold (~500 kB) is exceeded and the guard trips.
+    with patch('xrspatial.viewshed._available_memory_bytes',
+               return_value=1_000_000):
+        with pytest.raises(MemoryError, match="max_distance"):
+            viewshed(raster, x=50.0, y=50.0, observer_elev=5)
+
+
+def test_viewshed_cpu_memory_guard_passes_with_max_distance():
+    """max_distance should bypass the memory guard by windowing first."""
+    from unittest.mock import patch
+
+    ny, nx = 100, 100
+    terrain = np.zeros((ny, nx))
+    xs = np.arange(nx, dtype=float)
+    ys = np.arange(ny, dtype=float)
+    raster = xa.DataArray(terrain, coords=dict(x=xs, y=ys), dims=["y", "x"])
+
+    # Same tight memory budget as above — but max_distance=3 shrinks the
+    # working window to a 7x7 tile, so the guard on the small window
+    # does not trip.
+    with patch('xrspatial.viewshed._available_memory_bytes',
+               return_value=1_000_000):
+        v = viewshed(raster, x=50.0, y=50.0, observer_elev=5,
+                     max_distance=3.0)
+    assert v.values[50, 50] == 180.0
diff --git a/xrspatial/viewshed.py b/xrspatial/viewshed.py
index 0e282052..b5d21019 100644
--- a/xrspatial/viewshed.py
+++ b/xrspatial/viewshed.py
@@ -1514,6 +1514,22 @@ def _viewshed_cpu(
 
     height, width = raster.shape
 
+    # Peak-memory guard.  The sweep algorithm allocates an event_list of
+    # 3*H*W rows (168 bytes/pixel), plus the red-black status structure
+    # (status_values + status_struct + idle ~= 104 bytes/pixel),
+    # visibility_grid and raster (~16 bytes/pixel), and a lexsort
+    # temporary that roughly doubles event_list during sort.
+    # Round to ~500 bytes/pixel and refuse if it would eat most of RAM.
+    peak_bytes = 500 * height * width
+    avail = _available_memory_bytes()
+    if peak_bytes > 0.5 * avail:
+        raise MemoryError(
+            f"viewshed CPU sweep would need ~{peak_bytes / 1e9:.1f} GB of "
+            f"working memory for a {height}x{width} raster, which exceeds "
+            f"50% of available RAM ({avail / 1e9:.1f} GB). "
+            f"Pass max_distance= to limit the analysis area."
+        )
+
     y_coords = raster.indexes.get('y').values
     x_coords = raster.indexes.get('x').values