From 1515632ace4308c769e7a75e8fc51823079219d3 Mon Sep 17 00:00:00 2001
From: Cryptopoly <31970407+cryptopoly@users.noreply.github.com>
Date: Fri, 1 May 2026 13:02:59 +0100
Subject: [PATCH] Fix Windows CUDA detection + post-install runtime probe
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two related Windows-only bugs surfaced by the v0.7.2 smoke test on an
RTX 4090 box:

Bug #6 — RTX 4090 reported as 12 GB total

GPUMonitor._snapshot_nvidia() shells out to nvidia-smi, and on Windows
boxes without it on PATH (driver installed but no CUDA toolkit) it fell
through to _fallback_psutil(), which returns
psutil.virtual_memory().total — system RAM, not VRAM. The image / video
safety estimators then read that as the GPU budget and produced 'Likely
to crash' warnings on a 24 GB card holding an 11 GB FLUX model.

Fix:
- Try torch.cuda.get_device_properties(0).total_memory first. When the
  GPU bundle is installed this is the most reliable source — it reads
  through the CUDA driver, so nothing needs to be on PATH.
- Fall back to nvidia-smi as before.
- Drop the psutil fallback. When neither source answers we now return
  {'vram_total_gb': None}, which the TS estimators (utils/images.ts,
  utils/videos.ts) already treat as 'unknown' via the
  DEFAULT_*_MEMORY_GB fallbacks. Better an honest 'unknown' than a
  wrong 12 GB.

Bug #7 — Image gen produces gibberish placeholder after install

DiffusersImageEngine.probe() uses importlib.util.find_spec to decide
between the placeholder engine and the real diffusers pipeline. Once
the GPU bundle install lands new packages in the extras dir,
importlib's negative-lookup cache still answers None for the new
modules until invalidate_caches() is called. The probe therefore kept
reporting realGenerationAvailable=False, and the generation pipeline
returned the SVG placeholder, which the frontend renders as a
gibberish image when it displays it as data:image/svg+xml.

Fix:
- probe() now calls importlib.invalidate_caches() before find_spec, so
  newly installed packages are picked up without a backend restart.
- The GPU bundle worker (_gpu_bundle_job_worker) now also calls
  invalidate_caches() and resets the VRAM total cache when it
  transitions to phase=done, so the immediately following capabilities
  snapshot reflects the freshly importable torch.

Tests

tests/test_gpu_detection.py — 9 unit tests covering torch.cuda
detection, nvidia-smi precedence, the new no-system-RAM fallback path,
and the process-lifetime cache. All pass; the existing pytest suite is
still green.
---
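Notes (kept below the --- cut, so git am drops them from the commit):

A standalone sketch of the stale-finder behaviour the Bug #7 fix works
around, for anyone reproducing it outside the app. This is illustrative
only: the package name and temp dir are made up, and whether the middle
find_spec call misses the new package depends on the platform's
directory-mtime granularity.

    import importlib
    import importlib.util
    import pathlib
    import sys
    import tempfile

    # Illustrative repro; "freshpkg" and the temp dir are invented.
    extras = pathlib.Path(tempfile.mkdtemp())
    sys.path.insert(0, str(extras))

    # First lookup fails and primes the FileFinder's directory cache.
    print(importlib.util.find_spec("freshpkg"))  # None

    # Simulate pip landing a package into the already-on-path dir.
    (extras / "freshpkg").mkdir()
    (extras / "freshpkg" / "__init__.py").touch()

    # May still print None: the finder trusts its cached listing until
    # the directory mtime visibly changes (1 s granularity on some
    # filesystems).
    print(importlib.util.find_spec("freshpkg"))

    # Forcing a rescan always picks it up; this is what probe() and
    # the bundle worker now do.
    importlib.invalidate_caches()
    print(importlib.util.find_spec("freshpkg"))  # ModuleSpec(...)

To run just the new tests: pytest tests/test_gpu_detection.py -v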
 backend_service/helpers/gpu.py   |  64 +++++++++++-
 backend_service/image_runtime.py |  11 ++
 backend_service/routes/setup.py  |  17 ++++
 tests/test_gpu_detection.py      | 170 +++++++++++++++++++++++++++++++
 4 files changed, 260 insertions(+), 2 deletions(-)
 create mode 100644 tests/test_gpu_detection.py

diff --git a/backend_service/helpers/gpu.py b/backend_service/helpers/gpu.py
index 2c4e84a..8556bf9 100644
--- a/backend_service/helpers/gpu.py
+++ b/backend_service/helpers/gpu.py
@@ -106,6 +106,14 @@ def _snapshot_macos(self) -> dict[str, Any]:
 
     # ------------------------------------------------------------------
     def _snapshot_nvidia(self) -> dict[str, Any]:
+        # Try torch.cuda first — when the GPU bundle is installed it reads
+        # the right total VRAM via the CUDA driver without shelling out,
+        # and works even if ``nvidia-smi`` isn't on PATH (common on Windows
+        # when the user installs the driver but not the CUDA toolkit).
+        torch_snapshot = self._snapshot_torch_cuda()
+        if torch_snapshot is not None:
+            return torch_snapshot
+
         try:
             out = subprocess.check_output(
                 [
@@ -130,8 +138,60 @@ def _snapshot_nvidia(self) -> dict[str, Any]:
         except (FileNotFoundError, subprocess.SubprocessError, ValueError):
             pass
 
-        # Fallback: system RAM via psutil
-        return self._fallback_psutil()
+        # No GPU detected — return a None-VRAM dict rather than reporting
+        # system RAM as if it were VRAM. The image / video safety
+        # estimators downstream treat ``vram_total_gb is None`` as
+        # "unknown" and skip the crash warning, which is the correct
+        # behaviour when we genuinely don't know the card's capacity.
+        return self._no_gpu_detected()
+
+    def _snapshot_torch_cuda(self) -> dict[str, Any] | None:
+        """Read total + used VRAM from torch.cuda when available.
+
+        Returns ``None`` if torch isn't importable, has no CUDA build, or
+        no CUDA device is currently visible (driver missing, GPU
+        passthrough disabled, etc.). The caller then falls through to
+        ``nvidia-smi``.
+
+        Importing torch is heavy (~200ms first time) but the result is
+        cached one level up by ``get_device_vram_total_gb``, so the cost
+        is paid at most once per backend session.
+        """
+        try:
+            import torch  # type: ignore
+        except Exception:
+            return None
+        try:
+            if not torch.cuda.is_available():
+                return None
+            device = torch.cuda.current_device()
+            props = torch.cuda.get_device_properties(device)
+            total_bytes = int(props.total_memory)
+            try:
+                free_bytes, _ = torch.cuda.mem_get_info(device)
+                used_bytes = max(0, total_bytes - int(free_bytes))
+            except Exception:
+                used_bytes = 0
+            return {
+                "gpu_name": props.name,
+                "vram_total_gb": round(total_bytes / (1024 ** 3), 2),
+                "vram_used_gb": round(used_bytes / (1024 ** 3), 2),
+                "utilization_pct": None,
+                "temperature_c": None,
+                "power_w": None,
+            }
+        except Exception:
+            return None
+
+    def _no_gpu_detected(self) -> dict[str, Any]:
+        return {
+            "gpu_name": "No GPU detected",
+            "vram_total_gb": None,
+            "vram_used_gb": None,
+            "utilization_pct": None,
+            "temperature_c": None,
+            "power_w": None,
+        }
 
     # ------------------------------------------------------------------
     # Fallback
diff --git a/backend_service/image_runtime.py b/backend_service/image_runtime.py
index 5fd46ea..1c73d43 100644
--- a/backend_service/image_runtime.py
+++ b/backend_service/image_runtime.py
@@ -537,6 +537,17 @@ def probe(self) -> ImageRuntimeStatus:
         # find_spec answers "is it installable?" without triggering the
         # import side effects. Device detection (cuda vs cpu) is deferred
         # to preload/generate where we're about to import torch anyway.
+        #
+        # ``invalidate_caches`` matters when the GPU bundle install has
+        # finished mid-process: pip writes the new packages into the
+        # extras dir (already on ``sys.path`` from process start), but
+        # ``importlib`` keeps a per-finder cache of negative lookups, so
+        # the find_spec calls below would still report None even though
+        # the .dist-info folders are sitting on disk. Calling
+        # ``invalidate_caches`` first re-walks the path entries so the
+        # newly installed packages are picked up without a process
+        # restart.
+        importlib.invalidate_caches()
         missing = [
             package
             for package, module_name in (
diff --git a/backend_service/routes/setup.py b/backend_service/routes/setup.py
index dcdfd92..ee381e6 100644
--- a/backend_service/routes/setup.py
+++ b/backend_service/routes/setup.py
@@ -1067,6 +1067,23 @@ def _gpu_bundle_job_worker(python: str, extras_dir: Path) -> None:
     state.cuda_verified = cuda_ok
     state.attempts.append({"phase": "verify", "ok": cuda_ok, "output": detail[-2000:]})
 
+    # Tell the import system to re-scan ``sys.path`` so packages
+    # written into the extras dir during this run are visible to the
+    # next ``importlib.util.find_spec`` call (the image-runtime probe
+    # uses one). Without this, the runtime continues reporting
+    # "placeholder" until a backend restart even though the bundle
+    # is on disk. Also reset the cached VRAM total so the post-install
+    # capabilities snapshot reflects the freshly importable torch.
+    try:
+        importlib.invalidate_caches()
+    except Exception:
+        pass
+    try:
+        from backend_service.helpers.gpu import reset_vram_total_cache
+        reset_vram_total_cache()
+    except Exception:
+        pass
+
     state.phase = "done"
     state.percent = 100.0
     state.done = True
diff --git a/tests/test_gpu_detection.py b/tests/test_gpu_detection.py
new file mode 100644
index 0000000..3a410b2
--- /dev/null
+++ b/tests/test_gpu_detection.py
@@ -0,0 +1,170 @@
+"""Tests for the Windows / Linux GPU detection helper.
+
+The pre-fix path returned system RAM via ``psutil.virtual_memory().total``
+when ``nvidia-smi`` wasn't on PATH — so an RTX 4090 box on Windows showed
+12 GB total in the safety estimator instead of 24 GB. The new path tries
+``torch.cuda`` first, falls back to ``nvidia-smi``, and only returns
+``vram_total_gb=None`` when neither answers. The frontend treats ``None``
+as "unknown" and skips the spurious crash warning.
+"""
+
+from __future__ import annotations
+
+import sys
+import types
+import unittest
+from unittest import mock
+
+from backend_service.helpers import gpu as gpu_module
+
+
+def _fake_torch_with_cuda(total_bytes: int, free_bytes: int, name: str = "NVIDIA GeForce RTX 4090") -> types.ModuleType:
+    cuda = types.SimpleNamespace()
+    cuda.is_available = lambda: True
+    cuda.current_device = lambda: 0
+
+    class _Props:
+        def __init__(self, mem: int, gpu_name: str) -> None:
+            self.total_memory = mem
+            self.name = gpu_name
+
+    cuda.get_device_properties = lambda device: _Props(total_bytes, name)
+    # ``mem_get_info`` returns (free, total) in bytes, matching torch.
+    cuda.mem_get_info = lambda device: (free_bytes, total_bytes)
+
+    fake = types.ModuleType("torch")
+    fake.cuda = cuda  # type: ignore[attr-defined]
+    return fake
+
+
+def _fake_torch_no_cuda() -> types.ModuleType:
+    cuda = types.SimpleNamespace()
+    cuda.is_available = lambda: False
+    fake = types.ModuleType("torch")
+    fake.cuda = cuda  # type: ignore[attr-defined]
+    return fake
+
+
+class SnapshotTorchCudaTests(unittest.TestCase):
+    def setUp(self) -> None:
+        gpu_module.reset_vram_total_cache()
+        self.monitor = gpu_module.GPUMonitor()
+        # Force the monitor onto the nvidia path even when running these
+        # tests on a Mac developer machine.
+        self.monitor._system = "Linux"
+
+    def tearDown(self) -> None:
+        gpu_module.reset_vram_total_cache()
+
+    def test_torch_cuda_returns_full_vram_for_rtx_4090(self) -> None:
+        twenty_four_gb = 24 * 1024 ** 3
+        free = 22 * 1024 ** 3
+        with mock.patch.dict(sys.modules, {"torch": _fake_torch_with_cuda(twenty_four_gb, free)}):
+            snapshot = self.monitor._snapshot_torch_cuda()
+        self.assertIsNotNone(snapshot)
+        assert snapshot is not None  # type narrowing
+        self.assertEqual(snapshot["gpu_name"], "NVIDIA GeForce RTX 4090")
+        self.assertEqual(snapshot["vram_total_gb"], 24.0)
+        # 24 - 22 = 2 GB used.
+        self.assertEqual(snapshot["vram_used_gb"], 2.0)
+
+    def test_torch_cuda_unavailable_returns_none(self) -> None:
+        with mock.patch.dict(sys.modules, {"torch": _fake_torch_no_cuda()}):
+            snapshot = self.monitor._snapshot_torch_cuda()
+        self.assertIsNone(snapshot)
+
+    def test_torch_not_installed_returns_none(self) -> None:
+        # Monkeypatch the import machinery to raise ImportError for torch.
+        import builtins
+
+        original_import = builtins.__import__
+
+        def fake_import(name, *args, **kwargs):
+            if name == "torch":
+                raise ImportError("No module named 'torch'")
+            return original_import(name, *args, **kwargs)
+
+        # Also remove any previously cached torch entry so the function's
+        # ``import torch`` actually invokes the patched ``__import__``
+        # instead of resolving via sys.modules; ``patch.dict`` restores
+        # the original contents on exit.
+        with mock.patch("builtins.__import__", side_effect=fake_import), \
+                mock.patch.dict(sys.modules):
+            sys.modules.pop("torch", None)
+            snapshot = self.monitor._snapshot_torch_cuda()
+        self.assertIsNone(snapshot)
+
+
+class SnapshotNvidiaTests(unittest.TestCase):
+    def setUp(self) -> None:
+        gpu_module.reset_vram_total_cache()
+        self.monitor = gpu_module.GPUMonitor()
+        self.monitor._system = "Linux"
+
+    def tearDown(self) -> None:
+        gpu_module.reset_vram_total_cache()
+
+    def test_falls_back_to_no_gpu_when_torch_and_nvidia_smi_both_fail(self) -> None:
+        with mock.patch.object(self.monitor, "_snapshot_torch_cuda", return_value=None), \
+                mock.patch("subprocess.check_output", side_effect=FileNotFoundError):
+            snapshot = self.monitor._snapshot_nvidia()
+        self.assertEqual(snapshot["gpu_name"], "No GPU detected")
+        self.assertIsNone(snapshot["vram_total_gb"])
+        self.assertIsNone(snapshot["vram_used_gb"])
+
+    def test_does_not_fall_back_to_system_ram(self) -> None:
+        """The whole point of this fix: don't lie that system RAM is VRAM."""
+        with mock.patch.object(self.monitor, "_snapshot_torch_cuda", return_value=None), \
+                mock.patch("subprocess.check_output", side_effect=FileNotFoundError):
+            snapshot = self.monitor._snapshot_nvidia()
+        self.assertNotEqual(snapshot["gpu_name"], "System RAM (no GPU detected)")
+
+    def test_torch_cuda_takes_precedence_over_nvidia_smi(self) -> None:
+        torch_snapshot = {
+            "gpu_name": "RTX 4090",
+            "vram_total_gb": 24.0,
+            "vram_used_gb": 1.0,
+            "utilization_pct": None,
+            "temperature_c": None,
+            "power_w": None,
+        }
+        with mock.patch.object(self.monitor, "_snapshot_torch_cuda", return_value=torch_snapshot), \
+                mock.patch("subprocess.check_output") as mock_subprocess:
+            snapshot = self.monitor._snapshot_nvidia()
+        self.assertEqual(snapshot["vram_total_gb"], 24.0)
+        mock_subprocess.assert_not_called()
+
+
+class GetDeviceVramTotalGbTests(unittest.TestCase):
+    def setUp(self) -> None:
+        gpu_module.reset_vram_total_cache()
+
+    def tearDown(self) -> None:
+        gpu_module.reset_vram_total_cache()
+
+    def test_returns_none_when_snapshot_has_no_vram(self) -> None:
+        with mock.patch.object(
+            gpu_module._monitor,
+            "snapshot",
+            return_value={"vram_total_gb": None},
+        ):
+            self.assertIsNone(gpu_module.get_device_vram_total_gb())
+
+    def test_returns_float_when_snapshot_has_vram(self) -> None:
+        with mock.patch.object(
+            gpu_module._monitor,
+            "snapshot",
+            return_value={"vram_total_gb": 24.0},
+        ):
+            self.assertEqual(gpu_module.get_device_vram_total_gb(), 24.0)
+
+    def test_caches_result_for_process_lifetime(self) -> None:
+        with mock.patch.object(
+            gpu_module._monitor,
+            "snapshot",
+            return_value={"vram_total_gb": 24.0},
+        ) as mock_snapshot:
+            gpu_module.get_device_vram_total_gb()
+            gpu_module.get_device_vram_total_gb()
+            gpu_module.get_device_vram_total_gb()
+        self.assertEqual(mock_snapshot.call_count, 1)
+
+
+if __name__ == "__main__":
+    unittest.main()
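
A note on the cache pair the tests lean on: get_device_vram_total_gb()
and reset_vram_total_cache() already live in backend_service/helpers/gpu.py
and are not part of this diff. For reviewers reading the tests in
isolation, a minimal sketch consistent with what they assert (names taken
from the patch, bodies assumed, not the shipped code):

    # Sketch only; the real module may differ. ``_monitor`` is the
    # module-level GPUMonitor the tests patch via gpu_module._monitor.
    _VRAM_TOTAL_GB: float | None = None
    _VRAM_PROBED = False

    def get_device_vram_total_gb() -> float | None:
        # Probe once per process; reset_vram_total_cache() re-arms the
        # probe (the GPU bundle worker calls it after installing torch).
        global _VRAM_TOTAL_GB, _VRAM_PROBED
        if not _VRAM_PROBED:
            _VRAM_TOTAL_GB = _monitor.snapshot().get("vram_total_gb")
            _VRAM_PROBED = True
        return _VRAM_TOTAL_GB

    def reset_vram_total_cache() -> None:
        global _VRAM_TOTAL_GB, _VRAM_PROBED
        _VRAM_TOTAL_GB = None
        _VRAM_PROBED = False

The separate probed flag matters because None is a legitimate cached
value ("no GPU detected"), so the cache cannot use None itself as the
"not yet probed" sentinel.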