From c63553ab48a1718a37e191970fa56f9bb3926803 Mon Sep 17 00:00:00 2001
From: Cryptopoly <31970407+cryptopoly@users.noreply.github.com>
Date: Fri, 1 May 2026 17:51:02 +0100
Subject: [PATCH] Probe torch.cuda via subprocess + opaque install-log
 background
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two regressions reported from the v0.7.2 Windows smoke test (RTX 4090).

1. GPU bundle install fails with PermissionError on torch DLLs.

   PR #22's _snapshot_torch_cuda probed the GPU by importing torch
   directly in the backend process. On Windows that loads
   torch/lib/*.dll (asmjit, cublas, cudnn, ...) into the process
   handle table, which then makes pip's --target install fail with

       PermissionError: [WinError 5] Access is denied:
       '...\extras\cp312\site-packages\torch\lib\asmjit.dll'

   when shutil.rmtree tries to swap the existing torch wheel.
   DiffusersImageEngine.probe() already documents this exact trap
   and explicitly avoids importing torch — _snapshot_torch_cuda
   was undoing that protection.

   Fix: spawn a short-lived Python subprocess that imports torch,
   prints {gpu_name, total, used} as JSON to stdout, and exits.
   The OS releases the DLL handles on process exit, so the next
   Install GPU runtime click can rmtree + replace torch in place.
   Prefer the embedded sidecar Python (CHAOSENGINE_EMBED_PYTHON_BIN)
   so the subprocess sees the same site-packages as the backend;
   fall back to sys.executable when the env var is not set or does
   not point at an existing file.

   Also skip the probe entirely on macOS (every Darwin host) —
   Apple Silicon has no CUDA device for torch.cuda to report, and
   the unified-memory path in _snapshot_macos owns that case.

2. InstallLogPanel still appears to overlap Prompt + Recent Outputs.

   The previous rgba(0, 0, 0, 0.22) background let the sibling
   panel headers bleed through whenever the install log was
   visually adjacent to them, which read as 'overlap' even when
   the layout wasn't actually intersecting. Switch to var(--surface)
   for a fully opaque card background, and add 'contain: layout'
   so the panel's growth during a long torch download cannot leak
   into sibling grid rows.

Tests
- tests/test_gpu_detection.py rewritten to mock subprocess.run
  instead of sys.modules['torch']. Adds a regression test that
  asserts the probe makes exactly one subprocess.run call whose
  '-c' script contains 'import torch' — if anyone reverts to an
  in-process import, no child is spawned and that test fails.
- All existing tests still pass.
---
 backend_service/helpers/gpu.py | 125 +++++++++++++++++++++++++--------
 src/styles.css                 |  20 ++++--
 tests/test_gpu_detection.py    | 116 +++++++++++++++++-------------
 3 files changed, 176 insertions(+), 85 deletions(-)

diff --git a/backend_service/helpers/gpu.py b/backend_service/helpers/gpu.py
index 8556bf9..1530ef2 100644
--- a/backend_service/helpers/gpu.py
+++ b/backend_service/helpers/gpu.py
@@ -7,10 +7,12 @@
 """
 from __future__ import annotations
 
+import json
+import os
 import platform
 import shutil
 import subprocess
-import json
+import sys
 import threading
 from typing import Any
 
@@ -146,42 +148,103 @@ def _snapshot_nvidia(self) -> dict[str, Any]:
         return self._no_gpu_detected()
 
     def _snapshot_torch_cuda(self) -> dict[str, Any] | None:
-        """Read total + used VRAM from torch.cuda when available.
+        """Read total + used VRAM from torch.cuda via a short-lived subprocess.
+
+        We deliberately do NOT ``import torch`` in the backend process.
+        On Windows, importing torch loads ``torch/lib/*.dll`` (asmjit,
+        cublas, cudnn, ...) into the backend's process handle table,
+        and pip's ``--target`` install of a fresh torch then fails with
+        ``[WinError 5] Access is denied`` when ``shutil.rmtree`` tries
+        to delete the locked DLLs:
+
+            PermissionError: [WinError 5] Access is denied:
+            '...\\extras\\cp312\\site-packages\\torch\\lib\\asmjit.dll'
+
+        The fix is to query torch in a child Python process that exits
+        as soon as it has printed the JSON — the OS releases the DLL
+        handles, and the next ``Install GPU runtime`` click can swap
+        torch in place.
+
+        Returns ``None`` if torch isn't installed, has no CUDA build,
+        no CUDA device is visible, or the subprocess errors. The caller
+        then falls through to ``nvidia-smi``.
+        """
+        # Skip on macOS — Apple Silicon has no torch.cuda; ``_snapshot_macos``
+        # owns the unified-memory path.
+        if self._system == "Darwin":
+            return None
 
-        Returns ``None`` if torch isn't importable, has no CUDA build, or
-        no CUDA device is currently visible (driver missing, GPU
-        passthrough disabled, etc.). The caller then falls through to
-        ``nvidia-smi``.
+        executable = self._resolve_python_executable()
+        if executable is None:
+            return None
+
+        script = (
+            "import json, sys\n"
+            "try:\n"
+            "    import torch\n"
+            "except Exception:\n"
+            "    sys.exit(0)\n"
+            "if not getattr(torch, 'cuda', None) or not torch.cuda.is_available():\n"
+            "    sys.exit(0)\n"
+            "device = torch.cuda.current_device()\n"
+            "props = torch.cuda.get_device_properties(device)\n"
+            "total = int(props.total_memory)\n"
+            "try:\n"
+            "    free, _ = torch.cuda.mem_get_info(device)\n"
+            "    used = max(0, total - int(free))\n"
+            "except Exception:\n"
+            "    used = 0\n"
+            "json.dump({'gpu_name': props.name, 'total': total, 'used': used}, sys.stdout)\n"
+        )
 
-        Importing torch is heavy (~200ms first time) but the result is
-        cached one level up by ``get_device_vram_total_gb``, so the cost
-        is paid at most once per backend session.
-        """
         try:
-            import torch  # type: ignore
-        except Exception:
+            result = subprocess.run(
+                [executable, "-c", script],
+                capture_output=True,
+                text=True,
+                timeout=15,
+                **_SUBPROCESS_KWARGS,
+            )
+        except (FileNotFoundError, subprocess.SubprocessError):
+            return None
+        if result.returncode != 0:
+            return None
+        payload = (result.stdout or "").strip()
+        if not payload:
             return None
         try:
-            if not torch.cuda.is_available():
-                return None
-            device = torch.cuda.current_device()
-            props = torch.cuda.get_device_properties(device)
-            total_bytes = int(props.total_memory)
-            try:
-                free_bytes, _ = torch.cuda.mem_get_info(device)
-                used_bytes = max(0, total_bytes - int(free_bytes))
-            except Exception:
-                used_bytes = 0
-            return {
-                "gpu_name": props.name,
-                "vram_total_gb": round(total_bytes / (1024 ** 3), 2),
-                "vram_used_gb": round(used_bytes / (1024 ** 3), 2),
-                "utilization_pct": None,
-                "temperature_c": None,
-                "power_w": None,
-            }
-        except Exception:
+            data = json.loads(payload)
+            total_bytes = int(data["total"])
+            used_bytes = int(data.get("used") or 0)
+            gpu_name = str(data.get("gpu_name") or "NVIDIA GPU")
+        except (ValueError, KeyError, TypeError):
             return None
+        return {
+            "gpu_name": gpu_name,
+            "vram_total_gb": round(total_bytes / (1024 ** 3), 2),
+            "vram_used_gb": round(used_bytes / (1024 ** 3), 2),
+            "utilization_pct": None,
+            "temperature_c": None,
+            "power_w": None,
+        }
+
+    def _resolve_python_executable(self) -> str | None:
+        """Pick a Python interpreter for the torch.cuda subprocess probe.
+
+        Prefers the embedded sidecar Python (the same one pip writes the
+        GPU bundle wheels to) so ``import torch`` resolves the freshly
+        installed wheel. Falls back to the running interpreter if the
+        embed override isn't set.
+        """
+        candidates: list[str] = []
+        embed = os.environ.get("CHAOSENGINE_EMBED_PYTHON_BIN")
+        if embed:
+            candidates.append(embed)
+        candidates.append(sys.executable)
+        for candidate in candidates:
+            if candidate and os.path.isfile(candidate):
+                return candidate
+        return None
 
     def _no_gpu_detected(self) -> dict[str, Any]:
         return {
diff --git a/src/styles.css b/src/styles.css
index d68b611..48265d6 100644
--- a/src/styles.css
+++ b/src/styles.css
@@ -6265,19 +6265,25 @@ select.text-input {
   margin-top: 12px;
   border: 1px solid rgba(255, 255, 255, 0.08);
   border-radius: 8px;
-  background: rgba(0, 0, 0, 0.22);
+  /* Fully opaque card background — the previous rgba(0, 0, 0, 0.22) let
+   * the Prompt and Recent Outputs panel headers bleed through visually
+   * during a long GPU install, making the streaming pip output look
+   * like it was overlapping those cards on Windows. Match the rest of
+   * the surface tokens so the install log reads as a solid card. */
+  background: var(--surface);
   width: 100%;
   max-width: 100%;
   overflow: hidden;
   /* Establishes a stacking context so the streaming pip output stays
-   * above the Prompt + Recent Outputs cards in Image Studio and Video
-   * Studio. Without these the panel renders behind those siblings on
-   * Windows during a long GPU bundle install — the log is still alive
-   * but the user can't see it. ``z-index: 5`` is enough to win against
-   * the surrounding ``.panel`` cards (which set no z-index of their
-   * own) without fighting the global tooltip portal (z-index: 1000+). */
+   * above the Prompt + Recent Outputs cards even on Chrome / WebKit
+   * versions that lay out adjacent grid rows with subpixel overlap. */
   position: relative;
   z-index: 5;
+  /* ``contain: layout`` keeps the panel's growth from leaking into
+   * sibling grid rows — important when the parent runtime callout is
+   * laid out as a flex column and the install log expands by 350+ px
+   * during a torch download. */
+  contain: layout;
 }
 .install-log-summary {
   cursor: pointer;
diff --git a/tests/test_gpu_detection.py b/tests/test_gpu_detection.py
index 3a410b2..bf5e776 100644
--- a/tests/test_gpu_detection.py
+++ b/tests/test_gpu_detection.py
@@ -2,46 +2,26 @@
 
 The pre-fix path returned system RAM via ``psutil.virtual_memory().total``
 when ``nvidia-smi`` wasn't on PATH — so an RTX 4090 box on Windows showed
-12 GB total in the safety estimator instead of 24 GB. The new path tries
-``torch.cuda`` first, falls back to ``nvidia-smi``, and only returns a
-``vram_total_gb=None`` when neither answers. The frontend treats ``None``
-as "unknown" and skips the spurious crash warning.
+12 GB total in the safety estimator instead of 24 GB. The new path probes
+``torch.cuda`` via a short-lived subprocess (so we don't lock torch DLLs
+in the backend process and break the next ``Install GPU runtime``), then
+falls back to ``nvidia-smi``, and only returns ``vram_total_gb=None`` when
+neither answers. The frontend treats ``None`` as "unknown" and skips the
+spurious crash warning.
 """
 
 from __future__ import annotations
 
-import sys
-import types
+import json
 import unittest
 from unittest import mock
 
 from backend_service.helpers import gpu as gpu_module
 
 
-def _fake_torch_with_cuda(total_bytes: int, free_bytes: int, name: str = "NVIDIA GeForce RTX 4090") -> types.ModuleType:
-    cuda = types.SimpleNamespace()
-    cuda.is_available = lambda: True
-    cuda.current_device = lambda: 0
-
-    class _Props:
-        def __init__(self, mem: int, gpu_name: str) -> None:
-            self.total_memory = mem
-            self.name = gpu_name
-
-    cuda.get_device_properties = lambda device: _Props(total_bytes, name)
-    cuda.mem_get_info = lambda device: (free_bytes, total_bytes)
-
-    fake = types.ModuleType("torch")
-    fake.cuda = cuda  # type: ignore[attr-defined]
-    return fake
-
-
-def _fake_torch_no_cuda() -> types.ModuleType:
-    cuda = types.SimpleNamespace()
-    cuda.is_available = lambda: False
-    fake = types.ModuleType("torch")
-    fake.cuda = cuda  # type: ignore[attr-defined]
-    return fake
+def _fake_completed_process(returncode: int, stdout: str, stderr: str = ""):
+    """Build a CompletedProcess-shaped mock for ``subprocess.run``."""
+    return mock.MagicMock(returncode=returncode, stdout=stdout, stderr=stderr)
 
 
 class SnapshotTorchCudaTests(unittest.TestCase):
@@ -58,7 +38,14 @@ def tearDown(self) -> None:
     def test_torch_cuda_returns_full_vram_for_rtx_4090(self) -> None:
         twenty_four_gb = 24 * 1024 ** 3
         free = 22 * 1024 ** 3
-        with mock.patch.dict(sys.modules, {"torch": _fake_torch_with_cuda(twenty_four_gb, free)}):
+        used = twenty_four_gb - free
+        payload = json.dumps({
+            "gpu_name": "NVIDIA GeForce RTX 4090",
+            "total": twenty_four_gb,
+            "used": used,
+        })
+        with mock.patch.object(self.monitor, "_resolve_python_executable", return_value="/usr/bin/python3"), \
+             mock.patch("backend_service.helpers.gpu.subprocess.run", return_value=_fake_completed_process(0, payload)):
             snapshot = self.monitor._snapshot_torch_cuda()
         self.assertIsNotNone(snapshot)
         assert snapshot is not None  # type narrow
@@ -68,26 +55,61 @@ def test_torch_cuda_returns_full_vram_for_rtx_4090(self) -> None:
         self.assertEqual(snapshot["vram_used_gb"], 2.0)
 
     def test_torch_cuda_unavailable_returns_none(self) -> None:
-        with mock.patch.dict(sys.modules, {"torch": _fake_torch_no_cuda()}):
+        # Subprocess exits 0 with empty stdout — the inline script printed
+        # nothing because torch.cuda.is_available() was False.
+        with mock.patch.object(self.monitor, "_resolve_python_executable", return_value="/usr/bin/python3"), \
+             mock.patch("backend_service.helpers.gpu.subprocess.run", return_value=_fake_completed_process(0, "")):
             snapshot = self.monitor._snapshot_torch_cuda()
         self.assertIsNone(snapshot)
 
     def test_torch_not_installed_returns_none(self) -> None:
-        # Monkeypatch the import to raise ImportError.
-        original_import = __builtins__["__import__"] if isinstance(__builtins__, dict) else __builtins__.__import__
-
-        def fake_import(name, *args, **kwargs):
-            if name == "torch":
-                raise ImportError("No module named 'torch'")
-            return original_import(name, *args, **kwargs)
-
-        with mock.patch("builtins.__import__", side_effect=fake_import):
-            # Also remove any previously cached torch entry so the
-            # function's ``import torch`` actually invokes the patched
-            # ``__import__`` instead of resolving via sys.modules.
-            with mock.patch.dict(sys.modules, {}, clear=False):
-                sys.modules.pop("torch", None)
-                snapshot = self.monitor._snapshot_torch_cuda()
+        # Subprocess exits 0 with empty stdout — the inline script's
+        # ``import torch`` raised, the except branch did sys.exit(0).
+        with mock.patch.object(self.monitor, "_resolve_python_executable", return_value="/usr/bin/python3"), \
+             mock.patch("backend_service.helpers.gpu.subprocess.run", return_value=_fake_completed_process(0, "")):
+            snapshot = self.monitor._snapshot_torch_cuda()
+        self.assertIsNone(snapshot)
+
+    def test_subprocess_error_returns_none(self) -> None:
+        with mock.patch.object(self.monitor, "_resolve_python_executable", return_value="/usr/bin/python3"), \
+             mock.patch(
+                "backend_service.helpers.gpu.subprocess.run",
+                side_effect=FileNotFoundError("python3 missing"),
+             ):
+            snapshot = self.monitor._snapshot_torch_cuda()
+        self.assertIsNone(snapshot)
+
+    def test_no_python_executable_returns_none(self) -> None:
+        with mock.patch.object(self.monitor, "_resolve_python_executable", return_value=None):
+            snapshot = self.monitor._snapshot_torch_cuda()
+        self.assertIsNone(snapshot)
+
+    def test_does_not_import_torch_in_main_process(self) -> None:
+        """Critical: importing torch in-process locks Windows DLLs and
+        breaks the next Install GPU runtime click. The probe MUST go via
+        a child process so its DLL handles are released on exit."""
+        twenty_four_gb = 24 * 1024 ** 3
+        payload = json.dumps({"gpu_name": "RTX 4090", "total": twenty_four_gb, "used": 0})
+        captured: list[list[str]] = []
+
+        def fake_run(cmd, **kwargs):
+            captured.append(list(cmd))
+            return _fake_completed_process(0, payload)
+
+        with mock.patch.object(self.monitor, "_resolve_python_executable", return_value="/usr/bin/python3"), \
+             mock.patch("backend_service.helpers.gpu.subprocess.run", side_effect=fake_run):
+            self.monitor._snapshot_torch_cuda()
+        self.assertEqual(len(captured), 1)
+        cmd = captured[0]
+        # The probe must spawn a Python with a -c script containing
+        # 'import torch'. If the implementation ever switches back to
+        # an in-process import this assertion will catch it.
+        self.assertEqual(cmd[1], "-c")
+        self.assertIn("import torch", cmd[2])
+
+    def test_skipped_on_macos(self) -> None:
+        self.monitor._system = "Darwin"
+        snapshot = self.monitor._snapshot_torch_cuda()
         self.assertIsNone(snapshot)