Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 94 additions & 31 deletions backend_service/helpers/gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,12 @@
"""
from __future__ import annotations

import json
import os
import platform
import shutil
import subprocess
import json
import sys
import threading
from typing import Any

Expand Down Expand Up @@ -146,42 +148,103 @@ def _snapshot_nvidia(self) -> dict[str, Any]:
return self._no_gpu_detected()

def _snapshot_torch_cuda(self) -> dict[str, Any] | None:
"""Read total + used VRAM from torch.cuda when available.
"""Read total + used VRAM from torch.cuda via a short-lived subprocess.

We deliberately do NOT ``import torch`` in the backend process.
On Windows, importing torch loads ``torch/lib/*.dll`` (asmjit,
cublas, cudnn, ...) into the backend's process handle table,
and pip's ``--target`` install of a fresh torch then fails with
``[WinError 5] Access is denied`` when ``shutil.rmtree`` tries
to delete the locked DLLs:

PermissionError: [WinError 5] Access is denied:
'...\\extras\\cp312\\site-packages\\torch\\lib\\asmjit.dll'

The fix is to query torch in a child Python process that exits
as soon as it has printed the JSON — the OS releases the DLL
handles, and the next ``Install GPU runtime`` click can swap
torch in place.

Returns ``None`` if torch isn't installed, has no CUDA build,
no CUDA device is visible, or the subprocess errors. The caller
then falls through to ``nvidia-smi``.
"""
# Skip on macOS — Apple Silicon has no torch.cuda; ``_snapshot_macos``
# owns the unified-memory path.
if self._system == "Darwin":
return None

Returns ``None`` if torch isn't importable, has no CUDA build, or
no CUDA device is currently visible (driver missing, GPU
passthrough disabled, etc.). The caller then falls through to
``nvidia-smi``.
executable = self._resolve_python_executable()
if executable is None:
return None

script = (
"import json, sys\n"
"try:\n"
" import torch\n"
"except Exception:\n"
" sys.exit(0)\n"
"if not getattr(torch, 'cuda', None) or not torch.cuda.is_available():\n"
" sys.exit(0)\n"
"device = torch.cuda.current_device()\n"
"props = torch.cuda.get_device_properties(device)\n"
"total = int(props.total_memory)\n"
"try:\n"
" free, _ = torch.cuda.mem_get_info(device)\n"
" used = max(0, total - int(free))\n"
"except Exception:\n"
" used = 0\n"
"json.dump({'gpu_name': props.name, 'total': total, 'used': used}, sys.stdout)\n"
)

Importing torch is heavy (~200ms first time) but the result is
cached one level up by ``get_device_vram_total_gb``, so the cost
is paid at most once per backend session.
"""
try:
import torch # type: ignore
except Exception:
result = subprocess.run(
[executable, "-c", script],
capture_output=True,
text=True,
timeout=15,
**_SUBPROCESS_KWARGS,
)
except (FileNotFoundError, subprocess.SubprocessError):
return None
if result.returncode != 0:
return None
payload = (result.stdout or "").strip()
if not payload:
return None
try:
if not torch.cuda.is_available():
return None
device = torch.cuda.current_device()
props = torch.cuda.get_device_properties(device)
total_bytes = int(props.total_memory)
try:
free_bytes, _ = torch.cuda.mem_get_info(device)
used_bytes = max(0, total_bytes - int(free_bytes))
except Exception:
used_bytes = 0
return {
"gpu_name": props.name,
"vram_total_gb": round(total_bytes / (1024 ** 3), 2),
"vram_used_gb": round(used_bytes / (1024 ** 3), 2),
"utilization_pct": None,
"temperature_c": None,
"power_w": None,
}
except Exception:
data = json.loads(payload)
total_bytes = int(data["total"])
used_bytes = int(data.get("used") or 0)
gpu_name = str(data.get("gpu_name") or "NVIDIA GPU")
except (ValueError, KeyError, TypeError):
return None
return {
"gpu_name": gpu_name,
"vram_total_gb": round(total_bytes / (1024 ** 3), 2),
"vram_used_gb": round(used_bytes / (1024 ** 3), 2),
"utilization_pct": None,
"temperature_c": None,
"power_w": None,
}

def _resolve_python_executable(self) -> str | None:
"""Pick a Python interpreter for the torch.cuda subprocess probe.

Prefers the embedded sidecar Python (the same one pip writes the
GPU bundle wheels to) so ``import torch`` resolves the freshly
installed wheel. Falls back to the running interpreter if the
embed override isn't set.
"""
candidates: list[str] = []
embed = os.environ.get("CHAOSENGINE_EMBED_PYTHON_BIN")
if embed:
candidates.append(embed)
candidates.append(sys.executable)
for candidate in candidates:
if candidate and os.path.isfile(candidate):
return candidate
return None

def _no_gpu_detected(self) -> dict[str, Any]:
return {
Expand Down
20 changes: 13 additions & 7 deletions src/styles.css
Original file line number Diff line number Diff line change
Expand Up @@ -6265,19 +6265,25 @@ select.text-input {
margin-top: 12px;
border: 1px solid rgba(255, 255, 255, 0.08);
border-radius: 8px;
background: rgba(0, 0, 0, 0.22);
/* Fully opaque card background — the previous rgba(0, 0, 0, 0.22) let
* the Prompt and Recent Outputs panel headers bleed through visually
* during a long GPU install, making the streaming pip output look
* like it was overlapping those cards on Windows. Match the rest of
* the surface tokens so the install log reads as a solid card. */
background: var(--surface);
width: 100%;
max-width: 100%;
overflow: hidden;
/* Establishes a stacking context so the streaming pip output stays
* above the Prompt + Recent Outputs cards in Image Studio and Video
* Studio. Without these the panel renders behind those siblings on
* Windows during a long GPU bundle install — the log is still alive
* but the user can't see it. ``z-index: 5`` is enough to win against
* the surrounding ``.panel`` cards (which set no z-index of their
* own) without fighting the global tooltip portal (z-index: 1000+). */
* above the Prompt + Recent Outputs cards even on Chrome / WebKit
* versions that lay out adjacent grid rows with subpixel overlap. */
position: relative;
z-index: 5;
/* ``contain: layout`` keeps the panel's growth from leaking into
* sibling grid rows — important when the parent runtime callout is
* laid out as a flex column and the install log expands by 350+ px
* during a torch download. */
contain: layout;
}
.install-log-summary {
cursor: pointer;
Expand Down
116 changes: 69 additions & 47 deletions tests/test_gpu_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,46 +2,26 @@

The pre-fix path returned system RAM via ``psutil.virtual_memory().total``
when ``nvidia-smi`` wasn't on PATH — so an RTX 4090 box on Windows showed
12 GB total in the safety estimator instead of 24 GB. The new path probes
``torch.cuda`` via a short-lived subprocess (so we don't lock torch DLLs
in the backend process and break the next ``Install GPU runtime``), then
falls back to ``nvidia-smi``, and only returns ``vram_total_gb=None`` when
neither answers. The frontend treats ``None`` as "unknown" and skips the
spurious crash warning.
"""

from __future__ import annotations

import sys
import types
import json
import unittest
from unittest import mock

from backend_service.helpers import gpu as gpu_module


def _fake_torch_with_cuda(total_bytes: int, free_bytes: int, name: str = "NVIDIA GeForce RTX 4090") -> types.ModuleType:
cuda = types.SimpleNamespace()
cuda.is_available = lambda: True
cuda.current_device = lambda: 0

class _Props:
def __init__(self, mem: int, gpu_name: str) -> None:
self.total_memory = mem
self.name = gpu_name

cuda.get_device_properties = lambda device: _Props(total_bytes, name)
cuda.mem_get_info = lambda device: (free_bytes, total_bytes)

fake = types.ModuleType("torch")
fake.cuda = cuda # type: ignore[attr-defined]
return fake


def _fake_torch_no_cuda() -> types.ModuleType:
cuda = types.SimpleNamespace()
cuda.is_available = lambda: False
fake = types.ModuleType("torch")
fake.cuda = cuda # type: ignore[attr-defined]
return fake
def _fake_completed_process(returncode: int, stdout: str, stderr: str = ""):
"""Build a CompletedProcess-shaped mock for ``subprocess.run``."""
return mock.MagicMock(returncode=returncode, stdout=stdout, stderr=stderr)


class SnapshotTorchCudaTests(unittest.TestCase):
Expand All @@ -58,7 +38,14 @@ def tearDown(self) -> None:
def test_torch_cuda_returns_full_vram_for_rtx_4090(self) -> None:
twenty_four_gb = 24 * 1024 ** 3
free = 22 * 1024 ** 3
with mock.patch.dict(sys.modules, {"torch": _fake_torch_with_cuda(twenty_four_gb, free)}):
used = twenty_four_gb - free
payload = json.dumps({
"gpu_name": "NVIDIA GeForce RTX 4090",
"total": twenty_four_gb,
"used": used,
})
with mock.patch.object(self.monitor, "_resolve_python_executable", return_value="/usr/bin/python3"), \
mock.patch("backend_service.helpers.gpu.subprocess.run", return_value=_fake_completed_process(0, payload)):
snapshot = self.monitor._snapshot_torch_cuda()
self.assertIsNotNone(snapshot)
assert snapshot is not None # type narrow
Expand All @@ -68,26 +55,61 @@ def test_torch_cuda_returns_full_vram_for_rtx_4090(self) -> None:
self.assertEqual(snapshot["vram_used_gb"], 2.0)

def test_torch_cuda_unavailable_returns_none(self) -> None:
with mock.patch.dict(sys.modules, {"torch": _fake_torch_no_cuda()}):
# Subprocess exits 0 with empty stdout — the inline script printed
# nothing because torch.cuda.is_available() was False.
with mock.patch.object(self.monitor, "_resolve_python_executable", return_value="/usr/bin/python3"), \
mock.patch("backend_service.helpers.gpu.subprocess.run", return_value=_fake_completed_process(0, "")):
snapshot = self.monitor._snapshot_torch_cuda()
self.assertIsNone(snapshot)

def test_torch_not_installed_returns_none(self) -> None:
# Monkeypatch the import to raise ImportError.
original_import = __builtins__["__import__"] if isinstance(__builtins__, dict) else __builtins__.__import__

def fake_import(name, *args, **kwargs):
if name == "torch":
raise ImportError("No module named 'torch'")
return original_import(name, *args, **kwargs)

with mock.patch("builtins.__import__", side_effect=fake_import):
# Also remove any previously cached torch entry so the
# function's ``import torch`` actually invokes the patched
# ``__import__`` instead of resolving via sys.modules.
with mock.patch.dict(sys.modules, {}, clear=False):
sys.modules.pop("torch", None)
snapshot = self.monitor._snapshot_torch_cuda()
# Subprocess exits 0 with empty stdout — the inline script's
# ``import torch`` raised, the except branch did sys.exit(0).
with mock.patch.object(self.monitor, "_resolve_python_executable", return_value="/usr/bin/python3"), \
mock.patch("backend_service.helpers.gpu.subprocess.run", return_value=_fake_completed_process(0, "")):
snapshot = self.monitor._snapshot_torch_cuda()
self.assertIsNone(snapshot)

def test_subprocess_error_returns_none(self) -> None:
    """A probe launch failure (interpreter missing) degrades to None."""
    exe_patch = mock.patch.object(
        self.monitor, "_resolve_python_executable", return_value="/usr/bin/python3"
    )
    run_patch = mock.patch(
        "backend_service.helpers.gpu.subprocess.run",
        side_effect=FileNotFoundError("python3 missing"),
    )
    with exe_patch, run_patch:
        self.assertIsNone(self.monitor._snapshot_torch_cuda())

def test_no_python_executable_returns_none(self) -> None:
    """No usable interpreter on disk means the probe is skipped outright."""
    with mock.patch.object(
        self.monitor, "_resolve_python_executable", return_value=None
    ):
        result = self.monitor._snapshot_torch_cuda()
    self.assertIsNone(result)

def test_does_not_import_torch_in_main_process(self) -> None:
    """Critical: importing torch in-process locks Windows DLLs and
    breaks the next Install GPU runtime click. The probe MUST go via
    a child process so its DLL handles are released on exit."""
    payload = json.dumps(
        {"gpu_name": "RTX 4090", "total": 24 * 1024 ** 3, "used": 0}
    )
    recorded_commands: list[list[str]] = []

    def recording_run(cmd, **kwargs):
        # Capture every spawned command line so we can inspect it below.
        recorded_commands.append(list(cmd))
        return _fake_completed_process(0, payload)

    exe_patch = mock.patch.object(
        self.monitor, "_resolve_python_executable", return_value="/usr/bin/python3"
    )
    run_patch = mock.patch(
        "backend_service.helpers.gpu.subprocess.run", side_effect=recording_run
    )
    with exe_patch, run_patch:
        self.monitor._snapshot_torch_cuda()

    self.assertEqual(len(recorded_commands), 1)
    # The probe must spawn a Python with a -c script containing
    # 'import torch'. If the implementation ever switches back to
    # an in-process import this assertion will catch it.
    interpreter_args = recorded_commands[0]
    self.assertEqual(interpreter_args[1], "-c")
    self.assertIn("import torch", interpreter_args[2])

def test_skipped_on_macos(self) -> None:
    """On Darwin the torch.cuda probe is bypassed entirely."""
    self.monitor._system = "Darwin"
    self.assertIsNone(self.monitor._snapshot_torch_cuda())


Expand Down
Loading