8 changes: 7 additions & 1 deletion .github/workflows/build.yml
@@ -34,7 +34,13 @@ jobs:
run: |
python -m venv .venv
source .venv/bin/activate
pip install -e ".[desktop,dev]"
# [images] brings torch+diffusers+accelerate+pillow+safetensors, which
# tests/test_video_runtime.py needs because probe() does a real
# ``import torch`` after the mocked _find_missing returns. Without
# torch installed, probe bails out with "PyTorch could not be
# imported cleanly" and every ``realGenerationAvailable == True``
# assertion fails.
pip install -e ".[desktop,dev,images]"
python -m pytest tests/ -v --tb=short
build:
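For context on the workflow comment above, here is a minimal sketch of the kind of test the `[images]` extra unblocks. The module path, class name, and field names are assumptions, not the actual contents of `tests/test_video_runtime.py`:

```python
# Sketch only: shows why mocking _find_missing is not enough on its own.
# probe() still performs a real ``import torch`` afterwards, so torch must
# actually be installed in CI for the final assertion to hold.
from backend_service import video_runtime  # assumed module path


def test_probe_reports_real_generation(monkeypatch):
    # Pretend the optional packages all look installed...
    monkeypatch.setattr(video_runtime, "_find_missing", lambda: [])

    status = video_runtime.VideoRuntime().probe()  # assumed class name

    # ...but this only passes when ``import torch`` inside probe() succeeds,
    # which is why the workflow installs the [images] extra.
    assert status.realGenerationAvailable is True
```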
68 changes: 68 additions & 0 deletions CLAUDE.md
@@ -1,5 +1,73 @@
# ChaosEngineAI — Project Guide

## Behavioral Guidelines (Karpathy)

Sourced from `~/andrej-karpathy-skills/CLAUDE.md`. Bias toward caution over
speed — for trivial tasks, use judgment.

### 1. Think Before Coding

**Don't assume. Don't hide confusion. Surface tradeoffs.**

Before implementing:
- State your assumptions explicitly. If uncertain, ask.
- If multiple interpretations exist, present them — don't pick silently.
- If a simpler approach exists, say so. Push back when warranted.
- If something is unclear, stop. Name what's confusing. Ask.

### 2. Simplicity First

**Minimum code that solves the problem. Nothing speculative.**

- No features beyond what was asked.
- No abstractions for single-use code.
- No "flexibility" or "configurability" that wasn't requested.
- No error handling for impossible scenarios.
- If you write 200 lines and it could be 50, rewrite it.

Ask yourself: "Would a senior engineer say this is overcomplicated?" If yes, simplify.

### 3. Surgical Changes

**Touch only what you must. Clean up only your own mess.**

When editing existing code:
- Don't "improve" adjacent code, comments, or formatting.
- Don't refactor things that aren't broken.
- Match existing style, even if you'd do it differently.
- If you notice unrelated dead code, mention it — don't delete it.

When your changes create orphans:
- Remove imports/variables/functions that YOUR changes made unused.
- Don't remove pre-existing dead code unless asked.

The test: every changed line should trace directly to the user's request.

### 4. Goal-Driven Execution

**Define success criteria. Loop until verified.**

Transform tasks into verifiable goals:
- "Add validation" → "Write tests for invalid inputs, then make them pass"
- "Fix the bug" → "Write a test that reproduces it, then make it pass"
- "Refactor X" → "Ensure tests pass before and after"

For multi-step tasks, state a brief plan:
```
1. [Step] → verify: [check]
2. [Step] → verify: [check]
3. [Step] → verify: [check]
```

Strong success criteria let you loop independently. Weak criteria
("make it work") require constant clarification.

**These guidelines are working if:** diffs show fewer unnecessary changes,
rewrites due to overcomplication drop, and clarifying questions come
before implementation rather than after mistakes.

---

## Architecture Overview

ChaosEngineAI is a desktop AI inference app built with:
60 changes: 48 additions & 12 deletions backend_service/app.py
@@ -561,22 +561,46 @@ async def log_requests(request, call_next):

register_routes(app)

# Kick off a background torch import so the first Video Studio probe
# doesn't pay the 30-60s cold-disk cost on Windows. Failures are captured
# and surfaced by probe() itself.
start_torch_warmup()

# Deliberately DO NOT call start_torch_warmup() here. Warmup eagerly
# imports torch into the backend process, which on Windows pins every
# torch/lib/*.dll into the process handle table. That blocks
# /api/setup/install-gpu-bundle from pip-installing a new torch (pip's
# --upgrade --target rmtree can't remove DLLs held by another process).
# Warmup still exists for callers that want pre-priming (preload() in
# the video/image runtimes triggers it) — it just isn't automatic.
return app


app = create_app()


def _watch_parent_and_exit():
"""Exit if our parent process dies (e.g. Tauri shell killed via Ctrl+C).

This prevents orphaned backend + MLX worker processes from holding
GPU memory after the desktop app shuts down.
"""Kill ourselves and every child when the Tauri parent dies.

Fires when the desktop shell crashes, gets force-closed from Task
Manager / Activity Monitor, or is killed via Ctrl+C in dev. Without
this, subprocess children we spawned (llama-server, llama-server-turbo,
MLX worker) get re-parented to init/launchd and become multi-GB
memory ghosts — the exact pattern the user reported where two
llama-server.exe processes survived at 28 GB each.

Platform semantics:
- Unix (macOS / Linux): the backend was started inside its own
session via setsid() in Tauri's pre_exec hook, so all our
descendants share our process group. killpg signals the whole
tree atomically. We send SIGTERM, then SIGKILL 300ms later as a
belt-and-braces measure — SIGTERM gives llama-server a chance to flush
caches / release GPU handles cleanly, SIGKILL catches anything
that was ignoring SIGTERM.
- Windows: no killpg equivalent. os.kill(self, SIGTERM) just
kills Python — the llama-server grandchildren still leak.
The real fix on Windows is a Job Object created by the Tauri
shell with JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE (implemented in
src-tauri/src/lib.rs). When the Tauri process exits, Windows
kernel kills the whole job. This watchdog runs first as a
fast-path termination trigger; the Job Object is the safety
net for the case where Python itself crashed before the
watchdog fires.
"""
import threading
initial_ppid = os.getppid()
@@ -588,13 +612,25 @@ def _watcher():
time.sleep(0.5)
current_ppid = os.getppid()
if current_ppid != initial_ppid or current_ppid == 1:
# Parent died — kill ourselves and any subprocess children
try:
if hasattr(os, "killpg"):
# Unix: kill our entire process group (includes MLX worker children)
# Unix: SIGTERM the whole process group, give
# llama-server a moment to release GPU VRAM,
# then SIGKILL as backup. killpg(pgrp, SIGKILL)
# kills us too since we're in the group, so the
# os._exit below is only reached if SIGKILL was
# somehow ignored (e.g. PID 1 protections).
os.killpg(os.getpgrp(), signal.SIGTERM)
time.sleep(0.3)
try:
os.killpg(os.getpgrp(), signal.SIGKILL)
except ProcessLookupError:
pass # group gone already, fine
else:
# Windows: terminate our own process
# Windows fallback. The Job Object in the Tauri
# shell is the real mechanism for Windows orphan
# prevention; this just makes sure Python itself
# exits fast so the Job handle closes promptly.
os.kill(os.getpid(), signal.SIGTERM)
except Exception:
pass
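For reference, a minimal sketch of the warmup pattern the comment earlier in this file's diff relies on. `start_torch_warmup()` is a real symbol in the codebase, but this body is an assumed illustration of the run-once, background-thread shape, not the actual implementation:

```python
import threading

_warmup_lock = threading.Lock()
_warmup_started = False
_warmup_error: Exception | None = None


def start_torch_warmup() -> None:
    """Import torch once, on a background thread, only when a caller asks.

    Not invoked at app startup (see create_app above) so the backend
    process doesn't pin torch DLLs before /api/setup/install-gpu-bundle
    has a chance to replace them.
    """
    global _warmup_started
    with _warmup_lock:
        if _warmup_started:
            return
        _warmup_started = True

    def _import() -> None:
        global _warmup_error
        try:
            import torch  # noqa: F401  (the slow, cold-disk part)
        except Exception as exc:  # captured here, surfaced later by probe()
            _warmup_error = exc

    threading.Thread(target=_import, daemon=True, name="torch-warmup").start()
```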
42 changes: 14 additions & 28 deletions backend_service/image_runtime.py
@@ -365,6 +365,13 @@ def __init__(self) -> None:
self._device: str | None = None

def probe(self) -> ImageRuntimeStatus:
# Deliberately does NOT ``import torch`` — that would load
# torch/lib/*.dll into the backend process handle table, and on
# Windows those locked DLLs break /api/setup/install-gpu-bundle
# (pip's rmtree can't remove files another process has open).
# find_spec answers "is it installable?" without triggering the
# import side effects. Device detection (cuda vs cpu) is deferred
# to preload/generate where we're about to import torch anyway.
missing = [
package
for package, module_name in (
@@ -378,8 +385,8 @@ def probe(self) -> ImageRuntimeStatus:
]
if missing:
message = (
"Install the optional image runtime packages to enable real local generation: "
"pip install -e '.[desktop,images]'"
"Install the GPU image runtime packages to enable real local generation. "
"Click the 'Install GPU runtime' button above."
)
return ImageRuntimeStatus(
activeEngine="placeholder",
Expand All @@ -390,39 +397,18 @@ def probe(self) -> ImageRuntimeStatus:
loadedModelRepo=self._loaded_repo,
)

try:
import torch # type: ignore
except Exception as exc:
return ImageRuntimeStatus(
activeEngine="placeholder",
realGenerationAvailable=False,
missingDependencies=["torch"],
pythonExecutable=_resolve_image_python(),
message=f"PyTorch could not be imported cleanly: {exc}",
loadedModelRepo=self._loaded_repo,
)

device = self._detect_device(torch)
message = (
"Real local generation is available. Download an image model locally, then Image Studio "
"will use the diffusers runtime instead of the placeholder engine."
)
# A CPU-only torch on a machine with an NVIDIA GPU is the single
# most common "image gen takes 10 minutes per step" misconfiguration
# on Windows and Linux. Detect the NVIDIA driver via nvidia-smi and,
# if torch didn't pick up CUDA, surface an actionable hint instead
# of letting users watch the progress bar crawl.
if device == "cpu" and platform.system() in ("Windows", "Linux") and _nvidia_gpu_present():
message = (
"torch was imported but CUDA is unavailable — diffusion will run on CPU "
"(expect minutes per step). Reinstall with the CUDA wheel: "
"pip install --upgrade --force-reinstall torch "
"--index-url https://download.pytorch.org/whl/cu121"
)
return ImageRuntimeStatus(
activeEngine="diffusers",
realGenerationAvailable=True,
device=device,
# ``device`` is the *currently-loaded* model's device, or None
# if no model is loaded. We no longer speculatively import
# torch just to report cuda/mps/cpu availability in the empty
# case — users find out on the first Generate, which is cheap.
device=self._device,
pythonExecutable=_resolve_image_python(),
message=message,
loadedModelRepo=self._loaded_repo,
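A minimal sketch of the two checks the rewritten probe() leans on: `find_spec` to answer "is it installed?" without import side effects, and an `nvidia-smi` probe along the lines of the `_nvidia_gpu_present()` helper referenced above. The helper bodies here are assumptions:

```python
import importlib.util
import shutil
import subprocess


def _is_installed(module_name: str) -> bool:
    # find_spec only consults import metadata; it never executes the module,
    # so no torch DLLs get mapped into this process.
    return importlib.util.find_spec(module_name) is not None


def _nvidia_gpu_present() -> bool:
    # Driver-level check that works even when the installed torch is CPU-only.
    if shutil.which("nvidia-smi") is None:
        return False
    try:
        result = subprocess.run(
            ["nvidia-smi", "-L"], capture_output=True, text=True, timeout=5
        )
    except (OSError, subprocess.TimeoutExpired):
        return False
    return result.returncode == 0 and "GPU" in result.stdout
```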
7 changes: 6 additions & 1 deletion backend_service/inference.py
@@ -1004,7 +1004,12 @@ class MockInferenceEngine(BaseInferenceEngine):
"""

engine_name = "mock"
engine_label = "No backend"
# Displayed in the Dashboard "Runtime engine" stat. Used to read "No
# backend", which collided with the footer's "BACKEND ONLINE" badge —
# two different meanings of "backend" (inference engine vs API sidecar)
# sitting on the same screen. "Idle" matches the ``RuntimeStatus.state``
# enum already used elsewhere and doesn't claim the sidecar is down.
engine_label = "Idle"

def __init__(self, capabilities: BackendCapabilities) -> None:
self.capabilities = capabilities
2 changes: 2 additions & 0 deletions backend_service/routes/__init__.py
@@ -23,6 +23,7 @@ def register_routes(app: FastAPI) -> None:
from .plugins import router as plugins_router
from .finetuning import router as finetuning_router
from .prompts import router as prompts_router
from .diagnostics import router as diagnostics_router

app.include_router(auth_router)
app.include_router(health_router)
@@ -41,3 +42,4 @@ def register_routes(app: FastAPI) -> None:
app.include_router(plugins_router)
app.include_router(finetuning_router)
app.include_router(prompts_router)
app.include_router(diagnostics_router)
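The new diagnostics router itself is not part of this hunk. A minimal sketch of the shape it would need to satisfy the registration above; the prefix and endpoint are hypothetical:

```python
# backend_service/routes/diagnostics.py (illustrative shape only)
from fastapi import APIRouter

router = APIRouter(prefix="/api/diagnostics", tags=["diagnostics"])


@router.get("/summary")
async def diagnostics_summary() -> dict:
    # Hypothetical payload; the real routes live in routes/diagnostics.py.
    return {"status": "ok"}
```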