diff --git a/.claude/skills/processing-state-safety/SKILL.md b/.claude/skills/processing-state-safety/SKILL.md index 2861b2a042..fbc031cab5 100644 --- a/.claude/skills/processing-state-safety/SKILL.md +++ b/.claude/skills/processing-state-safety/SKILL.md @@ -5,12 +5,13 @@ description: > abort/error paths, or CompleteResponse in CopilotService. Use when: (1) Adding or modifying code paths that set IsProcessing=false, (2) Touching HandleSessionEvent, CompleteResponse, AbortSessionAsync, or the processing watchdog, (3) Adding new - SDK event handlers, (4) Debugging stuck sessions showing "Thinking..." forever, - (5) Modifying IsResumed, HasUsedToolsThisTurn, or ActiveToolCallCount, + SDK event handlers, (4) Debugging stuck sessions showing "Thinking..." forever + or a stuck spinner, (5) Modifying IsResumed, HasUsedToolsThisTurn, or ActiveToolCallCount, (6) Adding diagnostic log tags, (7) Modifying session restore paths (RestoreSingleSessionAsync) that must initialize watchdog-dependent state, (8) Modifying ReconcileOrganization or any code that reads Organization.Sessions - during the IsRestoring window. Covers: 13 invariants from 10 PRs of fix cycles, + during the IsRestoring window, (9) Debugging sessions that appear hung or unresponsive after tool use. + Covers: 13 invariants from 10 PRs of fix cycles, the 9 code paths that clear IsProcessing, and common regression patterns. --- @@ -31,6 +32,7 @@ Every code path that sets `IsProcessing = false` MUST also: 10. Fire `OnSessionComplete` (unblocks orchestrator loops waiting for completion) 11. Add a diagnostic log entry (`[COMPLETE]`, `[ERROR]`, `[ABORT]`, etc.) 12. Run on UI thread (via `InvokeOnUI()` or already on UI thread) +13. After changes, run `ProcessingWatchdogTests.cs` to catch regressions ## The 9 Paths That Clear IsProcessing @@ -74,6 +76,17 @@ ALL IsProcessing mutations go through UI thread via `InvokeOnUI()`. Use generation guard before clearing IsProcessing. 
`SyncContext.Post` is async — new `SendPromptAsync` can race between `Post()` and callback. +```csharp +// Capture BEFORE posting to UI thread +var gen = Interlocked.Read(ref state.ProcessingGeneration); +InvokeOnUI(() => +{ + // Validate INSIDE the callback — abort if a new turn started + if (Interlocked.Read(ref state.ProcessingGeneration) != gen) return; + // Safe to clear IsProcessing here +}); +``` + ### INV-4: No hardcoded short timeouts NEVER add hardcoded short timeouts for session resume. The watchdog (120s/600s) with tiered approach is the correct mechanism. @@ -88,9 +101,11 @@ Cleared on ALL termination paths. Extends watchdog to 600s. Clearing guarded on `!hasActiveTool && !HasUsedToolsThisTurn`. ### INV-7: Volatile for cross-thread fields -`HasUsedToolsThisTurn`, `HasReceivedEventsSinceResume` should use -`Volatile.Write`/`Volatile.Read`. ARM weak memory model issue. -(Currently partial — resets use plain assignment.) +When adding NEW cross-thread boolean/int flags, use `Volatile.Write`/`Volatile.Read` +for ARM weak memory model correctness. Existing fields `HasUsedToolsThisTurn` and +`HasReceivedEventsSinceResume` use plain assignment (pre-existing inconsistency — +tracked separately, do not fix inline). Do NOT introduce additional plain-assignment +cross-thread fields without a tracking comment explaining the gap. ### INV-8: No InvokeAsync in HandleComplete `HandleComplete` is already on UI thread. `InvokeAsync` defers execution @@ -180,6 +195,34 @@ intent is less clear when reading cross-threaded code. **Retired mistake (was #2):** *ActiveToolCallCount as sole tool signal* — still relevant per INV-5, but the more impactful version is #2 above (suppressing the fallback entirely). +## Diagnosing a Stuck Session + +When a session shows "Thinking..." indefinitely: + +1. 
**Check the diagnostic log** — `~/.polypilot/event-diagnostics.log` + ```bash + grep 'SESSION_NAME' ~/.polypilot/event-diagnostics.log | tail -20 + ``` + Look for the last `[SEND]` (turn started) and whether `[IDLE]` or `[COMPLETE]` followed. + +2. **Check if the watchdog is running** — look for `[WATCHDOG]` entries after the `[SEND]`. + If none appear, the watchdog wasn't started (see INV-9 for restore path issues). + +3. **Check `IsProcessing` state** — via MauiDevFlow CDP: + ```bash + maui-devflow cdp Runtime evaluate "document.querySelector('.processing-indicator')?.textContent" + ``` + +4. **Common stuck patterns:** + | Symptom | Likely Cause | Fix | + |---------|-------------|-----| + | `[SEND]` then silence | SDK never responded, watchdog will catch at 120s | Wait or abort | + | `[EVT] TurnEnd` but no `[IDLE]` | Zero-idle SDK bug | Watchdog catches at 30s fallback (INV-10) | + | `[COMPLETE]` fired but spinner persists | UI thread not notified | Check INV-2, INV-8 | + | `[WATCHDOG]` clears but re-sticks | New turn started before watchdog callback ran | Check INV-3 generation guard | + +5. **Nuclear option** — user clicks Stop (AbortSessionAsync, path #5/#6). + ## Regression History 10 PRs of fix/regression cycles: #141 → #147 → #148 → #153 → #158 → #163 → #164 → #276 → #284 → #332. diff --git a/.claude/skills/processing-state-safety/tests/eval.yaml b/.claude/skills/processing-state-safety/tests/eval.yaml new file mode 100644 index 0000000000..738a409542 --- /dev/null +++ b/.claude/skills/processing-state-safety/tests/eval.yaml @@ -0,0 +1,276 @@ +scenarios: + - name: "New IsProcessing=false path needs full cleanup checklist" + prompt: | + I need to add a new code path in CopilotService that clears IsProcessing when a + network timeout is detected. Can you write the code for this? The path should be + in a Task.Run callback that runs on a timer. Make sure it's correct. 
+ assertions: + - type: "output_contains" + value: "FlushCurrentResponse" + - type: "output_contains" + value: "InvokeOnUI" + - type: "output_contains" + value: "ProcessingGeneration" + - type: "output_matches" + pattern: "IsResumed|HasUsedToolsThisTurn|ActiveToolCallCount" + - type: "output_matches" + pattern: "ProcessingPhase|ProcessingStartedAt|ToolCallCount|SendingFlag" + - type: "output_matches" + # OnSessionComplete unblocks orchestrator loops — must be in every cleanup path + pattern: "OnSessionComplete|ClearPermissionDenials" + - type: "output_matches" + # Must include a diagnostic log tag for debugging stuck sessions + pattern: "\\[ERROR\\]|\\[TIMEOUT\\]|\\[NETWORK\\]|diagnostic.log" + rubric: + - "Agent includes all or nearly all of the 13-item cleanup checklist (IsResumed, HasUsedToolsThisTurn, ActiveToolCallCount, ProcessingStartedAt, ToolCallCount, ProcessingPhase, SendingFlag, ClearPermissionDenials, FlushCurrentResponse, OnSessionComplete, diagnostic log)" + - "Agent uses InvokeOnUI() class method (not local Invoke) for the background→UI dispatch per INV-13" + - "Agent captures ProcessingGeneration before the InvokeOnUI call and validates it inside the lambda (generation guard pattern per INV-3 and INV-12)" + - "Agent adds a diagnostic log entry with an appropriate tag like [ERROR] or [TIMEOUT]" + timeout: 120 + + - name: "Debugging a session stuck showing Thinking... forever" + prompt: | + A user reports that a session is stuck showing "Thinking..." indefinitely. + The session used some tools earlier but nothing happened for several minutes. + How do I diagnose and fix this? 
+ assertions: + - type: "output_contains" + value: "event-diagnostics.log" + - type: "output_contains" + value: "[SEND]" + - type: "output_matches" + pattern: "(?i)(watchdog|120s|600s|WATCHDOG)" + - type: "output_matches" + # Should reference specific diagnostic tags, not just generic "check the logs" + pattern: "\\[IDLE\\]|\\[COMPLETE\\]|\\[WATCHDOG\\]|\\[EVT\\]" + - type: "output_not_contains" + value: "restart the app" + # Should suggest specific diagnostic steps, not generic advice + rubric: + - "Agent directs the user to check event-diagnostics.log and explains what tags to look for ([SEND], [IDLE], [COMPLETE], [WATCHDOG])" + - "Agent explains the watchdog timeout tiers (120s inactivity, 600s tool/resume) and which case likely applies given tools were used" + - "Agent references the stuck-session diagnostic table: [SEND] then silence, [EVT] TurnEnd but no [IDLE], [COMPLETE] fired but spinner persists, etc." + - "Agent mentions the nuclear option (AbortSessionAsync / Stop button) as a fallback" + timeout: 120 + + - name: "Unrelated UI styling task should not trigger processing-state concepts" + prompt: | + I want to update the CSS for the session list items in SessionListItem.razor to + add a subtle hover animation. Can you help me write the CSS? + assertions: + - type: "output_not_contains" + value: "IsProcessing" + - type: "output_not_contains" + value: "FlushCurrentResponse" + - type: "output_not_contains" + value: "InvokeOnUI" + - type: "output_not_contains" + value: "ProcessingGeneration" + - type: "output_not_contains" + value: "watchdog" + rubric: + - "Agent focuses purely on CSS/styling and does not inject unrelated processing-state safety guidance" + - "Response is concise and on-topic for a CSS change" + timeout: 60 + + - name: "TurnEnd fallback regression - avoid suppressing with HasUsedToolsThisTurn" + prompt: | + The AssistantTurnEndEvent fallback is firing too early during multi-tool agent + sessions and completing the response prematurely. 
I want to fix this by checking + if HasUsedToolsThisTurn is true and skipping the fallback entirely in that case. + Can you implement this fix? + assertions: + - type: "output_matches" + pattern: "(?i)(do not|don't|should not|shouldn't|wrong|dangerous|avoid|warn|mistake)" + - type: "output_contains" + value: "ActiveToolCallCount" + - type: "output_matches" + pattern: "(?i)(30s|30.000|30_000|extended|delay|TurnEndIdleToolFallbackAdditionalMs)" + - type: "output_matches" + # Must reference the cancellation mechanism, not just the delay + pattern: "(?i)(cancel|CancellationToken|AssistantTurnStartEvent|CTS)" + rubric: + - "Agent explicitly warns against using HasUsedToolsThisTurn to skip the fallback entirely, explaining that it leaves agent sessions with no recovery if SessionIdleEvent is dropped (SDK bug #299)" + - "Agent proposes the correct approach: use ActiveToolCallCount > 0 to skip the 4s fallback, with an extended 30s delay when tools were used but ActiveToolCallCount is 0" + - "Agent references the cancellation token from AssistantTurnStartEvent as the mechanism to prevent premature firing in multi-round tool use" + timeout: 120 + + - name: "Session restore must initialize watchdog-dependent state" + prompt: | + I'm adding a new boolean flag IsHighPrioritySession to AgentSessionInfo. It needs + to affect the watchdog timeout — when true, use a longer 900s timeout instead of + 600s. I've added it to SendPromptAsync. Is there anything else I need to update? 
+ assertions: + - type: "output_contains" + value: "RestoreSingleSessionAsync" + - type: "output_matches" + pattern: "(?i)(restore|restart|resume)" + - type: "output_matches" + pattern: "(?i)(before.*watchdog|StartProcessingWatchdog|watchdog.*depend)" + - type: "output_matches" + # Should mention the precedent or dual-path pattern + pattern: "(?i)(IsMultiAgentSession|both.*path|two.*path|dual|separate.*path)" + rubric: + - "Agent identifies that the restore path (RestoreSingleSessionAsync) must also initialize the new flag, not just SendPromptAsync" + - "Agent explains that state must be set BEFORE StartProcessingWatchdog is called, and the restore path is independent of the send path" + - "Agent references the IsMultiAgentSession precedent (PR #284) where the same mistake caused premature watchdog kills" + timeout: 120 + + - name: "ChatDatabase catch filter narrowing - prevent regression" + prompt: | + The ChatDatabase is swallowing too many exceptions. I want to narrow the catch + filter from `catch (Exception ex)` to `catch (SqliteException ex)` so we only + catch database-specific errors. Can you update the AddMessageAsync callers? 
+ assertions: + - type: "output_not_contains" + value: "SqliteException" + # Should NOT recommend narrowing to SqliteException + - type: "output_matches" + pattern: "(?i)(unobserved|crash|AggregateException|ObjectDisposedException)" + - type: "output_matches" + pattern: "(?i)(fire.and.forget|task exception|source of truth|events\\.jsonl)" + - type: "output_matches" + # Should explain self-healing via BulkInsertAsync + pattern: "(?i)(self.heal|BulkInsert|replay|restart|write.through)" + rubric: + - "Agent explicitly warns against narrowing the catch filter, explaining that fire-and-forget tasks with narrow catches produce unobserved task exceptions that crash the app" + - "Agent explains that events.jsonl is the source of truth and DB write failures are self-healing via BulkInsertAsync on resume — broad catch is intentional" + - "Agent may reference PR #276 where this exact narrowing caused crashes in production" + timeout: 120 + + - name: "Background thread IsProcessing mutation without UI marshaling" + prompt: | + I'm adding a new SDK event handler for SessionReconnectEvent in + CopilotService.Events.cs. When we get this event, I need to clear the + processing state. Here's what I have inside HandleSessionEvent which runs + on a background thread: + + ```csharp + case "session.reconnect": + state.Info.IsProcessing = false; + state.Info.IsResumed = false; + state.Info.HasUsedToolsThisTurn = false; + state.Info.ActiveToolCallCount = 0; + OnStateChanged?.Invoke(); + break; + ``` + + Does this look correct? 
+ assertions: + - type: "output_matches" + pattern: "(?i)(InvokeOnUI|UI thread|main thread|marshal|dispatch|synchronization)" + - type: "output_matches" + pattern: "(?i)(background.*thread|thread.*safe|cross.*thread)" + - type: "output_matches" + pattern: "ProcessingGeneration|generation.*guard" + - type: "output_matches" + pattern: "FlushCurrentResponse|flush" + - type: "output_matches" + # Must identify the missing companion fields from the 13-item checklist + pattern: "(?i)(ProcessingPhase|ProcessingStartedAt|ToolCallCount|SendingFlag|missing|incomplete)" + - type: "output_matches" + # Must mention diagnostic log entry or OnSessionComplete + pattern: "(?i)(diagnostic.*log|\\[RECONNECT\\]|OnSessionComplete|ClearPermissionDenials)" + rubric: + - "Agent identifies the critical thread safety violation: IsProcessing and companion fields must not be mutated directly on a background SDK event thread" + - "Agent recommends wrapping the state mutations in InvokeOnUI() to marshal to the UI thread" + - "Agent points out missing companion fields beyond the ones listed (ProcessingPhase, ProcessingStartedAt, ToolCallCount, SendingFlag, ClearPermissionDenials, OnSessionComplete)" + - "Agent mentions the ProcessingGeneration guard pattern and the need for FlushCurrentResponse before clearing IsProcessing" + - "Agent recommends adding a diagnostic log entry with an appropriate tag" + timeout: 120 + + - name: "No hardcoded short timeout for session resume (INV-4)" + prompt: | + After a session is restored, we want to quickly detect if the turn was already + complete so the spinner doesn't hang. Can I add a 5-second sleep after + RestoreSingleSessionAsync and then clear IsProcessing if no events arrived? 
+ Something like: + ```csharp + await Task.Delay(5000); + if (state.Info.IsProcessing && !state.Info.HasReceivedEventsSinceResume) + await CompleteResponseAsync(state); + ``` + assertions: + - type: "output_matches" + pattern: "(?i)(do not|don't|avoid|should not|wrong|dangerous|warn|problem|incorrect)" + - type: "output_matches" + pattern: "(?i)(watchdog|120s|600s|tiered|timeout.tier)" + - type: "output_matches" + # Should reference IsResumed or resume quiescence mechanism + pattern: "(?i)(IsResumed|resume.quiescence|30s|quiescence)" + - type: "output_not_contains" + value: "Task.Delay" + # Should not approve or refine the Task.Delay approach + rubric: + - "Agent warns against hardcoded short timeouts for session resume, explaining that legitimate multi-agent or tool-heavy resumes need the full watchdog timeout window" + - "Agent directs the developer to the existing watchdog mechanism (IsResumed flag + 600s extended timeout, or 30s resume-quiescence path) as the correct solution" + - "Agent explains that the 5s Task.Delay would prematurely kill sessions that are genuinely still processing after an app restart" + timeout: 120 + + - name: "Watchdog timeout should use 3-way branch not a flat 600s wait" + prompt: | + I'm simplifying the watchdog timeout logic. Currently it has complicated case + branches. I want to replace it with a single check: if ActiveToolCallCount > 0 + OR HasUsedToolsThisTurn, wait the full 600s before declaring the session stuck + and killing it. Otherwise use 120s. Is this safe? 
+ assertions: + - type: "output_matches" + pattern: "(?i)(do not|don't|should not|shouldn't|wrong|dangerous|problem|issue|warn)" + - type: "output_matches" + pattern: "(?i)(SessionIdleEvent|dropped|lost.*event|event.*dropped|zero.idle|terminal event)" + - type: "output_matches" + pattern: "(?i)(Case B|CompleteResponse.*clean|cleanly.*complete|tools.*done|!hasActiveTool)" + - type: "output_not_contains" + value: "that's fine" + rubric: + - "Agent warns against the flat 600s wait and explains that tools may have finished (ActiveToolCallCount==0, HasUsedToolsThisTurn==true) but the SDK dropped SessionIdleEvent — leading to a 10-minute kill of a successfully-completed session" + - "Agent describes the correct 3-way branch: Case A (active tools + server alive → reset timer), Case B (tools done, used tools → call CompleteResponse cleanly), Case C (default kill with error message)" + - "Agent explains that Case B calls CompleteResponse cleanly without an error message, preserving the session's output" + timeout: 120 + + - name: "Edge case - adding cross-thread boolean without Volatile (INV-7)" + prompt: | + I'm adding a new boolean flag `IsWaitingForUserInput` to AgentSessionInfo that gets + set to true from an SDK event handler (background thread) and read by the watchdog + timer. I'll just use a regular bool property like the existing HasUsedToolsThisTurn. + Should I also clear it in the IsProcessing cleanup paths? 
+ assertions: + - type: "output_matches" + # Must mention Volatile or memory model concern + pattern: "(?i)(Volatile\\.Write|Volatile\\.Read|volatile|memory.model|ARM|thread.safe)" + - type: "output_matches" + # Must address the cleanup question positively + pattern: "(?i)(yes|clear|cleanup|checklist|IsProcessing.*path)" + - type: "output_matches" + # Should mention the existing inconsistency or tracking comment requirement + pattern: "(?i)(HasUsedToolsThisTurn|plain.assignment|inconsisten|tracking.comment|pre.existing)" + rubric: + - "Agent answers yes to the cleanup question — new fields read by the watchdog must be cleared on all IsProcessing=false paths" + - "Agent warns about using plain bool for cross-thread access and recommends Volatile.Write/Volatile.Read for ARM correctness (INV-7)" + - "Agent notes the existing inconsistency (HasUsedToolsThisTurn uses plain assignment) and advises not to repeat it — either use Volatile or add a tracking comment explaining the gap" + - "Agent identifies that both SendPromptAsync and RestoreSingleSessionAsync need to initialize the new field (INV-9 dual-path requirement)" + timeout: 120 + + - name: "No InvokeAsync inside HandleComplete - use synchronous completion (INV-8)" + prompt: | + I'm modifying HandleComplete in CopilotService to notify the UI after finalizing + a response. I want to call `await InvokeAsync(StateHasChanged)` inside the method + to trigger a Blazor render refresh. HandleComplete is called via Invoke() from the + SDK event handler. Is this pattern correct? 
+ assertions: + - type: "output_matches" + pattern: "(?i)(do not|don't|avoid|should not|wrong|problem|stale|deferred)" + - type: "output_matches" + # Should explain why InvokeAsync defers execution + pattern: "(?i)(defer|async.*defer|already.*UI|UI.*thread|synchronous|stale.*render)" + - type: "output_matches" + # Should mention StateHasChanged or OnStateChanged as the correct approach + pattern: "(?i)(StateHasChanged|OnStateChanged|direct.*call|synchronous)" + - type: "output_not_contains" + value: "await InvokeAsync" + # Should not recommend InvokeAsync inside HandleComplete + rubric: + - "Agent warns against using InvokeAsync inside HandleComplete, explaining it defers execution and causes stale renders because HandleComplete already runs on the UI thread" + - "Agent explains that HandleComplete is dispatched via Invoke() (synchronous), so it is already on the UI thread and can call StateHasChanged or OnStateChanged directly" + - "Agent proposes the correct synchronous approach that avoids the deferred render issue" + timeout: 120 diff --git a/PolyPilot.Tests/ChatDatabaseResilienceTests.cs b/PolyPilot.Tests/ChatDatabaseResilienceTests.cs index c0e88dc02d..951b54e39a 100644 --- a/PolyPilot.Tests/ChatDatabaseResilienceTests.cs +++ b/PolyPilot.Tests/ChatDatabaseResilienceTests.cs @@ -261,12 +261,15 @@ void handler(object? 
s, UnobservedTaskExceptionEventArgs e) _ = db.UpdateToolCompleteAsync("session-1", "tool-1", "result", true); _ = db.UpdateReasoningContentAsync("session-1", "reason-1", "content", true); - // Give tasks time to complete and finalize + // Give tasks time to complete and finalize — multiple GC cycles + // needed to reliably trigger UnobservedTaskException under load + await Task.Delay(500); + for (int i = 0; i < 3; i++) + { + GC.Collect(2, GCCollectionMode.Forced, blocking: true); + GC.WaitForPendingFinalizers(); + } await Task.Delay(200); - GC.Collect(); - GC.WaitForPendingFinalizers(); - GC.Collect(); - await Task.Delay(100); Assert.False(unobservedException, "Fire-and-forget ChatDatabase calls must not produce unobserved task exceptions"); diff --git a/PolyPilot.Tests/SessionOrganizationTests.cs b/PolyPilot.Tests/SessionOrganizationTests.cs index e123df7a22..dee8ce0597 100644 --- a/PolyPilot.Tests/SessionOrganizationTests.cs +++ b/PolyPilot.Tests/SessionOrganizationTests.cs @@ -1370,13 +1370,13 @@ public void BuiltInPresets_IncludeSkillValidator() { var skillValidator = GroupPreset.BuiltIn.FirstOrDefault(p => p.Name == "Skill Validator"); Assert.NotNull(skillValidator); - Assert.Equal(2, skillValidator!.WorkerModels.Length); + Assert.Equal(3, skillValidator!.WorkerModels.Length); Assert.Equal(MultiAgentMode.OrchestratorReflect, skillValidator.Mode); Assert.Equal("⚖️", skillValidator.Emoji); Assert.NotNull(skillValidator.SharedContext); Assert.NotNull(skillValidator.RoutingContext); Assert.NotNull(skillValidator.WorkerSystemPrompts); - Assert.Equal(2, skillValidator.WorkerSystemPrompts!.Length); + Assert.Equal(3, skillValidator.WorkerSystemPrompts!.Length); Assert.All(skillValidator.WorkerSystemPrompts, p => Assert.False(string.IsNullOrWhiteSpace(p))); Assert.NotNull(skillValidator.MaxReflectIterations); } diff --git a/PolyPilot.Tests/WsBridgeIntegrationTests.cs b/PolyPilot.Tests/WsBridgeIntegrationTests.cs index 6612ce4252..400587429a 100644 --- 
a/PolyPilot.Tests/WsBridgeIntegrationTests.cs +++ b/PolyPilot.Tests/WsBridgeIntegrationTests.cs @@ -27,13 +27,17 @@ private static int GetFreePort() } /// <summary> - /// Polls until a condition is true, with a timeout. Replaces fixed Task.Delay for reliability under load. + /// Polls until a condition is true, with a timeout. Throws TimeoutException on silent timeout + /// to surface flaky races instead of letting assertions fail on stale state. /// </summary> private static async Task WaitForAsync(Func<bool> condition, CancellationToken ct, int pollMs = 50, int maxMs = 4000) { var sw = System.Diagnostics.Stopwatch.StartNew(); while (!condition() && sw.ElapsedMilliseconds < maxMs) await Task.Delay(pollMs, ct); + + if (!condition()) + throw new TimeoutException($"WaitForAsync condition not met within {maxMs}ms"); } public WsBridgeIntegrationTests() @@ -677,7 +681,7 @@ public async Task RenameSession_ViaClient_RenamesOnServer() var client = await ConnectClientAsync(cts.Token); await client.RenameSessionAsync("old-name", "new-name", cts.Token); - await WaitForAsync(() => _copilot.GetSession("new-name") != null, cts.Token); + await WaitForAsync(() => _copilot.GetSession("new-name") != null, cts.Token, maxMs: 8000); Assert.Null(_copilot.GetSession("old-name")); Assert.NotNull(_copilot.GetSession("new-name")); diff --git a/PolyPilot/Models/ModelCapabilities.cs index c8773e8695..deef5b238b 100644 --- a/PolyPilot/Models/ModelCapabilities.cs +++ b/PolyPilot/Models/ModelCapabilities.cs @@ -184,6 +184,13 @@ public record GroupPreset(string Name, string Description, string Emoji, MultiAg /// </summary> public int? MaxReflectIterations { get; init; } + /// <summary> + /// Custom display names for workers, indexed to match WorkerModels. + /// E.g. ["dotnet-validator", "anthropic-validator", "eval-generator"]. + /// Null = use default "worker-{i}" naming. + /// </summary> + public string?[]? WorkerDisplayNames { get; init; } + private const string WorkerReviewPrompt = """ You are a PR reviewer. 
When assigned a PR, follow this process: @@ -346,62 +353,109 @@ public record GroupPreset(string Name, string Description, string Emoji, MultiAg }, new GroupPreset( - "Skill Validator", "Two evaluators assess your skill from different angles — dotnet empirical testing vs. Anthropic design review — orchestrator builds consensus", + "Skill Validator", "Three-phase skill evaluation: generate evals → empirical A/B testing → prompt design review → orchestrator builds consensus", "⚖️", MultiAgentMode.OrchestratorReflect, - "claude-opus-4.6", new[] { "claude-sonnet-4.6", "claude-sonnet-4.6" }) + "claude-opus-4.6", new[] { "claude-sonnet-4.6", "claude-sonnet-4.6", "claude-sonnet-4.6" }) { WorkerSystemPrompts = new[] { """ - You are the Dotnet Skill Validator. You evaluate skills empirically using the methodology from the dotnet/skills skill-validator tool. + You are the Dotnet Skill Validator. You evaluate skills by actually RUNNING the `skill-validator` tool from dotnet/skills — not by guessing or theorizing. + + ## STEP 1: Ensure skill-validator is available + + Check if the binary exists, and download it if missing: + ```bash + if [ ! -x /tmp/skill-validator ]; then + echo "Downloading skill-validator..." + ARCH=$(uname -m) + case "$(uname -s)-${ARCH}" in + Darwin-arm64) PATTERN='skill-validator-osx-arm64.tar.gz' ;; + Darwin-x86_64) PATTERN='skill-validator-osx-x64.tar.gz' ;; + Linux-aarch64) PATTERN='skill-validator-linux-arm64.tar.gz' ;; + Linux-x86_64) PATTERN='skill-validator-linux-x64.tar.gz' ;; + *) echo "Unsupported platform: $(uname -s)-${ARCH}"; exit 1 ;; + esac + cd /tmp && gh release download skill-validator-nightly \ + --repo dotnet/skills \ + --pattern "$PATTERN" \ + --clobber && \ + tar xzf "$PATTERN" + fi + /tmp/skill-validator --help | head -5 + ``` + If `gh` is not available or the download fails, explain the failure and fall back to manual analysis (Step 3 only). 
- ## Your Evaluation Approach + ## STEP 2: Run skill-validator against the skill + + Run the tool against the skill directory (shown below as the `<skill-dir>` placeholder). The skill directory must contain a `SKILL.md` file. + + ```bash + /tmp/skill-validator \ + --runs 1 \ + --model claude-sonnet-4.6 \ + --verdict-warn-only \ + --verbose \ + --results-dir /tmp/skill-validator-results <skill-dir> + ``` + + **Flags explained:** + - `--runs 1`: Single run for speed (the Anthropic evaluator covers qualitative depth) + - `--model claude-sonnet-4.6`: Use Sonnet for agent runs (fast, cost-effective) + - `--verdict-warn-only`: Don't fail hard on missing eval.yaml — report the gap instead + - `--verbose`: Show tool calls and agent events so we can see what happened - For each skill under evaluation, perform these steps: + **If the skill has no `tests/eval.yaml`:** Report this as a finding. The tool will still run static profile analysis. Note this gap prominently in your verdict. - ### 1. Inspect the Skill Definition - - Read the SKILL.md file carefully - - Read the skill's `tests/eval.yaml` if present - - Identify what scenarios the skill claims to improve + **After the run:** Read the results: + ```bash + cat /tmp/skill-validator-results/summary.md 2>/dev/null + cat /tmp/skill-validator-results/results.json 2>/dev/null | head -100 + ``` - ### 2. Empirical Baseline Comparison - - For each scenario in the skill's eval.yaml (or infer representative scenarios from SKILL.md): a. Describe what the agent response would look like WITHOUT the skill (baseline) b. Describe what the agent response looks like WITH the skill c. Assess: token efficiency, tool call count, task completion, error rate + ## STEP 3: Analyze the tool output - ### 3. Pairwise Comparative Judgment - - Compare baseline vs. 
skill-augmented performance side-by-side - - Apply position-swap bias mitigation: consider both orderings in your judgment - - Rate improvement on a scale of -1 (degraded) to +1 (strong improvement) with confidence + Based on the ACTUAL tool output, assess: + - **Profile analysis**: Token count, section structure, code blocks (from tool's static analysis) + - **A/B comparison**: Did the skill improve agent performance? By how much? + - **Statistical significance**: Was the improvement score above the threshold? + - **Overfitting**: Did the tool flag any overfitting concerns? + - **Task completion**: Did assertions pass with the skill active? - ### 4. Statistical Assessment - - Estimate variance across the scenario set - - Flag scenarios where the skill helps most vs. least - - Identify scenarios where the skill might cause regressions + If the tool couldn't run (no eval.yaml, download failed, etc.), perform manual analysis: + - Read SKILL.md and estimate token count, structure quality + - Identify likely improvement scenarios and potential regressions + - Note that this is manual analysis, not empirical data + + ## STEP 4: Verdict - ### 5. Verdict Format your verdict as: ``` ## Dotnet Validator Verdict + **Tool Run**: ✅ Completed / ⚠️ Partial (no eval.yaml) / ❌ Failed (reason) + **Improvement Score**: X% [CI: low%, high%] (from tool) or N/A **Overall Score**: X/10 **Confidence**: High / Medium / Low **Verdict**: KEEP / IMPROVE / REMOVE + ### Tool Output Summary + - [key findings from skill-validator output] + ### Strengths - - [specific strengths with evidence] + - [specific strengths with evidence from tool output] ### Weaknesses - - [specific weaknesses with evidence] + - [specific weaknesses with evidence from tool output] ### Suggested Improvements - [concrete, actionable suggestions] ``` ## Rules - - Be data-driven: cite specific scenarios and observed behaviors - - Focus on measurable outcomes: does it reduce tokens? improve task completion? reduce errors? 
- Don't recommend keeping a skill that shows no measurable improvement in your empirical analysis + - ALWAYS attempt to run the tool first. Do not skip to manual analysis. + - Include the actual tool output or error in your report — transparency matters. + - If the skill has no eval.yaml, recommend creating one and suggest specific scenarios. + - Be data-driven: cite tool metrics, not vibes. """, """ @@ -455,20 +509,134 @@ b. Describe what the agent response looks like WITH the skill - Be concrete: quote specific lines from SKILL.md when critiquing - Focus on prompt quality, not empirical test results - Suggest specific rewrites, not vague advice like "be clearer" + + ## Optional: Live Trigger Testing with Claude Code + If `claude` CLI is available, test trigger accuracy with non-interactive prompts: + ```bash + # Check availability first + which claude 2>/dev/null && claude --version 2>&1 + ``` + If available and authenticated, run: + ```bash + # Positive test — should trigger the skill + claude -p "<prompt that should trigger the skill>" --output-format text 2>&1 | head -50 + # Negative test — should NOT trigger the skill + claude -p "<unrelated prompt that should not trigger the skill>" --output-format text 2>&1 | head -50 + ``` + If `claude` returns auth errors ("does not have access", "login again"), skip live + testing and note: "Claude Code CLI not authenticated — trigger testing skipped." + Do NOT retry auth failures. Fall back to manual analysis. + """, + + """ + You are the Eval Generator. You create `tests/eval.yaml` files for skills that don't have them, enabling empirical A/B validation by the skill-validator tool. + + ## Your Job + + Given a skill directory with SKILL.md, generate a comprehensive `eval.yaml` that tests whether the skill actually improves agent behavior. 
+
+ ## STEP 1: Read the skill
+
+ Read the SKILL.md file and understand:
+ - What the skill teaches the agent to do
+ - What triggers should activate the skill
+ - What the agent should do differently WITH the skill vs WITHOUT it
+ - What mistakes the skill prevents
+
+ ## STEP 2: Check for existing eval.yaml
+
+ ```bash
+ ls <skill_dir>/tests/eval.yaml 2>/dev/null
+ ```
+ If one already exists, read it and IMPROVE it rather than replacing it.
+ If none exists, create the `tests/` directory and generate a new one.
+
+ ## STEP 3: Design scenarios
+
+ Create 3-5 scenarios covering:
+
+ 1. **Positive trigger (happy path)**: A prompt that SHOULD trigger the skill. Assert the agent follows the skill's key instructions.
+ 2. **Negative trigger**: A prompt that should NOT trigger the skill. Assert the agent doesn't mention skill-specific concepts.
+ 3. **Edge case**: A prompt that's ambiguous or borderline. The skill should help the agent handle it correctly.
+ 4. **Regression prevention**: A prompt that tests a specific mistake the skill warns about. Assert the agent avoids the mistake.
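As an illustration of the four scenario types, a hypothetical eval.yaml for an invented "prefer async Task over async void" skill might look like this (the skill name, prompts, and assertion values are all made up, not from the source):

```yaml
scenarios:
  - name: "Positive trigger: async event handler review"
    prompt: "Review this C# event handler that uses async void"
    assertions:
      - type: "output_contains"
        value: "async Task"
    rubric:
      - "Correctly identified the async void hazard"
    timeout: 120
  - name: "Negative trigger: unrelated question"
    prompt: "What time zone does DateTime.UtcNow use?"
    assertions:
      - type: "output_not_contains"
        value: "async void"
    timeout: 60
  - name: "Edge case: ambiguous lambda"
    prompt: "Should this lambda passed to Task.Run be async?"
    rubric:
      - "Distinguished async lambdas from async void event handlers"
    timeout: 120
  - name: "Regression prevention: swallowed exception"
    prompt: "Why does my exception disappear in this event handler?"
    assertions:
      - type: "output_contains"
        value: "exception"
    rubric:
      - "Explained that async void exceptions escape normal try/catch"
    timeout: 120
```

Note how the assertions target behavior (mentioning `async Task`, explaining the exception path) rather than the skill's exact wording, per the overfitting guidance above.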
+
+ ### Assertion guidelines
+ - Use `output_contains` for key terms/patterns the skilled agent MUST produce
+ - Use `output_not_contains` for anti-patterns the skilled agent must avoid
+ - Use `output_matches` with regex for flexible pattern matching
+ - Keep assertions focused on BEHAVIOR differences, not exact wording (avoid overfitting)
+
+ ### Rubric guidelines
+ - Each rubric item should describe a QUALITY criterion the judge can evaluate
+ - Focus on outcomes ("correctly identified the issue") not vocabulary ("used the word X")
+ - Include 2-4 rubric items per scenario
+ - Rubric items are evaluated by pairwise LLM comparison (with-skill vs without-skill)
+
+ ## STEP 4: Write the eval.yaml
+
+ ```bash
+ mkdir -p <skill_dir>/tests
+ cat > <skill_dir>/tests/eval.yaml << 'EVALEOF'
+ scenarios:
+   - name: "Description of scenario"
+     prompt: "The exact prompt to send to the agent"
+     assertions:
+       - type: "output_contains"
+         value: "expected behavior indicator"
+     rubric:
+       - "Quality criterion for the judge"
+     timeout: 120
+ EVALEOF
+ ```
+
+ ## STEP 5: Validate the file
+
+ ```bash
+ cat <skill_dir>/tests/eval.yaml
+ # Verify it's valid YAML
+ python3 -c "import yaml; yaml.safe_load(open('<skill_dir>/tests/eval.yaml'))" 2>&1 || echo "YAML parse error"
+ ```
+
+ ## Output format
+
+ After writing the file, report:
+ ```
+ ## Eval Generator Report
+ **Skill**: [skill name]
+ **Scenarios Generated**: N
+ **File Written**: <skill_dir>/tests/eval.yaml
+
+ ### Scenarios
+ 1. [name] — [what it tests]
+ 2. [name] — [what it tests]
+ ...
+
+ ### Design Rationale
+ - [why these specific scenarios were chosen]
+ - [what behavioral differences they target]
+ ```
+
+ ## Rules
+ - ALWAYS write the file to disk. Your output is consumed by other workers.
+ - Focus on BEHAVIORAL differences — what changes when the skill is active?
+ - Avoid overfitting: don't assert the skill's exact vocabulary, assert the outcomes + - Keep prompts realistic — they should sound like real user requests + - Set reasonable timeouts (60-180s depending on complexity) + - Use `exit_success` assertion sparingly — it just checks for non-empty output """, }, SharedContext = """ ## Skill Evaluation Standards - Both evaluators assess the same skill and produce independent verdicts. - A good skill must satisfy BOTH evaluators to be marked KEEP. + All three workers assess the same skill from different angles. + A good skill must satisfy the empirical validator AND the prompt reviewer to be marked KEEP. ### What makes a skill worth keeping - Measurable improvement in task completion (Dotnet validator perspective) - Clear, precise description that triggers reliably (Anthropic evaluator perspective) + - Comprehensive eval.yaml that tests real behavioral differences (Eval Generator perspective) - Focused scope — does one thing well - Actionable instructions that guide the agent without over-constraining it - - Adequate test coverage ### What warrants IMPROVE - Good intent but fixable gaps (bad trigger description, missing scenarios, ambiguous instructions) @@ -480,37 +648,96 @@ A good skill must satisfy BOTH evaluators to be marked KEEP. 
- Instructions that would cause regressions or confusion ### Consensus Rule - - KEEP requires both evaluators to say KEEP or one KEEP + one IMPROVE + - KEEP requires both evaluators (dotnet-validator and anthropic-validator) to say KEEP or one KEEP + one IMPROVE - IMPROVE if the evaluators disagree or both say IMPROVE - REMOVE if either evaluator says REMOVE with strong evidence + + ### eval.yaml Format Reference + ```yaml + scenarios: + - name: "Descriptive name" + prompt: "Prompt sent to the agent" + assertions: + - type: "output_contains" + value: "expected text" + - type: "output_not_contains" + value: "text that should not appear" + - type: "output_matches" + pattern: "regex" + rubric: + - "Quality criterion for pairwise LLM judge" + timeout: 120 + ``` + Assertion types: output_contains, output_not_contains, output_matches, output_not_matches, file_exists, file_contains, exit_success. """, RoutingContext = """ ## Skill Validator Orchestration - You orchestrate two skill evaluators who assess skills from different angles. Your role is to: - 1. Dispatch the skill to BOTH evaluators simultaneously (or sequentially if needed) - 2. Collect their verdicts - 3. Identify where they agree and disagree - 4. Build a consensus verdict explaining which suggestions to adopt and why + ⚠️ CRITICAL: You are a DISPATCHER. You delegate work ONLY via @worker:/@end blocks. + NEVER use task(), bash(), view(), grep(), or ANY tool yourself. NEVER read files, run commands, or do analysis. + If you catch yourself about to call a tool — STOP and write an @worker block instead. + + ### Delegation Format (the ONLY way to assign work) + + ``` + @worker:Skill Validator-eval-generator + Your task: [detailed instructions] + @end + ``` + + You MUST use the FULL worker session name (e.g., "Skill Validator-eval-generator", not just "eval-generator"). + Each response MUST contain @worker:/@end blocks. Text outside the blocks is your planning notes. 
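To make the delegation grammar above concrete, here is an illustrative sketch of how a parser in the spirit of `ParseTaskAssignments` could extract assignments from @worker/@end blocks. The real parser is C# inside PolyPilot; this Python version, its function names, and the sample text are all mine, purely for illustration:

```python
import re

# One block = "@worker:<name>" on its own line, a task body, then "@end".
# Text outside blocks (planning notes) is ignored, matching the spec above.
BLOCK = re.compile(r"@worker:(?P<name>[^\n]+)\n(?P<task>[\s\S]*?)@end", re.IGNORECASE)

def parse_assignments(response: str) -> list[tuple[str, str]]:
    """Return (worker_name, task) pairs found in an orchestrator response."""
    return [(m.group("name").strip(), m.group("task").strip())
            for m in BLOCK.finditer(response)]

text = """Planning notes are ignored.
@worker:Skill Validator-eval-generator
Generate tests/eval.yaml for the skill.
@end
@worker:Skill Validator-dotnet-validator
Run the skill-validator tool against the generated evals.
@end"""

for name, task in parse_assignments(text):
    print(name, "->", task)
```

The lazy `[\s\S]*?` body match is what lets multiple blocks in one response be extracted independently rather than greedily merged.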
### Worker Names - - **worker-1** = Dotnet Skill Validator (empirical, outcome-focused) - - **worker-2** = Anthropic Skill Evaluator (prompt-design-focused) + - **Skill Validator-dotnet-validator** = Dotnet Skill Validator (runs `skill-validator` tool) + - **Skill Validator-anthropic-validator** = Anthropic Skill Evaluator (prompt analysis via Claude Code) + - **Skill Validator-eval-generator** = Eval Generator (creates tests/eval.yaml) - ### Dispatch Pattern - 1. **First dispatch**: Send the skill content to BOTH workers. Include the full SKILL.md content and eval.yaml if available. - 2. **After both complete**: Compare their verdicts. Identify agreements (high confidence) and disagreements (requires judgment). - 3. **Build consensus**: Produce a final report that explains which worker's suggestions are adopted, which are declined, and why. + ### 2-Phase Dispatch + + **Phase 1 — Always dispatch eval-generator first:** + + Write a single @worker block for eval-generator. Tell it the skill directory path and to generate tests/eval.yaml. + Do NOT check if eval.yaml exists yourself — eval-generator will check and skip if it already exists. 
+
+ Example:
+ ```
+ @worker:Skill Validator-eval-generator
+ Generate tests/eval.yaml for the skill at: <skill directory path>
+ The skill is about: <one-line summary of the skill>
+ @end
+ ```
+
+ **Phase 2 — After eval-generator completes, dispatch both validators together:**
+
+ Write TWO @worker blocks in a single response:
+ ```
+ @worker:Skill Validator-dotnet-validator
+ Evaluate the skill at: <skill directory path>
+ Eval.yaml status: [generated by eval-generator / pre-existing]
+ @end
+
+ @worker:Skill Validator-anthropic-validator
+ Evaluate the skill at: <skill directory path>
+ Eval.yaml status: [generated by eval-generator / pre-existing]
+ @end
+ ```
+
+ **Phase 3 — After both evaluators complete, write the consensus report (no @worker blocks needed).**

 ### Consensus Report Format
 ```
 ## Skill Validator Consensus Report: [Skill Name]

 ### Summary
+ **Eval Generation**: ✅ Pre-existing / 🆕 Generated by eval-generator / ❌ None
 **Dotnet Verdict**: KEEP/IMPROVE/REMOVE (X/10)
 **Anthropic Verdict**: KEEP/IMPROVE/REMOVE (X/10)
 **Consensus**: KEEP / IMPROVE / REMOVE

+ ### Eval Quality (if generated)
+ - [assessment of generated eval.yaml scenarios]
+
 ### Points of Agreement (High Confidence)
 - [issues both evaluators flagged]

@@ -528,12 +755,15 @@ 4.
Build a consensus verdict explaining which suggestions to adopt and why ``` ### Rules + - NEVER use tools yourself — you are a message relay, not a worker - Always explain WHY suggestions are adopted or declined - Where evaluators disagree, explain the tradeoff and make a reasoned judgment - Highlight suggestions both evaluators agree on as high-confidence improvements - - NEVER emit [[GROUP_REFLECT_COMPLETE]] until both evaluators have responded and a consensus report is produced + - If eval-generator generated evals, note whether the Dotnet validator was able to use them + - NEVER emit [[GROUP_REFLECT_COMPLETE]] until all dispatched workers have responded and a consensus report is produced """, MaxReflectIterations = 6, + WorkerDisplayNames = new[] { "dotnet-validator", "anthropic-validator", "eval-generator" }, }, }; } diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs index bde7ecaf91..69357dddcb 100644 --- a/PolyPilot/Services/CopilotService.Events.cs +++ b/PolyPilot/Services/CopilotService.Events.cs @@ -828,6 +828,25 @@ private void FlushCurrentResponse(SessionState state) state.CurrentResponse.Clear(); state.HasReceivedDeltasThisTurn = false; + + // Early dispatch: if the orchestrator wrote @worker blocks in an intermediate sub-turn, + // resolve the TCS now so ParseTaskAssignments can run immediately. Without this, the + // orchestrator continues doing tool work itself for minutes before dispatch happens. 
+ if (state.EarlyDispatchOnWorkerBlocks && state.ResponseCompletion != null) + { + var flushed = state.FlushedResponse.ToString(); + if (System.Text.RegularExpressions.Regex.IsMatch(flushed, @"@worker:.+?\n[\s\S]+?@end", System.Text.RegularExpressions.RegexOptions.IgnoreCase)) + { + Debug($"[DISPATCH] Early dispatch: @worker blocks detected in flushed text ({flushed.Length} chars) for '{state.Info.Name}'"); + state.EarlyDispatchOnWorkerBlocks = false; // One-shot + // Build the full response the same way CompleteResponse does + var remaining = state.CurrentResponse.ToString(); + var fullResponse = string.IsNullOrEmpty(remaining) ? flushed : flushed + "\n\n" + remaining; + state.FlushedResponse.Clear(); + state.CurrentResponse.Clear(); + state.ResponseCompletion.TrySetResult(fullResponse); + } + } } /// diff --git a/PolyPilot/Services/CopilotService.Organization.cs b/PolyPilot/Services/CopilotService.Organization.cs index 86f313047e..50e64e8094 100644 --- a/PolyPilot/Services/CopilotService.Organization.cs +++ b/PolyPilot/Services/CopilotService.Organization.cs @@ -1066,6 +1066,9 @@ private async Task SendViaOrchestratorAsync(string groupId, List members InvokeOnUI(() => OnOrchestratorPhaseChanged?.Invoke(groupId, OrchestratorPhase.Planning, null)); var planningPrompt = BuildOrchestratorPlanningPrompt(prompt, workerNames, group?.OrchestratorPrompt, group?.RoutingContext); + // Enable early dispatch for the non-reflect orchestrator flow too + if (_sessions.TryGetValue(orchestratorName, out var orchPlanning)) + orchPlanning.EarlyDispatchOnWorkerBlocks = true; var planResponse = await SendPromptAndWaitAsync(orchestratorName, planningPrompt, cancellationToken, originalPrompt: prompt); // Phase 2: Parse task assignments from orchestrator response, with retry loop. 
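The early-dispatch gate above is a pure detection check (`Regex.IsMatch`), distinct from full parsing: it only needs to know whether at least one complete @worker/@end block exists in the flushed text. A Python illustration of the same pattern's behavior (sample strings are hypothetical):

```python
import re

# Same expression as the C# IsMatch call: a worker name line, then any body
# (lazily), then "@end". Case-insensitive, matching RegexOptions.IgnoreCase.
pattern = re.compile(r"@worker:.+?\n[\s\S]+?@end", re.IGNORECASE)

with_block = ("I'll delegate now.\n"
              "@worker:Skill Validator-eval-generator\n"
              "Generate evals.\n"
              "@end\n")
without_block = "Still thinking about the plan, no delegation yet."

# Block present -> early dispatch fires and resolves the completion early
print(bool(pattern.search(with_block)))     # prints True
# No block -> keep accumulating sub-turns until normal completion
print(bool(pattern.search(without_block)))  # prints False
```

Because `.` does not match newlines here, `.+?` is confined to the worker-name line, so a stray `@worker:` fragment without a following body and `@end` will not trigger a premature dispatch.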
@@ -1165,6 +1168,9 @@ private async Task SendViaOrchestratorAsync(string groupId, List members } var results = await Task.WhenAll(workerTasks); + // After early dispatch, the orchestrator may still be doing tool work. + await WaitForSessionIdleAsync(orchestratorName, cancellationToken); + // Phase 4: Synthesize — send worker results back to orchestrator InvokeOnUI(() => OnOrchestratorPhaseChanged?.Invoke(groupId, OrchestratorPhase.Synthesizing, null)); @@ -1218,6 +1224,7 @@ private string BuildOrchestratorPlanningPrompt(string userPrompt, List w { sb.AppendLine("You are a DISPATCHER ONLY. You do NOT have tools. You CANNOT write code, read files, or run commands yourself."); sb.AppendLine("Your ONLY job is to write @worker/@end blocks that assign work to your workers."); + sb.AppendLine("⚠️ This is a NEW request. Even if your conversation history shows previous work on a similar topic, you MUST delegate FRESH tasks to your workers. Previous results may be stale."); } else { @@ -1263,7 +1270,10 @@ private static List DeduplicateAssignments( internal static string BuildDelegationNudgePrompt(List workerNames) { var sb = new System.Text.StringBuilder(); - sb.AppendLine("Your previous response did not contain any @worker delegation blocks. You MUST delegate work to your workers."); + sb.AppendLine("⚠️ CRITICAL: Your previous response did not contain any @worker delegation blocks."); + sb.AppendLine(); + sb.AppendLine("You are a DISPATCHER. You CANNOT do the work yourself. Even if you believe the work is already done from a previous conversation, you MUST delegate FRESH work to your workers NOW."); + sb.AppendLine("Any previous results are STALE — the user is requesting a NEW evaluation. 
Ignore prior results and dispatch fresh tasks."); sb.AppendLine(); sb.AppendLine($"Available workers ({workerNames.Count}): {string.Join(", ", workerNames)}"); sb.AppendLine(); @@ -1273,7 +1283,7 @@ internal static string BuildDelegationNudgePrompt(List workerNames) sb.AppendLine("Describe the task here on a separate line."); sb.AppendLine("@end"); sb.AppendLine(); - sb.AppendLine("Do NOT do the work yourself. Output @worker blocks now."); + sb.AppendLine("Produce @worker blocks for ALL workers now. Do NOT explain, do NOT summarize previous work, ONLY output @worker blocks."); return sb.ToString(); } @@ -1340,6 +1350,48 @@ internal static List TryParseJsonAssignments(string response, Li private record WorkerResult(string WorkerName, string? Response, bool Success, string? Error, TimeSpan Duration); + /// + /// Wait for a session to finish processing (go idle). Used after early dispatch + /// resolves the orchestrator's TCS while it's still doing tool work — we must + /// wait for it to go idle before sending the next prompt (synthesis). + /// Uses a short inactivity threshold: if no SDK events for 60s, the session is + /// likely stuck in a "zero-idle" state and will be aborted. + /// + private async Task WaitForSessionIdleAsync(string sessionName, CancellationToken ct, int timeoutSeconds = 300) + { + if (!_sessions.TryGetValue(sessionName, out var state)) + return; + if (!state.Info.IsProcessing) + return; + + Debug($"[DISPATCH] Waiting for '{sessionName}' to go idle before sending next prompt..."); + var sw = System.Diagnostics.Stopwatch.StartNew(); + const int inactivityThresholdSeconds = 60; + + while (state.Info.IsProcessing && !ct.IsCancellationRequested && sw.Elapsed.TotalSeconds < timeoutSeconds) + { + // Check if the session has been quiet (no SDK events) for the inactivity threshold. + // This catches the "zero-idle" bug where the SDK never emits SessionIdleEvent. 
+ var lastEventTicks = Interlocked.Read(ref state.LastEventAtTicks);
+ // Guard lastEventTicks == 0 (never recorded): treating it as infinitely old
+ // would abort the session immediately, before any SDK event could arrive.
+ var secondsSinceLastEvent = lastEventTicks > 0
+     ? (DateTime.UtcNow - new DateTime(lastEventTicks, DateTimeKind.Utc)).TotalSeconds
+     : 0;
+ if (secondsSinceLastEvent >= inactivityThresholdSeconds)
+ {
+ Debug($"[DISPATCH] '{sessionName}' no SDK events for {secondsSinceLastEvent:F0}s — aborting to proceed with synthesis");
+ await AbortSessionAsync(sessionName);
+ await Task.Delay(500, ct);
+ break;
+ }
+ await Task.Delay(1000, ct);
+ }
+ if (state.Info.IsProcessing)
+ {
+ Debug($"[DISPATCH] '{sessionName}' still processing after {sw.Elapsed.TotalSeconds:F1}s — aborting to allow synthesis");
+ await AbortSessionAsync(sessionName);
+ await Task.Delay(500, ct);
+ }
+ Debug($"[DISPATCH] '{sessionName}' now idle after {sw.Elapsed.TotalSeconds:F1}s");
+ }
+
 private async Task ExecuteWorkerAsync(string workerName, string task, string originalPrompt, CancellationToken cancellationToken)
 {
 var sw = System.Diagnostics.Stopwatch.StartNew();
@@ -1371,6 +1423,7 @@ private async Task ExecuteWorkerAsync(string workerName, string ta
 var workerPrompt = $"{identity}{worktreeNote}\n\nYour response will be collected and synthesized with other workers' responses.\n\n{sharedPrefix}## Original User Request (context)\n{originalPrompt}\n\n## Your Assigned Task\n{task}";
 const int maxRetries = 2;
+ var dispatchTime = DateTime.UtcNow;
 for (int attempt = 1; attempt <= maxRetries; attempt++)
 {
 try
@@ -1402,6 +1455,21 @@ private async Task ExecuteWorkerAsync(string workerName, string ta
 }
 }
+
+ // Fallback: if response is still empty, try extracting from chat history.
+ // The SDK may have streamed content via delta events that ended up in history
+ // even though FlushedResponse/CurrentResponse were empty (e.g., watchdog completion).
+ if (string.IsNullOrWhiteSpace(response) && _sessions.TryGetValue(workerName, out var histState)) + { + var lastAssistant = histState.Info.History.ToArray() + .LastOrDefault(m => m.Role == "assistant" && !string.IsNullOrWhiteSpace(m.Content) + && m.Timestamp >= dispatchTime); + if (lastAssistant != null) + { + response = lastAssistant.Content; + Debug($"[DISPATCH] Worker '{workerName}' recovered {response!.Length} chars from chat history"); + } + } + Debug($"[DISPATCH] Worker '{workerName}' completed (response len={response?.Length ?? 0}, elapsed={sw.Elapsed.TotalSeconds:F1}s)"); return new WorkerResult(workerName, response, true, null, sw.Elapsed); } @@ -2059,10 +2127,13 @@ public string GetEffectiveModel(string sessionName) Debug($"[WorktreeStrategy] Creating {preset.WorkerModels.Length} workers with strategy={strategy}, repoId={repoId}"); for (int i = 0; i < preset.WorkerModels.Length; i++) { - var workerName = $"{teamName}-worker-{i + 1}"; + var displayName = preset.WorkerDisplayNames != null && i < preset.WorkerDisplayNames.Length && preset.WorkerDisplayNames[i] != null + ? preset.WorkerDisplayNames[i]! + : $"worker-{i + 1}"; + var workerName = $"{teamName}-{displayName}"; { int suffix = 1; while (_sessions.ContainsKey(workerName) || Organization.Sessions.Any(s => s.SessionName == workerName)) - workerName = $"{teamName}-worker-{i + 1}-{suffix++}"; + workerName = $"{teamName}-{displayName}-{suffix++}"; } var workerModel = preset.WorkerModels[i]; var workerWorkDir = workerWorkDirs[i] ?? orchWorkDir ?? workingDirectory; @@ -2241,6 +2312,7 @@ private async Task SendViaOrchestratorReflectAsync(string groupId, List return; } + var leftoverPrompts = new List(); try { @@ -2297,6 +2369,12 @@ private async Task SendViaOrchestratorReflectAsync(string groupId, List planPrompt = BuildReplanPrompt(reflectState.LastEvaluation ?? 
"Continue iterating.", workerNames, prompt, group.RoutingContext); } + // Enable early dispatch: if the orchestrator writes @worker blocks in an intermediate + // sub-turn (before finishing all tool rounds), FlushCurrentResponse will resolve the + // TCS immediately. This prevents the orchestrator from doing all the work itself. + if (_sessions.TryGetValue(orchestratorName, out var orchState)) + orchState.EarlyDispatchOnWorkerBlocks = true; + var planResponse = await SendPromptAndWaitAsync(orchestratorName, planPrompt, ct, originalPrompt: prompt); var rawAssignments = ParseTaskAssignments(planResponse, workerNames); Debug($"[DISPATCH] '{orchestratorName}' reflect plan parsed: {rawAssignments.Count} raw assignments from {workerNames.Count} workers. Iteration={reflectState.CurrentIteration}, Response length={planResponse.Length}"); @@ -2312,6 +2390,9 @@ private async Task SendViaOrchestratorReflectAsync(string groupId, List AddOrchestratorSystemMessage(orchestratorName, "⚠️ No @worker assignments parsed from orchestrator response. Retrying..."); var nudgePrompt = BuildDelegationNudgePrompt(workerNames); + // Re-enable early dispatch for the nudge attempt too + if (_sessions.TryGetValue(orchestratorName, out var nudgeState)) + nudgeState.EarlyDispatchOnWorkerBlocks = true; var nudgeResponse = await SendPromptAndWaitAsync(orchestratorName, nudgePrompt, ct, originalPrompt: prompt); var nudgeAssignments = ParseTaskAssignments(nudgeResponse, workerNames); Debug($"[DISPATCH] '{orchestratorName}' nudge parsed: {nudgeAssignments.Count} raw assignments. 
Response length={nudgeResponse.Length}"); @@ -2322,26 +2403,38 @@ private async Task SendViaOrchestratorReflectAsync(string groupId, List } else { - reflectState.ConsecutiveErrors++; - if (reflectState.ConsecutiveErrors >= 3) - { - reflectState.IsStalled = true; - reflectState.IsCancelled = true; - break; - } - continue; + // Nudge also failed — the orchestrator refuses to delegate (likely due to + // stale history from a previous run). Force delegation by creating synthetic + // @worker blocks that assign the original prompt to each worker. + Debug($"[DISPATCH] Nudge failed, forcing delegation to all {workerNames.Count} workers"); + AddOrchestratorSystemMessage(orchestratorName, + $"⚡ Orchestrator refused to delegate after nudge. Forcing dispatch to all {workerNames.Count} workers."); + assignments = workerNames.Select(w => new TaskAssignment(w, prompt)).ToList(); + // Fall through to dispatch below } } else { // Later iterations: orchestrator decided no more work needed — - // but only declare GoalMet if no queued prompts produced @worker blocks - if (queuedAssignments.Count == 0) + // but only declare GoalMet if all workers have been dispatched + var allDispatched = workerNames.All(w => dispatchedWorkers.Contains(w)); + var allAttempted = workerNames.All(w => attemptedWorkers.Contains(w)); + if (queuedAssignments.Count == 0 && (allDispatched || allAttempted)) { reflectState.GoalMet = true; AddOrchestratorSystemMessage(orchestratorName, $"✅ Orchestrator completed without delegation (iteration {reflectState.CurrentIteration})."); break; } + if (!allDispatched && queuedAssignments.Count == 0) + { + // Not all workers dispatched — force dispatch remaining workers + var remaining = workerNames.Where(w => !dispatchedWorkers.Contains(w) && !attemptedWorkers.Contains(w)).ToList(); + Debug($"[DISPATCH] Iteration {reflectState.CurrentIteration}: 0 assignments but {remaining.Count} workers never dispatched — forcing: {string.Join(", ", remaining)}"); + 
AddOrchestratorSystemMessage(orchestratorName, + $"⚡ Forcing dispatch to {remaining.Count} remaining worker(s): {string.Join(", ", remaining)}"); + assignments = remaining.Select(w => new TaskAssignment(w, prompt)).ToList(); + // Fall through to dispatch below + } // Fall through to merge and dispatch queued work } } @@ -2390,6 +2483,10 @@ private async Task SendViaOrchestratorReflectAsync(string groupId, List foreach (var r in results.Where(r => r.Success)) dispatchedWorkers.Add(r.WorkerName); + // After early dispatch, the orchestrator may still be doing tool work. + // Wait for it to go idle before sending the synthesis prompt. + await WaitForSessionIdleAsync(orchestratorName, ct); + // Phase 4: Synthesize + Evaluate InvokeOnUI(() => OnOrchestratorPhaseChanged?.Invoke(groupId, OrchestratorPhase.Synthesizing, iterDetail)); @@ -2418,13 +2515,16 @@ private async Task SendViaOrchestratorReflectAsync(string groupId, List var evaluatorModel = GetEffectiveModel(evaluatorName); var trend = reflectState.RecordEvaluation(reflectState.CurrentIteration, score, rationale, evaluatorModel); - // Check if evaluator says complete — but only if all workers have participated successfully + // Check if evaluator says complete + var allWorkersAttempted = workerNames.All(w => attemptedWorkers.Contains(w)); if ((evalResponse.Contains("[[GROUP_REFLECT_COMPLETE]]", StringComparison.OrdinalIgnoreCase) || score >= 0.9) - && allWorkersDispatched) + && (allWorkersDispatched || allWorkersAttempted)) { + var partial = !allWorkersDispatched && allWorkersAttempted; reflectState.GoalMet = true; reflectState.IsActive = false; - AddOrchestratorSystemMessage(orchestratorName, $"✅ {reflectState.BuildCompletionSummary()} (score: {score:F1})"); + var suffix = partial ? 
" (some workers failed but all were attempted)" : ""; + AddOrchestratorSystemMessage(orchestratorName, $"✅ {reflectState.BuildCompletionSummary()} (score: {score:F1}){suffix}"); break; } @@ -2435,7 +2535,7 @@ private async Task SendViaOrchestratorReflectAsync(string groupId, List var neverDispatched = missing.Where(w => !attemptedWorkers.Contains(w)).ToList(); var detail = neverDispatched.Count > 0 ? $"Not yet dispatched: {string.Join(", ", neverDispatched)}." - : $"Dispatched but failed: {string.Join(", ", failedButAttempted)}. Retry or address errors."; + : $"Dispatched but failed: {string.Join(", ", failedButAttempted)}. Will retry next iteration."; Debug($"Reflection: overriding completion — {detail}"); reflectState.LastEvaluation = $"{detail} Dispatch to the remaining workers before completing."; AddOrchestratorSystemMessage(orchestratorName, $"🔄 Overriding completion — {detail}"); @@ -2452,18 +2552,21 @@ private async Task SendViaOrchestratorReflectAsync(string groupId, List { synthesisResponse = await SendPromptAndWaitAsync(orchestratorName, synthEvalPrompt, ct, originalPrompt: prompt); - // Check completion sentinel — but only if all workers have participated successfully + // Check completion sentinel + var allWorkersAttempted = workerNames.All(w => attemptedWorkers.Contains(w)); if (synthesisResponse.Contains("[[GROUP_REFLECT_COMPLETE]]", StringComparison.OrdinalIgnoreCase) - && allWorkersDispatched) + && (allWorkersDispatched || allWorkersAttempted)) { + var partial = !allWorkersDispatched && allWorkersAttempted; reflectState.GoalMet = true; reflectState.IsActive = false; - AddOrchestratorSystemMessage(orchestratorName, $"✅ {reflectState.BuildCompletionSummary()}"); + var suffix = partial ? 
" (some workers failed but all were attempted)" : ""; + AddOrchestratorSystemMessage(orchestratorName, $"✅ {reflectState.BuildCompletionSummary()}{suffix}"); break; } if (synthesisResponse.Contains("[[GROUP_REFLECT_COMPLETE]]", StringComparison.OrdinalIgnoreCase) - && !allWorkersDispatched) + && !allWorkersAttempted) { // Override premature completion — not all workers have participated var missing = workerNames.Where(w => !dispatchedWorkers.Contains(w)).ToList(); @@ -2559,25 +2662,16 @@ private async Task SendViaOrchestratorReflectAsync(string groupId, List reflectState.IsActive = false; reflectState.CompletedAt = DateTime.Now; ClearPendingOrchestration(); - // Send any remaining queued prompts to the orchestrator before releasing the lock. - // These were acknowledged to the user ("📨 queued") but the loop is exiting — - // sending them ensures the model sees them rather than silently discarding. + // Collect leftover queued prompts synchronously for best-effort delivery + // AFTER releasing the semaphore. This prevents holding the semaphore forever + // if SendPromptAndWaitAsync blocks on a broken SDK connection (e.g., StreamJsonRpc + // serialization errors during cancellation). See PR #323. if (_reflectQueuedPrompts.TryGetValue(groupId, out var remainingQueue)) { while (remainingQueue.TryDequeue(out var leftover)) - { - try - { - Debug($"[DISPATCH] Sending leftover queued prompt on loop exit (len={leftover.Length})"); - await SendPromptAndWaitAsync(orchestratorName, - $"[User sent a message — the reflection loop has completed]\n\n{leftover}", ct, originalPrompt: leftover); - } - catch (Exception ex) - { - Debug($"[DISPATCH] Failed to send leftover queued prompt: {ex.Message}"); - } - } + leftoverPrompts.Add(leftover); } + SaveOrganization(); InvokeOnUI(() => { @@ -2591,6 +2685,31 @@ await SendPromptAndWaitAsync(orchestratorName, { loopLock.Release(); } + + // Fire-and-forget: deliver leftover prompts after semaphore is released. 
+ // If delivery fails (broken connection), prompts are still visible in chat history. + if (leftoverPrompts.Count > 0) + { + var orchName = orchestratorName; + SafeFireAndForget(Task.Run(async () => + { + foreach (var leftover in leftoverPrompts) + { + try + { + Debug($"[DISPATCH] Sending leftover queued prompt after loop exit (len={leftover.Length})"); + using var cleanupCts = new CancellationTokenSource(TimeSpan.FromSeconds(30)); + await SendPromptAndWaitAsync(orchName, + $"[User sent a message — the reflection loop has completed]\n\n{leftover}", + cleanupCts.Token, originalPrompt: leftover); + } + catch (Exception ex) + { + Debug($"[DISPATCH] Failed to send leftover queued prompt: {ex.Message}"); + } + } + }), "leftover-prompt-delivery"); + } } private string BuildSynthesisWithEvalPrompt(string originalPrompt, List results, ReflectionCycle state, @@ -2654,6 +2773,7 @@ private string BuildReplanPrompt(string lastEvaluation, List workerNames var sb = new System.Text.StringBuilder(); sb.AppendLine("You are a DISPATCHER ONLY. You do NOT have tools. You CANNOT write code, read files, or run commands yourself."); sb.AppendLine("Your ONLY job is to write @worker/@end blocks that assign work to your workers."); + sb.AppendLine("Even if previous conversation history shows completed work, you MUST dispatch FRESH tasks now."); sb.AppendLine(); sb.AppendLine("## Previous Iteration Evaluation"); sb.AppendLine(lastEvaluation); diff --git a/PolyPilot/Services/CopilotService.cs b/PolyPilot/Services/CopilotService.cs index 9bdb631d78..86ad3ade16 100644 --- a/PolyPilot/Services/CopilotService.cs +++ b/PolyPilot/Services/CopilotService.cs @@ -449,6 +449,12 @@ private class SessionState /// Used for diagnostic logging: when the next TurnEnd re-arms the fallback, the log shows /// the self-healing loop in action (TurnEnd → TurnStart cancel → TurnEnd re-arm). 
public volatile bool FallbackCanceledByTurnStart; + /// When true, FlushCurrentResponse checks accumulated FlushedResponse for + /// @worker:...@end blocks after each sub-turn. If found, resolves ResponseCompletion + /// early so orchestrator dispatch can proceed without waiting for the model to finish + /// all its tool rounds. This prevents the orchestrator from doing all the work itself + /// when it has tool access and ignores "dispatcher only" instructions. + public bool EarlyDispatchOnWorkerBlocks; /// Timer that fires shortly after a tool starts to verify the connection is still alive. /// If no tool completion event arrives within ToolHealthCheckIntervalMs, we do an active health /// check to detect dead connections early (instead of waiting for the 600s watchdog timeout).
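For reference, the inactivity check in `WaitForSessionIdleAsync` above works in .NET `DateTime` ticks, which are 100 ns units, so a tick delta divides by 10,000,000 to give seconds. A small Python sketch of that arithmetic (helper names and the sample tick stamp are mine, for illustration only):

```python
TICKS_PER_SECOND = 10_000_000  # .NET DateTime ticks are 100 ns units

def seconds_since(now_ticks: int, last_event_ticks: int) -> float:
    """Elapsed seconds between two .NET tick stamps."""
    return (now_ticks - last_event_ticks) / TICKS_PER_SECOND

def is_zero_idle(now_ticks: int, last_event_ticks: int, threshold_s: int = 60) -> bool:
    # Mirrors the inactivity check: quiet for >= 60s suggests the SDK never
    # emitted SessionIdleEvent, so the session should be aborted to proceed.
    return seconds_since(now_ticks, last_event_ticks) >= threshold_s

last = 638_000_000_000_000_000  # an arbitrary tick stamp
print(is_zero_idle(last + 65 * TICKS_PER_SECOND, last))  # quiet 65s -> True
print(is_zero_idle(last + 30 * TICKS_PER_SECOND, last))  # quiet 30s -> False
```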