From 512cee9fbb33c4c231b383701e0290e062c6a112 Mon Sep 17 00:00:00 2001
From: Shane Neuville <shneuvil@microsoft.com>
Date: Wed, 11 Mar 2026 07:36:02 -0500
Subject: [PATCH 01/14] Fix orchestrator delegation infinite loop and empty
 worker responses

Two fixes for the Evaluate-pr-tests-orchestrator failure pattern:

1. **Completion override loop fix**: When all workers were attempted (even
   if some failed/returned empty), accept [[GROUP_REFLECT_COMPLETE]] instead
   of overriding indefinitely. Previously, workers that failed due to SDK
   bug #299 (missing SessionIdleEvent) kept dispatchedWorkers empty, causing
   the orchestrator to loop forever with '0 raw assignments'.

   Changed both the evaluator path and self-eval path to check
   allWorkersAttempted (not just allWorkersDispatched) as a valid
   completion condition.

2. **Chat history fallback for empty responses**: When a worker returns
   empty (common with SDK bug #299 where watchdog completes the session
   but FlushedResponse/CurrentResponse are empty), extract the last
   assistant message from chat history as a fallback. The SDK may have
   streamed content via delta events that ended up in history even though
   the response buffers were empty.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../Services/CopilotService.Organization.cs   | 36 ++++++++++++++-----
 1 file changed, 28 insertions(+), 8 deletions(-)
diff --git a/PolyPilot/Services/CopilotService.Organization.cs b/PolyPilot/Services/CopilotService.Organization.cs
index 86f313047e..0e551a7826 100644
--- a/PolyPilot/Services/CopilotService.Organization.cs
+++ b/PolyPilot/Services/CopilotService.Organization.cs
@@ -1402,6 +1402,20 @@ private async Task<WorkerResult> ExecuteWorkerAsync(string workerName, string ta
                     }
                 }
 
+                // Fallback: if response is still empty, try extracting from chat history.
+                // The SDK may have streamed content via delta events that ended up in history
+                // even though FlushedResponse/CurrentResponse were empty (e.g., watchdog completion).
+                if (string.IsNullOrWhiteSpace(response) && _sessions.TryGetValue(workerName, out var histState))
+                {
+                    var lastAssistant = histState.Info.History
+                        .LastOrDefault(m => m.Role == "assistant" && !string.IsNullOrWhiteSpace(m.Content));
+                    if (lastAssistant != null)
+                    {
+                        response = lastAssistant.Content;
+                        Debug($"[DISPATCH] Worker '{workerName}' recovered {response!.Length} chars from chat history");
+                    }
+                }
+
                 Debug($"[DISPATCH] Worker '{workerName}' completed (response len={response?.Length ?? 0}, elapsed={sw.Elapsed.TotalSeconds:F1}s)");
                 return new WorkerResult(workerName, response, true, null, sw.Elapsed);
             }
@@ -2418,13 +2432,16 @@ private async Task SendViaOrchestratorReflectAsync(string groupId, List<string>
                 var evaluatorModel = GetEffectiveModel(evaluatorName);
                 var trend = reflectState.RecordEvaluation(reflectState.CurrentIteration, score, rationale, evaluatorModel);
 
-                // Check if evaluator says complete — but only if all workers have participated successfully
+                // Check if evaluator says complete
+                var allWorkersAttempted = workerNames.All(w => attemptedWorkers.Contains(w));
                 if ((evalResponse.Contains("[[GROUP_REFLECT_COMPLETE]]", StringComparison.OrdinalIgnoreCase) || score >= 0.9)
-                    && allWorkersDispatched)
+                    && (allWorkersDispatched || allWorkersAttempted))
                 {
+                    var partial = !allWorkersDispatched && allWorkersAttempted;
                     reflectState.GoalMet = true;
                     reflectState.IsActive = false;
-                    AddOrchestratorSystemMessage(orchestratorName, $"✅ {reflectState.BuildCompletionSummary()} (score: {score:F1})");
+                    var suffix = partial ? " (some workers failed but all were attempted)" : "";
+                    AddOrchestratorSystemMessage(orchestratorName, $"✅ {reflectState.BuildCompletionSummary()} (score: {score:F1}){suffix}");
                     break;
                 }
 
@@ -2435,7 +2452,7 @@ private async Task SendViaOrchestratorReflectAsync(string groupId, List<string>
                     var neverDispatched = missing.Where(w => !attemptedWorkers.Contains(w)).ToList();
                     var detail = neverDispatched.Count > 0
                         ? $"Not yet dispatched: {string.Join(", ", neverDispatched)}."
-                        : $"Dispatched but failed: {string.Join(", ", failedButAttempted)}. Retry or address errors.";
+                        : $"Dispatched but failed: {string.Join(", ", failedButAttempted)}. Will retry next iteration.";
                     Debug($"Reflection: overriding completion — {detail}");
                     reflectState.LastEvaluation = $"{detail} Dispatch to the remaining workers before completing.";
                     AddOrchestratorSystemMessage(orchestratorName, $"🔄 Overriding completion — {detail}");
@@ -2452,18 +2469,21 @@ private async Task SendViaOrchestratorReflectAsync(string groupId, List<string>
             {
                 synthesisResponse = await SendPromptAndWaitAsync(orchestratorName, synthEvalPrompt, ct, originalPrompt: prompt);
 
-                // Check completion sentinel — but only if all workers have participated successfully
+                // Check completion sentinel
+                var allWorkersAttempted = workerNames.All(w => attemptedWorkers.Contains(w));
                 if (synthesisResponse.Contains("[[GROUP_REFLECT_COMPLETE]]", StringComparison.OrdinalIgnoreCase)
-                    && allWorkersDispatched)
+                    && (allWorkersDispatched || allWorkersAttempted))
                 {
+                    var partial = !allWorkersDispatched && allWorkersAttempted;
                     reflectState.GoalMet = true;
                     reflectState.IsActive = false;
-                    AddOrchestratorSystemMessage(orchestratorName, $"✅ {reflectState.BuildCompletionSummary()}");
+                    var suffix = partial ? " (some workers failed but all were attempted)" : "";
+                    AddOrchestratorSystemMessage(orchestratorName, $"✅ {reflectState.BuildCompletionSummary()}{suffix}");
                     break;
                 }
 
                 if (synthesisResponse.Contains("[[GROUP_REFLECT_COMPLETE]]", StringComparison.OrdinalIgnoreCase)
-                    && !allWorkersDispatched)
+                    && !allWorkersAttempted)
                 {
                     // Override premature completion — not all workers have participated
                     var missing = workerNames.Where(w => !dispatchedWorkers.Contains(w)).ToList();

From 71ead7ea790133648ceb5ecdc12eec49e3050980 Mon Sep 17 00:00:00 2001
From: Shane Neuville <shneuvil@microsoft.com>
Date: Wed, 11 Mar 2026 08:48:08 -0500
Subject: [PATCH 02/14] Fix reflect loop semaphore hang on cleanup (supersedes
 PR #323)

The inner finally block in SendViaOrchestratorReflectAsync awaited
SendPromptAndWaitAsync for leftover queued prompts while still holding
the semaphore. If the SDK connection was broken (StreamJsonRpc errors),
this await would never complete, permanently blocking all future
messages with 'Reflect loop already running'.

Fix: Collect leftover prompts synchronously in the finally block, then
deliver them in a fire-and-forget task with a 30s timeout AFTER the
semaphore is released. The semaphore is now always released promptly
regardless of SDK connection health.

Also includes PR #352 fixes:
- Accept [[GROUP_REFLECT_COMPLETE]] when all workers were attempted
  (not just succeeded), preventing infinite delegation loops
- Chat history fallback for empty worker responses (SDK bug #299)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../Services/CopilotService.Organization.cs   | 47 +++++++++++++------
 1 file changed, 32 insertions(+), 15 deletions(-)

diff --git a/PolyPilot/Services/CopilotService.Organization.cs b/PolyPilot/Services/CopilotService.Organization.cs
index 0e551a7826..323a0726a4 100644
--- a/PolyPilot/Services/CopilotService.Organization.cs
+++ b/PolyPilot/Services/CopilotService.Organization.cs
@@ -2255,6 +2255,7 @@ private async Task SendViaOrchestratorReflectAsync(string groupId, List<string>
             return;
         }
 
+        var leftoverPrompts = new List<string>();
         try
         {
 
@@ -2579,25 +2580,16 @@ private async Task SendViaOrchestratorReflectAsync(string groupId, List<string>
             reflectState.IsActive = false;
             reflectState.CompletedAt = DateTime.Now;
             ClearPendingOrchestration();
-            // Send any remaining queued prompts to the orchestrator before releasing the lock.
-            // These were acknowledged to the user ("📨 queued") but the loop is exiting —
-            // sending them ensures the model sees them rather than silently discarding.
+            // Collect leftover queued prompts synchronously for best-effort delivery
+            // AFTER releasing the semaphore. This prevents holding the semaphore forever
+            // if SendPromptAndWaitAsync blocks on a broken SDK connection (e.g., StreamJsonRpc
+            // serialization errors during cancellation). See PR #323.
             if (_reflectQueuedPrompts.TryGetValue(groupId, out var remainingQueue))
             {
                 while (remainingQueue.TryDequeue(out var leftover))
-                {
-                    try
-                    {
-                        Debug($"[DISPATCH] Sending leftover queued prompt on loop exit (len={leftover.Length})");
-                        await SendPromptAndWaitAsync(orchestratorName,
-                            $"[User sent a message — the reflection loop has completed]\n\n{leftover}", ct, originalPrompt: leftover);
-                    }
-                    catch (Exception ex)
-                    {
-                        Debug($"[DISPATCH] Failed to send leftover queued prompt: {ex.Message}");
-                    }
-                }
+                    leftoverPrompts.Add(leftover);
             }
+
             SaveOrganization();
             InvokeOnUI(() =>
             {
@@ -2611,6 +2603,31 @@ await SendPromptAndWaitAsync(orchestratorName,
         {
             loopLock.Release();
         }
+
+        // Fire-and-forget: deliver leftover prompts after semaphore is released.
+        // If delivery fails (broken connection), prompts are still visible in chat history.
+        if (leftoverPrompts.Count > 0)
+        {
+            var orchName = orchestratorName;
+            _ = Task.Run(async () =>
+            {
+                foreach (var leftover in leftoverPrompts)
+                {
+                    try
+                    {
+                        Debug($"[DISPATCH] Sending leftover queued prompt after loop exit (len={leftover.Length})");
+                        using var cleanupCts = new CancellationTokenSource(TimeSpan.FromSeconds(30));
+                        await SendPromptAndWaitAsync(orchName,
+                            $"[User sent a message — the reflection loop has completed]\n\n{leftover}",
+                            cleanupCts.Token, originalPrompt: leftover);
+                    }
+                    catch (Exception ex)
+                    {
+                        Debug($"[DISPATCH] Failed to send leftover queued prompt: {ex.Message}");
+                    }
+                }
+            });
+        }
     }
 
     private string BuildSynthesisWithEvalPrompt(string originalPrompt, List<WorkerResult> results, ReflectionCycle state,

From 0705867ad15ead18847ab94d2eebcd66a9fbe6c5 Mon Sep 17 00:00:00 2001
From: Shane Neuville <shneuvil@microsoft.com>
Date: Wed, 11 Mar 2026 09:37:28 -0500
Subject: [PATCH 03/14] Force delegation when orchestrator refuses to delegate
 (stale history)

Three prompt improvements + structural fallback:

1. Stronger planning prompt: explicit 'NEW request, ignore previous
   results' instruction when dispatcherOnly=true
2. Stronger nudge: warns about stale results, demands only @worker blocks
3. Forced delegation fallback: when both plan and nudge produce 0
   @worker blocks, synthetically create assignments that send the
   original prompt to all workers. This handles the case where the
   orchestrator has stale session history and refuses to delegate.

Also strengthened the replan prompt with the same fresh-start language.

Verified live: orchestrator successfully delegated to 2 workers
(10K+ char responses each), produced a 5K char consensus synthesis.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../Services/CopilotService.Organization.cs   | 25 +++++++++++--------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/PolyPilot/Services/CopilotService.Organization.cs b/PolyPilot/Services/CopilotService.Organization.cs
index 323a0726a4..fb659d822c 100644
--- a/PolyPilot/Services/CopilotService.Organization.cs
+++ b/PolyPilot/Services/CopilotService.Organization.cs
@@ -1218,6 +1218,7 @@ private string BuildOrchestratorPlanningPrompt(string userPrompt, List<string> w
         {
             sb.AppendLine("You are a DISPATCHER ONLY. You do NOT have tools. You CANNOT write code, read files, or run commands yourself.");
             sb.AppendLine("Your ONLY job is to write @worker/@end blocks that assign work to your workers.");
+            sb.AppendLine("⚠️ This is a NEW request. Even if your conversation history shows previous work on a similar topic, you MUST delegate FRESH tasks to your workers. Previous results may be stale.");
         }
         else
         {
@@ -1263,7 +1264,10 @@ private static List<TaskAssignment> DeduplicateAssignments(
     internal static string BuildDelegationNudgePrompt(List<string> workerNames)
     {
         var sb = new System.Text.StringBuilder();
-        sb.AppendLine("Your previous response did not contain any @worker delegation blocks. You MUST delegate work to your workers.");
+        sb.AppendLine("⚠️ CRITICAL: Your previous response did not contain any @worker delegation blocks.");
+        sb.AppendLine();
+        sb.AppendLine("You are a DISPATCHER. You CANNOT do the work yourself. Even if you believe the work is already done from a previous conversation, you MUST delegate FRESH work to your workers NOW.");
+        sb.AppendLine("Any previous results are STALE — the user is requesting a NEW evaluation. Ignore prior results and dispatch fresh tasks.");
         sb.AppendLine();
         sb.AppendLine($"Available workers ({workerNames.Count}): {string.Join(", ", workerNames)}");
         sb.AppendLine();
@@ -1273,7 +1277,7 @@ internal static string BuildDelegationNudgePrompt(List<string> workerNames)
         sb.AppendLine("Describe the task here on a separate line.");
         sb.AppendLine("@end");
         sb.AppendLine();
-        sb.AppendLine("Do NOT do the work yourself. Output @worker blocks now.");
+        sb.AppendLine("Produce @worker blocks for ALL workers now. Do NOT explain, do NOT summarize previous work, ONLY output @worker blocks.");
         return sb.ToString();
     }
 
@@ -2337,14 +2341,14 @@ private async Task SendViaOrchestratorReflectAsync(string groupId, List<string>
                     }
                     else
                     {
-                        reflectState.ConsecutiveErrors++;
-                        if (reflectState.ConsecutiveErrors >= 3)
-                        {
-                            reflectState.IsStalled = true;
-                            reflectState.IsCancelled = true;
-                            break;
-                        }
-                        continue;
+                        // Nudge also failed — the orchestrator refuses to delegate (likely due to
+                        // stale history from a previous run). Force delegation by creating synthetic
+                        // @worker blocks that assign the original prompt to each worker.
+                        Debug($"[DISPATCH] Nudge failed, forcing delegation to all {workerNames.Count} workers");
+                        AddOrchestratorSystemMessage(orchestratorName,
+                            $"⚡ Orchestrator refused to delegate after nudge. Forcing dispatch to all {workerNames.Count} workers.");
+                        assignments = workerNames.Select(w => new TaskAssignment(w, prompt)).ToList();
+                        // Fall through to dispatch below
                     }
                 }
                 else
@@ -2691,6 +2695,7 @@ private string BuildReplanPrompt(string lastEvaluation, List<string> workerNames
         var sb = new System.Text.StringBuilder();
         sb.AppendLine("You are a DISPATCHER ONLY. You do NOT have tools. You CANNOT write code, read files, or run commands yourself.");
         sb.AppendLine("Your ONLY job is to write @worker/@end blocks that assign work to your workers.");
+        sb.AppendLine("Even if previous conversation history shows completed work, you MUST dispatch FRESH tasks now.");
         sb.AppendLine();
         sb.AppendLine("## Previous Iteration Evaluation");
         sb.AppendLine(lastEvaluation);

From 88e839df27119ab15d8492188ac4d5da0cf7f985 Mon Sep 17 00:00:00 2001
From: Shane Neuville <shneuvil@microsoft.com>
Date: Wed, 11 Mar 2026 10:21:52 -0500
Subject: [PATCH 04/14] Fix dotnet worker to actually invoke skill-validator
 tool

The Dotnet Skill Validator worker was just describing the methodology
from dotnet/skills but never actually running the tool. Updated the
system prompt to:

- Download skill-validator binary from dotnet/skills nightly releases
- Run it against the skill directory with --runs 1 --verbose
- Report actual tool output (improvement scores, profile analysis)
- Fall back to manual analysis only if the tool can't run

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 PolyPilot/Models/ModelCapabilities.cs | 95 +++++++++++++++++++--------
 1 file changed, 67 insertions(+), 28 deletions(-)

diff --git a/PolyPilot/Models/ModelCapabilities.cs b/PolyPilot/Models/ModelCapabilities.cs
index c8773e8695..4c6c75ab9c 100644
--- a/PolyPilot/Models/ModelCapabilities.cs
+++ b/PolyPilot/Models/ModelCapabilities.cs
@@ -353,55 +353,94 @@ public record GroupPreset(string Name, string Description, string Emoji, MultiAg
             WorkerSystemPrompts = new[]
             {
                 """
-                You are the Dotnet Skill Validator. You evaluate skills empirically using the methodology from the dotnet/skills skill-validator tool.
+                You are the Dotnet Skill Validator. You evaluate skills by actually RUNNING the `skill-validator` tool from dotnet/skills — not by guessing or theorizing.
+
+                ## STEP 1: Ensure skill-validator is available
+
+                Check if the binary exists, and download it if missing:
+                ```bash
+                if [ ! -x /tmp/skill-validator ]; then
+                  echo "Downloading skill-validator..."
+                  cd /tmp && gh release download skill-validator-nightly \
+                    --repo dotnet/skills \
+                    --pattern 'skill-validator-osx-arm64.tar.gz' \
+                    --clobber && \
+                  tar xzf skill-validator-osx-arm64.tar.gz 2>/dev/null
+                fi
+                /tmp/skill-validator --help | head -5
+                ```
+                If `gh` is not available or the download fails, explain the failure and fall back to manual analysis (Step 3 only).
 
-                ## Your Evaluation Approach
+                ## STEP 2: Run skill-validator against the skill
 
-                For each skill under evaluation, perform these steps:
+                Run the tool against the skill directory. The skill directory must contain a `SKILL.md` file.
 
-                ### 1. Inspect the Skill Definition
-                - Read the SKILL.md file carefully
-                - Read the skill's `tests/eval.yaml` if present
-                - Identify what scenarios the skill claims to improve
+                ```bash
+                /tmp/skill-validator <path-to-skill-directory> \
+                  --runs 1 \
+                  --model claude-sonnet-4.6 \
+                  --verdict-warn-only \
+                  --verbose \
+                  --results-dir /tmp/skill-validator-results
+                ```
 
-                ### 2. Empirical Baseline Comparison
-                - For each scenario in the skill's eval.yaml (or infer representative scenarios from SKILL.md):
-                  a. Describe what the agent response would look like WITHOUT the skill (baseline)
-                  b. Describe what the agent response looks like WITH the skill
-                  c. Assess: token efficiency, tool call count, task completion, error rate
+                **Flags explained:**
+                - `--runs 1`: Single run for speed (the Anthropic evaluator covers qualitative depth)
+                - `--model claude-sonnet-4.6`: Use Sonnet for agent runs (fast, cost-effective)
+                - `--verdict-warn-only`: Don't fail hard on missing eval.yaml — report the gap instead
+                - `--verbose`: Show tool calls and agent events so we can see what happened
 
-                ### 3. Pairwise Comparative Judgment
-                - Compare baseline vs. skill-augmented performance side-by-side
-                - Apply position-swap bias mitigation: consider both orderings in your judgment
-                - Rate improvement on a scale of -1 (degraded) to +1 (strong improvement) with confidence
+                **If the skill has no `tests/eval.yaml`:** Report this as a finding. The tool will still run static profile analysis. Note this gap prominently in your verdict.
 
-                ### 4. Statistical Assessment
-                - Estimate variance across the scenario set
-                - Flag scenarios where the skill helps most vs. least
-                - Identify scenarios where the skill might cause regressions
+                **After the run:** Read the results:
+                ```bash
+                cat /tmp/skill-validator-results/summary.md 2>/dev/null
+                cat /tmp/skill-validator-results/results.json 2>/dev/null | head -100
+                ```
+
+                ## STEP 3: Analyze the tool output
+
+                Based on the ACTUAL tool output, assess:
+                - **Profile analysis**: Token count, section structure, code blocks (from tool's static analysis)
+                - **A/B comparison**: Did the skill improve agent performance? By how much?
+                - **Statistical significance**: Was the improvement score above the threshold?
+                - **Overfitting**: Did the tool flag any overfitting concerns?
+                - **Task completion**: Did assertions pass with the skill active?
+
+                If the tool couldn't run (no eval.yaml, download failed, etc.), perform manual analysis:
+                - Read SKILL.md and estimate token count, structure quality
+                - Identify likely improvement scenarios and potential regressions
+                - Note that this is manual analysis, not empirical data
+
+                ## STEP 4: Verdict
 
-                ### 5. Verdict
                 Format your verdict as:
                 ```
                 ## Dotnet Validator Verdict
+                **Tool Run**: ✅ Completed / ⚠️ Partial (no eval.yaml) / ❌ Failed (reason)
+                **Improvement Score**: X% [CI: low%, high%] (from tool) or N/A
                 **Overall Score**: X/10
                 **Confidence**: High / Medium / Low
                 **Verdict**: KEEP / IMPROVE / REMOVE
 
+                ### Tool Output Summary
+                - [key findings from skill-validator output]
+
                 ### Strengths
-                - [specific strengths with evidence]
+                - [specific strengths with evidence from tool output]
 
                 ### Weaknesses
-                - [specific weaknesses with evidence]
+                - [specific weaknesses with evidence from tool output]
 
                 ### Suggested Improvements
                 - [concrete, actionable suggestions]
                 ```
 
                 ## Rules
-                - Be data-driven: cite specific scenarios and observed behaviors
-                - Focus on measurable outcomes: does it reduce tokens? improve task completion? reduce errors?
-                - Don't recommend keeping a skill that shows no measurable improvement in your empirical analysis
+                - ALWAYS attempt to run the tool first. Do not skip to manual analysis.
+                - Include the actual tool output or error in your report — transparency matters.
+                - If the skill has no eval.yaml, recommend creating one and suggest specific scenarios.
+                - Be data-driven: cite tool metrics, not vibes.
                 """,
 
                 """
@@ -494,8 +533,8 @@ 3. Identify where they agree and disagree
                 4. Build a consensus verdict explaining which suggestions to adopt and why
 
                 ### Worker Names
-                - **worker-1** = Dotnet Skill Validator (empirical, outcome-focused)
-                - **worker-2** = Anthropic Skill Evaluator (prompt-design-focused)
+                - **worker-1** = Dotnet Skill Validator (runs the actual `skill-validator` tool from dotnet/skills for empirical A/B testing)
+                - **worker-2** = Anthropic Skill Evaluator (prompt-design-focused, manual analysis)
 
                 ### Dispatch Pattern
                 1. **First dispatch**: Send the skill content to BOTH workers. Include the full SKILL.md content and eval.yaml if available.

From 4b7947751aaf0255696907888a2b47cc8353e510 Mon Sep 17 00:00:00 2001
From: Shane Neuville <shneuvil@microsoft.com>
Date: Wed, 11 Mar 2026 10:30:24 -0500
Subject: [PATCH 05/14] Fix two flaky tests: WsBridge rename timeout and ChatDB
 GC race
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

WsBridgeIntegrationTests:
- WaitForAsync now throws TimeoutException on silent timeout instead
  of falling through to assertions on stale state
- Increased rename test wait to 8s for WebSocket round-trip margin

ChatDatabaseResilienceTests:
- Increased GC delay (200ms→500ms) and added 3 forced Gen2 collections
  to reliably trigger UnobservedTaskException finalization under load

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 PolyPilot.Tests/ChatDatabaseResilienceTests.cs | 13 ++++++++-----
 PolyPilot.Tests/WsBridgeIntegrationTests.cs    |  8 ++++++--
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/PolyPilot.Tests/ChatDatabaseResilienceTests.cs b/PolyPilot.Tests/ChatDatabaseResilienceTests.cs
index c0e88dc02d..951b54e39a 100644
--- a/PolyPilot.Tests/ChatDatabaseResilienceTests.cs
+++ b/PolyPilot.Tests/ChatDatabaseResilienceTests.cs
@@ -261,12 +261,15 @@ void handler(object? s, UnobservedTaskExceptionEventArgs e)
             _ = db.UpdateToolCompleteAsync("session-1", "tool-1", "result", true);
             _ = db.UpdateReasoningContentAsync("session-1", "reason-1", "content", true);
 
-            // Give tasks time to complete and finalize
+            // Give tasks time to complete and finalize — multiple GC cycles
+            // needed to reliably trigger UnobservedTaskException under load
+            await Task.Delay(500);
+            for (int i = 0; i < 3; i++)
+            {
+                GC.Collect(2, GCCollectionMode.Forced, blocking: true);
+                GC.WaitForPendingFinalizers();
+            }
             await Task.Delay(200);
-            GC.Collect();
-            GC.WaitForPendingFinalizers();
-            GC.Collect();
-            await Task.Delay(100);
 
             Assert.False(unobservedException,
                 "Fire-and-forget ChatDatabase calls must not produce unobserved task exceptions");
diff --git a/PolyPilot.Tests/WsBridgeIntegrationTests.cs b/PolyPilot.Tests/WsBridgeIntegrationTests.cs
index 6612ce4252..400587429a 100644
--- a/PolyPilot.Tests/WsBridgeIntegrationTests.cs
+++ b/PolyPilot.Tests/WsBridgeIntegrationTests.cs
@@ -27,13 +27,17 @@ private static int GetFreePort()
     }
 
     /// <summary>
-    /// Polls until a condition is true, with a timeout. Replaces fixed Task.Delay for reliability under load.
+    /// Polls until a condition is true, with a timeout. Throws TimeoutException on silent timeout
+    /// to surface flaky races instead of letting assertions fail on stale state.
     /// </summary>
     private static async Task WaitForAsync(Func<bool> condition, CancellationToken ct, int pollMs = 50, int maxMs = 4000)
     {
         var sw = System.Diagnostics.Stopwatch.StartNew();
         while (!condition() && sw.ElapsedMilliseconds < maxMs)
             await Task.Delay(pollMs, ct);
+
+        if (!condition())
+            throw new TimeoutException($"WaitForAsync condition not met within {maxMs}ms");
     }
 
     public WsBridgeIntegrationTests()
@@ -677,7 +681,7 @@ public async Task RenameSession_ViaClient_RenamesOnServer()
         var client = await ConnectClientAsync(cts.Token);
 
         await client.RenameSessionAsync("old-name", "new-name", cts.Token);
-        await WaitForAsync(() => _copilot.GetSession("new-name") != null, cts.Token);
+        await WaitForAsync(() => _copilot.GetSession("new-name") != null, cts.Token, maxMs: 8000);
 
         Assert.Null(_copilot.GetSession("old-name"));
         Assert.NotNull(_copilot.GetSession("new-name"));

From 7ad09691bf2cd47d7e158218792507231aaf6d27 Mon Sep 17 00:00:00 2001
From: Shane Neuville <shneuvil@microsoft.com>
Date: Wed, 11 Mar 2026 13:01:25 -0500
Subject: [PATCH 06/14] Address PR #352 review findings (Round 2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. Thread-safe History access: .ToArray() snapshot before .LastOrDefault()
   to prevent InvalidOperationException from concurrent SDK mutations
2. Timestamp guard on history fallback: only return assistant messages
   from the current dispatch, not stale content from prior turns
3. SafeFireAndForget wrapper for leftover prompt delivery instead of
   bare _ = Task.Run(...) — matches codebase convention
4. Platform-aware skill-validator download: uname detection for
   osx-arm64/osx-x64/linux-arm64/linux-x64 instead of hardcoded macOS

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 PolyPilot/Models/ModelCapabilities.cs             | 12 ++++++++++--
 PolyPilot/Services/CopilotService.Organization.cs | 10 ++++++----
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/PolyPilot/Models/ModelCapabilities.cs b/PolyPilot/Models/ModelCapabilities.cs
index 4c6c75ab9c..1ca901e8ee 100644
--- a/PolyPilot/Models/ModelCapabilities.cs
+++ b/PolyPilot/Models/ModelCapabilities.cs
@@ -361,11 +361,19 @@ You are the Dotnet Skill Validator. You evaluate skills by actually RUNNING the
                 ```bash
                 if [ ! -x /tmp/skill-validator ]; then
                   echo "Downloading skill-validator..."
+                  ARCH=$(uname -m)
+                  case "$(uname -s)-${ARCH}" in
+                    Darwin-arm64) PATTERN='skill-validator-osx-arm64.tar.gz' ;;
+                    Darwin-x86_64) PATTERN='skill-validator-osx-x64.tar.gz' ;;
+                    Linux-aarch64) PATTERN='skill-validator-linux-arm64.tar.gz' ;;
+                    Linux-x86_64) PATTERN='skill-validator-linux-x64.tar.gz' ;;
+                    *) echo "Unsupported platform: $(uname -s)-${ARCH}"; exit 1 ;;
+                  esac
                   cd /tmp && gh release download skill-validator-nightly \
                     --repo dotnet/skills \
-                    --pattern 'skill-validator-osx-arm64.tar.gz' \
+                    --pattern "$PATTERN" \
                     --clobber && \
-                  tar xzf skill-validator-osx-arm64.tar.gz 2>/dev/null
+                  tar xzf "$PATTERN"
                 fi
                 /tmp/skill-validator --help | head -5
                 ```
diff --git a/PolyPilot/Services/CopilotService.Organization.cs b/PolyPilot/Services/CopilotService.Organization.cs
index fb659d822c..c93438b445 100644
--- a/PolyPilot/Services/CopilotService.Organization.cs
+++ b/PolyPilot/Services/CopilotService.Organization.cs
@@ -1375,6 +1375,7 @@ private async Task<WorkerResult> ExecuteWorkerAsync(string workerName, string ta
         var workerPrompt = $"{identity}{worktreeNote}\n\nYour response will be collected and synthesized with other workers' responses.\n\n{sharedPrefix}## Original User Request (context)\n{originalPrompt}\n\n## Your Assigned Task\n{task}";
 
         const int maxRetries = 2;
+        var dispatchTime = DateTime.UtcNow;
         for (int attempt = 1; attempt <= maxRetries; attempt++)
         {
             try
@@ -1411,8 +1412,9 @@ private async Task<WorkerResult> ExecuteWorkerAsync(string workerName, string ta
                 // even though FlushedResponse/CurrentResponse were empty (e.g., watchdog completion).
                 if (string.IsNullOrWhiteSpace(response) && _sessions.TryGetValue(workerName, out var histState))
                 {
-                    var lastAssistant = histState.Info.History
-                        .LastOrDefault(m => m.Role == "assistant" && !string.IsNullOrWhiteSpace(m.Content));
+                    var lastAssistant = histState.Info.History.ToArray()
+                        .LastOrDefault(m => m.Role == "assistant" && !string.IsNullOrWhiteSpace(m.Content)
+                            && m.Timestamp >= dispatchTime);
                     if (lastAssistant != null)
                     {
                         response = lastAssistant.Content;
@@ -2613,7 +2615,7 @@ private async Task SendViaOrchestratorReflectAsync(string groupId, List<string>
         if (leftoverPrompts.Count > 0)
         {
             var orchName = orchestratorName;
-            _ = Task.Run(async () =>
+            SafeFireAndForget(Task.Run(async () =>
             {
                 foreach (var leftover in leftoverPrompts)
                 {
@@ -2630,7 +2632,7 @@ await SendPromptAndWaitAsync(orchName,
                         Debug($"[DISPATCH] Failed to send leftover queued prompt: {ex.Message}");
                     }
                 }
-            });
+            }), "leftover-prompt-delivery");
         }
     }
 

From 973a6f00a83b5746dcc112bb1f5ca4ed72e5c146 Mon Sep 17 00:00:00 2001
From: Shane Neuville <shneuvil@microsoft.com>
Date: Wed, 11 Mar 2026 15:13:21 -0500
Subject: [PATCH 07/14] Improve processing-state-safety skill based on
 validator consensus
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Apply all 6 improvements from the Skill Validator consensus report (6.5/10 → IMPROVE):

SKILL.md:
- Add code example for INV-3 (generation guard pattern)
- Clarify INV-7: explicit guidance for new cross-thread fields vs pre-existing gap
- Add 'Diagnosing a Stuck Session' section with symptom table and diagnostic steps
- Add symptom-based trigger phrases to description (spinner stuck, hung after tool use)
- Add test file pointer: run ProcessingWatchdogTests.cs after changes

ModelCapabilities.cs:
- Add optional Claude Code live trigger testing to Anthropic evaluator prompt
- Handle auth failures gracefully (skip and note, don't retry)

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../skills/processing-state-safety/SKILL.md   | 55 +++++++++++++++++--
 PolyPilot/Models/ModelCapabilities.cs         | 19 ++++++-
 2 files changed, 67 insertions(+), 7 deletions(-)

diff --git a/.claude/skills/processing-state-safety/SKILL.md b/.claude/skills/processing-state-safety/SKILL.md
index 2861b2a042..fbc031cab5 100644
--- a/.claude/skills/processing-state-safety/SKILL.md
+++ b/.claude/skills/processing-state-safety/SKILL.md
@@ -5,12 +5,13 @@ description: >
   abort/error paths, or CompleteResponse in CopilotService. Use when: (1) Adding or
   modifying code paths that set IsProcessing=false, (2) Touching HandleSessionEvent,
   CompleteResponse, AbortSessionAsync, or the processing watchdog, (3) Adding new
-  SDK event handlers, (4) Debugging stuck sessions showing "Thinking..." forever,
-  (5) Modifying IsResumed, HasUsedToolsThisTurn, or ActiveToolCallCount,
+  SDK event handlers, (4) Debugging stuck sessions showing "Thinking..." forever
+  or spinner stuck, (5) Modifying IsResumed, HasUsedToolsThisTurn, or ActiveToolCallCount,
   (6) Adding diagnostic log tags, (7) Modifying session restore paths
   (RestoreSingleSessionAsync) that must initialize watchdog-dependent state,
   (8) Modifying ReconcileOrganization or any code that reads Organization.Sessions
-  during the IsRestoring window. Covers: 13 invariants from 10 PRs of fix cycles,
+  during the IsRestoring window, (9) Session appears hung or unresponsive after tool use.
+  Covers: 13 invariants from 10 PRs of fix cycles,
   the 9 code paths that clear IsProcessing, and common regression patterns.
 ---
 
@@ -31,6 +32,7 @@ Every code path that sets `IsProcessing = false` MUST also:
 10. Fire `OnSessionComplete` (unblocks orchestrator loops waiting for completion)
 11. Add a diagnostic log entry (`[COMPLETE]`, `[ERROR]`, `[ABORT]`, etc.)
 12. Run on UI thread (via `InvokeOnUI()` or already on UI thread)
+13. After changes, run `ProcessingWatchdogTests.cs` to catch regressions
 
 ## The 9 Paths That Clear IsProcessing
 
@@ -74,6 +76,17 @@ ALL IsProcessing mutations go through UI thread via `InvokeOnUI()`.
 Use generation guard before clearing IsProcessing. `SyncContext.Post` is
 async — new `SendPromptAsync` can race between `Post()` and callback.
 
+```csharp
+// Capture BEFORE posting to UI thread
+var gen = Interlocked.Read(ref state.ProcessingGeneration);
+InvokeOnUI(() =>
+{
+    // Validate INSIDE the callback — abort if a new turn started
+    if (Interlocked.Read(ref state.ProcessingGeneration) != gen) return;
+    // Safe to clear IsProcessing here
+});
+```
+
 ### INV-4: No hardcoded short timeouts
 NEVER add hardcoded short timeouts for session resume. The watchdog
 (120s/600s) with tiered approach is the correct mechanism.
@@ -88,9 +101,11 @@ Cleared on ALL termination paths. Extends watchdog to 600s.
 Clearing guarded on `!hasActiveTool && !HasUsedToolsThisTurn`.
 
 ### INV-7: Volatile for cross-thread fields
-`HasUsedToolsThisTurn`, `HasReceivedEventsSinceResume` should use
-`Volatile.Write`/`Volatile.Read`. ARM weak memory model issue.
-(Currently partial — resets use plain assignment.)
+When adding NEW cross-thread boolean/int flags, use `Volatile.Write`/`Volatile.Read`
+for ARM weak memory model correctness. Existing fields `HasUsedToolsThisTurn` and
+`HasReceivedEventsSinceResume` use plain assignment (pre-existing inconsistency —
+tracked separately, do not fix inline). Do NOT introduce additional plain-assignment
+cross-thread fields without a tracking comment explaining the gap.
 
 ### INV-8: No InvokeAsync in HandleComplete
 `HandleComplete` is already on UI thread. `InvokeAsync` defers execution
@@ -180,6 +195,34 @@ intent is less clear when reading cross-threaded code.
 **Retired mistake (was #2):** *ActiveToolCallCount as sole tool signal* — still relevant per
 INV-5, but the more impactful version is #2 above (suppressing the fallback entirely).
 
+## Diagnosing a Stuck Session
+
+When a session shows "Thinking..." indefinitely:
+
+1. **Check the diagnostic log** — `~/.polypilot/event-diagnostics.log`
+   ```bash
+   grep 'SESSION_NAME' ~/.polypilot/event-diagnostics.log | tail -20
+   ```
+   Look for the last `[SEND]` (turn started) and whether `[IDLE]` or `[COMPLETE]` followed.
+
+2. **Check if the watchdog is running** — look for `[WATCHDOG]` entries after the `[SEND]`.
+   If none appear, the watchdog wasn't started (see INV-9 for restore path issues).
+
+3. **Check `IsProcessing` state** — via MauiDevFlow CDP:
+   ```bash
+   maui-devflow cdp Runtime evaluate "document.querySelector('.processing-indicator')?.textContent"
+   ```
+
+4. **Common stuck patterns:**
+   | Symptom | Likely Cause | Fix |
+   |---------|-------------|-----|
+   | `[SEND]` then silence | SDK never responded, watchdog will catch at 120s | Wait or abort |
+   | `[EVT] TurnEnd` but no `[IDLE]` | Zero-idle SDK bug | Watchdog catches at 30s fallback (INV-10) |
+   | `[COMPLETE]` fired but spinner persists | UI thread not notified | Check INV-2, INV-8 |
+   | `[WATCHDOG]` clears but re-sticks | New turn started before watchdog callback ran | Check INV-3 generation guard |
+
+5. **Nuclear option** — user clicks Stop (AbortSessionAsync, path #5/#6).
+
 ## Regression History
 
 10 PRs of fix/regression cycles: #141 → #147 → #148 → #153 → #158 → #163 → #164 → #276 → #284 → #332.
diff --git a/PolyPilot/Models/ModelCapabilities.cs b/PolyPilot/Models/ModelCapabilities.cs
index 1ca901e8ee..3ad4f5eda9 100644
--- a/PolyPilot/Models/ModelCapabilities.cs
+++ b/PolyPilot/Models/ModelCapabilities.cs
@@ -502,7 +502,24 @@ Run the tool against the skill directory. The skill directory must contain a `SK
                 - Be concrete: quote specific lines from SKILL.md when critiquing
                 - Focus on prompt quality, not empirical test results
                 - Suggest specific rewrites, not vague advice like "be clearer"
-                """,
+
+                ## Optional: Live Trigger Testing with Claude Code
+                If `claude` CLI is available, test trigger accuracy with non-interactive prompts:
+                ```bash
+                # Check availability first
+                which claude 2>/dev/null && claude --version 2>&1
+                ```
+                If available and authenticated, run:
+                ```bash
+                # Positive test — should trigger the skill
+                claude -p "<prompt that should trigger>" --output-format text 2>&1 | head -50
+                # Negative test — should NOT trigger the skill  
+                claude -p "<unrelated prompt>" --output-format text 2>&1 | head -50
+                ```
+                If `claude` returns auth errors ("does not have access", "login again"), skip live
+                testing and note: "Claude Code CLI not authenticated — trigger testing skipped."
+                Do NOT retry auth failures. Fall back to manual analysis.
+                """
             },
             SharedContext = """
                 ## Skill Evaluation Standards

From 845f0498acb0d5e6e231ada8e942afe419eab026 Mon Sep 17 00:00:00 2001
From: Shane Neuville <shneuvil@microsoft.com>
Date: Wed, 11 Mar 2026 16:27:50 -0500
Subject: [PATCH 08/14] feat: add Eval Generator worker to Skill Validator
 preset
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add 3rd worker (Eval Generator) to the Skill Validator multi-agent preset:
- Worker-3 reads SKILL.md and generates tests/eval.yaml with positive/negative scenarios
- Updated RoutingContext for 2-phase dispatch (generate evals → run evaluators)
- Updated SharedContext with eval.yaml format reference
- Updated test assertions for 3 workers

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 PolyPilot.Tests/SessionOrganizationTests.cs |   4 +-
 PolyPilot/Models/ModelCapabilities.cs       | 170 ++++++++++++++++++--
 2 files changed, 155 insertions(+), 19 deletions(-)

diff --git a/PolyPilot.Tests/SessionOrganizationTests.cs b/PolyPilot.Tests/SessionOrganizationTests.cs
index e123df7a22..dee8ce0597 100644
--- a/PolyPilot.Tests/SessionOrganizationTests.cs
+++ b/PolyPilot.Tests/SessionOrganizationTests.cs
@@ -1370,13 +1370,13 @@ public void BuiltInPresets_IncludeSkillValidator()
     {
         var skillValidator = GroupPreset.BuiltIn.FirstOrDefault(p => p.Name == "Skill Validator");
         Assert.NotNull(skillValidator);
-        Assert.Equal(2, skillValidator!.WorkerModels.Length);
+        Assert.Equal(3, skillValidator!.WorkerModels.Length);
         Assert.Equal(MultiAgentMode.OrchestratorReflect, skillValidator.Mode);
         Assert.Equal("⚖️", skillValidator.Emoji);
         Assert.NotNull(skillValidator.SharedContext);
         Assert.NotNull(skillValidator.RoutingContext);
         Assert.NotNull(skillValidator.WorkerSystemPrompts);
-        Assert.Equal(2, skillValidator.WorkerSystemPrompts!.Length);
+        Assert.Equal(3, skillValidator.WorkerSystemPrompts!.Length);
         Assert.All(skillValidator.WorkerSystemPrompts, p => Assert.False(string.IsNullOrWhiteSpace(p)));
         Assert.NotNull(skillValidator.MaxReflectIterations);
     }
diff --git a/PolyPilot/Models/ModelCapabilities.cs b/PolyPilot/Models/ModelCapabilities.cs
index 3ad4f5eda9..c3a69951d0 100644
--- a/PolyPilot/Models/ModelCapabilities.cs
+++ b/PolyPilot/Models/ModelCapabilities.cs
@@ -346,9 +346,9 @@ public record GroupPreset(string Name, string Description, string Emoji, MultiAg
         },
 
         new GroupPreset(
-            "Skill Validator", "Two evaluators assess your skill from different angles — dotnet empirical testing vs. Anthropic design review — orchestrator builds consensus",
+            "Skill Validator", "Three-phase skill evaluation: generate evals → empirical A/B testing → prompt design review → orchestrator builds consensus",
             "⚖️", MultiAgentMode.OrchestratorReflect,
-            "claude-opus-4.6", new[] { "claude-sonnet-4.6", "claude-sonnet-4.6" })
+            "claude-opus-4.6", new[] { "claude-sonnet-4.6", "claude-sonnet-4.6", "claude-sonnet-4.6" })
         {
             WorkerSystemPrompts = new[]
             {
@@ -519,20 +519,117 @@ which claude 2>/dev/null && claude --version 2>&1
                 If `claude` returns auth errors ("does not have access", "login again"), skip live
                 testing and note: "Claude Code CLI not authenticated — trigger testing skipped."
                 Do NOT retry auth failures. Fall back to manual analysis.
+                """,
+
                 """
+                You are the Eval Generator. You create `tests/eval.yaml` files for skills that don't have them, enabling empirical A/B validation by the skill-validator tool.
+
+                ## Your Job
+
+                Given a skill directory with SKILL.md, generate a comprehensive `eval.yaml` that tests whether the skill actually improves agent behavior.
+
+                ## STEP 1: Read the skill
+
+                Read the SKILL.md file and understand:
+                - What the skill teaches the agent to do
+                - What triggers should activate the skill
+                - What the agent should do differently WITH the skill vs WITHOUT it
+                - What mistakes the skill prevents
+
+                ## STEP 2: Check for existing eval.yaml
+
+                ```bash
+                ls <skill-directory>/tests/eval.yaml 2>/dev/null
+                ```
+                If one already exists, read it and IMPROVE it rather than replacing it.
+                If none exists, create the `tests/` directory and generate a new one.
+
+                ## STEP 3: Design scenarios
+
+                Create 3-5 scenarios covering:
+
+                1. **Positive trigger (happy path)**: A prompt that SHOULD trigger the skill. Assert the agent follows the skill's key instructions.
+                2. **Negative trigger**: A prompt that should NOT trigger the skill. Assert the agent doesn't mention skill-specific concepts.
+                3. **Edge case**: A prompt that's ambiguous or borderline. The skill should help the agent handle it correctly.
+                4. **Regression prevention**: A prompt that tests a specific mistake the skill warns about. Assert the agent avoids the mistake.
+
+                ### Assertion guidelines
+                - Use `output_contains` for key terms/patterns the skilled agent MUST produce
+                - Use `output_not_contains` for anti-patterns the skilled agent must avoid
+                - Use `output_matches` with regex for flexible pattern matching
+                - Keep assertions focused on BEHAVIOR differences, not exact wording (avoid overfitting)
+
+                ### Rubric guidelines
+                - Each rubric item should describe a QUALITY criterion the judge can evaluate
+                - Focus on outcomes ("correctly identified the issue") not vocabulary ("used the word X")
+                - Include 2-4 rubric items per scenario
+                - Rubric items are evaluated by pairwise LLM comparison (with-skill vs without-skill)
+
+                ## STEP 4: Write the eval.yaml
+
+                ```bash
+                mkdir -p <skill-directory>/tests
+                cat > <skill-directory>/tests/eval.yaml << 'EVALEOF'
+                scenarios:
+                  - name: "Description of scenario"
+                    prompt: "The exact prompt to send to the agent"
+                    assertions:
+                      - type: "output_contains"
+                        value: "expected behavior indicator"
+                    rubric:
+                      - "Quality criterion for the judge"
+                    timeout: 120
+                EVALEOF
+                ```
+
+                ## STEP 5: Validate the file
+
+                ```bash
+                cat <skill-directory>/tests/eval.yaml
+                # Verify it's valid YAML
+                python3 -c "import yaml; yaml.safe_load(open('<skill-directory>/tests/eval.yaml'))" 2>&1 || echo "YAML parse error"
+                ```
+
+                ## Output format
+
+                After writing the file, report:
+                ```
+                ## Eval Generator Report
+                **Skill**: [skill name]
+                **Scenarios Generated**: N
+                **File Written**: <path>/tests/eval.yaml
+
+                ### Scenarios
+                1. [name] — [what it tests]
+                2. [name] — [what it tests]
+                ...
+
+                ### Design Rationale
+                - [why these specific scenarios were chosen]
+                - [what behavioral differences they target]
+                ```
+
+                ## Rules
+                - ALWAYS write the file to disk. Your output is consumed by other workers.
+                - Focus on BEHAVIORAL differences — what changes when the skill is active?
+                - Avoid overfitting: don't assert the skill's exact vocabulary, assert the outcomes
+                - Keep prompts realistic — they should sound like real user requests
+                - Set reasonable timeouts (60-180s depending on complexity)
+                - Use `exit_success` assertion sparingly — it just checks for non-empty output
+                """,
             },
             SharedContext = """
                 ## Skill Evaluation Standards
 
-                Both evaluators assess the same skill and produce independent verdicts.
-                A good skill must satisfy BOTH evaluators to be marked KEEP.
+                All three workers assess the same skill from different angles.
+                A good skill must satisfy the empirical validator AND the prompt reviewer to be marked KEEP.
 
                 ### What makes a skill worth keeping
                 - Measurable improvement in task completion (Dotnet validator perspective)
                 - Clear, precise description that triggers reliably (Anthropic evaluator perspective)
+                - Comprehensive eval.yaml that tests real behavioral differences (Eval Generator perspective)
                 - Focused scope — does one thing well
                 - Actionable instructions that guide the agent without over-constraining it
-                - Adequate test coverage
 
                 ### What warrants IMPROVE
                 - Good intent but fixable gaps (bad trigger description, missing scenarios, ambiguous instructions)
@@ -544,37 +641,75 @@ A good skill must satisfy BOTH evaluators to be marked KEEP.
                 - Instructions that would cause regressions or confusion
 
                 ### Consensus Rule
-                - KEEP requires both evaluators to say KEEP or one KEEP + one IMPROVE
+                - KEEP requires both evaluators (worker-1 and worker-2) to say KEEP or one KEEP + one IMPROVE
                 - IMPROVE if the evaluators disagree or both say IMPROVE
                 - REMOVE if either evaluator says REMOVE with strong evidence
+
+                ### eval.yaml Format Reference
+                ```yaml
+                scenarios:
+                  - name: "Descriptive name"
+                    prompt: "Prompt sent to the agent"
+                    assertions:
+                      - type: "output_contains"
+                        value: "expected text"
+                      - type: "output_not_contains"
+                        value: "text that should not appear"
+                      - type: "output_matches"
+                        pattern: "regex"
+                    rubric:
+                      - "Quality criterion for pairwise LLM judge"
+                    timeout: 120
+                ```
+                Assertion types: output_contains, output_not_contains, output_matches, output_not_matches, file_exists, file_contains, exit_success.
                 """,
             RoutingContext = """
                 ## Skill Validator Orchestration
 
-                You orchestrate two skill evaluators who assess skills from different angles. Your role is to:
-                1. Dispatch the skill to BOTH evaluators simultaneously (or sequentially if needed)
-                2. Collect their verdicts
-                3. Identify where they agree and disagree
-                4. Build a consensus verdict explaining which suggestions to adopt and why
+                You orchestrate THREE skill evaluation workers in a 2-phase pipeline. Your role is to dispatch them in the correct order and build a consensus verdict.
 
                 ### Worker Names
-                - **worker-1** = Dotnet Skill Validator (runs the actual `skill-validator` tool from dotnet/skills for empirical A/B testing)
+                - **worker-1** = Dotnet Skill Validator (runs the actual `skill-validator` tool for empirical A/B testing)
                 - **worker-2** = Anthropic Skill Evaluator (prompt-design-focused, manual analysis)
+                - **worker-3** = Eval Generator (creates tests/eval.yaml if missing)
 
-                ### Dispatch Pattern
-                1. **First dispatch**: Send the skill content to BOTH workers. Include the full SKILL.md content and eval.yaml if available.
-                2. **After both complete**: Compare their verdicts. Identify agreements (high confidence) and disagreements (requires judgment).
-                3. **Build consensus**: Produce a final report that explains which worker's suggestions are adopted, which are declined, and why.
+                ### 2-Phase Dispatch Pattern
+
+                **Phase 1: Check for eval.yaml and generate if missing**
+
+                Before dispatching the evaluators, check if the skill has `tests/eval.yaml`:
+                - If `tests/eval.yaml` EXISTS → skip to Phase 2 (dispatch worker-1 and worker-2 together)
+                - If `tests/eval.yaml` is MISSING → dispatch worker-3 ALONE first
+
+                When dispatching worker-3, include the skill directory path and explain that it needs to CREATE the eval.yaml file. Worker-3 will write it to disk.
+
+                After worker-3 completes, proceed to Phase 2.
+
+                **Phase 2: Run both evaluators**
+
+                Dispatch worker-1 (Dotnet) and worker-2 (Anthropic) simultaneously. Include:
+                - The full SKILL.md content (or path)
+                - The skill directory path
+                - Whether eval.yaml was pre-existing or just generated by worker-3
+                - If worker-3 ran, mention that evals are now available at `tests/eval.yaml`
+
+                **Phase 3: Build consensus**
+
+                After both evaluators complete, synthesize their verdicts into a consensus report.
 
                 ### Consensus Report Format
                 ```
                 ## Skill Validator Consensus Report: [Skill Name]
 
                 ### Summary
+                **Eval Generation**: ✅ Pre-existing / 🆕 Generated by worker-3 / ❌ None
                 **Dotnet Verdict**: KEEP/IMPROVE/REMOVE (X/10)
                 **Anthropic Verdict**: KEEP/IMPROVE/REMOVE (X/10)
                 **Consensus**: KEEP / IMPROVE / REMOVE
 
+                ### Eval Quality (if generated)
+                - [assessment of generated eval.yaml scenarios]
+
                 ### Points of Agreement (High Confidence)
                 - [issues both evaluators flagged]
 
@@ -595,7 +730,8 @@ 4. Build a consensus verdict explaining which suggestions to adopt and why
                 - Always explain WHY suggestions are adopted or declined
                 - Where evaluators disagree, explain the tradeoff and make a reasoned judgment
                 - Highlight suggestions both evaluators agree on as high-confidence improvements
-                - NEVER emit [[GROUP_REFLECT_COMPLETE]] until both evaluators have responded and a consensus report is produced
+                - If worker-3 generated evals, note whether the Dotnet validator was able to use them
+                - NEVER emit [[GROUP_REFLECT_COMPLETE]] until all dispatched workers have responded and a consensus report is produced
                 """,
             MaxReflectIterations = 6,
         },

From c9a5f364d6b4a750835fbdba1ab678737821c985 Mon Sep 17 00:00:00 2001
From: Shane Neuville <shneuvil@microsoft.com>
Date: Wed, 11 Mar 2026 16:37:44 -0500
Subject: [PATCH 09/14] fix: strengthen orchestrator routing to prevent
 self-work

The orchestrator was using task()/bash()/view() tools itself instead of
emitting @worker:/@end blocks for PolyPilot to dispatch to worker sessions.

Changes to RoutingContext:
- Added explicit anti-self-work warning at the top
- Listed full worker session names (not just 'worker-1')
- Added concrete @worker block examples for each phase
- Removed 'check if eval.yaml exists' instruction that invited tool use
- Added 'NEVER use tools yourself' rule

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 PolyPilot/Models/ModelCapabilities.cs | 62 ++++++++++++++++++---------
 1 file changed, 42 insertions(+), 20 deletions(-)

diff --git a/PolyPilot/Models/ModelCapabilities.cs b/PolyPilot/Models/ModelCapabilities.cs
index c3a69951d0..0a468d4b02 100644
--- a/PolyPilot/Models/ModelCapabilities.cs
+++ b/PolyPilot/Models/ModelCapabilities.cs
@@ -666,36 +666,57 @@ A good skill must satisfy the empirical validator AND the prompt reviewer to be
             RoutingContext = """
                 ## Skill Validator Orchestration
 
-                You orchestrate THREE skill evaluation workers in a 2-phase pipeline. Your role is to dispatch them in the correct order and build a consensus verdict.
+                ⚠️ CRITICAL: You are a DISPATCHER. You delegate work ONLY via @worker:/@end blocks.
+                NEVER use task(), bash(), view(), grep(), or ANY tool yourself. NEVER read files, run commands, or do analysis.
+                If you catch yourself about to call a tool — STOP and write an @worker block instead.
 
-                ### Worker Names
-                - **worker-1** = Dotnet Skill Validator (runs the actual `skill-validator` tool for empirical A/B testing)
-                - **worker-2** = Anthropic Skill Evaluator (prompt-design-focused, manual analysis)
-                - **worker-3** = Eval Generator (creates tests/eval.yaml if missing)
+                ### Delegation Format (the ONLY way to assign work)
 
-                ### 2-Phase Dispatch Pattern
+                ```
+                @worker:Skill Validator-worker-3
+                Your task: [detailed instructions]
+                @end
+                ```
 
-                **Phase 1: Check for eval.yaml and generate if missing**
+                You MUST use the FULL worker session name (e.g., "Skill Validator-worker-3", not just "worker-3").
+                Each response MUST contain @worker:/@end blocks. Text outside the blocks is your planning notes.
 
-                Before dispatching the evaluators, check if the skill has `tests/eval.yaml`:
-                - If `tests/eval.yaml` EXISTS → skip to Phase 2 (dispatch worker-1 and worker-2 together)
-                - If `tests/eval.yaml` is MISSING → dispatch worker-3 ALONE first
+                ### Worker Names
+                - **Skill Validator-worker-1** = Dotnet Skill Validator (runs `skill-validator` tool)
+                - **Skill Validator-worker-2** = Anthropic Skill Evaluator (prompt analysis via Claude Code)
+                - **Skill Validator-worker-3** = Eval Generator (creates tests/eval.yaml)
 
-                When dispatching worker-3, include the skill directory path and explain that it needs to CREATE the eval.yaml file. Worker-3 will write it to disk.
+                ### 2-Phase Dispatch
 
-                After worker-3 completes, proceed to Phase 2.
+                **Phase 1 — Always dispatch worker-3 first:**
 
-                **Phase 2: Run both evaluators**
+                Write a single @worker block for worker-3. Tell it the skill directory path and to generate tests/eval.yaml.
+                Do NOT check if eval.yaml exists yourself — worker-3 will check and skip if it already exists.
 
-                Dispatch worker-1 (Dotnet) and worker-2 (Anthropic) simultaneously. Include:
-                - The full SKILL.md content (or path)
-                - The skill directory path
-                - Whether eval.yaml was pre-existing or just generated by worker-3
-                - If worker-3 ran, mention that evals are now available at `tests/eval.yaml`
+                Example:
+                ```
+                @worker:Skill Validator-worker-3
+                Generate tests/eval.yaml for the skill at: <path>
+                The skill is about: <brief description from user prompt>
+                @end
+                ```
+
+                **Phase 2 — After worker-3 completes, dispatch workers 1 and 2 together:**
 
-                **Phase 3: Build consensus**
+                Write TWO @worker blocks in a single response:
+                ```
+                @worker:Skill Validator-worker-1
+                Evaluate the skill at: <path>
+                Eval.yaml status: [generated by worker-3 / pre-existing]
+                @end
+
+                @worker:Skill Validator-worker-2
+                Evaluate the skill at: <path>
+                Eval.yaml status: [generated by worker-3 / pre-existing]
+                @end
+                ```
 
-                After both evaluators complete, synthesize their verdicts into a consensus report.
+                **Phase 3 — After both evaluators complete, write the consensus report (no @worker blocks needed).**
 
                 ### Consensus Report Format
                 ```
@@ -727,6 +748,7 @@ You orchestrate THREE skill evaluation workers in a 2-phase pipeline. Your role
                 ```
 
                 ### Rules
+                - NEVER use tools yourself — you are a message relay, not a worker
                 - Always explain WHY suggestions are adopted or declined
                 - Where evaluators disagree, explain the tradeoff and make a reasoned judgment
                 - Highlight suggestions both evaluators agree on as high-confidence improvements

From 72b55002a02abf396ee86bd1dd21bc3b3535d9ce Mon Sep 17 00:00:00 2001
From: Shane Neuville <shneuvil@microsoft.com>
Date: Wed, 11 Mar 2026 16:52:30 -0500
Subject: [PATCH 10/14] =?UTF-8?q?feat:=20early=20dispatch=20=E2=80=94=20ex?=
 =?UTF-8?q?tract=20@worker=20blocks=20from=20intermediate=20sub-turns?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Copilot CLI agent has full tool access and ignores 'dispatcher only'
instructions, doing all the work itself via task()/bash()/view() before
the orchestrator response completes. By the time ParseTaskAssignments runs,
the @worker blocks are buried in a multi-minute tool session.

Fix: FlushCurrentResponse now checks accumulated FlushedResponse for
@worker:...@end blocks when EarlyDispatchOnWorkerBlocks is set. If found,
it resolves the ResponseCompletion TCS immediately so ParseTaskAssignments
can extract and dispatch workers while the orchestrator is still running.

The flag is set for all orchestrator planning prompts (reflect, non-reflect,
and nudge paths).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 PolyPilot/Services/CopilotService.Events.cs   | 19 +++++++++++++++++++
 .../Services/CopilotService.Organization.cs   | 12 ++++++++++++
 PolyPilot/Services/CopilotService.cs          |  6 ++++++
 3 files changed, 37 insertions(+)

diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs
index bde7ecaf91..69357dddcb 100644
--- a/PolyPilot/Services/CopilotService.Events.cs
+++ b/PolyPilot/Services/CopilotService.Events.cs
@@ -828,6 +828,25 @@ private void FlushCurrentResponse(SessionState state)
         
         state.CurrentResponse.Clear();
         state.HasReceivedDeltasThisTurn = false;
+        
+        // Early dispatch: if the orchestrator wrote @worker blocks in an intermediate sub-turn,
+        // resolve the TCS now so ParseTaskAssignments can run immediately. Without this, the
+        // orchestrator continues doing tool work itself for minutes before dispatch happens.
+        if (state.EarlyDispatchOnWorkerBlocks && state.ResponseCompletion != null)
+        {
+            var flushed = state.FlushedResponse.ToString();
+            if (System.Text.RegularExpressions.Regex.IsMatch(flushed, @"@worker:.+?\n[\s\S]+?@end", System.Text.RegularExpressions.RegexOptions.IgnoreCase))
+            {
+                Debug($"[DISPATCH] Early dispatch: @worker blocks detected in flushed text ({flushed.Length} chars) for '{state.Info.Name}'");
+                state.EarlyDispatchOnWorkerBlocks = false; // One-shot
+                // Build the full response the same way CompleteResponse does
+                var remaining = state.CurrentResponse.ToString();
+                var fullResponse = string.IsNullOrEmpty(remaining) ? flushed : flushed + "\n\n" + remaining;
+                state.FlushedResponse.Clear();
+                state.CurrentResponse.Clear();
+                state.ResponseCompletion.TrySetResult(fullResponse);
+            }
+        }
     }
 
     /// <summary>
diff --git a/PolyPilot/Services/CopilotService.Organization.cs b/PolyPilot/Services/CopilotService.Organization.cs
index c93438b445..77c94f1ea0 100644
--- a/PolyPilot/Services/CopilotService.Organization.cs
+++ b/PolyPilot/Services/CopilotService.Organization.cs
@@ -1066,6 +1066,9 @@ private async Task SendViaOrchestratorAsync(string groupId, List<string> members
         InvokeOnUI(() => OnOrchestratorPhaseChanged?.Invoke(groupId, OrchestratorPhase.Planning, null));
 
         var planningPrompt = BuildOrchestratorPlanningPrompt(prompt, workerNames, group?.OrchestratorPrompt, group?.RoutingContext);
+        // Enable early dispatch for the non-reflect orchestrator flow too
+        if (_sessions.TryGetValue(orchestratorName, out var orchPlanning))
+            orchPlanning.EarlyDispatchOnWorkerBlocks = true;
         var planResponse = await SendPromptAndWaitAsync(orchestratorName, planningPrompt, cancellationToken, originalPrompt: prompt);
 
         // Phase 2: Parse task assignments from orchestrator response, with retry loop.
@@ -2318,6 +2321,12 @@ private async Task SendViaOrchestratorReflectAsync(string groupId, List<string>
                 planPrompt = BuildReplanPrompt(reflectState.LastEvaluation ?? "Continue iterating.", workerNames, prompt, group.RoutingContext);
             }
 
+            // Enable early dispatch: if the orchestrator writes @worker blocks in an intermediate
+            // sub-turn (before finishing all tool rounds), FlushCurrentResponse will resolve the
+            // TCS immediately. This prevents the orchestrator from doing all the work itself.
+            if (_sessions.TryGetValue(orchestratorName, out var orchState))
+                orchState.EarlyDispatchOnWorkerBlocks = true;
+
             var planResponse = await SendPromptAndWaitAsync(orchestratorName, planPrompt, ct, originalPrompt: prompt);
             var rawAssignments = ParseTaskAssignments(planResponse, workerNames);
             Debug($"[DISPATCH] '{orchestratorName}' reflect plan parsed: {rawAssignments.Count} raw assignments from {workerNames.Count} workers. Iteration={reflectState.CurrentIteration}, Response length={planResponse.Length}");
@@ -2333,6 +2342,9 @@ private async Task SendViaOrchestratorReflectAsync(string groupId, List<string>
                     AddOrchestratorSystemMessage(orchestratorName,
                         "⚠️ No @worker assignments parsed from orchestrator response. Retrying...");
                     var nudgePrompt = BuildDelegationNudgePrompt(workerNames);
+                    // Re-enable early dispatch for the nudge attempt too
+                    if (_sessions.TryGetValue(orchestratorName, out var nudgeState))
+                        nudgeState.EarlyDispatchOnWorkerBlocks = true;
                     var nudgeResponse = await SendPromptAndWaitAsync(orchestratorName, nudgePrompt, ct, originalPrompt: prompt);
                     var nudgeAssignments = ParseTaskAssignments(nudgeResponse, workerNames);
                     Debug($"[DISPATCH] '{orchestratorName}' nudge parsed: {nudgeAssignments.Count} raw assignments. Response length={nudgeResponse.Length}");
diff --git a/PolyPilot/Services/CopilotService.cs b/PolyPilot/Services/CopilotService.cs
index 9bdb631d78..86ad3ade16 100644
--- a/PolyPilot/Services/CopilotService.cs
+++ b/PolyPilot/Services/CopilotService.cs
@@ -449,6 +449,12 @@ private class SessionState
         /// Used for diagnostic logging: when the next TurnEnd re-arms the fallback, the log shows
         /// the self-healing loop in action (TurnEnd → TurnStart cancel → TurnEnd re-arm).</summary>
         public volatile bool FallbackCanceledByTurnStart;
+        /// <summary>When true, FlushCurrentResponse checks accumulated FlushedResponse for
+        /// @worker:...@end blocks after each sub-turn. If found, resolves ResponseCompletion
+        /// early so orchestrator dispatch can proceed without waiting for the model to finish
+        /// all its tool rounds. This prevents the orchestrator from doing all the work itself
+        /// when it has tool access and ignores "dispatcher only" instructions.</summary>
+        public bool EarlyDispatchOnWorkerBlocks;
         /// <summary>Timer that fires shortly after a tool starts to verify the connection is still alive.
         /// If no tool completion event arrives within ToolHealthCheckIntervalMs, we do an active health
         /// check to detect dead connections early (instead of waiting for the 600s watchdog timeout).</summary>

From 44c0baee48f1ad4a569e7af7b909581b405b8613 Mon Sep 17 00:00:00 2001
From: Shane Neuville <shneuvil@microsoft.com>
Date: Wed, 11 Mar 2026 17:05:37 -0500
Subject: [PATCH 11/14] feat: use WorkerDisplayNames for worker session naming

Workers now use preset-defined display names (e.g., dotnet-validator,
anthropic-validator, eval-generator) instead of generic worker-1/2/3.
Updated RoutingContext examples and consensus rules to match.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 PolyPilot/Models/ModelCapabilities.cs         | 42 +++++++++++--------
 .../Services/CopilotService.Organization.cs   |  7 +++-
 2 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/PolyPilot/Models/ModelCapabilities.cs b/PolyPilot/Models/ModelCapabilities.cs
index 0a468d4b02..deef5b238b 100644
--- a/PolyPilot/Models/ModelCapabilities.cs
+++ b/PolyPilot/Models/ModelCapabilities.cs
@@ -184,6 +184,13 @@ public record GroupPreset(string Name, string Description, string Emoji, MultiAg
     /// </summary>
     public int? MaxReflectIterations { get; init; }
 
+    /// <summary>
+    /// Custom display names for workers, indexed to match WorkerModels.
+    /// E.g. ["dotnet-validator", "anthropic-validator", "eval-generator"].
+    /// Null = use default "worker-{i}" naming.
+    /// </summary>
+    public string?[]? WorkerDisplayNames { get; init; }
+
     private const string WorkerReviewPrompt = """
         You are a PR reviewer. When assigned a PR, follow this process:
 
@@ -641,7 +648,7 @@ A good skill must satisfy the empirical validator AND the prompt reviewer to be
                 - Instructions that would cause regressions or confusion
 
                 ### Consensus Rule
-                - KEEP requires both evaluators (worker-1 and worker-2) to say KEEP or one KEEP + one IMPROVE
+                - KEEP requires both evaluators (dotnet-validator and anthropic-validator) to say KEEP or one KEEP + one IMPROVE
                 - IMPROVE if the evaluators disagree or both say IMPROVE
                 - REMOVE if either evaluator says REMOVE with strong evidence
 
@@ -673,46 +680,46 @@ A good skill must satisfy the empirical validator AND the prompt reviewer to be
                 ### Delegation Format (the ONLY way to assign work)
 
                 ```
-                @worker:Skill Validator-worker-3
+                @worker:Skill Validator-eval-generator
                 Your task: [detailed instructions]
                 @end
                 ```
 
-                You MUST use the FULL worker session name (e.g., "Skill Validator-worker-3", not just "worker-3").
+                You MUST use the FULL worker session name (e.g., "Skill Validator-eval-generator", not just "eval-generator").
                 Each response MUST contain @worker:/@end blocks. Text outside the blocks is your planning notes.
 
                 ### Worker Names
-                - **Skill Validator-worker-1** = Dotnet Skill Validator (runs `skill-validator` tool)
-                - **Skill Validator-worker-2** = Anthropic Skill Evaluator (prompt analysis via Claude Code)
-                - **Skill Validator-worker-3** = Eval Generator (creates tests/eval.yaml)
+                - **Skill Validator-dotnet-validator** = Dotnet Skill Validator (runs `skill-validator` tool)
+                - **Skill Validator-anthropic-validator** = Anthropic Skill Evaluator (prompt analysis via Claude Code)
+                - **Skill Validator-eval-generator** = Eval Generator (creates tests/eval.yaml)
 
                 ### 2-Phase Dispatch
 
-                **Phase 1 — Always dispatch worker-3 first:**
+                **Phase 1 — Always dispatch eval-generator first:**
 
-                Write a single @worker block for worker-3. Tell it the skill directory path and to generate tests/eval.yaml.
-                Do NOT check if eval.yaml exists yourself — worker-3 will check and skip if it already exists.
+                Write a single @worker block for eval-generator. Tell it the skill directory path and to generate tests/eval.yaml.
+                Do NOT check if eval.yaml exists yourself — eval-generator will check and skip if it already exists.
 
                 Example:
                 ```
-                @worker:Skill Validator-worker-3
+                @worker:Skill Validator-eval-generator
                 Generate tests/eval.yaml for the skill at: <path>
                 The skill is about: <brief description from user prompt>
                 @end
                 ```
 
-                **Phase 2 — After worker-3 completes, dispatch workers 1 and 2 together:**
+                **Phase 2 — After eval-generator completes, dispatch both validators together:**
 
                 Write TWO @worker blocks in a single response:
                 ```
-                @worker:Skill Validator-worker-1
+                @worker:Skill Validator-dotnet-validator
                 Evaluate the skill at: <path>
-                Eval.yaml status: [generated by worker-3 / pre-existing]
+                Eval.yaml status: [generated by eval-generator / pre-existing]
                 @end
 
-                @worker:Skill Validator-worker-2
+                @worker:Skill Validator-anthropic-validator
                 Evaluate the skill at: <path>
-                Eval.yaml status: [generated by worker-3 / pre-existing]
+                Eval.yaml status: [generated by eval-generator / pre-existing]
                 @end
                 ```
 
@@ -723,7 +730,7 @@ You MUST use the FULL worker session name (e.g., "Skill Validator-worker-3", not
                 ## Skill Validator Consensus Report: [Skill Name]
 
                 ### Summary
-                **Eval Generation**: ✅ Pre-existing / 🆕 Generated by worker-3 / ❌ None
+                **Eval Generation**: ✅ Pre-existing / 🆕 Generated by eval-generator / ❌ None
                 **Dotnet Verdict**: KEEP/IMPROVE/REMOVE (X/10)
                 **Anthropic Verdict**: KEEP/IMPROVE/REMOVE (X/10)
                 **Consensus**: KEEP / IMPROVE / REMOVE
@@ -752,10 +759,11 @@ You MUST use the FULL worker session name (e.g., "Skill Validator-worker-3", not
                 - Always explain WHY suggestions are adopted or declined
                 - Where evaluators disagree, explain the tradeoff and make a reasoned judgment
                 - Highlight suggestions both evaluators agree on as high-confidence improvements
-                - If worker-3 generated evals, note whether the Dotnet validator was able to use them
+                - If eval-generator generated evals, note whether the Dotnet validator was able to use them
                 - NEVER emit [[GROUP_REFLECT_COMPLETE]] until all dispatched workers have responded and a consensus report is produced
                 """,
             MaxReflectIterations = 6,
+            WorkerDisplayNames = new[] { "dotnet-validator", "anthropic-validator", "eval-generator" },
         },
     };
 }
diff --git a/PolyPilot/Services/CopilotService.Organization.cs b/PolyPilot/Services/CopilotService.Organization.cs
index 77c94f1ea0..e20a923fba 100644
--- a/PolyPilot/Services/CopilotService.Organization.cs
+++ b/PolyPilot/Services/CopilotService.Organization.cs
@@ -2082,10 +2082,13 @@ public string GetEffectiveModel(string sessionName)
         Debug($"[WorktreeStrategy] Creating {preset.WorkerModels.Length} workers with strategy={strategy}, repoId={repoId}");
         for (int i = 0; i < preset.WorkerModels.Length; i++)
         {
-            var workerName = $"{teamName}-worker-{i + 1}";
+            var displayName = preset.WorkerDisplayNames != null && i < preset.WorkerDisplayNames.Length && preset.WorkerDisplayNames[i] != null
+                ? preset.WorkerDisplayNames[i]!
+                : $"worker-{i + 1}";
+            var workerName = $"{teamName}-{displayName}";
             { int suffix = 1;
               while (_sessions.ContainsKey(workerName) || Organization.Sessions.Any(s => s.SessionName == workerName))
-                  workerName = $"{teamName}-worker-{i + 1}-{suffix++}";
+                  workerName = $"{teamName}-{displayName}-{suffix++}";
             }
             var workerModel = preset.WorkerModels[i];
             var workerWorkDir = workerWorkDirs[i] ?? orchWorkDir ?? workingDirectory;

From 8945c59d0ad13f7cde63e15c2716c482b75c7646 Mon Sep 17 00:00:00 2001
From: Shane Neuville <shneuvil@microsoft.com>
Date: Wed, 11 Mar 2026 17:21:30 -0500
Subject: [PATCH 12/14] fix: wait for orchestrator idle before synthesis after
 early dispatch

After early dispatch resolves the TCS, the orchestrator continues doing
tool work. The reflect loop must wait for it to go idle before sending
the synthesis prompt, otherwise SendPromptAsync throws 'already processing'.
Added WaitForSessionIdleAsync helper used in both reflect and non-reflect flows.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../processing-state-safety/tests/eval.yaml   | 153 ++++++++++++++++++
 .../Services/CopilotService.Organization.cs   |  34 ++++
 2 files changed, 187 insertions(+)
 create mode 100644 .claude/skills/processing-state-safety/tests/eval.yaml

diff --git a/.claude/skills/processing-state-safety/tests/eval.yaml b/.claude/skills/processing-state-safety/tests/eval.yaml
new file mode 100644
index 0000000000..f6800c7913
--- /dev/null
+++ b/.claude/skills/processing-state-safety/tests/eval.yaml
@@ -0,0 +1,153 @@
+scenarios:
+  - name: "New IsProcessing=false path needs full cleanup checklist"
+    prompt: |
+      I need to add a new code path in CopilotService that clears IsProcessing when a
+      network timeout is detected. Can you write the code for this? The path should be
+      in a Task.Run callback that runs on a timer. Make sure it's correct.
+    assertions:
+      - type: "output_contains"
+        value: "FlushCurrentResponse"
+      - type: "output_contains"
+        value: "InvokeOnUI"
+      - type: "output_contains"
+        value: "ProcessingGeneration"
+      - type: "output_matches"
+        pattern: "IsResumed|HasUsedToolsThisTurn|ActiveToolCallCount"
+      - type: "output_matches"
+        pattern: "ProcessingPhase|ProcessingStartedAt|ToolCallCount|SendingFlag"
+    rubric:
+      - "Agent includes all or nearly all of the 13-item cleanup checklist (IsResumed, HasUsedToolsThisTurn, ActiveToolCallCount, ProcessingStartedAt, ToolCallCount, ProcessingPhase, SendingFlag, ClearPermissionDenials, FlushCurrentResponse, OnSessionComplete, diagnostic log)"
+      - "Agent uses InvokeOnUI() class method (not local Invoke) for the background→UI dispatch"
+      - "Agent captures ProcessingGeneration before the InvokeOnUI call and validates it inside the lambda (generation guard pattern)"
+      - "Agent adds a diagnostic log entry with an appropriate tag like [ERROR] or [TIMEOUT]"
+    timeout: 120
+
+  - name: "Debugging a session stuck showing Thinking... forever"
+    prompt: |
+      A user reports that a session is stuck showing "Thinking..." indefinitely. 
+      The session used some tools earlier but nothing happened for several minutes.
+      How do I diagnose and fix this?
+    assertions:
+      - type: "output_contains"
+        value: "event-diagnostics.log"
+      - type: "output_contains"
+        value: "[SEND]"
+      - type: "output_matches"
+        pattern: "(?i)(watchdog|120s|600s|WATCHDOG)"
+      - type: "output_not_contains"
+        value: "restart the app"
+        # Should suggest specific diagnostic steps, not generic advice
+    rubric:
+      - "Agent directs the user to check event-diagnostics.log and explains what tags to look for ([SEND], [IDLE], [COMPLETE], [WATCHDOG])"
+      - "Agent explains the watchdog timeout tiers (120s inactivity, 600s tool/resume) and which case likely applies given tools were used"
+      - "Agent mentions the nuclear option (AbortSessionAsync / Stop button) as a fallback"
+    timeout: 120
+
+  - name: "Unrelated UI styling task should not trigger processing-state concepts"
+    prompt: |
+      I want to update the CSS for the session list items in SessionListItem.razor to
+      add a subtle hover animation. Can you help me write the CSS?
+    assertions:
+      - type: "output_not_contains"
+        value: "IsProcessing"
+      - type: "output_not_contains"
+        value: "FlushCurrentResponse"
+      - type: "output_not_contains"
+        value: "InvokeOnUI"
+      - type: "output_not_contains"
+        value: "ProcessingGeneration"
+    rubric:
+      - "Agent focuses purely on CSS/styling and does not inject unrelated processing-state safety guidance"
+      - "Response is concise and on-topic for a CSS change"
+    timeout: 60
+
+  - name: "TurnEnd fallback regression - avoid suppressing with HasUsedToolsThisTurn"
+    prompt: |
+      The AssistantTurnEndEvent fallback is firing too early during multi-tool agent
+      sessions and completing the response prematurely. I want to fix this by checking
+      if HasUsedToolsThisTurn is true and skipping the fallback entirely in that case.
+      Can you implement this fix?
+    assertions:
+      - type: "output_matches"
+        pattern: "(?i)(do not|don't|should not|shouldn't|wrong|dangerous|avoid|warn|mistake)"
+      - type: "output_contains"
+        value: "ActiveToolCallCount"
+      - type: "output_matches"
+        pattern: "(?i)(30s|30_000|extended|delay|TurnEndIdleToolFallbackAdditionalMs)"
+    rubric:
+      - "Agent explicitly warns against using HasUsedToolsThisTurn to skip the fallback entirely, explaining that it leaves agent sessions with no recovery if SessionIdleEvent is dropped"
+      - "Agent proposes the correct approach: use ActiveToolCallCount > 0 to skip the 4s fallback, with an extended 30s delay when tools were used but ActiveToolCallCount is 0"
+      - "Agent references the cancellation token from AssistantTurnStartEvent as the mechanism to prevent premature firing in multi-round tool use"
+    timeout: 120
+
+  - name: "Session restore must initialize watchdog-dependent state"
+    prompt: |
+      I'm adding a new boolean flag IsHighPrioritySession to AgentSessionInfo. It needs
+      to affect the watchdog timeout — when true, use a longer 900s timeout instead of
+      600s. I've added it to SendPromptAsync. Is there anything else I need to update?
+    assertions:
+      - type: "output_contains"
+        value: "RestoreSingleSessionAsync"
+      - type: "output_matches"
+        pattern: "(?i)(restore|restart|resume)"
+      - type: "output_matches"
+        pattern: "(?i)(before.*watchdog|StartProcessingWatchdog|watchdog.*depend)"
+    rubric:
+      - "Agent identifies that the restore path (RestoreSingleSessionAsync) must also initialize the new flag, not just SendPromptAsync"
+      - "Agent explains that state must be set BEFORE StartProcessingWatchdog is called, and the restore path is independent of the send path"
+      - "Agent may reference the IsMultiAgentSession precedent (PR #284) where the same mistake was made"
+    timeout: 120
+
+  - name: "ChatDatabase catch filter narrowing - prevent regression"
+    prompt: |
+      The ChatDatabase is swallowing too many exceptions. I want to narrow the catch
+      filter from `catch (Exception ex)` to `catch (SqliteException ex)` so we only
+      catch database-specific errors. Can you update the AddMessageAsync callers?
+    assertions:
+      - type: "output_not_contains"
+        value: "SqliteException"
+        # Should NOT recommend narrowing to SqliteException
+      - type: "output_contains"
+        value: "unobserved"
+      - type: "output_matches"
+        pattern: "(?i)(fire.and.forget|task exception|crash|source of truth|events\\.jsonl)"
+    rubric:
+      - "Agent explicitly warns against narrowing the catch filter, explaining that fire-and-forget tasks with narrow catches produce unobserved task exceptions that crash the app"
+      - "Agent explains that events.jsonl is the source of truth and DB write failures are self-healing via BulkInsertAsync on resume — broad catch is intentional"
+    timeout: 120
+
+  - name: "Background thread IsProcessing mutation without UI marshaling"
+    prompt: |
+      I'm adding a new SDK event handler for SessionReconnectEvent in
+      CopilotService.Events.cs. When we get this event, I need to clear the
+      processing state. Here's what I have inside HandleSessionEvent which runs
+      on a background thread:
+
+      ```csharp
+      case "session.reconnect":
+          state.Info.IsProcessing = false;
+          state.Info.IsResumed = false;
+          state.Info.HasUsedToolsThisTurn = false;
+          state.Info.ActiveToolCallCount = 0;
+          OnStateChanged?.Invoke();
+          break;
+      ```
+
+      Does this look correct?
+    assertions:
+      - type: "output_matches"
+        pattern: "(?i)(InvokeOnUI|UI thread|main thread|marshal|dispatch|synchronization)"
+      - type: "output_matches"
+        pattern: "(?i)(background.*thread|thread.*safe|cross.*thread)"
+      - type: "output_matches"
+        pattern: "ProcessingGeneration|generation.*guard"
+      - type: "output_matches"
+        pattern: "FlushCurrentResponse|flush"
+      - type: "output_matches"
+        pattern: "(?i)(ProcessingPhase|ProcessingStartedAt|ToolCallCount|SendingFlag|missing|incomplete)"
+    rubric:
+      - "Agent identifies the critical thread safety violation: IsProcessing and companion fields must not be mutated directly on a background SDK event thread"
+      - "Agent recommends wrapping the state mutations in InvokeOnUI() to marshal to the UI thread"
+      - "Agent points out missing companion fields beyond the ones listed (ProcessingPhase, ProcessingStartedAt, ToolCallCount, SendingFlag, ClearPermissionDenials)"
+      - "Agent mentions the ProcessingGeneration guard pattern and the need for FlushCurrentResponse before clearing IsProcessing"
+    timeout: 120
diff --git a/PolyPilot/Services/CopilotService.Organization.cs b/PolyPilot/Services/CopilotService.Organization.cs
index e20a923fba..8e094d4146 100644
--- a/PolyPilot/Services/CopilotService.Organization.cs
+++ b/PolyPilot/Services/CopilotService.Organization.cs
@@ -1168,6 +1168,9 @@ private async Task SendViaOrchestratorAsync(string groupId, List<string> members
             }
             var results = await Task.WhenAll(workerTasks);
 
+            // After early dispatch, the orchestrator may still be doing tool work.
+            await WaitForSessionIdleAsync(orchestratorName, cancellationToken);
+
             // Phase 4: Synthesize — send worker results back to orchestrator
             InvokeOnUI(() => OnOrchestratorPhaseChanged?.Invoke(groupId, OrchestratorPhase.Synthesizing, null));
 
@@ -1347,6 +1350,33 @@ internal static List<TaskAssignment> TryParseJsonAssignments(string response, Li
 
     private record WorkerResult(string WorkerName, string? Response, bool Success, string? Error, TimeSpan Duration);
 
+    /// <summary>
+    /// Wait for a session to finish processing (go idle). Used after early dispatch
+    /// resolves the orchestrator's TCS while it's still doing tool work — we must
+    /// wait for it to go idle before sending the next prompt (synthesis).
+    /// </summary>
+    private async Task WaitForSessionIdleAsync(string sessionName, CancellationToken ct, int timeoutSeconds = 300)
+    {
+        if (!_sessions.TryGetValue(sessionName, out var state))
+            return;
+        if (!state.Info.IsProcessing)
+            return;
+
+        Debug($"[DISPATCH] Waiting for '{sessionName}' to go idle before sending next prompt...");
+        var sw = System.Diagnostics.Stopwatch.StartNew();
+        while (state.Info.IsProcessing && !ct.IsCancellationRequested && sw.Elapsed.TotalSeconds < timeoutSeconds)
+        {
+            await Task.Delay(500, ct);
+        }
+        if (state.Info.IsProcessing)
+        {
+            Debug($"[DISPATCH] '{sessionName}' still processing after {sw.Elapsed.TotalSeconds:F1}s — aborting to allow synthesis");
+            await AbortSessionAsync(sessionName);
+            await Task.Delay(500, ct); // Give abort a moment to clear state
+        }
+        Debug($"[DISPATCH] '{sessionName}' now idle after {sw.Elapsed.TotalSeconds:F1}s");
+    }
+
     private async Task<WorkerResult> ExecuteWorkerAsync(string workerName, string task, string originalPrompt, CancellationToken cancellationToken)
     {
         var sw = System.Diagnostics.Stopwatch.StartNew();
@@ -2426,6 +2456,10 @@ private async Task SendViaOrchestratorReflectAsync(string groupId, List<string>
             foreach (var r in results.Where(r => r.Success))
                 dispatchedWorkers.Add(r.WorkerName);
 
+            // After early dispatch, the orchestrator may still be doing tool work.
+            // Wait for it to go idle before sending the synthesis prompt.
+            await WaitForSessionIdleAsync(orchestratorName, ct);
+
             // Phase 4: Synthesize + Evaluate
             InvokeOnUI(() => OnOrchestratorPhaseChanged?.Invoke(groupId, OrchestratorPhase.Synthesizing, iterDetail));
 

From ef3dfbfc4db547d2b4bf3748d8d4a698e498da6d Mon Sep 17 00:00:00 2001
From: Shane Neuville <shneuvil@microsoft.com>
Date: Wed, 11 Mar 2026 17:31:24 -0500
Subject: [PATCH 13/14] fix: use inactivity detection in
 WaitForSessionIdleAsync
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After early dispatch, the orchestrator may hit the zero-idle bug where
the SDK never emits SessionIdleEvent. Instead of waiting the full 300s
timeout, check LastEventAtTicks — if no SDK events for 60s, abort the
orchestrator and proceed to synthesis immediately.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../processing-state-safety/tests/eval.yaml   | 89 +++++++++++++++++--
 .../Services/CopilotService.Organization.cs   | 19 +++-
 2 files changed, 97 insertions(+), 11 deletions(-)

diff --git a/.claude/skills/processing-state-safety/tests/eval.yaml b/.claude/skills/processing-state-safety/tests/eval.yaml
index f6800c7913..6899c25997 100644
--- a/.claude/skills/processing-state-safety/tests/eval.yaml
+++ b/.claude/skills/processing-state-safety/tests/eval.yaml
@@ -15,10 +15,16 @@ scenarios:
         pattern: "IsResumed|HasUsedToolsThisTurn|ActiveToolCallCount"
       - type: "output_matches"
         pattern: "ProcessingPhase|ProcessingStartedAt|ToolCallCount|SendingFlag"
+      - type: "output_matches"
+        # OnSessionComplete unblocks orchestrator loops — must be in every cleanup path
+        pattern: "OnSessionComplete|ClearPermissionDenials"
+      - type: "output_matches"
+        # Must include a diagnostic log tag for debugging stuck sessions
+        pattern: "\\[ERROR\\]|\\[TIMEOUT\\]|\\[NETWORK\\]|diagnostic.log"
     rubric:
       - "Agent includes all or nearly all of the 13-item cleanup checklist (IsResumed, HasUsedToolsThisTurn, ActiveToolCallCount, ProcessingStartedAt, ToolCallCount, ProcessingPhase, SendingFlag, ClearPermissionDenials, FlushCurrentResponse, OnSessionComplete, diagnostic log)"
-      - "Agent uses InvokeOnUI() class method (not local Invoke) for the background→UI dispatch"
-      - "Agent captures ProcessingGeneration before the InvokeOnUI call and validates it inside the lambda (generation guard pattern)"
+      - "Agent uses InvokeOnUI() class method (not local Invoke) for the background→UI dispatch per INV-13"
+      - "Agent captures ProcessingGeneration before the InvokeOnUI call and validates it inside the lambda (generation guard pattern per INV-3 and INV-12)"
       - "Agent adds a diagnostic log entry with an appropriate tag like [ERROR] or [TIMEOUT]"
     timeout: 120
 
@@ -34,12 +40,16 @@ scenarios:
         value: "[SEND]"
       - type: "output_matches"
         pattern: "(?i)(watchdog|120s|600s|WATCHDOG)"
+      - type: "output_matches"
+        # Should reference specific diagnostic tags, not just generic "check the logs"
+        pattern: "\\[IDLE\\]|\\[COMPLETE\\]|\\[WATCHDOG\\]|\\[EVT\\]"
       - type: "output_not_contains"
         value: "restart the app"
         # Should suggest specific diagnostic steps, not generic advice
     rubric:
       - "Agent directs the user to check event-diagnostics.log and explains what tags to look for ([SEND], [IDLE], [COMPLETE], [WATCHDOG])"
       - "Agent explains the watchdog timeout tiers (120s inactivity, 600s tool/resume) and which case likely applies given tools were used"
+      - "Agent references the stuck-session diagnostic table: [SEND] then silence, [EVT] TurnEnd but no [IDLE], [COMPLETE] fired but spinner persists, etc."
       - "Agent mentions the nuclear option (AbortSessionAsync / Stop button) as a fallback"
     timeout: 120
 
@@ -56,6 +66,8 @@ scenarios:
         value: "InvokeOnUI"
       - type: "output_not_contains"
         value: "ProcessingGeneration"
+      - type: "output_not_contains"
+        value: "watchdog"
     rubric:
       - "Agent focuses purely on CSS/styling and does not inject unrelated processing-state safety guidance"
       - "Response is concise and on-topic for a CSS change"
@@ -73,9 +85,12 @@ scenarios:
       - type: "output_contains"
         value: "ActiveToolCallCount"
       - type: "output_matches"
-        pattern: "(?i)(30s|30_000|extended|delay|TurnEndIdleToolFallbackAdditionalMs)"
+        pattern: "(?i)(30s|30.000|30_000|extended|delay|TurnEndIdleToolFallbackAdditionalMs)"
+      - type: "output_matches"
+        # Must reference the cancellation mechanism, not just the delay
+        pattern: "(?i)(cancel|CancellationToken|AssistantTurnStartEvent|CTS)"
     rubric:
-      - "Agent explicitly warns against using HasUsedToolsThisTurn to skip the fallback entirely, explaining that it leaves agent sessions with no recovery if SessionIdleEvent is dropped"
+      - "Agent explicitly warns against using HasUsedToolsThisTurn to skip the fallback entirely, explaining that it leaves agent sessions with no recovery if SessionIdleEvent is dropped (SDK bug #299)"
       - "Agent proposes the correct approach: use ActiveToolCallCount > 0 to skip the 4s fallback, with an extended 30s delay when tools were used but ActiveToolCallCount is 0"
       - "Agent references the cancellation token from AssistantTurnStartEvent as the mechanism to prevent premature firing in multi-round tool use"
     timeout: 120
@@ -92,10 +107,13 @@ scenarios:
         pattern: "(?i)(restore|restart|resume)"
       - type: "output_matches"
         pattern: "(?i)(before.*watchdog|StartProcessingWatchdog|watchdog.*depend)"
+      - type: "output_matches"
+        # Should mention the precedent or dual-path pattern
+        pattern: "(?i)(IsMultiAgentSession|both.*path|two.*path|dual|separate.*path)"
     rubric:
       - "Agent identifies that the restore path (RestoreSingleSessionAsync) must also initialize the new flag, not just SendPromptAsync"
       - "Agent explains that state must be set BEFORE StartProcessingWatchdog is called, and the restore path is independent of the send path"
-      - "Agent may reference the IsMultiAgentSession precedent (PR #284) where the same mistake was made"
+      - "Agent references the IsMultiAgentSession precedent (PR #284) where the same mistake caused premature watchdog kills"
     timeout: 120
 
   - name: "ChatDatabase catch filter narrowing - prevent regression"
@@ -107,13 +125,17 @@ scenarios:
       - type: "output_not_contains"
         value: "SqliteException"
         # Should NOT recommend narrowing to SqliteException
-      - type: "output_contains"
-        value: "unobserved"
       - type: "output_matches"
-        pattern: "(?i)(fire.and.forget|task exception|crash|source of truth|events\\.jsonl)"
+        pattern: "(?i)(unobserved|crash|AggregateException|ObjectDisposedException)"
+      - type: "output_matches"
+        pattern: "(?i)(fire.and.forget|task exception|source of truth|events\\.jsonl)"
+      - type: "output_matches"
+        # Should explain self-healing via BulkInsertAsync
+        pattern: "(?i)(self.heal|BulkInsert|replay|restart|write.through)"
     rubric:
       - "Agent explicitly warns against narrowing the catch filter, explaining that fire-and-forget tasks with narrow catches produce unobserved task exceptions that crash the app"
       - "Agent explains that events.jsonl is the source of truth and DB write failures are self-healing via BulkInsertAsync on resume — broad catch is intentional"
+      - "Agent may reference PR #276 where this exact narrowing caused crashes in production"
     timeout: 120
 
   - name: "Background thread IsProcessing mutation without UI marshaling"
@@ -144,10 +166,59 @@ scenarios:
       - type: "output_matches"
         pattern: "FlushCurrentResponse|flush"
       - type: "output_matches"
+        # Must identify the missing companion fields from the 13-item checklist
         pattern: "(?i)(ProcessingPhase|ProcessingStartedAt|ToolCallCount|SendingFlag|missing|incomplete)"
+      - type: "output_matches"
+        # Must mention diagnostic log entry or OnSessionComplete
+        pattern: "(?i)(diagnostic.*log|\\[RECONNECT\\]|OnSessionComplete|ClearPermissionDenials)"
     rubric:
       - "Agent identifies the critical thread safety violation: IsProcessing and companion fields must not be mutated directly on a background SDK event thread"
       - "Agent recommends wrapping the state mutations in InvokeOnUI() to marshal to the UI thread"
-      - "Agent points out missing companion fields beyond the ones listed (ProcessingPhase, ProcessingStartedAt, ToolCallCount, SendingFlag, ClearPermissionDenials)"
+      - "Agent points out missing companion fields beyond the ones listed (ProcessingPhase, ProcessingStartedAt, ToolCallCount, SendingFlag, ClearPermissionDenials, OnSessionComplete)"
       - "Agent mentions the ProcessingGeneration guard pattern and the need for FlushCurrentResponse before clearing IsProcessing"
+      - "Agent recommends adding a diagnostic log entry with an appropriate tag"
+    timeout: 120
+
+  - name: "Watchdog timeout should use 3-way branch not a flat 600s wait"
+    prompt: |
+      I'm simplifying the watchdog timeout logic. Currently it has complicated case
+      branches. I want to replace it with a single check: if ActiveToolCallCount > 0
+      OR HasUsedToolsThisTurn, wait the full 600s before declaring the session stuck
+      and killing it. Otherwise use 120s. Is this safe?
+    assertions:
+      - type: "output_matches"
+        pattern: "(?i)(do not|don't|should not|shouldn't|wrong|dangerous|problem|issue|warn)"
+      - type: "output_matches"
+        pattern: "(?i)(SessionIdleEvent|dropped|lost.*event|event.*dropped|zero.idle|terminal event)"
+      - type: "output_matches"
+        pattern: "(?i)(Case B|CompleteResponse.*clean|cleanly.*complete|tools.*done|!hasActiveTool)"
+      - type: "output_not_contains"
+        value: "that's fine"
+    rubric:
+      - "Agent warns against the flat 600s wait and explains that tools may have finished (ActiveToolCallCount==0, HasUsedToolsThisTurn==true) but the SDK dropped SessionIdleEvent — leading to a 10-minute kill of a successfully-completed session"
+      - "Agent describes the correct 3-way branch: Case A (active tools + server alive → reset timer), Case B (tools done, used tools → call CompleteResponse cleanly), Case C (default kill with error message)"
+      - "Agent explains that Case B calls CompleteResponse cleanly without an error message, preserving the session's output"
+    timeout: 120
+
+  - name: "Edge case - adding cross-thread boolean without Volatile (INV-7)"
+    prompt: |
+      I'm adding a new boolean flag `IsWaitingForUserInput` to AgentSessionInfo that gets
+      set to true from an SDK event handler (background thread) and read by the watchdog
+      timer. I'll just use a regular bool property like the existing HasUsedToolsThisTurn.
+      Should I also clear it in the IsProcessing cleanup paths?
+    assertions:
+      - type: "output_matches"
+        # Must mention Volatile or memory model concern
+        pattern: "(?i)(Volatile\\.Write|Volatile\\.Read|volatile|memory.model|ARM|thread.safe)"
+      - type: "output_matches"
+        # Must address the cleanup question positively
+        pattern: "(?i)(yes|clear|cleanup|checklist|IsProcessing.*path)"
+      - type: "output_matches"
+        # Should mention the existing inconsistency or tracking comment requirement
+        pattern: "(?i)(HasUsedToolsThisTurn|plain.assignment|inconsisten|tracking.comment|pre.existing)"
+    rubric:
+      - "Agent answers yes to the cleanup question — new fields read by the watchdog must be cleared on all IsProcessing=false paths"
+      - "Agent warns about using plain bool for cross-thread access and recommends Volatile.Write/Volatile.Read for ARM correctness (INV-7)"
+      - "Agent notes the existing inconsistency (HasUsedToolsThisTurn uses plain assignment) and advises not to repeat it — either use Volatile or add a tracking comment explaining the gap"
+      - "Agent identifies that both SendPromptAsync and RestoreSingleSessionAsync need to initialize the new field (INV-9 dual-path requirement)"
     timeout: 120
diff --git a/PolyPilot/Services/CopilotService.Organization.cs b/PolyPilot/Services/CopilotService.Organization.cs
index 8e094d4146..efd281bc36 100644
--- a/PolyPilot/Services/CopilotService.Organization.cs
+++ b/PolyPilot/Services/CopilotService.Organization.cs
@@ -1354,6 +1354,8 @@ private record WorkerResult(string WorkerName, string? Response, bool Success, s
     /// Wait for a session to finish processing (go idle). Used after early dispatch
     /// resolves the orchestrator's TCS while it's still doing tool work — we must
     /// wait for it to go idle before sending the next prompt (synthesis).
+    /// Uses a short inactivity threshold: if no SDK events for 60s, the session is
+    /// likely stuck in a "zero-idle" state and will be aborted.
     /// </summary>
     private async Task WaitForSessionIdleAsync(string sessionName, CancellationToken ct, int timeoutSeconds = 300)
     {
@@ -1364,15 +1366,28 @@ private async Task WaitForSessionIdleAsync(string sessionName, CancellationToken
 
         Debug($"[DISPATCH] Waiting for '{sessionName}' to go idle before sending next prompt...");
         var sw = System.Diagnostics.Stopwatch.StartNew();
+        const int inactivityThresholdSeconds = 60;
+
         while (state.Info.IsProcessing && !ct.IsCancellationRequested && sw.Elapsed.TotalSeconds < timeoutSeconds)
         {
-            await Task.Delay(500, ct);
+            // Check if the session has been quiet (no SDK events) for the inactivity threshold.
+            // This catches the "zero-idle" bug where the SDK never emits SessionIdleEvent.
+            var lastEventTicks = Interlocked.Read(ref state.LastEventAtTicks);
+            var secondsSinceLastEvent = (DateTime.UtcNow - new DateTime(lastEventTicks)).TotalSeconds;
+            if (secondsSinceLastEvent >= inactivityThresholdSeconds)
+            {
+                Debug($"[DISPATCH] '{sessionName}' no SDK events for {secondsSinceLastEvent:F0}s — aborting to proceed with synthesis");
+                await AbortSessionAsync(sessionName);
+                await Task.Delay(500, ct);
+                break;
+            }
+            await Task.Delay(1000, ct);
         }
         if (state.Info.IsProcessing)
         {
             Debug($"[DISPATCH] '{sessionName}' still processing after {sw.Elapsed.TotalSeconds:F1}s — aborting to allow synthesis");
             await AbortSessionAsync(sessionName);
-            await Task.Delay(500, ct); // Give abort a moment to clear state
+            await Task.Delay(500, ct);
         }
         Debug($"[DISPATCH] '{sessionName}' now idle after {sw.Elapsed.TotalSeconds:F1}s");
     }

From 8fe94de3f30780230acf8346d84083b7a74035d4 Mon Sep 17 00:00:00 2001
From: Shane Neuville <shneuvil@microsoft.com>
Date: Wed, 11 Mar 2026 17:40:03 -0500
Subject: [PATCH 14/14] fix: force-dispatch remaining workers when orchestrator
 skips Phase 2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the reflect loop's iteration > 1 returns 0 assignments, the code
used to declare GoalMet immediately. But with 2-phase dispatch, only
the eval-generator ran in Phase 1 — the orchestrator's replan response
didn't write @worker blocks for the validators. Now we check if all
workers have been dispatched and force-dispatch remaining ones if not.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../processing-state-safety/tests/eval.yaml   | 52 +++++++++++++++++++
 .../Services/CopilotService.Organization.cs   | 16 +++++-
 2 files changed, 66 insertions(+), 2 deletions(-)

diff --git a/.claude/skills/processing-state-safety/tests/eval.yaml b/.claude/skills/processing-state-safety/tests/eval.yaml
index 6899c25997..738a409542 100644
--- a/.claude/skills/processing-state-safety/tests/eval.yaml
+++ b/.claude/skills/processing-state-safety/tests/eval.yaml
@@ -179,6 +179,34 @@ scenarios:
       - "Agent recommends adding a diagnostic log entry with an appropriate tag"
     timeout: 120
 
+  - name: "No hardcoded short timeout for session resume (INV-4)"
+    prompt: |
+      After a session is restored, we want to quickly detect if the turn was already
+      complete so the spinner doesn't hang. Can I add a 5-second sleep after
+      RestoreSingleSessionAsync and then clear IsProcessing if no events arrived?
+      Something like:
+      ```csharp
+      await Task.Delay(5000);
+      if (state.Info.IsProcessing && !state.Info.HasReceivedEventsSinceResume)
+          await CompleteResponseAsync(state);
+      ```
+    assertions:
+      - type: "output_matches"
+        pattern: "(?i)(do not|don't|avoid|should not|wrong|dangerous|warn|problem|incorrect)"
+      - type: "output_matches"
+        pattern: "(?i)(watchdog|120s|600s|tiered|timeout.tier)"
+      - type: "output_matches"
+        # Should reference IsResumed or resume quiescence mechanism
+        pattern: "(?i)(IsResumed|resume.quiescence|30s|quiescence)"
+      - type: "output_not_contains"
+        value: "Task.Delay"
+        # Should not approve or refine the Task.Delay approach
+    rubric:
+      - "Agent warns against hardcoded short timeouts for session resume, explaining that legitimate multi-agent or tool-heavy resumes need the full watchdog timeout window"
+      - "Agent directs the developer to the existing watchdog mechanism (IsResumed flag + 600s extended timeout, or 30s resume-quiescence path) as the correct solution"
+      - "Agent explains that the 5s Task.Delay would prematurely kill sessions that are genuinely still processing after an app restart"
+    timeout: 120
+
   - name: "Watchdog timeout should use 3-way branch not a flat 600s wait"
     prompt: |
       I'm simplifying the watchdog timeout logic. Currently it has complicated case
@@ -222,3 +250,27 @@ scenarios:
       - "Agent notes the existing inconsistency (HasUsedToolsThisTurn uses plain assignment) and advises not to repeat it — either use Volatile or add a tracking comment explaining the gap"
       - "Agent identifies that both SendPromptAsync and RestoreSingleSessionAsync need to initialize the new field (INV-9 dual-path requirement)"
     timeout: 120
+
+  - name: "No InvokeAsync inside HandleComplete - use synchronous completion (INV-8)"
+    prompt: |
+      I'm modifying HandleComplete in CopilotService to notify the UI after finalizing
+      a response. I want to call `await InvokeAsync(StateHasChanged)` inside the method
+      to trigger a Blazor render refresh. HandleComplete is called via Invoke() from the
+      SDK event handler. Is this pattern correct?
+    assertions:
+      - type: "output_matches"
+        pattern: "(?i)(do not|don't|avoid|should not|wrong|problem|stale|deferred)"
+      - type: "output_matches"
+        # Should explain why InvokeAsync defers execution
+        pattern: "(?i)(defer|async.*defer|already.*UI|UI.*thread|synchronous|stale.*render)"
+      - type: "output_matches"
+        # Should mention StateHasChanged or OnStateChanged as the correct approach
+        pattern: "(?i)(StateHasChanged|OnStateChanged|direct.*call|synchronous)"
+      - type: "output_not_contains"
+        value: "await InvokeAsync"
+        # Should not recommend InvokeAsync inside HandleComplete
+    rubric:
+      - "Agent warns against using InvokeAsync inside HandleComplete, explaining it defers execution and causes stale renders because HandleComplete already runs on the UI thread"
+      - "Agent explains that HandleComplete is dispatched via Invoke() (synchronous), so it is already on the UI thread and can call StateHasChanged or OnStateChanged directly"
+      - "Agent proposes the correct synchronous approach that avoids the deferred render issue"
+    timeout: 120
diff --git a/PolyPilot/Services/CopilotService.Organization.cs b/PolyPilot/Services/CopilotService.Organization.cs
index efd281bc36..50e64e8094 100644
--- a/PolyPilot/Services/CopilotService.Organization.cs
+++ b/PolyPilot/Services/CopilotService.Organization.cs
@@ -2416,13 +2416,25 @@ private async Task SendViaOrchestratorReflectAsync(string groupId, List<string>
                 else
                 {
                     // Later iterations: orchestrator decided no more work needed —
-                    // but only declare GoalMet if no queued prompts produced @worker blocks
-                    if (queuedAssignments.Count == 0)
+                    // but only declare GoalMet if all workers have been dispatched
+                    var allDispatched = workerNames.All(w => dispatchedWorkers.Contains(w));
+                    var allAttempted = workerNames.All(w => attemptedWorkers.Contains(w));
+                    if (queuedAssignments.Count == 0 && (allDispatched || allAttempted))
                     {
                         reflectState.GoalMet = true;
                         AddOrchestratorSystemMessage(orchestratorName, $"✅ Orchestrator completed without delegation (iteration {reflectState.CurrentIteration}).");
                         break;
                     }
+                    if (!allDispatched && queuedAssignments.Count == 0)
+                    {
+                        // Not all workers dispatched — force dispatch remaining workers
+                        var remaining = workerNames.Where(w => !dispatchedWorkers.Contains(w) && !attemptedWorkers.Contains(w)).ToList();
+                        Debug($"[DISPATCH] Iteration {reflectState.CurrentIteration}: 0 assignments but {remaining.Count} workers never dispatched — forcing: {string.Join(", ", remaining)}");
+                        AddOrchestratorSystemMessage(orchestratorName,
+                            $"⚡ Forcing dispatch to {remaining.Count} remaining worker(s): {string.Join(", ", remaining)}");
+                        assignments = remaining.Select(w => new TaskAssignment(w, prompt)).ToList();
+                        // Fall through to dispatch below
+                    }
                     // Fall through to merge and dispatch queued work
                 }
             }