diff --git a/.claude/skills/processing-state-safety/SKILL.md b/.claude/skills/processing-state-safety/SKILL.md
index 2861b2a042..fbc031cab5 100644
--- a/.claude/skills/processing-state-safety/SKILL.md
+++ b/.claude/skills/processing-state-safety/SKILL.md
@@ -5,12 +5,13 @@ description: >
abort/error paths, or CompleteResponse in CopilotService. Use when: (1) Adding or
modifying code paths that set IsProcessing=false, (2) Touching HandleSessionEvent,
CompleteResponse, AbortSessionAsync, or the processing watchdog, (3) Adding new
- SDK event handlers, (4) Debugging stuck sessions showing "Thinking..." forever,
- (5) Modifying IsResumed, HasUsedToolsThisTurn, or ActiveToolCallCount,
+ SDK event handlers, (4) Debugging stuck sessions showing "Thinking..." forever
+ or spinner stuck, (5) Modifying IsResumed, HasUsedToolsThisTurn, or ActiveToolCallCount,
(6) Adding diagnostic log tags, (7) Modifying session restore paths
(RestoreSingleSessionAsync) that must initialize watchdog-dependent state,
(8) Modifying ReconcileOrganization or any code that reads Organization.Sessions
- during the IsRestoring window. Covers: 13 invariants from 10 PRs of fix cycles,
+ during the IsRestoring window, (9) Session appears hung or unresponsive after tool use.
+ Covers: 13 invariants from 10 PRs of fix cycles,
the 9 code paths that clear IsProcessing, and common regression patterns.
---
@@ -31,6 +32,7 @@ Every code path that sets `IsProcessing = false` MUST also:
10. Fire `OnSessionComplete` (unblocks orchestrator loops waiting for completion)
11. Add a diagnostic log entry (`[COMPLETE]`, `[ERROR]`, `[ABORT]`, etc.)
12. Run on UI thread (via `InvokeOnUI()` or already on UI thread)
+13. After changes, run `ProcessingWatchdogTests.cs` to catch regressions
## The 9 Paths That Clear IsProcessing
@@ -74,6 +76,17 @@ ALL IsProcessing mutations go through UI thread via `InvokeOnUI()`.
Use generation guard before clearing IsProcessing. `SyncContext.Post` is
async — new `SendPromptAsync` can race between `Post()` and callback.
+```csharp
+// Capture BEFORE posting to UI thread
+var gen = Interlocked.Read(ref state.ProcessingGeneration);
+InvokeOnUI(() =>
+{
+ // Validate INSIDE the callback — abort if a new turn started
+ if (Interlocked.Read(ref state.ProcessingGeneration) != gen) return;
+ // Safe to clear IsProcessing here
+});
+```
+
### INV-4: No hardcoded short timeouts
NEVER add hardcoded short timeouts for session resume. The watchdog
(120s/600s) with tiered approach is the correct mechanism.
@@ -88,9 +101,11 @@ Cleared on ALL termination paths. Extends watchdog to 600s.
Clearing guarded on `!hasActiveTool && !HasUsedToolsThisTurn`.
### INV-7: Volatile for cross-thread fields
-`HasUsedToolsThisTurn`, `HasReceivedEventsSinceResume` should use
-`Volatile.Write`/`Volatile.Read`. ARM weak memory model issue.
-(Currently partial — resets use plain assignment.)
+When adding NEW cross-thread boolean/int flags, use `Volatile.Write`/`Volatile.Read`
+for ARM weak memory model correctness. Existing fields `HasUsedToolsThisTurn` and
+`HasReceivedEventsSinceResume` use plain assignment (pre-existing inconsistency —
+tracked separately, do not fix inline). Do NOT introduce additional plain-assignment
+cross-thread fields without a tracking comment explaining the gap.
### INV-8: No InvokeAsync in HandleComplete
`HandleComplete` is already on UI thread. `InvokeAsync` defers execution
@@ -180,6 +195,34 @@ intent is less clear when reading cross-threaded code.
**Retired mistake (was #2):** *ActiveToolCallCount as sole tool signal* — still relevant per
INV-5, but the more impactful version is #2 above (suppressing the fallback entirely).
+## Diagnosing a Stuck Session
+
+When a session shows "Thinking..." indefinitely:
+
+1. **Check the diagnostic log** — `~/.polypilot/event-diagnostics.log`
+ ```bash
+ grep 'SESSION_NAME' ~/.polypilot/event-diagnostics.log | tail -20
+ ```
+ Look for the last `[SEND]` (turn started) and whether `[IDLE]` or `[COMPLETE]` followed.
+
+2. **Check if the watchdog is running** — look for `[WATCHDOG]` entries after the `[SEND]`.
+ If none appear, the watchdog wasn't started (see INV-9 for restore path issues).
+
+3. **Check `IsProcessing` state** — via MauiDevFlow CDP:
+ ```bash
+ maui-devflow cdp Runtime evaluate "document.querySelector('.processing-indicator')?.textContent"
+ ```
+
+4. **Common stuck patterns:**
+ | Symptom | Likely Cause | Fix |
+ |---------|-------------|-----|
+ | `[SEND]` then silence | SDK never responded, watchdog will catch at 120s | Wait or abort |
+ | `[EVT] TurnEnd` but no `[IDLE]` | Zero-idle SDK bug | Watchdog catches at 30s fallback (INV-10) |
+ | `[COMPLETE]` fired but spinner persists | UI thread not notified | Check INV-2, INV-8 |
+ | `[WATCHDOG]` clears but re-sticks | New turn started before watchdog callback ran | Check INV-3 generation guard |
+
+5. **Nuclear option** — user clicks Stop (AbortSessionAsync, path #5/#6).
+
## Regression History
10 PRs of fix/regression cycles: #141 → #147 → #148 → #153 → #158 → #163 → #164 → #276 → #284 → #332.
diff --git a/.claude/skills/processing-state-safety/tests/eval.yaml b/.claude/skills/processing-state-safety/tests/eval.yaml
new file mode 100644
index 0000000000..738a409542
--- /dev/null
+++ b/.claude/skills/processing-state-safety/tests/eval.yaml
@@ -0,0 +1,276 @@
+scenarios:
+ - name: "New IsProcessing=false path needs full cleanup checklist"
+ prompt: |
+ I need to add a new code path in CopilotService that clears IsProcessing when a
+ network timeout is detected. Can you write the code for this? The path should be
+ in a Task.Run callback that runs on a timer. Make sure it's correct.
+ assertions:
+ - type: "output_contains"
+ value: "FlushCurrentResponse"
+ - type: "output_contains"
+ value: "InvokeOnUI"
+ - type: "output_contains"
+ value: "ProcessingGeneration"
+ - type: "output_matches"
+ pattern: "IsResumed|HasUsedToolsThisTurn|ActiveToolCallCount"
+ - type: "output_matches"
+ pattern: "ProcessingPhase|ProcessingStartedAt|ToolCallCount|SendingFlag"
+ - type: "output_matches"
+ # OnSessionComplete unblocks orchestrator loops — must be in every cleanup path
+ pattern: "OnSessionComplete|ClearPermissionDenials"
+ - type: "output_matches"
+ # Must include a diagnostic log tag for debugging stuck sessions
+ pattern: "\\[ERROR\\]|\\[TIMEOUT\\]|\\[NETWORK\\]|diagnostic.log"
+ rubric:
+ - "Agent includes all or nearly all of the 13-item cleanup checklist (IsResumed, HasUsedToolsThisTurn, ActiveToolCallCount, ProcessingStartedAt, ToolCallCount, ProcessingPhase, SendingFlag, ClearPermissionDenials, FlushCurrentResponse, OnSessionComplete, diagnostic log)"
+ - "Agent uses InvokeOnUI() class method (not local Invoke) for the background→UI dispatch per INV-13"
+ - "Agent captures ProcessingGeneration before the InvokeOnUI call and validates it inside the lambda (generation guard pattern per INV-3 and INV-12)"
+ - "Agent adds a diagnostic log entry with an appropriate tag like [ERROR] or [TIMEOUT]"
+ timeout: 120
+
+ - name: "Debugging a session stuck showing Thinking... forever"
+ prompt: |
+ A user reports that a session is stuck showing "Thinking..." indefinitely.
+ The session used some tools earlier but nothing happened for several minutes.
+ How do I diagnose and fix this?
+ assertions:
+ - type: "output_contains"
+ value: "event-diagnostics.log"
+ - type: "output_contains"
+ value: "[SEND]"
+ - type: "output_matches"
+ pattern: "(?i)(watchdog|120s|600s|WATCHDOG)"
+ - type: "output_matches"
+ # Should reference specific diagnostic tags, not just generic "check the logs"
+ pattern: "\\[IDLE\\]|\\[COMPLETE\\]|\\[WATCHDOG\\]|\\[EVT\\]"
+ - type: "output_not_contains"
+ value: "restart the app"
+ # Should suggest specific diagnostic steps, not generic advice
+ rubric:
+ - "Agent directs the user to check event-diagnostics.log and explains what tags to look for ([SEND], [IDLE], [COMPLETE], [WATCHDOG])"
+ - "Agent explains the watchdog timeout tiers (120s inactivity, 600s tool/resume) and which case likely applies given tools were used"
+ - "Agent references the stuck-session diagnostic table: [SEND] then silence, [EVT] TurnEnd but no [IDLE], [COMPLETE] fired but spinner persists, etc."
+ - "Agent mentions the nuclear option (AbortSessionAsync / Stop button) as a fallback"
+ timeout: 120
+
+ - name: "Unrelated UI styling task should not trigger processing-state concepts"
+ prompt: |
+ I want to update the CSS for the session list items in SessionListItem.razor to
+ add a subtle hover animation. Can you help me write the CSS?
+ assertions:
+ - type: "output_not_contains"
+ value: "IsProcessing"
+ - type: "output_not_contains"
+ value: "FlushCurrentResponse"
+ - type: "output_not_contains"
+ value: "InvokeOnUI"
+ - type: "output_not_contains"
+ value: "ProcessingGeneration"
+ - type: "output_not_contains"
+ value: "watchdog"
+ rubric:
+ - "Agent focuses purely on CSS/styling and does not inject unrelated processing-state safety guidance"
+ - "Response is concise and on-topic for a CSS change"
+ timeout: 60
+
+ - name: "TurnEnd fallback regression - avoid suppressing with HasUsedToolsThisTurn"
+ prompt: |
+ The AssistantTurnEndEvent fallback is firing too early during multi-tool agent
+ sessions and completing the response prematurely. I want to fix this by checking
+ if HasUsedToolsThisTurn is true and skipping the fallback entirely in that case.
+ Can you implement this fix?
+ assertions:
+ - type: "output_matches"
+ pattern: "(?i)(do not|don't|should not|shouldn't|wrong|dangerous|avoid|warn|mistake)"
+ - type: "output_contains"
+ value: "ActiveToolCallCount"
+ - type: "output_matches"
+ pattern: "(?i)(30s|30.000|30_000|extended|delay|TurnEndIdleToolFallbackAdditionalMs)"
+ - type: "output_matches"
+ # Must reference the cancellation mechanism, not just the delay
+ pattern: "(?i)(cancel|CancellationToken|AssistantTurnStartEvent|CTS)"
+ rubric:
+ - "Agent explicitly warns against using HasUsedToolsThisTurn to skip the fallback entirely, explaining that it leaves agent sessions with no recovery if SessionIdleEvent is dropped (SDK bug #299)"
+ - "Agent proposes the correct approach: use ActiveToolCallCount > 0 to skip the 4s fallback, with an extended 30s delay when tools were used but ActiveToolCallCount is 0"
+ - "Agent references the cancellation token from AssistantTurnStartEvent as the mechanism to prevent premature firing in multi-round tool use"
+ timeout: 120
+
+ - name: "Session restore must initialize watchdog-dependent state"
+ prompt: |
+ I'm adding a new boolean flag IsHighPrioritySession to AgentSessionInfo. It needs
+ to affect the watchdog timeout — when true, use a longer 900s timeout instead of
+ 600s. I've added it to SendPromptAsync. Is there anything else I need to update?
+ assertions:
+ - type: "output_contains"
+ value: "RestoreSingleSessionAsync"
+ - type: "output_matches"
+ pattern: "(?i)(restore|restart|resume)"
+ - type: "output_matches"
+ pattern: "(?i)(before.*watchdog|StartProcessingWatchdog|watchdog.*depend)"
+ - type: "output_matches"
+ # Should mention the precedent or dual-path pattern
+ pattern: "(?i)(IsMultiAgentSession|both.*path|two.*path|dual|separate.*path)"
+ rubric:
+ - "Agent identifies that the restore path (RestoreSingleSessionAsync) must also initialize the new flag, not just SendPromptAsync"
+ - "Agent explains that state must be set BEFORE StartProcessingWatchdog is called, and the restore path is independent of the send path"
+ - "Agent references the IsMultiAgentSession precedent (PR #284) where the same mistake caused premature watchdog kills"
+ timeout: 120
+
+ - name: "ChatDatabase catch filter narrowing - prevent regression"
+ prompt: |
+ The ChatDatabase is swallowing too many exceptions. I want to narrow the catch
+ filter from `catch (Exception ex)` to `catch (SqliteException ex)` so we only
+ catch database-specific errors. Can you update the AddMessageAsync callers?
+ assertions:
+ - type: "output_not_contains"
+ value: "SqliteException"
+ # Should NOT recommend narrowing to SqliteException
+ - type: "output_matches"
+ pattern: "(?i)(unobserved|crash|AggregateException|ObjectDisposedException)"
+ - type: "output_matches"
+ pattern: "(?i)(fire.and.forget|task exception|source of truth|events\\.jsonl)"
+ - type: "output_matches"
+ # Should explain self-healing via BulkInsertAsync
+ pattern: "(?i)(self.heal|BulkInsert|replay|restart|write.through)"
+ rubric:
+ - "Agent explicitly warns against narrowing the catch filter, explaining that fire-and-forget tasks with narrow catches produce unobserved task exceptions that crash the app"
+ - "Agent explains that events.jsonl is the source of truth and DB write failures are self-healing via BulkInsertAsync on resume — broad catch is intentional"
+ - "Agent may reference PR #276 where this exact narrowing caused crashes in production"
+ timeout: 120
+
+ - name: "Background thread IsProcessing mutation without UI marshaling"
+ prompt: |
+ I'm adding a new SDK event handler for SessionReconnectEvent in
+ CopilotService.Events.cs. When we get this event, I need to clear the
+ processing state. Here's what I have inside HandleSessionEvent which runs
+ on a background thread:
+
+ ```csharp
+ case "session.reconnect":
+ state.Info.IsProcessing = false;
+ state.Info.IsResumed = false;
+ state.Info.HasUsedToolsThisTurn = false;
+ state.Info.ActiveToolCallCount = 0;
+ OnStateChanged?.Invoke();
+ break;
+ ```
+
+ Does this look correct?
+ assertions:
+ - type: "output_matches"
+ pattern: "(?i)(InvokeOnUI|UI thread|main thread|marshal|dispatch|synchronization)"
+ - type: "output_matches"
+ pattern: "(?i)(background.*thread|thread.*safe|cross.*thread)"
+ - type: "output_matches"
+ pattern: "ProcessingGeneration|generation.*guard"
+ - type: "output_matches"
+ pattern: "FlushCurrentResponse|flush"
+ - type: "output_matches"
+ # Must identify the missing companion fields from the 13-item checklist
+ pattern: "(?i)(ProcessingPhase|ProcessingStartedAt|ToolCallCount|SendingFlag|missing|incomplete)"
+ - type: "output_matches"
+ # Must mention diagnostic log entry or OnSessionComplete
+ pattern: "(?i)(diagnostic.*log|\\[RECONNECT\\]|OnSessionComplete|ClearPermissionDenials)"
+ rubric:
+ - "Agent identifies the critical thread safety violation: IsProcessing and companion fields must not be mutated directly on a background SDK event thread"
+ - "Agent recommends wrapping the state mutations in InvokeOnUI() to marshal to the UI thread"
+ - "Agent points out missing companion fields beyond the ones listed (ProcessingPhase, ProcessingStartedAt, ToolCallCount, SendingFlag, ClearPermissionDenials, OnSessionComplete)"
+ - "Agent mentions the ProcessingGeneration guard pattern and the need for FlushCurrentResponse before clearing IsProcessing"
+ - "Agent recommends adding a diagnostic log entry with an appropriate tag"
+ timeout: 120
+
+ - name: "No hardcoded short timeout for session resume (INV-4)"
+ prompt: |
+ After a session is restored, we want to quickly detect if the turn was already
+ complete so the spinner doesn't hang. Can I add a 5-second sleep after
+ RestoreSingleSessionAsync and then clear IsProcessing if no events arrived?
+ Something like:
+ ```csharp
+ await Task.Delay(5000);
+ if (state.Info.IsProcessing && !state.Info.HasReceivedEventsSinceResume)
+ await CompleteResponseAsync(state);
+ ```
+ assertions:
+ - type: "output_matches"
+ pattern: "(?i)(do not|don't|avoid|should not|wrong|dangerous|warn|problem|incorrect)"
+ - type: "output_matches"
+ pattern: "(?i)(watchdog|120s|600s|tiered|timeout.tier)"
+ - type: "output_matches"
+ # Should reference IsResumed or resume quiescence mechanism
+ pattern: "(?i)(IsResumed|resume.quiescence|30s|quiescence)"
+ - type: "output_not_contains"
+ value: "Task.Delay"
+ # Should not approve or refine the Task.Delay approach
+ rubric:
+ - "Agent warns against hardcoded short timeouts for session resume, explaining that legitimate multi-agent or tool-heavy resumes need the full watchdog timeout window"
+ - "Agent directs the developer to the existing watchdog mechanism (IsResumed flag + 600s extended timeout, or 30s resume-quiescence path) as the correct solution"
+ - "Agent explains that the 5s Task.Delay would prematurely kill sessions that are genuinely still processing after an app restart"
+ timeout: 120
+
+ - name: "Watchdog timeout should use 3-way branch not a flat 600s wait"
+ prompt: |
+ I'm simplifying the watchdog timeout logic. Currently it has complicated case
+ branches. I want to replace it with a single check: if ActiveToolCallCount > 0
+ OR HasUsedToolsThisTurn, wait the full 600s before declaring the session stuck
+ and killing it. Otherwise use 120s. Is this safe?
+ assertions:
+ - type: "output_matches"
+ pattern: "(?i)(do not|don't|should not|shouldn't|wrong|dangerous|problem|issue|warn)"
+ - type: "output_matches"
+ pattern: "(?i)(SessionIdleEvent|dropped|lost.*event|event.*dropped|zero.idle|terminal event)"
+ - type: "output_matches"
+ pattern: "(?i)(Case B|CompleteResponse.*clean|cleanly.*complete|tools.*done|!hasActiveTool)"
+ - type: "output_not_contains"
+ value: "that's fine"
+ rubric:
+ - "Agent warns against the flat 600s wait and explains that tools may have finished (ActiveToolCallCount==0, HasUsedToolsThisTurn==true) but the SDK dropped SessionIdleEvent — leading to a 10-minute kill of a successfully-completed session"
+ - "Agent describes the correct 3-way branch: Case A (active tools + server alive → reset timer), Case B (tools done, used tools → call CompleteResponse cleanly), Case C (default kill with error message)"
+ - "Agent explains that Case B calls CompleteResponse cleanly without an error message, preserving the session's output"
+ timeout: 120
+
+ - name: "Edge case - adding cross-thread boolean without Volatile (INV-7)"
+ prompt: |
+ I'm adding a new boolean flag `IsWaitingForUserInput` to AgentSessionInfo that gets
+ set to true from an SDK event handler (background thread) and read by the watchdog
+ timer. I'll just use a regular bool property like the existing HasUsedToolsThisTurn.
+ Should I also clear it in the IsProcessing cleanup paths?
+ assertions:
+ - type: "output_matches"
+ # Must mention Volatile or memory model concern
+ pattern: "(?i)(Volatile\\.Write|Volatile\\.Read|volatile|memory.model|ARM|thread.safe)"
+ - type: "output_matches"
+ # Must address the cleanup question positively
+ pattern: "(?i)(yes|clear|cleanup|checklist|IsProcessing.*path)"
+ - type: "output_matches"
+ # Should mention the existing inconsistency or tracking comment requirement
+ pattern: "(?i)(HasUsedToolsThisTurn|plain.assignment|inconsisten|tracking.comment|pre.existing)"
+ rubric:
+ - "Agent answers yes to the cleanup question — new fields read by the watchdog must be cleared on all IsProcessing=false paths"
+ - "Agent warns about using plain bool for cross-thread access and recommends Volatile.Write/Volatile.Read for ARM correctness (INV-7)"
+ - "Agent notes the existing inconsistency (HasUsedToolsThisTurn uses plain assignment) and advises not to repeat it — either use Volatile or add a tracking comment explaining the gap"
+ - "Agent identifies that both SendPromptAsync and RestoreSingleSessionAsync need to initialize the new field (INV-9 dual-path requirement)"
+ timeout: 120
+
+ - name: "No InvokeAsync inside HandleComplete - use synchronous completion (INV-8)"
+ prompt: |
+ I'm modifying HandleComplete in CopilotService to notify the UI after finalizing
+ a response. I want to call `await InvokeAsync(StateHasChanged)` inside the method
+ to trigger a Blazor render refresh. HandleComplete is called via Invoke() from the
+ SDK event handler. Is this pattern correct?
+ assertions:
+ - type: "output_matches"
+ pattern: "(?i)(do not|don't|avoid|should not|wrong|problem|stale|deferred)"
+ - type: "output_matches"
+ # Should explain why InvokeAsync defers execution
+ pattern: "(?i)(defer|async.*defer|already.*UI|UI.*thread|synchronous|stale.*render)"
+ - type: "output_matches"
+ # Should mention StateHasChanged or OnStateChanged as the correct approach
+ pattern: "(?i)(StateHasChanged|OnStateChanged|direct.*call|synchronous)"
+ - type: "output_not_contains"
+ value: "await InvokeAsync"
+ # Should not recommend InvokeAsync inside HandleComplete
+ rubric:
+ - "Agent warns against using InvokeAsync inside HandleComplete, explaining it defers execution and causes stale renders because HandleComplete already runs on the UI thread"
+ - "Agent explains that HandleComplete is dispatched via Invoke() (synchronous), so it is already on the UI thread and can call StateHasChanged or OnStateChanged directly"
+ - "Agent proposes the correct synchronous approach that avoids the deferred render issue"
+ timeout: 120
diff --git a/PolyPilot.Tests/ChatDatabaseResilienceTests.cs b/PolyPilot.Tests/ChatDatabaseResilienceTests.cs
index c0e88dc02d..951b54e39a 100644
--- a/PolyPilot.Tests/ChatDatabaseResilienceTests.cs
+++ b/PolyPilot.Tests/ChatDatabaseResilienceTests.cs
@@ -261,12 +261,15 @@ void handler(object? s, UnobservedTaskExceptionEventArgs e)
_ = db.UpdateToolCompleteAsync("session-1", "tool-1", "result", true);
_ = db.UpdateReasoningContentAsync("session-1", "reason-1", "content", true);
- // Give tasks time to complete and finalize
+ // Give tasks time to complete and finalize — multiple GC cycles
+ // needed to reliably trigger UnobservedTaskException under load
+ await Task.Delay(500);
+ for (int i = 0; i < 3; i++)
+ {
+ GC.Collect(2, GCCollectionMode.Forced, blocking: true);
+ GC.WaitForPendingFinalizers();
+ }
await Task.Delay(200);
- GC.Collect();
- GC.WaitForPendingFinalizers();
- GC.Collect();
- await Task.Delay(100);
Assert.False(unobservedException,
"Fire-and-forget ChatDatabase calls must not produce unobserved task exceptions");
diff --git a/PolyPilot.Tests/SessionOrganizationTests.cs b/PolyPilot.Tests/SessionOrganizationTests.cs
index e123df7a22..dee8ce0597 100644
--- a/PolyPilot.Tests/SessionOrganizationTests.cs
+++ b/PolyPilot.Tests/SessionOrganizationTests.cs
@@ -1370,13 +1370,13 @@ public void BuiltInPresets_IncludeSkillValidator()
{
var skillValidator = GroupPreset.BuiltIn.FirstOrDefault(p => p.Name == "Skill Validator");
Assert.NotNull(skillValidator);
- Assert.Equal(2, skillValidator!.WorkerModels.Length);
+ Assert.Equal(3, skillValidator!.WorkerModels.Length);
Assert.Equal(MultiAgentMode.OrchestratorReflect, skillValidator.Mode);
Assert.Equal("⚖️", skillValidator.Emoji);
Assert.NotNull(skillValidator.SharedContext);
Assert.NotNull(skillValidator.RoutingContext);
Assert.NotNull(skillValidator.WorkerSystemPrompts);
- Assert.Equal(2, skillValidator.WorkerSystemPrompts!.Length);
+ Assert.Equal(3, skillValidator.WorkerSystemPrompts!.Length);
Assert.All(skillValidator.WorkerSystemPrompts, p => Assert.False(string.IsNullOrWhiteSpace(p)));
Assert.NotNull(skillValidator.MaxReflectIterations);
}
diff --git a/PolyPilot.Tests/WsBridgeIntegrationTests.cs b/PolyPilot.Tests/WsBridgeIntegrationTests.cs
index 6612ce4252..400587429a 100644
--- a/PolyPilot.Tests/WsBridgeIntegrationTests.cs
+++ b/PolyPilot.Tests/WsBridgeIntegrationTests.cs
@@ -27,13 +27,17 @@ private static int GetFreePort()
}
 /// <summary>
- /// Polls until a condition is true, with a timeout. Replaces fixed Task.Delay for reliability under load.
+ /// Polls until a condition is true, with a timeout. Throws TimeoutException on silent timeout
+ /// to surface flaky races instead of letting assertions fail on stale state.
 /// </summary>
 private static async Task WaitForAsync(Func<bool> condition, CancellationToken ct, int pollMs = 50, int maxMs = 4000)
{
var sw = System.Diagnostics.Stopwatch.StartNew();
while (!condition() && sw.ElapsedMilliseconds < maxMs)
await Task.Delay(pollMs, ct);
+
+ if (!condition())
+ throw new TimeoutException($"WaitForAsync condition not met within {maxMs}ms");
}
public WsBridgeIntegrationTests()
@@ -677,7 +681,7 @@ public async Task RenameSession_ViaClient_RenamesOnServer()
var client = await ConnectClientAsync(cts.Token);
await client.RenameSessionAsync("old-name", "new-name", cts.Token);
- await WaitForAsync(() => _copilot.GetSession("new-name") != null, cts.Token);
+ await WaitForAsync(() => _copilot.GetSession("new-name") != null, cts.Token, maxMs: 8000);
Assert.Null(_copilot.GetSession("old-name"));
Assert.NotNull(_copilot.GetSession("new-name"));
diff --git a/PolyPilot/Models/ModelCapabilities.cs b/PolyPilot/Models/ModelCapabilities.cs
index c8773e8695..deef5b238b 100644
--- a/PolyPilot/Models/ModelCapabilities.cs
+++ b/PolyPilot/Models/ModelCapabilities.cs
@@ -184,6 +184,13 @@ public record GroupPreset(string Name, string Description, string Emoji, MultiAg
 /// </summary>
public int? MaxReflectIterations { get; init; }
+    /// <summary>
+ /// Custom display names for workers, indexed to match WorkerModels.
+ /// E.g. ["dotnet-validator", "anthropic-validator", "eval-generator"].
+ /// Null = use default "worker-{i}" naming.
+    /// </summary>
+ public string?[]? WorkerDisplayNames { get; init; }
+
private const string WorkerReviewPrompt = """
You are a PR reviewer. When assigned a PR, follow this process:
@@ -346,62 +353,109 @@ public record GroupPreset(string Name, string Description, string Emoji, MultiAg
},
new GroupPreset(
- "Skill Validator", "Two evaluators assess your skill from different angles — dotnet empirical testing vs. Anthropic design review — orchestrator builds consensus",
+ "Skill Validator", "Three-phase skill evaluation: generate evals → empirical A/B testing → prompt design review → orchestrator builds consensus",
"⚖️", MultiAgentMode.OrchestratorReflect,
- "claude-opus-4.6", new[] { "claude-sonnet-4.6", "claude-sonnet-4.6" })
+ "claude-opus-4.6", new[] { "claude-sonnet-4.6", "claude-sonnet-4.6", "claude-sonnet-4.6" })
{
WorkerSystemPrompts = new[]
{
"""
- You are the Dotnet Skill Validator. You evaluate skills empirically using the methodology from the dotnet/skills skill-validator tool.
+ You are the Dotnet Skill Validator. You evaluate skills by actually RUNNING the `skill-validator` tool from dotnet/skills — not by guessing or theorizing.
+
+ ## STEP 1: Ensure skill-validator is available
+
+ Check if the binary exists, and download it if missing:
+ ```bash
+ if [ ! -x /tmp/skill-validator ]; then
+ echo "Downloading skill-validator..."
+ ARCH=$(uname -m)
+ case "$(uname -s)-${ARCH}" in
+ Darwin-arm64) PATTERN='skill-validator-osx-arm64.tar.gz' ;;
+ Darwin-x86_64) PATTERN='skill-validator-osx-x64.tar.gz' ;;
+ Linux-aarch64) PATTERN='skill-validator-linux-arm64.tar.gz' ;;
+ Linux-x86_64) PATTERN='skill-validator-linux-x64.tar.gz' ;;
+ *) echo "Unsupported platform: $(uname -s)-${ARCH}"; exit 1 ;;
+ esac
+ cd /tmp && gh release download skill-validator-nightly \
+ --repo dotnet/skills \
+ --pattern "$PATTERN" \
+ --clobber && \
+ tar xzf "$PATTERN"
+ fi
+ /tmp/skill-validator --help | head -5
+ ```
+ If `gh` is not available or the download fails, explain the failure and fall back to manual analysis (Step 3 only).
- ## Your Evaluation Approach
+ ## STEP 2: Run skill-validator against the skill
+
+ Run the tool against the skill directory. The skill directory must contain a `SKILL.md` file.
+
+ ```bash
+    /tmp/skill-validator <skill-dir> \
+ --runs 1 \
+ --model claude-sonnet-4.6 \
+ --verdict-warn-only \
+ --verbose \
+ --results-dir /tmp/skill-validator-results
+ ```
+
+ **Flags explained:**
+ - `--runs 1`: Single run for speed (the Anthropic evaluator covers qualitative depth)
+ - `--model claude-sonnet-4.6`: Use Sonnet for agent runs (fast, cost-effective)
+ - `--verdict-warn-only`: Don't fail hard on missing eval.yaml — report the gap instead
+ - `--verbose`: Show tool calls and agent events so we can see what happened
- For each skill under evaluation, perform these steps:
+ **If the skill has no `tests/eval.yaml`:** Report this as a finding. The tool will still run static profile analysis. Note this gap prominently in your verdict.
- ### 1. Inspect the Skill Definition
- - Read the SKILL.md file carefully
- - Read the skill's `tests/eval.yaml` if present
- - Identify what scenarios the skill claims to improve
+ **After the run:** Read the results:
+ ```bash
+ cat /tmp/skill-validator-results/summary.md 2>/dev/null
+ cat /tmp/skill-validator-results/results.json 2>/dev/null | head -100
+ ```
- ### 2. Empirical Baseline Comparison
- - For each scenario in the skill's eval.yaml (or infer representative scenarios from SKILL.md):
- a. Describe what the agent response would look like WITHOUT the skill (baseline)
- b. Describe what the agent response looks like WITH the skill
- c. Assess: token efficiency, tool call count, task completion, error rate
+ ## STEP 3: Analyze the tool output
- ### 3. Pairwise Comparative Judgment
- - Compare baseline vs. skill-augmented performance side-by-side
- - Apply position-swap bias mitigation: consider both orderings in your judgment
- - Rate improvement on a scale of -1 (degraded) to +1 (strong improvement) with confidence
+ Based on the ACTUAL tool output, assess:
+ - **Profile analysis**: Token count, section structure, code blocks (from tool's static analysis)
+ - **A/B comparison**: Did the skill improve agent performance? By how much?
+ - **Statistical significance**: Was the improvement score above the threshold?
+ - **Overfitting**: Did the tool flag any overfitting concerns?
+ - **Task completion**: Did assertions pass with the skill active?
- ### 4. Statistical Assessment
- - Estimate variance across the scenario set
- - Flag scenarios where the skill helps most vs. least
- - Identify scenarios where the skill might cause regressions
+ If the tool couldn't run (no eval.yaml, download failed, etc.), perform manual analysis:
+ - Read SKILL.md and estimate token count, structure quality
+ - Identify likely improvement scenarios and potential regressions
+ - Note that this is manual analysis, not empirical data
+
+ ## STEP 4: Verdict
- ### 5. Verdict
Format your verdict as:
```
## Dotnet Validator Verdict
+ **Tool Run**: ✅ Completed / ⚠️ Partial (no eval.yaml) / ❌ Failed (reason)
+ **Improvement Score**: X% [CI: low%, high%] (from tool) or N/A
**Overall Score**: X/10
**Confidence**: High / Medium / Low
**Verdict**: KEEP / IMPROVE / REMOVE
+ ### Tool Output Summary
+ - [key findings from skill-validator output]
+
### Strengths
- - [specific strengths with evidence]
+ - [specific strengths with evidence from tool output]
### Weaknesses
- - [specific weaknesses with evidence]
+ - [specific weaknesses with evidence from tool output]
### Suggested Improvements
- [concrete, actionable suggestions]
```
## Rules
- - Be data-driven: cite specific scenarios and observed behaviors
- - Focus on measurable outcomes: does it reduce tokens? improve task completion? reduce errors?
- - Don't recommend keeping a skill that shows no measurable improvement in your empirical analysis
+ - ALWAYS attempt to run the tool first. Do not skip to manual analysis.
+ - Include the actual tool output or error in your report — transparency matters.
+ - If the skill has no eval.yaml, recommend creating one and suggest specific scenarios.
+ - Be data-driven: cite tool metrics, not vibes.
""",
"""
@@ -455,20 +509,134 @@ b. Describe what the agent response looks like WITH the skill
- Be concrete: quote specific lines from SKILL.md when critiquing
- Focus on prompt quality, not empirical test results
- Suggest specific rewrites, not vague advice like "be clearer"
+
+ ## Optional: Live Trigger Testing with Claude Code
+ If `claude` CLI is available, test trigger accuracy with non-interactive prompts:
+ ```bash
+ # Check availability first
+ which claude 2>/dev/null && claude --version 2>&1
+ ```
+ If available and authenticated, run:
+ ```bash
+ # Positive test — should trigger the skill
+ claude -p "" --output-format text 2>&1 | head -50
+ # Negative test — should NOT trigger the skill
+ claude -p "" --output-format text 2>&1 | head -50
+ ```
+ If `claude` returns auth errors ("does not have access", "login again"), skip live
+ testing and note: "Claude Code CLI not authenticated — trigger testing skipped."
+ Do NOT retry auth failures. Fall back to manual analysis.
+ """,
+
+ """
+ You are the Eval Generator. You create `tests/eval.yaml` files for skills that don't have them, enabling empirical A/B validation by the skill-validator tool.
+
+ ## Your Job
+
+ Given a skill directory with SKILL.md, generate a comprehensive `eval.yaml` that tests whether the skill actually improves agent behavior.
+
+ ## STEP 1: Read the skill
+
+ Read the SKILL.md file and understand:
+ - What the skill teaches the agent to do
+ - What triggers should activate the skill
+ - What the agent should do differently WITH the skill vs WITHOUT it
+ - What mistakes the skill prevents
+
+ ## STEP 2: Check for existing eval.yaml
+
+ ```bash
+ ls <skill-dir>/tests/eval.yaml 2>/dev/null
+ ```
+ If one already exists, read it and IMPROVE it rather than replacing it.
+ If none exists, create the `tests/` directory and generate a new one.
+
+ ## STEP 3: Design scenarios
+
+ Create 3-5 scenarios covering:
+
+ 1. **Positive trigger (happy path)**: A prompt that SHOULD trigger the skill. Assert the agent follows the skill's key instructions.
+ 2. **Negative trigger**: A prompt that should NOT trigger the skill. Assert the agent doesn't mention skill-specific concepts.
+ 3. **Edge case**: A prompt that's ambiguous or borderline. The skill should help the agent handle it correctly.
+ 4. **Regression prevention**: A prompt that tests a specific mistake the skill warns about. Assert the agent avoids the mistake.
+
+ ### Assertion guidelines
+ - Use `output_contains` for key terms/patterns the skilled agent MUST produce
+ - Use `output_not_contains` for anti-patterns the skilled agent must avoid
+ - Use `output_matches` with regex for flexible pattern matching
+ - Keep assertions focused on BEHAVIOR differences, not exact wording (avoid overfitting)
+
+ ### Rubric guidelines
+ - Each rubric item should describe a QUALITY criterion the judge can evaluate
+ - Focus on outcomes ("correctly identified the issue") not vocabulary ("used the word X")
+ - Include 2-4 rubric items per scenario
+ - Rubric items are evaluated by pairwise LLM comparison (with-skill vs without-skill)
+
+ ## STEP 4: Write the eval.yaml
+
+ ```bash
+ mkdir -p <skill-dir>/tests
+ cat > <skill-dir>/tests/eval.yaml << 'EVALEOF'
+ scenarios:
+ - name: "Description of scenario"
+ prompt: "The exact prompt to send to the agent"
+ assertions:
+ - type: "output_contains"
+ value: "expected behavior indicator"
+ rubric:
+ - "Quality criterion for the judge"
+ timeout: 120
+ EVALEOF
+ ```
+
+ ## STEP 5: Validate the file
+
+ ```bash
+ cat <skill-dir>/tests/eval.yaml
+ # Verify it's valid YAML
+ python3 -c "import yaml; yaml.safe_load(open('<skill-dir>/tests/eval.yaml'))" 2>&1 || echo "YAML parse error"
+ ```
+
+ ## Output format
+
+ After writing the file, report:
+ ```
+ ## Eval Generator Report
+ **Skill**: [skill name]
+ **Scenarios Generated**: N
+ **File Written**: <skill-dir>/tests/eval.yaml
+
+ ### Scenarios
+ 1. [name] — [what it tests]
+ 2. [name] — [what it tests]
+ ...
+
+ ### Design Rationale
+ - [why these specific scenarios were chosen]
+ - [what behavioral differences they target]
+ ```
+
+ ## Rules
+ - ALWAYS write the file to disk. Your output is consumed by other workers.
+ - Focus on BEHAVIORAL differences — what changes when the skill is active?
+ - Avoid overfitting: don't assert the skill's exact vocabulary, assert the outcomes
+ - Keep prompts realistic — they should sound like real user requests
+ - Set reasonable timeouts (60-180s depending on complexity)
+ - Use `exit_success` assertion sparingly — it just checks for non-empty output
""",
},
SharedContext = """
## Skill Evaluation Standards
- Both evaluators assess the same skill and produce independent verdicts.
- A good skill must satisfy BOTH evaluators to be marked KEEP.
+ All three workers assess the same skill from different angles.
+ A good skill must satisfy the empirical validator AND the prompt reviewer to be marked KEEP.
### What makes a skill worth keeping
- Measurable improvement in task completion (Dotnet validator perspective)
- Clear, precise description that triggers reliably (Anthropic evaluator perspective)
+ - Comprehensive eval.yaml that tests real behavioral differences (Eval Generator perspective)
- Focused scope — does one thing well
- Actionable instructions that guide the agent without over-constraining it
- - Adequate test coverage
### What warrants IMPROVE
- Good intent but fixable gaps (bad trigger description, missing scenarios, ambiguous instructions)
@@ -480,37 +648,96 @@ A good skill must satisfy BOTH evaluators to be marked KEEP.
- Instructions that would cause regressions or confusion
### Consensus Rule
- - KEEP requires both evaluators to say KEEP or one KEEP + one IMPROVE
+ - KEEP requires both evaluators (dotnet-validator and anthropic-validator) to say KEEP or one KEEP + one IMPROVE
- IMPROVE if the evaluators disagree or both say IMPROVE
- REMOVE if either evaluator says REMOVE with strong evidence
+
+ ### eval.yaml Format Reference
+ ```yaml
+ scenarios:
+ - name: "Descriptive name"
+ prompt: "Prompt sent to the agent"
+ assertions:
+ - type: "output_contains"
+ value: "expected text"
+ - type: "output_not_contains"
+ value: "text that should not appear"
+ - type: "output_matches"
+ pattern: "regex"
+ rubric:
+ - "Quality criterion for pairwise LLM judge"
+ timeout: 120
+ ```
+ Assertion types: output_contains, output_not_contains, output_matches, output_not_matches, file_exists, file_contains, exit_success.
""",
RoutingContext = """
## Skill Validator Orchestration
- You orchestrate two skill evaluators who assess skills from different angles. Your role is to:
- 1. Dispatch the skill to BOTH evaluators simultaneously (or sequentially if needed)
- 2. Collect their verdicts
- 3. Identify where they agree and disagree
- 4. Build a consensus verdict explaining which suggestions to adopt and why
+ ⚠️ CRITICAL: You are a DISPATCHER. You delegate work ONLY via @worker:/@end blocks.
+ NEVER use task(), bash(), view(), grep(), or ANY tool yourself. NEVER read files, run commands, or do analysis.
+ If you catch yourself about to call a tool — STOP and write an @worker block instead.
+
+ ### Delegation Format (the ONLY way to assign work)
+
+ ```
+ @worker:Skill Validator-eval-generator
+ Your task: [detailed instructions]
+ @end
+ ```
+
+ You MUST use the FULL worker session name (e.g., "Skill Validator-eval-generator", not just "eval-generator").
+ Each response MUST contain @worker:/@end blocks. Text outside the blocks is your planning notes.
### Worker Names
- - **worker-1** = Dotnet Skill Validator (empirical, outcome-focused)
- - **worker-2** = Anthropic Skill Evaluator (prompt-design-focused)
+ - **Skill Validator-dotnet-validator** = Dotnet Skill Validator (runs `skill-validator` tool)
+ - **Skill Validator-anthropic-validator** = Anthropic Skill Evaluator (prompt analysis via Claude Code)
+ - **Skill Validator-eval-generator** = Eval Generator (creates tests/eval.yaml)
- ### Dispatch Pattern
- 1. **First dispatch**: Send the skill content to BOTH workers. Include the full SKILL.md content and eval.yaml if available.
- 2. **After both complete**: Compare their verdicts. Identify agreements (high confidence) and disagreements (requires judgment).
- 3. **Build consensus**: Produce a final report that explains which worker's suggestions are adopted, which are declined, and why.
+ ### 2-Phase Dispatch
+
+ **Phase 1 — Always dispatch eval-generator first:**
+
+ Write a single @worker block for eval-generator. Tell it the skill directory path and to generate tests/eval.yaml.
+ Do NOT check if eval.yaml exists yourself — eval-generator will check and skip if it already exists.
+
+ Example:
+ ```
+ @worker:Skill Validator-eval-generator
+ Generate tests/eval.yaml for the skill at: <skill directory path>
+ The skill is about: <one-line summary of what the skill teaches>
+ @end
+ ```
+
+ **Phase 2 — After eval-generator completes, dispatch both validators together:**
+
+ Write TWO @worker blocks in a single response:
+ ```
+ @worker:Skill Validator-dotnet-validator
+ Evaluate the skill at: <skill directory path>
+ Eval.yaml status: [generated by eval-generator / pre-existing]
+ @end
+
+ @worker:Skill Validator-anthropic-validator
+ Evaluate the skill at: <skill directory path>
+ Eval.yaml status: [generated by eval-generator / pre-existing]
+ @end
+ ```
+
+ **Phase 3 — After both evaluators complete, write the consensus report (no @worker blocks needed).**
### Consensus Report Format
```
## Skill Validator Consensus Report: [Skill Name]
### Summary
+ **Eval Generation**: ✅ Pre-existing / 🆕 Generated by eval-generator / ❌ None
**Dotnet Verdict**: KEEP/IMPROVE/REMOVE (X/10)
**Anthropic Verdict**: KEEP/IMPROVE/REMOVE (X/10)
**Consensus**: KEEP / IMPROVE / REMOVE
+ ### Eval Quality (if generated)
+ - [assessment of generated eval.yaml scenarios]
+
### Points of Agreement (High Confidence)
- [issues both evaluators flagged]
@@ -528,12 +755,15 @@ 4. Build a consensus verdict explaining which suggestions to adopt and why
```
### Rules
+ - NEVER use tools yourself — you are a message relay, not a worker
- Always explain WHY suggestions are adopted or declined
- Where evaluators disagree, explain the tradeoff and make a reasoned judgment
- Highlight suggestions both evaluators agree on as high-confidence improvements
- - NEVER emit [[GROUP_REFLECT_COMPLETE]] until both evaluators have responded and a consensus report is produced
+ - If eval-generator generated evals, note whether the Dotnet validator was able to use them
+ - NEVER emit [[GROUP_REFLECT_COMPLETE]] until all dispatched workers have responded and a consensus report is produced
""",
MaxReflectIterations = 6,
+ WorkerDisplayNames = new[] { "dotnet-validator", "anthropic-validator", "eval-generator" },
},
};
}
diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs
index bde7ecaf91..69357dddcb 100644
--- a/PolyPilot/Services/CopilotService.Events.cs
+++ b/PolyPilot/Services/CopilotService.Events.cs
@@ -828,6 +828,25 @@ private void FlushCurrentResponse(SessionState state)
state.CurrentResponse.Clear();
state.HasReceivedDeltasThisTurn = false;
+
+ // Early dispatch: if the orchestrator wrote @worker blocks in an intermediate sub-turn,
+ // resolve the TCS now so ParseTaskAssignments can run immediately. Without this, the
+ // orchestrator continues doing tool work itself for minutes before dispatch happens.
+ if (state.EarlyDispatchOnWorkerBlocks && state.ResponseCompletion != null)
+ {
+ var flushed = state.FlushedResponse.ToString();
+ if (System.Text.RegularExpressions.Regex.IsMatch(flushed, @"@worker:.+?\n[\s\S]+?@end", System.Text.RegularExpressions.RegexOptions.IgnoreCase))
+ {
+ Debug($"[DISPATCH] Early dispatch: @worker blocks detected in flushed text ({flushed.Length} chars) for '{state.Info.Name}'");
+ state.EarlyDispatchOnWorkerBlocks = false; // One-shot
+ // Build the full response the same way CompleteResponse does
+ var remaining = state.CurrentResponse.ToString();
+ var fullResponse = string.IsNullOrEmpty(remaining) ? flushed : flushed + "\n\n" + remaining;
+ state.FlushedResponse.Clear();
+ state.CurrentResponse.Clear();
+ state.ResponseCompletion.TrySetResult(fullResponse);
+ }
+ }
}
///
diff --git a/PolyPilot/Services/CopilotService.Organization.cs b/PolyPilot/Services/CopilotService.Organization.cs
index 86f313047e..50e64e8094 100644
--- a/PolyPilot/Services/CopilotService.Organization.cs
+++ b/PolyPilot/Services/CopilotService.Organization.cs
@@ -1066,6 +1066,9 @@ private async Task SendViaOrchestratorAsync(string groupId, List members
InvokeOnUI(() => OnOrchestratorPhaseChanged?.Invoke(groupId, OrchestratorPhase.Planning, null));
var planningPrompt = BuildOrchestratorPlanningPrompt(prompt, workerNames, group?.OrchestratorPrompt, group?.RoutingContext);
+ // Enable early dispatch for the non-reflect orchestrator flow too
+ if (_sessions.TryGetValue(orchestratorName, out var orchPlanning))
+ orchPlanning.EarlyDispatchOnWorkerBlocks = true;
var planResponse = await SendPromptAndWaitAsync(orchestratorName, planningPrompt, cancellationToken, originalPrompt: prompt);
// Phase 2: Parse task assignments from orchestrator response, with retry loop.
@@ -1165,6 +1168,9 @@ private async Task SendViaOrchestratorAsync(string groupId, List members
}
var results = await Task.WhenAll(workerTasks);
+ // After early dispatch, the orchestrator may still be doing tool work.
+ await WaitForSessionIdleAsync(orchestratorName, cancellationToken);
+
// Phase 4: Synthesize — send worker results back to orchestrator
InvokeOnUI(() => OnOrchestratorPhaseChanged?.Invoke(groupId, OrchestratorPhase.Synthesizing, null));
@@ -1218,6 +1224,7 @@ private string BuildOrchestratorPlanningPrompt(string userPrompt, List w
{
sb.AppendLine("You are a DISPATCHER ONLY. You do NOT have tools. You CANNOT write code, read files, or run commands yourself.");
sb.AppendLine("Your ONLY job is to write @worker/@end blocks that assign work to your workers.");
+ sb.AppendLine("⚠️ This is a NEW request. Even if your conversation history shows previous work on a similar topic, you MUST delegate FRESH tasks to your workers. Previous results may be stale.");
}
else
{
@@ -1263,7 +1270,10 @@ private static List DeduplicateAssignments(
internal static string BuildDelegationNudgePrompt(List workerNames)
{
var sb = new System.Text.StringBuilder();
- sb.AppendLine("Your previous response did not contain any @worker delegation blocks. You MUST delegate work to your workers.");
+ sb.AppendLine("⚠️ CRITICAL: Your previous response did not contain any @worker delegation blocks.");
+ sb.AppendLine();
+ sb.AppendLine("You are a DISPATCHER. You CANNOT do the work yourself. Even if you believe the work is already done from a previous conversation, you MUST delegate FRESH work to your workers NOW.");
+ sb.AppendLine("Any previous results are STALE — the user is requesting a NEW evaluation. Ignore prior results and dispatch fresh tasks.");
sb.AppendLine();
sb.AppendLine($"Available workers ({workerNames.Count}): {string.Join(", ", workerNames)}");
sb.AppendLine();
@@ -1273,7 +1283,7 @@ internal static string BuildDelegationNudgePrompt(List workerNames)
sb.AppendLine("Describe the task here on a separate line.");
sb.AppendLine("@end");
sb.AppendLine();
- sb.AppendLine("Do NOT do the work yourself. Output @worker blocks now.");
+ sb.AppendLine("Produce @worker blocks for ALL workers now. Do NOT explain, do NOT summarize previous work, ONLY output @worker blocks.");
return sb.ToString();
}
@@ -1340,6 +1350,48 @@ internal static List TryParseJsonAssignments(string response, Li
private record WorkerResult(string WorkerName, string? Response, bool Success, string? Error, TimeSpan Duration);
+ ///
+ /// Wait for a session to finish processing (go idle). Used after early dispatch
+ /// resolves the orchestrator's TCS while it's still doing tool work — we must
+ /// wait for it to go idle before sending the next prompt (synthesis).
+ /// Uses a short inactivity threshold: if no SDK events for 60s, the session is
+ /// likely stuck in a "zero-idle" state and will be aborted.
+ ///
+ private async Task WaitForSessionIdleAsync(string sessionName, CancellationToken ct, int timeoutSeconds = 300)
+ {
+ if (!_sessions.TryGetValue(sessionName, out var state))
+ return;
+ if (!state.Info.IsProcessing)
+ return;
+
+ Debug($"[DISPATCH] Waiting for '{sessionName}' to go idle before sending next prompt...");
+ var sw = System.Diagnostics.Stopwatch.StartNew();
+ const int inactivityThresholdSeconds = 60;
+
+ while (state.Info.IsProcessing && !ct.IsCancellationRequested && sw.Elapsed.TotalSeconds < timeoutSeconds)
+ {
+ // Check if the session has been quiet (no SDK events) for the inactivity threshold.
+ // This catches the "zero-idle" bug where the SDK never emits SessionIdleEvent.
+ var lastEventTicks = Interlocked.Read(ref state.LastEventAtTicks);
+ var secondsSinceLastEvent = (DateTime.UtcNow - new DateTime(lastEventTicks)).TotalSeconds;
+ if (secondsSinceLastEvent >= inactivityThresholdSeconds)
+ {
+ Debug($"[DISPATCH] '{sessionName}' no SDK events for {secondsSinceLastEvent:F0}s — aborting to proceed with synthesis");
+ await AbortSessionAsync(sessionName);
+ await Task.Delay(500, ct);
+ break;
+ }
+ await Task.Delay(1000, ct);
+ }
+ if (state.Info.IsProcessing)
+ {
+ Debug($"[DISPATCH] '{sessionName}' still processing after {sw.Elapsed.TotalSeconds:F1}s — aborting to allow synthesis");
+ await AbortSessionAsync(sessionName);
+ await Task.Delay(500, ct);
+ }
+ Debug($"[DISPATCH] '{sessionName}' now idle after {sw.Elapsed.TotalSeconds:F1}s");
+ }
+
private async Task ExecuteWorkerAsync(string workerName, string task, string originalPrompt, CancellationToken cancellationToken)
{
var sw = System.Diagnostics.Stopwatch.StartNew();
@@ -1371,6 +1423,7 @@ private async Task ExecuteWorkerAsync(string workerName, string ta
var workerPrompt = $"{identity}{worktreeNote}\n\nYour response will be collected and synthesized with other workers' responses.\n\n{sharedPrefix}## Original User Request (context)\n{originalPrompt}\n\n## Your Assigned Task\n{task}";
const int maxRetries = 2;
+ var dispatchTime = DateTime.UtcNow;
for (int attempt = 1; attempt <= maxRetries; attempt++)
{
try
@@ -1402,6 +1455,21 @@ private async Task ExecuteWorkerAsync(string workerName, string ta
}
}
+ // Fallback: if response is still empty, try extracting from chat history.
+ // The SDK may have streamed content via delta events that ended up in history
+ // even though FlushedResponse/CurrentResponse were empty (e.g., watchdog completion).
+ if (string.IsNullOrWhiteSpace(response) && _sessions.TryGetValue(workerName, out var histState))
+ {
+ var lastAssistant = histState.Info.History.ToArray()
+ .LastOrDefault(m => m.Role == "assistant" && !string.IsNullOrWhiteSpace(m.Content)
+ && m.Timestamp >= dispatchTime);
+ if (lastAssistant != null)
+ {
+ response = lastAssistant.Content;
+ Debug($"[DISPATCH] Worker '{workerName}' recovered {response!.Length} chars from chat history");
+ }
+ }
+
Debug($"[DISPATCH] Worker '{workerName}' completed (response len={response?.Length ?? 0}, elapsed={sw.Elapsed.TotalSeconds:F1}s)");
return new WorkerResult(workerName, response, true, null, sw.Elapsed);
}
@@ -2059,10 +2127,13 @@ public string GetEffectiveModel(string sessionName)
Debug($"[WorktreeStrategy] Creating {preset.WorkerModels.Length} workers with strategy={strategy}, repoId={repoId}");
for (int i = 0; i < preset.WorkerModels.Length; i++)
{
- var workerName = $"{teamName}-worker-{i + 1}";
+ var displayName = preset.WorkerDisplayNames != null && i < preset.WorkerDisplayNames.Length && preset.WorkerDisplayNames[i] != null
+ ? preset.WorkerDisplayNames[i]!
+ : $"worker-{i + 1}";
+ var workerName = $"{teamName}-{displayName}";
{ int suffix = 1;
while (_sessions.ContainsKey(workerName) || Organization.Sessions.Any(s => s.SessionName == workerName))
- workerName = $"{teamName}-worker-{i + 1}-{suffix++}";
+ workerName = $"{teamName}-{displayName}-{suffix++}";
}
var workerModel = preset.WorkerModels[i];
var workerWorkDir = workerWorkDirs[i] ?? orchWorkDir ?? workingDirectory;
@@ -2241,6 +2312,7 @@ private async Task SendViaOrchestratorReflectAsync(string groupId, List
return;
}
+ var leftoverPrompts = new List();
try
{
@@ -2297,6 +2369,12 @@ private async Task SendViaOrchestratorReflectAsync(string groupId, List
planPrompt = BuildReplanPrompt(reflectState.LastEvaluation ?? "Continue iterating.", workerNames, prompt, group.RoutingContext);
}
+ // Enable early dispatch: if the orchestrator writes @worker blocks in an intermediate
+ // sub-turn (before finishing all tool rounds), FlushCurrentResponse will resolve the
+ // TCS immediately. This prevents the orchestrator from doing all the work itself.
+ if (_sessions.TryGetValue(orchestratorName, out var orchState))
+ orchState.EarlyDispatchOnWorkerBlocks = true;
+
var planResponse = await SendPromptAndWaitAsync(orchestratorName, planPrompt, ct, originalPrompt: prompt);
var rawAssignments = ParseTaskAssignments(planResponse, workerNames);
Debug($"[DISPATCH] '{orchestratorName}' reflect plan parsed: {rawAssignments.Count} raw assignments from {workerNames.Count} workers. Iteration={reflectState.CurrentIteration}, Response length={planResponse.Length}");
@@ -2312,6 +2390,9 @@ private async Task SendViaOrchestratorReflectAsync(string groupId, List
AddOrchestratorSystemMessage(orchestratorName,
"⚠️ No @worker assignments parsed from orchestrator response. Retrying...");
var nudgePrompt = BuildDelegationNudgePrompt(workerNames);
+ // Re-enable early dispatch for the nudge attempt too
+ if (_sessions.TryGetValue(orchestratorName, out var nudgeState))
+ nudgeState.EarlyDispatchOnWorkerBlocks = true;
var nudgeResponse = await SendPromptAndWaitAsync(orchestratorName, nudgePrompt, ct, originalPrompt: prompt);
var nudgeAssignments = ParseTaskAssignments(nudgeResponse, workerNames);
Debug($"[DISPATCH] '{orchestratorName}' nudge parsed: {nudgeAssignments.Count} raw assignments. Response length={nudgeResponse.Length}");
@@ -2322,26 +2403,38 @@ private async Task SendViaOrchestratorReflectAsync(string groupId, List
}
else
{
- reflectState.ConsecutiveErrors++;
- if (reflectState.ConsecutiveErrors >= 3)
- {
- reflectState.IsStalled = true;
- reflectState.IsCancelled = true;
- break;
- }
- continue;
+ // Nudge also failed — the orchestrator refuses to delegate (likely due to
+ // stale history from a previous run). Force delegation by creating synthetic
+ // @worker blocks that assign the original prompt to each worker.
+ Debug($"[DISPATCH] Nudge failed, forcing delegation to all {workerNames.Count} workers");
+ AddOrchestratorSystemMessage(orchestratorName,
+ $"⚡ Orchestrator refused to delegate after nudge. Forcing dispatch to all {workerNames.Count} workers.");
+ assignments = workerNames.Select(w => new TaskAssignment(w, prompt)).ToList();
+ // Fall through to dispatch below
}
}
else
{
// Later iterations: orchestrator decided no more work needed —
- // but only declare GoalMet if no queued prompts produced @worker blocks
- if (queuedAssignments.Count == 0)
+ // but only declare GoalMet if all workers have been dispatched
+ var allDispatched = workerNames.All(w => dispatchedWorkers.Contains(w));
+ var allAttempted = workerNames.All(w => attemptedWorkers.Contains(w));
+ if (queuedAssignments.Count == 0 && (allDispatched || allAttempted))
{
reflectState.GoalMet = true;
AddOrchestratorSystemMessage(orchestratorName, $"✅ Orchestrator completed without delegation (iteration {reflectState.CurrentIteration}).");
break;
}
+ if (!allDispatched && queuedAssignments.Count == 0)
+ {
+ // Not all workers dispatched — force dispatch remaining workers
+ var remaining = workerNames.Where(w => !dispatchedWorkers.Contains(w) && !attemptedWorkers.Contains(w)).ToList();
+ Debug($"[DISPATCH] Iteration {reflectState.CurrentIteration}: 0 assignments but {remaining.Count} workers never dispatched — forcing: {string.Join(", ", remaining)}");
+ AddOrchestratorSystemMessage(orchestratorName,
+ $"⚡ Forcing dispatch to {remaining.Count} remaining worker(s): {string.Join(", ", remaining)}");
+ assignments = remaining.Select(w => new TaskAssignment(w, prompt)).ToList();
+ // Fall through to dispatch below
+ }
// Fall through to merge and dispatch queued work
}
}
@@ -2390,6 +2483,10 @@ private async Task SendViaOrchestratorReflectAsync(string groupId, List
foreach (var r in results.Where(r => r.Success))
dispatchedWorkers.Add(r.WorkerName);
+ // After early dispatch, the orchestrator may still be doing tool work.
+ // Wait for it to go idle before sending the synthesis prompt.
+ await WaitForSessionIdleAsync(orchestratorName, ct);
+
// Phase 4: Synthesize + Evaluate
InvokeOnUI(() => OnOrchestratorPhaseChanged?.Invoke(groupId, OrchestratorPhase.Synthesizing, iterDetail));
@@ -2418,13 +2515,16 @@ private async Task SendViaOrchestratorReflectAsync(string groupId, List
var evaluatorModel = GetEffectiveModel(evaluatorName);
var trend = reflectState.RecordEvaluation(reflectState.CurrentIteration, score, rationale, evaluatorModel);
- // Check if evaluator says complete — but only if all workers have participated successfully
+ // Check if evaluator says complete
+ var allWorkersAttempted = workerNames.All(w => attemptedWorkers.Contains(w));
if ((evalResponse.Contains("[[GROUP_REFLECT_COMPLETE]]", StringComparison.OrdinalIgnoreCase) || score >= 0.9)
- && allWorkersDispatched)
+ && (allWorkersDispatched || allWorkersAttempted))
{
+ var partial = !allWorkersDispatched && allWorkersAttempted;
reflectState.GoalMet = true;
reflectState.IsActive = false;
- AddOrchestratorSystemMessage(orchestratorName, $"✅ {reflectState.BuildCompletionSummary()} (score: {score:F1})");
+ var suffix = partial ? " (some workers failed but all were attempted)" : "";
+ AddOrchestratorSystemMessage(orchestratorName, $"✅ {reflectState.BuildCompletionSummary()} (score: {score:F1}){suffix}");
break;
}
@@ -2435,7 +2535,7 @@ private async Task SendViaOrchestratorReflectAsync(string groupId, List
var neverDispatched = missing.Where(w => !attemptedWorkers.Contains(w)).ToList();
var detail = neverDispatched.Count > 0
? $"Not yet dispatched: {string.Join(", ", neverDispatched)}."
- : $"Dispatched but failed: {string.Join(", ", failedButAttempted)}. Retry or address errors.";
+ : $"Dispatched but failed: {string.Join(", ", failedButAttempted)}. Will retry next iteration.";
Debug($"Reflection: overriding completion — {detail}");
reflectState.LastEvaluation = $"{detail} Dispatch to the remaining workers before completing.";
AddOrchestratorSystemMessage(orchestratorName, $"🔄 Overriding completion — {detail}");
@@ -2452,18 +2552,21 @@ private async Task SendViaOrchestratorReflectAsync(string groupId, List
{
synthesisResponse = await SendPromptAndWaitAsync(orchestratorName, synthEvalPrompt, ct, originalPrompt: prompt);
- // Check completion sentinel — but only if all workers have participated successfully
+ // Check completion sentinel
+ var allWorkersAttempted = workerNames.All(w => attemptedWorkers.Contains(w));
if (synthesisResponse.Contains("[[GROUP_REFLECT_COMPLETE]]", StringComparison.OrdinalIgnoreCase)
- && allWorkersDispatched)
+ && (allWorkersDispatched || allWorkersAttempted))
{
+ var partial = !allWorkersDispatched && allWorkersAttempted;
reflectState.GoalMet = true;
reflectState.IsActive = false;
- AddOrchestratorSystemMessage(orchestratorName, $"✅ {reflectState.BuildCompletionSummary()}");
+ var suffix = partial ? " (some workers failed but all were attempted)" : "";
+ AddOrchestratorSystemMessage(orchestratorName, $"✅ {reflectState.BuildCompletionSummary()}{suffix}");
break;
}
if (synthesisResponse.Contains("[[GROUP_REFLECT_COMPLETE]]", StringComparison.OrdinalIgnoreCase)
- && !allWorkersDispatched)
+ && !allWorkersAttempted)
{
// Override premature completion — not all workers have participated
var missing = workerNames.Where(w => !dispatchedWorkers.Contains(w)).ToList();
@@ -2559,25 +2662,16 @@ private async Task SendViaOrchestratorReflectAsync(string groupId, List
reflectState.IsActive = false;
reflectState.CompletedAt = DateTime.Now;
ClearPendingOrchestration();
- // Send any remaining queued prompts to the orchestrator before releasing the lock.
- // These were acknowledged to the user ("📨 queued") but the loop is exiting —
- // sending them ensures the model sees them rather than silently discarding.
+ // Collect leftover queued prompts synchronously for best-effort delivery
+ // AFTER releasing the semaphore. This prevents holding the semaphore forever
+ // if SendPromptAndWaitAsync blocks on a broken SDK connection (e.g., StreamJsonRpc
+ // serialization errors during cancellation). See PR #323.
if (_reflectQueuedPrompts.TryGetValue(groupId, out var remainingQueue))
{
while (remainingQueue.TryDequeue(out var leftover))
- {
- try
- {
- Debug($"[DISPATCH] Sending leftover queued prompt on loop exit (len={leftover.Length})");
- await SendPromptAndWaitAsync(orchestratorName,
- $"[User sent a message — the reflection loop has completed]\n\n{leftover}", ct, originalPrompt: leftover);
- }
- catch (Exception ex)
- {
- Debug($"[DISPATCH] Failed to send leftover queued prompt: {ex.Message}");
- }
- }
+ leftoverPrompts.Add(leftover);
}
+
SaveOrganization();
InvokeOnUI(() =>
{
@@ -2591,6 +2685,31 @@ await SendPromptAndWaitAsync(orchestratorName,
{
loopLock.Release();
}
+
+ // Fire-and-forget: deliver leftover prompts after semaphore is released.
+ // If delivery fails (broken connection), prompts are still visible in chat history.
+ if (leftoverPrompts.Count > 0)
+ {
+ var orchName = orchestratorName;
+ SafeFireAndForget(Task.Run(async () =>
+ {
+ foreach (var leftover in leftoverPrompts)
+ {
+ try
+ {
+ Debug($"[DISPATCH] Sending leftover queued prompt after loop exit (len={leftover.Length})");
+ using var cleanupCts = new CancellationTokenSource(TimeSpan.FromSeconds(30));
+ await SendPromptAndWaitAsync(orchName,
+ $"[User sent a message — the reflection loop has completed]\n\n{leftover}",
+ cleanupCts.Token, originalPrompt: leftover);
+ }
+ catch (Exception ex)
+ {
+ Debug($"[DISPATCH] Failed to send leftover queued prompt: {ex.Message}");
+ }
+ }
+ }), "leftover-prompt-delivery");
+ }
}
private string BuildSynthesisWithEvalPrompt(string originalPrompt, List results, ReflectionCycle state,
@@ -2654,6 +2773,7 @@ private string BuildReplanPrompt(string lastEvaluation, List workerNames
var sb = new System.Text.StringBuilder();
sb.AppendLine("You are a DISPATCHER ONLY. You do NOT have tools. You CANNOT write code, read files, or run commands yourself.");
sb.AppendLine("Your ONLY job is to write @worker/@end blocks that assign work to your workers.");
+ sb.AppendLine("Even if previous conversation history shows completed work, you MUST dispatch FRESH tasks now.");
sb.AppendLine();
sb.AppendLine("## Previous Iteration Evaluation");
sb.AppendLine(lastEvaluation);
diff --git a/PolyPilot/Services/CopilotService.cs b/PolyPilot/Services/CopilotService.cs
index 9bdb631d78..86ad3ade16 100644
--- a/PolyPilot/Services/CopilotService.cs
+++ b/PolyPilot/Services/CopilotService.cs
@@ -449,6 +449,12 @@ private class SessionState
/// Used for diagnostic logging: when the next TurnEnd re-arms the fallback, the log shows
/// the self-healing loop in action (TurnEnd → TurnStart cancel → TurnEnd re-arm).
public volatile bool FallbackCanceledByTurnStart;
+ /// When true, FlushCurrentResponse checks accumulated FlushedResponse for
+ /// @worker:...@end blocks after each sub-turn. If found, resolves ResponseCompletion
+ /// early so orchestrator dispatch can proceed without waiting for the model to finish
+ /// all its tool rounds. This prevents the orchestrator from doing all the work itself
+ /// when it has tool access and ignores "dispatcher only" instructions.
+ public bool EarlyDispatchOnWorkerBlocks;
/// Timer that fires shortly after a tool starts to verify the connection is still alive.
/// If no tool completion event arrives within ToolHealthCheckIntervalMs, we do an active health
/// check to detect dead connections early (instead of waiting for the 600s watchdog timeout).