From 2913f81c697e449f6f8dc527af3155a57f90b609 Mon Sep 17 00:00:00 2001 From: Shane Neuville Date: Mon, 16 Mar 2026 18:39:15 -0500 Subject: [PATCH 1/3] Fix premature idle: check BackgroundTasks before completing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SDK's SessionIdleEvent includes a BackgroundTasks payload with agents[] and shells[] arrays. When background tasks are active, session.idle means 'foreground quiesced, background still running' — NOT true completion. Previously, PolyPilot unconditionally called CompleteResponse on every SessionIdleEvent, then tried to repair the damage with EVT-REARM, PrematureIdleSignal, and recovery heuristics. This caused 90% of multi-agent worker results to be truncated (44/49 recoveries failed to collect full content, median gap to next TurnStart was 51.8s vs the 15s freshness window). The fix: inspect idle.Data.BackgroundTasks via reflection before calling CompleteResponse. If agents or shells are active, flush accumulated text and wait for the real idle (no background tasks). This eliminates the root cause rather than patching symptoms. The existing safety nets (EVT-REARM, watchdog, recovery) are kept as defense-in-depth for edge cases where BackgroundTasks is null. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PolyPilot.Tests/BackgroundTasksIdleTests.cs | 106 ++++++++++++++++++++ PolyPilot/Services/CopilotService.Events.cs | 53 +++++++++- 2 files changed, 158 insertions(+), 1 deletion(-) create mode 100644 PolyPilot.Tests/BackgroundTasksIdleTests.cs diff --git a/PolyPilot.Tests/BackgroundTasksIdleTests.cs b/PolyPilot.Tests/BackgroundTasksIdleTests.cs new file mode 100644 index 0000000000..ca3eafaed2 --- /dev/null +++ b/PolyPilot.Tests/BackgroundTasksIdleTests.cs @@ -0,0 +1,106 @@ +using GitHub.Copilot.SDK; +using PolyPilot.Services; + +namespace PolyPilot.Tests; + +/// +/// Tests for HasActiveBackgroundTasks — the fix that prevents premature idle completion +/// when the SDK reports active background tasks (sub-agents, shells) in SessionIdleEvent. +/// See: session.idle with backgroundTasks means "foreground quiesced, background still running." +/// +public class BackgroundTasksIdleTests +{ + private static SessionIdleEvent MakeIdle(SessionIdleDataBackgroundTasks? bt = null) + { + return new SessionIdleEvent + { + Data = new SessionIdleData { BackgroundTasks = bt } + }; + } + + [Fact] + public void HasActiveBackgroundTasks_NullBackgroundTasks_ReturnsFalse() + { + var idle = MakeIdle(bt: null); + Assert.False(CopilotService.HasActiveBackgroundTasks(idle)); + } + + [Fact] + public void HasActiveBackgroundTasks_EmptyBackgroundTasks_ReturnsFalse() + { + var idle = MakeIdle(new SessionIdleDataBackgroundTasks + { + Agents = Array.Empty(), + Shells = Array.Empty() + }); + Assert.False(CopilotService.HasActiveBackgroundTasks(idle)); + } + + [Fact] + public void HasActiveBackgroundTasks_WithAgents_ReturnsTrue() + { + var idle = MakeIdle(new SessionIdleDataBackgroundTasks + { + Agents = new[] + { + new SessionIdleDataBackgroundTasksAgentsItem + { + AgentId = "agent-42", + AgentType = "copilot", + Description = "PR reviewer" + } + }, + Shells = Array.Empty() + }); + Assert.True(CopilotService.HasActiveBackgroundTasks(idle)); + } + + [Fact] + public void HasActiveBackgroundTasks_WithShells_ReturnsTrue() + { + var idle = MakeIdle(new SessionIdleDataBackgroundTasks + { + Agents = Array.Empty(), + Shells = new[] + { + new SessionIdleDataBackgroundTasksShellsItem + { + ShellId = "shell-1", + Description = "Running npm test" + } + } + }); + Assert.True(CopilotService.HasActiveBackgroundTasks(idle)); + } + + [Fact] + public void HasActiveBackgroundTasks_WithBothAgentsAndShells_ReturnsTrue() + { + var idle = MakeIdle(new SessionIdleDataBackgroundTasks + { + Agents = new[] + { + new SessionIdleDataBackgroundTasksAgentsItem + { + AgentId = "a1", AgentType = "copilot", Description = "" + } + }, + Shells = new[] + { + new SessionIdleDataBackgroundTasksShellsItem + { + ShellId = "s1", Description = "" + } + } + }); + Assert.True(CopilotService.HasActiveBackgroundTasks(idle)); + } + + [Fact] + public void HasActiveBackgroundTasks_DefaultIdle_ReturnsFalse() + { + // Default SessionIdleEvent — Data is auto-initialized but BackgroundTasks is null + var idle = new SessionIdleEvent { Data = new SessionIdleData() }; + Assert.False(CopilotService.HasActiveBackgroundTasks(idle)); + } +} diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs index bb9e94c128..d4826f742d 100644 --- a/PolyPilot/Services/CopilotService.Events.cs +++ b/PolyPilot/Services/CopilotService.Events.cs @@ -619,9 +619,28 @@ void Invoke(Action action) }); break; - case SessionIdleEvent: + case SessionIdleEvent idle: // Cancel the TurnEnd→Idle fallback — normal SessionIdleEvent arrived CancelTurnEndFallback(state); + + // KEY FIX: Check if the server reports active background tasks (sub-agents, shells). + // session.idle with background tasks means "foreground quiesced, background still running." + // Do NOT treat this as terminal — flush text and wait for the real idle. + if (HasActiveBackgroundTasks(idle)) + { + Debug($"[IDLE-DEFER] '{sessionName}' session.idle received with active background tasks — " + + $"deferring completion (IsProcessing={state.Info.IsProcessing}, " + + $"response={state.CurrentResponse.Length}+{state.FlushedResponse.Length} chars)"); + // Flush accumulated text at each idle boundary to prevent content loss + Invoke(() => + { + if (state.IsOrphaned) return; + FlushCurrentResponse(state); + NotifyStateChangedCoalesced(); + }); + break; // Don't complete — wait for next idle without background tasks + } + try { CompleteReasoningMessages(state, sessionName); } catch (Exception ex) { @@ -1829,6 +1848,38 @@ internal static bool IsShellEnvironmentFailure(string? text) return error.ToString(); } + /// + /// Check if a SessionIdleEvent reports active background tasks (agents or shells). + /// When background tasks are active, session.idle means "foreground quiesced, background + /// still running" — NOT true completion. Uses reflection to access the SDK's typed + /// BackgroundTasks payload safely. + /// + internal static bool HasActiveBackgroundTasks(SessionIdleEvent idle) + { + try + { + // SessionIdleEvent.Data : SessionIdleData + // SessionIdleData.BackgroundTasks : SessionIdleDataBackgroundTasks? + // SessionIdleDataBackgroundTasks.Agents : SessionIdleDataBackgroundTasksAgentsItem[]? + // SessionIdleDataBackgroundTasks.Shells : SessionIdleDataBackgroundTasksShellsItem[]? + var data = idle.GetType().GetProperty("Data")?.GetValue(idle); + if (data == null) return false; + + var bt = data.GetType().GetProperty("BackgroundTasks")?.GetValue(data); + if (bt == null) return false; + + var agents = bt.GetType().GetProperty("Agents")?.GetValue(bt) as Array; + var shells = bt.GetType().GetProperty("Shells")?.GetValue(bt) as Array; + + return (agents != null && agents.Length > 0) || (shells != null && shells.Length > 0); + } + catch + { + // Reflection failed — assume no background tasks (safe: CompleteResponse proceeds) + return false; + } + } + private void StartProcessingWatchdog(SessionState state, string sessionName) { CancelProcessingWatchdog(state); From ca9e6f5fac1f0d20109c60278691cd75b99afcde Mon Sep 17 00:00:00 2001 From: Shane Neuville Date: Mon, 16 Mar 2026 20:40:56 -0500 Subject: [PATCH 2/3] Replace reflection with direct property access, add Data==null test Review feedback: the SDK types are public, so reflection was unnecessary overhead (~1000x slower) with no compile-time safety. Also adds a test for the Data==null edge case. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PolyPilot.Tests/BackgroundTasksIdleTests.cs | 7 ++++++ PolyPilot/Services/CopilotService.Events.cs | 28 +++------------------ 2 files changed, 11 insertions(+), 24 deletions(-) diff --git a/PolyPilot.Tests/BackgroundTasksIdleTests.cs b/PolyPilot.Tests/BackgroundTasksIdleTests.cs index ca3eafaed2..96f4869d61 100644 --- a/PolyPilot.Tests/BackgroundTasksIdleTests.cs +++ b/PolyPilot.Tests/BackgroundTasksIdleTests.cs @@ -103,4 +103,11 @@ public void HasActiveBackgroundTasks_DefaultIdle_ReturnsFalse() var idle = new SessionIdleEvent { Data = new SessionIdleData() }; Assert.False(CopilotService.HasActiveBackgroundTasks(idle)); } + + [Fact] + public void HasActiveBackgroundTasks_DataNull_ReturnsFalse() + { + var idle = new SessionIdleEvent { Data = null! }; + Assert.False(CopilotService.HasActiveBackgroundTasks(idle)); + } } diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs index d4826f742d..9d7cc93f68 100644 --- a/PolyPilot/Services/CopilotService.Events.cs +++ b/PolyPilot/Services/CopilotService.Events.cs @@ -1851,33 +1851,13 @@ internal static bool IsShellEnvironmentFailure(string? text) /// /// Check if a SessionIdleEvent reports active background tasks (agents or shells). /// When background tasks are active, session.idle means "foreground quiesced, background - /// still running" — NOT true completion. Uses reflection to access the SDK's typed - /// BackgroundTasks payload safely. + /// still running" — NOT true completion. /// internal static bool HasActiveBackgroundTasks(SessionIdleEvent idle) { - try - { - // SessionIdleEvent.Data : SessionIdleData - // SessionIdleData.BackgroundTasks : SessionIdleDataBackgroundTasks? - // SessionIdleDataBackgroundTasks.Agents : SessionIdleDataBackgroundTasksAgentsItem[]? - // SessionIdleDataBackgroundTasks.Shells : SessionIdleDataBackgroundTasksShellsItem[]? - var data = idle.GetType().GetProperty("Data")?.GetValue(idle); - if (data == null) return false; - - var bt = data.GetType().GetProperty("BackgroundTasks")?.GetValue(data); - if (bt == null) return false; - - var agents = bt.GetType().GetProperty("Agents")?.GetValue(bt) as Array; - var shells = bt.GetType().GetProperty("Shells")?.GetValue(bt) as Array; - - return (agents != null && agents.Length > 0) || (shells != null && shells.Length > 0); - } - catch - { - // Reflection failed — assume no background tasks (safe: CompleteResponse proceeds) - return false; - } + var bt = idle.Data?.BackgroundTasks; + if (bt == null) return false; + return (bt.Agents is { Length: > 0 }) || (bt.Shells is { Length: > 0 }); } private void StartProcessingWatchdog(SessionState state, string sessionName) From 69216ce5efa974da322b4f24f9d1f71967b082ef Mon Sep 17 00:00:00 2001 From: Shane Neuville Date: Mon, 16 Mar 2026 21:31:16 -0500 Subject: [PATCH 3/3] Update skill docs and instructions for IDLE-DEFER (PR #399) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - multi-agent-orchestration SKILL.md: 4→5 phase lifecycle, new IDLE-DEFER section, fix INV-O3 ordering, new INV-O15, mark premature idle bug as FIXED - processing-state-safety SKILL.md: 10→16 paths table with tags, new INV-18 for BackgroundTasks, IDLE-DEFER in stuck session table, note EVT-REARM is now secondary defense, add PRs #373/#375/#399 to regression history - copilot-instructions.md: update SDK Event Flow step 9 for BackgroundTasks check, add [IDLE-DEFER] diagnostic tag, fix stale path count (8→15+), add BackgroundTasksIdleTests to test list Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../skills/multi-agent-orchestration/SKILL.md | 190 +++++++++++++++--- .../skills/processing-state-safety/SKILL.md | 79 ++++++-- .github/copilot-instructions.md | 8 +- 3 files changed, 225 insertions(+), 52 deletions(-) diff --git a/.claude/skills/multi-agent-orchestration/SKILL.md b/.claude/skills/multi-agent-orchestration/SKILL.md index 45753b2cef..aa9679692b 100644 --- a/.claude/skills/multi-agent-orchestration/SKILL.md +++ b/.claude/skills/multi-agent-orchestration/SKILL.md @@ -8,8 +8,10 @@ description: > persistence or resume logic, (4) Debugging orchestrator-worker communication failures, (5) Adding error handling around worker dispatch or completion, (6) Modifying OnSessionComplete coordination or TCS ordering, (7) Working with reflection loop - concurrency (semaphores, queued prompts). Covers: 4-phase dispatch lifecycle, restart - recovery via PendingOrchestration, worker failure patterns, and connection error handling. + concurrency (semaphores, queued prompts), (8) Modifying SessionIdleEvent handling + or IDLE-DEFER logic (BackgroundTasks check). Covers: 5-phase dispatch lifecycle, + IDLE-DEFER background task guard, restart recovery via PendingOrchestration, worker + failure patterns, and connection error handling. --- # Multi-Agent Orchestration — Invariants & Error Recovery @@ -40,7 +42,7 @@ stuck sessions, and coordination failures. --- -## The 4-Phase Orchestration Lifecycle +## The 5-Phase Orchestration Lifecycle Every orchestrator dispatch (single-pass and reflect) follows these phases: @@ -49,9 +51,10 @@ Every orchestrator dispatch (single-pass and reflect) follows these phases: │ Phase 1: PLAN │ │ ├── Orchestrator receives user prompt + worker list │ │ ├── Builds planning prompt with worker models/descriptions │ +│ ├── EarlyDispatchOnWorkerBlocks = true (resolve TCS mid-turn) │ │ ├── Orchestrator responds with @worker:name task blocks │ │ └── ParseTaskAssignments extracts → List │ -│ └── If no assignments: send nudge → retry parse │ +│ └── If no assignments: send nudge → retry parse (up to 3x)│ │ └── If still none: orchestrator handled directly │ └─────────────────────────────────────────────────────────────────┘ │ @@ -61,6 +64,7 @@ Every orchestrator dispatch (single-pass and reflect) follows these phases: │ ├── SavePendingOrchestration() — BEFORE dispatching │ │ ├── Fire OnOrchestratorPhaseChanged(Dispatching) │ │ ├── Launch worker tasks in parallel: Task.WhenAll(workers) │ +│ │ └── Workers staggered with 1s delay between dispatches │ │ └── Each worker gets: system prompt + original prompt + task │ └─────────────────────────────────────────────────────────────────┘ │ @@ -68,6 +72,7 @@ Every orchestrator dispatch (single-pass and reflect) follows these phases: ┌─────────────────────────────────────────────────────────────────┐ │ Phase 3: COLLECT (WaitingForWorkers) │ │ ├── Await all worker completions (10-min timeout each) │ +│ ├── WaitForSessionIdleAsync on orchestrator (early dispatch) │ │ ├── Collect WorkerResult[] with response, success, duration │ │ └── Failed workers: response = error message, success = false │ └─────────────────────────────────────────────────────────────────┘ @@ -79,24 +84,49 @@ Every orchestrator dispatch (single-pass and reflect) follows these phases: │ ├── Send to orchestrator for final response │ │ ├── ClearPendingOrchestration() — in finally block │ │ └── Fire OnOrchestratorPhaseChanged(Complete) │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ Phase 5: IDLE-DEFER (Background Task Safety) — PR #399 │ +│ ├── SessionIdleEvent handler checks HasActiveBackgroundTasks() │ +│ ├── If agents/shells active: flush text, log [IDLE-DEFER], │ +│ │ and break WITHOUT calling CompleteResponse │ +│ └── Only truly idle (no background tasks) → CompleteResponse │ +│ │ +│ This prevents premature TCS completion when workers are still │ +│ running as background agents/shells dispatched by the CLI. │ └─────────────────────────────────────────────────────────────────┘ ``` +> **Phase 5 (IDLE-DEFER) is not a sequential phase** — it's a guard in the +> `SessionIdleEvent` handler (Events.cs:622-642) that applies to ALL sessions. +> It's listed here because it fundamentally changed the premature idle story +> for orchestrator workers. See **IDLE-DEFER & BackgroundTasks** section below. + ### OrchestratorReflect: Extended Loop OrchestratorReflect wraps phases 1–4 in a loop with evaluation: ``` while (IsActive && !IsPaused && CurrentIteration < MaxIterations): + Drain _reflectQueuedPrompts (user messages during loop) Phase 1–4 (as above) Phase 5: EVALUATE ├── With evaluator: score + rationale → RecordEvaluation() - └── Self-eval: check for [[GROUP_REFLECT_COMPLETE]] sentinel + ├── Self-eval: check for [[GROUP_REFLECT_COMPLETE]] sentinel + └── AutoAdjustFromFeedback() detects quality degradation Phase 6: STALL DETECTION - ├── CheckStall() compares synthesis to previous + ├── CheckStall() compares synthesis to previous (Jaccard > 0.9) └── 2 consecutive stalls → IsStalled = true → break ``` +> **Sentinel note**: `ReflectionCycle.cs` defines `CompletionSentinel = "[[REFLECTION_COMPLETE]]"` +> (used by `IsGoalMet()` regex). The orchestrator prompts in `Organization.cs` use +> `[[GROUP_REFLECT_COMPLETE]]` for model-facing instructions. Both are checked, but +> via different mechanisms — `IsGoalMet()` for the ReflectionCycle, string.Contains +> for orchestrator-level evaluation. + --- ## PendingOrchestration — Restart Recovery @@ -233,24 +263,30 @@ even if dispatch throws. This prevents orphaned pending files. `OnSessionComplete` is fired when a session finishes processing (IsProcessing → false). Orchestrator loops use this to detect when workers finish. +**Signature**: `event Action? OnSessionComplete` — `(sessionName, summary)` + ### Ordering Invariant (from processing-state-safety) -**INV-O3: IsProcessing = false BEFORE TrySetResult** +**INV-O3: IsProcessing = false BEFORE TrySetResult BEFORE OnSessionComplete** ```csharp -// CORRECT ORDER in CompleteResponse: -state.Info.IsProcessing = false; // 1. Clear processing state -OnSessionComplete?.Invoke(name); // 2. Notify listeners -tcs.TrySetResult(response); // 3. Complete TCS (may run sync continuation) +// CORRECT ORDER in CompleteResponse (Events.cs ~line 1062-1080): +state.Info.IsProcessing = false; // 1. Clear processing state +state.ResponseCompletion?.TrySetResult(fullResponse);// 2. Complete TCS (may run sync continuation) +OnSessionComplete?.Invoke(name, summary); // 3. Notify listeners +OnStateChanged?.Invoke(); // 4. UI update ``` -If TrySetResult runs first, the reflection loop's synchronous continuation may see -`IsProcessing = true` and fail to send the next prompt. +If TrySetResult runs before IsProcessing=false, the reflection loop's synchronous +continuation may see `IsProcessing = true` and fail to send the next prompt. ### INV-O4: OnSessionComplete fired on ALL termination paths -All 8 paths that clear IsProcessing (see processing-state-safety) must also fire -OnSessionComplete. Otherwise, orchestrator loops waiting on workers hang forever. +All paths that clear IsProcessing (currently 19+ across Events.cs, CopilotService.cs, +Organization.cs, Bridge.cs, and Providers.cs) should fire OnSessionComplete. Otherwise, +orchestrator loops waiting on workers hang forever. Key invocation sites include: +CompleteResponse, SessionErrorEvent, watchdog timeout, watchdog crash recovery, +abort, steer error, and bridge completion. --- @@ -301,6 +337,7 @@ When modifying orchestration, verify: - [ ] **INV-O6**: Phase changes fire OnOrchestratorPhaseChanged for UI updates - [ ] **INV-O7**: Worker timeouts use 10-minute default (600s for resumed sessions) - [ ] **INV-O8**: Cancellation tokens propagated to all async operations +- [ ] **INV-O15**: IDLE-DEFER flushes CurrentResponse before breaking (content preservation) --- @@ -465,7 +502,9 @@ not from live TCS tracking. **Root cause**: Worker's OnSessionComplete wasn't fired (incomplete cleanup path). -**Mitigation**: Ensure all 9 IsProcessing=false paths fire OnSessionComplete. +**Mitigation**: Ensure all IsProcessing=false paths fire OnSessionComplete +(currently 19+ paths across Events.cs, CopilotService.cs, Organization.cs, +Bridge.cs, and Providers.cs). ### Bug: Reflection loop processes stale user message @@ -567,7 +606,7 @@ Conflict Tests (PR #375)" region: - Verify non-orchestrator sessions still get steered - Long-running orchestrator (15min) follow-up must queue -### Bug: Premature session.idle truncates orchestrator results (PR #375) +### Bug: Premature session.idle truncates orchestrator results (PR #375, FIXED in PR #399) **Symptom**: CLI sends `session.idle` prematurely mid-turn (after only a few tool rounds), then continues processing for 15+ more tool rounds. The @@ -580,20 +619,107 @@ missing the idle entirely, it sends it too early. The idle arrives, passes all generation guards, and CompleteResponse runs with whatever content has been flushed so far. -**Partial fix (UI)**: Added re-arm in `AssistantTurnStartEvent` handler: when -TurnStart arrives with `IsProcessing=false` on the current (non-orphaned) state, -re-arm IsProcessing, restart watchdog, log as `[EVT-REARM]`. This keeps the UI -showing "Working…" and the watchdog active. +**Primary fix (PR #399 — IDLE-DEFER)**: `SessionIdleEvent` handler now checks +`HasActiveBackgroundTasks(idle)` (Events.cs:629). If `BackgroundTasks.Agents` +or `BackgroundTasks.Shells` has any entries, the handler flushes accumulated +text but does NOT call `CompleteResponse` — it breaks early and logs +`[IDLE-DEFER]`. The next `SessionIdleEvent` without background tasks triggers +the real completion. This prevents TCS completion with partial content. -**Not fixed**: Orchestrator content truncation. The TCS was already completed -with partial content before re-arm fires. A future fix could create a NEW TCS -on re-arm so the orchestrator waits for the real completion. Complex — may -need separate PR. +**Defense-in-depth (still active)**: +- `[EVT-REARM]` (Events.cs:529): `AssistantTurnStartEvent` after premature idle + re-arms IsProcessing and sets `PrematureIdleSignal`. Keeps UI showing "Working…". +- `RecoverFromPrematureIdleIfNeededAsync` (Organization.cs:1759): Polls signal + + events.jsonl freshness to collect full response after TCS got partial content. +- `IsEventsFileActive` (Organization.cs:1946): Checks events.jsonl mtime < 15s. + +These defense layers handle edge cases where `BackgroundTasks` data isn't +present (older CLI versions, or non-agent premature idles). **Filed**: See GitHub issue for tracking. --- +## IDLE-DEFER & BackgroundTasks (PR #399) + +> **This is the primary fix for premature idle in multi-agent workers.** +> Before PR #399, premature `session.idle` events caused truncated worker +> responses. The defense-in-depth layers (`EVT-REARM`, `RecoverFromPrematureIdleIfNeededAsync`) +> mitigated but didn't fully fix the problem for orchestrator TCS completion. + +### How It Works + +The CLI's `SessionIdleEvent` now includes a `Data.BackgroundTasks` object with: +- `Agents` — array of active background agent processes +- `Shells` — array of active shell/terminal processes + +When a worker uses `task` tool to dispatch sub-agents, or runs shell commands, +these appear as background tasks. A `session.idle` with active background tasks +means "foreground processing quiesced, but background work is ongoing." + +### Code Path (Events.cs:622-642) + +```csharp +case SessionIdleEvent idle: + CancelTurnEndFallback(state); + + if (HasActiveBackgroundTasks(idle)) + { + Debug($"[IDLE-DEFER] ..."); + Invoke(() => { + if (state.IsOrphaned) return; + FlushCurrentResponse(state); // Preserve accumulated text + NotifyStateChangedCoalesced(); + }); + break; // Do NOT call CompleteResponse + } + + // Only reach here when truly idle (no background tasks) + CompleteResponse(state, idleGeneration); +``` + +### HasActiveBackgroundTasks (Events.cs:1856-1861) + +```csharp +internal static bool HasActiveBackgroundTasks(SessionIdleEvent idle) +{ + var bt = idle.Data?.BackgroundTasks; + if (bt == null) return false; + return (bt.Agents is { Length: > 0 }) || (bt.Shells is { Length: > 0 }); +} +``` + +### Impact on Multi-Agent Orchestration + +1. **Workers dispatching sub-agents**: Previously, these would trigger premature + completion when the foreground quiesced. Now correctly deferred. +2. **Workers running shell commands**: Same — deferred until shells complete. +3. **TCS ordering**: The `ResponseCompletion` TCS is no longer completed with + partial content for the common case. Only edge cases (old CLI, missing + BackgroundTasks data) fall through to the defense-in-depth layers. +4. **Content preservation**: `FlushCurrentResponse` is called on each deferred + idle, ensuring accumulated text is preserved even if the app restarts. + +### INV-O15: IDLE-DEFER Must Flush Before Breaking + +The `FlushCurrentResponse(state)` call inside the IDLE-DEFER block is critical. +Without it, accumulated response text in `state.CurrentResponse` would be lost +if another idle event arrives or the app restarts. The flush preserves content +into `state.FlushedResponse` and chat history. + +### Defense-in-Depth Layers (Still Active) + +These layers remain as fallbacks for edge cases where IDLE-DEFER doesn't fire: + +| Layer | Mechanism | When It Helps | +|-------|-----------|---------------| +| `[EVT-REARM]` | Re-arm IsProcessing on TurnStart after idle | Old CLI without BackgroundTasks | +| `PrematureIdleSignal` | ManualResetEventSlim set on re-arm | Signals ExecuteWorkerAsync to recover | +| `RecoverFromPrematureIdleIfNeededAsync` | Poll signal + events.jsonl freshness | Collect full response after partial TCS | +| `IsEventsFileActive` | events.jsonl mtime < 15s | Detect ongoing CLI activity | + +--- + ## "Fix with Copilot" — Multi-Agent Awareness ### Current State @@ -665,8 +791,8 @@ for w in 1 2 3 4 5; do [[ -n "$last" ]] && echo "W$w: $last" done -# 3. Any completions? -grep "$GROUP" ~/.polypilot/event-diagnostics.log | grep -E "IDLE|COMPLETE|DISPATCH.*completed" | tail -10 +# 3. Any completions or deferred idles? +grep "$GROUP" ~/.polypilot/event-diagnostics.log | grep -E "IDLE|IDLE-DEFER|COMPLETE|DISPATCH.*completed" | tail -10 # 4. Any errors? grep "$GROUP" ~/.polypilot/event-diagnostics.log | grep -E "ERROR|WATCHDOG" | tail -5 @@ -678,7 +804,7 @@ cat ~/.polypilot/pending-orchestration.json 2>/dev/null | head -3 || echo "(empt ### Monitoring Orchestrator Dispatch ```bash -grep "DISPATCH" ~/.polypilot/event-diagnostics.log | grep "$GROUP" | tail -20 +grep "DISPATCH\|IDLE-DEFER" ~/.polypilot/event-diagnostics.log | grep "$GROUP" | tail -20 ``` **Expected sequence for N-worker dispatch:** @@ -700,7 +826,8 @@ grep "DISPATCH" ~/.polypilot/event-diagnostics.log | grep "$GROUP" | tail -20 **Signs of healthy worker:** - TurnStart/TurnEnd pairs cycling every 2-30 seconds (tool rounds) -- Eventually a SessionIdleEvent → CompleteResponse → COMPLETE sequence +- `[IDLE-DEFER]` entries when worker has active sub-agents (expected, not stuck) +- Eventually a SessionIdleEvent (no background tasks) → CompleteResponse → COMPLETE - No [ERROR] or [WATCHDOG] entries **Signs of stuck worker:** @@ -743,9 +870,10 @@ Reflect mode runs multiple iterations. Expect this pattern: 2. **Worker Execution Phase**: - [ ] Each worker actively processes (TurnStart/TurnEnd cycling) + - [ ] Workers with sub-agents show [IDLE-DEFER] entries (expected) - [ ] Watchdog Case B correctly defers when events.jsonl is fresh - [ ] No [ERROR] entries - - [ ] Each worker eventually gets SessionIdleEvent → CompleteResponse + - [ ] Each worker eventually gets SessionIdleEvent (no BackgroundTasks) → CompleteResponse 3. **Collection Phase**: - [ ] After ALL workers complete, orchestrator synthesis triggered @@ -783,6 +911,8 @@ Reflect mode runs multiple iterations. Expect this pattern: | PendingOrchestration stale | `cat ~/.polypilot/pending-orchestration.json` | Finally block didn't run; check for crash | | All sessions die after reconnect | Check [RECONNECT] entries | IsOrphaned not set; see INV-O9 | | Orchestration hangs on reconnect | Check for missing TrySetCanceled | TCS not canceled; see INV-O9 | +| Many IDLE-DEFER entries | `grep "IDLE-DEFER" diagnostics.log` | Normal — worker has active sub-agents; wait for completion | +| IDLE-DEFER but worker never completes | Check if background tasks are leaking | Sub-agent/shell not terminating; check CLI logs | --- diff --git a/.claude/skills/processing-state-safety/SKILL.md b/.claude/skills/processing-state-safety/SKILL.md index 075bd7da50..b0ea59d26f 100644 --- a/.claude/skills/processing-state-safety/SKILL.md +++ b/.claude/skills/processing-state-safety/SKILL.md @@ -11,8 +11,8 @@ description: > (RestoreSingleSessionAsync) that must initialize watchdog-dependent state, (8) Modifying ReconcileOrganization or any code that reads Organization.Sessions during the IsRestoring window, (9) Session appears hung or unresponsive after tool use. - Covers: 13 invariants from 10 PRs of fix cycles, - the 9 code paths that clear IsProcessing, and common regression patterns. + Covers: 18 invariants from 13 PRs of fix cycles, + the 16 code paths that set/clear IsProcessing, and common regression patterns. --- # Processing State Safety @@ -34,33 +34,58 @@ Every code path that sets `IsProcessing = false` MUST also: 12. Run on UI thread (via `InvokeOnUI()` or already on UI thread) 13. After changes, run `ProcessingWatchdogTests.cs` to catch regressions -## The 10 Paths That Set/Clear IsProcessing +## The 16 Paths That Set/Clear IsProcessing ### Paths that CLEAR IsProcessing (→ false) -| # | Path | File | Thread | Notes | -|---|------|------|--------|-------| -| 1 | CompleteResponse | Events.cs | UI (via Invoke) | Normal completion | -| 2 | SessionErrorEvent | Events.cs | Background → InvokeOnUI | SDK error | -| 3 | Watchdog timeout (kill) | Events.cs | Timer → InvokeOnUI | No events for 120s/600s, server dead, or max time exceeded (Case C) | -| 4 | Watchdog clean complete | Events.cs | Timer → InvokeOnUI | Tools done, lost terminal event → calls CompleteResponse (Case B, PR #332) | -| 5 | AbortSessionAsync (local) | CopilotService.cs | UI | User clicks Stop | -| 6 | AbortSessionAsync (remote) | CopilotService.cs | UI | Mobile stop | -| 7 | SendAsync reconnect failure | CopilotService.cs | UI | Prompt send failed after reconnect | -| 8 | SendAsync initial failure | CopilotService.cs | UI | Prompt send failed | -| 9 | Bridge OnTurnEnd | Bridge.cs | Background → InvokeOnUI | Remote mode turn complete | +| # | Path | File | Thread | Tag | Notes | +|---|------|------|--------|-----|-------| +| 1 | CompleteResponse | Events.cs | UI (via Invoke) | `[COMPLETE]` | Normal completion | +| 2 | SessionErrorEvent | Events.cs | Background → InvokeOnUI | `[ERROR]` | SDK error | +| 3 | Watchdog timeout (kill) | Events.cs | Timer → InvokeOnUI | `[WATCHDOG]` | No events for 120s/600s, server dead, or max time exceeded (Case C) | +| 4 | Watchdog clean complete | Events.cs | Timer → InvokeOnUI | `[WATCHDOG]` | Tools done, lost terminal event → calls CompleteResponse (Case B, PR #332) | +| 5 | AbortSessionAsync (local) | CopilotService.cs | UI | `[ABORT]` | User clicks Stop | +| 6 | AbortSessionAsync (remote) | CopilotService.cs | UI | — | Mobile stop | +| 7 | SendAsync reconnect failure | CopilotService.cs | UI | `[ERROR]` | Prompt send failed after reconnect | +| 8 | SendAsync initial failure | CopilotService.cs | UI | `[ERROR]` | Prompt send failed | +| 9 | Bridge OnTurnEnd | Bridge.cs | Background → InvokeOnUI | `[BRIDGE-COMPLETE]` | Remote mode turn complete | +| 10 | Tool health recovery | Events.cs | Timer → InvokeOnUI | `[TOOL-HEALTH-COMPLETE]` | Dead connection detected by health check timer | +| 11 | Watchdog crash handler | Events.cs | Timer → InvokeOnUI | `[WATCHDOG-CRASH]` | Safety net when watchdog loop itself throws | +| 12 | Permission recovery failure | Events.cs | UI | `[PERMISSION-RECOVER]` | ClearProcessingStateForRecoveryFailure — recovery can't proceed | +| 13 | Permission recovery cleanup | Events.cs | UI | `[PERMISSION-RECOVER]` | After successful session resume, clears old state before resend | +| 14 | Steer error | CopilotService.cs | UI | `[STEER-ERROR]` | Soft steer SendAsync failure | +| 15 | ForceCompleteProcessing | Organization.cs | UI (via InvokeOnUI) | `[DISPATCH]` | Orchestration forces unstarted workers complete | + +Additional clearing paths exist in `CopilotService.Providers.cs` (4 paths for external +provider OnTurnEnd/OnError/OnMemberTurnEnd/OnMemberError) and `DemoService.cs` (1 path). +These follow simpler patterns and don't participate in the SDK event flow. ### Path that RE-ARMS IsProcessing (→ true) -| # | Path | File | Thread | Notes | -|---|------|------|--------|-------| -| 10 | TurnStart re-arm | Events.cs | Background → InvokeOnUI | Premature session.idle recovery (PR #375) | +| # | Path | File | Thread | Tag | Notes | +|---|------|------|--------|-----|-------| +| 16 | TurnStart re-arm | Events.cs | Background → InvokeOnUI | `[EVT-REARM]` | Premature session.idle recovery (PR #375) | -Path #10 fires when `AssistantTurnStartEvent` arrives with `IsProcessing=false` on the +Path #16 fires when `AssistantTurnStartEvent` arrives with `IsProcessing=false` on the current non-orphaned state. This detects premature `session.idle` (SDK sends idle mid-turn then continues). Re-arm sets `IsProcessing=true`, restarts the watchdog, and logs `[EVT-REARM]`. Does NOT create a new TCS — the old one was already completed with partial content. +> **Note:** EVT-REARM is now a **secondary defense**. The primary fix is IDLE-DEFER (PR #399), +> which prevents premature completion in the first place by checking `BackgroundTasks`. +> EVT-REARM remains as defense-in-depth for edge cases where `BackgroundTasks` is null. + +### Path that DEFERS completion (IsProcessing stays true) + +| Path | File | Tag | Notes | +|------|------|-----|-------| +| IDLE-DEFER | Events.cs | `[IDLE-DEFER]` | SessionIdleEvent with active background tasks (PR #399) | + +When `HasActiveBackgroundTasks(idle)` returns true (sub-agents or shells running), +`SessionIdleEvent` flushes text via `FlushCurrentResponse` but does NOT call +`CompleteResponse`. Processing stays active. The watchdog and future idle events +(without background tasks) handle eventual completion. + ## Content Persistence Safety ### Turn-End Flush @@ -77,7 +102,7 @@ that crash the app. The DB is a write-through cache; `events.jsonl` is the sourc and replays on session resume via `BulkInsertAsync`. DB write failures are self-healing. **NEVER narrow the ChatDatabase catch filters** — use `catch (Exception ex)` always. -## 13 Invariants +## 18 Invariants ### INV-1: Complete state cleanup Every IsProcessing=false path clears ALL fields. See checklist above. @@ -223,6 +248,20 @@ Both the primary reconnect path and the sibling loop must call: The primary path was missing these until PR #373 Round 5. Asymmetry between the sibling and primary reconnect configs is a recurring bug pattern. +### INV-18: SessionIdleEvent is not always terminal (PR #399) +`SessionIdleEvent` with active background tasks (`HasActiveBackgroundTasks()` returns true) +means "foreground quiesced, background still running" — NOT true completion. +This path: +1. Cancels the TurnEnd→Idle fallback timer +2. Flushes accumulated text via `FlushCurrentResponse` (preserves content) +3. Does **NOT** call `CompleteResponse` — `IsProcessing` stays `true` +4. Logs `[IDLE-DEFER]` with task counts + +The watchdog continues running and will eventually time out if background tasks never +finish. A subsequent `SessionIdleEvent` without background tasks completes normally. +**Do NOT** add `IsProcessing = false` to the IDLE-DEFER path — it would prematurely +complete the response while sub-agents are still working. + ## Top 5 Recurring Mistakes 1. **Incomplete cleanup** — modifying one IsProcessing path without @@ -271,6 +310,7 @@ When a session shows "Thinking..." indefinitely: |---------|-------------|-----| | `[SEND]` then silence | SDK never responded, watchdog will catch at 120s | Wait or abort | | `[EVT] TurnEnd` but no `[IDLE]` | Zero-idle SDK bug | Watchdog catches at 30s fallback (INV-10) | + | `[IDLE-DEFER]` then long silence | Background tasks (sub-agents/shells) active but never completed | Check agent status; watchdog will eventually catch (INV-18) | | `[COMPLETE]` fired but spinner persists | UI thread not notified | Check INV-2, INV-8 | | `[WATCHDOG]` clears but re-sticks | New turn started before watchdog callback ran | Check INV-3 generation guard | @@ -279,4 +319,5 @@ When a session shows "Thinking..." indefinitely: ## Regression History 10 PRs of fix/regression cycles: #141 → #147 → #148 → #153 → #158 → #163 → #164 → #276 → #284 → #332. +Additional safety PRs: #373 (orphaned state guards), #375 (premature idle re-arm), #399 (IDLE-DEFER for background tasks). See `references/regression-history.md` for the full timeline with root causes. diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 719d8ce787..9bbafad2ca 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -175,7 +175,7 @@ When a prompt is sent, the SDK emits events processed by `HandleSessionEvent` in 6. `ToolExecutionCompleteEvent` → tool done, increments `ToolCallCount` 7. `AssistantIntentEvent` → intent/plan updates 8. `AssistantTurnEndEvent` → end of a sub-turn, tool loop continues. `FlushCurrentResponse` persists accumulated text before the next sub-turn. -9. `SessionIdleEvent` → turn complete, response finalized +9. `SessionIdleEvent` → turn complete, response finalized. **Unless** `Data.BackgroundTasks` has active agents/shells — then flushes text, logs `[IDLE-DEFER]`, and keeps `IsProcessing=true` (PR #399). ### Processing Status Indicator `AgentSessionInfo` tracks three fields for the processing status UI: @@ -188,10 +188,10 @@ All three are reset in `SendPromptAsync` (new turn) and cleared in `CompleteResp The UI shows: "Sending…" → "Server connected…" → "Thinking…" → "Working · Xm Xs · N tool calls…". ### Abort Behavior -`AbortSessionAsync` must clear ALL processing state — see `.claude/skills/processing-state-safety/SKILL.md` for the full cleanup checklist and the 8 paths that clear `IsProcessing`. +`AbortSessionAsync` must clear ALL processing state — see `.claude/skills/processing-state-safety/SKILL.md` for the full cleanup checklist and the paths that clear `IsProcessing`. ### ⚠️ IsProcessing Cleanup Invariant -**CRITICAL**: Every code path that sets `IsProcessing = false` must clear 9 companion fields and call `FlushCurrentResponse`. This is the most recurring bug category (7 PRs of fix/regression cycles). **Read `.claude/skills/processing-state-safety/SKILL.md` before modifying ANY processing path.** There are 8 such paths across CopilotService.cs, Events.cs, and Bridge.cs. +**CRITICAL**: Every code path that sets `IsProcessing = false` must clear 9 companion fields and call `FlushCurrentResponse`. This is the most recurring bug category (13 PRs of fix/regression cycles). **Read `.claude/skills/processing-state-safety/SKILL.md` before modifying ANY processing path.** There are 15+ such paths across CopilotService.cs, Events.cs, Bridge.cs, Organization.cs, and Providers.cs. ### Content Persistence `FlushCurrentResponse` is also called on `AssistantTurnEndEvent` to persist accumulated response text at each sub-turn boundary. This prevents content loss if the app restarts between `turn_end` and `session.idle` (e.g., "zero-idle sessions" where the SDK never emits `session.idle`). The flush includes a dedup guard to prevent duplicate messages from event replay on resume. @@ -214,6 +214,7 @@ The event diagnostics log (`~/.polypilot/event-diagnostics.log`) uses these tags - `[SEND]` — prompt sent, IsProcessing set to true - `[EVT]` — SDK event received (only SessionIdleEvent, AssistantTurnEndEvent, SessionErrorEvent) - `[IDLE]` — SessionIdleEvent dispatched to CompleteResponse +- `[IDLE-DEFER]` — SessionIdleEvent deferred due to active background tasks (agents/shells) - `[COMPLETE]` — CompleteResponse executed or skipped - `[RECONNECT]` — session replaced after disconnect - `[ERROR]` — SessionErrorEvent or SendAsync/reconnect failure cleared IsProcessing @@ -307,6 +308,7 @@ Test files in `PolyPilot.Tests/`: - `ToolResultFormattingTests.cs` — Tool output formatting - `UiStatePersistenceTests.cs` — UI state save/load - `ProcessingWatchdogTests.cs` — Watchdog constants, timeout selection, HasUsedToolsThisTurn, IsResumed, abort clears queue and processing status +- `BackgroundTasksIdleTests.cs` — IDLE-DEFER background tasks handling, HasActiveBackgroundTasks - `CliPathResolutionTests.cs` — CLI path resolution - `InitializationModeTests.cs` — Mode initialization - `PersistentModeTests.cs` — Persistent mode behavior