diff --git a/.claude/skills/multi-agent-orchestration/SKILL.md b/.claude/skills/multi-agent-orchestration/SKILL.md index 45753b2cef..aa9679692b 100644 --- a/.claude/skills/multi-agent-orchestration/SKILL.md +++ b/.claude/skills/multi-agent-orchestration/SKILL.md @@ -8,8 +8,10 @@ description: > persistence or resume logic, (4) Debugging orchestrator-worker communication failures, (5) Adding error handling around worker dispatch or completion, (6) Modifying OnSessionComplete coordination or TCS ordering, (7) Working with reflection loop - concurrency (semaphores, queued prompts). Covers: 4-phase dispatch lifecycle, restart - recovery via PendingOrchestration, worker failure patterns, and connection error handling. + concurrency (semaphores, queued prompts), (8) Modifying SessionIdleEvent handling + or IDLE-DEFER logic (BackgroundTasks check). Covers: 5-phase dispatch lifecycle, + IDLE-DEFER background task guard, restart recovery via PendingOrchestration, worker + failure patterns, and connection error handling. --- # Multi-Agent Orchestration — Invariants & Error Recovery @@ -40,7 +42,7 @@ stuck sessions, and coordination failures. --- -## The 4-Phase Orchestration Lifecycle +## The 5-Phase Orchestration Lifecycle Every orchestrator dispatch (single-pass and reflect) follows these phases: @@ -49,9 +51,10 @@ Every orchestrator dispatch (single-pass and reflect) follows these phases: │ Phase 1: PLAN │ │ ├── Orchestrator receives user prompt + worker list │ │ ├── Builds planning prompt with worker models/descriptions │ +│ ├── EarlyDispatchOnWorkerBlocks = true (resolve TCS mid-turn) │ │ ├── Orchestrator responds with @worker:name task blocks │ │ └── ParseTaskAssignments extracts → List │ -│ └── If no assignments: send nudge → retry parse │ +│ └── If no assignments: send nudge → retry parse (up to 3x)│ │ └── If still none: orchestrator handled directly │ └─────────────────────────────────────────────────────────────────┘ │ @@ -61,6 +64,7 @@ Every orchestrator dispatch (single-pass and reflect) follows these phases: │ ├── SavePendingOrchestration() — BEFORE dispatching │ │ ├── Fire OnOrchestratorPhaseChanged(Dispatching) │ │ ├── Launch worker tasks in parallel: Task.WhenAll(workers) │ +│ │ └── Workers staggered with 1s delay between dispatches │ │ └── Each worker gets: system prompt + original prompt + task │ └─────────────────────────────────────────────────────────────────┘ │ @@ -68,6 +72,7 @@ Every orchestrator dispatch (single-pass and reflect) follows these phases: ┌─────────────────────────────────────────────────────────────────┐ │ Phase 3: COLLECT (WaitingForWorkers) │ │ ├── Await all worker completions (10-min timeout each) │ +│ ├── WaitForSessionIdleAsync on orchestrator (early dispatch) │ │ ├── Collect WorkerResult[] with response, success, duration │ │ └── Failed workers: response = error message, success = false │ └─────────────────────────────────────────────────────────────────┘ @@ -79,24 +84,49 @@ Every orchestrator dispatch (single-pass and reflect) follows these phases: │ ├── Send to orchestrator for final response │ │ ├── ClearPendingOrchestration() — in finally block │ │ └── Fire OnOrchestratorPhaseChanged(Complete) │ +└─────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ Phase 5: IDLE-DEFER (Background Task Safety) — PR #399 │ +│ ├── SessionIdleEvent handler checks HasActiveBackgroundTasks() │ +│ ├── If agents/shells active: flush text, log [IDLE-DEFER], │ +│ │ and break WITHOUT calling CompleteResponse │ +│ └── Only truly idle (no background tasks) → CompleteResponse │ +│ │ +│ This prevents premature TCS completion when workers are still │ +│ running as background agents/shells dispatched by the CLI. │ └─────────────────────────────────────────────────────────────────┘ ``` +> **Phase 5 (IDLE-DEFER) is not a sequential phase** — it's a guard in the +> `SessionIdleEvent` handler (Events.cs:622-642) that applies to ALL sessions. +> It's listed here because it fundamentally changed the premature idle story +> for orchestrator workers. See **IDLE-DEFER & BackgroundTasks** section below. + ### OrchestratorReflect: Extended Loop OrchestratorReflect wraps phases 1–4 in a loop with evaluation: ``` while (IsActive && !IsPaused && CurrentIteration < MaxIterations): + Drain _reflectQueuedPrompts (user messages during loop) Phase 1–4 (as above) Phase 5: EVALUATE ├── With evaluator: score + rationale → RecordEvaluation() - └── Self-eval: check for [[GROUP_REFLECT_COMPLETE]] sentinel + ├── Self-eval: check for [[GROUP_REFLECT_COMPLETE]] sentinel + └── AutoAdjustFromFeedback() detects quality degradation Phase 6: STALL DETECTION - ├── CheckStall() compares synthesis to previous + ├── CheckStall() compares synthesis to previous (Jaccard > 0.9) └── 2 consecutive stalls → IsStalled = true → break ``` +> **Sentinel note**: `ReflectionCycle.cs` defines `CompletionSentinel = "[[REFLECTION_COMPLETE]]"` +> (used by `IsGoalMet()` regex). The orchestrator prompts in `Organization.cs` use +> `[[GROUP_REFLECT_COMPLETE]]` for model-facing instructions. Both are checked, but +> via different mechanisms — `IsGoalMet()` for the ReflectionCycle, string.Contains +> for orchestrator-level evaluation. + --- ## PendingOrchestration — Restart Recovery @@ -233,24 +263,30 @@ even if dispatch throws. This prevents orphaned pending files. `OnSessionComplete` is fired when a session finishes processing (IsProcessing → false). Orchestrator loops use this to detect when workers finish. +**Signature**: `event Action? OnSessionComplete` — `(sessionName, summary)` + ### Ordering Invariant (from processing-state-safety) -**INV-O3: IsProcessing = false BEFORE TrySetResult** +**INV-O3: IsProcessing = false BEFORE TrySetResult BEFORE OnSessionComplete** ```csharp -// CORRECT ORDER in CompleteResponse: -state.Info.IsProcessing = false; // 1. Clear processing state -OnSessionComplete?.Invoke(name); // 2. Notify listeners -tcs.TrySetResult(response); // 3. Complete TCS (may run sync continuation) +// CORRECT ORDER in CompleteResponse (Events.cs ~line 1062-1080): +state.Info.IsProcessing = false; // 1. Clear processing state +state.ResponseCompletion?.TrySetResult(fullResponse);// 2. Complete TCS (may run sync continuation) +OnSessionComplete?.Invoke(name, summary); // 3. Notify listeners +OnStateChanged?.Invoke(); // 4. UI update ``` -If TrySetResult runs first, the reflection loop's synchronous continuation may see -`IsProcessing = true` and fail to send the next prompt. +If TrySetResult runs before IsProcessing=false, the reflection loop's synchronous +continuation may see `IsProcessing = true` and fail to send the next prompt. ### INV-O4: OnSessionComplete fired on ALL termination paths -All 8 paths that clear IsProcessing (see processing-state-safety) must also fire -OnSessionComplete. Otherwise, orchestrator loops waiting on workers hang forever. +All paths that clear IsProcessing (currently 19+ across Events.cs, CopilotService.cs, +Organization.cs, Bridge.cs, and Providers.cs) should fire OnSessionComplete. Otherwise, +orchestrator loops waiting on workers hang forever. Key invocation sites include: +CompleteResponse, SessionErrorEvent, watchdog timeout, watchdog crash recovery, +abort, steer error, and bridge completion. --- @@ -301,6 +337,7 @@ When modifying orchestration, verify: - [ ] **INV-O6**: Phase changes fire OnOrchestratorPhaseChanged for UI updates - [ ] **INV-O7**: Worker timeouts use 10-minute default (600s for resumed sessions) - [ ] **INV-O8**: Cancellation tokens propagated to all async operations +- [ ] **INV-O15**: IDLE-DEFER flushes CurrentResponse before breaking (content preservation) --- @@ -465,7 +502,9 @@ not from live TCS tracking. **Root cause**: Worker's OnSessionComplete wasn't fired (incomplete cleanup path). -**Mitigation**: Ensure all 9 IsProcessing=false paths fire OnSessionComplete. +**Mitigation**: Ensure all IsProcessing=false paths fire OnSessionComplete +(currently 19+ paths across Events.cs, CopilotService.cs, Organization.cs, +Bridge.cs, and Providers.cs). ### Bug: Reflection loop processes stale user message @@ -567,7 +606,7 @@ Conflict Tests (PR #375)" region: - Verify non-orchestrator sessions still get steered - Long-running orchestrator (15min) follow-up must queue -### Bug: Premature session.idle truncates orchestrator results (PR #375) +### Bug: Premature session.idle truncates orchestrator results (PR #375, FIXED in PR #399) **Symptom**: CLI sends `session.idle` prematurely mid-turn (after only a few tool rounds), then continues processing for 15+ more tool rounds. The @@ -580,20 +619,107 @@ missing the idle entirely, it sends it too early. The idle arrives, passes all generation guards, and CompleteResponse runs with whatever content has been flushed so far. -**Partial fix (UI)**: Added re-arm in `AssistantTurnStartEvent` handler: when -TurnStart arrives with `IsProcessing=false` on the current (non-orphaned) state, -re-arm IsProcessing, restart watchdog, log as `[EVT-REARM]`. This keeps the UI -showing "Working…" and the watchdog active. +**Primary fix (PR #399 — IDLE-DEFER)**: `SessionIdleEvent` handler now checks +`HasActiveBackgroundTasks(idle)` (Events.cs:629). If `BackgroundTasks.Agents` +or `BackgroundTasks.Shells` has any entries, the handler flushes accumulated +text but does NOT call `CompleteResponse` — it breaks early and logs +`[IDLE-DEFER]`. The next `SessionIdleEvent` without background tasks triggers +the real completion. This prevents TCS completion with partial content. -**Not fixed**: Orchestrator content truncation. The TCS was already completed -with partial content before re-arm fires. A future fix could create a NEW TCS -on re-arm so the orchestrator waits for the real completion. Complex — may -need separate PR. +**Defense-in-depth (still active)**: +- `[EVT-REARM]` (Events.cs:529): `AssistantTurnStartEvent` after premature idle + re-arms IsProcessing and sets `PrematureIdleSignal`. Keeps UI showing "Working…". +- `RecoverFromPrematureIdleIfNeededAsync` (Organization.cs:1759): Polls signal + + events.jsonl freshness to collect full response after TCS got partial content. +- `IsEventsFileActive` (Organization.cs:1946): Checks events.jsonl mtime < 15s. + +These defense layers handle edge cases where `BackgroundTasks` data isn't +present (older CLI versions, or non-agent premature idles). **Filed**: See GitHub issue for tracking. --- +## IDLE-DEFER & BackgroundTasks (PR #399) + +> **This is the primary fix for premature idle in multi-agent workers.** +> Before PR #399, premature `session.idle` events caused truncated worker +> responses. The defense-in-depth layers (`EVT-REARM`, `RecoverFromPrematureIdleIfNeededAsync`) +> mitigated but didn't fully fix the problem for orchestrator TCS completion. + +### How It Works + +The CLI's `SessionIdleEvent` now includes a `Data.BackgroundTasks` object with: +- `Agents` — array of active background agent processes +- `Shells` — array of active shell/terminal processes + +When a worker uses `task` tool to dispatch sub-agents, or runs shell commands, +these appear as background tasks. A `session.idle` with active background tasks +means "foreground processing quiesced, but background work is ongoing." + +### Code Path (Events.cs:622-642) + +```csharp +case SessionIdleEvent idle: + CancelTurnEndFallback(state); + + if (HasActiveBackgroundTasks(idle)) + { + Debug($"[IDLE-DEFER] ..."); + Invoke(() => { + if (state.IsOrphaned) return; + FlushCurrentResponse(state); // Preserve accumulated text + NotifyStateChangedCoalesced(); + }); + break; // Do NOT call CompleteResponse + } + + // Only reach here when truly idle (no background tasks) + CompleteResponse(state, idleGeneration); +``` + +### HasActiveBackgroundTasks (Events.cs:1856-1861) + +```csharp +internal static bool HasActiveBackgroundTasks(SessionIdleEvent idle) +{ + var bt = idle.Data?.BackgroundTasks; + if (bt == null) return false; + return (bt.Agents is { Length: > 0 }) || (bt.Shells is { Length: > 0 }); +} +``` + +### Impact on Multi-Agent Orchestration + +1. **Workers dispatching sub-agents**: Previously, these would trigger premature + completion when the foreground quiesced. Now correctly deferred. +2. **Workers running shell commands**: Same — deferred until shells complete. +3. **TCS ordering**: The `ResponseCompletion` TCS is no longer completed with + partial content for the common case. Only edge cases (old CLI, missing + BackgroundTasks data) fall through to the defense-in-depth layers. +4. **Content preservation**: `FlushCurrentResponse` is called on each deferred + idle, ensuring accumulated text is preserved even if the app restarts. + +### INV-O15: IDLE-DEFER Must Flush Before Breaking + +The `FlushCurrentResponse(state)` call inside the IDLE-DEFER block is critical. +Without it, accumulated response text in `state.CurrentResponse` would be lost +if another idle event arrives or the app restarts. The flush preserves content +into `state.FlushedResponse` and chat history. + +### Defense-in-Depth Layers (Still Active) + +These layers remain as fallbacks for edge cases where IDLE-DEFER doesn't fire: + +| Layer | Mechanism | When It Helps | +|-------|-----------|---------------| +| `[EVT-REARM]` | Re-arm IsProcessing on TurnStart after idle | Old CLI without BackgroundTasks | +| `PrematureIdleSignal` | ManualResetEventSlim set on re-arm | Signals ExecuteWorkerAsync to recover | +| `RecoverFromPrematureIdleIfNeededAsync` | Poll signal + events.jsonl freshness | Collect full response after partial TCS | +| `IsEventsFileActive` | events.jsonl mtime < 15s | Detect ongoing CLI activity | + +--- + ## "Fix with Copilot" — Multi-Agent Awareness ### Current State @@ -665,8 +791,8 @@ for w in 1 2 3 4 5; do [[ -n "$last" ]] && echo "W$w: $last" done -# 3. Any completions? -grep "$GROUP" ~/.polypilot/event-diagnostics.log | grep -E "IDLE|COMPLETE|DISPATCH.*completed" | tail -10 +# 3. Any completions or deferred idles? +grep "$GROUP" ~/.polypilot/event-diagnostics.log | grep -E "IDLE|IDLE-DEFER|COMPLETE|DISPATCH.*completed" | tail -10 # 4. Any errors? grep "$GROUP" ~/.polypilot/event-diagnostics.log | grep -E "ERROR|WATCHDOG" | tail -5 @@ -678,7 +804,7 @@ cat ~/.polypilot/pending-orchestration.json 2>/dev/null | head -3 || echo "(empt ### Monitoring Orchestrator Dispatch ```bash -grep "DISPATCH" ~/.polypilot/event-diagnostics.log | grep "$GROUP" | tail -20 +grep "DISPATCH\|IDLE-DEFER" ~/.polypilot/event-diagnostics.log | grep "$GROUP" | tail -20 ``` **Expected sequence for N-worker dispatch:** @@ -700,7 +826,8 @@ grep "DISPATCH" ~/.polypilot/event-diagnostics.log | grep "$GROUP" | tail -20 **Signs of healthy worker:** - TurnStart/TurnEnd pairs cycling every 2-30 seconds (tool rounds) -- Eventually a SessionIdleEvent → CompleteResponse → COMPLETE sequence +- `[IDLE-DEFER]` entries when worker has active sub-agents (expected, not stuck) +- Eventually a SessionIdleEvent (no background tasks) → CompleteResponse → COMPLETE - No [ERROR] or [WATCHDOG] entries **Signs of stuck worker:** @@ -743,9 +870,10 @@ Reflect mode runs multiple iterations. Expect this pattern: 2. **Worker Execution Phase**: - [ ] Each worker actively processes (TurnStart/TurnEnd cycling) + - [ ] Workers with sub-agents show [IDLE-DEFER] entries (expected) - [ ] Watchdog Case B correctly defers when events.jsonl is fresh - [ ] No [ERROR] entries - - [ ] Each worker eventually gets SessionIdleEvent → CompleteResponse + - [ ] Each worker eventually gets SessionIdleEvent (no BackgroundTasks) → CompleteResponse 3. **Collection Phase**: - [ ] After ALL workers complete, orchestrator synthesis triggered @@ -783,6 +911,8 @@ Reflect mode runs multiple iterations. Expect this pattern: | PendingOrchestration stale | `cat ~/.polypilot/pending-orchestration.json` | Finally block didn't run; check for crash | | All sessions die after reconnect | Check [RECONNECT] entries | IsOrphaned not set; see INV-O9 | | Orchestration hangs on reconnect | Check for missing TrySetCanceled | TCS not canceled; see INV-O9 | +| Many IDLE-DEFER entries | `grep "IDLE-DEFER" diagnostics.log` | Normal — worker has active sub-agents; wait for completion | +| IDLE-DEFER but worker never completes | Check if background tasks are leaking | Sub-agent/shell not terminating; check CLI logs | --- diff --git a/.claude/skills/processing-state-safety/SKILL.md b/.claude/skills/processing-state-safety/SKILL.md index 075bd7da50..b0ea59d26f 100644 --- a/.claude/skills/processing-state-safety/SKILL.md +++ b/.claude/skills/processing-state-safety/SKILL.md @@ -11,8 +11,8 @@ description: > (RestoreSingleSessionAsync) that must initialize watchdog-dependent state, (8) Modifying ReconcileOrganization or any code that reads Organization.Sessions during the IsRestoring window, (9) Session appears hung or unresponsive after tool use. - Covers: 13 invariants from 10 PRs of fix cycles, - the 9 code paths that clear IsProcessing, and common regression patterns. + Covers: 18 invariants from 13 PRs of fix cycles, + the 16 code paths that set/clear IsProcessing, and common regression patterns. --- # Processing State Safety @@ -34,33 +34,58 @@ Every code path that sets `IsProcessing = false` MUST also: 12. Run on UI thread (via `InvokeOnUI()` or already on UI thread) 13. After changes, run `ProcessingWatchdogTests.cs` to catch regressions -## The 10 Paths That Set/Clear IsProcessing +## The 16 Paths That Set/Clear IsProcessing ### Paths that CLEAR IsProcessing (→ false) -| # | Path | File | Thread | Notes | -|---|------|------|--------|-------| -| 1 | CompleteResponse | Events.cs | UI (via Invoke) | Normal completion | -| 2 | SessionErrorEvent | Events.cs | Background → InvokeOnUI | SDK error | -| 3 | Watchdog timeout (kill) | Events.cs | Timer → InvokeOnUI | No events for 120s/600s, server dead, or max time exceeded (Case C) | -| 4 | Watchdog clean complete | Events.cs | Timer → InvokeOnUI | Tools done, lost terminal event → calls CompleteResponse (Case B, PR #332) | -| 5 | AbortSessionAsync (local) | CopilotService.cs | UI | User clicks Stop | -| 6 | AbortSessionAsync (remote) | CopilotService.cs | UI | Mobile stop | -| 7 | SendAsync reconnect failure | CopilotService.cs | UI | Prompt send failed after reconnect | -| 8 | SendAsync initial failure | CopilotService.cs | UI | Prompt send failed | -| 9 | Bridge OnTurnEnd | Bridge.cs | Background → InvokeOnUI | Remote mode turn complete | +| # | Path | File | Thread | Tag | Notes | +|---|------|------|--------|-----|-------| +| 1 | CompleteResponse | Events.cs | UI (via Invoke) | `[COMPLETE]` | Normal completion | +| 2 | SessionErrorEvent | Events.cs | Background → InvokeOnUI | `[ERROR]` | SDK error | +| 3 | Watchdog timeout (kill) | Events.cs | Timer → InvokeOnUI | `[WATCHDOG]` | No events for 120s/600s, server dead, or max time exceeded (Case C) | +| 4 | Watchdog clean complete | Events.cs | Timer → InvokeOnUI | `[WATCHDOG]` | Tools done, lost terminal event → calls CompleteResponse (Case B, PR #332) | +| 5 | AbortSessionAsync (local) | CopilotService.cs | UI | `[ABORT]` | User clicks Stop | +| 6 | AbortSessionAsync (remote) | CopilotService.cs | UI | — | Mobile stop | +| 7 | SendAsync reconnect failure | CopilotService.cs | UI | `[ERROR]` | Prompt send failed after reconnect | +| 8 | SendAsync initial failure | CopilotService.cs | UI | `[ERROR]` | Prompt send failed | +| 9 | Bridge OnTurnEnd | Bridge.cs | Background → InvokeOnUI | `[BRIDGE-COMPLETE]` | Remote mode turn complete | +| 10 | Tool health recovery | Events.cs | Timer → InvokeOnUI | `[TOOL-HEALTH-COMPLETE]` | Dead connection detected by health check timer | +| 11 | Watchdog crash handler | Events.cs | Timer → InvokeOnUI | `[WATCHDOG-CRASH]` | Safety net when watchdog loop itself throws | +| 12 | Permission recovery failure | Events.cs | UI | `[PERMISSION-RECOVER]` | ClearProcessingStateForRecoveryFailure — recovery can't proceed | +| 13 | Permission recovery cleanup | Events.cs | UI | `[PERMISSION-RECOVER]` | After successful session resume, clears old state before resend | +| 14 | Steer error | CopilotService.cs | UI | `[STEER-ERROR]` | Soft steer SendAsync failure | +| 15 | ForceCompleteProcessing | Organization.cs | UI (via InvokeOnUI) | `[DISPATCH]` | Orchestration forces unstarted workers complete | + +Additional clearing paths exist in `CopilotService.Providers.cs` (4 paths for external +provider OnTurnEnd/OnError/OnMemberTurnEnd/OnMemberError) and `DemoService.cs` (1 path). +These follow simpler patterns and don't participate in the SDK event flow. ### Path that RE-ARMS IsProcessing (→ true) -| # | Path | File | Thread | Notes | -|---|------|------|--------|-------| -| 10 | TurnStart re-arm | Events.cs | Background → InvokeOnUI | Premature session.idle recovery (PR #375) | +| # | Path | File | Thread | Tag | Notes | +|---|------|------|--------|-----|-------| +| 16 | TurnStart re-arm | Events.cs | Background → InvokeOnUI | `[EVT-REARM]` | Premature session.idle recovery (PR #375) | -Path #10 fires when `AssistantTurnStartEvent` arrives with `IsProcessing=false` on the +Path #16 fires when `AssistantTurnStartEvent` arrives with `IsProcessing=false` on the current non-orphaned state. This detects premature `session.idle` (SDK sends idle mid-turn then continues). Re-arm sets `IsProcessing=true`, restarts the watchdog, and logs `[EVT-REARM]`. Does NOT create a new TCS — the old one was already completed with partial content. +> **Note:** EVT-REARM is now a **secondary defense**. The primary fix is IDLE-DEFER (PR #399), +> which prevents premature completion in the first place by checking `BackgroundTasks`. +> EVT-REARM remains as defense-in-depth for edge cases where `BackgroundTasks` is null. + +### Path that DEFERS completion (IsProcessing stays true) + +| Path | File | Tag | Notes | +|------|------|-----|-------| +| IDLE-DEFER | Events.cs | `[IDLE-DEFER]` | SessionIdleEvent with active background tasks (PR #399) | + +When `HasActiveBackgroundTasks(idle)` returns true (sub-agents or shells running), +`SessionIdleEvent` flushes text via `FlushCurrentResponse` but does NOT call +`CompleteResponse`. Processing stays active. The watchdog and future idle events +(without background tasks) handle eventual completion. + ## Content Persistence Safety ### Turn-End Flush @@ -77,7 +102,7 @@ that crash the app. The DB is a write-through cache; `events.jsonl` is the sourc and replays on session resume via `BulkInsertAsync`. DB write failures are self-healing. **NEVER narrow the ChatDatabase catch filters** — use `catch (Exception ex)` always. -## 13 Invariants +## 18 Invariants ### INV-1: Complete state cleanup Every IsProcessing=false path clears ALL fields. See checklist above. @@ -223,6 +248,20 @@ Both the primary reconnect path and the sibling loop must call: The primary path was missing these until PR #373 Round 5. Asymmetry between the sibling and primary reconnect configs is a recurring bug pattern. +### INV-18: SessionIdleEvent is not always terminal (PR #399) +`SessionIdleEvent` with active background tasks (`HasActiveBackgroundTasks()` returns true) +means "foreground quiesced, background still running" — NOT true completion. +This path: +1. Cancels the TurnEnd→Idle fallback timer +2. Flushes accumulated text via `FlushCurrentResponse` (preserves content) +3. Does **NOT** call `CompleteResponse` — `IsProcessing` stays `true` +4. Logs `[IDLE-DEFER]` with task counts + +The watchdog continues running and will eventually time out if background tasks never +finish. A subsequent `SessionIdleEvent` without background tasks completes normally. +**Do NOT** add `IsProcessing = false` to the IDLE-DEFER path — it would prematurely +complete the response while sub-agents are still working. + ## Top 5 Recurring Mistakes 1. **Incomplete cleanup** — modifying one IsProcessing path without @@ -271,6 +310,7 @@ When a session shows "Thinking..." indefinitely: |---------|-------------|-----| | `[SEND]` then silence | SDK never responded, watchdog will catch at 120s | Wait or abort | | `[EVT] TurnEnd` but no `[IDLE]` | Zero-idle SDK bug | Watchdog catches at 30s fallback (INV-10) | + | `[IDLE-DEFER]` then long silence | Background tasks (sub-agents/shells) active but never completed | Check agent status; watchdog will eventually catch (INV-18) | | `[COMPLETE]` fired but spinner persists | UI thread not notified | Check INV-2, INV-8 | | `[WATCHDOG]` clears but re-sticks | New turn started before watchdog callback ran | Check INV-3 generation guard | @@ -279,4 +319,5 @@ When a session shows "Thinking..." indefinitely: ## Regression History 10 PRs of fix/regression cycles: #141 → #147 → #148 → #153 → #158 → #163 → #164 → #276 → #284 → #332. +Additional safety PRs: #373 (orphaned state guards), #375 (premature idle re-arm), #399 (IDLE-DEFER for background tasks). See `references/regression-history.md` for the full timeline with root causes. diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 719d8ce787..9bbafad2ca 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -175,7 +175,7 @@ When a prompt is sent, the SDK emits events processed by `HandleSessionEvent` in 6. `ToolExecutionCompleteEvent` → tool done, increments `ToolCallCount` 7. `AssistantIntentEvent` → intent/plan updates 8. `AssistantTurnEndEvent` → end of a sub-turn, tool loop continues. `FlushCurrentResponse` persists accumulated text before the next sub-turn. -9. `SessionIdleEvent` → turn complete, response finalized +9. `SessionIdleEvent` → turn complete, response finalized. **Unless** `Data.BackgroundTasks` has active agents/shells — then flushes text, logs `[IDLE-DEFER]`, and keeps `IsProcessing=true` (PR #399). ### Processing Status Indicator `AgentSessionInfo` tracks three fields for the processing status UI: @@ -188,10 +188,10 @@ All three are reset in `SendPromptAsync` (new turn) and cleared in `CompleteResp The UI shows: "Sending…" → "Server connected…" → "Thinking…" → "Working · Xm Xs · N tool calls…". ### Abort Behavior -`AbortSessionAsync` must clear ALL processing state — see `.claude/skills/processing-state-safety/SKILL.md` for the full cleanup checklist and the 8 paths that clear `IsProcessing`. +`AbortSessionAsync` must clear ALL processing state — see `.claude/skills/processing-state-safety/SKILL.md` for the full cleanup checklist and the paths that clear `IsProcessing`. ### ⚠️ IsProcessing Cleanup Invariant -**CRITICAL**: Every code path that sets `IsProcessing = false` must clear 9 companion fields and call `FlushCurrentResponse`. This is the most recurring bug category (7 PRs of fix/regression cycles). **Read `.claude/skills/processing-state-safety/SKILL.md` before modifying ANY processing path.** There are 8 such paths across CopilotService.cs, Events.cs, and Bridge.cs. +**CRITICAL**: Every code path that sets `IsProcessing = false` must clear 9 companion fields and call `FlushCurrentResponse`. This is the most recurring bug category (13 PRs of fix/regression cycles). **Read `.claude/skills/processing-state-safety/SKILL.md` before modifying ANY processing path.** There are 15+ such paths across CopilotService.cs, Events.cs, Bridge.cs, Organization.cs, and Providers.cs. ### Content Persistence `FlushCurrentResponse` is also called on `AssistantTurnEndEvent` to persist accumulated response text at each sub-turn boundary. This prevents content loss if the app restarts between `turn_end` and `session.idle` (e.g., "zero-idle sessions" where the SDK never emits `session.idle`). The flush includes a dedup guard to prevent duplicate messages from event replay on resume. @@ -214,6 +214,7 @@ The event diagnostics log (`~/.polypilot/event-diagnostics.log`) uses these tags - `[SEND]` — prompt sent, IsProcessing set to true - `[EVT]` — SDK event received (only SessionIdleEvent, AssistantTurnEndEvent, SessionErrorEvent) - `[IDLE]` — SessionIdleEvent dispatched to CompleteResponse +- `[IDLE-DEFER]` — SessionIdleEvent deferred due to active background tasks (agents/shells) - `[COMPLETE]` — CompleteResponse executed or skipped - `[RECONNECT]` — session replaced after disconnect - `[ERROR]` — SessionErrorEvent or SendAsync/reconnect failure cleared IsProcessing @@ -307,6 +308,7 @@ Test files in `PolyPilot.Tests/`: - `ToolResultFormattingTests.cs` — Tool output formatting - `UiStatePersistenceTests.cs` — UI state save/load - `ProcessingWatchdogTests.cs` — Watchdog constants, timeout selection, HasUsedToolsThisTurn, IsResumed, abort clears queue and processing status +- `BackgroundTasksIdleTests.cs` — IDLE-DEFER background tasks handling, HasActiveBackgroundTasks - `CliPathResolutionTests.cs` — CLI path resolution - `InitializationModeTests.cs` — Mode initialization - `PersistentModeTests.cs` — Persistent mode behavior diff --git a/PolyPilot.Tests/BackgroundTasksIdleTests.cs b/PolyPilot.Tests/BackgroundTasksIdleTests.cs new file mode 100644 index 0000000000..96f4869d61 --- /dev/null +++ b/PolyPilot.Tests/BackgroundTasksIdleTests.cs @@ -0,0 +1,113 @@ +using GitHub.Copilot.SDK; +using PolyPilot.Services; + +namespace PolyPilot.Tests; + +/// +/// Tests for HasActiveBackgroundTasks — the fix that prevents premature idle completion +/// when the SDK reports active background tasks (sub-agents, shells) in SessionIdleEvent. +/// See: session.idle with backgroundTasks means "foreground quiesced, background still running." +/// +public class BackgroundTasksIdleTests +{ + private static SessionIdleEvent MakeIdle(SessionIdleDataBackgroundTasks? bt = null) + { + return new SessionIdleEvent + { + Data = new SessionIdleData { BackgroundTasks = bt } + }; + } + + [Fact] + public void HasActiveBackgroundTasks_NullBackgroundTasks_ReturnsFalse() + { + var idle = MakeIdle(bt: null); + Assert.False(CopilotService.HasActiveBackgroundTasks(idle)); + } + + [Fact] + public void HasActiveBackgroundTasks_EmptyBackgroundTasks_ReturnsFalse() + { + var idle = MakeIdle(new SessionIdleDataBackgroundTasks + { + Agents = Array.Empty(), + Shells = Array.Empty() + }); + Assert.False(CopilotService.HasActiveBackgroundTasks(idle)); + } + + [Fact] + public void HasActiveBackgroundTasks_WithAgents_ReturnsTrue() + { + var idle = MakeIdle(new SessionIdleDataBackgroundTasks + { + Agents = new[] + { + new SessionIdleDataBackgroundTasksAgentsItem + { + AgentId = "agent-42", + AgentType = "copilot", + Description = "PR reviewer" + } + }, + Shells = Array.Empty() + }); + Assert.True(CopilotService.HasActiveBackgroundTasks(idle)); + } + + [Fact] + public void HasActiveBackgroundTasks_WithShells_ReturnsTrue() + { + var idle = MakeIdle(new SessionIdleDataBackgroundTasks + { + Agents = Array.Empty(), + Shells = new[] + { + new SessionIdleDataBackgroundTasksShellsItem + { + ShellId = "shell-1", + Description = "Running npm test" + } + } + }); + Assert.True(CopilotService.HasActiveBackgroundTasks(idle)); + } + + [Fact] + public void HasActiveBackgroundTasks_WithBothAgentsAndShells_ReturnsTrue() + { + var idle = MakeIdle(new SessionIdleDataBackgroundTasks + { + Agents = new[] + { + new SessionIdleDataBackgroundTasksAgentsItem + { + AgentId = "a1", AgentType = "copilot", Description = "" + } + }, + Shells = new[] + { + new SessionIdleDataBackgroundTasksShellsItem + { + ShellId = "s1", Description = "" + } + } + }); + Assert.True(CopilotService.HasActiveBackgroundTasks(idle)); + } + + [Fact] + public void HasActiveBackgroundTasks_DefaultIdle_ReturnsFalse() + { + // Default SessionIdleEvent — Data is auto-initialized but BackgroundTasks is null + var idle = new SessionIdleEvent { Data = new SessionIdleData() }; + Assert.False(CopilotService.HasActiveBackgroundTasks(idle)); + } + + [Fact] + public void HasActiveBackgroundTasks_DataNull_ReturnsFalse() + { + var idle = new SessionIdleEvent { Data = null! }; + Assert.False(CopilotService.HasActiveBackgroundTasks(idle)); + } +} diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs index bb9e94c128..9d7cc93f68 100644 --- a/PolyPilot/Services/CopilotService.Events.cs +++ b/PolyPilot/Services/CopilotService.Events.cs @@ -619,9 +619,28 @@ void Invoke(Action action) }); break; - case SessionIdleEvent: + case SessionIdleEvent idle: // Cancel the TurnEnd→Idle fallback — normal SessionIdleEvent arrived CancelTurnEndFallback(state); + + // KEY FIX: Check if the server reports active background tasks (sub-agents, shells). + // session.idle with background tasks means "foreground quiesced, background still running." + // Do NOT treat this as terminal — flush text and wait for the real idle. + if (HasActiveBackgroundTasks(idle)) + { + Debug($"[IDLE-DEFER] '{sessionName}' session.idle received with active background tasks — " + + $"deferring completion (IsProcessing={state.Info.IsProcessing}, " + + $"response={state.CurrentResponse.Length}+{state.FlushedResponse.Length} chars)"); + // Flush accumulated text at each idle boundary to prevent content loss + Invoke(() => + { + if (state.IsOrphaned) return; + FlushCurrentResponse(state); + NotifyStateChangedCoalesced(); + }); + break; // Don't complete — wait for next idle without background tasks + } + try { CompleteReasoningMessages(state, sessionName); } catch (Exception ex) { @@ -1829,6 +1848,18 @@ internal static bool IsShellEnvironmentFailure(string? text) return error.ToString(); } + /// + /// Check if a SessionIdleEvent reports active background tasks (agents or shells). + /// When background tasks are active, session.idle means "foreground quiesced, background + /// still running" — NOT true completion. + /// + internal static bool HasActiveBackgroundTasks(SessionIdleEvent idle) + { + var bt = idle.Data?.BackgroundTasks; + if (bt == null) return false; + return (bt.Agents is { Length: > 0 }) || (bt.Shells is { Length: > 0 }); + } + private void StartProcessingWatchdog(SessionState state, string sessionName) { CancelProcessingWatchdog(state);