diff --git a/.claude/skills/multi-agent-orchestration/SKILL.md b/.claude/skills/multi-agent-orchestration/SKILL.md index aa864bba8d..45753b2cef 100644 --- a/.claude/skills/multi-agent-orchestration/SKILL.md +++ b/.claude/skills/multi-agent-orchestration/SKILL.md @@ -17,6 +17,12 @@ description: > > **Read this before modifying orchestration dispatch, worker execution, synthesis, > reflection loops, or PendingOrchestration persistence.** +> ⚠️ **LONG-RUNNING SESSION SAFETY**: Multi-agent workers routinely run for 5-30+ +> minutes. ANY watchdog change, timeout tweak, or session lifecycle fix MUST be +> validated against long-running workers or it WILL kill legitimate sessions. +> See the **Long-Running Session Safety** section and run `LongRunningSessionSafetyTests` +> before merging ANY change to watchdog, Case B, or session lifecycle paths. + ## Overview PolyPilot's orchestration system coordinates work between an orchestrator session @@ -298,6 +304,149 @@ When modifying orchestration, verify: --- +## Session Stability & Reconnect Safety (PR #373) + +> **Critical context for multi-agent:** Reconnects affect ALL sibling sessions +> in a group, not just the session that triggered the reconnect. Every invariant +> below applies during orchestrator dispatch. + +### INV-O9: Sibling Re-Resume Must Orphan Old State + +When `CopilotClient` is recreated (connection drop + reconnect), ALL sessions +sharing that client need their `CopilotSession` re-resumed. The sibling loop +in `SendPromptAsync` (line ~2630) creates a **fresh `SessionState`** for each +sibling to prevent the shared-Info write-write race: + +```csharp +// 1. Mark old state as orphaned FIRST +otherState.IsOrphaned = true; +Interlocked.Exchange(ref otherState.ProcessingGeneration, long.MaxValue); +otherState.ResponseCompletion?.TrySetCanceled(); // Unblock orchestrator! + +// 2. Create fresh state with same Info +var siblingState = new SessionState { Session = resumed, Info = otherState.Info }; + +// 3. Register handler BEFORE publishing to dictionary +resumed.On(evt => HandleSessionEvent(siblingState, evt)); + +// 4. Atomic swap prevents stale Task.Run from overwriting +if (!_sessions.TryUpdate(key, siblingState, otherState)) { /* discard */ } +``` + +**Why TrySetCanceled matters for orchestration:** Workers await +`ResponseCompletion` TCS. If a reconnect orphans the state without canceling +the TCS, `ExecuteWorkerAsync` hangs forever → orchestration deadlocks. + +### INV-O10: Collection Snapshots Before Task.Run + +`Organization.Sessions` and `Organization.Groups` are `List` — NOT +thread-safe. The sibling loop runs in `Task.Run` (background thread). +Snapshot BEFORE entering the closure: + +```csharp +var sessionSnapshots = Organization.Sessions.ToList(); +var groupSnapshots = Organization.Groups.ToList(); +_ = Task.Run(async () => { + // Use snapshots, not live collections +}); +``` + +### INV-O11: MCP Servers Must Reload on Reconnect + +Both the primary reconnect path AND the sibling loop must call +`LoadMcpServers()` and `LoadSkillDirectories()`. Old server handles +are tied to the disposed client. Without reload, MCP tools silently +fail after reconnect. + +### INV-O12: IsProcessing Guard on Sibling Re-Resume + +Never orphan a sibling that's actively processing (mid-turn). The +sibling loop checks `if (otherState.Info.IsProcessing) continue;` +and re-checks after the async `ResumeSessionAsync` call (TOCTOU guard). + +### INV-O13: IsOrphaned Guards — 5-Layer Defense + +Every event/timer entry point checks `state.IsOrphaned` first: + +| Layer | Location | Purpose | +|-------|----------|---------| +| 1 | HandleSessionEvent:214 | Block ALL SDK events | +| 2 | CompleteResponse:913 | TrySetCanceled + return | +| 3 | Watchdog loop:1820 | Exit running watchdog | +| 4 | Watchdog callback:2095 | Guard InvokeOnUI | +| 5 | Tool/health handlers | Stale tool events | + +--- + +## Long-Running Session Safety + +> ⚠️ **This section is mandatory reading before touching ANY watchdog, timeout, +> or session lifecycle code.** Multi-agent workers are the longest-running sessions +> in PolyPilot. Changes that seem safe for interactive 30-second sessions can kill +> 20-minute review workers, losing all their work. + +### Real-World Session Durations + +| Scenario | Typical Duration | Max Observed | +|----------|-----------------|--------------| +| Interactive user chat | 5-30s | 2 min | +| Single-tool agent | 30s-2 min | 5 min | +| Multi-agent worker (review) | 3-11 min | 20 min | +| Multi-agent worker (fix+review) | 5-15 min | 30 min | +| OrchestratorReflect full loop | 15-30 min | 60 min | + +### What Looks Like a Stuck Session But Isn't + +These are **legitimate** long pauses that must NOT trigger watchdog kills: + +1. **Model thinking between tool rounds** (30s-5 min): After tool output, the LLM + decides its next action. No events written. `events.jsonl` mtime frozen. + ❌ Detecting "mtime unchanged for N checks" WILL kill this. + +2. **Large context ingestion** (1-3 min): Worker receives 60K+ char diff, model + processes before first response. Zero events between prompt send and first delta. + +3. **Multi-round tool execution bursts**: Worker runs 5 tools in 30s, then thinks + for 3 min, then runs 5 more. Bursty, not continuous. + +4. **Sub-agent dispatch** (5-15 min): Worker dispatches 5 sub-agents via task tool. + From our perspective, ONE tool call is running for 15 minutes. events.jsonl gets + one `tool_execution_start` and nothing until `tool_execution_complete`. + +### The Cardinal Rule + +> **NEVER add a timeout or staleness check that can kill a session where the CLI +> server process is alive and the events.jsonl file was written AFTER the current +> turn started.** The 1800s multi-agent freshness window exists because workers +> genuinely need 30 minutes. If detection needs to be faster, fix the ROOT CAUSE +> (e.g., missing event handler) rather than tightening timeouts. + +### Safe vs Unsafe Watchdog Changes + +| Change | Safe? | Why | +|--------|-------|-----| +| Fix missing `.On(evt => ...)` handler registration | ✅ | Root cause fix, no timeout change | +| Reduce `WatchdogMultiAgentCaseBFreshnessSeconds` | ❌ | Workers genuinely run 20+ min | +| Add mtime staleness counter (kill after N unchanged checks) | ❌ | Model thinking pauses freeze mtime for 5+ min | +| Reduce `WatchdogMaxCaseBResets` from 40 | ❌ | 40 × 120s = 80 min, matches worker execution timeout | +| Add `IsOrphaned` flag to skip stale callbacks | ✅ | Guards stale state, no timeout change | +| Add event handler on revival path | ✅ | Root cause fix, no timeout change | +| Abort IsProcessing siblings during client recreation | ⚠️ | Safe IF orchestrator retries, but loses in-flight work | + +### INV-O14: Watchdog Changes Must Pass Long-Running Tests + +Every PR that modifies watchdog logic, Case B freshness, timeout constants, or +session lifecycle (revival, reconnect, dispose) **MUST** run +`LongRunningSessionSafetyTests` and verify all pass. These tests simulate: +- 20-minute workers with bursty event patterns +- 5-minute model thinking pauses (zero events) +- Revival mid-execution +- events.jsonl written once then frozen + +If a test fails, the change would kill a legitimate long-running session in production. + +--- + ## Common Bugs & Mitigations ### Bug: Worker result lost on app restart @@ -316,7 +465,7 @@ not from live TCS tracking. **Root cause**: Worker's OnSessionComplete wasn't fired (incomplete cleanup path). -**Mitigation**: Ensure all 8 IsProcessing=false paths fire OnSessionComplete. +**Mitigation**: Ensure all 9 IsProcessing=false paths fire OnSessionComplete. ### Bug: Reflection loop processes stale user message @@ -333,10 +482,353 @@ not from live TCS tracking. **Root cause**: ParseTaskAssignments returned duplicates (orchestrator repeated @worker block). -**Mitigation**: Deduplicate assignments by worker name before dispatch: +**Mitigation**: Deduplicate assignments by worker name before dispatch. + +### Bug: Session death after reconnect (PR #373) + +**Symptom**: Sessions die instantly after connection recovery. "Thinking…" shows +briefly then clears. Multiple sessions in a group all die at once. + +**Root cause**: Old and new `SessionState` share the SAME `Info` object. Stale +`SessionIdleEvent` from the orphaned old `CopilotSession` passes the generation +check and clears `IsProcessing` on the shared Info, killing the new session. + +**Mitigation**: `IsOrphaned` volatile flag on SessionState, checked at all 5 +entry points. Old state's `ProcessingGeneration` set to `long.MaxValue`. +Fresh `SessionState` created for siblings (not reused). + +### Bug: Orchestration deadlock on reconnect + +**Symptom**: Worker dispatch hangs forever during synthesis. Orchestrator never +completes. No errors in log. + +**Root cause**: Reconnect orphaned a worker's state without calling +`TrySetCanceled()` on its `ResponseCompletion` TCS. The orchestrator's +`Task.WhenAll(workerTasks)` awaits forever. + +**Mitigation**: Always `TrySetCanceled()` on old state's TCS during orphan. +`ExecuteWorkerAsync` catches `OperationCanceledException` and returns +`WorkerResult.Success = false`. + +### Bug: Dead event stream after worker revival (discovered 2026-03-15) + +**Symptom**: Worker shows "Thinking…" or "Working…" forever after being +re-dispatched. Diagnostic log shows `[SEND]` but zero `[EVT]` entries. +events.jsonl has 100+ events (CLI is working) but `HandleSessionEvent` never +fires. Watchdog Case B defers for 30+ minutes. + +**Root cause**: `ExecuteWorkerAsync` revival path (Organization.cs ~line 1516) +creates a fresh `SessionState` and `CopilotSession` but did NOT call +`freshSession.On(evt => HandleSessionEvent(freshState, evt))`. Without the +event handler, the SDK transport delivers events to nobody. The session +completes server-side but the app never knows. + +**Why it took 30+ min to detect**: Case B freshness check uses events.jsonl +mtime. The CLI wrote events to the file (mtime updates), so the freshness +check kept deferring. With `WatchdogMultiAgentCaseBFreshnessSeconds = 1800`, +it deferred for 30 min before the file aged out. + +**Fix**: Register event handler on fresh session BEFORE sending, copy +`IsMultiAgentSession`, mark old state as `IsOrphaned`. + +**Why NOT mtime staleness detection**: Initially considered tracking mtime +changes across consecutive checks to detect "wrote once then stopped." But +legitimate workers pause for 5+ minutes between tool rounds (model thinking), +which would trigger false positives. The root cause fix (register handler) +is the correct approach. See **Long-Running Session Safety** section. + +### Bug: Steering cancels in-flight orchestration (PR #375) + +**Symptom**: User sends a follow-up message to a busy orchestrator (e.g., "also +check PR #400" while workers are running). Instead of queuing, Dashboard routes +through `SteerSessionAsync` which bumps `ProcessingGeneration`, canceling the +in-flight orchestration `ResponseCompletion` TCS. `SendToMultiAgentGroupAsync` +gets `TaskCanceledException`. Workers complete but their results are never +collected. Orchestrator appears stuck. + +**Root cause**: Dashboard.razor dispatch routing checked `IsProcessing` and +routed to `SteerSessionAsync` BEFORE checking if the session is an orchestrator. +Steering is designed for regular sessions where you want to redirect the agent. +For orchestrators, steering is destructive — it cancels the dispatch/synthesis +lifecycle. + +**Fix**: Check `GetOrchestratorGroupId(sessionName)` WITHIN the `IsProcessing` +block, BEFORE the steer path. If session is an orchestrator, route to +`EnqueueMessage` instead. The queued message will be sent after the current +orchestration completes. Logged as `QUEUED_ORCH_BUSY` in event diagnostics. + +**Key invariant**: Orchestrator sessions must NEVER be steered while processing. +Always queue. Workers CAN still be steered (useful for "stop" or "focus on X"). + +**Tests**: `MultiAgentRegressionTests.cs` — 8 tests in "Orchestrator-Steer +Conflict Tests (PR #375)" region: +- Structural test verifying orchestrator check appears before steer in Dashboard +- Verify EnqueueMessage (not steer) is used for orchestrators +- Verify non-orchestrator sessions still get steered +- Long-running orchestrator (15min) follow-up must queue + +### Bug: Premature session.idle truncates orchestrator results (PR #375) + +**Symptom**: CLI sends `session.idle` prematurely mid-turn (after only a few +tool rounds), then continues processing for 15+ more tool rounds. The +`CompleteResponse` fires on the premature idle, completing the +`ResponseCompletion` TCS with partial content. If this is a worker, the +orchestrator receives truncated results in synthesis. + +**Root cause**: SDK/CLI bug — variant of bug #299 (missing idle). Instead of +missing the idle entirely, it sends it too early. The idle arrives, passes all +generation guards, and CompleteResponse runs with whatever content has been +flushed so far. + +**Partial fix (UI)**: Added re-arm in `AssistantTurnStartEvent` handler: when +TurnStart arrives with `IsProcessing=false` on the current (non-orphaned) state, +re-arm IsProcessing, restart watchdog, log as `[EVT-REARM]`. This keeps the UI +showing "Working…" and the watchdog active. + +**Not fixed**: Orchestrator content truncation. The TCS was already completed +with partial content before re-arm fires. A future fix could create a NEW TCS +on re-arm so the orchestrator waits for the real completion. Complex — may +need separate PR. + +**Filed**: See GitHub issue for tracking. + +--- + +## "Fix with Copilot" — Multi-Agent Awareness + +### Current State + +`BuildCopilotPrompt` in `SessionSidebar.razor` (line 2578) generates a fix +prompt for the external `copilot` CLI. **It is NOT multi-agent aware.** + +When a user clicks "Fix with Copilot" on a session that's part of a +multi-agent group, the prompt should include: + +### Required Context for Multi-Agent Fix + +1. **Group membership**: session role (orchestrator/worker), group name, mode +2. **Worker list**: all workers in the group and their descriptions/models +3. **Event diagnostics**: last 30 lines of `event-diagnostics.log` for the group +4. **Multi-agent testing instructions**: the agent must verify orchestration + still works after the fix (dispatch → worker execution → synthesis → cleanup) + +### GetBugReportDebugInfo Enhancement + +When `selectedBugSession` is part of a multi-agent group, include: + +``` +--- Multi-Agent Context --- +Group: PR Review Squad +Mode: Orchestrator +Role: orchestrator (or worker-2) +Workers: worker-1 (claude-sonnet-4.6), worker-2 (gpt-5.3-codex), ... +OrchestratorMode: OrchestratorReflect +PendingOrchestration: (contents or "none") +--- Recent Event Diagnostics (group) --- +[SEND] 'PR Review Squad-orchestrator' ... +[DISPATCH] ... +``` + +### BuildCopilotPrompt Enhancement + +When the selected session is in a multi-agent group, append: + +``` +## Multi-Agent Testing Requirements +This session is part of a multi-agent group. After fixing: +1. Verify the fix doesn't break orchestration dispatch (DISPATCH-ROUTE → DISPATCH → SEND) +2. Test that workers still complete and report back to orchestrator +3. Check that PendingOrchestration is cleared after synthesis +4. Run `grep "$GROUP_NAME" ~/.polypilot/event-diagnostics.log | tail -30` to verify event flow +5. If modifying IsProcessing paths, verify all 9 companion fields are cleared (see INV-1) +6. If modifying reconnect paths, verify IsOrphaned guards (see INV-O9-O13) +``` + +--- + +## Live Testing Multi-Agent Orchestration + +When testing multi-agent orchestration on a running PolyPilot instance, use +the event diagnostics log and these checklists to verify correct behavior. + +### Quick Health Check (run this first) + +```bash +GROUP="$GROUP_NAME" # e.g., "PR Review Squad" + +# 1. Is orchestrator processing? +grep "$GROUP-orchestrator" ~/.polypilot/event-diagnostics.log | tail -3 + +# 2. Are workers alive? +for w in 1 2 3 4 5; do + last=$(grep "$GROUP-worker-$w'" ~/.polypilot/event-diagnostics.log 2>/dev/null | tail -1) + [[ -n "$last" ]] && echo "W$w: $last" +done + +# 3. Any completions? +grep "$GROUP" ~/.polypilot/event-diagnostics.log | grep -E "IDLE|COMPLETE|DISPATCH.*completed" | tail -10 + +# 4. Any errors? +grep "$GROUP" ~/.polypilot/event-diagnostics.log | grep -E "ERROR|WATCHDOG" | tail -5 + +# 5. PendingOrchestration state? +cat ~/.polypilot/pending-orchestration.json 2>/dev/null | head -3 || echo "(empty)" +``` + +### Monitoring Orchestrator Dispatch + +```bash +grep "DISPATCH" ~/.polypilot/event-diagnostics.log | grep "$GROUP" | tail -20 +``` + +**Expected sequence for N-worker dispatch:** +``` +[DISPATCH-ROUTE] session='' → mode=Orchestrator +[DISPATCH] SendToMultiAgentGroupAsync: group='', members=N+1 +[DISPATCH] Early dispatch: @worker blocks detected in flushed text +[IDLE] '' CompleteResponse dispatched +[COMPLETE] '' CompleteResponse executing +[DISPATCH] '' iteration 0: K raw assignments +[DISPATCH] Dispatching K tasks: worker-1, worker-2, ... +[DISPATCH] Saved pending orchestration +[SEND] 'worker-1' IsProcessing=true gen=1 +[SEND] 'worker-2' IsProcessing=true gen=1 (1s later) +[SEND] 'worker-3' IsProcessing=true gen=1 (2s later) +``` + +### Monitoring Worker Execution + +**Signs of healthy worker:** +- TurnStart/TurnEnd pairs cycling every 2-30 seconds (tool rounds) +- Eventually a SessionIdleEvent → CompleteResponse → COMPLETE sequence +- No [ERROR] or [WATCHDOG] entries + +**Signs of stuck worker:** +- No TurnEnd/TurnStart for >120s (watchdog will catch at 120s or 600s) +- [WATCHDOG] entries appearing +- Worker stays in TurnStart without TurnEnd for >5 min (long tool call OK, but >10 min suspicious) + +### Monitoring OrchestratorReflect Mode + +Reflect mode runs multiple iterations. Expect this pattern: + +``` +[SEND] orchestrator gen=1 → Plan (dispatches W1) +[DISPATCH] Worker W1 completed → Reflect synthesis +[SEND] orchestrator gen=2 → Evaluate, dispatch W2 +[DISPATCH] Worker W2 completed → Reflect synthesis +[SEND] orchestrator gen=3 → Evaluate, maybe dispatch both +... +[SEND] orchestrator gen=N → Final synthesis (309 chars) → DONE +``` + +**Key observations from live testing (PR Review Squad + Evaluate Ortinau Skills):** +- Orchestrator mode: 3 workers, 1 round, 12 min total, 52s synthesis +- OrchestratorReflect mode: 2 workers, 7 iterations, 26 min total, progressively + shorter responses indicating convergence (6701→4736→507→102 chars) +- Zero-assignment iteration (gen=11 had 0 assignments) handled correctly — + orchestrator re-reflected and dispatched new assignments +- Duplicate IDLE events (SDK bug) handled gracefully — CompleteResponse skipped + +### Full End-to-End Checklist + +1. **Dispatch Phase**: + - [ ] Orchestrator receives user prompt + - [ ] DISPATCH-ROUTE logged with correct mode + - [ ] Early dispatch detects @worker blocks + - [ ] Correct number of assignments parsed + - [ ] PendingOrchestration saved to disk before dispatch + - [ ] Workers staggered with 1s delay + - [ ] Each worker gets [SEND] with gen=1 + +2. **Worker Execution Phase**: + - [ ] Each worker actively processes (TurnStart/TurnEnd cycling) + - [ ] Watchdog Case B correctly defers when events.jsonl is fresh + - [ ] No [ERROR] entries + - [ ] Each worker eventually gets SessionIdleEvent → CompleteResponse + +3. **Collection Phase**: + - [ ] After ALL workers complete, orchestrator synthesis triggered + - [ ] Orchestrator gets [SEND] with new generation + - [ ] No workers stuck in IsProcessing after completion + - [ ] Duplicate IDLE events skipped ("IsProcessing already false") + +4. **Synthesis Phase**: + - [ ] Orchestrator processes synthesis + - [ ] Orchestrator completes (SessionIdleEvent → CompleteResponse) + - [ ] PendingOrchestration file empty/deleted + +5. **Reflection Phase** (OrchestratorReflect only): + - [ ] Orchestrator evaluates worker results after each iteration + - [ ] New iterations dispatch fresh worker assignments + - [ ] Zero-assignment iterations handled (re-reflect or terminate) + - [ ] Response sizes decrease over iterations (convergence signal) + - [ ] Orchestrator terminates after max iterations or goal met + +6. **Error Recovery**: + - [ ] Worker failure → WorkerResult.Success=false in synthesis + - [ ] App restart mid-dispatch → PendingOrchestration resumes + - [ ] Watchdog catches stuck sessions (120s idle / 600s tool) + - [ ] Reconnect during orchestration → TCS canceled, workers get error result + - [ ] IsOrphaned prevents stale callbacks from corrupting active sessions + +### Common Live Test Failures + +| Symptom | Diagnostic Command | Likely Cause | +|---------|-------------------|--------------| +| Workers never start | `grep "SEND.*worker" diagnostics.log` | Dispatch parse failed; check @worker format | +| One worker stuck | `grep "worker-N" diagnostics.log \| tail -5` | SDK bug, watchdog catches at 120-600s | +| Synthesis never sent | `grep "orchestrator.*SEND" diagnostics.log` | Task.WhenAll waiting; check for stuck worker | +| Orchestrator stuck post-synthesis | Check [WATCHDOG] entries | Zero-idle SDK bug; watchdog catches at 30s | +| PendingOrchestration stale | `cat ~/.polypilot/pending-orchestration.json` | Finally block didn't run; check for crash | +| All sessions die after reconnect | Check [RECONNECT] entries | IsOrphaned not set; see INV-O9 | +| Orchestration hangs on reconnect | Check for missing TrySetCanceled | TCS not canceled; see INV-O9 | + +--- + +## Test Coverage & Gaps + +### Existing Test Files + +| File | Tests | Coverage | +|------|-------|----------| +| `MultiAgentRegressionTests.cs` | ~70 | Organization, reconciliation, presets, reflection bugs | +| `ReflectionCycleTests.cs` | ~95 | Sentinels, iteration, stall detection, evaluation | +| `ProcessingWatchdogTests.cs` | ~35 | Session state, abort, reconnect, watchdog constants | +| `MultiAgentGapTests.cs` | ~30 | @worker parsing, task assignments, delegation | + +### ✅ Well-Covered + +- PendingOrchestration save/load/clear (5 tests) +- @worker block parsing (20+ tests) +- Reflection iteration counting (8+ tests) +- Stall detection (5+ tests) +- IsProcessing flag ordering (3 tests) + +### ❌ Critical Gaps (priority tests to add) + +| Gap | Priority | What to test | +|-----|----------|-------------| +| ForceCompleteProcessingAsync | HIGH | All 9 INV-1 fields cleared, TCS resolved, timers canceled | +| Mixed worker success/failure synthesis | HIGH | 2 succeed + 1 fail → synthesis includes both | +| IsOrphaned guard coverage | HIGH | Event on orphaned state → no Info mutation | +| TryUpdate concurrency | HIGH | Stale Task.Run can't overwrite newer reconnect | +| Sibling TCS cancel on reconnect | HIGH | Orphaned worker's TCS → OperationCanceledException | +| Zero-assignment in reflect mode | MEDIUM | 0 assignments → re-reflect or terminate | +| Worker stagger delay | MEDIUM | 1s gap between worker [SEND] timestamps | +| Early dispatch edge cases | MEDIUM | Partial @worker blocks, orphaned blocks | + +### Adding Tests — Quick Reference + +Test stubs are in `PolyPilot.Tests/TestStubs.cs`. Key patterns: ```csharp -var assignments = rawAssignments - .GroupBy(a => a.WorkerName, StringComparer.OrdinalIgnoreCase) - .Select(g => new TaskAssignment(g.Key, string.Join("\n\n---\n\n", g.Select(a => a.Task)))) - .ToList(); +// Use Demo mode for success paths +var settings = new ConnectionSettings { Mode = ConnectionMode.Demo }; +var service = new CopilotService(db, serverManager, bridgeClient, demoService); +await service.ReconnectAsync(settings); + +// Never use Embedded mode (spawns real processes) +// Use Persistent with port 19999 for deterministic failures ``` + +When adding model classes, add `` to `PolyPilot.Tests.csproj`. diff --git a/.claude/skills/processing-state-safety/SKILL.md b/.claude/skills/processing-state-safety/SKILL.md index fbc031cab5..075bd7da50 100644 --- a/.claude/skills/processing-state-safety/SKILL.md +++ b/.claude/skills/processing-state-safety/SKILL.md @@ -34,7 +34,9 @@ Every code path that sets `IsProcessing = false` MUST also: 12. Run on UI thread (via `InvokeOnUI()` or already on UI thread) 13. After changes, run `ProcessingWatchdogTests.cs` to catch regressions -## The 9 Paths That Clear IsProcessing +## The 10 Paths That Set/Clear IsProcessing + +### Paths that CLEAR IsProcessing (→ false) | # | Path | File | Thread | Notes | |---|------|------|--------|-------| @@ -48,6 +50,17 @@ Every code path that sets `IsProcessing = false` MUST also: | 8 | SendAsync initial failure | CopilotService.cs | UI | Prompt send failed | | 9 | Bridge OnTurnEnd | Bridge.cs | Background → InvokeOnUI | Remote mode turn complete | +### Path that RE-ARMS IsProcessing (→ true) + +| # | Path | File | Thread | Notes | +|---|------|------|--------|-------| +| 10 | TurnStart re-arm | Events.cs | Background → InvokeOnUI | Premature session.idle recovery (PR #375) | + +Path #10 fires when `AssistantTurnStartEvent` arrives with `IsProcessing=false` on the +current non-orphaned state. This detects premature `session.idle` (SDK sends idle mid-turn +then continues). Re-arm sets `IsProcessing=true`, restarts the watchdog, and logs `[EVT-REARM]`. +Does NOT create a new TCS — the old one was already completed with partial content. + ## Content Persistence Safety ### Turn-End Flush @@ -170,6 +183,46 @@ Use the class-level `InvokeOnUI()` method in all `Task.Run` and timer callbacks for explicit, unambiguous UI thread dispatch. The local `Invoke` works but the intent is less clear when reading cross-threaded code. +### INV-14: IsOrphaned guards on all event/timer entry points (PR #373) +When a `SessionState` is orphaned (after reconnect creates a replacement): +1. Set `state.IsOrphaned = true` (volatile) +2. Set `ProcessingGeneration = long.MaxValue` (prevents any generation check from passing) +3. Call `state.ResponseCompletion?.TrySetCanceled()` (unblocks orchestrator waits) + +ALL event/timer entry points must check `state.IsOrphaned` and return immediately: +- `HandleSessionEvent` (line ~214) +- `CompleteResponse` (line ~913) — TrySetCanceled + return +- Watchdog loop (line ~1820) — exit loop +- Watchdog InvokeOnUI callbacks (line ~2095) — skip +- Tool health/recovery handlers — skip + +Without this, stale SDK events from the disposed old `CopilotSession` pass through +to the shared `Info` object and corrupt the replacement session's state. + +### INV-15: TryUpdate for atomic state swaps (PR #373) +When replacing a `SessionState` in `_sessions` after reconnect, use +`_sessions.TryUpdate(key, newState, expectedOldState)` instead of +`_sessions[key] = newState`. This prevents a stale `Task.Run` (from an earlier +reconnect) from overwriting a newer reconnect's state. If TryUpdate fails, +discard the result — someone else already updated. + +### INV-16: Register handler BEFORE publishing to dictionary (PR #373) +When creating a new `SessionState` (reconnect or sibling re-resume): +```csharp +resumed.On(evt => HandleSessionEvent(newState, evt)); // 1. Handler first +_sessions.TryUpdate(key, newState, oldState); // 2. Publish second +``` +If reversed, a race window exists where events arrive before the handler is +registered, and those events are lost permanently. + +### INV-17: Sibling re-resume must reload MCP servers (PR #373) +Both the primary reconnect path and the sibling loop must call: +- `cfg.LoadMcpServers()` — MCP server handles are tied to the disposed client +- `cfg.LoadSkillDirectories()` — same issue + +The primary path was missing these until PR #373 Round 5. Asymmetry between +the sibling and primary reconnect configs is a recurring bug pattern. + ## Top 5 Recurring Mistakes 1. **Incomplete cleanup** — modifying one IsProcessing path without diff --git a/PolyPilot.Tests/LongRunningSessionSafetyTests.cs b/PolyPilot.Tests/LongRunningSessionSafetyTests.cs new file mode 100644 index 0000000000..9805baefbc --- /dev/null +++ b/PolyPilot.Tests/LongRunningSessionSafetyTests.cs @@ -0,0 +1,512 @@ +using Microsoft.Extensions.DependencyInjection; +using PolyPilot.Models; +using PolyPilot.Services; + +namespace PolyPilot.Tests; + +/// +/// Safety tests that verify watchdog and session lifecycle changes do NOT +/// prematurely kill legitimate long-running multi-agent sessions. +/// +/// Multi-agent workers routinely run 5-30+ minutes. These tests validate: +/// - Freshness windows are wide enough for real workloads +/// - Revival paths register event handlers +/// - Timeout constants are consistent with observed session durations +/// - Code structure invariants that prevent future regressions +/// +/// ⚠️ Run these tests before merging ANY change to watchdog, Case B, +/// timeout constants, or session lifecycle (revival, reconnect, dispose). +/// See: .claude/skills/multi-agent-orchestration/SKILL.md → "Long-Running Session Safety" +/// +[Collection("BaseDir")] +public class LongRunningSessionSafetyTests +{ + private readonly StubChatDatabase _chatDb = new(); + private readonly StubServerManager _serverManager = new(); + private readonly StubWsBridgeClient _bridgeClient = new(); + private readonly StubDemoService _demoService = new(); + private readonly IServiceProvider _serviceProvider; + + public LongRunningSessionSafetyTests() + { + var services = new ServiceCollection(); + _serviceProvider = services.BuildServiceProvider(); + } + + private const System.Reflection.BindingFlags NonPublic = + System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance; + private const System.Reflection.BindingFlags AnyInstance = + System.Reflection.BindingFlags.Public | System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Instance; + + private static object GetSessionState(CopilotService svc, string sessionName) + { + var sessionsField = typeof(CopilotService).GetField("_sessions", NonPublic)!; + var sessionsDict = sessionsField.GetValue(svc)!; + var tryGetMethod = sessionsDict.GetType().GetMethod("TryGetValue")!; + var args = new object?[] { sessionName, null }; + tryGetMethod.Invoke(sessionsDict, args); + return args[1] ?? throw new InvalidOperationException($"Session '{sessionName}' not found"); + } + + private static T GetField(object state, string fieldName) + { + var field = state.GetType().GetField(fieldName, AnyInstance) + ?? throw new InvalidOperationException($"Field '{fieldName}' not found"); + return (T)field.GetValue(state)!; + } + + private static object GetProp(object state, string propName) + { + // SessionState.Info is a property-like field + var field = state.GetType().GetField(propName, AnyInstance); + if (field != null) return field.GetValue(state)!; + var prop = state.GetType().GetProperty(propName, AnyInstance) + ?? throw new InvalidOperationException($"Property '{propName}' not found"); + return prop.GetValue(state)!; + } + + private CopilotService CreateService() => + new(_chatDb, _serverManager, _bridgeClient, new RepoManager(), _serviceProvider, _demoService); + + private static class TestPaths + { + private static readonly string ProjectRoot = Path.GetFullPath( + Path.Combine(AppContext.BaseDirectory, "..", "..", "..", "..", "PolyPilot")); + + public static string CopilotServiceCs => Path.Combine(ProjectRoot, "Services", "CopilotService.cs"); + public static string EventsCs => Path.Combine(ProjectRoot, "Services", "CopilotService.Events.cs"); + public static string OrganizationCs => Path.Combine(ProjectRoot, "Services", "CopilotService.Organization.cs"); + } + + // ─── Timeout Constant Safety ─── + + [Fact] + public void MultiAgentFreshness_CanAccommodate20MinuteWorker() + { + // Real-world: PR review workers with 5-model dispatch take 10-20 min. + // The freshness window must be wider than the longest expected worker. + var freshnessMinutes = CopilotService.WatchdogMultiAgentCaseBFreshnessSeconds / 60.0; + Assert.True(freshnessMinutes >= 20, + $"Multi-agent freshness ({freshnessMinutes:F0} min) must be >= 20 min to accommodate " + + "long PR review workers. Observed durations: 3-20 min typical, 30 min max."); + } + + [Fact] + public void MultiAgentFreshness_NotSoWideItHidesDeadSessions() + { + // Safety: freshness shouldn't exceed 60 min (worker execution timeout). + // After 60 min, the orchestrator cancels the worker anyway. + var freshnessMinutes = CopilotService.WatchdogMultiAgentCaseBFreshnessSeconds / 60.0; + Assert.True(freshnessMinutes <= 60, + $"Multi-agent freshness ({freshnessMinutes:F0} min) shouldn't exceed worker " + + "execution timeout (60 min). Dead sessions would hide for too long."); + } + + [Fact] + public void StandardFreshness_NotUsedForMultiAgentSessions() + { + // The standard 300s (5 min) freshness is too short for multi-agent workers. + // Verify the code uses the multi-agent constant for multi-agent sessions. + Assert.True(CopilotService.WatchdogCaseBFreshnessSeconds < CopilotService.WatchdogMultiAgentCaseBFreshnessSeconds, + "Standard freshness must be shorter than multi-agent freshness"); + Assert.True(CopilotService.WatchdogCaseBFreshnessSeconds <= 300, + "Standard freshness should be ≤ 5 min for interactive sessions"); + } + + [Fact] + public void CaseBDeferralCap_AllowsFullWorkerExecution() + { + // 40 deferrals × 120s = 4800s = 80 min. This exceeds the 60 min worker + // execution timeout, ensuring the deferral cap is never the binding constraint + // for legitimate workers (the orchestrator cancels first). + var totalDeferralTime = CopilotService.WatchdogMaxCaseBResets * 120; // 120s per check cycle + var workerTimeoutSeconds = 3600; // 60 min + Assert.True(totalDeferralTime >= workerTimeoutSeconds, + $"Total deferral time ({totalDeferralTime}s = {totalDeferralTime / 60} min) must be >= " + + $"worker execution timeout ({workerTimeoutSeconds}s = {workerTimeoutSeconds / 60} min). " + + "Otherwise the deferral cap kills workers before the orchestrator can cancel them."); + } + + [Fact] + public void MaxProcessingTime_AccommodatesReflectLoops() + { + // OrchestratorReflect loops can run 30-60 min (7+ iterations × 5 min each). + // The max processing time safety net must not kill them. + var maxMinutes = CopilotService.WatchdogMaxProcessingTimeSeconds / 60.0; + Assert.True(maxMinutes >= 30, + $"Max processing time ({maxMinutes:F0} min) must be >= 30 min for OrchestratorReflect loops"); + } + + // ─── Code Structure Safety: Case B uses correct freshness for multi-agent ─── + + [Fact] + public void CaseB_UsesIsMultiAgentSession_ToSelectFreshness() + { + var source = File.ReadAllText(TestPaths.EventsCs); + + // The watchdog reads IsMultiAgentSession from state and uses it + // to choose between the two freshness constants. + // isMultiAgentSession is a local variable inside the watchdog loop. + Assert.Contains("IsMultiAgentSession", source); + Assert.Contains("WatchdogMultiAgentCaseBFreshnessSeconds", source); + Assert.Contains("WatchdogCaseBFreshnessSeconds", source); + + // Must NOT have hardcoded numeric freshness values + Assert.DoesNotContain("age < 300", source); + Assert.DoesNotContain("age < 1800", source); + } + + // ─── Revival Path Safety: event handler must be registered ─── + + [Fact] + public void WorkerRevival_RegistersEventHandler() + { + // The revival path in ExecuteWorkerAsync creates a fresh session. + // It MUST register .On(evt => HandleSessionEvent(...)) BEFORE sending. + // Without this, the session has a dead event stream and the watchdog + // is the only recovery — taking 30+ minutes for multi-agent workers. + var source = File.ReadAllText(TestPaths.OrganizationCs); + + // Find the revival section (between "fresh session revival" and next response assignment) + var revivalStart = source.IndexOf("attempting fresh session revival"); + Assert.True(revivalStart > 0, "Revival code must exist in Organization.cs"); + + var revivalEnd = source.IndexOf("SendPromptAndWaitAsync", revivalStart); + Assert.True(revivalEnd > revivalStart, "Revival must call SendPromptAndWaitAsync after setup"); + + var revivalSection = source[revivalStart..revivalEnd]; + + Assert.Contains(".On(evt => HandleSessionEvent(", revivalSection); + } + + [Fact] + public void WorkerRevival_CopiesIsMultiAgentSession() + { + // Fresh SessionState defaults IsMultiAgentSession=false. The revival + // must copy it from the dead state, otherwise the watchdog uses the + // standard 300s freshness instead of the 1800s multi-agent window. + // (SendPromptAsync also sets it, but defense-in-depth.) + var source = File.ReadAllText(TestPaths.OrganizationCs); + + var revivalStart = source.IndexOf("attempting fresh session revival"); + Assert.True(revivalStart > 0); + + var revivalEnd = source.IndexOf("SendPromptAndWaitAsync", revivalStart); + var revivalSection = source[revivalStart..revivalEnd]; + + Assert.Contains("IsMultiAgentSession", revivalSection); + } + + [Fact] + public void WorkerRevival_MarksOldStateOrphaned() + { + // The old SessionState must be marked IsOrphaned so any lingering + // callbacks from the disposed session are no-ops. + var source = File.ReadAllText(TestPaths.OrganizationCs); + + var revivalStart = source.IndexOf("attempting fresh session revival"); + Assert.True(revivalStart > 0); + + var revivalEnd = source.IndexOf("SendPromptAndWaitAsync", revivalStart); + var revivalSection = source[revivalStart..revivalEnd]; + + Assert.Contains("IsOrphaned", revivalSection); + } + + // ─── All session creation paths register event handlers ─── + + [Fact] + public void AllSessionCreationPaths_RegisterEventHandler() + { + // Every path that creates a CopilotSession via the SDK and stores it + // must call .On(evt => HandleSessionEvent(...)). Missing this causes + // dead event streams. + // + // We check that in files that call SDK's CreateSessionAsync (the low-level + // call that returns a CopilotSession), there's a corresponding handler. + // We use a targeted pattern: client.CreateSessionAsync( — the SDK call + // vs the service's own public CreateSessionAsync method. + + // Organization.cs: The revival path creates a session via _client.CreateSessionAsync + var orgSource = File.ReadAllText(TestPaths.OrganizationCs); + var orgSdkCreates = CountOccurrences(orgSource, ".CreateSessionAsync("); + var orgHandlers = CountOccurrences(orgSource, ".On(evt => HandleSessionEvent("); + + Assert.True(orgHandlers >= orgSdkCreates, + $"Organization.cs: Found {orgSdkCreates} SDK CreateSessionAsync calls but only {orgHandlers} " + + $"event handler registrations. Every session creation MUST register a handler."); + + // Verify each CreateSessionAsync call has a handler within 15 lines (same code block) + VerifyHandlerProximity(orgSource, "Organization.cs"); + + // Events.cs: Tool health revival creates sessions + var eventsSource = File.ReadAllText(TestPaths.EventsCs); + var eventsSdkCreates = CountOccurrences(eventsSource, ".CreateSessionAsync("); + var eventsHandlers = CountOccurrences(eventsSource, ".On(evt => HandleSessionEvent("); + + Assert.True(eventsHandlers >= eventsSdkCreates, + $"Events.cs: Found {eventsSdkCreates} SDK CreateSessionAsync calls but only {eventsHandlers} " + + $"event handler registrations. Every session creation MUST register a handler."); + + VerifyHandlerProximity(eventsSource, "Events.cs"); + + // CopilotService.cs: Main session creation and reconnect paths + // Uses count-based check (not proximity) because the reconnect-recovery + // paths have try/retry sharing a single handler after the catch blocks. + var mainSource = File.ReadAllText(TestPaths.CopilotServiceCs); + Assert.True(CountOccurrences(mainSource, ".On(evt => HandleSessionEvent(") >= 3, + "CopilotService.cs must have at least 3 event handler registrations " + + "(create, restore, reconnect primary + sibling)."); + } + + /// + /// Verifies that each SDK CreateSessionAsync call (client.CreateSessionAsync) has an + /// .On(evt => HandleSessionEvent( registration within 15 lines, ensuring handlers + /// aren't missing on individual paths while the file-wide count appears balanced. + /// Skips non-SDK calls like _bridgeClient.CreateSessionAsync and method signatures. + /// + private static void VerifyHandlerProximity(string source, string fileName) + { + var lines = source.Split('\n'); + for (int i = 0; i < lines.Length; i++) + { + if (!lines[i].Contains(".CreateSessionAsync(", StringComparison.Ordinal)) continue; + // Skip comments + var trimmed = lines[i].TrimStart(); + if (trimmed.StartsWith("//") || trimmed.StartsWith("*")) continue; + // Skip non-SDK calls: bridge client, method signatures, test stubs + if (trimmed.Contains("_bridgeClient.", StringComparison.Ordinal)) continue; + if (trimmed.Contains("public ", StringComparison.Ordinal) || + trimmed.Contains("private ", StringComparison.Ordinal) || + trimmed.Contains("internal ", StringComparison.Ordinal)) continue; + // Only match SDK client calls (e.g., client.CreateSessionAsync, _client.CreateSessionAsync) + if (!trimmed.Contains("client.CreateSessionAsync", StringComparison.OrdinalIgnoreCase) && + !trimmed.Contains("_client.CreateSessionAsync", StringComparison.OrdinalIgnoreCase) && + !trimmed.Contains("codespaceClient.CreateSessionAsync", StringComparison.OrdinalIgnoreCase)) + continue; + + // Search forward up to 60 lines for the handler registration. + // 60 lines accommodates retry patterns where the initial try and retry + // share a single handler registration after the try/catch blocks. + bool foundHandler = false; + int searchEnd = Math.Min(i + 60, lines.Length); + for (int j = i; j < searchEnd; j++) + { + if (lines[j].Contains(".On(evt => HandleSessionEvent(", StringComparison.Ordinal)) + { + foundHandler = true; + break; + } + } + + Assert.True(foundHandler, + $"{fileName} line {i + 1}: CreateSessionAsync call has no .On(evt => HandleSessionEvent( " + + $"within 60 lines. Every session creation path must register a handler."); + } + } + + // ─── Simulate long-running session scenarios ─── + + [Fact] + public void LongRunningWorker_IsNotKilledByStandardTimeout() + { + // Verifies that the watchdog timeout selection uses extended multi-agent + // thresholds for multi-agent workers, not the standard 120s inactivity timeout. + // This tests the code STRUCTURE (timeout selection logic) rather than running + // a real timer — the watchdog loop itself is tested in ProcessingWatchdogTests. + var source = File.ReadAllText(TestPaths.EventsCs); + + // The watchdog must check IsMultiAgentSession when selecting freshness threshold + Assert.Contains("IsMultiAgentSession", source); + Assert.Contains("WatchdogMultiAgentCaseBFreshnessSeconds", source); + + // Multi-agent freshness (1800s) must be checked BEFORE falling back to standard (300s) + var multiIdx = source.IndexOf("WatchdogMultiAgentCaseBFreshnessSeconds", StringComparison.Ordinal); + var stdIdx = source.IndexOf("WatchdogCaseBFreshnessSeconds", StringComparison.Ordinal); + Assert.True(multiIdx >= 0, "Multi-agent freshness constant must be referenced in Events.cs"); + Assert.True(stdIdx >= 0, "Standard freshness constant must be referenced in Events.cs"); + + // Also verify the actual constant values ensure long-running workers survive + // 1800s (30 min) multi-agent window >> 120s standard inactivity timeout + var eventsFields = typeof(CopilotService).GetFields( + System.Reflection.BindingFlags.NonPublic | System.Reflection.BindingFlags.Static); + var multiField = eventsFields.FirstOrDefault(f => f.Name == "WatchdogMultiAgentCaseBFreshnessSeconds"); + Assert.NotNull(multiField); // Fail loudly if constant is renamed + var multiValue = Convert.ToInt32(multiField.GetValue(null)); + Assert.True(multiValue >= 1800, + $"WatchdogMultiAgentCaseBFreshnessSeconds={multiValue} must be >= 1800 for long-running workers"); + } + + [Fact] + public async Task LongRunningWorker_SendingFlag_ResetOnCleanAbort() + { + // If a long-running session is aborted, SendingFlag must be cleared + // to allow future sends. Without this, the session deadlocks. + var service = CreateService(); + await service.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + + var sessionName = "test-long-worker"; + await service.CreateSessionAsync(sessionName); + await service.SendPromptAsync(sessionName, "test prompt"); + + // Wait for demo completion + await Task.Delay(300); + + // Abort should clear all processing state + await service.AbortSessionAsync(sessionName); + + var state = GetSessionState(service, sessionName); + var info = GetProp(state, "Info"); + Assert.False((bool)info.GetType().GetProperty("IsProcessing")!.GetValue(info)!, + "IsProcessing should be cleared after abort"); + Assert.Equal(0, (int)info.GetType().GetProperty("ProcessingPhase")!.GetValue(info)!); + } + + [Fact] + public void WatchdogConstants_AreInternallyConsistent() + { + // Verify the relationship between all watchdog constants. + // These invariants prevent changes to one constant from + // silently breaking the timeout hierarchy. + + // Standard freshness < Multi-agent freshness + Assert.True(CopilotService.WatchdogCaseBFreshnessSeconds < + CopilotService.WatchdogMultiAgentCaseBFreshnessSeconds); + + // Inactivity timeout < Tool execution timeout + Assert.True(CopilotService.WatchdogInactivityTimeoutSeconds < + CopilotService.WatchdogToolExecutionTimeoutSeconds); + + // Case B deferral cap × check interval > worker execution timeout + var totalDeferralSeconds = CopilotService.WatchdogMaxCaseBResets * + CopilotService.WatchdogInactivityTimeoutSeconds; + Assert.True(totalDeferralSeconds > 3600, + $"Total deferral time ({totalDeferralSeconds}s) must exceed 60 min worker timeout"); + + // Max processing time > multi-agent freshness + Assert.True(CopilotService.WatchdogMaxProcessingTimeSeconds > + CopilotService.WatchdogMultiAgentCaseBFreshnessSeconds); + } + + [Fact] + public void CaseB_DoesNotUse_MtimeStalenessDetection() + { + // Mtime staleness detection (killing sessions when events.jsonl mtime + // is unchanged for N consecutive checks) is UNSAFE for multi-agent + // workers. The model can think for 5+ minutes between tool rounds, + // during which no events are written and mtime stays frozen. + // + // This test guards against re-introducing mtime staleness detection. + // The correct fix for dead event streams is to register event handlers + // on all session creation paths (see WorkerRevival_RegistersEventHandler). + var source = File.ReadAllText(TestPaths.EventsCs); + var watchdogBody = ExtractMethod(source, "RunProcessingWatchdogAsync"); + + // Must NOT track consecutive mtime changes to declare death + Assert.DoesNotContain("StaleMtimeCount", watchdogBody); + Assert.DoesNotContain("StaleLimit", watchdogBody); + Assert.DoesNotContain("staleMtime", watchdogBody); + Assert.DoesNotContain("mtime unchanged", watchdogBody.ToLowerInvariant()); + } + + [Theory] + [InlineData(5)] // 5 min — typical tool execution + [InlineData(10)] // 10 min — long review worker + [InlineData(20)] // 20 min — max typical worker + [InlineData(29)] // 29 min — just under 30 min freshness window + public void MultiAgentFreshness_DoesNotExpireBefore(int workerMinutes) + { + // A worker that has been running for N minutes should still be within + // the multi-agent freshness window. + var workerSeconds = workerMinutes * 60; + Assert.True(workerSeconds < CopilotService.WatchdogMultiAgentCaseBFreshnessSeconds, + $"A {workerMinutes}-min worker would exceed the multi-agent freshness window " + + $"({CopilotService.WatchdogMultiAgentCaseBFreshnessSeconds}s). " + + "This would cause the watchdog to complete the session prematurely."); + } + + [Theory] + [InlineData(1)] // 1 min pause — well within standard freshness + [InlineData(3)] // 3 min — typical model thinking, within standard + [InlineData(4)] // 4 min — within 5 min standard window + public void ModelThinkingPause_IsWithinFreshnessWindow(int pauseMinutes) + { + // During model thinking pauses (no events written), the events.jsonl + // age grows. The freshness window must accommodate this. + var pauseSeconds = pauseMinutes * 60; + Assert.True(pauseSeconds < CopilotService.WatchdogCaseBFreshnessSeconds, + $"A {pauseMinutes}-min model thinking pause would exceed the STANDARD freshness window " + + $"({CopilotService.WatchdogCaseBFreshnessSeconds}s)."); + } + + [Theory] + [InlineData(5)] // 5 min — long model thinking, needs multi-agent window + [InlineData(10)] // 10 min — very long thinking, needs multi-agent window + [InlineData(20)] // 20 min — extreme, still within multi-agent 30 min + public void LongModelThinkingPause_IsWithinMultiAgentFreshnessWindow(int pauseMinutes) + { + // Multi-agent workers can have very long model thinking pauses. + // The multi-agent freshness window (1800s = 30 min) must accommodate. + var pauseSeconds = pauseMinutes * 60; + Assert.True(pauseSeconds < CopilotService.WatchdogMultiAgentCaseBFreshnessSeconds, + $"A {pauseMinutes}-min model thinking pause would exceed the MULTI-AGENT freshness window " + + $"({CopilotService.WatchdogMultiAgentCaseBFreshnessSeconds}s)."); + } + + // ─── Revival path creates complete SessionState ─── + + [Fact] + public void RevivalPath_CreatesCompleteState() + { + // The revival path must create a SessionState with all required fields. + // Check that the revival section sets up all the state that + // SendPromptAsync and the watchdog depend on. + var source = File.ReadAllText(TestPaths.OrganizationCs); + + var revivalStart = source.IndexOf("attempting fresh session revival"); + Assert.True(revivalStart > 0); + var revivalEnd = source.IndexOf("SendPromptAndWaitAsync", revivalStart); + var revivalSection = source[revivalStart..revivalEnd]; + + // Must have these elements (in any order) + Assert.Contains("new SessionState", revivalSection); + Assert.Contains("IsMultiAgentSession", revivalSection); + Assert.Contains(".On(evt => HandleSessionEvent(", revivalSection); + Assert.Contains("IsOrphaned", revivalSection); + Assert.Contains("TryUpdate", revivalSection); + } + + // ─── Helpers ─── + + private static string ExtractMethod(string source, string methodSignature) + { + var idx = source.IndexOf(methodSignature); + if (idx < 0) return string.Empty; + + int braceCount = 0; + bool foundBrace = false; + int start = idx; + + for (int i = idx; i < source.Length; i++) + { + if (source[i] == '{') { braceCount++; foundBrace = true; } + else if (source[i] == '}') { braceCount--; } + + if (foundBrace && braceCount == 0) + return source[start..(i + 1)]; + } + return source[start..]; + } + + private static int CountOccurrences(string source, string pattern) + { + int count = 0; + int idx = 0; + while ((idx = source.IndexOf(pattern, idx, StringComparison.Ordinal)) >= 0) + { + count++; + idx += pattern.Length; + } + return count; + } +} diff --git a/PolyPilot.Tests/MultiAgentRegressionTests.cs b/PolyPilot.Tests/MultiAgentRegressionTests.cs index fe799684fb..4486180110 100644 --- a/PolyPilot.Tests/MultiAgentRegressionTests.cs +++ b/PolyPilot.Tests/MultiAgentRegressionTests.cs @@ -2121,4 +2121,471 @@ public void PendingOrchestration_AtomicWrite_NoCorruption() } #endregion + + #region Orchestrator-Steer Conflict Tests (PR #375) + + /// + /// CRITICAL REGRESSION: When a user sends a message to a busy orchestrator, + /// Dashboard must queue the message (EnqueueMessage) — NOT steer it. + /// Steering cancels the in-flight orchestration TCS via ProcessingGeneration bump, + /// which causes TaskCanceledException in SendToMultiAgentGroupAsync. + /// + /// Bug scenario: + /// 1. User sends "review these PRs" → orchestrator starts dispatching workers + /// 2. User sends "also check PR #400" while orchestrator is still processing + /// 3. OLD: Dashboard sees IsProcessing=true → calls SteerSessionAsync → cancels orchestration + /// 4. FIX: Dashboard sees IsProcessing=true + orchestrator → calls EnqueueMessage → safe + /// + [Fact] + public void EnqueueMessage_QueuesDrainAfterCompletion() + { + var svc = CreateService(); + CopilotService.SetBaseDirForTesting(TestSetup.TestBaseDir); + + // Create a session via reflection helper + AddDummySessions(svc, "test-orch"); + var info = svc.GetSession("test-orch")!; + + // Enqueue a message + svc.EnqueueMessage("test-orch", "also check PR #400"); + + // The message should be queued + Assert.Equal(1, info.MessageQueue.Count); + var queued = info.MessageQueue.TryDequeue(); + Assert.Equal("also check PR #400", queued); + } + + [Fact] + public void EnqueueMessage_MultipleMessages_QueuedInOrder() + { + var svc = CreateService(); + CopilotService.SetBaseDirForTesting(TestSetup.TestBaseDir); + + AddDummySessions(svc, "test-orch"); + var info = svc.GetSession("test-orch")!; + + svc.EnqueueMessage("test-orch", "message 1"); + svc.EnqueueMessage("test-orch", "message 2"); + svc.EnqueueMessage("test-orch", "message 3"); + + Assert.Equal(3, info.MessageQueue.Count); + + // Drain in order + Assert.Equal("message 1", info.MessageQueue.TryDequeue()); + Assert.Equal("message 2", info.MessageQueue.TryDequeue()); + Assert.Equal("message 3", info.MessageQueue.TryDequeue()); + } + + /// + /// Structural test: Dashboard.razor dispatch routing must check for orchestrator + /// sessions BEFORE the general steer path. This prevents the steer from canceling + /// in-flight orchestrations. + /// + /// Order must be: + /// 1. if (IsProcessing) → check GetOrchestratorGroupId → queue if orchestrator + /// 2. else → SteerSessionAsync (for regular sessions) + /// + [Fact] + public void DashboardDispatch_OrchestratorCheckBeforeSteer() + { + var dashboardPath = Path.GetFullPath( + Path.Combine(AppContext.BaseDirectory, "..", "..", "..", "..", "PolyPilot", + "Components", "Pages", "Dashboard.razor")); + + Assert.True(File.Exists(dashboardPath), $"Dashboard.razor not found at {dashboardPath}"); + + var source = File.ReadAllText(dashboardPath); + + // Find the IsProcessing block that contains both the orchestrator check and steer + var isProcessingIdx = source.IndexOf("if (session?.IsProcessing == true)"); + Assert.True(isProcessingIdx >= 0, "Dashboard must have 'if (session?.IsProcessing == true)' check"); + + // Within the IsProcessing block, orchestrator check must come BEFORE steer + var orchCheckIdx = source.IndexOf("GetOrchestratorGroupId(sessionName)", isProcessingIdx); + var steerIdx = source.IndexOf("SteerSessionAsync(sessionName", isProcessingIdx); + + Assert.True(orchCheckIdx >= 0, "Dashboard must call GetOrchestratorGroupId within the IsProcessing block"); + Assert.True(steerIdx >= 0, "Dashboard must call SteerSessionAsync within the IsProcessing block"); + Assert.True(orchCheckIdx < steerIdx, + $"GetOrchestratorGroupId (pos {orchCheckIdx}) must appear BEFORE " + + $"SteerSessionAsync (pos {steerIdx}) in the IsProcessing block. " + + "If steer fires first, it cancels the in-flight orchestration TCS."); + } + + /// + /// Structural test: Dashboard must use EnqueueMessage (not SteerSessionAsync) + /// when the session is identified as an orchestrator. + /// + [Fact] + public void DashboardDispatch_OrchestratorUsesEnqueueNotSteer() + { + var dashboardPath = Path.GetFullPath( + Path.Combine(AppContext.BaseDirectory, "..", "..", "..", "..", "PolyPilot", + "Components", "Pages", "Dashboard.razor")); + + var source = File.ReadAllText(dashboardPath); + + // Find the orchestrator guard block + var orchCheckIdx = source.IndexOf("GetOrchestratorGroupId(sessionName)"); + Assert.True(orchCheckIdx >= 0); + + // Find the return statement after the EnqueueMessage in the orchestrator block + // The pattern should be: orchGroupId != null → EnqueueMessage → return + var orchBlockStart = orchCheckIdx; + var orchNullCheck = source.IndexOf("orchGroupId != null", orchBlockStart); + Assert.True(orchNullCheck >= 0, "Must check orchGroupId != null"); + + // Within the orchestrator block (between orchGroupId check and the next return), + // EnqueueMessage must be called + var nextReturn = source.IndexOf("return;", orchNullCheck); + Assert.True(nextReturn >= 0); + + var orchBlock = source[orchNullCheck..nextReturn]; + Assert.Contains("EnqueueMessage", orchBlock); + Assert.DoesNotContain("SteerSessionAsync", orchBlock); + } + + /// + /// Structural test: The QUEUED_ORCH_BUSY diagnostic log tag must be present + /// in the orchestrator queue path. This ensures diagnostic tracing is maintained. + /// + [Fact] + public void DashboardDispatch_OrchestratorQueueHasDiagnosticLog() + { + var dashboardPath = Path.GetFullPath( + Path.Combine(AppContext.BaseDirectory, "..", "..", "..", "..", "PolyPilot", + "Components", "Pages", "Dashboard.razor")); + + var source = File.ReadAllText(dashboardPath); + + Assert.Contains("QUEUED_ORCH_BUSY", source); + } + + /// + /// Non-orchestrator sessions that are processing should still be steered. + /// This ensures the fix only affects orchestrator sessions, not regular ones. + /// + [Fact] + public void DashboardDispatch_NonOrchestratorStillSteered() + { + var dashboardPath = Path.GetFullPath( + Path.Combine(AppContext.BaseDirectory, "..", "..", "..", "..", "PolyPilot", + "Components", "Pages", "Dashboard.razor")); + + var source = File.ReadAllText(dashboardPath); + + // After the orchestrator block (orchGroupId != null → EnqueueMessage → return), + // the steer path must still exist for non-orchestrator sessions + var orchReturnIdx = source.IndexOf("QUEUED_ORCH_BUSY"); + Assert.True(orchReturnIdx >= 0); + + // SteerSessionAsync should still appear AFTER the orchestrator block + var steerAfterOrch = source.IndexOf("SteerSessionAsync", orchReturnIdx); + Assert.True(steerAfterOrch >= 0, + "SteerSessionAsync must still be called for non-orchestrator sessions " + + "that are processing. The orchestrator check is a special case, not a replacement."); + } + + /// + /// Tests that GetOrchestratorGroupId returns null for a session that IS in a + /// multi-agent group but as a worker (not orchestrator). This ensures workers + /// can still be steered normally. + /// + [Fact] + public void GetOrchestratorGroupId_WorkerInActiveGroup_ReturnsNull() + { + var svc = CreateService(); + CopilotService.SetBaseDirForTesting(TestSetup.TestBaseDir); + + var group = svc.CreateMultiAgentGroup("Steer Test Team", MultiAgentMode.Orchestrator); + svc.Organization.Sessions.Add(new SessionMeta + { + SessionName = "Steer Test Team-orchestrator", + GroupId = group.Id, + Role = MultiAgentRole.Orchestrator + }); + svc.Organization.Sessions.Add(new SessionMeta + { + SessionName = "Steer Test Team-worker-1", + GroupId = group.Id, + Role = MultiAgentRole.Worker + }); + + // Workers should NOT be identified as orchestrators + Assert.Null(svc.GetOrchestratorGroupId("Steer Test Team-worker-1")); + // Workers can be steered without issue + } + + /// + /// Long-running orchestrator scenario: when an orchestrator has been dispatching + /// workers for 10+ minutes and the user sends a follow-up, it must be queued. + /// This is the exact scenario from the PR Review Squad bug. + /// + [Fact] + public void LongRunningOrchestrator_UserFollowup_MustQueue() + { + var svc = CreateService(); + CopilotService.SetBaseDirForTesting(TestSetup.TestBaseDir); + + // Set up a multi-agent group with orchestrator + var group = svc.CreateMultiAgentGroup("Long Run Team", MultiAgentMode.Orchestrator); + AddDummySessions(svc, "Long Run Team-orchestrator"); + + svc.Organization.Sessions.Add(new SessionMeta + { + SessionName = "Long Run Team-orchestrator", + GroupId = group.Id, + Role = MultiAgentRole.Orchestrator + }); + + // Verify the orchestrator is detected + Assert.Equal(group.Id, svc.GetOrchestratorGroupId("Long Run Team-orchestrator")); + + // User sends a follow-up while orchestrator is busy + svc.EnqueueMessage("Long Run Team-orchestrator", "also review PR #500"); + + // Message should be queued, NOT cause steering + var info = svc.GetSession("Long Run Team-orchestrator")!; + Assert.Equal(1, info.MessageQueue.Count); + } + + #endregion + + #region Premature Idle Recovery Tests (PR #375 — SDK bug #299) + + [Fact] + public void PrematureIdleDetectionWindowMs_IsReasonable() + { + // The detection window must be long enough for EVT-REARM to fire on the UI thread + // after the premature idle, but short enough not to delay normal completions excessively. + Assert.True(CopilotService.PrematureIdleDetectionWindowMs >= 3000, + "Detection window must be >= 3s to allow UI thread EVT-REARM dispatch"); + Assert.True(CopilotService.PrematureIdleDetectionWindowMs <= 10_000, + "Detection window must be <= 10s to avoid excessive delay on normal completions"); + } + + [Fact] + public void PrematureIdleRecoveryTimeoutMs_IsReasonable() + { + // Recovery timeout must accommodate workers with long tool runs (up to 10+ minutes) + // but not exceed the worker execution timeout. + Assert.True(CopilotService.PrematureIdleRecoveryTimeoutMs >= 60_000, + "Recovery timeout must be >= 60s to accommodate worker tool runs"); + Assert.True(CopilotService.PrematureIdleRecoveryTimeoutMs <= 600_000, + "Recovery timeout must be <= 600s (10 min) to not exceed worker timeout"); + } + + [Fact] + public void PrematureIdleSignal_ExistsOnSessionState() + { + // ManualResetEventSlim signal must exist on SessionState for EVT-REARM → ExecuteWorkerAsync signaling + var field = typeof(CopilotService).GetNestedType("SessionState", + System.Reflection.BindingFlags.NonPublic)? + .GetField("PrematureIdleSignal"); + Assert.NotNull(field); + Assert.True(field.FieldType == typeof(ManualResetEventSlim), "PrematureIdleSignal must be a ManualResetEventSlim"); + } + + [Fact] + public void PrematureIdleSignal_SetInRearmPath() + { + // Structural: the EVT-REARM path must call PrematureIdleSignal.Set() + var eventsPath = Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs"); + var source = File.ReadAllText(eventsPath); + + // Find the EVT-REARM block + var rearmIdx = source.IndexOf("[EVT-REARM]", StringComparison.Ordinal); + Assert.True(rearmIdx >= 0, "EVT-REARM diagnostic tag must exist in Events.cs"); + + // Within the next 200 chars after the tag, PrematureIdleSignal must be set + var rearmBlock = source.Substring(rearmIdx, Math.Min(200, source.Length - rearmIdx)); + Assert.Contains("PrematureIdleSignal.Set()", rearmBlock); + } + + [Fact] + public void PrematureIdleSignal_ResetInSendPromptAsync() + { + // Structural: SendPromptAsync must reset PrematureIdleSignal on each new turn + var servicePath = Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.cs"); + var source = File.ReadAllText(servicePath); + + // Find SendPromptAsync method + var sendIdx = source.IndexOf("async Task SendPromptAsync(", StringComparison.Ordinal); + Assert.True(sendIdx >= 0, "SendPromptAsync must exist in CopilotService.cs"); + + var sendBlock = source.Substring(sendIdx, Math.Min(5000, source.Length - sendIdx)); + Assert.Contains("PrematureIdleSignal.Reset()", sendBlock); + } + + [Fact] + public void RecoverFromPrematureIdleIfNeededAsync_ExistsInOrganization() + { + // Structural: the recovery method must exist and be called from ExecuteWorkerAsync + var orgPath = Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Organization.cs"); + var source = File.ReadAllText(orgPath); + + Assert.Contains("RecoverFromPrematureIdleIfNeededAsync", source); + + // Must be called within ExecuteWorkerAsync (find the method definition, not a call site) + var execIdx = source.IndexOf("private async Task ExecuteWorkerAsync", StringComparison.Ordinal); + Assert.True(execIdx >= 0, "ExecuteWorkerAsync method definition must exist"); + var execBlock = source.Substring(execIdx, Math.Min(5000, source.Length - execIdx)); + Assert.Contains("RecoverFromPrematureIdleIfNeededAsync", execBlock); + } + + [Fact] + public void RecoverFromPrematureIdleIfNeededAsync_OnlyForMultiAgentSessions() + { + // Structural: the recovery check must be guarded by IsMultiAgentSession + // to avoid adding latency to normal single-session completions + var orgPath = Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Organization.cs"); + var source = File.ReadAllText(orgPath); + + var execIdx = source.IndexOf("private async Task ExecuteWorkerAsync", StringComparison.Ordinal); + Assert.True(execIdx >= 0, "ExecuteWorkerAsync method definition must exist"); + var execBlock = source.Substring(execIdx, Math.Min(5000, source.Length - execIdx)); + + // Must check IsMultiAgentSession before calling recovery + var recoveryIdx = execBlock.IndexOf("RecoverFromPrematureIdleIfNeededAsync", StringComparison.Ordinal); + Assert.True(recoveryIdx >= 0, "Recovery call must exist in ExecuteWorkerAsync"); + var beforeRecovery = execBlock[..recoveryIdx]; + Assert.Contains("IsMultiAgentSession", beforeRecovery); + } + + [Fact] + public void RecoverFromPrematureIdleIfNeededAsync_SubscribesToOnSessionComplete() + { + // Structural: the recovery method must subscribe to OnSessionComplete to detect + // the worker's real completion (after premature idle re-arm) + var orgPath = Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Organization.cs"); + var source = File.ReadAllText(orgPath); + + // Find the method definition (not a call site) + var methodIdx = source.IndexOf("private async Task RecoverFromPrematureIdleIfNeededAsync", StringComparison.Ordinal); + Assert.True(methodIdx >= 0, "RecoverFromPrematureIdleIfNeededAsync method definition must exist"); + var methodBlock = source.Substring(methodIdx, Math.Min(8000, source.Length - methodIdx)); + + Assert.Contains("OnSessionComplete +=", methodBlock); + Assert.Contains("OnSessionComplete -=", methodBlock); // Must unsubscribe in finally + } + + [Fact] + public void RecoverFromPrematureIdleIfNeededAsync_HasDiskFallback() + { + // Structural: if History doesn't have full content, fall back to LoadHistoryFromDisk + var orgPath = Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Organization.cs"); + var source = File.ReadAllText(orgPath); + + // Find the method definition (not a call site) + var methodIdx = source.IndexOf("private async Task RecoverFromPrematureIdleIfNeededAsync", StringComparison.Ordinal); + Assert.True(methodIdx >= 0, "RecoverFromPrematureIdleIfNeededAsync method definition must exist"); + var methodBlock = source.Substring(methodIdx, Math.Min(8000, source.Length - methodIdx)); + + Assert.Contains("LoadHistoryFromDisk", methodBlock); + } + + [Fact] + public void RecoverFromPrematureIdleIfNeededAsync_HasDiagnosticLogging() + { + // Every recovery path must have diagnostic log entries + var orgPath = Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Organization.cs"); + var source = File.ReadAllText(orgPath); + + // Find the method definition (not a call site) + var methodIdx = source.IndexOf("private async Task RecoverFromPrematureIdleIfNeededAsync", StringComparison.Ordinal); + Assert.True(methodIdx >= 0, "RecoverFromPrematureIdleIfNeededAsync method definition must exist"); + var methodBlock = source.Substring(methodIdx, Math.Min(8000, source.Length - methodIdx)); + + Assert.Contains("[DISPATCH-RECOVER]", methodBlock); + } + + [Fact] + public void MutationBeforeCommit_SessionIdSetAfterTryUpdate() + { + // Structural: SessionId must be set AFTER TryUpdate succeeds, not before. + // This prevents mutating shared Info on a path that might discard the state. + var orgPath = Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Organization.cs"); + var source = File.ReadAllText(orgPath); + + // Find the revival block in ExecuteWorkerAsync + var revivalIdx = source.IndexOf("revived with fresh session", StringComparison.Ordinal); + Assert.True(revivalIdx >= 0, "Revival debug message must exist"); + + // SessionId assignment must be near/after the "revived" message, not before TryUpdate + var tryUpdateIdx = source.IndexOf("TryUpdate(workerName, freshState, deadState)", StringComparison.Ordinal); + Assert.True(tryUpdateIdx >= 0, "TryUpdate call must exist"); + + // Find the SessionId assignment + var sessionIdAssign = source.IndexOf("deadState.Info.SessionId = freshSession.SessionId", StringComparison.Ordinal); + Assert.True(sessionIdAssign >= 0, "SessionId assignment must exist"); + Assert.True(sessionIdAssign > tryUpdateIdx, + "SessionId must be assigned AFTER TryUpdate succeeds (mutation-after-commit pattern)"); + } + + [Fact] + public void RecoverFromPrematureIdleIfNeededAsync_UsesEventsFileFreshness() + { + // Structural: recovery must check events.jsonl freshness as a parallel detection + // signal alongside WasPrematurelyIdled flag. This catches cases where EVT-REARM + // takes 30-60s to fire but the CLI is still writing events. + var orgPath = Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Organization.cs"); + var source = File.ReadAllText(orgPath); + + var methodIdx = source.IndexOf("private async Task RecoverFromPrematureIdleIfNeededAsync", StringComparison.Ordinal); + Assert.True(methodIdx >= 0, "RecoverFromPrematureIdleIfNeededAsync method definition must exist"); + var methodBlock = source.Substring(methodIdx, Math.Min(8000, source.Length - methodIdx)); + + Assert.Contains("IsEventsFileActive", methodBlock); + } + + [Fact] + public void IsEventsFileActive_HelperExists() + { + // Structural: IsEventsFileActive must exist as a helper for events.jsonl freshness checks + var orgPath = Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Organization.cs"); + var source = File.ReadAllText(orgPath); + + var helperIdx = source.IndexOf("private bool IsEventsFileActive(", StringComparison.Ordinal); + Assert.True(helperIdx >= 0, "IsEventsFileActive helper must exist"); + + var helperBlock = source.Substring(helperIdx, Math.Min(1000, source.Length - helperIdx)); + Assert.Contains("GetLastWriteTimeUtc", helperBlock); + Assert.Contains("PrematureIdleEventsFileFreshnessSeconds", helperBlock); + } + + [Fact] + public void RecoverFromPrematureIdleIfNeededAsync_LoopsOnRepeatedPrematureIdle() + { + // Structural: recovery must loop to handle repeated premature idle (observed: 4x in a row). + // After each OnSessionComplete, it checks if events.jsonl is still active before deciding + // the worker is truly done. + var orgPath = Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Organization.cs"); + var source = File.ReadAllText(orgPath); + + var methodIdx = source.IndexOf("private async Task RecoverFromPrematureIdleIfNeededAsync", StringComparison.Ordinal); + Assert.True(methodIdx >= 0, "RecoverFromPrematureIdleIfNeededAsync method definition must exist"); + var methodBlock = source.Substring(methodIdx, Math.Min(8000, source.Length - methodIdx)); + + // Must have a loop for repeated premature idle rounds + Assert.Contains("while (", methodBlock); + Assert.Contains("rounds++", methodBlock); + // Must check events.jsonl freshness inside the loop to decide if worker is truly done + Assert.Contains("IsEventsFileActive", methodBlock); + } + + [Fact] + public void PrematureIdleEventsFileFreshnessSeconds_ConstantExists() + { + // The freshness threshold constant must exist and be reasonable (10-60s range) + var orgPath = Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Organization.cs"); + var source = File.ReadAllText(orgPath); + + Assert.Contains("PrematureIdleEventsFileFreshnessSeconds", source); + + // Verify it's a constant (internal const int) + var constIdx = source.IndexOf("internal const int PrematureIdleEventsFileFreshnessSeconds", StringComparison.Ordinal); + Assert.True(constIdx >= 0, "Must be an internal const int"); + } + + #endregion } diff --git a/PolyPilot.Tests/SessionPersistenceTests.cs b/PolyPilot.Tests/SessionPersistenceTests.cs index 18e5883648..12fa5a3241 100644 --- a/PolyPilot.Tests/SessionPersistenceTests.cs +++ b/PolyPilot.Tests/SessionPersistenceTests.cs @@ -451,6 +451,35 @@ public void MergeSessionEntries_PreservesLastPrompt() Assert.Equal("deploy to production", result[0].LastPrompt); } + [Fact] + public void RestorePreviousSessionsAsync_QueuesEagerResumeForInterruptedSessions() + { + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Persistence.cs")); + + var placeholderIdx = source.IndexOf("Loaded session placeholder", StringComparison.Ordinal); + Assert.True(placeholderIdx > 0, "Placeholder restore block not found"); + + var placeholderBlock = source.Substring(Math.Max(0, placeholderIdx - 1200), Math.Min(1600, source.Length - Math.Max(0, placeholderIdx - 1200))); + Assert.Contains("!string.IsNullOrWhiteSpace(entry.LastPrompt)", placeholderBlock); + Assert.Contains("eagerResumeCandidates.Add((entry.DisplayName, lazyState))", placeholderBlock); + } + + [Fact] + public void RestorePreviousSessionsAsync_RunsInterruptedSessionResumesAfterPlaceholderLoad() + { + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Persistence.cs")); + + var loadIdx = source.IndexOf("Loaded session placeholder", StringComparison.Ordinal); + var eagerResumeIdx = source.IndexOf("await EnsureSessionConnectedAsync(pendingResume.SessionName, pendingResume.State, cancellationToken)", StringComparison.Ordinal); + + Assert.True(loadIdx > 0, "Placeholder restore block not found"); + Assert.True(eagerResumeIdx > 0, "Eager resume loop not found"); + Assert.True(eagerResumeIdx > loadIdx, "Interrupted-session eager resume must run after placeholders are loaded"); + Assert.Contains("Task.Run(async () =>", source); + } + // --- DeleteGroup persistence tests --- [Fact] diff --git a/PolyPilot.Tests/SessionStabilityTests.cs b/PolyPilot.Tests/SessionStabilityTests.cs new file mode 100644 index 0000000000..1cafe847e3 --- /dev/null +++ b/PolyPilot.Tests/SessionStabilityTests.cs @@ -0,0 +1,379 @@ +using System.Text.Json; +using Microsoft.Extensions.DependencyInjection; +using PolyPilot.Models; +using PolyPilot.Services; + +namespace PolyPilot.Tests; + +/// +/// Tests for session stability hardening from PR #373: +/// - IsOrphaned guards on all event/timer entry points +/// - ForceCompleteProcessingAsync INV-1 compliance +/// - Mixed worker success/failure in synthesis prompt +/// - TryUpdate concurrency guard on reconnect +/// - Sibling TCS cancellation on orphan +/// - MCP servers reload on reconnect +/// - Collection snapshots before Task.Run +/// +[Collection("BaseDir")] +public class SessionStabilityTests +{ + private readonly StubChatDatabase _chatDb = new(); + private readonly StubServerManager _serverManager = new(); + private readonly StubWsBridgeClient _bridgeClient = new(); + private readonly StubDemoService _demoService = new(); + private readonly IServiceProvider _serviceProvider; + + public SessionStabilityTests() + { + var services = new ServiceCollection(); + _serviceProvider = services.BuildServiceProvider(); + } + + private CopilotService CreateService() => + new(_chatDb, _serverManager, _bridgeClient, new RepoManager(), _serviceProvider, _demoService); + + // ─── IsOrphaned Guard Tests (source verification) ─── + + [Fact] + public void HandleSessionEvent_ChecksIsOrphaned_BeforeProcessing() + { + var source = File.ReadAllText(TestPaths.EventsCs); + var handleMethod = ExtractMethod(source, "void HandleSessionEvent"); + Assert.Contains("IsOrphaned", handleMethod); + // The orphan check should guard with an immediate return + Assert.Contains("if (state.IsOrphaned)", handleMethod); + } + + [Fact] + public void CompleteResponse_ChecksIsOrphaned_AndCancelsTcs() + { + var source = File.ReadAllText(TestPaths.EventsCs); + var method = ExtractMethod(source, "void CompleteResponse"); + Assert.Contains("IsOrphaned", method); + Assert.Contains("TrySetCanceled", method, + StringComparison.Ordinal); + } + + [Fact] + public void WatchdogLoop_ChecksIsOrphaned_AndExits() + { + var source = File.ReadAllText(TestPaths.EventsCs); + var method = ExtractMethod(source, "RunProcessingWatchdogAsync"); + Assert.Contains("IsOrphaned", method); + } + + [Fact] + public void IsOrphaned_IsVolatile() + { + var source = File.ReadAllText(TestPaths.CopilotServiceCs); + // SessionState must declare IsOrphaned as volatile for cross-thread visibility + Assert.Contains("volatile bool IsOrphaned", source); + } + + // ─── ForceCompleteProcessingAsync INV-1 Tests ─── + + [Fact] + public void ForceCompleteProcessing_ClearsAllInv1Fields() + { + var source = File.ReadAllText(TestPaths.OrganizationCs); + var method = ExtractMethod(source, "ForceCompleteProcessingAsync"); + + // Every INV-1 field must be cleared + var requiredClears = new[] + { + "ActiveToolCallCount", // INV-1 field 3 + "HasUsedToolsThisTurn", // INV-1 field 2 + "SendingFlag", // INV-1 field 7 + "IsResumed", // INV-1 field 1 + "ProcessingStartedAt", // INV-1 field 4 + "ToolCallCount", // INV-1 field 5 + "ProcessingPhase", // INV-1 field 6 + "ClearPermissionDenials", // INV-1 field 8 + "FlushCurrentResponse", // INV-1 field 9 + "IsProcessing", // The flag itself + "OnSessionComplete", // INV-1 field 10 + "TrySetResult", // Resolves the worker TCS + }; + + foreach (var field in requiredClears) + { + Assert.True(method.Contains(field, StringComparison.Ordinal), + $"ForceCompleteProcessingAsync must clear '{field}' (INV-1 compliance)"); + } + } + + [Fact] + public void ForceCompleteProcessing_CancelsTimersBeforeUiThreadWork() + { + var source = File.ReadAllText(TestPaths.OrganizationCs); + var method = ExtractMethod(source, "ForceCompleteProcessingAsync"); + + // Timer cancellation must happen BEFORE InvokeOnUI (thread-safe operations first) + var cancelIdx = method.IndexOf("CancelProcessingWatchdog", StringComparison.Ordinal); + var invokeIdx = method.IndexOf("InvokeOnUI", StringComparison.Ordinal); + Assert.True(cancelIdx >= 0, "CancelProcessingWatchdog must be present in ForceCompleteProcessingAsync"); + Assert.True(invokeIdx >= 0, "InvokeOnUI must be present in ForceCompleteProcessingAsync"); + Assert.True(cancelIdx < invokeIdx, + "Timer cancellation must happen before InvokeOnUI in ForceCompleteProcessingAsync"); + } + + [Fact] + public void ForceCompleteProcessing_SkipsIfNotProcessing() + { + var source = File.ReadAllText(TestPaths.OrganizationCs); + var method = ExtractMethod(source, "ForceCompleteProcessingAsync"); + + // Must early-return if already not processing (idempotent) + Assert.Contains("!state.Info.IsProcessing", method); + } + + // ─── Mixed Worker Success/Failure Synthesis Tests ─── + + [Fact] + public void BuildSynthesisPrompt_IncludesBothSuccessAndFailure() + { + var source = File.ReadAllText(TestPaths.OrganizationCs); + var method = ExtractMethod(source, "string BuildSynthesisPrompt"); + + // Must include success indicator + Assert.Contains("completed", method); + // Must include failure indicator + Assert.Contains("failed", method); + // Must include error text for failed workers + Assert.Contains("result.Error", method); + } + + [Fact] + public void BuildSynthesisPrompt_SanitizesReflectSentinel() + { + var source = File.ReadAllText(TestPaths.OrganizationCs); + var method = ExtractMethod(source, "string BuildSynthesisPrompt"); + + // Worker responses containing the reflect complete sentinel must be sanitized + // to prevent the orchestrator from echoing it and causing false loop termination + Assert.Contains("GROUP_REFLECT_COMPLETE", method); + Assert.Contains("WORKER_APPROVED", method); + } + + // ─── Sibling Re-Resume Safety Tests (source verification) ─── + + [Fact] + public void SiblingReResume_OrphansOldState_BeforeCreatingNew() + { + var source = File.ReadAllText(TestPaths.CopilotServiceCs); + + // Must set IsOrphaned = true on old state during reconnect + Assert.Contains("IsOrphaned = true", source); + // Must cancel old TCS + Assert.Contains("TrySetCanceled", source); + // Must set ProcessingGeneration to max to prevent stale callbacks + Assert.Contains("long.MaxValue", source); + } + + [Fact] + public void SiblingReResume_UsesTryUpdate_NotIndexAssignment() + { + var source = File.ReadAllText(TestPaths.CopilotServiceCs); + + // Reconnect sibling path must use TryUpdate for atomic swap (prevents stale Task.Run overwrite) + // The sibling re-resume code lives inside SendPromptAsync's reconnect-on-failure path + var sendMethod = ExtractMethod(source, "Task SendPromptAsync("); + Assert.False(string.IsNullOrEmpty(sendMethod), "SendPromptAsync method must exist"); + Assert.Contains("TryUpdate", sendMethod); + } + + [Fact] + public void SiblingReResume_SnapshotsCollections_BeforeTaskRun() + { + var source = File.ReadAllText(TestPaths.CopilotServiceCs); + + // Must snapshot Organization.Sessions and Groups before Task.Run + // (List is not thread-safe for concurrent reads during modification) + // The sibling re-resume code lives inside SendPromptAsync's reconnect-on-failure path + var sendMethod = ExtractMethod(source, "Task SendPromptAsync("); + Assert.False(string.IsNullOrEmpty(sendMethod), "SendPromptAsync method must exist"); + Assert.Contains("Sessions.ToList()", sendMethod); + Assert.Contains("Groups.ToList()", sendMethod); + } + + [Fact] + public void SiblingReResume_RegistersHandler_BeforePublishing() + { + var source = File.ReadAllText(TestPaths.CopilotServiceCs); + + // Handler registration (HandleSessionEvent(siblingState)) must appear + // in the SendPromptAsync reconnect path — paired with TryUpdate for correct ordering + var sendMethod = ExtractMethod(source, "Task SendPromptAsync("); + Assert.False(string.IsNullOrEmpty(sendMethod), "SendPromptAsync method must exist"); + Assert.Contains("HandleSessionEvent(siblingState", sendMethod); + + // Handler must appear BEFORE TryUpdate (register before publishing) + var handlerIdx = sendMethod.IndexOf("HandleSessionEvent(siblingState", StringComparison.Ordinal); + var tryUpdateIdx = sendMethod.IndexOf("TryUpdate", StringComparison.Ordinal); + Assert.True(handlerIdx >= 0, "HandleSessionEvent(siblingState must be present in reconnect path"); + Assert.True(tryUpdateIdx >= 0, "TryUpdate must be present in reconnect path"); + Assert.True(handlerIdx < tryUpdateIdx, + "Handler registration must happen BEFORE TryUpdate (no window where events arrive with no handler)"); + } + + [Fact] + public void ReconnectConfig_LoadsMcpServers_BothPaths() + { + var source = File.ReadAllText(TestPaths.CopilotServiceCs); + + // Both sibling and primary reconnect paths must reload MCP servers + var mcpCount = CountOccurrences(source, "LoadMcpServers"); + Assert.True(mcpCount >= 2, + $"Expected LoadMcpServers in both sibling and primary reconnect paths, found {mcpCount} occurrences"); + } + + [Fact] + public void ReconnectConfig_LoadsSkillDirectories_BothPaths() + { + var source = File.ReadAllText(TestPaths.CopilotServiceCs); + + var skillCount = CountOccurrences(source, "LoadSkillDirectories"); + Assert.True(skillCount >= 2, + $"Expected LoadSkillDirectories in both sibling and primary reconnect paths, found {skillCount} occurrences"); + } + + // ─── Diagnostic Log Tag Completeness ─── + + [Fact] + public void AllIsProcessingFalsePaths_HaveDiagnosticLogEntry() + { + // Verify that every IsProcessing = false has a nearby Debug() call + var eventsSource = File.ReadAllText(TestPaths.EventsCs); + var serviceSource = File.ReadAllText(TestPaths.CopilotServiceCs); + + // Events.cs paths + var eventsFalseCount = CountOccurrences(eventsSource, "IsProcessing = false"); + var eventsDebugTagCount = CountPatterns(eventsSource, new[] { + "[COMPLETE]", "[ERROR]", "[WATCHDOG]", "[BRIDGE-COMPLETE]", "[INTERRUPTED]" + }); + Assert.True(eventsDebugTagCount >= eventsFalseCount, + $"Events.cs has {eventsFalseCount} IsProcessing=false paths but only {eventsDebugTagCount} diagnostic tags"); + + // CopilotService.cs paths (ABORT, ERROR, SEND-fail) + var serviceFalseCount = CountOccurrences(serviceSource, "IsProcessing = false"); + var serviceDebugTagCount = CountPatterns(serviceSource, new[] { + "[ABORT]", "[ERROR]", "[SEND]" + }); + // At least the abort paths should have tags + Assert.True(serviceDebugTagCount >= 2, + "CopilotService.cs must have diagnostic tags for abort and send-failure paths"); + } + + // ─── Processing Watchdog Orphan Guard ─── + + [Fact] + public void WatchdogCrashRecovery_ClearsAllCompanionFields() + { + var source = File.ReadAllText(TestPaths.EventsCs); + var watchdogMethod = ExtractMethod(source, "RunProcessingWatchdogAsync"); + + // The crash recovery block (Case C kill) must clear companion fields + var companionFields = new[] + { + "IsProcessing = false", + "ProcessingPhase", + "ProcessingStartedAt", + "ToolCallCount", + }; + + foreach (var field in companionFields) + { + Assert.True(watchdogMethod.Contains(field, StringComparison.Ordinal), + $"Watchdog crash recovery must clear '{field}'"); + } + } + + // ─── Multi-Agent Fix Prompt Enhancement ─── + + [Fact] + public void BuildCopilotPrompt_IncludesMultiAgentSection_InSource() + { + var source = File.ReadAllText(TestPaths.SessionSidebarRazor); + + // Fix prompt must include multi-agent testing requirements when session is in a group + Assert.Contains("Multi-Agent Testing Requirements", source); + Assert.Contains("IsSessionInMultiAgentGroup", source); + } + + [Fact] + public void GetBugReportDebugInfo_IncludesMultiAgentContext_InSource() + { + var source = File.ReadAllText(TestPaths.SessionSidebarRazor); + + // Bug report debug info must include multi-agent context + Assert.Contains("AppendMultiAgentDebugInfo", source); + } + + [Fact] + public void AppendMultiAgentDebugInfo_IncludesEventDiagnostics() + { + var source = File.ReadAllText(TestPaths.SessionSidebarRazor); + + // Multi-agent debug info must include: + Assert.Contains("OrchestratorMode", source); // Group mode + Assert.Contains("event-diagnostics", source); // Recent events + Assert.Contains("pending-orchestration", source); // Pending state + } + + // ─── Helpers ─── + + private static string ExtractMethod(string source, string methodSignature) + { + var idx = source.IndexOf(methodSignature, StringComparison.Ordinal); + if (idx < 0) return ""; + var braceIdx = source.IndexOf('{', idx); + if (braceIdx < 0) return ""; + return source[idx..FindEndOfBlock(source, braceIdx)]; + } + + private static int FindEndOfBlock(string source, int openBraceIdx) + { + int depth = 0; + for (int i = openBraceIdx; i < source.Length; i++) + { + if (source[i] == '{') depth++; + else if (source[i] == '}') { depth--; if (depth == 0) return i + 1; } + } + return source.Length; + } + + private static int FindMatchingBrace(string text) + { + var braceIdx = text.IndexOf('{'); + if (braceIdx < 0) return text.Length; + return FindEndOfBlock(text, braceIdx); + } + + private static int CountOccurrences(string text, string pattern) + { + int count = 0, idx = 0; + while ((idx = text.IndexOf(pattern, idx, StringComparison.Ordinal)) >= 0) + { count++; idx += pattern.Length; } + return count; + } + + private static int CountPatterns(string text, string[] patterns) + { + return patterns.Sum(p => CountOccurrences(text, p)); + } + + /// + /// Centralized source file paths to avoid repetition. + /// + private static class TestPaths + { + private static readonly string ProjectRoot = Path.GetFullPath( + Path.Combine(AppContext.BaseDirectory, "..", "..", "..", "..", "PolyPilot")); + + public static string CopilotServiceCs => Path.Combine(ProjectRoot, "Services", "CopilotService.cs"); + public static string EventsCs => Path.Combine(ProjectRoot, "Services", "CopilotService.Events.cs"); + public static string OrganizationCs => Path.Combine(ProjectRoot, "Services", "CopilotService.Organization.cs"); + public static string SessionSidebarRazor => Path.Combine(ProjectRoot, "Components", "Layout", "SessionSidebar.razor"); + } +} diff --git a/PolyPilot.Tests/WsBridgeIntegrationTests.cs b/PolyPilot.Tests/WsBridgeIntegrationTests.cs index fba7f61888..74fee25bd0 100644 --- a/PolyPilot.Tests/WsBridgeIntegrationTests.cs +++ b/PolyPilot.Tests/WsBridgeIntegrationTests.cs @@ -679,6 +679,7 @@ public async Task RenameSession_ViaClient_RenamesOnServer() using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(10)); var client = await ConnectClientAsync(cts.Token); + await WaitForAsync(() => client.Sessions.Any(s => s.Name == "old-name"), cts.Token); await client.RenameSessionAsync("old-name", "new-name", cts.Token); await WaitForAsync(() => _copilot.GetSession("new-name") != null, cts.Token, maxMs: 8000); @@ -696,6 +697,7 @@ public async Task RenameSession_ViaClient_UpdatesSessionList() using var cts = new CancellationTokenSource(TimeSpan.FromSeconds(10)); var client = await ConnectClientAsync(cts.Token); + await WaitForAsync(() => client.Sessions.Any(s => s.Name == "rename-list"), cts.Token); await client.RenameSessionAsync("rename-list", "renamed-list", cts.Token); await WaitForAsync(() => client.Sessions.Any(s => s.Name == "renamed-list"), cts.Token); diff --git a/PolyPilot/Components/Layout/SessionSidebar.razor b/PolyPilot/Components/Layout/SessionSidebar.razor index 877bd26191..45bddcec8f 100644 --- a/PolyPilot/Components/Layout/SessionSidebar.razor +++ b/PolyPilot/Components/Layout/SessionSidebar.razor @@ -2371,20 +2371,24 @@ else sb.AppendLine($"LastUpdatedAt: {session.LastUpdatedAt:o}"); sb.AppendLine($"WorkingDirectory: {session.WorkingDirectory}"); } + + // Multi-agent context + AppendMultiAgentDebugInfo(sb, selectedBugSession); } try { - var crashPath = Path.Combine( - Environment.GetFolderPath(Environment.SpecialFolder.UserProfile), - ".polypilot", "crash.log"); + var crashPath = Path.Combine(CopilotService.BaseDir, "crash.log"); if (File.Exists(crashPath)) { - var lines = File.ReadAllLines(crashPath); - var tail = lines.Length > 10 ? lines[^10..] : lines; - sb.AppendLine($"--- crash.log (last {tail.Length} lines) ---"); - foreach (var line in tail) - sb.AppendLine(line); + // Read only the tail to avoid loading entire file + var tail = ReadLastLines(crashPath, 10); + if (tail.Length > 0) + { + sb.AppendLine($"--- crash.log (last {tail.Length} lines) ---"); + foreach (var line in tail) + sb.AppendLine(line); + } } } catch { /* ignore */ } @@ -2392,6 +2396,78 @@ else return sb.ToString(); } + private void AppendMultiAgentDebugInfo(System.Text.StringBuilder sb, string sessionName) + { + try + { + // Use thread-safe snapshots to avoid concurrent modification issues + var sessions = CopilotService.SnapshotSessionMetas(); + var groups = CopilotService.SnapshotGroups(); + + var meta = sessions.FirstOrDefault(m => m.SessionName == sessionName); + if (meta?.GroupId == null) return; + + var group = groups.FirstOrDefault(g => g.Id == meta.GroupId); + if (group is not { IsMultiAgent: true }) return; + + sb.AppendLine($"--- Multi-Agent Context ---"); + sb.AppendLine($"Group: {group.Name}"); + sb.AppendLine($"OrchestratorMode: {group.OrchestratorMode}"); + sb.AppendLine($"Role: {meta.Role}"); + + // List all members of the group + var members = sessions.Where(m => m.GroupId == group.Id).ToList(); + foreach (var m in members) + { + var mSession = CopilotService.GetAllSessions() + .FirstOrDefault(s => s.Name == m.SessionName); + var status = mSession?.IsProcessing == true ? "processing" : "idle"; + sb.AppendLine($" {m.Role}: {m.SessionName} ({mSession?.Model ?? "?"}) [{status}]"); + } + + // Include recent event-diagnostics entries for the group + try + { + var diagPath = Path.Combine(CopilotService.BaseDir, "event-diagnostics.log"); + if (File.Exists(diagPath)) + { + var groupPrefix = group.Name; + // Read only the tail to avoid blocking on large logs + var tailLines = ReadLastLines(diagPath, 500); + var groupLines = tailLines + .Where(l => l.Contains(groupPrefix, StringComparison.OrdinalIgnoreCase)) + .TakeLast(30) + .ToArray(); + if (groupLines.Length > 0) + { + sb.AppendLine($"--- Event Diagnostics ({groupPrefix}, last {groupLines.Length} lines) ---"); + foreach (var line in groupLines) + sb.AppendLine(line); + } + } + } + catch { /* best-effort */ } + + // Check PendingOrchestration + try + { + var pendingPath = Path.Combine(CopilotService.BaseDir, "pending-orchestration.json"); + if (File.Exists(pendingPath)) + { + using var fs = new FileStream(pendingPath, FileMode.Open, FileAccess.Read, FileShare.ReadWrite); + using var reader = new StreamReader(fs); + var content = reader.ReadToEnd(); + if (!string.IsNullOrWhiteSpace(content) && content.Trim() != "{}" && content.Trim() != "[]") + sb.AppendLine($"PendingOrchestration: active ({content.Length} chars)"); + else + sb.AppendLine("PendingOrchestration: none"); + } + } + catch { /* best-effort */ } + } + catch { /* multi-agent debug info is best-effort */ } + } + // --- Report Bug: creates a GitHub issue --- private async Task SubmitBugReport() { @@ -2575,8 +2651,28 @@ else } } - private static string BuildCopilotPrompt(string description, string debugInfo, string branchName) + private string BuildCopilotPrompt(string description, string debugInfo, string branchName) { + var isMultiAgent = !string.IsNullOrEmpty(selectedBugSession) && + CopilotService.IsSessionInMultiAgentGroup(selectedBugSession); + + var multiAgentSection = ""; + if (isMultiAgent) + { + multiAgentSection = """ + +## Multi-Agent Testing Requirements +This session is part of a multi-agent orchestration group. After fixing the issue: +1. Verify orchestration dispatch still works — `grep "DISPATCH" ~/.polypilot/event-diagnostics.log | tail -20` should show DISPATCH-ROUTE → DISPATCH → SEND sequence. +2. Verify workers complete and report back — each should get SessionIdleEvent → CompleteResponse. +3. Verify PendingOrchestration is cleared after synthesis — `cat ~/.polypilot/pending-orchestration.json` should be empty. +4. If modifying IsProcessing paths, verify ALL 9 companion fields are cleared (see processing-state-safety skill — INV-1). +5. If modifying reconnect paths, verify IsOrphaned guards prevent stale callbacks (see INV-O9-O13 in multi-agent-orchestration skill). +6. If modifying event handlers, ensure IsOrphaned check is the FIRST guard in the handler. +7. Run `dotnet test PolyPilot.Tests/ --filter "MultiAgent"` to verify multi-agent regression tests pass. +"""; + } + return $""" You are fixing a bug in the PolyPilot app (a .NET MAUI Blazor Hybrid app). @@ -2599,7 +2695,7 @@ You are fixing a bug in the PolyPilot app (a .NET MAUI Blazor Hybrid app). 8. **Verify your branch is based on `main`** — run `git log --oneline upstream/main..HEAD` (or `origin/main..HEAD`) and confirm only YOUR commits appear. If unrelated commits are included, the branch was created from the wrong base. Fix it: create a new branch from `upstream/main` and cherry-pick your commits onto it. 9. Commit your changes to branch `{branchName}` with a descriptive commit message. 10. Push the branch and create a PR against `main` in the PureWeen/PolyPilot repository using `gh pr create`. - +{multiAgentSection} Important conventions: - Never use `static readonly` fields that call platform APIs (use lazy `??=` properties instead). - Tests must NEVER call `ConnectionSettings.Save()` or `ConnectionSettings.Load()`. @@ -2777,4 +2873,30 @@ Important conventions: CopilotService.OnOrchestratorPhaseChanged -= HandleOrchestratorPhaseChanged; RepoManager.OnStateChanged -= OnRepoStateChanged; } + + /// + /// Reads the last N lines from a file without loading the entire file into memory. + /// Falls back to empty array on IOException. + /// + private static string[] ReadLastLines(string path, int count) + { + try + { + using var fs = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite); + using var reader = new StreamReader(fs); + var buffer = new Queue(); + string? line; + while ((line = reader.ReadLine()) != null) + { + buffer.Enqueue(line); + if (buffer.Count > count) + buffer.Dequeue(); + } + return buffer.ToArray(); + } + catch (IOException) + { + return Array.Empty(); + } + } } diff --git a/PolyPilot/Components/Pages/Dashboard.razor b/PolyPilot/Components/Pages/Dashboard.razor index 83721a6ab3..e3f4a047cd 100644 --- a/PolyPilot/Components/Pages/Dashboard.razor +++ b/PolyPilot/Components/Pages/Dashboard.razor @@ -1300,6 +1300,30 @@ if (session?.IsProcessing == true) { + // If this session is an orchestrator in a multi-agent group, queue the message + // instead of steering. Steering cancels the in-flight orchestration TCS which + // aborts worker dispatch. The queued message will be sent after the current + // orchestration completes. + var orchGroupId = CopilotService.GetOrchestratorGroupId(sessionName); + if (orchGroupId != null) + { + CopilotService.LogDispatchRoute(sessionName, true, "QUEUED_ORCH_BUSY", null, null, null, true); + List? queueImagePaths = null; + if (hasImages) + { + queueImagePaths = pendingSendImages!.Select(i => i.TempPath).ToList(); + pendingImagesBySession.Remove(sessionName); + } + CopilotService.EnqueueMessage(sessionName, finalPrompt, queueImagePaths, agentMode); + // Show user feedback that message is queued (orchestrator is busy dispatching workers) + session.History.Add(ChatMessage.SystemMessage( + "📋 Orchestrator is busy coordinating workers — your message has been queued and will be sent when the current operation completes.")); + session.MessageCount = session.History.Count; + _needsScrollToBottom = true; + await InvokeAsync(SafeRefreshAsync); + return; + } + CopilotService.LogDispatchRoute(sessionName, true, "STEER", null, null, null, false); List? steerImagePaths = null; if (hasImages) diff --git a/PolyPilot/Services/CopilotService.Bridge.cs b/PolyPilot/Services/CopilotService.Bridge.cs index cc82bc22a0..4ac164049d 100644 --- a/PolyPilot/Services/CopilotService.Bridge.cs +++ b/PolyPilot/Services/CopilotService.Bridge.cs @@ -459,8 +459,11 @@ private void SyncRemoteSessions() var remoteNames = remoteSessions.Select(s => s.Name).ToHashSet(); foreach (var name in _sessions.Keys.ToList()) { - if (!remoteNames.Contains(name) && !_pendingRemoteSessions.ContainsKey(name)) - _sessions.TryRemove(name, out _); + if (!remoteNames.Contains(name) && !_pendingRemoteSessions.ContainsKey(name) && + _sessions.TryRemove(name, out var removedState)) + { + DisposePrematureIdleSignal(removedState); + } } // Clear pending flag for sessions confirmed by server diff --git a/PolyPilot/Services/CopilotService.Codespace.cs b/PolyPilot/Services/CopilotService.Codespace.cs index 69e5288a22..d887a82f8d 100644 --- a/PolyPilot/Services/CopilotService.Codespace.cs +++ b/PolyPilot/Services/CopilotService.Codespace.cs @@ -789,6 +789,7 @@ private async Task ResumeCodespaceSessionsAsync(SessionGroup group, Cancellation CancelProcessingWatchdog(state); CancelToolHealthCheck(state); + var oldState = state; var newState = new SessionState { Session = newSession, @@ -796,6 +797,7 @@ private async Task ResumeCodespaceSessionsAsync(SessionGroup group, Cancellation }; newSession.On(evt => HandleSessionEvent(newState, evt)); _sessions[meta.SessionName] = newState; + DisposePrematureIdleSignal(oldState); // Remove the placeholder system message and add connected notification (on UI thread) var infoRef = state.Info; diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs index d942f3b99a..2a200f86ec 100644 --- a/PolyPilot/Services/CopilotService.Events.cs +++ b/PolyPilot/Services/CopilotService.Events.cs @@ -510,12 +510,36 @@ void Invoke(Action action) var phaseAdvancedToThinking = state.Info.ProcessingPhase < 2; if (phaseAdvancedToThinking) state.Info.ProcessingPhase = 2; // Thinking Interlocked.Exchange(ref state.ActiveToolCallCount, 0); - Invoke(() => + // Premature session.idle recovery: the SDK sometimes sends session.idle + // mid-turn, then continues processing (ghost events). If we receive a + // TurnStart after IsProcessing was already cleared, re-arm processing + // so the UI shows the session as active and content is properly captured + // via the normal CompleteResponse path on the next session.idle. + if (!state.Info.IsProcessing && isCurrentState && !state.IsOrphaned) { - OnTurnStart?.Invoke(sessionName); - OnActivity?.Invoke(sessionName, "🤔 Thinking..."); - if (phaseAdvancedToThinking) NotifyStateChangedCoalesced(); - }); + Debug($"[EVT-REARM] '{sessionName}' TurnStartEvent arrived after premature session.idle — re-arming IsProcessing"); + state.PrematureIdleSignal.Set(); // Signal to ExecuteWorkerAsync that TCS result was truncated + Invoke(() => + { + if (state.IsOrphaned) return; + state.Info.IsProcessing = true; + state.Info.ProcessingPhase = 2; + state.Info.ProcessingStartedAt ??= DateTime.UtcNow; + StartProcessingWatchdog(state, sessionName); + OnTurnStart?.Invoke(sessionName); + OnActivity?.Invoke(sessionName, "🤔 Thinking..."); + NotifyStateChangedCoalesced(); + }); + } + else + { + Invoke(() => + { + OnTurnStart?.Invoke(sessionName); + OnActivity?.Invoke(sessionName, "🤔 Thinking..."); + if (phaseAdvancedToThinking) NotifyStateChangedCoalesced(); + }); + } break; case AssistantTurnEndEvent: @@ -2316,6 +2340,7 @@ private async Task TryRecoverPermissionAsync(SessionState state, string sessionN state.ResponseCompletion?.TrySetCanceled(); // Create new state preserving Info + var oldState = state; var newState = new SessionState { Session = newSession, @@ -2337,6 +2362,7 @@ private async Task TryRecoverPermissionAsync(SessionState state, string sessionN // Replace in sessions dictionary BEFORE registering event handler // so HandleSessionEvent's isCurrentState check passes for the new state. _sessions[sessionName] = newState; + DisposePrematureIdleSignal(oldState); newSession.On(evt => HandleSessionEvent(newState, evt)); // Bug A fix: Clear IsProcessing + all 9 companion fields so SendPromptAsync diff --git a/PolyPilot/Services/CopilotService.Organization.cs b/PolyPilot/Services/CopilotService.Organization.cs index 959edc13c4..38acb5076d 100644 --- a/PolyPilot/Services/CopilotService.Organization.cs +++ b/PolyPilot/Services/CopilotService.Organization.cs @@ -36,6 +36,20 @@ public partial class CopilotService private static readonly TimeSpan WorkerExecutionTimeout = TimeSpan.FromMinutes(60); private static readonly TimeSpan WorkerExecutionTimeoutRemote = TimeSpan.FromMinutes(10); + /// How long to poll for premature idle indicators after the initial TCS completes. + /// Checks both WasPrematurelyIdled flag (set by EVT-REARM) and events.jsonl freshness + /// (CLI still writing events). The events.jsonl check catches cases where EVT-REARM + /// takes 30-60s to fire. + internal const int PrematureIdleDetectionWindowMs = 10_000; + + /// If events.jsonl was modified within this many seconds of TCS completion, + /// the worker is likely still active despite the premature session.idle. + internal const int PrematureIdleEventsFileFreshnessSeconds = 15; + + /// Maximum time to wait for the worker's real completion after detecting a + /// premature session.idle re-arm. Workers with long tool runs can take minutes. + internal const int PrematureIdleRecoveryTimeoutMs = 120_000; + // Per-session semaphores to prevent concurrent model switches during rapid dispatch private readonly ConcurrentDictionary _modelSwitchLocks = new(); @@ -1465,7 +1479,7 @@ private async Task ExecuteWorkerAsync(string workerName, string ta var workerPrompt = BuildWorkerPrompt(identity, worktreeNote, sharedPrefix, originalPrompt, task); const int maxRetries = 2; - var dispatchTime = DateTime.UtcNow; + var dispatchTime = DateTime.Now; // Pre-dispatch: if worker is still processing from a previous run (e.g., restored // mid-processing after app relaunch), wait for it to become idle. The watchdog will @@ -1497,6 +1511,17 @@ private async Task ExecuteWorkerAsync(string workerName, string ta Debug($"[DISPATCH] Worker '{workerName}' starting (prompt len={workerPrompt.Length}, attempt={attempt})"); var response = await SendPromptAndWaitAsync(workerName, workerPrompt, cancellationToken, originalPrompt: originalPrompt); + // Premature session.idle recovery (SDK bug #299): + // The SDK sometimes sends session.idle mid-turn, completing the TCS with + // truncated content. The EVT-REARM path detects the follow-up TurnStartEvent + // and sets WasPrematurelyIdled=true. Poll briefly for this flag, then wait + // for the worker's real completion and re-collect full content from History. + if (_sessions.TryGetValue(workerName, out var postState) && postState.IsMultiAgentSession) + { + response = await RecoverFromPrematureIdleIfNeededAsync( + workerName, postState, response, dispatchTime, cancellationToken); + } + // Worker revival: empty response means the session died (e.g., dead SSE stream // after reconnect). Create a fresh session and retry once. if (string.IsNullOrWhiteSpace(response)) @@ -1504,6 +1529,8 @@ private async Task ExecuteWorkerAsync(string workerName, string ta Debug($"[DISPATCH] Worker '{workerName}' returned empty — attempting fresh session revival"); if (_sessions.TryGetValue(workerName, out var deadState)) { + // Mark old state as orphaned so any lingering callbacks are no-ops + deadState.IsOrphaned = true; try { await deadState.Session.DisposeAsync(); } catch { } var workerMeta = GetSessionMeta(workerName); @@ -1512,11 +1539,30 @@ private async Task ExecuteWorkerAsync(string workerName, string ta { var freshConfig = BuildFreshSessionConfig(deadState); var freshSession = await client.CreateSessionAsync(freshConfig, cancellationToken); - deadState.Info.SessionId = freshSession.SessionId; var freshState = new SessionState { Session = freshSession, Info = deadState.Info }; - _sessions[workerName] = freshState; - Debug($"[DISPATCH] Worker '{workerName}' revived with fresh session '{freshSession.SessionId}'"); - response = await SendPromptAndWaitAsync(workerName, workerPrompt, cancellationToken, originalPrompt: originalPrompt); + freshState.IsMultiAgentSession = deadState.IsMultiAgentSession; + // Register event handler BEFORE sending — without this, the SDK + // writes events.jsonl but HandleSessionEvent never fires, creating + // a dead event stream where the watchdog is the only recovery path. + freshSession.On(evt => HandleSessionEvent(freshState, evt)); + // Use TryUpdate for atomic swap — prevents a stale Task.Run + // from a concurrent reconnect from overwriting newer state (INV-15). + if (!_sessions.TryUpdate(workerName, freshState, deadState)) + { + Debug($"[DISPATCH] Worker '{workerName}' revival state already replaced — discarding"); + freshState.IsOrphaned = true; + try { await freshSession.DisposeAsync(); } catch { } + DisposePrematureIdleSignal(deadState); + } + else + { + // Commit SessionId only after TryUpdate succeeds — avoids + // mutating shared Info on a path that might discard the state. + deadState.Info.SessionId = freshSession.SessionId; + DisposePrematureIdleSignal(deadState); + Debug($"[DISPATCH] Worker '{workerName}' revived with fresh session '{freshSession.SessionId}'"); + response = await SendPromptAndWaitAsync(workerName, workerPrompt, cancellationToken, originalPrompt: originalPrompt); + } } } } @@ -1578,10 +1624,11 @@ private async Task ExecuteWorkerAsync(string workerName, string ta { try { - var diskHistory = LoadHistoryFromDisk(sessionId); + var diskHistory = await LoadHistoryFromDiskAsync(sessionId); var lastDiskAssistant = diskHistory .LastOrDefault(m => m.Role == "assistant" && !string.IsNullOrWhiteSpace(m.Content) - && m.MessageType == ChatMessageType.Assistant); + && m.MessageType == ChatMessageType.Assistant + && m.Timestamp >= dispatchTime); if (lastDiskAssistant != null) { response = lastDiskAssistant.Content; @@ -1642,6 +1689,210 @@ private async Task SendPromptAndWaitAsync(string sessionName, string pro return await SendPromptAsync(sessionName, prompt, cancellationToken: cts.Token, originalPrompt: originalPrompt); } + /// + /// Detects premature session.idle (SDK bug #299) and recovers the full worker response. + /// After the initial TCS completes, polls briefly for the WasPrematurelyIdled flag set by + /// EVT-REARM. If detected, subscribes to OnSessionComplete and waits for the worker's real + /// completion, then re-collects full content from History or events.jsonl. + /// + private async Task RecoverFromPrematureIdleIfNeededAsync( + string workerName, SessionState state, string? initialResponse, + DateTime dispatchTime, CancellationToken cancellationToken) + { + // Two detection signals (either triggers recovery): + // 1. PrematureIdleSignal (ManualResetEventSlim) — set by EVT-REARM, event-based (efficient) + // 2. events.jsonl freshness — CLI is still writing events despite our TCS completing + // This catches cases where EVT-REARM takes 30-60s to fire + + bool IsPrematureIdleSignalSet() + { + try { return state.PrematureIdleSignal.IsSet; } + catch (ObjectDisposedException) { return false; } + } + + bool WaitForPrematureIdleSignal() + { + try { return state.PrematureIdleSignal.Wait(500, cancellationToken); } + catch (ObjectDisposedException) { return false; } + } + + // Fast path: check if PrematureIdleSignal was already set + var detected = IsPrematureIdleSignalSet(); + + if (!detected) + { + // Check events.jsonl freshness immediately — if the file was just written, + // the CLI is still actively working despite the premature session.idle + detected = IsEventsFileActive(state.Info.SessionId); + } + + if (!detected) + { + // Wait for PrematureIdleSignal OR poll events.jsonl freshness + var detectStart = DateTime.UtcNow; + while ((DateTime.UtcNow - detectStart).TotalMilliseconds < PrematureIdleDetectionWindowMs) + { + // Wait up to 500ms on the signal (exits immediately if Set()) + var signaled = await Task.Run(WaitForPrematureIdleSignal, cancellationToken) + .ConfigureAwait(false); + + if (signaled || cancellationToken.IsCancellationRequested) + { + detected = signaled; + break; + } + + // Re-check events.jsonl freshness each cycle + if (IsEventsFileActive(state.Info.SessionId)) + { + detected = true; + break; + } + } + } + + if (!detected) + return initialResponse; // Normal completion — no premature idle indicators + + var signal = IsPrematureIdleSignalSet() ? "PrematureIdleSignal" : "events.jsonl freshness"; + Debug($"[DISPATCH-RECOVER] Worker '{workerName}' premature idle detected via {signal} — " + + $"truncated response={initialResponse?.Length ?? 0} chars, " + + $"IsProcessing={state.Info.IsProcessing}. Waiting for real completion..."); + + // The worker may hit premature idle repeatedly (observed: 4x in a row), + // so we loop until events.jsonl goes stale (worker truly done). + string? bestResponse = initialResponse; + try + { + using var recoveryCts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken); + recoveryCts.CancelAfter(PrematureIdleRecoveryTimeoutMs); + var rounds = 0; + + while (!recoveryCts.Token.IsCancellationRequested) + { + rounds++; + var completionTcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + void LocalHandler(string name, string _) + { + if (name == workerName) + completionTcs.TrySetResult(true); + } + OnSessionComplete += LocalHandler; + + try + { + // If worker already finished while we were setting up, complete immediately + if (!state.Info.IsProcessing) + completionTcs.TrySetResult(true); + + await using var reg = recoveryCts.Token.Register(() => completionTcs.TrySetResult(false)); + var completed = await completionTcs.Task; + if (!completed) + { + Debug($"[DISPATCH-RECOVER] Worker '{workerName}' recovery timed out after {PrematureIdleRecoveryTimeoutMs / 1000}s " + + $"(round {rounds}) — using best response ({bestResponse?.Length ?? 0} chars)"); + break; + } + + // Collect content from this completion round + ChatMessage[] histSnapshot; + try { histSnapshot = state.Info.History.ToArray(); } + catch (InvalidOperationException) { histSnapshot = Array.Empty(); } + + var latestContent = histSnapshot + .LastOrDefault(m => m.Role == "assistant" && !string.IsNullOrWhiteSpace(m.Content) + && m.MessageType == ChatMessageType.Assistant + && m.Timestamp >= dispatchTime); + + if (latestContent != null && latestContent.Content!.Length > (bestResponse?.Length ?? 0)) + { + bestResponse = latestContent.Content; + Debug($"[DISPATCH-RECOVER] Worker '{workerName}' round {rounds}: collected {bestResponse.Length} chars from History"); + } + + // Check if the worker is truly done or will hit premature idle again + try { await Task.Delay(2000, recoveryCts.Token); } // Brief settle time + catch (OperationCanceledException) when (!cancellationToken.IsCancellationRequested) { break; } + + if (!IsEventsFileActive(state.Info.SessionId) && !state.Info.IsProcessing) + { + Debug($"[DISPATCH-RECOVER] Worker '{workerName}' events.jsonl stale and not processing " + + $"after round {rounds} — worker is truly done ({bestResponse?.Length ?? 0} chars)"); + break; + } + + if (state.Info.IsProcessing) + { + Debug($"[DISPATCH-RECOVER] Worker '{workerName}' still processing after round {rounds} " + + $"(re-armed again) — waiting for next completion..."); + } + } + finally + { + OnSessionComplete -= LocalHandler; + } + } + + // If History didn't have better content, try disk fallback + if ((bestResponse?.Length ?? 0) <= (initialResponse?.Length ?? 0)) + { + var sessionId = state.Info.SessionId; + if (!string.IsNullOrEmpty(sessionId)) + { + try + { + var diskHistory = await LoadHistoryFromDiskAsync(sessionId); + var lastDisk = diskHistory + .LastOrDefault(m => m.Role == "assistant" && !string.IsNullOrWhiteSpace(m.Content) + && m.MessageType == ChatMessageType.Assistant + && m.Timestamp >= dispatchTime); + if (lastDisk != null && lastDisk.Content!.Length > (bestResponse?.Length ?? 0)) + { + Debug($"[DISPATCH-RECOVER] Worker '{workerName}' recovered {lastDisk.Content.Length} chars from events.jsonl"); + return lastDisk.Content; + } + } + catch (Exception ex) + { + Debug($"[DISPATCH-RECOVER] Worker '{workerName}' events.jsonl recovery failed: {ex.Message}"); + } + } + } + + if ((bestResponse?.Length ?? 0) > (initialResponse?.Length ?? 0)) + { + Debug($"[DISPATCH-RECOVER] Worker '{workerName}' final recovery: {bestResponse!.Length} chars " + + $"(was {initialResponse?.Length ?? 0} chars truncated, {rounds} rounds)"); + return bestResponse; + } + + Debug($"[DISPATCH-RECOVER] Worker '{workerName}' recovery found no additional content after {rounds} rounds — " + + $"using original response ({initialResponse?.Length ?? 0} chars)"); + return initialResponse; + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + return bestResponse ?? initialResponse; + } + } + + /// Check if a session's events.jsonl was recently modified, indicating the CLI + /// is still actively working. Used by premature idle recovery to detect truncation + /// when EVT-REARM hasn't fired yet. + private bool IsEventsFileActive(string? sessionId) + { + if (string.IsNullOrEmpty(sessionId)) return false; + try + { + var eventsPath = Path.Combine(SessionStatePath, sessionId, "events.jsonl"); + if (!File.Exists(eventsPath)) return false; + var lastWrite = File.GetLastWriteTimeUtc(eventsPath); + var fileAge = (DateTime.UtcNow - lastWrite).TotalSeconds; + return fileAge < PrematureIdleEventsFileFreshnessSeconds; + } + catch { return false; } + } + private static string BuildWorkerPrompt(string identity, string worktreeNote, string sharedPrefix, string originalPrompt, string task) { return $"{identity}{worktreeNote}\n\nYour response will be collected and synthesized with other workers' responses.\n\n" + @@ -1869,10 +2120,11 @@ private async Task MonitorAndSynthesizeAsync(PendingOrchestration pending, Cance { try { - var diskHistory = LoadHistoryFromDisk(session.SessionId); + var diskHistory = await LoadHistoryFromDiskAsync(session.SessionId); var lastDiskAssistant = diskHistory .LastOrDefault(m => m.Role == "assistant" && !string.IsNullOrWhiteSpace(m.Content) - && m.MessageType == ChatMessageType.Assistant); + && m.MessageType == ChatMessageType.Assistant + && m.Timestamp >= dispatchTimeLocal); if (lastDiskAssistant != null) { diskResponse = lastDiskAssistant.Content; diff --git a/PolyPilot/Services/CopilotService.Persistence.cs b/PolyPilot/Services/CopilotService.Persistence.cs index 3811374fda..499379b311 100644 --- a/PolyPilot/Services/CopilotService.Persistence.cs +++ b/PolyPilot/Services/CopilotService.Persistence.cs @@ -1,4 +1,5 @@ using System.Text.Json; +using GitHub.Copilot.SDK; using PolyPilot.Models; namespace PolyPilot.Services; @@ -290,6 +291,126 @@ private static void BackfillUsageFromEvents(AgentSessionInfo info, string sessio info.CreatedAt = firstTimestamp.Value; } + /// + /// Check if a session belongs to a codespace group. + /// + private bool IsCodespaceSession(string sessionName) + { + // Use snapshots for thread safety — may be called from background threads + var metas = SnapshotSessionMetas(); + var meta = metas.FirstOrDefault(m => m.SessionName == sessionName); + if (meta?.GroupId == null) return false; + var groups = SnapshotGroups(); + var group = groups.FirstOrDefault(g => g.Id == meta.GroupId); + return group?.IsCodespace == true; + } + + /// + /// Lazily connect a placeholder session to the SDK. Called on first interaction + /// (send message) with a session that was loaded at startup without an SDK connection. + /// + private async Task EnsureSessionConnectedAsync(string sessionName, SessionState state, CancellationToken cancellationToken) + { + var connectLock = _sessionConnectLocks.GetOrAdd(sessionName, _ => new SemaphoreSlim(1, 1)); + await connectLock.WaitAsync(cancellationToken); + try + { + if (state.Session != null) return; // already connected + + var sessionId = state.Info.SessionId; + if (string.IsNullOrEmpty(sessionId) || !Guid.TryParse(sessionId, out _)) + throw new InvalidOperationException($"Session '{sessionName}' has no valid session ID for resume."); + + if (!IsInitialized || _client == null) + throw new InvalidOperationException("Copilot is not connected yet. Go to Settings to configure."); + + Debug($"Lazy-resuming session '{sessionName}' (id={sessionId})..."); + + // Use snapshot for thread safety — may be called from ThreadPool via SendPromptAsync + var groupId = SnapshotSessionMetas().FirstOrDefault(m => m.SessionName == sessionName)?.GroupId; + var resumeModel = state.Info.Model ?? DefaultModel; + var resumeWorkDir = state.Info.WorkingDirectory; + + var resumeConfig = new ResumeSessionConfig + { + Model = resumeModel, + WorkingDirectory = resumeWorkDir, + Tools = new List { ShowImageTool.CreateFunction() }, + OnPermissionRequest = AutoApprovePermissions + }; + + CopilotSession copilotSession; + try + { + copilotSession = await GetClientForGroup(groupId).ResumeSessionAsync(sessionId, resumeConfig, cancellationToken); + } + catch (Exception ex) when ( + ex.Message.Contains("Session not found", StringComparison.OrdinalIgnoreCase) || + ex.Message.Contains("corrupt", StringComparison.OrdinalIgnoreCase) || + ex.Message.Contains("session file", StringComparison.OrdinalIgnoreCase) || + IsProcessError(ex)) + { + Debug($"Lazy-resume failed for '{sessionName}': {ex.Message} — creating fresh session"); + copilotSession = await GetClientForGroup(groupId).CreateSessionAsync(new SessionConfig + { + Model = resumeModel, + WorkingDirectory = resumeWorkDir, + Tools = new List { ShowImageTool.CreateFunction() }, + OnPermissionRequest = AutoApprovePermissions + }, cancellationToken); + state.Info.SessionId = copilotSession.SessionId; + FlushSaveActiveSessionsToDisk(); + } + + state.Session = copilotSession; + state.IsMultiAgentSession = IsSessionInMultiAgentGroup(sessionName); + copilotSession.On(evt => HandleSessionEvent(state, evt)); + Debug($"Lazy-resume complete: '{sessionName}'"); + } + finally + { + connectLock.Release(); + } + } + + /// + /// Background wrapper for session restore + post-restore tasks. + /// Runs off the UI thread so the app renders immediately on launch. + /// + private async Task RestoreSessionsInBackgroundAsync(CancellationToken cancellationToken) + { + try + { + await RestorePreviousSessionsAsync(cancellationToken); + } + catch (Exception ex) + { + Debug($"Background restore failed: {ex.GetType().Name}: {ex.Message}"); + } + finally + { + IsRestoring = false; + + // Flush session list so recreated session IDs are persisted + FlushSaveActiveSessionsToDisk(); + + if (CodespacesEnabled) + StartCodespaceHealthCheck(); + + // ReconcileOrganization reads/writes Organization.Sessions (a plain List) + // which is not thread-safe. Now that restore runs on ThreadPool via Task.Run, + // we must marshal this to the UI thread to avoid concurrent enumeration crashes. + InvokeOnUI(() => + { + ReconcileOrganization(); + OnStateChanged?.Invoke(); + }); + + // Resume any pending orchestration dispatch interrupted by relaunch + _ = ResumeOrchestrationIfPendingAsync(cancellationToken); + } + } + /// /// Load and resume all previously active sessions /// @@ -299,7 +420,7 @@ public async Task RestorePreviousSessionsAsync(CancellationToken cancellationTok { try { - var json = await File.ReadAllTextAsync(ActiveSessionsFile, cancellationToken); + var json = await File.ReadAllTextAsync(ActiveSessionsFile, cancellationToken).ConfigureAwait(false); var entries = JsonSerializer.Deserialize>(json); if (entries != null && entries.Count > 0) { @@ -327,6 +448,8 @@ public async Task RestorePreviousSessionsAsync(CancellationToken cancellationTok activeEvaluators.Add(g.ReflectionState.EvaluatorSessionName); } + var eagerResumeCandidates = new List<(string SessionName, SessionState State)>(); + foreach (var entry in entries) { try @@ -390,9 +513,40 @@ public async Task RestorePreviousSessionsAsync(CancellationToken cancellationTok continue; } - await ResumeSessionAsync(entry.SessionId, entry.DisplayName, entry.WorkingDirectory, entry.Model, cancellationToken, entry.LastPrompt, entry.GroupId); + // Create lightweight placeholder — actual SDK resume happens lazily + // when user sends a message (EnsureSessionConnectedAsync). + // This avoids 41 sequential SDK connections blocking app startup. + var lazyHistory = LoadHistoryFromDisk(entry.SessionId); + var lazyModel = Models.ModelHelper.NormalizeToSlug(entry.Model ?? DefaultModel); + if (string.IsNullOrEmpty(lazyModel)) lazyModel = DefaultModel; + var lazyWorkDir = entry.WorkingDirectory ?? GetSessionWorkingDirectory(entry.SessionId); + + var lazyInfo = new AgentSessionInfo + { + Name = entry.DisplayName, + Model = lazyModel, + CreatedAt = DateTime.Now, + SessionId = entry.SessionId, + WorkingDirectory = lazyWorkDir + }; + lazyInfo.GitBranch = GetGitBranch(lazyInfo.WorkingDirectory); + foreach (var msg in lazyHistory) lazyInfo.History.Add(msg); + lazyInfo.MessageCount = lazyInfo.History.Count; + lazyInfo.LastReadMessageCount = lazyInfo.History.Count; + // Mark stale incomplete tool calls / reasoning as complete + foreach (var msg in lazyInfo.History.Where(m => (m.MessageType == ChatMessageType.ToolCall || m.MessageType == ChatMessageType.Reasoning) && !m.IsComplete)) + msg.IsComplete = true; + + var lazyState = new SessionState { Session = null!, Info = lazyInfo }; + _sessions[entry.DisplayName] = lazyState; + _activeSessionName ??= entry.DisplayName; RestoreUsageStats(entry); - Debug($"Restored session: {entry.DisplayName}"); + if (!string.IsNullOrWhiteSpace(entry.LastPrompt)) + { + eagerResumeCandidates.Add((entry.DisplayName, lazyState)); + Debug($"Queued eager resume for interrupted session: {entry.DisplayName}"); + } + Debug($"Loaded session placeholder: {entry.DisplayName} ({lazyHistory.Count} messages)"); } catch (Exception ex) { @@ -488,6 +642,29 @@ public async Task RestorePreviousSessionsAsync(CancellationToken cancellationTok } } } + + if (eagerResumeCandidates.Count > 0) + { + var pendingResumes = eagerResumeCandidates.ToArray(); + _ = Task.Run(async () => + { + foreach (var pendingResume in pendingResumes) + { + try + { + await EnsureSessionConnectedAsync(pendingResume.SessionName, pendingResume.State, cancellationToken).ConfigureAwait(false); + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + break; + } + catch (Exception resumeEx) + { + Debug($"Eager resume failed for '{pendingResume.SessionName}': {resumeEx.Message}"); + } + } + }, cancellationToken); + } IsRestoring = false; } diff --git a/PolyPilot/Services/CopilotService.Providers.cs b/PolyPilot/Services/CopilotService.Providers.cs index c2c6951fc1..8eedf8ab3a 100644 --- a/PolyPilot/Services/CopilotService.Providers.cs +++ b/PolyPilot/Services/CopilotService.Providers.cs @@ -356,7 +356,8 @@ private void SyncProviderMembers(ISessionProvider provider, string groupId) .Where(k => k != leaderName && k.StartsWith(prefix) && k.EndsWith(suffix) && !expectedNames.Contains(k)) .ToList()) { - _sessions.TryRemove(existing, out _); + if (_sessions.TryRemove(existing, out var removedState)) + DisposePrematureIdleSignal(removedState); _sessionToProviderId.TryRemove(existing, out _); var meta = Organization.Sessions.FirstOrDefault(m => m.SessionName == existing); if (meta != null) RemoveSessionMeta(meta); diff --git a/PolyPilot/Services/CopilotService.Utilities.cs b/PolyPilot/Services/CopilotService.Utilities.cs index c5f249ed58..6f184e6bca 100644 --- a/PolyPilot/Services/CopilotService.Utilities.cs +++ b/PolyPilot/Services/CopilotService.Utilities.cs @@ -343,9 +343,11 @@ internal static bool IsProcessError(Exception ex) } /// - /// Load conversation history from events.jsonl + /// Load conversation history from events.jsonl asynchronously with proper file sharing. + /// Uses FileShare.ReadWrite to avoid contention with concurrent SDK writes during + /// premature idle recovery scenarios. /// - private List LoadHistoryFromDisk(string sessionId) + private async Task> LoadHistoryFromDiskAsync(string sessionId) { var history = new List(); var eventsFile = Path.Combine(SessionStatePath, sessionId, "events.jsonl"); @@ -358,7 +360,16 @@ private List LoadHistoryFromDisk(string sessionId) // Track tool calls by ID so we can update them when complete var toolCallMessages = new Dictionary(); - foreach (var line in File.ReadLines(eventsFile)) + // Open file with FileShare.ReadWrite to allow concurrent reads/writes + using var stream = new FileStream( + eventsFile, + FileMode.Open, + FileAccess.Read, + FileShare.ReadWrite); + using var reader = new StreamReader(stream); + + string? line; + while ((line = await reader.ReadLineAsync().ConfigureAwait(false)) != null) { if (string.IsNullOrWhiteSpace(line)) continue; @@ -475,6 +486,24 @@ private List LoadHistoryFromDisk(string sessionId) return history; } + /// + /// Synchronous wrapper for LoadHistoryFromDiskAsync for callers that can't await. + /// This is a temporary bridge; callers should be updated to use async version. + /// + private List LoadHistoryFromDisk(string sessionId) + { + // For synchronous contexts, use blocking wait on the async version + // This is not ideal but maintains backward compatibility during transition + try + { + return LoadHistoryFromDiskAsync(sessionId).GetAwaiter().GetResult(); + } + catch + { + return new List(); + } + } + // Dock badge for completed sessions private int _badgeCount; diff --git a/PolyPilot/Services/CopilotService.cs b/PolyPilot/Services/CopilotService.cs index 7ce29f526a..4f447d23ef 100644 --- a/PolyPilot/Services/CopilotService.cs +++ b/PolyPilot/Services/CopilotService.cs @@ -56,6 +56,9 @@ public partial class CopilotService : IAsyncDisposable // Serializes the IsConnectionError reconnect path so concurrent workers // don't destroy each other's freshly-created client (thundering herd fix). private readonly SemaphoreSlim _clientReconnectLock = new(1, 1); + // Serializes per-session lazy/eager SDK reconnects so background restore and + // the first user send can't both resume the same placeholder concurrently. + private readonly ConcurrentDictionary _sessionConnectLocks = new(); private static readonly object _pathLock = new(); private static string? _copilotBaseDir; @@ -475,12 +478,23 @@ private class SessionState /// Timestamp (UTC ticks) when AssistantTurnEndEvent was received. /// Used by zero-idle capture to measure fallback wait duration. public long TurnEndReceivedAtTicks; + /// Signals when EVT-REARM fires (premature session.idle followed by + /// TurnStartEvent while IsProcessing=false). Used by ExecuteWorkerAsync to detect + /// that the initial TCS result was truncated and the worker is still running. + /// Reset in SendPromptAsync (new turn start). + /// Uses ManualResetEventSlim for event-based signaling (no polling). + public readonly ManualResetEventSlim PrematureIdleSignal = new ManualResetEventSlim(initialState: false); /// Set to true when this state is replaced by a reconnect. Prevents orphaned /// event handlers (still registered on the old CopilotSession) from processing events /// or clearing IsProcessing on the shared Info object. public volatile bool IsOrphaned; } + private static void DisposePrematureIdleSignal(SessionState? state) + { + try { state?.PrematureIdleSignal?.Dispose(); } catch { } + } + private void Debug(string message) { LastDebugMessage = message; @@ -780,28 +794,16 @@ public async Task InitializeAsync(CancellationToken cancellationToken = default) // Load organization state FIRST (groups, pinning, sorting) so reconcile during restore doesn't wipe it LoadOrganization(); - // Restore previous sessions (includes subscribing to untracked server sessions in Persistent mode) + // Session restore runs in the background so the UI renders immediately. + // With many sessions (40+), sequential ResumeSessionAsync calls can take + // minutes — blocking here shows a blue screen until all are connected. IsRestoring = true; OnStateChanged?.Invoke(); - await RestorePreviousSessionsAsync(cancellationToken); - IsRestoring = false; - - // Flush session list immediately after restore so that any session IDs changed - // during fallback recreation (resume failed → CreateSessionAsync) are persisted. - // Without this, active-sessions.json retains stale IDs and LoadHistoryFromDisk - // reads the wrong events.jsonl on the next restart. - FlushSaveActiveSessionsToDisk(); - - // Start health check loop for any codespace groups (regardless of whether sessions were restored) - if (CodespacesEnabled) - StartCodespaceHealthCheck(); - - // Reconcile now that all sessions are restored - ReconcileOrganization(); - OnStateChanged?.Invoke(); - - // Resume any pending orchestration dispatch that was interrupted by a relaunch - _ = ResumeOrchestrationIfPendingAsync(cancellationToken); + // CRITICAL: Must use Task.Run to ensure restore runs on ThreadPool, not UI SyncContext. + // Without Task.Run, the async continuations run on the UI thread. LoadHistoryFromDisk's + // .GetAwaiter().GetResult() then blocks the UI thread waiting for async file I/O whose + // continuation needs the UI thread → classic SyncContext deadlock → blue screen. + _ = Task.Run(() => RestoreSessionsInBackgroundAsync(cancellationToken)); // Initialize any registered providers (from DI / plugin loader) await InitializeProvidersAsync(cancellationToken); @@ -1894,14 +1896,20 @@ ALWAYS run the relaunch script as the final step after making changes to this pr IsCreating = true }; // If a session with this name already exists, dispose it to avoid leaking the SDK session - if (_sessions.TryGetValue(name, out var existing) && existing.Session != null) + SessionState? replacedState = null; + if (_sessions.TryGetValue(name, out var existing)) { - try { await existing.Session.DisposeAsync(); } catch { } + replacedState = existing; + if (existing.Session != null) + { + try { await existing.Session.DisposeAsync(); } catch { } + } } var state = new SessionState { Session = null!, Info = info }; var previousActiveSessionName = _activeSessionName; _sessions[name] = state; + DisposePrematureIdleSignal(replacedState); _activeSessionName = name; if (!Organization.Sessions.Any(m => m.SessionName == name)) AddSessionMeta(new SessionMeta { SessionName = name, GroupId = groupId ?? SessionGroup.DefaultId }); @@ -1914,7 +1922,8 @@ ALWAYS run the relaunch script as the final step after making changes to this pr } catch (OperationCanceledException) { - _sessions.TryRemove(name, out _); + if (_sessions.TryRemove(name, out var removedState)) + DisposePrematureIdleSignal(removedState); RemoveSessionMetasWhere(m => m.SessionName == name); _activeSessionName = previousActiveSessionName; OnStateChanged?.Invoke(); @@ -1930,7 +1939,8 @@ ALWAYS run the relaunch script as the final step after making changes to this pr Organization.Groups.Any(g => g.Id == groupId && g.IsCodespace); if (isCodespaceSession) { - _sessions.TryRemove(name, out _); + if (_sessions.TryRemove(name, out var removedState)) + DisposePrematureIdleSignal(removedState); RemoveSessionMetasWhere(m => m.SessionName == name); _activeSessionName = previousActiveSessionName; OnStateChanged?.Invoke(); @@ -1951,7 +1961,8 @@ ALWAYS run the relaunch script as the final step after making changes to this pr try { if (_client != null) await _client.DisposeAsync(); } catch { } _client = null; IsInitialized = false; - _sessions.TryRemove(name, out _); + if (_sessions.TryRemove(name, out var removedState)) + DisposePrematureIdleSignal(removedState); RemoveSessionMetasWhere(m => m.SessionName == name); _activeSessionName = previousActiveSessionName; OnStateChanged?.Invoke(); @@ -1976,7 +1987,8 @@ ALWAYS run the relaunch script as the final step after making changes to this pr try { if (_client != null) await _client.DisposeAsync(); } catch { } _client = null; IsInitialized = false; - _sessions.TryRemove(name, out _); + if (_sessions.TryRemove(name, out var removedState)) + DisposePrematureIdleSignal(removedState); RemoveSessionMetasWhere(m => m.SessionName == name); _activeSessionName = previousActiveSessionName; OnStateChanged?.Invoke(); @@ -1988,7 +2000,8 @@ ALWAYS run the relaunch script as the final step after making changes to this pr try { if (_client != null) await _client.DisposeAsync(); } catch { } _client = null; IsInitialized = false; - _sessions.TryRemove(name, out _); + if (_sessions.TryRemove(name, out var removedState)) + DisposePrematureIdleSignal(removedState); RemoveSessionMetasWhere(m => m.SessionName == name); _activeSessionName = previousActiveSessionName; OnStateChanged?.Invoke(); @@ -2002,7 +2015,8 @@ ALWAYS run the relaunch script as the final step after making changes to this pr } catch (OperationCanceledException) { - _sessions.TryRemove(name, out _); + if (_sessions.TryRemove(name, out var removedState)) + DisposePrematureIdleSignal(removedState); RemoveSessionMetasWhere(m => m.SessionName == name); _activeSessionName = previousActiveSessionName; OnStateChanged?.Invoke(); @@ -2013,7 +2027,8 @@ ALWAYS run the relaunch script as the final step after making changes to this pr try { if (_client != null) await _client.DisposeAsync(); } catch { } _client = null; IsInitialized = false; - _sessions.TryRemove(name, out _); + if (_sessions.TryRemove(name, out var removedState)) + DisposePrematureIdleSignal(removedState); RemoveSessionMetasWhere(m => m.SessionName == name); _activeSessionName = previousActiveSessionName; OnStateChanged?.Invoke(); @@ -2023,7 +2038,8 @@ ALWAYS run the relaunch script as the final step after making changes to this pr catch { // SDK creation failed — remove the optimistic placeholder and restore prior state - _sessions.TryRemove(name, out _); + if (_sessions.TryRemove(name, out var removedState)) + DisposePrematureIdleSignal(removedState); RemoveSessionMetasWhere(m => m.SessionName == name); _activeSessionName = previousActiveSessionName; OnStateChanged?.Invoke(); @@ -2379,6 +2395,7 @@ public async Task ChangeModelAsync(string sessionName, string newModel, Ca // Build replacement state, preserving info/history state.Info.Model = normalizedModel; + var oldState = state; var newState = new SessionState { Session = newSession, @@ -2386,6 +2403,7 @@ public async Task ChangeModelAsync(string sessionName, string newModel, Ca }; newSession.On(evt => HandleSessionEvent(newState, evt)); _sessions[sessionName] = newState; + DisposePrematureIdleSignal(oldState); Debug($"Model switched for '{sessionName}' to {normalizedModel}"); SaveActiveSessionsToDisk(); @@ -2452,7 +2470,7 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis throw new InvalidOperationException("Session is still being created. Please wait."); // Placeholder codespace sessions have Session=null until the tunnel connects - if (state.Session == null) + if (state.Session == null && IsCodespaceSession(sessionName)) throw new InvalidOperationException("This session is waiting for its codespace to connect. Please wait for the green status dot."); if (state.Info.IsProcessing) @@ -2463,6 +2481,22 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis if (Interlocked.CompareExchange(ref state.SendingFlag, 1, 0) != 0) throw new InvalidOperationException("Session is already processing a request."); + // Lazy resume INSIDE the SendingFlag guard to prevent double-resume race: + // without this, two rapid sends could both see Session==null and both call + // EnsureSessionConnectedAsync concurrently, leaking the first resumed session. + if (state.Session == null) + { + try + { + await EnsureSessionConnectedAsync(sessionName, state, cancellationToken); + } + catch + { + Interlocked.Exchange(ref state.SendingFlag, 0); + throw; + } + } + long myGeneration = 0; // will be set right after the generation increment inside try try @@ -2479,6 +2513,7 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis state.Info.ClearPermissionDenials(); Interlocked.Exchange(ref state.ActiveToolCallCount, 0); // Reset stale tool count from previous turn state.HasUsedToolsThisTurn = false; // Reset stale tool flag from previous turn + state.PrematureIdleSignal.Reset(); // Clear premature idle detection from previous turn state.FallbackCanceledByTurnStart = false; Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); Interlocked.Exchange(ref state.EventCountThisTurn, 0); // Reset event counter for zero-idle capture @@ -2738,8 +2773,10 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis Debug($"[RECONNECT] Sibling '{kvp.Key}' already replaced by another reconnect — discarding"); siblingState.IsOrphaned = true; try { await resumed.DisposeAsync(); } catch { } + DisposePrematureIdleSignal(otherState); continue; } + DisposePrematureIdleSignal(otherState); Debug($"[RECONNECT] Re-resumed sibling session '{kvp.Key}' after client recreation"); } catch (Exception reEx) @@ -2876,6 +2913,7 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis // FlushedResponse contains text from earlier FlushCurrentResponse calls — // this is real output the worker produced before the connection died. var preservedFlushed = state.FlushedResponse.ToString(); + var oldState = state; var newState = new SessionState { Session = newSession, @@ -2897,6 +2935,7 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis newState.IsMultiAgentSession = state.IsMultiAgentSession; newSession.On(evt => HandleSessionEvent(newState, evt)); _sessions[sessionName] = newState; + DisposePrematureIdleSignal(oldState); state = newState; // Increment generation AFTER registering the event handler so that any @@ -3707,6 +3746,7 @@ internal async Task CloseSessionCoreAsync(string name, bool notifyUi) CancelProcessingWatchdog(state); CancelTurnEndFallback(state); CancelToolHealthCheck(state); + DisposePrematureIdleSignal(state); // Dispose the SDK session AFTER UI has updated — DisposeAsync talks to the CLI // process and may trigger additional SDK events on background threads. Running it