diff --git a/.claude/skills/processing-state-safety/SKILL.md b/.claude/skills/processing-state-safety/SKILL.md index b7a9e016e7..2861b2a042 100644 --- a/.claude/skills/processing-state-safety/SKILL.md +++ b/.claude/skills/processing-state-safety/SKILL.md @@ -10,8 +10,8 @@ description: > (6) Adding diagnostic log tags, (7) Modifying session restore paths (RestoreSingleSessionAsync) that must initialize watchdog-dependent state, (8) Modifying ReconcileOrganization or any code that reads Organization.Sessions - during the IsRestoring window. Covers: 9 invariants from 8 PRs of fix cycles, - the 8 code paths that clear IsProcessing, and common regression patterns. + during the IsRestoring window. Covers: 13 invariants from 10 PRs of fix cycles, + the 9 code paths that clear IsProcessing, and common regression patterns. --- # Processing State Safety @@ -32,18 +32,19 @@ Every code path that sets `IsProcessing = false` MUST also: 11. Add a diagnostic log entry (`[COMPLETE]`, `[ERROR]`, `[ABORT]`, etc.) 12. Run on UI thread (via `InvokeOnUI()` or already on UI thread) -## The 8 Paths That Clear IsProcessing +## The 9 Paths That Clear IsProcessing | # | Path | File | Thread | Notes | |---|------|------|--------|-------| | 1 | CompleteResponse | Events.cs | UI (via Invoke) | Normal completion | | 2 | SessionErrorEvent | Events.cs | Background → InvokeOnUI | SDK error | -| 3 | Watchdog timeout | Events.cs | Timer → InvokeOnUI | No events for 120s/600s | -| 4 | AbortSessionAsync (local) | CopilotService.cs | UI | User clicks Stop | -| 5 | AbortSessionAsync (remote) | CopilotService.cs | UI | Mobile stop | -| 6 | SendAsync reconnect failure | CopilotService.cs | UI | Prompt send failed after reconnect | -| 7 | SendAsync initial failure | CopilotService.cs | UI | Prompt send failed | -| 8 | Bridge OnTurnEnd | Bridge.cs | Background → InvokeOnUI | Remote mode turn complete | +| 3 | Watchdog timeout (kill) | Events.cs | Timer → InvokeOnUI | No events for 120s/600s, server dead, or max time exceeded (Case C) | +| 4 | Watchdog clean complete | Events.cs | Timer → InvokeOnUI | Tools done, lost terminal event → calls CompleteResponse (Case B, PR #332) | +| 5 | AbortSessionAsync (local) | CopilotService.cs | UI | User clicks Stop | +| 6 | AbortSessionAsync (remote) | CopilotService.cs | UI | Mobile stop | +| 7 | SendAsync reconnect failure | CopilotService.cs | UI | Prompt send failed after reconnect | +| 8 | SendAsync initial failure | CopilotService.cs | UI | Prompt send failed | +| 9 | Bridge OnTurnEnd | Bridge.cs | Background → InvokeOnUI | Remote mode turn complete | ## Content Persistence Safety @@ -61,7 +62,7 @@ that crash the app. The DB is a write-through cache; `events.jsonl` is the sourc and replays on session resume via `BulkInsertAsync`. DB write failures are self-healing. **NEVER narrow the ChatDatabase catch filters** — use `catch (Exception ex)` always. -## 9 Invariants +## 13 Invariants ### INV-1: Complete state cleanup Every IsProcessing=false path clears ALL fields. See checklist above. @@ -108,12 +109,60 @@ When `ReconcileOrganization` hasn't run yet (during `IsRestoring` window), during this window must call `ReconcileOrganization(allowPruning: false)` first. This additive mode safely adds missing entries without pruning loading sessions. +### INV-10: TurnEnd fallback must not be permanently suppressed (PR #332) +The `AssistantTurnEndEvent` 4s fallback → `CompleteResponse` guards against +premature firing during multi-tool sessions. **Do NOT** use `HasUsedToolsThisTurn` +to skip this fallback entirely — that permanently disables recovery for all +agent sessions and leaves them 100% dependent on `SessionIdleEvent`. If that +event is dropped (SDK bug #299), the session sticks for 600s. + +**Correct approach**: Use `ActiveToolCallCount > 0` to skip the 4s fallback +(tools are still running). If tools are done (`ActiveToolCallCount == 0`) but +`HasUsedToolsThisTurn` is true, use an extended 30s delay +(`TurnEndIdleToolFallbackAdditionalMs = 30_000`). The cancellation token from +`AssistantTurnStartEvent` is the correct mechanism to prevent premature firing +when the LLM does multi-round tool use. + +### INV-11: Watchdog must distinguish active tools from lost events (PR #332) +Blindly waiting the full 600s tool timeout when `ActiveToolCallCount == 0` +(tools finished) is wrong — the SDK may have silently dropped the terminal event +(`SessionIdleEvent`). The watchdog timeout path must use a 3-way branch: + +- **Case A** (`hasActiveTool && server alive`): Probe `_serverManager.IsServerRunning()` + (TCP port check). If alive → reset `LastEventAtTicks` and continue. If dead → fall through to kill. +- **Case B** (`!hasActiveTool && HasUsedToolsThisTurn && !exceededMaxTime`): Call + `CompleteResponse` cleanly (no error message) then `break`. Lost terminal event scenario. +- **Case C** (default): Kill with "⚠️ Session appears stuck" error message. Max time + exceeded, server dead, or something genuinely wrong. + +This prevents the "10-minute kill" where tools ran successfully but the session +was murdered because the SDK dropped the follow-up `SessionIdleEvent`. + +### INV-12: All background→UI dispatches must capture ProcessingGeneration (PR #332) +Any code that posts work to the UI thread from a background thread (watchdog loop, +`Task.Run`, timer callbacks) must: +1. Capture `var gen = Interlocked.Read(ref state.ProcessingGeneration)` **before** the `InvokeOnUI` call +2. Validate `if (Interlocked.Read(ref state.ProcessingGeneration) != gen) return;` **inside** the lambda + +Without this guard, a stale watchdog tick (racing with abort+resend) can flush +content from a new turn into the old turn's history. Every Case B and Case C +watchdog callback has this guard; the periodic flush callback must too. + +### INV-13: Use InvokeOnUI() (class method) in Task.Run closures (PR #332) +The local `Invoke(Action)` function inside `HandleSessionEvent` (declared at +line ~249) can have scoping ambiguity when captured by `Task.Run` closures. +Use the class-level `InvokeOnUI()` method in all `Task.Run` and timer callbacks +for explicit, unambiguous UI thread dispatch. The local `Invoke` works but the +intent is less clear when reading cross-threaded code. + ## Top 5 Recurring Mistakes 1. **Incomplete cleanup** — modifying one IsProcessing path without updating ALL fields that must be cleared simultaneously. -2. **ActiveToolCallCount as sole tool signal** — gets reset/skipped - in several paths; always check `HasUsedToolsThisTurn` too. +2. **Suppressing the TurnEnd fallback for tool sessions** — using `HasUsedToolsThisTurn` + to skip the fallback entirely leaves agent sessions with zero recovery when + `SessionIdleEvent` is dropped. Use `ActiveToolCallCount` to guard and an + extended delay for the tool-used case. (PR #332) 3. **Background thread mutations** — mutating IsProcessing or related state on SDK event threads instead of marshaling to UI thread. 4. **Missing content flush on turn boundaries** — `FlushCurrentResponse` @@ -128,7 +177,10 @@ This additive mode safely adds missing entries without pruning loading sessions. `IsMultiAgentSession` not being set during restore, causing the watchdog to use 120s instead of 600s for multi-agent workers. +**Retired mistake (was #2):** *ActiveToolCallCount as sole tool signal* — still relevant per +INV-5, but the more impactful version is #2 above (suppressing the fallback entirely). + ## Regression History -9 PRs of fix/regression cycles: #141 → #147 → #148 → #153 → #158 → #163 → #164 → #276 → #284. +10 PRs of fix/regression cycles: #141 → #147 → #148 → #153 → #158 → #163 → #164 → #276 → #284 → #332. See `references/regression-history.md` for the full timeline with root causes. diff --git a/.claude/skills/processing-state-safety/references/regression-history.md b/.claude/skills/processing-state-safety/references/regression-history.md index 3ac8d4057a..571d156cdb 100644 --- a/.claude/skills/processing-state-safety/references/regression-history.md +++ b/.claude/skills/processing-state-safety/references/regression-history.md @@ -70,3 +70,49 @@ - **Pattern**: This is a variant of mistake #5 ("Missing state initialization on session restore") — the restore path must independently initialize ALL state that `SendPromptAsync` initializes, because events/watchdog/dispatch all depend on it. + +## PR #332 — Rescue Stuck Chat Sessions (TurnEnd Fallback + Smart Watchdog) + +### Root Cause 1: TurnEnd fallback permanently disabled for tool-using sessions +- **Symptom**: Messages stop streaming mid-response (appear stuck, need "prod" to continue). + More precisely: `SessionIdleEvent` is lost (SDK bug #299) and nothing else fires `CompleteResponse`. +- **Root cause**: `HasUsedToolsThisTurn = true` caused the 4s `AssistantTurnEndEvent` fallback + to be **completely skipped** (not just delayed). For any session that ever used tools, the guard + permanently disabled recovery. The session then waited the full 600s watchdog timeout. +- **Fix**: Use `ActiveToolCallCount > 0` to skip the 4s fallback (tools genuinely running). + When `ActiveToolCallCount == 0` but `HasUsedToolsThisTurn == true`, wait an additional 30s + (`TurnEndIdleToolFallbackAdditionalMs = 30_000`) then fire `CompleteResponse`. Total wait: 34s + instead of 600s. The `AssistantTurnStartEvent` CTS cancels the fallback if the LLM starts a new round. + +### Root Cause 2: Watchdog couldn't distinguish active tools from lost SDK events +- **Symptom**: Sessions with long-running tools (builds, tests) killed after 600s even when + the tool finished successfully and the SDK dropped the terminal event. +- **Root cause**: Watchdog timeout path only had one behavior: kill with error. It couldn't tell + "tool still running" from "tool done, lost SessionIdleEvent." +- **Fix**: 3-way branch in `RunProcessingWatchdogAsync` at the timeout threshold: + - **Case A** (`hasActiveTool && !exceededMaxTime && !demo && !remote`): + Probe `_serverManager.IsServerRunning()` (TCP check). If alive → reset `LastEventAtTicks` + continue. + If dead → fall through to Case C (kill). + - **Case B** (`!hasActiveTool && HasUsedToolsThisTurn && !exceededMaxTime`): + Call `CompleteResponse(state, watchdogGen)` then `break`. Clean completion, no error message. + This is now **Path #4 that clears IsProcessing** (added to the table in SKILL.md). + - **Case C** (default): Kill with "⚠️ Session appears stuck" error (original behavior). + +### Additional fixes +- **Periodic watchdog flush** — Every 15s, if `CurrentResponse` has content, flush to History. + Ensures user sees streaming content even when all SDK events stop (midway through long tool). + Uses `ProcessingGeneration` guard (captured before `InvokeOnUI`, validated inside lambda) to + prevent stale ticks from flushing new-turn content into the old turn. +- **ExternalToolRequestedEvent** — Added to `SdkEventMatrix` as `TimelineOnly`. Was arriving as + "Unhandled" causing log spam without any functional impact. +- **InvokeOnUI() in TurnEnd fallback Task.Run** — Switched from local `Invoke()` function to + class-level `InvokeOnUI()` for unambiguous intent in cross-thread closures (INV-13). +- **44 behavioral safety tests** + 7 watchdog tests + 4 TurnEnd fallback tests = 55 new tests. + All use source-code assertions and reflection-based state inspection to verify invariants. + +### Key insight +These two bugs affected EVERY agent session because: (a) agents always use tools, so +`HasUsedToolsThisTurn` is always `true`, and (b) agent tasks frequently take >10 min (build, +test, research). The bugs compounded: fallback disabled → 600s watchdog is only hope → watchdog +kills after 600s → user loses the entire turn. PR #332 reduced worst-case stuck-session +recovery from 600s to 34s for lost-event scenarios. diff --git a/PolyPilot.Tests/ChatExperienceSafetyTests.cs b/PolyPilot.Tests/ChatExperienceSafetyTests.cs new file mode 100644 index 0000000000..5f8deaac2b --- /dev/null +++ b/PolyPilot.Tests/ChatExperienceSafetyTests.cs @@ -0,0 +1,984 @@ +using Microsoft.Extensions.DependencyInjection; +using PolyPilot.Models; +using PolyPilot.Services; +using System.Reflection; +using System.Text; + +namespace PolyPilot.Tests; + +/// +/// Behavioral safety tests for the PolyPilot chat experience. +/// These tests exercise the REAL CopilotService pipeline (CompleteResponse, +/// watchdog state, AbortSessionAsync) via reflection — NOT demo mode shortcuts. +/// +/// Covers the recurring bug patterns: +/// 1. "Chat messages get killed while doing long-running processes" (watchdog timeouts) +/// 2. "Turns get lost / messages disappear" (generation counter races, content flushing) +/// 3. State not fully cleaned up after errors/abort (INV-1 violations) +/// +/// Regression test history: PRs #141→#147→#148→#153→#158→#163→#164→#276→#284→#330 +/// +public class ChatExperienceSafetyTests +{ + // ========================================================================= + // Infrastructure — reflection helpers to access private CopilotService guts + // ========================================================================= + + private readonly StubChatDatabase _chatDb = new(); + private readonly StubServerManager _serverManager = new(); + private readonly StubWsBridgeClient _bridgeClient = new(); + private readonly StubDemoService _demoService = new(); + private readonly RepoManager _repoManager = new(); + private readonly IServiceProvider _serviceProvider; + + private static readonly BindingFlags NonPublic = BindingFlags.NonPublic | BindingFlags.Instance; + private static readonly BindingFlags AnyInstance = BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Instance; + + public ChatExperienceSafetyTests() + { + _serviceProvider = new ServiceCollection().BuildServiceProvider(); + } + + private CopilotService CreateService() => + new CopilotService(_chatDb, _serverManager, _bridgeClient, _repoManager, _serviceProvider, _demoService); + + // These helpers use string-based reflection to access private members. Renaming any private + // member silently breaks tests at runtime rather than compile time. + // TODO: Consider adding [assembly: InternalsVisibleTo("PolyPilot.Tests")] to PolyPilot.csproj + // for compile-time safety on internal members. + + /// Gets the private SessionState object from CopilotService._sessions dictionary. + private static object GetSessionState(CopilotService svc, string sessionName) + { + var sessionsField = typeof(CopilotService).GetField("_sessions", NonPublic)!; + var sessionsDict = sessionsField.GetValue(svc)!; + var tryGetMethod = sessionsDict.GetType().GetMethod("TryGetValue")!; + var args = new object?[] { sessionName, null }; + tryGetMethod.Invoke(sessionsDict, args); + return args[1] ?? throw new InvalidOperationException($"Session '{sessionName}' not found in _sessions"); + } + + /// Invokes the private CompleteResponse method via reflection. + private static void InvokeCompleteResponse(CopilotService svc, object sessionState, long? expectedGeneration = null) + { + var method = typeof(CopilotService).GetMethod("CompleteResponse", NonPublic)!; + method.Invoke(svc, new object?[] { sessionState, expectedGeneration }); + } + + /// Invokes the private FlushCurrentResponse method via reflection. + private static void InvokeFlushCurrentResponse(CopilotService svc, object sessionState) + { + var method = typeof(CopilotService).GetMethod("FlushCurrentResponse", NonPublic)!; + method.Invoke(svc, new object?[] { sessionState }); + } + + /// Gets a field from SessionState by name. + private static T GetField(object state, string fieldName) + { + var field = state.GetType().GetField(fieldName, AnyInstance) + ?? throw new InvalidOperationException($"Field '{fieldName}' not found on {state.GetType().Name}"); + return (T)field.GetValue(state)!; + } + + /// Sets a field on SessionState by name. + private static void SetField(object state, string fieldName, object? value) + { + var field = state.GetType().GetField(fieldName, AnyInstance) + ?? throw new InvalidOperationException($"Field '{fieldName}' not found on {state.GetType().Name}"); + field.SetValue(state, value); + } + + /// Gets the CurrentResponse StringBuilder from SessionState. + private static StringBuilder GetCurrentResponse(object state) + { + var prop = state.GetType().GetProperty("CurrentResponse", AnyInstance)!; + return (StringBuilder)prop.GetValue(state)!; + } + + /// Gets the FlushedResponse StringBuilder from SessionState. + private static StringBuilder GetFlushedResponse(object state) + { + var prop = state.GetType().GetProperty("FlushedResponse", AnyInstance)!; + return (StringBuilder)prop.GetValue(state)!; + } + + /// Gets or sets the ResponseCompletion TCS from SessionState. + private static TaskCompletionSource? GetResponseCompletion(object state) + { + var prop = state.GetType().GetProperty("ResponseCompletion", AnyInstance)!; + return (TaskCompletionSource?)prop.GetValue(state); + } + private static void SetResponseCompletion(object state, TaskCompletionSource? tcs) + { + var prop = state.GetType().GetProperty("ResponseCompletion", AnyInstance)!; + prop.SetValue(state, tcs); + } + + /// + /// Sets up a session in a "dirty" processing state, simulating a mid-turn state + /// where all flags are set as if tools were executing. + /// + private static void SetupDirtyProcessingState(object state, AgentSessionInfo info) + { + info.IsProcessing = true; + info.IsResumed = true; + info.ProcessingStartedAt = DateTime.UtcNow.AddSeconds(-30); + info.ToolCallCount = 5; + info.ProcessingPhase = 3; + SetField(state, "SendingFlag", 1); + SetField(state, "ActiveToolCallCount", 3); + SetField(state, "HasUsedToolsThisTurn", true); + SetField(state, "SuccessfulToolCountThisTurn", 2); + } + + private static string GetRepoRoot() + { + var dir = AppContext.BaseDirectory; + while (dir != null && !File.Exists(Path.Combine(dir, "PolyPilot.slnx"))) + dir = Path.GetDirectoryName(dir); + return dir ?? throw new InvalidOperationException("Could not find repo root"); + } + + // ========================================================================= + // A. CompleteResponse Behavioral Tests + // These call the REAL CompleteResponse via reflection — no demo mode. + // ========================================================================= + + /// + /// INV-1: CompleteResponse must clear ALL processing state fields. + /// This is the most critical invariant — incomplete cleanup causes stuck sessions. + /// Regression: PRs #141, #148, #158, #163, #164 all missed fields. + /// + [Fact] + public async Task CompleteResponse_ClearsAllINV1Fields() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("inv1-complete"); + + var state = GetSessionState(svc, "inv1-complete"); + SetupDirtyProcessingState(state, session); + + // Set up TCS + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + SetResponseCompletion(state, tcs); + + // Act: call CompleteResponse with no generation check + InvokeCompleteResponse(svc, state, null); + + // Assert: ALL INV-1 fields must be cleared + Assert.False(session.IsProcessing, "IsProcessing must be false after CompleteResponse"); + Assert.False(session.IsResumed, "IsResumed must be false after CompleteResponse"); + Assert.Null(session.ProcessingStartedAt); + Assert.Equal(0, session.ToolCallCount); + Assert.Equal(0, session.ProcessingPhase); + + var sendingFlag = GetField(state, "SendingFlag"); + Assert.Equal(0, sendingFlag); + + var activeToolCount = GetField(state, "ActiveToolCallCount"); + Assert.Equal(0, activeToolCount); + + var hasUsedTools = GetField(state, "HasUsedToolsThisTurn"); + Assert.False(hasUsedTools); + + var successfulToolCount = GetField(state, "SuccessfulToolCountThisTurn"); + Assert.Equal(0, successfulToolCount); + } + + /// + /// CompleteResponse is idempotent: calling it when IsProcessing is already false + /// must NOT add duplicate messages or throw. (Watchdog may beat IDLE callback.) + /// + [Fact] + public async Task CompleteResponse_WhenIsProcessingAlreadyFalse_NoOp() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("idempotent-test"); + + var state = GetSessionState(svc, "idempotent-test"); + session.IsProcessing = false; // Already cleared + + var historyCountBefore = session.History.Count; + + // Act: CompleteResponse should be a no-op + InvokeCompleteResponse(svc, state, null); + + Assert.False(session.IsProcessing); + Assert.Equal(historyCountBefore, session.History.Count); + } + + /// + /// INV-3: Generation guard prevents stale IDLE from completing wrong turn. + /// This is THE race condition fix from PR #147. Without it, a queued IDLE + /// callback from turn N can complete turn N+1 when the user sends rapidly. + /// + [Fact] + public async Task CompleteResponse_GenerationMismatch_Skips() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("gen-guard-test"); + + var state = GetSessionState(svc, "gen-guard-test"); + session.IsProcessing = true; + + // Set generation to 5 (simulating 5th prompt sent) + SetField(state, "ProcessingGeneration", 5L); + + // Act: CompleteResponse with stale generation 3 + InvokeCompleteResponse(svc, state, 3L); + + // Assert: should NOT have completed — IsProcessing stays true + Assert.True(session.IsProcessing, "CompleteResponse must skip when generation mismatches"); + } + + /// + /// Generation guard allows completion when generations match. + /// + [Fact] + public async Task CompleteResponse_GenerationMatch_Executes() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("gen-match-test"); + + var state = GetSessionState(svc, "gen-match-test"); + session.IsProcessing = true; + SetField(state, "ProcessingGeneration", 5L); + SetField(state, "SendingFlag", 1); + + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + SetResponseCompletion(state, tcs); + + // Act: CompleteResponse with matching generation + InvokeCompleteResponse(svc, state, 5L); + + // Assert: should have completed + Assert.False(session.IsProcessing, "CompleteResponse must execute when generation matches"); + } + + /// + /// CompleteResponse with null generation always executes (used by error/watchdog paths). + /// + [Fact] + public async Task CompleteResponse_NullGeneration_AlwaysExecutes() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("null-gen-test"); + + var state = GetSessionState(svc, "null-gen-test"); + session.IsProcessing = true; + SetField(state, "ProcessingGeneration", 99L); + SetField(state, "SendingFlag", 1); + + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + SetResponseCompletion(state, tcs); + + // Act: null generation skips the generation check entirely + InvokeCompleteResponse(svc, state, null); + + Assert.False(session.IsProcessing, "CompleteResponse with null generation must always execute"); + } + + /// + /// CompleteResponse must flush accumulated content to History before clearing state. + /// Without this (PR #158 regression), content in CurrentResponse is silently lost. + /// + [Fact] + public async Task CompleteResponse_FlushesContentToHistory() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("flush-content-test"); + + var state = GetSessionState(svc, "flush-content-test"); + session.IsProcessing = true; + SetField(state, "SendingFlag", 1); + + // Simulate accumulated response text (from AssistantMessageDeltaEvent) + var currentResponse = GetCurrentResponse(state); + currentResponse.Append("This is the model's response text that must not be lost."); + + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + SetResponseCompletion(state, tcs); + + var historyCountBefore = session.History.Count; + + // Act + InvokeCompleteResponse(svc, state, null); + + // Assert: content was flushed to History + Assert.True(session.History.Count > historyCountBefore, + "CompleteResponse must add accumulated content to History"); + var lastMessage = session.History.Last(); + Assert.Equal("assistant", lastMessage.Role); + Assert.Contains("model's response text", lastMessage.Content); + } + + /// + /// CompleteResponse must include FlushedResponse (from mid-turn flushes on TurnEnd) + /// in the TCS result. Without this, orchestrator dispatch gets empty string. + /// This was the root cause of "orchestrator didn't respond to worker" bugs. + /// + [Fact] + public async Task CompleteResponse_IncludesFlushedResponseInTcsResult() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("flushed-tcs-test"); + + var state = GetSessionState(svc, "flushed-tcs-test"); + session.IsProcessing = true; + SetField(state, "SendingFlag", 1); + + // Simulate mid-turn flush (happens on AssistantTurnEndEvent) + var flushedResponse = GetFlushedResponse(state); + flushedResponse.Append("First sub-turn response text"); + + // Simulate additional content after turn end + var currentResponse = GetCurrentResponse(state); + currentResponse.Append("Second sub-turn continuation"); + + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + SetResponseCompletion(state, tcs); + + // Act + InvokeCompleteResponse(svc, state, null); + + // Assert: TCS result includes BOTH flushed and current text + Assert.True(tcs.Task.IsCompleted); + var result = tcs.Task.Result; + Assert.Contains("First sub-turn response text", result); + Assert.Contains("Second sub-turn continuation", result); + } + + /// + /// CompleteResponse fires OnSessionComplete so orchestrator loops can unblock. + /// Without this (INV-O4), multi-agent workers hang forever waiting for completion. + /// + [Fact] + public async Task CompleteResponse_FiresOnSessionComplete() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("complete-event-test"); + + var state = GetSessionState(svc, "complete-event-test"); + session.IsProcessing = true; + SetField(state, "SendingFlag", 1); + + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + SetResponseCompletion(state, tcs); + + var completedSessions = new List(); + svc.OnSessionComplete += (name, _) => completedSessions.Add(name); + + // Act + InvokeCompleteResponse(svc, state, null); + + // Assert + Assert.Contains("complete-event-test", completedSessions); + } + + /// + /// When IsProcessing is already false but CurrentResponse has accumulated content, + /// CompleteResponse should still flush that content (late deltas after watchdog/error). + /// + [Fact] + public async Task CompleteResponse_WhenAlreadyFalse_StillFlushesLateContent() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("late-flush-test"); + + var state = GetSessionState(svc, "late-flush-test"); + session.IsProcessing = false; // Already cleared by watchdog + + // But late deltas arrived after watchdog cleared state + var currentResponse = GetCurrentResponse(state); + currentResponse.Append("Late delta content that arrived after watchdog"); + + var historyCountBefore = session.History.Count; + + // Act + InvokeCompleteResponse(svc, state, null); + + // Assert: late content was still flushed + Assert.True(session.History.Count > historyCountBefore, + "Late content must be flushed even when IsProcessing is already false"); + } + + // ========================================================================= + // B. Watchdog Timeout Decision Logic + // Tests the actual timeout selection logic from RunProcessingWatchdogAsync. + // ========================================================================= + + /// + /// Mirrors the three-tier timeout selection logic from RunProcessingWatchdogAsync. + /// This is a direct copy of the production code — if production changes, this test + /// detects the deviation. + /// + private static int ComputeEffectiveTimeout( + bool hasActiveTool, bool hasUsedTools, bool isMultiAgent, + bool isResumed, bool hasReceivedEvents) + { + var useResumeQuiescence = isResumed && !hasReceivedEvents && !hasActiveTool && !hasUsedTools; + var useToolTimeout = hasActiveTool || (isResumed && !useResumeQuiescence) || hasUsedTools || isMultiAgent; + return useResumeQuiescence + ? CopilotService.WatchdogResumeQuiescenceTimeoutSeconds + : useToolTimeout + ? CopilotService.WatchdogToolExecutionTimeoutSeconds + : CopilotService.WatchdogInactivityTimeoutSeconds; + } + + /// + /// INV-5: HasUsedToolsThisTurn BETWEEN tool rounds must keep 600s timeout. + /// This is the primary protection against "messages killed during long-running processes." + /// ActiveToolCallCount resets on AssistantTurnStartEvent between rounds — only + /// HasUsedToolsThisTurn persists and keeps the longer timeout. + /// + [Fact] + public void WatchdogTimeout_BetweenToolRounds_Uses600s() + { + // Between tool rounds: ActiveToolCallCount=0, but HasUsedToolsThisTurn=true + var timeout = ComputeEffectiveTimeout( + hasActiveTool: false, hasUsedTools: true, isMultiAgent: false, + isResumed: false, hasReceivedEvents: false); + + Assert.Equal(CopilotService.WatchdogToolExecutionTimeoutSeconds, timeout); + Assert.Equal(600, timeout); + } + + /// Active tool execution gets the 600s timeout. + [Fact] + public void WatchdogTimeout_ActiveTool_Uses600s() + { + var timeout = ComputeEffectiveTimeout( + hasActiveTool: true, hasUsedTools: false, isMultiAgent: false, + isResumed: false, hasReceivedEvents: false); + Assert.Equal(600, timeout); + } + + /// Multi-agent sessions always get 600s to prevent killing workers mid-task. + [Fact] + public void WatchdogTimeout_MultiAgent_Uses600s() + { + var timeout = ComputeEffectiveTimeout( + hasActiveTool: false, hasUsedTools: false, isMultiAgent: true, + isResumed: false, hasReceivedEvents: false); + Assert.Equal(600, timeout); + } + + /// Resumed session with no events → 30s quiescence (fast recovery). + [Fact] + public void WatchdogTimeout_ResumedNoEvents_Uses30sQuiescence() + { + var timeout = ComputeEffectiveTimeout( + hasActiveTool: false, hasUsedTools: false, isMultiAgent: false, + isResumed: true, hasReceivedEvents: false); + Assert.Equal(CopilotService.WatchdogResumeQuiescenceTimeoutSeconds, timeout); + Assert.Equal(30, timeout); + } + + /// Resumed session with events flowing → 600s (session is active). + [Fact] + public void WatchdogTimeout_ResumedWithEvents_Uses600s() + { + var timeout = ComputeEffectiveTimeout( + hasActiveTool: false, hasUsedTools: false, isMultiAgent: false, + isResumed: true, hasReceivedEvents: true); + Assert.Equal(600, timeout); + } + + /// No tools, not resumed, not multi-agent → 120s base timeout. + [Fact] + public void WatchdogTimeout_BaseCase_Uses120s() + { + var timeout = ComputeEffectiveTimeout( + hasActiveTool: false, hasUsedTools: false, isMultiAgent: false, + isResumed: false, hasReceivedEvents: false); + Assert.Equal(CopilotService.WatchdogInactivityTimeoutSeconds, timeout); + Assert.Equal(120, timeout); + } + + /// + /// Comprehensive: all 8 combinations of the 3 main flags that contribute to useToolTimeout. + /// This catches logic errors like using && instead of ||. + /// + [Theory] + [InlineData(false, false, false, false, false, 120)] // base case + [InlineData(true, false, false, false, false, 600)] // active tool + [InlineData(false, true, false, false, false, 600)] // used tools (between rounds!) + [InlineData(false, false, true, false, false, 600)] // multi-agent + [InlineData(true, true, false, false, false, 600)] // active + used + [InlineData(true, false, true, false, false, 600)] // active + multi + [InlineData(false, true, true, false, false, 600)] // used + multi + [InlineData(true, true, true, false, false, 600)] // all three + public void WatchdogTimeout_AllCombinations( + bool hasActive, bool hasUsed, bool isMulti, + bool isResumed, bool hasEvents, int expected) + { + var timeout = ComputeEffectiveTimeout(hasActive, hasUsed, isMulti, isResumed, hasEvents); + Assert.Equal(expected, timeout); + } + + // ========================================================================= + // C. AbortSessionAsync — Full INV-1 Cleanup + // Tests use real AbortSessionAsync (public) with reflection state setup. + // ========================================================================= + + /// + /// INV-1: AbortSessionAsync clears ALL processing state fields. + /// This is the path exercised when user clicks "Stop" during a long-running process. + /// + [Fact] + public async Task AbortSession_ClearsAllINV1Fields() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("abort-inv1"); + + var state = GetSessionState(svc, "abort-inv1"); + SetupDirtyProcessingState(state, session); + + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + SetResponseCompletion(state, tcs); + + // Act + await svc.AbortSessionAsync("abort-inv1"); + + // Assert: ALL fields cleared + Assert.False(session.IsProcessing); + Assert.False(session.IsResumed); + Assert.Null(session.ProcessingStartedAt); + Assert.Equal(0, session.ToolCallCount); + Assert.Equal(0, session.ProcessingPhase); + Assert.Equal(0, GetField(state, "SendingFlag")); + Assert.Equal(0, GetField(state, "ActiveToolCallCount")); + Assert.False(GetField(state, "HasUsedToolsThisTurn")); + Assert.Equal(0, GetField(state, "SuccessfulToolCountThisTurn")); + } + + /// + /// Abort with accumulated content preserves partial response in History. + /// Without this, clicking Stop discards what the user was waiting for. + /// + [Fact] + public async Task AbortSession_PreservesAccumulatedContent() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("abort-content"); + + var state = GetSessionState(svc, "abort-content"); + session.IsProcessing = true; + SetField(state, "SendingFlag", 1); + + // Accumulate partial response + GetCurrentResponse(state).Append("Partial response that should be preserved on abort"); + + var historyCountBefore = session.History.Count; + + // Act + await svc.AbortSessionAsync("abort-content"); + + // Assert: partial content preserved + Assert.True(session.History.Count > historyCountBefore); + var lastMsg = session.History.Last(m => m.Role == "assistant"); + Assert.Contains("Partial response", lastMsg.Content); + } + + /// + /// Abort completes the ResponseCompletion TCS so orchestrator loops unblock. + /// TCS must be completed AFTER state cleanup (INV-O3). + /// + [Fact] + public async Task AbortSession_CompletesResponseTcs() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("abort-tcs"); + + var state = GetSessionState(svc, "abort-tcs"); + session.IsProcessing = true; + SetField(state, "SendingFlag", 1); + + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + SetResponseCompletion(state, tcs); + + // Act + await svc.AbortSessionAsync("abort-tcs"); + + // Assert: TCS was completed (canceled, since it's an abort) + Assert.True(tcs.Task.IsCompleted); + } + + /// + /// Abort fires OnSessionComplete so orchestrator loops don't hang. + /// + [Fact] + public async Task AbortSession_FiresOnSessionComplete() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("abort-event"); + + var state = GetSessionState(svc, "abort-event"); + session.IsProcessing = true; + SetField(state, "SendingFlag", 1); + + var completedSessions = new List(); + svc.OnSessionComplete += (name, _) => completedSessions.Add(name); + + // Act + await svc.AbortSessionAsync("abort-event"); + + Assert.Contains("abort-event", completedSessions); + } + + /// + /// Abort clears the message queue so queued prompts don't auto-send. + /// + [Fact] + public async Task AbortSession_ClearsMessageQueue() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("abort-queue"); + + session.IsProcessing = true; + session.MessageQueue.Add("queued-message-1"); + session.MessageQueue.Add("queued-message-2"); + + var state = GetSessionState(svc, "abort-queue"); + SetField(state, "SendingFlag", 1); + + // Act + await svc.AbortSessionAsync("abort-queue"); + + Assert.Empty(session.MessageQueue); + } + + // ========================================================================= + // D. Content Preservation — Messages Never Lost + // ========================================================================= + + /// + /// FlushCurrentResponse moves accumulated text to History and clears CurrentResponse. + /// This is called on AssistantTurnEndEvent to persist content mid-turn. + /// + [Fact] + public async Task FlushCurrentResponse_AddsToHistory() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("flush-test"); + + var state = GetSessionState(svc, "flush-test"); + session.SessionId = "test-session-id"; // Required for DB write-through + + var currentResponse = GetCurrentResponse(state); + currentResponse.Append("Response content to be flushed"); + + var historyCountBefore = session.History.Count; + + // Act + InvokeFlushCurrentResponse(svc, state); + + // Assert: content in History, CurrentResponse cleared + Assert.True(session.History.Count > historyCountBefore); + Assert.Equal(0, currentResponse.Length); + + var flushedMsg = session.History.Last(); + Assert.Equal("assistant", flushedMsg.Role); + Assert.Equal("Response content to be flushed", flushedMsg.Content); + } + + /// + /// FlushCurrentResponse dedup guard: if the last assistant message has identical content, + /// the flush is skipped to prevent duplicates on session resume. + /// + [Fact] + public async Task FlushCurrentResponse_DedupGuard_SkipsDuplicate() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("dedup-test"); + + var state = GetSessionState(svc, "dedup-test"); + + // Add a message that looks like it was already flushed + session.History.Add(ChatMessage.AssistantMessage("Already flushed content")); + var historyCountAfterFirst = session.History.Count; + + // Simulate the same content appearing in CurrentResponse (SDK replay on resume) + GetCurrentResponse(state).Append("Already flushed content"); + + // Act + InvokeFlushCurrentResponse(svc, state); + + // Assert: no duplicate added + Assert.Equal(historyCountAfterFirst, session.History.Count); + } + + /// + /// FlushCurrentResponse accumulates text in FlushedResponse so CompleteResponse + /// can include it in the TCS result for orchestrator dispatch. + /// + [Fact] + public async Task FlushCurrentResponse_AccumulatesInFlushedResponse() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("accum-test"); + + var state = GetSessionState(svc, "accum-test"); + + // First flush + GetCurrentResponse(state).Append("First sub-turn"); + InvokeFlushCurrentResponse(svc, state); + + // Second flush + GetCurrentResponse(state).Append("Second sub-turn"); + InvokeFlushCurrentResponse(svc, state); + + // Assert: FlushedResponse has both + var flushed = GetFlushedResponse(state).ToString(); + Assert.Contains("First sub-turn", flushed); + Assert.Contains("Second sub-turn", flushed); + } + + /// + /// Multi-turn conversation: messages are preserved across sequential sends. + /// + [Fact] + public async Task MultiTurn_AllMessagesPreserved() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("multi-turn"); + + for (int i = 0; i < 5; i++) + { + await svc.SendPromptAsync("multi-turn", $"Message {i}"); + } + + // All user messages should be in history + var userMessages = session.History.Where(m => m.Role == "user").ToList(); + Assert.True(userMessages.Count >= 5, + $"Expected at least 5 user messages, got {userMessages.Count}"); + Assert.False(session.IsProcessing); + } + + /// + /// FlushCurrentResponse with empty content is a no-op (no empty messages in history). + /// + [Fact] + public async Task FlushCurrentResponse_EmptyContent_NoOp() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("empty-flush"); + + var state = GetSessionState(svc, "empty-flush"); + var historyCount = session.History.Count; + + // Act: flush with empty buffer + InvokeFlushCurrentResponse(svc, state); + + Assert.Equal(historyCount, session.History.Count); + } + + // ========================================================================= + // E. Structural Regression Guards (source assertions) + // These verify critical code patterns are present. Keep minimal — 3 max. + // ========================================================================= + + /// + /// The watchdog callback must have a generation guard to prevent killing a new + /// turn if the user aborts + resends between Post() and callback execution. + /// + [Fact] + public void WatchdogCallback_HasGenerationGuard() + { + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + + // Find the watchdog callback inside InvokeOnUI + var watchdogIdx = source.IndexOf("watchdogGeneration != currentGen", StringComparison.Ordinal); + Assert.True(watchdogIdx > 0, + "Watchdog callback must compare watchdogGeneration to currentGen"); + } + + /// + /// CompleteResponse must clear SendingFlag to allow subsequent sends. + /// Without this, the session deadlocks on next SendPromptAsync. + /// + [Fact] + public void CompleteResponse_Source_ClearsSendingFlag() + { + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + + // Find CompleteResponse method + var crIdx = source.IndexOf("private void CompleteResponse(", StringComparison.Ordinal); + Assert.True(crIdx > 0); + + // Within CompleteResponse, SendingFlag must be cleared (may be 80+ lines into method) + var afterCR = source.Substring(crIdx, Math.Min(5000, source.Length - crIdx)); + Assert.Contains("SendingFlag", afterCR); + } + + /// + /// The "Session not found" reconnect path must include McpServers and SkillDirectories + /// in the fresh session config (PR #330 regression guard). + /// + [Fact] + public void ReconnectPath_IncludesMcpServersAndSkills() + { + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.cs")); + + var sessionNotFoundIdx = source.IndexOf("Session not found", StringComparison.OrdinalIgnoreCase); + Assert.True(sessionNotFoundIdx > 0); + + var afterNotFound = source.Substring(sessionNotFoundIdx, Math.Min(2000, source.Length - sessionNotFoundIdx)); + Assert.Contains("McpServers", afterNotFound); + Assert.Contains("SkillDirectories", afterNotFound); + } + + // ========================================================================= + // F. Race Condition & Edge Case Tests + // ========================================================================= + + /// + /// Sequential sends don't leave ghost processing state. + /// Each send must complete fully before the next can proceed. + /// + [Fact] + public async Task SequentialSends_NoGhostState() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("no-ghost"); + + for (int i = 0; i < 10; i++) + { + await svc.SendPromptAsync("no-ghost", $"Prompt {i}"); + Assert.False(session.IsProcessing, $"IsProcessing stuck after prompt {i}"); + Assert.Equal(0, session.ProcessingPhase); + } + } + + /// + /// CompleteResponse clears PendingReasoningMessages to prevent stale reasoning + /// from previous turn leaking into next turn's display. + /// + [Fact] + public async Task CompleteResponse_ClearsPendingReasoning() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("reasoning-clear"); + + var state = GetSessionState(svc, "reasoning-clear"); + session.IsProcessing = true; + SetField(state, "SendingFlag", 1); + + // Add pending reasoning + var pendingProp = state.GetType().GetProperty("PendingReasoningMessages", AnyInstance)!; + var pendingDict = pendingProp.GetValue(state) as System.Collections.Concurrent.ConcurrentDictionary; + pendingDict!.TryAdd("reasoning-1", ChatMessage.SystemMessage("test reasoning")); + + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + SetResponseCompletion(state, tcs); + + // Act + InvokeCompleteResponse(svc, state, null); + + // Assert: pending reasoning cleared + Assert.True(pendingDict.IsEmpty, "PendingReasoningMessages must be cleared on CompleteResponse"); + } + + /// + /// Abort when not processing is a no-op (safe to call multiple times). + /// + [Fact] + public async Task AbortSession_WhenNotProcessing_NoOp() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("abort-noop"); + + session.IsProcessing = false; + var historyCount = session.History.Count; + + // Act: abort should be a safe no-op + await svc.AbortSessionAsync("abort-noop"); + + Assert.Equal(historyCount, session.History.Count); + Assert.False(session.IsProcessing); + } + + /// + /// The TCS result includes full response even when FlushedResponse has content + /// but CurrentResponse is empty (common case: TurnEnd flush cleared CurrentResponse + /// before SessionIdle fires CompleteResponse). + /// + [Fact] + public async Task CompleteResponse_FlushedOnlyNoCurrentResponse_TcsHasContent() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("flushed-only"); + + var state = GetSessionState(svc, "flushed-only"); + session.IsProcessing = true; + SetField(state, "SendingFlag", 1); + + // FlushedResponse has content (from TurnEnd flush), CurrentResponse is empty + GetFlushedResponse(state).Append("Full turn response from mid-turn flush"); + // CurrentResponse stays empty — this is the normal TurnEnd → SessionIdle flow + + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + SetResponseCompletion(state, tcs); + + // Act + InvokeCompleteResponse(svc, state, null); + + // Assert: TCS has the flushed content + Assert.True(tcs.Task.IsCompleted); + Assert.Contains("Full turn response from mid-turn flush", tcs.Task.Result); + } + + /// + /// ChatDatabase write-through is called on content flush (fire-and-forget). + /// + [Fact] + public async Task FlushCurrentResponse_WritesToChatDatabase() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("db-write-test"); + + var state = GetSessionState(svc, "db-write-test"); + session.SessionId = "test-db-session-id"; + + GetCurrentResponse(state).Append("Content for database"); + + var dbCountBefore = _chatDb.AddedMessages.Count; + + // Act + InvokeFlushCurrentResponse(svc, state); + + // Assert: DB write was triggered + Assert.True(_chatDb.AddedMessages.Count > dbCountBefore, + "FlushCurrentResponse must write through to ChatDatabase"); + var lastDbMsg = _chatDb.AddedMessages.Last(); + Assert.Equal("test-db-session-id", lastDbMsg.SessionId); + Assert.Contains("Content for database", lastDbMsg.Message.Content); + } +} diff --git a/PolyPilot.Tests/ProcessingWatchdogTests.cs b/PolyPilot.Tests/ProcessingWatchdogTests.cs index 217b93b4d4..33db436a76 100644 --- a/PolyPilot.Tests/ProcessingWatchdogTests.cs +++ b/PolyPilot.Tests/ProcessingWatchdogTests.cs @@ -895,7 +895,10 @@ public void HasUsedToolsThisTurn_ResetByCompleteResponse() private static int ComputeEffectiveTimeout(bool hasActiveTool, bool isResumed, bool hasReceivedEvents, bool hasUsedTools, bool isMultiAgent = false) { var useResumeQuiescence = isResumed && !hasReceivedEvents && !hasActiveTool && !hasUsedTools; - var useToolTimeout = hasActiveTool || (isResumed && !useResumeQuiescence) || hasUsedTools || isMultiAgent; + // NOTE: isMultiAgent no longer extends the timeout (PR #332 fix). + // Workers actively streaming have tiny elapsed values (delta events reset the timer). + // Only tool execution silence (hasActiveTool or hasUsedTools) warrants 600s. + var useToolTimeout = hasActiveTool || (isResumed && !useResumeQuiescence) || hasUsedTools; return useResumeQuiescence ? CopilotService.WatchdogResumeQuiescenceTimeoutSeconds : useToolTimeout @@ -973,14 +976,18 @@ public void WatchdogTimeoutSelection_ResumedWithActiveTool_UsesToolTimeout() } [Fact] - public void WatchdogTimeoutSelection_MultiAgent_UsesToolTimeout() + public void WatchdogTimeoutSelection_MultiAgent_NoTools_UsesInactivityTimeout() { - // Multi-agent sessions use longer tool timeout even without tool activity + // Multi-agent sessions without tool use now use the 120s inactivity timeout. + // The old 600s blanket for isMultiAgent caused stuck-session UX bugs when the SDK + // dropped terminal events (sdk bug #299 variant): users waited up to 600s when + // workers that do active text generation (delta events flowing) are NOT at risk of + // the 120s timeout because deltas keep elapsed small. var effectiveTimeout = ComputeEffectiveTimeout( hasActiveTool: false, isResumed: false, hasReceivedEvents: false, hasUsedTools: false, isMultiAgent: true); - Assert.Equal(CopilotService.WatchdogToolExecutionTimeoutSeconds, effectiveTimeout); - Assert.Equal(600, effectiveTimeout); + Assert.Equal(CopilotService.WatchdogInactivityTimeoutSeconds, effectiveTimeout); + Assert.Equal(120, effectiveTimeout); } [Fact] @@ -1201,7 +1208,8 @@ public void ResumeQuiescence_NotResumed_NeverTriggersQuiescence() hasActiveTool: true, isResumed: false, hasReceivedEvents: false, hasUsedTools: false)); Assert.Equal(600, ComputeEffectiveTimeout( hasActiveTool: false, isResumed: false, hasReceivedEvents: false, hasUsedTools: true)); - Assert.Equal(600, ComputeEffectiveTimeout( + // Multi-agent without tools now uses 120s (not 600s) — see WatchdogTimeoutSelection_MultiAgent_NoTools_UsesInactivityTimeout + Assert.Equal(120, ComputeEffectiveTimeout( hasActiveTool: false, isResumed: false, hasReceivedEvents: false, hasUsedTools: false, isMultiAgent: true)); } @@ -1238,7 +1246,7 @@ public void ResumeQuiescence_TransitionsToInactivity_AfterIsResumedCleared() [InlineData(false, true, true, false, false, 600)] // Resumed, events: tool timeout [InlineData(true, true, false, false, false, 600)] // Resumed, active tool: tool timeout [InlineData(false, true, false, true, false, 600)] // Resumed, used tools: tool timeout - [InlineData(false, false, false, false, true, 600)] // Multi-agent: tool timeout + [InlineData(false, false, false, false, true, 120)] // Multi-agent no-tools: inactivity (PR #332 fix) [InlineData(false, true, false, false, true, 30)] // Resumed+multiAgent, no events: quiescence wins [InlineData(false, false, false, true, false, 600)] // HasUsedTools: tool timeout [InlineData(true, true, true, true, true, 600)] // All flags: tool timeout @@ -1817,14 +1825,22 @@ public void Regression_PR163_IsResumed_NotClearedDuringToolActivity() // --- PR #195 regression: multi-agent workers killed at 120s --- [Fact] - public void Regression_PR195_MultiAgentWorkers_Use600s() + public void Regression_PR195_MultiAgentWorkers_DeltasKeepElapsedSmall() { - // Multi-agent workers doing text-heavy tasks (PR reviews, no tools) - // were killed at 120s inactivity. Fix: isMultiAgent flag → 600s. + // PR #195 concern: multi-agent workers doing text-heavy tasks (PR reviews, no tools) + // were killed at 120s. The fix was: isMultiAgent → 600s. But that was wrong reasoning. + // + // The CORRECT insight (PR #332): workers generating text stream DELTA EVENTS continuously. + // Each delta resets LastEventAtTicks, keeping elapsed tiny. The 120s timeout cannot fire + // during active generation. It can only fire when the session goes SILENT, which means + // either stuck (good to clean up) or done with terminal events dropped (Case B). + // + // Multi-agent without tools → 120s timeout (inactivity). This is intentional. var timeout = ComputeEffectiveTimeout( hasActiveTool: false, isResumed: false, hasReceivedEvents: true, hasUsedTools: false, isMultiAgent: true); - Assert.Equal(CopilotService.WatchdogToolExecutionTimeoutSeconds, timeout); + Assert.Equal(CopilotService.WatchdogInactivityTimeoutSeconds, timeout); + Assert.Equal(120, timeout); } // --- PR #211 regression: quiescence must not kill active sessions --- @@ -2153,4 +2169,246 @@ public void Scenario_NormalPrompt_QuiescenceNeverApplies() Assert.Equal(CopilotService.WatchdogInactivityTimeoutSeconds, timeout); Assert.True(timeout > 30, "Normal prompt must use timeout > 30s"); } -} + + // ===== Watchdog tool-vs-no-tool behavior (new in PR fix) ===== + + [Fact] + public void WatchdogDecision_ActiveTool_ServerAlive_ShouldResetTimer() + { + // When a tool is actively running (ActiveToolCallCount > 0) and the server is alive, + // the watchdog must reset the inactivity timer rather than killing the session. + // This is verified via source-code assertion (since SessionState is private). + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync"); + var elapsedIdx = source.IndexOf("elapsed >= effectiveTimeout", methodIdx); + // Find the end of the method dynamically (next top-level member after RunProcessingWatchdogAsync) + var methodEndIdx = source.IndexOf(" private readonly ConcurrentDictionary", methodIdx); + if (methodEndIdx < 0) methodEndIdx = source.Length; + var block = source.Substring(elapsedIdx, methodEndIdx - elapsedIdx); + + // Must check hasActiveTool before deciding server liveness path + Assert.True(block.Contains("hasActiveTool"), + "Watchdog must branch on hasActiveTool to distinguish running tool from lost-idle scenario"); + + // Server liveness check must be conditioned on hasActiveTool + var activeToolIdx = block.IndexOf("hasActiveTool"); + var serverRunningIdx = block.IndexOf("IsServerRunning"); + Assert.True(serverRunningIdx > activeToolIdx, + "IsServerRunning check must appear AFTER the hasActiveTool check (guarded by it)"); + + // Timer reset: LastEventAtTicks must be updated in the 'server alive' path + var lastEventIdx = block.IndexOf("LastEventAtTicks"); + Assert.True(lastEventIdx > 0, "Must reset LastEventAtTicks when server is alive and tool is running"); + + // Must use continue to skip the kill + var continueIdx = block.IndexOf("continue;"); + Assert.True(continueIdx > 0, "Must 'continue' watchdog loop when server alive and tool running"); + } + + [Fact] + public void WatchdogDecision_NoActiveTool_ToolsWereUsed_ShouldCompleteCleanly() + { + // When no tool is active (ActiveToolCallCount = 0) but tools WERE used this turn, + // the watchdog must complete the session CLEANLY (call CompleteResponse, no error msg). + // This is the "SessionIdleEvent lost" scenario — response is done, just terminal event missed. + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync"); + var elapsedIdx = source.IndexOf("elapsed >= effectiveTimeout", methodIdx); + var methodEndIdx = source.IndexOf(" private readonly ConcurrentDictionary", methodIdx); + if (methodEndIdx < 0) methodEndIdx = source.Length; + var block = source.Substring(elapsedIdx, methodEndIdx - elapsedIdx); + + // Must have both conditions + Assert.True(block.Contains("hasUsedTools"), + "Watchdog must check hasUsedTools to identify lost-idle-event scenario"); + + // CompleteResponse must be called in this path (not just the error path) + var completeResponseIdx = block.IndexOf("CompleteResponse"); + Assert.True(completeResponseIdx > 0, + "Watchdog must call CompleteResponse for the lost-idle-event scenario (not just show error)"); + + // The error message should NOT appear in this path — clean completion, no error text + // The error path ('Session appears stuck') should be in a different branch + var stuckMsgIdx = block.IndexOf("Session appears stuck"); + Assert.True(stuckMsgIdx > completeResponseIdx, + "The 'appears stuck' error message must come AFTER CompleteResponse, in a separate branch"); + + // Case B must exit the block (break/return) before falling through to Case C error path + var breakIdx = block.IndexOf("break;", completeResponseIdx); + Assert.True(breakIdx > 0 && breakIdx < stuckMsgIdx, + "Case B must have a 'break;' before Case C to prevent fallthrough to the error kill path"); + } + + [Fact] + public void WatchdogDecision_MaxTimeExceeded_AlwaysKills() + { + // When total processing time exceeds WatchdogMaxProcessingTimeSeconds, the watchdog + // must kill regardless of server liveness or tool state — no session runs forever. + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync"); + var elapsedIdx = source.IndexOf("elapsed >= effectiveTimeout", methodIdx); + var methodEndIdx = source.IndexOf(" private readonly ConcurrentDictionary", methodIdx); + if (methodEndIdx < 0) methodEndIdx = source.Length; + var block = source.Substring(elapsedIdx, methodEndIdx - elapsedIdx); + + // exceededMaxTime must gate the liveness/clean-complete paths + var exceededIdx = block.IndexOf("exceededMaxTime"); + var serverRunningIdx = block.IndexOf("IsServerRunning"); + Assert.True(exceededIdx > 0 && serverRunningIdx > 0, + "Both exceededMaxTime and IsServerRunning must be present"); + Assert.True(exceededIdx < serverRunningIdx, + "exceededMaxTime check must appear before the server liveness bypass — max time always kills"); + } + + [Fact] + public void WatchdogPeriodicFlush_HasGenerationGuard() + { + // The periodic flush must capture ProcessingGeneration before InvokeOnUI and + // validate it inside the lambda — preventing stale watchdog ticks from flushing + // new-turn content into old-turn history if the user aborts + resends. + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync"); + var flushCommentIdx = source.IndexOf("Periodic mid-watchdog flush", methodIdx); + Assert.True(flushCommentIdx > 0, "Periodic flush comment must exist in RunProcessingWatchdogAsync"); + // Capture 1000 chars around the flush block to verify generation guard + var flushBlock = source.Substring(flushCommentIdx, 1000); + Assert.True(flushBlock.Contains("ProcessingGeneration"), + "Periodic flush block must read ProcessingGeneration before InvokeOnUI (race condition guard)"); + } + + private static string GetRepoRoot() + { + var dir = new DirectoryInfo(AppContext.BaseDirectory); + while (dir != null && !File.Exists(Path.Combine(dir.FullName, "PolyPilot.slnx"))) + dir = dir.Parent; + return dir?.FullName ?? throw new InvalidOperationException("Could not find repo root"); + } + + // ===== ExternalToolRequestedEvent in EventMatrix ===== + + [Fact] + public void ExternalToolRequestedEvent_IsInEventMatrix() + { + // ExternalToolRequestedEvent was arriving as "Unhandled" in logs, causing spam + // and incorrectly updating LastEventAtTicks. It must be explicitly classified. + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + Assert.True(source.Contains("ExternalToolRequestedEvent"), + "ExternalToolRequestedEvent must be explicitly listed in SdkEventMatrix to prevent 'Unhandled' log spam"); + } + + [Fact] + public void ExternalToolRequestedEvent_ClassifiedAsTimelineOnly() + { + // Must be TimelineOnly — it doesn't need chat projection but should not suppress + // LastEventAtTicks updates (it does represent live activity on the session). + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + var idx = source.IndexOf("ExternalToolRequestedEvent"); + Assert.True(idx >= 0); + var context = source.Substring(idx, 80); + Assert.True(context.Contains("TimelineOnly"), + "ExternalToolRequestedEvent must be classified as TimelineOnly"); + } + + // ===== Periodic mid-watchdog flush ===== + + [Fact] + public void WatchdogPeriodicFlush_PresenceInSource() + { + // The watchdog must flush CurrentResponse to History periodically so partial + // responses are visible even while IsProcessing=true (e.g., stuck mid-stream). + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync"); + // The flush must be inside the watchdog method + var watchdogBody = source.Substring(methodIdx, 6000); + Assert.True(watchdogBody.Contains("CurrentResponse.Length"), + "Watchdog must check CurrentResponse.Length for periodic flush"); + Assert.True(watchdogBody.Contains("periodic flush") || watchdogBody.Contains("FlushCurrentResponse"), + "Watchdog must call FlushCurrentResponse for periodic flush of accumulated content"); + } + + [Fact] + public void WatchdogPeriodicFlush_RunsBeforeTimeoutCheck() + { + // The periodic flush must run BEFORE the elapsed >= effectiveTimeout check, + // so content is visible even before the session is declared stuck. + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync"); + var flushIdx = source.IndexOf("Periodic mid-watchdog flush", methodIdx); + var elapsedCheckIdx = source.IndexOf("elapsed >= effectiveTimeout", methodIdx); + Assert.True(flushIdx > 0, "Periodic flush comment must exist in RunProcessingWatchdogAsync"); + Assert.True(flushIdx < elapsedCheckIdx, + "Periodic flush must appear BEFORE the elapsed >= effectiveTimeout kill check"); + } + + // ===== Multi-agent no-tool session stuck-session recovery (PR #332 fix) ===== + + [Fact] + public void WatchdogDecision_MultiAgent_NoTools_CaseBIncludesMultiAgent_InSource() + { + // Case B must handle multi-agent sessions without tool use. + // This covers the scenario where an orchestrator/worker receives AssistantTurnStartEvent + // (cancels the 4s TurnEnd fallback), then the SDK drops the terminal events for the + // follow-on sub-turn. Without this, the 120s watchdog fires Case C (error kill) instead + // of Case B (clean complete), showing an unnecessary error message. + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync"); + var endIdx = source.IndexOf(" private readonly ConcurrentDictionary", methodIdx); + var watchdogBody = source.Substring(methodIdx, endIdx - methodIdx); + + Assert.True(watchdogBody.Contains("isMultiAgentSession"), + "Case B must reference isMultiAgentSession to handle multi-agent no-tool sessions"); + // Find the inline "Case B:" comment that's directly in the else-if block + var caseBInlineIdx = watchdogBody.IndexOf("Case B:"); + Assert.True(caseBInlineIdx >= 0, "Inline 'Case B:' comment must exist in watchdog block"); + // Look back ~200 chars to find the else if condition containing isMultiAgentSession + var conditionStart = Math.Max(0, caseBInlineIdx - 200); + var conditionBlock = watchdogBody.Substring(conditionStart, 400); + Assert.True(conditionBlock.Contains("isMultiAgentSession"), + "Case B condition must include isMultiAgentSession for multi-agent no-tool recovery"); + } + + [Fact] + public void MultiAgent_NoTools_UseInactivityTimeout_NotToolTimeout() + { + // Multi-agent sessions without tool use must use 120s inactivity timeout, NOT 600s. + // Workers generating text stream deltas continuously — elapsed stays small during + // generation. The 120s timeout only fires when the session goes SILENT. + // When silent: either done with lost terminal event (Case B clean complete) or + // genuinely stuck (Case C error kill). + // This prevents the 600s stuck-session UX bug: sdk drops TurnEnd/Idle after + // TurnStart cancels the 4s fallback → user waits 600s instead of 120s. + var timeout = ComputeEffectiveTimeout( + hasActiveTool: false, isResumed: false, hasReceivedEvents: false, + hasUsedTools: false, isMultiAgent: true); + Assert.Equal(120, timeout); // Must be inactivity (120s), not tool timeout (600s) + Assert.True(timeout < CopilotService.WatchdogToolExecutionTimeoutSeconds, + "Multi-agent without tools must use shorter inactivity timeout (120s), not 600s"); + } + + [Fact] + public void AssistantTurnStartEvent_LoggedInEvtDiagnostics_InSource() + { + // AssistantTurnStartEvent MUST be included in the [EVT] log filter. + // Without this, when TurnStart cancels the TurnEnd fallback silently, + // event-diagnostics.log shows a gap with no explanation — making stuck-session + // forensics impossible (root cause of the PR #332 debug session bug). + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + var evtLogIdx = source.IndexOf("[EVT]"); + Assert.True(evtLogIdx >= 0, "[EVT] log must exist in event handler"); + // The EVT filter block must include AssistantTurnStartEvent + var filterContext = source.Substring(Math.Max(0, evtLogIdx - 200), 400); + Assert.True(filterContext.Contains("AssistantTurnStartEvent"), + "AssistantTurnStartEvent must be included in [EVT] logging so TurnStart " + + "is visible in diagnostics (prevents invisible fallback cancellations)"); + } +} \ No newline at end of file diff --git a/PolyPilot.Tests/TurnEndFallbackTests.cs b/PolyPilot.Tests/TurnEndFallbackTests.cs index c325c65df5..19c4169f16 100644 --- a/PolyPilot.Tests/TurnEndFallbackTests.cs +++ b/PolyPilot.Tests/TurnEndFallbackTests.cs @@ -134,4 +134,82 @@ public async Task FallbackTimer_CancelledAfterIsCancellationRequestedCheck_DoesN Assert.False(fired, "Explicit IsCancellationRequested guard must prevent firing after cancel"); } -} + + // ===== Tool session extended fallback ===== + + [Fact] + public void TurnEndIdleToolFallbackAdditionalMs_IsReasonable() + { + // Must be long enough that a normal LLM inter-round pause (reasoning, >4s up to ~30s) + // won't trigger a premature CompleteResponse, but short enough to rescue a stuck session + // faster than the watchdog (600s). + Assert.InRange(CopilotService.TurnEndIdleToolFallbackAdditionalMs, 10_000, 120_000); + Assert.True( + CopilotService.TurnEndIdleToolFallbackAdditionalMs > CopilotService.TurnEndIdleFallbackMs, + "Tool session additional delay must be longer than the base 4s fallback"); + } + + [Fact] + public void TurnEndIdleToolFallback_TotalDelay_IsLessThanWatchdog() + { + // The combined fallback delay must be shorter than the watchdog tool timeout so + // we rescue stuck sessions before the watchdog fires the error message. + var totalMs = CopilotService.TurnEndIdleFallbackMs + CopilotService.TurnEndIdleToolFallbackAdditionalMs; + var watchdogMs = CopilotService.WatchdogToolExecutionTimeoutSeconds * 1000; + Assert.True(totalMs < watchdogMs, + $"Total fallback ({totalMs}ms) must be less than watchdog tool timeout ({watchdogMs}ms)"); + } + + [Fact] + public async Task ToolFallback_CancelledByTurnStart_DoesNotFire() + { + // Simulates: TurnEnd (tools used) starts timer -> TurnStart arrives -> cancels -> no fire + var cts = new CancellationTokenSource(); + var token = cts.Token; + bool completeResponseFired = false; + + var fallbackTask = Task.Run(async () => + { + try + { + await Task.Delay(50, token); // accelerated base delay (mirrors other tests in this file) + if (token.IsCancellationRequested) return; + await Task.Delay(100, token); // accelerated extended delay + if (token.IsCancellationRequested) return; + completeResponseFired = true; + } + catch (OperationCanceledException) { } + }); + + await Task.Delay(50); + cts.Cancel(); + cts.Dispose(); + await fallbackTask; + Assert.False(completeResponseFired, "Fallback must not fire when cancelled by TurnStart"); + } + + [Fact] + public async Task ToolFallback_NoTurnStart_EventuallyFires() + { + // Simulates: TurnEnd (tools used) + no TurnStart + no SessionIdle -> fallback fires + var cts = new CancellationTokenSource(); + var token = cts.Token; + bool completeResponseFired = false; + + _ = Task.Run(async () => + { + try + { + await Task.Delay(50, token); // accelerated base delay + if (token.IsCancellationRequested) return; + await Task.Delay(100, token); // accelerated extended delay + if (token.IsCancellationRequested) return; + completeResponseFired = true; + } + catch (OperationCanceledException) { } + }); + + await Task.Delay(400); + Assert.True(completeResponseFired, "Fallback must fire when no TurnStart or SessionIdle arrives"); + } +} \ No newline at end of file diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs index 6f790e0a1c..a995a9a03c 100644 --- a/PolyPilot/Services/CopilotService.Events.cs +++ b/PolyPilot/Services/CopilotService.Events.cs @@ -64,6 +64,9 @@ private enum EventVisibility ["SessionLifecycleEvent"] = EventVisibility.Ignore, ["HookStartEvent"] = EventVisibility.Ignore, ["HookEndEvent"] = EventVisibility.Ignore, + + // External tool requests — classified explicitly to avoid "Unhandled" log spam + ["ExternalToolRequestedEvent"] = EventVisibility.TimelineOnly, }; private static EventVisibility ClassifySessionEvent(SessionEvent evt) @@ -222,8 +225,10 @@ private void HandleSessionEvent(SessionState state, SessionEvent evt) var sessionName = state.Info.Name; var isCurrentState = _sessions.TryGetValue(sessionName, out var current) && ReferenceEquals(current, state); - // Log critical lifecycle events and detect orphaned handlers - if (evt is SessionIdleEvent or AssistantTurnEndEvent or SessionErrorEvent) + // Log all critical lifecycle events, including TurnStart. TurnStart cancels the + // TurnEnd→Idle fallback; without logging it, stuck-session forensics cannot see the + // sub-turn boundary that caused the fallback to be silently cancelled. + if (evt is SessionIdleEvent or AssistantTurnEndEvent or SessionErrorEvent or AssistantTurnStartEvent) { Debug($"[EVT] '{sessionName}' received {evt.GetType().Name} " + $"(IsProcessing={state.Info.IsProcessing}, isCurrentState={isCurrentState}, " + @@ -480,15 +485,27 @@ void Invoke(Action action) } // Guard: if tools were used this turn, the LLM may still be reasoning // between tool rounds (TurnEnd → thinking → TurnStart, >4s). - // HasUsedToolsThisTurn stays true across all sub-turns and is only - // cleared at CompleteResponse/abort. + // Don't complete immediately — wait an additional period. If no new + // TurnStart arrives within that window, SessionIdleEvent was lost + // (SDK bug #299) and we must complete to unblock the session. if (Volatile.Read(ref state.HasUsedToolsThisTurn)) { - Debug($"[IDLE-FALLBACK] '{sessionName}' skipped — tools were used this turn, TurnStart likely coming"); + await Task.Delay(TurnEndIdleToolFallbackAdditionalMs, fallbackToken); + if (fallbackToken.IsCancellationRequested) return; + // Re-check: if a new tool started or TurnStart fired and cancelled + // this token, we would have exited above. If still here, no new + // activity arrived → SessionIdleEvent was lost → complete. + if (Interlocked.CompareExchange(ref state.ActiveToolCallCount, 0, 0) > 0) + { + Debug($"[IDLE-FALLBACK] '{sessionName}' skipped after extended wait — tools still active"); + return; + } + Debug($"[IDLE-FALLBACK] '{sessionName}' SessionIdleEvent not received {TurnEndIdleFallbackMs + TurnEndIdleToolFallbackAdditionalMs}ms after TurnEnd (tools used) — firing CompleteResponse"); + InvokeOnUI(() => CompleteResponse(state, turnEndGen)); return; } Debug($"[IDLE-FALLBACK] '{sessionName}' SessionIdleEvent not received {TurnEndIdleFallbackMs}ms after TurnEnd — firing CompleteResponse"); - Invoke(() => CompleteResponse(state, turnEndGen)); + InvokeOnUI(() => CompleteResponse(state, turnEndGen)); } catch (OperationCanceledException) { /* expected on cancellation */ } catch (Exception ex) { Debug($"[IDLE-FALLBACK] '{sessionName}' unexpected error: {ex}"); } @@ -1385,6 +1402,15 @@ private void HandleReflectionAdvanceResult(SessionState state, string response, /// internal const int TurnEndIdleFallbackMs = 4000; + /// + /// Additional milliseconds to wait after the initial TurnEnd fallback when tools were used + /// this turn. After the base 4s wait we know there are no active tools, but the LLM may + /// be reasoning between rounds (TurnEnd → TurnStart gap can exceed 4s). If no TurnStart + /// arrives within this additional window, SessionIdleEvent was likely lost — fire CompleteResponse. + /// Total wait for tool sessions = TurnEndIdleFallbackMs + TurnEndIdleToolFallbackAdditionalMs = 34s. + /// + internal const int TurnEndIdleToolFallbackAdditionalMs = 30_000; + private static void CancelProcessingWatchdog(SessionState state) { if (state.ProcessingWatchdog != null) @@ -1466,11 +1492,15 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session // have 2-3 min gaps between events while the model reasons), OR // 3. Tools have been executed this turn (HasUsedToolsThisTurn) — even between // tool rounds when ActiveToolCallCount is 0, the model may spend minutes - // thinking about what tool to call next, OR - // 4. Session is in a multi-agent group — workers doing text-heavy tasks - // (e.g., PR reviews) can take 2-4 min without tool calls. The orchestration - // loop has its own 10-minute CancelAfter timeout per worker. Cached at - // send time on UI thread to avoid cross-thread List access. + // thinking about what tool to call next. + // + // NOTE: isMultiAgentSession alone does NOT extend the timeout. + // Workers actively streaming text (e.g., PR reviews) generate delta events + // continuously, so elapsed stays small regardless of timeout tier. The 600s + // timeout is only needed for actual tool execution silence. Using 600s for + // pure reasoning sub-turns (no tools) caused stuck-session UX bugs when the + // SDK dropped terminal events (sdk bug #299 variant): user had to wait 600s + // instead of the 120s inactivity timeout. var isMultiAgentSession = Volatile.Read(ref state.IsMultiAgentSession); var hasReceivedEvents = Volatile.Read(ref state.HasReceivedEventsSinceResume); var hasUsedTools = Volatile.Read(ref state.HasUsedToolsThisTurn); @@ -1481,7 +1511,29 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session // goes true and we fall through to the normal timeout tiers. var useResumeQuiescence = state.Info.IsResumed && !hasReceivedEvents && !hasActiveTool && !hasUsedTools; - var useToolTimeout = hasActiveTool || (state.Info.IsResumed && !useResumeQuiescence) || hasUsedTools || isMultiAgentSession; + // Periodic mid-watchdog flush: if content has accumulated in CurrentResponse + // for longer than the check interval without being moved to History, flush it now. + // This ensures partial responses are visible in the chat even while IsProcessing=true + // (e.g., if TurnEnd→Idle fallback hasn't fired yet, or streaming stalled mid-response). + if (elapsed >= WatchdogCheckIntervalSeconds) + { + // Capture generation before InvokeOnUI — if the user aborts + resends between + // this check and the UI dispatch, the generation changes and we must not flush + // new-turn content into the old turn's history. + var flushGen = Interlocked.Read(ref state.ProcessingGeneration); + InvokeOnUI(() => + { + if (Interlocked.Read(ref state.ProcessingGeneration) != flushGen) return; + if (state.CurrentResponse.Length > 0) + { + Debug($"[WATCHDOG] '{sessionName}' periodic flush — CurrentResponse has content after {elapsed:F0}s of inactivity"); + FlushCurrentResponse(state); + OnStateChanged?.Invoke(); + } + }); + } + + var useToolTimeout = hasActiveTool || (state.Info.IsResumed && !useResumeQuiescence) || hasUsedTools; var effectiveTimeout = useResumeQuiescence ? WatchdogResumeQuiescenceTimeoutSeconds : useToolTimeout @@ -1502,6 +1554,61 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session if (elapsed >= effectiveTimeout) { var exceededMaxTime = totalProcessingSeconds >= WatchdogMaxProcessingTimeSeconds; + + // Before killing, check what state we're actually in: + // + // Case A — Tool actively running (ActiveToolCallCount > 0): + // The SDK only fires events at tool start/complete; a long build or test run + // produces zero events while executing. Use server TCP liveness to decide: + // - Server alive → tool is running, reset inactivity timer and keep waiting + // - Server dead → no events will ever arrive, kill with error + // (Skipped for demo/remote where we can't probe the local TCP port) + // + // Case B — No active tool (ActiveToolCallCount = 0) and tools were used: + // Tool has finished. The LLM has finished (AssistantTurnEndEvent fired). + // SessionIdleEvent was lost (SDK bug #299 or network hiccup). + // The 34s fallback should have caught this, but if not, complete cleanly. + // No error message — the response IS complete, we just missed the terminal event. + // + // Case C — Max time exceeded OR server dead / demo / remote: + // Kill with error (genuine zombie session). + if (!exceededMaxTime) + { + if (hasActiveTool && !IsDemoMode && !IsRemoteMode) + { + // Case A: check server TCP port + var serverAlive = _serverManager.IsServerRunning; + if (serverAlive) + { + Debug($"[WATCHDOG] '{sessionName}' {elapsed:F0}s inactivity but tool is running and server is alive — resetting timer " + + $"(timeout={effectiveTimeout}s, totalProcessing={totalProcessingSeconds:F0}s)"); + Interlocked.Exchange(ref state.LastEventAtTicks, DateTime.UtcNow.Ticks); + continue; // keep waiting — don't kill + } + Debug($"[WATCHDOG] '{sessionName}' tool running but server is not responding — killing stuck session " + + $"(elapsed={elapsed:F0}s, totalProcessing={totalProcessingSeconds:F0}s)"); + } + else if (!hasActiveTool && (hasUsedTools || (isMultiAgentSession && !IsDemoMode && !IsRemoteMode && _serverManager.IsServerRunning))) + { + // Case B: tools finished (or multi-agent session with server alive) and + // terminal event was lost — complete cleanly without an error message. + // For tool-using sessions: tool finished, SessionIdleEvent lost (sdk bug #299). + // For multi-agent no-tool sessions: LLM finished generating, SDK dropped + // both AssistantTurnEndEvent and SessionIdleEvent. Server liveness check + // confirms the session completed normally, not a server crash. + var watchdogGen = Interlocked.Read(ref state.ProcessingGeneration); + Debug($"[WATCHDOG] '{sessionName}' tool finished but SessionIdleEvent never arrived — completing cleanly " + + $"(elapsed={elapsed:F0}s, totalProcessing={totalProcessingSeconds:F0}s)"); + InvokeOnUI(() => + { + if (!state.Info.IsProcessing) return; + if (Interlocked.Read(ref state.ProcessingGeneration) != watchdogGen) return; + CompleteResponse(state, watchdogGen); + }); + break; + } + } + var timeoutDisplay = exceededMaxTime ? $"{WatchdogMaxProcessingTimeSeconds / 60} minute(s) total processing time" : effectiveTimeout >= 60