From 2b9bdabb0981a89fb433ff52191767d7b5251635 Mon Sep 17 00:00:00 2001 From: Shane Neuville Date: Mon, 9 Mar 2026 13:56:07 -0500 Subject: [PATCH 1/6] test: add 41 behavioral safety tests for chat watchdog/processing state Adds ChatExperienceSafetyTests.cs with 41 tests (40 active, 1 pending PR #330) covering the invariants documented in processing-state-safety skill: - INV-1: All 8 termination paths clear state correctly (CompleteResponse, SessionErrorEvent, AbortSessionAsync, watchdog) - INV-2: State mutations marshaled to UI thread via InvokeOnUI - INV-3: ProcessingGeneration guard prevents stale IDLE from killing new turns - INV-5: HasUsedToolsThisTurn protects sessions between tool rounds (not just while tools are active) - INV-9: IsMultiAgentSession set before StartProcessingWatchdog in both SendPromptAsync and RestoreSingleSessionAsync paths Behavioral tests (demo mode integration): - Multi-turn message preservation (5 sequential turns, all history retained) - Abort clears all 9 INV-1 fields, fires OnSessionComplete - Post-abort send succeeds without deadlock (SendingFlag cleared) - Session isolation (stuck session doesn't block others) - WatchdogToolExecutionTimeoutSeconds > WatchdogInactivityTimeoutSeconds - WatchdogMaxProcessingTimeSeconds >= 30 minutes Source-code assertion tests (regression guards against future refactors): - useToolTimeout formula has all 4 conditions (INV-5) - TurnEnd fallback checks HasUsedToolsThisTurn before firing CompleteResponse - FlushCurrentResponse called at AssistantTurnEndEvent (content persistence fix) - FlushCurrentResponse dedup guard prevents SDK-replay duplicates - CompleteResponse cancels watchdog before cleanup - Reconnect path carries forward IsMultiAgentSession + HasUsedToolsThisTurn These tests are designed to catch the class of regressions documented in regression-history.md (PRs #141-#284). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PolyPilot.Tests/ChatExperienceSafetyTests.cs | 979 +++++++++++++++++++ 1 file changed, 979 insertions(+) create mode 100644 PolyPilot.Tests/ChatExperienceSafetyTests.cs diff --git a/PolyPilot.Tests/ChatExperienceSafetyTests.cs b/PolyPilot.Tests/ChatExperienceSafetyTests.cs new file mode 100644 index 0000000000..9e701e9907 --- /dev/null +++ b/PolyPilot.Tests/ChatExperienceSafetyTests.cs @@ -0,0 +1,979 @@ +using Microsoft.Extensions.DependencyInjection; +using PolyPilot.Models; +using PolyPilot.Services; +using System.Reflection; +using System.Text; + +namespace PolyPilot.Tests; + +/// +/// Behavioral safety tests for the PolyPilot chat experience. +/// These tests exercise the REAL CopilotService pipeline (CompleteResponse, +/// watchdog state, AbortSessionAsync) via reflection — NOT demo mode shortcuts. +/// +/// Covers the recurring bug patterns: +/// 1. "Chat messages get killed while doing long-running processes" (watchdog timeouts) +/// 2. "Turns get lost / messages disappear" (generation counter races, content flushing) +/// 3. State not fully cleaned up after errors/abort (INV-1 violations) +/// +/// Regression test history: PRs #141→#147→#148→#153→#158→#163→#164→#276→#284→#330 +/// +public class ChatExperienceSafetyTests +{ + // ========================================================================= + // Infrastructure — reflection helpers to access private CopilotService guts + // ========================================================================= + + private readonly StubChatDatabase _chatDb = new(); + private readonly StubServerManager _serverManager = new(); + private readonly StubWsBridgeClient _bridgeClient = new(); + private readonly StubDemoService _demoService = new(); + private readonly RepoManager _repoManager = new(); + private readonly IServiceProvider _serviceProvider; + + private static readonly BindingFlags NonPublic = BindingFlags.NonPublic | BindingFlags.Instance; + private static readonly BindingFlags AnyInstance = BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Instance; + + public ChatExperienceSafetyTests() + { + _serviceProvider = new ServiceCollection().BuildServiceProvider(); + } + + private CopilotService CreateService() => + new CopilotService(_chatDb, _serverManager, _bridgeClient, _repoManager, _serviceProvider, _demoService); + + /// Gets the private SessionState object from CopilotService._sessions dictionary. + private static object GetSessionState(CopilotService svc, string sessionName) + { + var sessionsField = typeof(CopilotService).GetField("_sessions", NonPublic)!; + var sessionsDict = sessionsField.GetValue(svc)!; + var tryGetMethod = sessionsDict.GetType().GetMethod("TryGetValue")!; + var args = new object?[] { sessionName, null }; + tryGetMethod.Invoke(sessionsDict, args); + return args[1] ?? throw new InvalidOperationException($"Session '{sessionName}' not found in _sessions"); + } + + /// Invokes the private CompleteResponse method via reflection. + private static void InvokeCompleteResponse(CopilotService svc, object sessionState, long? expectedGeneration = null) + { + var method = typeof(CopilotService).GetMethod("CompleteResponse", NonPublic)!; + method.Invoke(svc, new object?[] { sessionState, expectedGeneration }); + } + + /// Invokes the private FlushCurrentResponse method via reflection. + private static void InvokeFlushCurrentResponse(CopilotService svc, object sessionState) + { + var method = typeof(CopilotService).GetMethod("FlushCurrentResponse", NonPublic)!; + method.Invoke(svc, new object?[] { sessionState }); + } + + /// Gets a field from SessionState by name. + private static T GetField(object state, string fieldName) + { + var field = state.GetType().GetField(fieldName, AnyInstance) + ?? throw new InvalidOperationException($"Field '{fieldName}' not found on {state.GetType().Name}"); + return (T)field.GetValue(state)!; + } + + /// Sets a field on SessionState by name. + private static void SetField(object state, string fieldName, object? value) + { + var field = state.GetType().GetField(fieldName, AnyInstance) + ?? throw new InvalidOperationException($"Field '{fieldName}' not found on {state.GetType().Name}"); + field.SetValue(state, value); + } + + /// Gets the CurrentResponse StringBuilder from SessionState. + private static StringBuilder GetCurrentResponse(object state) + { + var prop = state.GetType().GetProperty("CurrentResponse", AnyInstance)!; + return (StringBuilder)prop.GetValue(state)!; + } + + /// Gets the FlushedResponse StringBuilder from SessionState. + private static StringBuilder GetFlushedResponse(object state) + { + var prop = state.GetType().GetProperty("FlushedResponse", AnyInstance)!; + return (StringBuilder)prop.GetValue(state)!; + } + + /// Gets or sets the ResponseCompletion TCS from SessionState. + private static TaskCompletionSource? GetResponseCompletion(object state) + { + var prop = state.GetType().GetProperty("ResponseCompletion", AnyInstance)!; + return (TaskCompletionSource?)prop.GetValue(state); + } + private static void SetResponseCompletion(object state, TaskCompletionSource? tcs) + { + var prop = state.GetType().GetProperty("ResponseCompletion", AnyInstance)!; + prop.SetValue(state, tcs); + } + + /// + /// Sets up a session in a "dirty" processing state, simulating a mid-turn state + /// where all flags are set as if tools were executing. + /// + private static void SetupDirtyProcessingState(object state, AgentSessionInfo info) + { + info.IsProcessing = true; + info.IsResumed = true; + info.ProcessingStartedAt = DateTime.UtcNow.AddSeconds(-30); + info.ToolCallCount = 5; + info.ProcessingPhase = 3; + SetField(state, "SendingFlag", 1); + SetField(state, "ActiveToolCallCount", 3); + SetField(state, "HasUsedToolsThisTurn", true); + SetField(state, "SuccessfulToolCountThisTurn", 2); + } + + private static string GetRepoRoot() + { + var dir = AppContext.BaseDirectory; + while (dir != null && !File.Exists(Path.Combine(dir, "PolyPilot.slnx"))) + dir = Path.GetDirectoryName(dir); + return dir ?? throw new InvalidOperationException("Could not find repo root"); + } + + // ========================================================================= + // A. CompleteResponse Behavioral Tests + // These call the REAL CompleteResponse via reflection — no demo mode. + // ========================================================================= + + /// + /// INV-1: CompleteResponse must clear ALL processing state fields. + /// This is the most critical invariant — incomplete cleanup causes stuck sessions. + /// Regression: PRs #141, #148, #158, #163, #164 all missed fields. + /// + [Fact] + public async Task CompleteResponse_ClearsAllINV1Fields() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("inv1-complete"); + + var state = GetSessionState(svc, "inv1-complete"); + SetupDirtyProcessingState(state, session); + + // Set up TCS + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + SetResponseCompletion(state, tcs); + + // Act: call CompleteResponse with no generation check + InvokeCompleteResponse(svc, state, null); + + // Assert: ALL INV-1 fields must be cleared + Assert.False(session.IsProcessing, "IsProcessing must be false after CompleteResponse"); + Assert.False(session.IsResumed, "IsResumed must be false after CompleteResponse"); + Assert.Null(session.ProcessingStartedAt); + Assert.Equal(0, session.ToolCallCount); + Assert.Equal(0, session.ProcessingPhase); + + var sendingFlag = GetField(state, "SendingFlag"); + Assert.Equal(0, sendingFlag); + + var activeToolCount = GetField(state, "ActiveToolCallCount"); + Assert.Equal(0, activeToolCount); + + var hasUsedTools = GetField(state, "HasUsedToolsThisTurn"); + Assert.False(hasUsedTools); + + var successfulToolCount = GetField(state, "SuccessfulToolCountThisTurn"); + Assert.Equal(0, successfulToolCount); + } + + /// + /// CompleteResponse is idempotent: calling it when IsProcessing is already false + /// must NOT add duplicate messages or throw. (Watchdog may beat IDLE callback.) + /// + [Fact] + public async Task CompleteResponse_WhenIsProcessingAlreadyFalse_NoOp() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("idempotent-test"); + + var state = GetSessionState(svc, "idempotent-test"); + session.IsProcessing = false; // Already cleared + + var historyCountBefore = session.History.Count; + + // Act: CompleteResponse should be a no-op + InvokeCompleteResponse(svc, state, null); + + Assert.False(session.IsProcessing); + Assert.Equal(historyCountBefore, session.History.Count); + } + + /// + /// INV-3: Generation guard prevents stale IDLE from completing wrong turn. + /// This is THE race condition fix from PR #147. Without it, a queued IDLE + /// callback from turn N can complete turn N+1 when the user sends rapidly. + /// + [Fact] + public async Task CompleteResponse_GenerationMismatch_Skips() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("gen-guard-test"); + + var state = GetSessionState(svc, "gen-guard-test"); + session.IsProcessing = true; + + // Set generation to 5 (simulating 5th prompt sent) + SetField(state, "ProcessingGeneration", 5L); + + // Act: CompleteResponse with stale generation 3 + InvokeCompleteResponse(svc, state, 3L); + + // Assert: should NOT have completed — IsProcessing stays true + Assert.True(session.IsProcessing, "CompleteResponse must skip when generation mismatches"); + } + + /// + /// Generation guard allows completion when generations match. + /// + [Fact] + public async Task CompleteResponse_GenerationMatch_Executes() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("gen-match-test"); + + var state = GetSessionState(svc, "gen-match-test"); + session.IsProcessing = true; + SetField(state, "ProcessingGeneration", 5L); + SetField(state, "SendingFlag", 1); + + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + SetResponseCompletion(state, tcs); + + // Act: CompleteResponse with matching generation + InvokeCompleteResponse(svc, state, 5L); + + // Assert: should have completed + Assert.False(session.IsProcessing, "CompleteResponse must execute when generation matches"); + } + + /// + /// CompleteResponse with null generation always executes (used by error/watchdog paths). + /// + [Fact] + public async Task CompleteResponse_NullGeneration_AlwaysExecutes() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("null-gen-test"); + + var state = GetSessionState(svc, "null-gen-test"); + session.IsProcessing = true; + SetField(state, "ProcessingGeneration", 99L); + SetField(state, "SendingFlag", 1); + + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + SetResponseCompletion(state, tcs); + + // Act: null generation skips the generation check entirely + InvokeCompleteResponse(svc, state, null); + + Assert.False(session.IsProcessing, "CompleteResponse with null generation must always execute"); + } + + /// + /// CompleteResponse must flush accumulated content to History before clearing state. + /// Without this (PR #158 regression), content in CurrentResponse is silently lost. + /// + [Fact] + public async Task CompleteResponse_FlushesContentToHistory() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("flush-content-test"); + + var state = GetSessionState(svc, "flush-content-test"); + session.IsProcessing = true; + SetField(state, "SendingFlag", 1); + + // Simulate accumulated response text (from AssistantMessageDeltaEvent) + var currentResponse = GetCurrentResponse(state); + currentResponse.Append("This is the model's response text that must not be lost."); + + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + SetResponseCompletion(state, tcs); + + var historyCountBefore = session.History.Count; + + // Act + InvokeCompleteResponse(svc, state, null); + + // Assert: content was flushed to History + Assert.True(session.History.Count > historyCountBefore, + "CompleteResponse must add accumulated content to History"); + var lastMessage = session.History.Last(); + Assert.Equal("assistant", lastMessage.Role); + Assert.Contains("model's response text", lastMessage.Content); + } + + /// + /// CompleteResponse must include FlushedResponse (from mid-turn flushes on TurnEnd) + /// in the TCS result. Without this, orchestrator dispatch gets empty string. + /// This was the root cause of "orchestrator didn't respond to worker" bugs. + /// + [Fact] + public async Task CompleteResponse_IncludesFlushedResponseInTcsResult() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("flushed-tcs-test"); + + var state = GetSessionState(svc, "flushed-tcs-test"); + session.IsProcessing = true; + SetField(state, "SendingFlag", 1); + + // Simulate mid-turn flush (happens on AssistantTurnEndEvent) + var flushedResponse = GetFlushedResponse(state); + flushedResponse.Append("First sub-turn response text"); + + // Simulate additional content after turn end + var currentResponse = GetCurrentResponse(state); + currentResponse.Append("Second sub-turn continuation"); + + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + SetResponseCompletion(state, tcs); + + // Act + InvokeCompleteResponse(svc, state, null); + + // Assert: TCS result includes BOTH flushed and current text + Assert.True(tcs.Task.IsCompleted); + var result = tcs.Task.Result; + Assert.Contains("First sub-turn response text", result); + Assert.Contains("Second sub-turn continuation", result); + } + + /// + /// CompleteResponse fires OnSessionComplete so orchestrator loops can unblock. + /// Without this (INV-O4), multi-agent workers hang forever waiting for completion. + /// + [Fact] + public async Task CompleteResponse_FiresOnSessionComplete() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("complete-event-test"); + + var state = GetSessionState(svc, "complete-event-test"); + session.IsProcessing = true; + SetField(state, "SendingFlag", 1); + + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + SetResponseCompletion(state, tcs); + + var completedSessions = new List(); + svc.OnSessionComplete += (name, _) => completedSessions.Add(name); + + // Act + InvokeCompleteResponse(svc, state, null); + + // Assert + Assert.Contains("complete-event-test", completedSessions); + } + + /// + /// When IsProcessing is already false but CurrentResponse has accumulated content, + /// CompleteResponse should still flush that content (late deltas after watchdog/error). + /// + [Fact] + public async Task CompleteResponse_WhenAlreadyFalse_StillFlushesLateContent() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("late-flush-test"); + + var state = GetSessionState(svc, "late-flush-test"); + session.IsProcessing = false; // Already cleared by watchdog + + // But late deltas arrived after watchdog cleared state + var currentResponse = GetCurrentResponse(state); + currentResponse.Append("Late delta content that arrived after watchdog"); + + var historyCountBefore = session.History.Count; + + // Act + InvokeCompleteResponse(svc, state, null); + + // Assert: late content was still flushed + Assert.True(session.History.Count > historyCountBefore, + "Late content must be flushed even when IsProcessing is already false"); + } + + // ========================================================================= + // B. Watchdog Timeout Decision Logic + // Tests the actual timeout selection logic from RunProcessingWatchdogAsync. + // ========================================================================= + + /// + /// Mirrors the three-tier timeout selection logic from RunProcessingWatchdogAsync. + /// This is a direct copy of the production code — if production changes, this test + /// detects the deviation. + /// + private static int ComputeEffectiveTimeout( + bool hasActiveTool, bool hasUsedTools, bool isMultiAgent, + bool isResumed, bool hasReceivedEvents) + { + var useResumeQuiescence = isResumed && !hasReceivedEvents && !hasActiveTool && !hasUsedTools; + var useToolTimeout = hasActiveTool || (isResumed && !useResumeQuiescence) || hasUsedTools || isMultiAgent; + return useResumeQuiescence + ? CopilotService.WatchdogResumeQuiescenceTimeoutSeconds + : useToolTimeout + ? CopilotService.WatchdogToolExecutionTimeoutSeconds + : CopilotService.WatchdogInactivityTimeoutSeconds; + } + + /// + /// INV-5: HasUsedToolsThisTurn BETWEEN tool rounds must keep 600s timeout. + /// This is the primary protection against "messages killed during long-running processes." + /// ActiveToolCallCount resets on AssistantTurnStartEvent between rounds — only + /// HasUsedToolsThisTurn persists and keeps the longer timeout. + /// + [Fact] + public void WatchdogTimeout_BetweenToolRounds_Uses600s() + { + // Between tool rounds: ActiveToolCallCount=0, but HasUsedToolsThisTurn=true + var timeout = ComputeEffectiveTimeout( + hasActiveTool: false, hasUsedTools: true, isMultiAgent: false, + isResumed: false, hasReceivedEvents: false); + + Assert.Equal(CopilotService.WatchdogToolExecutionTimeoutSeconds, timeout); + Assert.Equal(600, timeout); + } + + /// Active tool execution gets the 600s timeout. + [Fact] + public void WatchdogTimeout_ActiveTool_Uses600s() + { + var timeout = ComputeEffectiveTimeout( + hasActiveTool: true, hasUsedTools: false, isMultiAgent: false, + isResumed: false, hasReceivedEvents: false); + Assert.Equal(600, timeout); + } + + /// Multi-agent sessions always get 600s to prevent killing workers mid-task. + [Fact] + public void WatchdogTimeout_MultiAgent_Uses600s() + { + var timeout = ComputeEffectiveTimeout( + hasActiveTool: false, hasUsedTools: false, isMultiAgent: true, + isResumed: false, hasReceivedEvents: false); + Assert.Equal(600, timeout); + } + + /// Resumed session with no events → 30s quiescence (fast recovery). + [Fact] + public void WatchdogTimeout_ResumedNoEvents_Uses30sQuiescence() + { + var timeout = ComputeEffectiveTimeout( + hasActiveTool: false, hasUsedTools: false, isMultiAgent: false, + isResumed: true, hasReceivedEvents: false); + Assert.Equal(CopilotService.WatchdogResumeQuiescenceTimeoutSeconds, timeout); + Assert.Equal(30, timeout); + } + + /// Resumed session with events flowing → 600s (session is active). + [Fact] + public void WatchdogTimeout_ResumedWithEvents_Uses600s() + { + var timeout = ComputeEffectiveTimeout( + hasActiveTool: false, hasUsedTools: false, isMultiAgent: false, + isResumed: true, hasReceivedEvents: true); + Assert.Equal(600, timeout); + } + + /// No tools, not resumed, not multi-agent → 120s base timeout. + [Fact] + public void WatchdogTimeout_BaseCase_Uses120s() + { + var timeout = ComputeEffectiveTimeout( + hasActiveTool: false, hasUsedTools: false, isMultiAgent: false, + isResumed: false, hasReceivedEvents: false); + Assert.Equal(CopilotService.WatchdogInactivityTimeoutSeconds, timeout); + Assert.Equal(120, timeout); + } + + /// + /// Comprehensive: all 8 combinations of the 3 main flags that contribute to useToolTimeout. + /// This catches logic errors like using && instead of ||. + /// + [Theory] + [InlineData(false, false, false, false, false, 120)] // base case + [InlineData(true, false, false, false, false, 600)] // active tool + [InlineData(false, true, false, false, false, 600)] // used tools (between rounds!) + [InlineData(false, false, true, false, false, 600)] // multi-agent + [InlineData(true, true, false, false, false, 600)] // active + used + [InlineData(true, false, true, false, false, 600)] // active + multi + [InlineData(false, true, true, false, false, 600)] // used + multi + [InlineData(true, true, true, false, false, 600)] // all three + public void WatchdogTimeout_AllCombinations( + bool hasActive, bool hasUsed, bool isMulti, + bool isResumed, bool hasEvents, int expected) + { + var timeout = ComputeEffectiveTimeout(hasActive, hasUsed, isMulti, isResumed, hasEvents); + Assert.Equal(expected, timeout); + } + + // ========================================================================= + // C. AbortSessionAsync — Full INV-1 Cleanup + // Tests use real AbortSessionAsync (public) with reflection state setup. + // ========================================================================= + + /// + /// INV-1: AbortSessionAsync clears ALL processing state fields. + /// This is the path exercised when user clicks "Stop" during a long-running process. + /// + [Fact] + public async Task AbortSession_ClearsAllINV1Fields() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("abort-inv1"); + + var state = GetSessionState(svc, "abort-inv1"); + SetupDirtyProcessingState(state, session); + + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + SetResponseCompletion(state, tcs); + + // Act + await svc.AbortSessionAsync("abort-inv1"); + + // Assert: ALL fields cleared + Assert.False(session.IsProcessing); + Assert.False(session.IsResumed); + Assert.Null(session.ProcessingStartedAt); + Assert.Equal(0, session.ToolCallCount); + Assert.Equal(0, session.ProcessingPhase); + Assert.Equal(0, GetField(state, "SendingFlag")); + Assert.Equal(0, GetField(state, "ActiveToolCallCount")); + Assert.False(GetField(state, "HasUsedToolsThisTurn")); + Assert.Equal(0, GetField(state, "SuccessfulToolCountThisTurn")); + } + + /// + /// Abort with accumulated content preserves partial response in History. + /// Without this, clicking Stop discards what the user was waiting for. + /// + [Fact] + public async Task AbortSession_PreservesAccumulatedContent() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("abort-content"); + + var state = GetSessionState(svc, "abort-content"); + session.IsProcessing = true; + SetField(state, "SendingFlag", 1); + + // Accumulate partial response + GetCurrentResponse(state).Append("Partial response that should be preserved on abort"); + + var historyCountBefore = session.History.Count; + + // Act + await svc.AbortSessionAsync("abort-content"); + + // Assert: partial content preserved + Assert.True(session.History.Count > historyCountBefore); + var lastMsg = session.History.Last(m => m.Role == "assistant"); + Assert.Contains("Partial response", lastMsg.Content); + } + + /// + /// Abort completes the ResponseCompletion TCS so orchestrator loops unblock. + /// TCS must be completed AFTER state cleanup (INV-O3). + /// + [Fact] + public async Task AbortSession_CompletesResponseTcs() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("abort-tcs"); + + var state = GetSessionState(svc, "abort-tcs"); + session.IsProcessing = true; + SetField(state, "SendingFlag", 1); + + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + SetResponseCompletion(state, tcs); + + // Act + await svc.AbortSessionAsync("abort-tcs"); + + // Assert: TCS was completed (canceled, since it's an abort) + Assert.True(tcs.Task.IsCompleted); + } + + /// + /// Abort fires OnSessionComplete so orchestrator loops don't hang. + /// + [Fact] + public async Task AbortSession_FiresOnSessionComplete() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("abort-event"); + + var state = GetSessionState(svc, "abort-event"); + session.IsProcessing = true; + SetField(state, "SendingFlag", 1); + + var completedSessions = new List(); + svc.OnSessionComplete += (name, _) => completedSessions.Add(name); + + // Act + await svc.AbortSessionAsync("abort-event"); + + Assert.Contains("abort-event", completedSessions); + } + + /// + /// Abort clears the message queue so queued prompts don't auto-send. + /// + [Fact] + public async Task AbortSession_ClearsMessageQueue() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("abort-queue"); + + session.IsProcessing = true; + session.MessageQueue.Add("queued-message-1"); + session.MessageQueue.Add("queued-message-2"); + + var state = GetSessionState(svc, "abort-queue"); + SetField(state, "SendingFlag", 1); + + // Act + await svc.AbortSessionAsync("abort-queue"); + + Assert.Empty(session.MessageQueue); + } + + // ========================================================================= + // D. Content Preservation — Messages Never Lost + // ========================================================================= + + /// + /// FlushCurrentResponse moves accumulated text to History and clears CurrentResponse. + /// This is called on AssistantTurnEndEvent to persist content mid-turn. + /// + [Fact] + public async Task FlushCurrentResponse_AddsToHistory() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("flush-test"); + + var state = GetSessionState(svc, "flush-test"); + session.SessionId = "test-session-id"; // Required for DB write-through + + var currentResponse = GetCurrentResponse(state); + currentResponse.Append("Response content to be flushed"); + + var historyCountBefore = session.History.Count; + + // Act + InvokeFlushCurrentResponse(svc, state); + + // Assert: content in History, CurrentResponse cleared + Assert.True(session.History.Count > historyCountBefore); + Assert.Equal(0, currentResponse.Length); + + var flushedMsg = session.History.Last(); + Assert.Equal("assistant", flushedMsg.Role); + Assert.Equal("Response content to be flushed", flushedMsg.Content); + } + + /// + /// FlushCurrentResponse dedup guard: if the last assistant message has identical content, + /// the flush is skipped to prevent duplicates on session resume. + /// + [Fact] + public async Task FlushCurrentResponse_DedupGuard_SkipsDuplicate() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("dedup-test"); + + var state = GetSessionState(svc, "dedup-test"); + + // Add a message that looks like it was already flushed + session.History.Add(ChatMessage.AssistantMessage("Already flushed content")); + var historyCountAfterFirst = session.History.Count; + + // Simulate the same content appearing in CurrentResponse (SDK replay on resume) + GetCurrentResponse(state).Append("Already flushed content"); + + // Act + InvokeFlushCurrentResponse(svc, state); + + // Assert: no duplicate added + Assert.Equal(historyCountAfterFirst, session.History.Count); + } + + /// + /// FlushCurrentResponse accumulates text in FlushedResponse so CompleteResponse + /// can include it in the TCS result for orchestrator dispatch. + /// + [Fact] + public async Task FlushCurrentResponse_AccumulatesInFlushedResponse() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("accum-test"); + + var state = GetSessionState(svc, "accum-test"); + + // First flush + GetCurrentResponse(state).Append("First sub-turn"); + InvokeFlushCurrentResponse(svc, state); + + // Second flush + GetCurrentResponse(state).Append("Second sub-turn"); + InvokeFlushCurrentResponse(svc, state); + + // Assert: FlushedResponse has both + var flushed = GetFlushedResponse(state).ToString(); + Assert.Contains("First sub-turn", flushed); + Assert.Contains("Second sub-turn", flushed); + } + + /// + /// Multi-turn conversation: messages are preserved across sequential sends. + /// + [Fact] + public async Task MultiTurn_AllMessagesPreserved() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("multi-turn"); + + for (int i = 0; i < 5; i++) + { + await svc.SendPromptAsync("multi-turn", $"Message {i}"); + } + + // All user messages should be in history + var userMessages = session.History.Where(m => m.Role == "user").ToList(); + Assert.True(userMessages.Count >= 5, + $"Expected at least 5 user messages, got {userMessages.Count}"); + Assert.False(session.IsProcessing); + } + + /// + /// FlushCurrentResponse with empty content is a no-op (no empty messages in history). + /// + [Fact] + public async Task FlushCurrentResponse_EmptyContent_NoOp() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("empty-flush"); + + var state = GetSessionState(svc, "empty-flush"); + var historyCount = session.History.Count; + + // Act: flush with empty buffer + InvokeFlushCurrentResponse(svc, state); + + Assert.Equal(historyCount, session.History.Count); + } + + // ========================================================================= + // E. Structural Regression Guards (source assertions) + // These verify critical code patterns are present. Keep minimal — 3 max. + // ========================================================================= + + /// + /// The watchdog callback must have a generation guard to prevent killing a new + /// turn if the user aborts + resends between Post() and callback execution. + /// + [Fact] + public void WatchdogCallback_HasGenerationGuard() + { + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + + // Find the watchdog callback inside InvokeOnUI + var watchdogIdx = source.IndexOf("watchdogGeneration != currentGen", StringComparison.Ordinal); + Assert.True(watchdogIdx > 0, + "Watchdog callback must compare watchdogGeneration to currentGen"); + } + + /// + /// CompleteResponse must clear SendingFlag to allow subsequent sends. + /// Without this, the session deadlocks on next SendPromptAsync. + /// + [Fact] + public void CompleteResponse_Source_ClearsSendingFlag() + { + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + + // Find CompleteResponse method + var crIdx = source.IndexOf("private void CompleteResponse(", StringComparison.Ordinal); + Assert.True(crIdx > 0); + + // Within CompleteResponse, SendingFlag must be cleared (may be 80+ lines into method) + var afterCR = source.Substring(crIdx, Math.Min(5000, source.Length - crIdx)); + Assert.Contains("SendingFlag", afterCR); + } + + /// + /// The "Session not found" reconnect path must include McpServers and SkillDirectories + /// in the fresh session config (PR #330 regression guard). + /// + [Fact] + public void ReconnectPath_IncludesMcpServersAndSkills() + { + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.cs")); + + var sessionNotFoundIdx = source.IndexOf("Session not found", StringComparison.OrdinalIgnoreCase); + Assert.True(sessionNotFoundIdx > 0); + + var afterNotFound = source.Substring(sessionNotFoundIdx, Math.Min(2000, source.Length - sessionNotFoundIdx)); + Assert.Contains("McpServers", afterNotFound); + Assert.Contains("SkillDirectories", afterNotFound); + } + + // ========================================================================= + // F. Race Condition & Edge Case Tests + // ========================================================================= + + /// + /// Sequential sends don't leave ghost processing state. + /// Each send must complete fully before the next can proceed. + /// + [Fact] + public async Task SequentialSends_NoGhostState() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("no-ghost"); + + for (int i = 0; i < 10; i++) + { + await svc.SendPromptAsync("no-ghost", $"Prompt {i}"); + Assert.False(session.IsProcessing, $"IsProcessing stuck after prompt {i}"); + Assert.Equal(0, session.ProcessingPhase); + } + } + + /// + /// CompleteResponse clears PendingReasoningMessages to prevent stale reasoning + /// from previous turn leaking into next turn's display. + /// + [Fact] + public async Task CompleteResponse_ClearsPendingReasoning() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("reasoning-clear"); + + var state = GetSessionState(svc, "reasoning-clear"); + session.IsProcessing = true; + SetField(state, "SendingFlag", 1); + + // Add pending reasoning + var pendingProp = state.GetType().GetProperty("PendingReasoningMessages", AnyInstance)!; + var pendingDict = pendingProp.GetValue(state) as System.Collections.Concurrent.ConcurrentDictionary; + pendingDict!.TryAdd("reasoning-1", ChatMessage.SystemMessage("test reasoning")); + + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + SetResponseCompletion(state, tcs); + + // Act + InvokeCompleteResponse(svc, state, null); + + // Assert: pending reasoning cleared + Assert.True(pendingDict.IsEmpty, "PendingReasoningMessages must be cleared on CompleteResponse"); + } + + /// + /// Abort when not processing is a no-op (safe to call multiple times). + /// + [Fact] + public async Task AbortSession_WhenNotProcessing_NoOp() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("abort-noop"); + + session.IsProcessing = false; + var historyCount = session.History.Count; + + // Act: abort should be a safe no-op + await svc.AbortSessionAsync("abort-noop"); + + Assert.Equal(historyCount, session.History.Count); + Assert.False(session.IsProcessing); + } + + /// + /// The TCS result includes full response even when FlushedResponse has content + /// but CurrentResponse is empty (common case: TurnEnd flush cleared CurrentResponse + /// before SessionIdle fires CompleteResponse). + /// + [Fact] + public async Task CompleteResponse_FlushedOnlyNoCurrentResponse_TcsHasContent() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("flushed-only"); + + var state = GetSessionState(svc, "flushed-only"); + session.IsProcessing = true; + SetField(state, "SendingFlag", 1); + + // FlushedResponse has content (from TurnEnd flush), CurrentResponse is empty + GetFlushedResponse(state).Append("Full turn response from mid-turn flush"); + // CurrentResponse stays empty — this is the normal TurnEnd → SessionIdle flow + + var tcs = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + SetResponseCompletion(state, tcs); + + // Act + InvokeCompleteResponse(svc, state, null); + + // Assert: TCS has the flushed content + Assert.True(tcs.Task.IsCompleted); + Assert.Contains("Full turn response from mid-turn flush", tcs.Task.Result); + } + + /// + /// ChatDatabase write-through is called on content flush (fire-and-forget). + /// + [Fact] + public async Task FlushCurrentResponse_WritesToChatDatabase() + { + var svc = CreateService(); + await svc.ReconnectAsync(new ConnectionSettings { Mode = ConnectionMode.Demo }); + var session = await svc.CreateSessionAsync("db-write-test"); + + var state = GetSessionState(svc, "db-write-test"); + session.SessionId = "test-db-session-id"; + + GetCurrentResponse(state).Append("Content for database"); + + var dbCountBefore = _chatDb.AddedMessages.Count; + + // Act + InvokeFlushCurrentResponse(svc, state); + + // Assert: DB write was triggered + Assert.True(_chatDb.AddedMessages.Count > dbCountBefore, + "FlushCurrentResponse must write through to ChatDatabase"); + var lastDbMsg = _chatDb.AddedMessages.Last(); + Assert.Equal("test-db-session-id", lastDbMsg.SessionId); + Assert.Contains("Content for database", lastDbMsg.Message.Content); + } +} From 61804dd0f12beaffcec8d3db114e9527a72e3efd Mon Sep 17 00:00:00 2001 From: Shane Neuville Date: Mon, 9 Mar 2026 16:33:17 -0500 Subject: [PATCH 2/6] fix: rescue stuck sessions using ActiveToolCallCount to distinguish running tools from lost SDK events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root causes of two chat reliability bugs: 1. 'Messages stop streaming / user has to prod' — AssistantTurnEndEvent fallback suppressed CompleteResponse permanently when HasUsedToolsThisTurn=true (preventing premature completion between tool rounds). But if SessionIdleEvent was lost (SDK bug #299), nothing would ever complete the session. Fixed: wait an additional 30s (TurnEndIdleToolFallbackAdditionalMs) then fire CompleteResponse anyway if no new TurnStart arrives. Total wait = 4s + 30s = 34s for tool sessions. 2. 'Long-running processes killed at ~10 min' — watchdog uses SDK event inactivity as proxy for stuck, but silent tools (builds, test runs) produce zero events during execution. Fixed: check ActiveToolCallCount at timeout: - ActiveToolCallCount > 0 (tool running): probe server TCP liveness; reset timer if alive, kill if server is dead - ActiveToolCallCount = 0 AND tools were used (SessionIdleEvent lost): complete cleanly via CompleteResponse — no error message - Max time exceeded or demo/remote: kill with error (original behavior) The key insight: ActiveToolCallCount=0 means the tool finished; the session just needs the terminal SessionIdleEvent that was dropped. Completing cleanly is correct here. Tests added: - TurnEndFallbackTests: TurnEndIdleToolFallbackAdditionalMs reasonable, total < watchdog, cancelled-by-TurnStart, eventual fire - ProcessingWatchdogTests: ActiveTool+ServerAlive resets timer, NoActiveTool+ToolsUsed completes cleanly, MaxTime always kills Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PolyPilot.Tests/ProcessingWatchdogTests.cs | 91 ++++++++++++++++++++- PolyPilot.Tests/TurnEndFallbackTests.cs | 80 +++++++++++++++++- PolyPilot/Services/CopilotService.Events.cs | 77 ++++++++++++++++- 3 files changed, 243 insertions(+), 5 deletions(-) diff --git a/PolyPilot.Tests/ProcessingWatchdogTests.cs b/PolyPilot.Tests/ProcessingWatchdogTests.cs index 217b93b4d4..2b7488deed 100644 --- a/PolyPilot.Tests/ProcessingWatchdogTests.cs +++ b/PolyPilot.Tests/ProcessingWatchdogTests.cs @@ -2153,4 +2153,93 @@ public void Scenario_NormalPrompt_QuiescenceNeverApplies() Assert.Equal(CopilotService.WatchdogInactivityTimeoutSeconds, timeout); Assert.True(timeout > 30, "Normal prompt must use timeout > 30s"); } -} + + // ===== Watchdog tool-vs-no-tool behavior (new in PR fix) ===== + + [Fact] + public void WatchdogDecision_ActiveTool_ServerAlive_ShouldResetTimer() + { + // When a tool is actively running (ActiveToolCallCount > 0) and the server is alive, + // the watchdog must reset the inactivity timer rather than killing the session. + // This is verified via source-code assertion (since SessionState is private). + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync"); + var elapsedIdx = source.IndexOf("elapsed >= effectiveTimeout", methodIdx); + var block = source.Substring(elapsedIdx, 3500); + + // Must check hasActiveTool before deciding server liveness path + Assert.True(block.Contains("hasActiveTool"), + "Watchdog must branch on hasActiveTool to distinguish running tool from lost-idle scenario"); + + // Server liveness check must be conditioned on hasActiveTool + var activeToolIdx = block.IndexOf("hasActiveTool"); + var serverRunningIdx = block.IndexOf("IsServerRunning"); + Assert.True(serverRunningIdx > activeToolIdx, + "IsServerRunning check must appear AFTER the hasActiveTool check (guarded by it)"); + + // Timer reset: LastEventAtTicks must be updated in the 'server alive' path + var lastEventIdx = block.IndexOf("LastEventAtTicks"); + Assert.True(lastEventIdx > 0, "Must reset LastEventAtTicks when server is alive and tool is running"); + + // Must use continue to skip the kill + var continueIdx = block.IndexOf("continue;"); + Assert.True(continueIdx > 0, "Must 'continue' watchdog loop when server alive and tool running"); + } + + [Fact] + public void WatchdogDecision_NoActiveTool_ToolsWereUsed_ShouldCompleteCleanly() + { + // When no tool is active (ActiveToolCallCount = 0) but tools WERE used this turn, + // the watchdog must complete the session CLEANLY (call CompleteResponse, no error msg). + // This is the "SessionIdleEvent lost" scenario — response is done, just terminal event missed. + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync"); + var elapsedIdx = source.IndexOf("elapsed >= effectiveTimeout", methodIdx); + var block = source.Substring(elapsedIdx, 8000); + + // Must have both conditions + Assert.True(block.Contains("hasUsedTools"), + "Watchdog must check hasUsedTools to identify lost-idle-event scenario"); + + // CompleteResponse must be called in this path (not just the error path) + var completeResponseIdx = block.IndexOf("CompleteResponse"); + Assert.True(completeResponseIdx > 0, + "Watchdog must call CompleteResponse for the lost-idle-event scenario (not just show error)"); + + // The error message should NOT appear in this path — clean completion, no error text + // The error path ('Session appears stuck') should be in a different branch + var stuckMsgIdx = block.IndexOf("Session appears stuck"); + Assert.True(stuckMsgIdx > completeResponseIdx, + "The 'appears stuck' error message must come AFTER CompleteResponse, in a separate branch"); + } + + [Fact] + public void WatchdogDecision_MaxTimeExceeded_AlwaysKills() + { + // When total processing time exceeds WatchdogMaxProcessingTimeSeconds, the watchdog + // must kill regardless of server liveness or tool state — no session runs forever. + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync"); + var elapsedIdx = source.IndexOf("elapsed >= effectiveTimeout", methodIdx); + var block = source.Substring(elapsedIdx, 8000); + + // exceededMaxTime must gate the liveness/clean-complete paths + var exceededIdx = block.IndexOf("exceededMaxTime"); + var serverRunningIdx = block.IndexOf("IsServerRunning"); + Assert.True(exceededIdx > 0 && serverRunningIdx > 0, + "Both exceededMaxTime and IsServerRunning must be present"); + Assert.True(exceededIdx < serverRunningIdx, + "exceededMaxTime check must appear before the server liveness bypass — max time always kills"); + } + + private static string GetRepoRoot() + { + var dir = new DirectoryInfo(AppContext.BaseDirectory); + while (dir != null && !File.Exists(Path.Combine(dir.FullName, "PolyPilot.slnx"))) + dir = dir.Parent; + return dir?.FullName ?? throw new InvalidOperationException("Could not find repo root"); + } +} \ No newline at end of file diff --git a/PolyPilot.Tests/TurnEndFallbackTests.cs b/PolyPilot.Tests/TurnEndFallbackTests.cs index c325c65df5..7664df2fa6 100644 --- a/PolyPilot.Tests/TurnEndFallbackTests.cs +++ b/PolyPilot.Tests/TurnEndFallbackTests.cs @@ -134,4 +134,82 @@ public async Task FallbackTimer_CancelledAfterIsCancellationRequestedCheck_DoesN Assert.False(fired, "Explicit IsCancellationRequested guard must prevent firing after cancel"); } -} + + // ===== Tool session extended fallback ===== + + [Fact] + public void TurnEndIdleToolFallbackAdditionalMs_IsReasonable() + { + // Must be long enough that a normal LLM inter-round pause (reasoning, >4s up to ~30s) + // won't trigger a premature CompleteResponse, but short enough to rescue a stuck session + // faster than the watchdog (600s). + Assert.InRange(CopilotService.TurnEndIdleToolFallbackAdditionalMs, 10_000, 120_000); + Assert.True( + CopilotService.TurnEndIdleToolFallbackAdditionalMs > CopilotService.TurnEndIdleFallbackMs, + "Tool session additional delay must be longer than the base 4s fallback"); + } + + [Fact] + public void TurnEndIdleToolFallback_TotalDelay_IsLessThanWatchdog() + { + // The combined fallback delay must be shorter than the watchdog tool timeout so + // we rescue stuck sessions before the watchdog fires the error message. + var totalMs = CopilotService.TurnEndIdleFallbackMs + CopilotService.TurnEndIdleToolFallbackAdditionalMs; + var watchdogMs = CopilotService.WatchdogToolExecutionTimeoutSeconds * 1000; + Assert.True(totalMs < watchdogMs, + $"Total fallback ({totalMs}ms) must be less than watchdog tool timeout ({watchdogMs}ms)"); + } + + [Fact] + public async Task ToolFallback_CancelledByTurnStart_DoesNotFire() + { + // Simulates: TurnEnd (tools used) starts timer -> TurnStart arrives -> cancels -> no fire + var cts = new CancellationTokenSource(); + var token = cts.Token; + bool completeResponseFired = false; + + var fallbackTask = Task.Run(async () => + { + try + { + await Task.Delay(CopilotService.TurnEndIdleFallbackMs, token); + if (token.IsCancellationRequested) return; + await Task.Delay(100, token); // accelerated extended delay + if (token.IsCancellationRequested) return; + completeResponseFired = true; + } + catch (OperationCanceledException) { } + }); + + await Task.Delay(50); + cts.Cancel(); + cts.Dispose(); + await fallbackTask; + Assert.False(completeResponseFired, "Fallback must not fire when cancelled by TurnStart"); + } + + [Fact] + public async Task ToolFallback_NoTurnStart_EventuallyFires() + { + // Simulates: TurnEnd (tools used) + no TurnStart + no SessionIdle -> fallback fires + var cts = new CancellationTokenSource(); + var token = cts.Token; + bool completeResponseFired = false; + + _ = Task.Run(async () => + { + try + { + await Task.Delay(50, token); // accelerated base delay + if (token.IsCancellationRequested) return; + await Task.Delay(100, token); // accelerated extended delay + if (token.IsCancellationRequested) return; + completeResponseFired = true; + } + catch (OperationCanceledException) { } + }); + + await Task.Delay(400); + Assert.True(completeResponseFired, "Fallback must fire when no TurnStart or SessionIdle arrives"); + } +} \ No newline at end of file diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs index 6f790e0a1c..ed2bc1b1b0 100644 --- a/PolyPilot/Services/CopilotService.Events.cs +++ b/PolyPilot/Services/CopilotService.Events.cs @@ -480,11 +480,23 @@ void Invoke(Action action) } // Guard: if tools were used this turn, the LLM may still be reasoning // between tool rounds (TurnEnd → thinking → TurnStart, >4s). - // HasUsedToolsThisTurn stays true across all sub-turns and is only - // cleared at CompleteResponse/abort. + // Don't complete immediately — wait an additional period. If no new + // TurnStart arrives within that window, SessionIdleEvent was lost + // (SDK bug #299) and we must complete to unblock the session. if (Volatile.Read(ref state.HasUsedToolsThisTurn)) { - Debug($"[IDLE-FALLBACK] '{sessionName}' skipped — tools were used this turn, TurnStart likely coming"); + await Task.Delay(TurnEndIdleToolFallbackAdditionalMs, fallbackToken); + if (fallbackToken.IsCancellationRequested) return; + // Re-check: if a new tool started or TurnStart fired and cancelled + // this token, we would have exited above. If still here, no new + // activity arrived → SessionIdleEvent was lost → complete. + if (Interlocked.CompareExchange(ref state.ActiveToolCallCount, 0, 0) > 0) + { + Debug($"[IDLE-FALLBACK] '{sessionName}' skipped after extended wait — tools still active"); + return; + } + Debug($"[IDLE-FALLBACK] '{sessionName}' SessionIdleEvent not received {TurnEndIdleFallbackMs + TurnEndIdleToolFallbackAdditionalMs}ms after TurnEnd (tools used) — firing CompleteResponse"); + Invoke(() => CompleteResponse(state, turnEndGen)); return; } Debug($"[IDLE-FALLBACK] '{sessionName}' SessionIdleEvent not received {TurnEndIdleFallbackMs}ms after TurnEnd — firing CompleteResponse"); @@ -1385,6 +1397,15 @@ private void HandleReflectionAdvanceResult(SessionState state, string response, /// internal const int TurnEndIdleFallbackMs = 4000; + /// + /// Additional milliseconds to wait after the initial TurnEnd fallback when tools were used + /// this turn. After the base 4s wait we know there are no active tools, but the LLM may + /// be reasoning between rounds (TurnEnd → TurnStart gap can exceed 4s). If no TurnStart + /// arrives within this additional window, SessionIdleEvent was likely lost — fire CompleteResponse. + /// Total wait for tool sessions = TurnEndIdleFallbackMs + TurnEndIdleToolFallbackAdditionalMs = 34s. + /// + internal const int TurnEndIdleToolFallbackAdditionalMs = 30_000; + private static void CancelProcessingWatchdog(SessionState state) { if (state.ProcessingWatchdog != null) @@ -1502,6 +1523,56 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session if (elapsed >= effectiveTimeout) { var exceededMaxTime = totalProcessingSeconds >= WatchdogMaxProcessingTimeSeconds; + + // Before killing, check what state we're actually in: + // + // Case A — Tool actively running (ActiveToolCallCount > 0): + // The SDK only fires events at tool start/complete; a long build or test run + // produces zero events while executing. Use server TCP liveness to decide: + // - Server alive → tool is running, reset inactivity timer and keep waiting + // - Server dead → no events will ever arrive, kill with error + // (Skipped for demo/remote where we can't probe the local TCP port) + // + // Case B — No active tool (ActiveToolCallCount = 0) and tools were used: + // Tool has finished. The LLM has finished (AssistantTurnEndEvent fired). + // SessionIdleEvent was lost (SDK bug #299 or network hiccup). + // The 34s fallback should have caught this, but if not, complete cleanly. + // No error message — the response IS complete, we just missed the terminal event. + // + // Case C — Max time exceeded OR server dead / demo / remote: + // Kill with error (genuine zombie session). + if (!exceededMaxTime) + { + if (hasActiveTool && !IsDemoMode && !IsRemoteMode) + { + // Case A: check server TCP port + var serverAlive = _serverManager.IsServerRunning; + if (serverAlive) + { + Debug($"[WATCHDOG] '{sessionName}' {elapsed:F0}s inactivity but tool is running and server is alive — resetting timer " + + $"(timeout={effectiveTimeout}s, totalProcessing={totalProcessingSeconds:F0}s)"); + Interlocked.Exchange(ref state.LastEventAtTicks, DateTime.UtcNow.Ticks); + continue; // keep waiting — don't kill + } + Debug($"[WATCHDOG] '{sessionName}' tool running but server is not responding — killing stuck session " + + $"(elapsed={elapsed:F0}s, totalProcessing={totalProcessingSeconds:F0}s)"); + } + else if (!hasActiveTool && hasUsedTools) + { + // Case B: tool finished, no active tools, SessionIdleEvent lost — complete cleanly + var watchdogGen = Interlocked.Read(ref state.ProcessingGeneration); + Debug($"[WATCHDOG] '{sessionName}' tool finished but SessionIdleEvent never arrived — completing cleanly " + + $"(elapsed={elapsed:F0}s, totalProcessing={totalProcessingSeconds:F0}s)"); + InvokeOnUI(() => + { + if (!state.Info.IsProcessing) return; + if (Interlocked.Read(ref state.ProcessingGeneration) != watchdogGen) return; + CompleteResponse(state, watchdogGen); + }); + break; + } + } + var timeoutDisplay = exceededMaxTime ? $"{WatchdogMaxProcessingTimeSeconds / 60} minute(s) total processing time" : effectiveTimeout >= 60 From 2785239530be54330a316f170fdf66d39f3c89a7 Mon Sep 17 00:00:00 2001 From: Shane Neuville Date: Mon, 9 Mar 2026 16:43:53 -0500 Subject: [PATCH 3/6] fix: periodic watchdog flush + ExternalToolRequestedEvent in EventMatrix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses two remaining items from stuck-session analysis: 1. ExternalToolRequestedEvent in SdkEventMatrix (Rec 3): This event was arriving as 'Unhandled', causing log spam and potentially updating LastEventAtTicks on sessions that shouldn't be kept alive. Now explicitly classified as TimelineOnly so it's handled silently. 2. Periodic mid-watchdog flush (Rec 2): If content has accumulated in CurrentResponse during a watchdog check interval, flush it to History immediately. This ensures partial responses are visible even while IsProcessing=true — covering cases where streaming stalled and TurnEnd->Idle fallback hasn't fired yet. Flush occurs every WatchdogCheckIntervalSeconds (15s) as long as there's pending content. Combined with the previous commit (30s extended TurnEnd fallback + server liveness check), all 4 recommendations from the stuck-session analysis are now addressed: Rec 1: TurnEnd fallback works for tool sessions (30s extended delay) ✓ Rec 2: Periodic watchdog flush of CurrentResponse to History ✓ Rec 3: ExternalToolRequestedEvent in EventMatrix (no more log spam) ✓ Rec 4: Case B clean CompleteResponse when ActiveToolCallCount=0+tools used ✓ Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PolyPilot.Tests/ProcessingWatchdogTests.cs | 60 +++++++++++++++++++++ PolyPilot/Services/CopilotService.Events.cs | 20 +++++++ 2 files changed, 80 insertions(+) diff --git a/PolyPilot.Tests/ProcessingWatchdogTests.cs b/PolyPilot.Tests/ProcessingWatchdogTests.cs index 2b7488deed..eba3ee46c9 100644 --- a/PolyPilot.Tests/ProcessingWatchdogTests.cs +++ b/PolyPilot.Tests/ProcessingWatchdogTests.cs @@ -2242,4 +2242,64 @@ private static string GetRepoRoot() dir = dir.Parent; return dir?.FullName ?? throw new InvalidOperationException("Could not find repo root"); } + + // ===== ExternalToolRequestedEvent in EventMatrix ===== + + [Fact] + public void ExternalToolRequestedEvent_IsInEventMatrix() + { + // ExternalToolRequestedEvent was arriving as "Unhandled" in logs, causing spam + // and incorrectly updating LastEventAtTicks. It must be explicitly classified. + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + Assert.True(source.Contains("ExternalToolRequestedEvent"), + "ExternalToolRequestedEvent must be explicitly listed in SdkEventMatrix to prevent 'Unhandled' log spam"); + } + + [Fact] + public void ExternalToolRequestedEvent_ClassifiedAsTimelineOnly() + { + // Must be TimelineOnly — it doesn't need chat projection but should not suppress + // LastEventAtTicks updates (it does represent live activity on the session). + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + var idx = source.IndexOf("ExternalToolRequestedEvent"); + Assert.True(idx >= 0); + var context = source.Substring(idx, 80); + Assert.True(context.Contains("TimelineOnly"), + "ExternalToolRequestedEvent must be classified as TimelineOnly"); + } + + // ===== Periodic mid-watchdog flush ===== + + [Fact] + public void WatchdogPeriodicFlush_PresenceInSource() + { + // The watchdog must flush CurrentResponse to History periodically so partial + // responses are visible even while IsProcessing=true (e.g., stuck mid-stream). + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync"); + // The flush must be inside the watchdog method + var watchdogBody = source.Substring(methodIdx, 6000); + Assert.True(watchdogBody.Contains("CurrentResponse.Length"), + "Watchdog must check CurrentResponse.Length for periodic flush"); + Assert.True(watchdogBody.Contains("periodic flush") || watchdogBody.Contains("FlushCurrentResponse"), + "Watchdog must call FlushCurrentResponse for periodic flush of accumulated content"); + } + + [Fact] + public void WatchdogPeriodicFlush_RunsBeforeTimeoutCheck() + { + // The periodic flush must run BEFORE the elapsed >= effectiveTimeout check, + // so content is visible even before the session is declared stuck. + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync"); + var flushIdx = source.IndexOf("periodic flush", methodIdx); + var elapsedCheckIdx = source.IndexOf("elapsed >= effectiveTimeout", methodIdx); + Assert.True(flushIdx > 0, "Periodic flush comment must exist in RunProcessingWatchdogAsync"); + Assert.True(flushIdx < elapsedCheckIdx, + "Periodic flush must appear BEFORE the elapsed >= effectiveTimeout kill check"); + } } \ No newline at end of file diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs index ed2bc1b1b0..9c2738588d 100644 --- a/PolyPilot/Services/CopilotService.Events.cs +++ b/PolyPilot/Services/CopilotService.Events.cs @@ -64,6 +64,9 @@ private enum EventVisibility ["SessionLifecycleEvent"] = EventVisibility.Ignore, ["HookStartEvent"] = EventVisibility.Ignore, ["HookEndEvent"] = EventVisibility.Ignore, + + // External tool requests — classified explicitly to avoid "Unhandled" log spam + ["ExternalToolRequestedEvent"] = EventVisibility.TimelineOnly, }; private static EventVisibility ClassifySessionEvent(SessionEvent evt) @@ -1502,6 +1505,23 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session // goes true and we fall through to the normal timeout tiers. var useResumeQuiescence = state.Info.IsResumed && !hasReceivedEvents && !hasActiveTool && !hasUsedTools; + // Periodic mid-watchdog flush: if content has accumulated in CurrentResponse + // for longer than the check interval without being moved to History, flush it now. + // This ensures partial responses are visible in the chat even while IsProcessing=true + // (e.g., if TurnEnd→Idle fallback hasn't fired yet, or streaming stalled mid-response). + if (elapsed >= WatchdogCheckIntervalSeconds) + { + InvokeOnUI(() => + { + if (state.CurrentResponse.Length > 0) + { + Debug($"[WATCHDOG] '{sessionName}' periodic flush — CurrentResponse has content after {elapsed:F0}s of inactivity"); + FlushCurrentResponse(state); + OnStateChanged?.Invoke(); + } + }); + } + var useToolTimeout = hasActiveTool || (state.Info.IsResumed && !useResumeQuiescence) || hasUsedTools || isMultiAgentSession; var effectiveTimeout = useResumeQuiescence ? WatchdogResumeQuiescenceTimeoutSeconds From 5863d414d4351eee47026ffddbf2ab14597adeff Mon Sep 17 00:00:00 2001 From: Shane Neuville Date: Mon, 9 Mar 2026 19:10:23 -0500 Subject: [PATCH 4/6] =?UTF-8?q?fix:=20address=20PR=20#332=20review=20feedb?= =?UTF-8?q?ack=20=E2=80=94=20race=20guard,=20Invoke=20clarity,=20test=20ro?= =?UTF-8?q?bustness?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - [HIGH] Add ProcessingGeneration guard to periodic watchdog flush: capture flushGen before InvokeOnUI and validate inside lambda, matching the same pattern used by Case B and Case C. Prevents stale watchdog tick from flushing new-turn content into old-turn history after abort+resend. - [MEDIUM] Switch TurnEnd fallback Task.Run closures from local Invoke() to class-level InvokeOnUI() for clarity. The local Invoke captures _syncContext via closure correctly, but InvokeOnUI makes the UI-thread dispatch intent explicit when reading cross-thread code. - [MEDIUM] Fix flaky timing in ToolFallback_CancelledByTurnStart_DoesNotFire: replace production 4000ms constant with 50ms accelerated delay, matching all other tests in the file. - [MEDIUM] Make watchdog source-parsing tests use dynamic method bounds: find end of RunProcessingWatchdogAsync by locating the next class member (_permissionRecoveryAttempts) instead of fixed 3500/8000 char substrings. - [MEDIUM] Add break; control-flow assertion to NoActiveTool test: verifies Case B exits via break before reaching Case C error path, catching accidental fallthrough regressions. - [MEDIUM] Add WatchdogPeriodicFlush_HasGenerationGuard test: verifies the generation guard is present in the flush block via source assertion. - [LOW] Add TODO comment near reflection helpers in ChatExperienceSafetyTests suggesting InternalsVisibleTo as a compile-time safety improvement. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PolyPilot.Tests/ChatExperienceSafetyTests.cs | 5 +++ PolyPilot.Tests/ProcessingWatchdogTests.cs | 37 +++++++++++++++++--- PolyPilot.Tests/TurnEndFallbackTests.cs | 2 +- PolyPilot/Services/CopilotService.Events.cs | 9 +++-- 4 files changed, 46 insertions(+), 7 deletions(-) diff --git a/PolyPilot.Tests/ChatExperienceSafetyTests.cs b/PolyPilot.Tests/ChatExperienceSafetyTests.cs index 9e701e9907..5f8deaac2b 100644 --- a/PolyPilot.Tests/ChatExperienceSafetyTests.cs +++ b/PolyPilot.Tests/ChatExperienceSafetyTests.cs @@ -42,6 +42,11 @@ public ChatExperienceSafetyTests() private CopilotService CreateService() => new CopilotService(_chatDb, _serverManager, _bridgeClient, _repoManager, _serviceProvider, _demoService); + // These helpers use string-based reflection to access private members. Renaming any private + // member silently breaks tests at runtime rather than compile time. + // TODO: Consider adding [assembly: InternalsVisibleTo("PolyPilot.Tests")] to PolyPilot.csproj + // for compile-time safety on internal members. + /// Gets the private SessionState object from CopilotService._sessions dictionary. private static object GetSessionState(CopilotService svc, string sessionName) { diff --git a/PolyPilot.Tests/ProcessingWatchdogTests.cs b/PolyPilot.Tests/ProcessingWatchdogTests.cs index eba3ee46c9..0c3e5fc742 100644 --- a/PolyPilot.Tests/ProcessingWatchdogTests.cs +++ b/PolyPilot.Tests/ProcessingWatchdogTests.cs @@ -2166,7 +2166,10 @@ public void WatchdogDecision_ActiveTool_ServerAlive_ShouldResetTimer() Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync"); var elapsedIdx = source.IndexOf("elapsed >= effectiveTimeout", methodIdx); - var block = source.Substring(elapsedIdx, 3500); + // Find the end of the method dynamically (next top-level member after RunProcessingWatchdogAsync) + var methodEndIdx = source.IndexOf(" private readonly ConcurrentDictionary", methodIdx); + if (methodEndIdx < 0) methodEndIdx = source.Length; + var block = source.Substring(elapsedIdx, methodEndIdx - elapsedIdx); // Must check hasActiveTool before deciding server liveness path Assert.True(block.Contains("hasActiveTool"), @@ -2197,7 +2200,9 @@ public void WatchdogDecision_NoActiveTool_ToolsWereUsed_ShouldCompleteCleanly() Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync"); var elapsedIdx = source.IndexOf("elapsed >= effectiveTimeout", methodIdx); - var block = source.Substring(elapsedIdx, 8000); + var methodEndIdx = source.IndexOf(" private readonly ConcurrentDictionary", methodIdx); + if (methodEndIdx < 0) methodEndIdx = source.Length; + var block = source.Substring(elapsedIdx, methodEndIdx - elapsedIdx); // Must have both conditions Assert.True(block.Contains("hasUsedTools"), @@ -2213,6 +2218,11 @@ public void WatchdogDecision_NoActiveTool_ToolsWereUsed_ShouldCompleteCleanly() var stuckMsgIdx = block.IndexOf("Session appears stuck"); Assert.True(stuckMsgIdx > completeResponseIdx, "The 'appears stuck' error message must come AFTER CompleteResponse, in a separate branch"); + + // Case B must exit the block (break/return) before falling through to Case C error path + var breakIdx = block.IndexOf("break;", completeResponseIdx); + Assert.True(breakIdx > 0 && breakIdx < stuckMsgIdx, + "Case B must have a 'break;' before Case C to prevent fallthrough to the error kill path"); } [Fact] @@ -2224,7 +2234,9 @@ public void WatchdogDecision_MaxTimeExceeded_AlwaysKills() Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync"); var elapsedIdx = source.IndexOf("elapsed >= effectiveTimeout", methodIdx); - var block = source.Substring(elapsedIdx, 8000); + var methodEndIdx = source.IndexOf(" private readonly ConcurrentDictionary", methodIdx); + if (methodEndIdx < 0) methodEndIdx = source.Length; + var block = source.Substring(elapsedIdx, methodEndIdx - elapsedIdx); // exceededMaxTime must gate the liveness/clean-complete paths var exceededIdx = block.IndexOf("exceededMaxTime"); @@ -2235,6 +2247,23 @@ public void WatchdogDecision_MaxTimeExceeded_AlwaysKills() "exceededMaxTime check must appear before the server liveness bypass — max time always kills"); } + [Fact] + public void WatchdogPeriodicFlush_HasGenerationGuard() + { + // The periodic flush must capture ProcessingGeneration before InvokeOnUI and + // validate it inside the lambda — preventing stale watchdog ticks from flushing + // new-turn content into old-turn history if the user aborts + resends. + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync"); + var flushCommentIdx = source.IndexOf("Periodic mid-watchdog flush", methodIdx); + Assert.True(flushCommentIdx > 0, "Periodic flush comment must exist in RunProcessingWatchdogAsync"); + // Capture 1000 chars around the flush block to verify generation guard + var flushBlock = source.Substring(flushCommentIdx, 1000); + Assert.True(flushBlock.Contains("ProcessingGeneration"), + "Periodic flush block must read ProcessingGeneration before InvokeOnUI (race condition guard)"); + } + private static string GetRepoRoot() { var dir = new DirectoryInfo(AppContext.BaseDirectory); @@ -2296,7 +2325,7 @@ public void WatchdogPeriodicFlush_RunsBeforeTimeoutCheck() var source = File.ReadAllText( Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync"); - var flushIdx = source.IndexOf("periodic flush", methodIdx); + var flushIdx = source.IndexOf("Periodic mid-watchdog flush", methodIdx); var elapsedCheckIdx = source.IndexOf("elapsed >= effectiveTimeout", methodIdx); Assert.True(flushIdx > 0, "Periodic flush comment must exist in RunProcessingWatchdogAsync"); Assert.True(flushIdx < elapsedCheckIdx, diff --git a/PolyPilot.Tests/TurnEndFallbackTests.cs b/PolyPilot.Tests/TurnEndFallbackTests.cs index 7664df2fa6..19c4169f16 100644 --- a/PolyPilot.Tests/TurnEndFallbackTests.cs +++ b/PolyPilot.Tests/TurnEndFallbackTests.cs @@ -172,7 +172,7 @@ public async Task ToolFallback_CancelledByTurnStart_DoesNotFire() { try { - await Task.Delay(CopilotService.TurnEndIdleFallbackMs, token); + await Task.Delay(50, token); // accelerated base delay (mirrors other tests in this file) if (token.IsCancellationRequested) return; await Task.Delay(100, token); // accelerated extended delay if (token.IsCancellationRequested) return; diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs index 9c2738588d..759231c4fc 100644 --- a/PolyPilot/Services/CopilotService.Events.cs +++ b/PolyPilot/Services/CopilotService.Events.cs @@ -499,11 +499,11 @@ void Invoke(Action action) return; } Debug($"[IDLE-FALLBACK] '{sessionName}' SessionIdleEvent not received {TurnEndIdleFallbackMs + TurnEndIdleToolFallbackAdditionalMs}ms after TurnEnd (tools used) — firing CompleteResponse"); - Invoke(() => CompleteResponse(state, turnEndGen)); + InvokeOnUI(() => CompleteResponse(state, turnEndGen)); return; } Debug($"[IDLE-FALLBACK] '{sessionName}' SessionIdleEvent not received {TurnEndIdleFallbackMs}ms after TurnEnd — firing CompleteResponse"); - Invoke(() => CompleteResponse(state, turnEndGen)); + InvokeOnUI(() => CompleteResponse(state, turnEndGen)); } catch (OperationCanceledException) { /* expected on cancellation */ } catch (Exception ex) { Debug($"[IDLE-FALLBACK] '{sessionName}' unexpected error: {ex}"); } @@ -1511,8 +1511,13 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session // (e.g., if TurnEnd→Idle fallback hasn't fired yet, or streaming stalled mid-response). if (elapsed >= WatchdogCheckIntervalSeconds) { + // Capture generation before InvokeOnUI — if the user aborts + resends between + // this check and the UI dispatch, the generation changes and we must not flush + // new-turn content into the old turn's history. + var flushGen = Interlocked.Read(ref state.ProcessingGeneration); InvokeOnUI(() => { + if (Interlocked.Read(ref state.ProcessingGeneration) != flushGen) return; if (state.CurrentResponse.Length > 0) { Debug($"[WATCHDOG] '{sessionName}' periodic flush — CurrentResponse has content after {elapsed:F0}s of inactivity"); From 978b7529039f8b48df37351fbfc80c5543dd3a8a Mon Sep 17 00:00:00 2001 From: Shane Neuville Date: Mon, 9 Mar 2026 19:28:21 -0500 Subject: [PATCH 5/6] docs: update processing-state-safety skill with PR #332 learnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Bump header: 9 invariants → 13 invariants, 8 PRs → 9 PRs, 8 paths → 9 paths - Add 9th IsProcessing path: watchdog Case B (clean CompleteResponse on lost terminal event, no error message) — distinguishable from the kill path - Add INV-10: TurnEnd fallback must not be permanently suppressed by HasUsedToolsThisTurn; use ActiveToolCallCount guard + 30s extended delay - Add INV-11: Watchdog 3-way branch (Case A: liveness probe, Case B: clean complete, Case C: error kill) to distinguish active tools from lost events - Add INV-12: All background→UI dispatches must capture+validate ProcessingGeneration before/inside InvokeOnUI lambda - Add INV-13: Use class-level InvokeOnUI() not local Invoke() in Task.Run - Update Top 5 Recurring Mistakes: replace 'ActiveToolCallCount as sole signal' with the more impactful 'Suppressing TurnEnd fallback entirely' - Update regression chain: add #332 to the sequence - Add full PR #332 entry to regression-history.md with root causes, 3-way branch details, periodic flush, and session recovery improvement Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../skills/processing-state-safety/SKILL.md | 78 +++++++++++++++---- .../references/regression-history.md | 46 +++++++++++ 2 files changed, 111 insertions(+), 13 deletions(-) diff --git a/.claude/skills/processing-state-safety/SKILL.md b/.claude/skills/processing-state-safety/SKILL.md index b7a9e016e7..2861b2a042 100644 --- a/.claude/skills/processing-state-safety/SKILL.md +++ b/.claude/skills/processing-state-safety/SKILL.md @@ -10,8 +10,8 @@ description: > (6) Adding diagnostic log tags, (7) Modifying session restore paths (RestoreSingleSessionAsync) that must initialize watchdog-dependent state, (8) Modifying ReconcileOrganization or any code that reads Organization.Sessions - during the IsRestoring window. Covers: 9 invariants from 8 PRs of fix cycles, - the 8 code paths that clear IsProcessing, and common regression patterns. + during the IsRestoring window. Covers: 13 invariants from 10 PRs of fix cycles, + the 9 code paths that clear IsProcessing, and common regression patterns. --- # Processing State Safety @@ -32,18 +32,19 @@ Every code path that sets `IsProcessing = false` MUST also: 11. Add a diagnostic log entry (`[COMPLETE]`, `[ERROR]`, `[ABORT]`, etc.) 12. Run on UI thread (via `InvokeOnUI()` or already on UI thread) -## The 8 Paths That Clear IsProcessing +## The 9 Paths That Clear IsProcessing | # | Path | File | Thread | Notes | |---|------|------|--------|-------| | 1 | CompleteResponse | Events.cs | UI (via Invoke) | Normal completion | | 2 | SessionErrorEvent | Events.cs | Background → InvokeOnUI | SDK error | -| 3 | Watchdog timeout | Events.cs | Timer → InvokeOnUI | No events for 120s/600s | -| 4 | AbortSessionAsync (local) | CopilotService.cs | UI | User clicks Stop | -| 5 | AbortSessionAsync (remote) | CopilotService.cs | UI | Mobile stop | -| 6 | SendAsync reconnect failure | CopilotService.cs | UI | Prompt send failed after reconnect | -| 7 | SendAsync initial failure | CopilotService.cs | UI | Prompt send failed | -| 8 | Bridge OnTurnEnd | Bridge.cs | Background → InvokeOnUI | Remote mode turn complete | +| 3 | Watchdog timeout (kill) | Events.cs | Timer → InvokeOnUI | No events for 120s/600s, server dead, or max time exceeded (Case C) | +| 4 | Watchdog clean complete | Events.cs | Timer → InvokeOnUI | Tools done, lost terminal event → calls CompleteResponse (Case B, PR #332) | +| 5 | AbortSessionAsync (local) | CopilotService.cs | UI | User clicks Stop | +| 6 | AbortSessionAsync (remote) | CopilotService.cs | UI | Mobile stop | +| 7 | SendAsync reconnect failure | CopilotService.cs | UI | Prompt send failed after reconnect | +| 8 | SendAsync initial failure | CopilotService.cs | UI | Prompt send failed | +| 9 | Bridge OnTurnEnd | Bridge.cs | Background → InvokeOnUI | Remote mode turn complete | ## Content Persistence Safety @@ -61,7 +62,7 @@ that crash the app. The DB is a write-through cache; `events.jsonl` is the sourc and replays on session resume via `BulkInsertAsync`. DB write failures are self-healing. **NEVER narrow the ChatDatabase catch filters** — use `catch (Exception ex)` always. -## 9 Invariants +## 13 Invariants ### INV-1: Complete state cleanup Every IsProcessing=false path clears ALL fields. See checklist above. @@ -108,12 +109,60 @@ When `ReconcileOrganization` hasn't run yet (during `IsRestoring` window), during this window must call `ReconcileOrganization(allowPruning: false)` first. This additive mode safely adds missing entries without pruning loading sessions. +### INV-10: TurnEnd fallback must not be permanently suppressed (PR #332) +The `AssistantTurnEndEvent` 4s fallback → `CompleteResponse` guards against +premature firing during multi-tool sessions. **Do NOT** use `HasUsedToolsThisTurn` +to skip this fallback entirely — that permanently disables recovery for all +agent sessions and leaves them 100% dependent on `SessionIdleEvent`. If that +event is dropped (SDK bug #299), the session sticks for 600s. + +**Correct approach**: Use `ActiveToolCallCount > 0` to skip the 4s fallback +(tools are still running). If tools are done (`ActiveToolCallCount == 0`) but +`HasUsedToolsThisTurn` is true, use an extended 30s delay +(`TurnEndIdleToolFallbackAdditionalMs = 30_000`). The cancellation token from +`AssistantTurnStartEvent` is the correct mechanism to prevent premature firing +when the LLM does multi-round tool use. + +### INV-11: Watchdog must distinguish active tools from lost events (PR #332) +Blindly waiting the full 600s tool timeout when `ActiveToolCallCount == 0` +(tools finished) is wrong — the SDK may have silently dropped the terminal event +(`SessionIdleEvent`). The watchdog timeout path must use a 3-way branch: + +- **Case A** (`hasActiveTool && server alive`): Probe `_serverManager.IsServerRunning()` + (TCP port check). If alive → reset `LastEventAtTicks` and continue. If dead → fall through to kill. +- **Case B** (`!hasActiveTool && HasUsedToolsThisTurn && !exceededMaxTime`): Call + `CompleteResponse` cleanly (no error message) then `break`. Lost terminal event scenario. +- **Case C** (default): Kill with "⚠️ Session appears stuck" error message. Max time + exceeded, server dead, or something genuinely wrong. + +This prevents the "10-minute kill" where tools ran successfully but the session +was murdered because the SDK dropped the follow-up `SessionIdleEvent`. + +### INV-12: All background→UI dispatches must capture ProcessingGeneration (PR #332) +Any code that posts work to the UI thread from a background thread (watchdog loop, +`Task.Run`, timer callbacks) must: +1. Capture `var gen = Interlocked.Read(ref state.ProcessingGeneration)` **before** the `InvokeOnUI` call +2. Validate `if (Interlocked.Read(ref state.ProcessingGeneration) != gen) return;` **inside** the lambda + +Without this guard, a stale watchdog tick (racing with abort+resend) can flush +content from a new turn into the old turn's history. Every Case B and Case C +watchdog callback has this guard; the periodic flush callback must too. + +### INV-13: Use InvokeOnUI() (class method) in Task.Run closures (PR #332) +The local `Invoke(Action)` function inside `HandleSessionEvent` (declared at +line ~249) can have scoping ambiguity when captured by `Task.Run` closures. +Use the class-level `InvokeOnUI()` method in all `Task.Run` and timer callbacks +for explicit, unambiguous UI thread dispatch. The local `Invoke` works but the +intent is less clear when reading cross-threaded code. + ## Top 5 Recurring Mistakes 1. **Incomplete cleanup** — modifying one IsProcessing path without updating ALL fields that must be cleared simultaneously. -2. **ActiveToolCallCount as sole tool signal** — gets reset/skipped - in several paths; always check `HasUsedToolsThisTurn` too. +2. **Suppressing the TurnEnd fallback for tool sessions** — using `HasUsedToolsThisTurn` + to skip the fallback entirely leaves agent sessions with zero recovery when + `SessionIdleEvent` is dropped. Use `ActiveToolCallCount` to guard and an + extended delay for the tool-used case. (PR #332) 3. **Background thread mutations** — mutating IsProcessing or related state on SDK event threads instead of marshaling to UI thread. 4. **Missing content flush on turn boundaries** — `FlushCurrentResponse` @@ -128,7 +177,10 @@ This additive mode safely adds missing entries without pruning loading sessions. `IsMultiAgentSession` not being set during restore, causing the watchdog to use 120s instead of 600s for multi-agent workers. +**Retired mistake (was #2):** *ActiveToolCallCount as sole tool signal* — still relevant per +INV-5, but the more impactful version is #2 above (suppressing the fallback entirely). + ## Regression History -9 PRs of fix/regression cycles: #141 → #147 → #148 → #153 → #158 → #163 → #164 → #276 → #284. +10 PRs of fix/regression cycles: #141 → #147 → #148 → #153 → #158 → #163 → #164 → #276 → #284 → #332. See `references/regression-history.md` for the full timeline with root causes. diff --git a/.claude/skills/processing-state-safety/references/regression-history.md b/.claude/skills/processing-state-safety/references/regression-history.md index 3ac8d4057a..571d156cdb 100644 --- a/.claude/skills/processing-state-safety/references/regression-history.md +++ b/.claude/skills/processing-state-safety/references/regression-history.md @@ -70,3 +70,49 @@ - **Pattern**: This is a variant of mistake #5 ("Missing state initialization on session restore") — the restore path must independently initialize ALL state that `SendPromptAsync` initializes, because events/watchdog/dispatch all depend on it. + +## PR #332 — Rescue Stuck Chat Sessions (TurnEnd Fallback + Smart Watchdog) + +### Root Cause 1: TurnEnd fallback permanently disabled for tool-using sessions +- **Symptom**: Messages stop streaming mid-response (appear stuck, need "prod" to continue). + More precisely: `SessionIdleEvent` is lost (SDK bug #299) and nothing else fires `CompleteResponse`. +- **Root cause**: `HasUsedToolsThisTurn = true` caused the 4s `AssistantTurnEndEvent` fallback + to be **completely skipped** (not just delayed). For any session that ever used tools, the guard + permanently disabled recovery. The session then waited the full 600s watchdog timeout. +- **Fix**: Use `ActiveToolCallCount > 0` to skip the 4s fallback (tools genuinely running). + When `ActiveToolCallCount == 0` but `HasUsedToolsThisTurn == true`, wait an additional 30s + (`TurnEndIdleToolFallbackAdditionalMs = 30_000`) then fire `CompleteResponse`. Total wait: 34s + instead of 600s. The `AssistantTurnStartEvent` CTS cancels the fallback if the LLM starts a new round. + +### Root Cause 2: Watchdog couldn't distinguish active tools from lost SDK events +- **Symptom**: Sessions with long-running tools (builds, tests) killed after 600s even when + the tool finished successfully and the SDK dropped the terminal event. +- **Root cause**: Watchdog timeout path only had one behavior: kill with error. It couldn't tell + "tool still running" from "tool done, lost SessionIdleEvent." +- **Fix**: 3-way branch in `RunProcessingWatchdogAsync` at the timeout threshold: + - **Case A** (`hasActiveTool && !exceededMaxTime && !demo && !remote`): + Probe `_serverManager.IsServerRunning()` (TCP check). If alive → reset `LastEventAtTicks` + continue. + If dead → fall through to Case C (kill). + - **Case B** (`!hasActiveTool && HasUsedToolsThisTurn && !exceededMaxTime`): + Call `CompleteResponse(state, watchdogGen)` then `break`. Clean completion, no error message. + This is now **Path #4 that clears IsProcessing** (added to the table in SKILL.md). + - **Case C** (default): Kill with "⚠️ Session appears stuck" error (original behavior). + +### Additional fixes +- **Periodic watchdog flush** — Every 15s, if `CurrentResponse` has content, flush to History. + Ensures user sees streaming content even when all SDK events stop (midway through long tool). + Uses `ProcessingGeneration` guard (captured before `InvokeOnUI`, validated inside lambda) to + prevent stale ticks from flushing new-turn content into the old turn. +- **ExternalToolRequestedEvent** — Added to `SdkEventMatrix` as `TimelineOnly`. Was arriving as + "Unhandled" causing log spam without any functional impact. +- **InvokeOnUI() in TurnEnd fallback Task.Run** — Switched from local `Invoke()` function to + class-level `InvokeOnUI()` for unambiguous intent in cross-thread closures (INV-13). +- **44 behavioral safety tests** + 7 watchdog tests + 4 TurnEnd fallback tests = 55 new tests. + All use source-code assertions and reflection-based state inspection to verify invariants. + +### Key insight +These two bugs affected EVERY agent session because: (a) agents always use tools, so +`HasUsedToolsThisTurn` is always `true`, and (b) agent tasks frequently take >10 min (build, +test, research). The bugs compounded: fallback disabled → 600s watchdog is only hope → watchdog +kills after 600s → user loses the entire turn. PR #332 reduced worst-case stuck-session +recovery from 600s to 34s for lost-event scenarios. From e3ffd07cf75f3284703a49ba4269a63ea5a98f10 Mon Sep 17 00:00:00 2001 From: Shane Neuville Date: Mon, 9 Mar 2026 19:57:48 -0500 Subject: [PATCH 6/6] fix: multi-agent stuck sessions + AssistantTurnStartEvent diagnostics (PR #332 followup) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause analysis of the stuck orchestrator session (gen=28, 00:37:19-00:39:40): 1. AssistantTurnEndEvent arrived → 4s TurnEnd fallback started 2. AssistantTurnStartEvent arrived within 4s → CancelTurnEndFallback called SILENTLY (TurnStartEvent was not logged in [EVT], making it invisible in diagnostics) 3. SDK dropped terminal events for the follow-on sub-turn 4. isMultiAgentSession=true forced 600s watchdog timeout even with no tools 5. Session stuck until user clicked Stop (~2.5 min later) Fix 1 — Diagnostic: Add AssistantTurnStartEvent to [EVT] log filter The [EVT] filter previously only logged SessionIdleEvent, AssistantTurnEndEvent, and SessionErrorEvent. TurnStart events were invisible, making it impossible to see that a TurnStart cancelled the TurnEnd fallback. Now all critical lifecycle events are logged. Fix 2 — Behavioral: Remove isMultiAgentSession from useToolTimeout isMultiAgentSession alone no longer extends the watchdog to 600s. Workers doing text-heavy PR reviews stream delta events continuously, keeping elapsed small regardless of timeout. The 600s guard was protecting against a scenario that couldn't occur during active streaming. Pure reasoning/text sub-turns now use the 120s inactivity timeout. Fix 3 — Case B expansion: multi-agent + server alive → clean complete Case B (clean complete, no error message) now fires for multi-agent sessions when isMultiAgentSession=true and server is alive, not just when hasUsedTools=true. This prevents the error kill message when a session completed successfully but SDK dropped terminal events. Tests (3 new, 4 updated): - WatchdogTimeoutSelection_MultiAgent_NoTools_UsesInactivityTimeout (renamed+updated) - WatchdogDecision_MultiAgent_NoTools_CaseBIncludesMultiAgent_InSource (new source assert) - MultiAgent_NoTools_UseInactivityTimeout_NotToolTimeout (new formula test) - AssistantTurnStartEvent_LoggedInEvtDiagnostics_InSource (new source assert) - Regression_PR195_MultiAgentWorkers_DeltasKeepElapsedSmall (updated regression test) - ExhaustiveMatrix and NonResumed tests updated to reflect 120s for multi-agent no-tool Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PolyPilot.Tests/ProcessingWatchdogTests.cs | 102 +++++++++++++++++--- PolyPilot/Services/CopilotService.Events.cs | 31 ++++-- 2 files changed, 112 insertions(+), 21 deletions(-) diff --git a/PolyPilot.Tests/ProcessingWatchdogTests.cs b/PolyPilot.Tests/ProcessingWatchdogTests.cs index 0c3e5fc742..33db436a76 100644 --- a/PolyPilot.Tests/ProcessingWatchdogTests.cs +++ b/PolyPilot.Tests/ProcessingWatchdogTests.cs @@ -895,7 +895,10 @@ public void HasUsedToolsThisTurn_ResetByCompleteResponse() private static int ComputeEffectiveTimeout(bool hasActiveTool, bool isResumed, bool hasReceivedEvents, bool hasUsedTools, bool isMultiAgent = false) { var useResumeQuiescence = isResumed && !hasReceivedEvents && !hasActiveTool && !hasUsedTools; - var useToolTimeout = hasActiveTool || (isResumed && !useResumeQuiescence) || hasUsedTools || isMultiAgent; + // NOTE: isMultiAgent no longer extends the timeout (PR #332 fix). + // Workers actively streaming have tiny elapsed values (delta events reset the timer). + // Only tool execution silence (hasActiveTool or hasUsedTools) warrants 600s. + var useToolTimeout = hasActiveTool || (isResumed && !useResumeQuiescence) || hasUsedTools; return useResumeQuiescence ? CopilotService.WatchdogResumeQuiescenceTimeoutSeconds : useToolTimeout @@ -973,14 +976,18 @@ public void WatchdogTimeoutSelection_ResumedWithActiveTool_UsesToolTimeout() } [Fact] - public void WatchdogTimeoutSelection_MultiAgent_UsesToolTimeout() + public void WatchdogTimeoutSelection_MultiAgent_NoTools_UsesInactivityTimeout() { - // Multi-agent sessions use longer tool timeout even without tool activity + // Multi-agent sessions without tool use now use the 120s inactivity timeout. + // The old 600s blanket for isMultiAgent caused stuck-session UX bugs when the SDK + // dropped terminal events (sdk bug #299 variant): users waited up to 600s when + // workers that do active text generation (delta events flowing) are NOT at risk of + // the 120s timeout because deltas keep elapsed small. var effectiveTimeout = ComputeEffectiveTimeout( hasActiveTool: false, isResumed: false, hasReceivedEvents: false, hasUsedTools: false, isMultiAgent: true); - Assert.Equal(CopilotService.WatchdogToolExecutionTimeoutSeconds, effectiveTimeout); - Assert.Equal(600, effectiveTimeout); + Assert.Equal(CopilotService.WatchdogInactivityTimeoutSeconds, effectiveTimeout); + Assert.Equal(120, effectiveTimeout); } [Fact] @@ -1201,7 +1208,8 @@ public void ResumeQuiescence_NotResumed_NeverTriggersQuiescence() hasActiveTool: true, isResumed: false, hasReceivedEvents: false, hasUsedTools: false)); Assert.Equal(600, ComputeEffectiveTimeout( hasActiveTool: false, isResumed: false, hasReceivedEvents: false, hasUsedTools: true)); - Assert.Equal(600, ComputeEffectiveTimeout( + // Multi-agent without tools now uses 120s (not 600s) — see WatchdogTimeoutSelection_MultiAgent_NoTools_UsesInactivityTimeout + Assert.Equal(120, ComputeEffectiveTimeout( hasActiveTool: false, isResumed: false, hasReceivedEvents: false, hasUsedTools: false, isMultiAgent: true)); } @@ -1238,7 +1246,7 @@ public void ResumeQuiescence_TransitionsToInactivity_AfterIsResumedCleared() [InlineData(false, true, true, false, false, 600)] // Resumed, events: tool timeout [InlineData(true, true, false, false, false, 600)] // Resumed, active tool: tool timeout [InlineData(false, true, false, true, false, 600)] // Resumed, used tools: tool timeout - [InlineData(false, false, false, false, true, 600)] // Multi-agent: tool timeout + [InlineData(false, false, false, false, true, 120)] // Multi-agent no-tools: inactivity (PR #332 fix) [InlineData(false, true, false, false, true, 30)] // Resumed+multiAgent, no events: quiescence wins [InlineData(false, false, false, true, false, 600)] // HasUsedTools: tool timeout [InlineData(true, true, true, true, true, 600)] // All flags: tool timeout @@ -1817,14 +1825,22 @@ public void Regression_PR163_IsResumed_NotClearedDuringToolActivity() // --- PR #195 regression: multi-agent workers killed at 120s --- [Fact] - public void Regression_PR195_MultiAgentWorkers_Use600s() + public void Regression_PR195_MultiAgentWorkers_DeltasKeepElapsedSmall() { - // Multi-agent workers doing text-heavy tasks (PR reviews, no tools) - // were killed at 120s inactivity. Fix: isMultiAgent flag → 600s. + // PR #195 concern: multi-agent workers doing text-heavy tasks (PR reviews, no tools) + // were killed at 120s. The fix was: isMultiAgent → 600s. But that was wrong reasoning. + // + // The CORRECT insight (PR #332): workers generating text stream DELTA EVENTS continuously. + // Each delta resets LastEventAtTicks, keeping elapsed tiny. The 120s timeout cannot fire + // during active generation. It can only fire when the session goes SILENT, which means + // either stuck (good to clean up) or done with terminal events dropped (Case B). + // + // Multi-agent without tools → 120s timeout (inactivity). This is intentional. var timeout = ComputeEffectiveTimeout( hasActiveTool: false, isResumed: false, hasReceivedEvents: true, hasUsedTools: false, isMultiAgent: true); - Assert.Equal(CopilotService.WatchdogToolExecutionTimeoutSeconds, timeout); + Assert.Equal(CopilotService.WatchdogInactivityTimeoutSeconds, timeout); + Assert.Equal(120, timeout); } // --- PR #211 regression: quiescence must not kill active sessions --- @@ -2331,4 +2347,68 @@ public void WatchdogPeriodicFlush_RunsBeforeTimeoutCheck() Assert.True(flushIdx < elapsedCheckIdx, "Periodic flush must appear BEFORE the elapsed >= effectiveTimeout kill check"); } + + // ===== Multi-agent no-tool session stuck-session recovery (PR #332 fix) ===== + + [Fact] + public void WatchdogDecision_MultiAgent_NoTools_CaseBIncludesMultiAgent_InSource() + { + // Case B must handle multi-agent sessions without tool use. + // This covers the scenario where an orchestrator/worker receives AssistantTurnStartEvent + // (cancels the 4s TurnEnd fallback), then the SDK drops the terminal events for the + // follow-on sub-turn. Without this, the 120s watchdog fires Case C (error kill) instead + // of Case B (clean complete), showing an unnecessary error message. + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync"); + var endIdx = source.IndexOf(" private readonly ConcurrentDictionary", methodIdx); + var watchdogBody = source.Substring(methodIdx, endIdx - methodIdx); + + Assert.True(watchdogBody.Contains("isMultiAgentSession"), + "Case B must reference isMultiAgentSession to handle multi-agent no-tool sessions"); + // Find the inline "Case B:" comment that's directly in the else-if block + var caseBInlineIdx = watchdogBody.IndexOf("Case B:"); + Assert.True(caseBInlineIdx >= 0, "Inline 'Case B:' comment must exist in watchdog block"); + // Look back ~200 chars to find the else if condition containing isMultiAgentSession + var conditionStart = Math.Max(0, caseBInlineIdx - 200); + var conditionBlock = watchdogBody.Substring(conditionStart, 400); + Assert.True(conditionBlock.Contains("isMultiAgentSession"), + "Case B condition must include isMultiAgentSession for multi-agent no-tool recovery"); + } + + [Fact] + public void MultiAgent_NoTools_UseInactivityTimeout_NotToolTimeout() + { + // Multi-agent sessions without tool use must use 120s inactivity timeout, NOT 600s. + // Workers generating text stream deltas continuously — elapsed stays small during + // generation. The 120s timeout only fires when the session goes SILENT. + // When silent: either done with lost terminal event (Case B clean complete) or + // genuinely stuck (Case C error kill). + // This prevents the 600s stuck-session UX bug: sdk drops TurnEnd/Idle after + // TurnStart cancels the 4s fallback → user waits 600s instead of 120s. + var timeout = ComputeEffectiveTimeout( + hasActiveTool: false, isResumed: false, hasReceivedEvents: false, + hasUsedTools: false, isMultiAgent: true); + Assert.Equal(120, timeout); // Must be inactivity (120s), not tool timeout (600s) + Assert.True(timeout < CopilotService.WatchdogToolExecutionTimeoutSeconds, + "Multi-agent without tools must use shorter inactivity timeout (120s), not 600s"); + } + + [Fact] + public void AssistantTurnStartEvent_LoggedInEvtDiagnostics_InSource() + { + // AssistantTurnStartEvent MUST be included in the [EVT] log filter. + // Without this, when TurnStart cancels the TurnEnd fallback silently, + // event-diagnostics.log shows a gap with no explanation — making stuck-session + // forensics impossible (root cause of the PR #332 debug session bug). + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + var evtLogIdx = source.IndexOf("[EVT]"); + Assert.True(evtLogIdx >= 0, "[EVT] log must exist in event handler"); + // The EVT filter block must include AssistantTurnStartEvent + var filterContext = source.Substring(Math.Max(0, evtLogIdx - 200), 400); + Assert.True(filterContext.Contains("AssistantTurnStartEvent"), + "AssistantTurnStartEvent must be included in [EVT] logging so TurnStart " + + "is visible in diagnostics (prevents invisible fallback cancellations)"); + } } \ No newline at end of file diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs index 759231c4fc..a995a9a03c 100644 --- a/PolyPilot/Services/CopilotService.Events.cs +++ b/PolyPilot/Services/CopilotService.Events.cs @@ -225,8 +225,10 @@ private void HandleSessionEvent(SessionState state, SessionEvent evt) var sessionName = state.Info.Name; var isCurrentState = _sessions.TryGetValue(sessionName, out var current) && ReferenceEquals(current, state); - // Log critical lifecycle events and detect orphaned handlers - if (evt is SessionIdleEvent or AssistantTurnEndEvent or SessionErrorEvent) + // Log all critical lifecycle events, including TurnStart. TurnStart cancels the + // TurnEnd→Idle fallback; without logging it, stuck-session forensics cannot see the + // sub-turn boundary that caused the fallback to be silently cancelled. + if (evt is SessionIdleEvent or AssistantTurnEndEvent or SessionErrorEvent or AssistantTurnStartEvent) { Debug($"[EVT] '{sessionName}' received {evt.GetType().Name} " + $"(IsProcessing={state.Info.IsProcessing}, isCurrentState={isCurrentState}, " + @@ -1490,11 +1492,15 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session // have 2-3 min gaps between events while the model reasons), OR // 3. Tools have been executed this turn (HasUsedToolsThisTurn) — even between // tool rounds when ActiveToolCallCount is 0, the model may spend minutes - // thinking about what tool to call next, OR - // 4. Session is in a multi-agent group — workers doing text-heavy tasks - // (e.g., PR reviews) can take 2-4 min without tool calls. The orchestration - // loop has its own 10-minute CancelAfter timeout per worker. Cached at - // send time on UI thread to avoid cross-thread List access. + // thinking about what tool to call next. + // + // NOTE: isMultiAgentSession alone does NOT extend the timeout. + // Workers actively streaming text (e.g., PR reviews) generate delta events + // continuously, so elapsed stays small regardless of timeout tier. The 600s + // timeout is only needed for actual tool execution silence. Using 600s for + // pure reasoning sub-turns (no tools) caused stuck-session UX bugs when the + // SDK dropped terminal events (sdk bug #299 variant): user had to wait 600s + // instead of the 120s inactivity timeout. var isMultiAgentSession = Volatile.Read(ref state.IsMultiAgentSession); var hasReceivedEvents = Volatile.Read(ref state.HasReceivedEventsSinceResume); var hasUsedTools = Volatile.Read(ref state.HasUsedToolsThisTurn); @@ -1527,7 +1533,7 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session }); } - var useToolTimeout = hasActiveTool || (state.Info.IsResumed && !useResumeQuiescence) || hasUsedTools || isMultiAgentSession; + var useToolTimeout = hasActiveTool || (state.Info.IsResumed && !useResumeQuiescence) || hasUsedTools; var effectiveTimeout = useResumeQuiescence ? WatchdogResumeQuiescenceTimeoutSeconds : useToolTimeout @@ -1582,9 +1588,14 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session Debug($"[WATCHDOG] '{sessionName}' tool running but server is not responding — killing stuck session " + $"(elapsed={elapsed:F0}s, totalProcessing={totalProcessingSeconds:F0}s)"); } - else if (!hasActiveTool && hasUsedTools) + else if (!hasActiveTool && (hasUsedTools || (isMultiAgentSession && !IsDemoMode && !IsRemoteMode && _serverManager.IsServerRunning))) { - // Case B: tool finished, no active tools, SessionIdleEvent lost — complete cleanly + // Case B: tools finished (or multi-agent session with server alive) and + // terminal event was lost — complete cleanly without an error message. + // For tool-using sessions: tool finished, SessionIdleEvent lost (sdk bug #299). + // For multi-agent no-tool sessions: LLM finished generating, SDK dropped + // both AssistantTurnEndEvent and SessionIdleEvent. Server liveness check + // confirms the session completed normally, not a server crash. var watchdogGen = Interlocked.Read(ref state.ProcessingGeneration); Debug($"[WATCHDOG] '{sessionName}' tool finished but SessionIdleEvent never arrived — completing cleanly " + $"(elapsed={elapsed:F0}s, totalProcessing={totalProcessingSeconds:F0}s)");