diff --git a/PolyPilot.Tests/MultiAgentRegressionTests.cs b/PolyPilot.Tests/MultiAgentRegressionTests.cs index c25995f5c8..fe799684fb 100644 --- a/PolyPilot.Tests/MultiAgentRegressionTests.cs +++ b/PolyPilot.Tests/MultiAgentRegressionTests.cs @@ -1459,11 +1459,17 @@ public void ReconnectState_ShouldCarryIsMultiAgentSession() // 120s inactivity timeout instead of 600s, killing long-running worker tasks. var source = File.ReadAllText(Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.cs")); - // Find the reconnect block where HasUsedToolsThisTurn is carried forward - var reconnectBlock = source.Substring(source.IndexOf("newState.HasUsedToolsThisTurn = state.HasUsedToolsThisTurn")); - // IsMultiAgentSession must be carried forward in the same block - // Use 400 chars to accommodate intermediate carry-forward lines (e.g. SuccessfulToolCountThisTurn) - Assert.Contains("newState.IsMultiAgentSession = state.IsMultiAgentSession", reconnectBlock.Substring(0, 400)); + // Find the reconnect block where state is replaced + var marker = "[RECONNECT] '{sessionName}' replacing state"; + var reconnectIdx = source.IndexOf(marker); + Assert.True(reconnectIdx >= 0, "Reconnect block must exist"); + var watchdogIdx = source.IndexOf("StartProcessingWatchdog(state, sessionName)", reconnectIdx); + Assert.True(watchdogIdx >= 0, "StartProcessingWatchdog must follow reconnect block"); + var reconnectBlock = source.Substring(reconnectIdx, watchdogIdx - reconnectIdx); + // IsMultiAgentSession must be carried forward (it's a property of the conversation, not the connection) + Assert.Contains("newState.IsMultiAgentSession = state.IsMultiAgentSession", reconnectBlock); + // But HasUsedToolsThisTurn must be RESET to false (it's connection-specific, not conversation-specific) + Assert.Contains("HasUsedToolsThisTurn = false", reconnectBlock); } [Fact] diff --git a/PolyPilot.Tests/ProcessingWatchdogTests.cs b/PolyPilot.Tests/ProcessingWatchdogTests.cs index 33db436a76..ec16c7ba7a 100644 --- a/PolyPilot.Tests/ProcessingWatchdogTests.cs +++ b/PolyPilot.Tests/ProcessingWatchdogTests.cs @@ -2411,4 +2411,180 @@ public void AssistantTurnStartEvent_LoggedInEvtDiagnostics_InSource() "AssistantTurnStartEvent must be included in [EVT] logging so TurnStart " + "is visible in diagnostics (prevents invisible fallback cancellations)"); } + + // =========================================================================== + // Regression tests for: stuck session due to watchdog Case A infinite reset + // Bug: When a tool is active (ActiveToolCallCount > 0) and the persistent + // server is alive but the specific session's JSON-RPC connection is dead, + // Case A resets LastEventAtTicks indefinitely. ProcessingStartedAt resets + // on each app restart, so the 60-minute max time safety net never fires. + // Fix: Cap consecutive Case A resets via WatchdogMaxToolAliveResets. + // =========================================================================== + + [Fact] + public void WatchdogMaxToolAliveResets_IsReasonable() + { + // Must allow at least 1 reset (legitimate long tool execution), + // but not infinite. Cap at a reasonable number so stuck sessions + // are killed in 30-60 minutes even with server alive. + Assert.InRange(CopilotService.WatchdogMaxToolAliveResets, 1, 10); + } + + [Fact] + public void WatchdogMaxToolAliveResets_BoundsMaxStuckTime() + { + // The reset counter is incremented BEFORE the comparison: + // var resets = Interlocked.Increment(ref state.WatchdogCaseAResets); + // if (resets > WatchdogMaxToolAliveResets) { /* fall through */ } + // So the cap fires on the (N+1)th trigger. Actual max stuck time + // = (WatchdogMaxToolAliveResets + 1) × tool timeout (one initial + // timeout to enter the block, then N resets before exceeding the cap). + // This must be less than WatchdogMaxProcessingTimeSeconds (3600s) + // so the reset cap fires before the absolute max (which may be + // defeated by ProcessingStartedAt resetting on app restart). + var maxCaseAStuckSeconds = (CopilotService.WatchdogMaxToolAliveResets + 1) + * CopilotService.WatchdogToolExecutionTimeoutSeconds; + Assert.True(maxCaseAStuckSeconds < CopilotService.WatchdogMaxProcessingTimeSeconds, + $"Case A max stuck time ({maxCaseAStuckSeconds}s) must be less than " + + $"absolute max ({CopilotService.WatchdogMaxProcessingTimeSeconds}s). " + + "The reset cap is the PRIMARY safety net since the absolute max resets on app restart."); + } + + [Fact] + public void CaseA_ExceedingMaxResets_FallsThroughToKill_InSource() + { + // Verify the watchdog's Case A checks WatchdogMaxToolAliveResets and falls + // through when exceeded. This is the core fix for the stuck-session bug where + // a dead session's tool appears active but no events ever arrive. + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync"); + Assert.True(methodIdx >= 0, "RunProcessingWatchdogAsync must exist"); + var endIdx = source.IndexOf(" private readonly ConcurrentDictionary", methodIdx); + var watchdogBody = source.Substring(methodIdx, endIdx - methodIdx); + + // Case A must reference WatchdogMaxToolAliveResets + Assert.True(watchdogBody.Contains("WatchdogMaxToolAliveResets"), + "Case A must check WatchdogMaxToolAliveResets to cap consecutive resets"); + // Case A must increment WatchdogCaseAResets + Assert.True(watchdogBody.Contains("WatchdogCaseAResets"), + "Case A must track reset count via state.WatchdogCaseAResets"); + } + + [Fact] + public void CaseA_ResetCounter_ClearedOnRealEvents_InSource() + { + // When real SDK events arrive (not usage/metrics), the Case A reset counter + // must be cleared. This proves the session's connection is alive, so future + // Case A resets should be fresh (not counting against the cap). + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + var handlerIdx = source.IndexOf("private void HandleSessionEvent"); + Assert.True(handlerIdx >= 0, "HandleSessionEvent must exist"); + // Find the block that resets LastEventAtTicks (only for real events) + var lastEventResetIdx = source.IndexOf("LastEventAtTicks", handlerIdx); + Assert.True(lastEventResetIdx >= 0, "HandleSessionEvent must update LastEventAtTicks"); + // The counter reset must be near the LastEventAtTicks reset (within ~300 chars) + var nearbyBlock = source.Substring(lastEventResetIdx, Math.Min(300, source.Length - lastEventResetIdx)); + Assert.True(nearbyBlock.Contains("WatchdogCaseAResets"), + "WatchdogCaseAResets must be reset near LastEventAtTicks in HandleSessionEvent " + + "to clear the counter when real SDK events prove the connection is alive"); + } + + [Fact] + public void CaseA_ResetCounter_ClearedOnWatchdogStart_InSource() + { + // StartProcessingWatchdog must reset WatchdogCaseAResets to 0 so each new + // watchdog instance starts with a clean reset counter. + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + var startIdx = source.IndexOf("private void StartProcessingWatchdog"); + Assert.True(startIdx >= 0, "StartProcessingWatchdog must exist"); + var methodEnd = source.IndexOf("_ = RunProcessingWatchdogAsync", startIdx); + var methodBody = source.Substring(startIdx, methodEnd - startIdx); + Assert.True(methodBody.Contains("WatchdogCaseAResets"), + "StartProcessingWatchdog must reset WatchdogCaseAResets to 0"); + } + + [Fact] + public void ExceededMaxTime_TrueWhenProcessingStartedAtNull_InSource() + { + // Defensive: if ProcessingStartedAt is null while IsProcessing is true, + // exceededMaxTime must be true. Without this, Case A can reset forever + // because totalProcessingSeconds=0 makes exceededMaxTime always false. + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync"); + Assert.True(methodIdx >= 0, "RunProcessingWatchdogAsync must exist"); + var endIdx = source.IndexOf(" private readonly ConcurrentDictionary", methodIdx); + var watchdogBody = source.Substring(methodIdx, endIdx - methodIdx); + + // The exceededMaxTime calculation must handle null ProcessingStartedAt + Assert.True(watchdogBody.Contains("!startedAt.HasValue"), + "exceededMaxTime must be true when ProcessingStartedAt is null " + + "(defensive guard against Case A infinite reset)"); + } + + // =========================================================================== + // Regression tests for: reconnect path inherits stale tool state + // Bug: After reconnect, HasUsedToolsThisTurn=true from the dead connection + // inflates the watchdog timeout from 120s to 600s. ProcessingStartedAt is + // not reset, so the max-time safety net measures from the original send. + // Fix: Reset tool tracking and ProcessingStartedAt in the reconnect block. + // =========================================================================== + + [Fact] + public void ReconnectPath_ResetsHasUsedToolsThisTurn_InSource() + { + // After reconnect, HasUsedToolsThisTurn must be false so the new connection + // uses the 120s inactivity timeout, not the 600s tool timeout inherited from + // the dead connection. Without this, reconnected sessions wait 5x longer. + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.cs")); + var reconnectIdx = source.IndexOf("[RECONNECT] '{sessionName}' replacing state"); + Assert.True(reconnectIdx >= 0, "Reconnect block must exist"); + // Find StartProcessingWatchdog in the reconnect block (marks the end of state setup) + var watchdogIdx = source.IndexOf("StartProcessingWatchdog(state, sessionName)", reconnectIdx); + Assert.True(watchdogIdx >= 0, "StartProcessingWatchdog must be called in reconnect block"); + var reconnectBlock = source.Substring(reconnectIdx, watchdogIdx - reconnectIdx); + + // HasUsedToolsThisTurn must be set to false (not carried from old state) + Assert.True(reconnectBlock.Contains("HasUsedToolsThisTurn = false"), + "Reconnect block must reset HasUsedToolsThisTurn to false for the new connection. " + + "Carrying over true from the dead connection inflates the timeout from 120s to 600s."); + } + + [Fact] + public void ReconnectPath_ResetsProcessingStartedAt_InSource() + { + // After reconnect, ProcessingStartedAt must be reset to DateTime.UtcNow + // so the watchdog's max-time safety net measures from the reconnect time, + // not the original send time. + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.cs")); + var reconnectIdx = source.IndexOf("[RECONNECT] '{sessionName}' replacing state"); + Assert.True(reconnectIdx >= 0, "Reconnect block must exist"); + var watchdogIdx = source.IndexOf("StartProcessingWatchdog(state, sessionName)", reconnectIdx); + var reconnectBlock = source.Substring(reconnectIdx, watchdogIdx - reconnectIdx); + + Assert.True(reconnectBlock.Contains("ProcessingStartedAt = DateTime.UtcNow"), + "Reconnect block must reset ProcessingStartedAt to DateTime.UtcNow. " + + "Without this, the 60-min max-time safety net measures from the original send."); + } + + [Fact] + public void ReconnectPath_ResetsActiveToolCallCount_InSource() + { + // After reconnect, ActiveToolCallCount must be 0. No tools have started + // on the new connection. A stale count > 0 would trigger Case A resets. + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.cs")); + var reconnectIdx = source.IndexOf("[RECONNECT] '{sessionName}' replacing state"); + Assert.True(reconnectIdx >= 0, "Reconnect block must exist"); + var watchdogIdx = source.IndexOf("StartProcessingWatchdog(state, sessionName)", reconnectIdx); + var reconnectBlock = source.Substring(reconnectIdx, watchdogIdx - reconnectIdx); + + Assert.True(reconnectBlock.Contains("ActiveToolCallCount") && reconnectBlock.Contains("0"), + "Reconnect block must reset ActiveToolCallCount to 0 for the new connection."); + } } \ No newline at end of file diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs index ec292752b5..f6ef17aa56 100644 --- a/PolyPilot/Services/CopilotService.Events.cs +++ b/PolyPilot/Services/CopilotService.Events.cs @@ -220,6 +220,9 @@ private void HandleSessionEvent(SessionState state, SessionEvent evt) if (evt is not SessionUsageInfoEvent and not AssistantUsageEvent) { Interlocked.Exchange(ref state.LastEventAtTicks, DateTime.UtcNow.Ticks); + // Real event arrived — reset the Case A reset counter. This proves the session's + // JSON-RPC connection is alive, so future Case A resets are legitimate. + Interlocked.Exchange(ref state.WatchdogCaseAResets, 0); state.Info.LastUpdatedAt = DateTime.Now; } var sessionName = state.Info.Name; @@ -1399,6 +1402,15 @@ private void HandleReflectionAdvanceResult(SessionState state, string response, /// non-progress events (like repeated SessionUsageInfoEvent) keep arriving without a terminal event. internal const int WatchdogMaxProcessingTimeSeconds = 3600; // 60 minutes + /// Maximum number of consecutive Case A resets (tool active + server alive) before + /// the watchdog assumes the session's JSON-RPC connection is dead and kills it anyway. + /// The persistent server may still be alive serving other sessions while this specific + /// session's transport-level connection is broken (ConnectionLostException). Without this cap, + /// Case A resets LastEventAtTicks indefinitely, and ProcessingStartedAt resets on each app + /// restart — so neither the inactivity nor the max-time safety net ever fires. + /// (3+1) resets × 600s effective timeout ≈ 40 minutes max of Case A resets. + internal const int WatchdogMaxToolAliveResets = 3; + /// /// Milliseconds to wait after AssistantTurnEndEvent before firing CompleteResponse /// as a fallback, in case SessionIdleEvent never arrives (SDK bug #299). @@ -1458,6 +1470,7 @@ private void StartProcessingWatchdog(SessionState state, string sessionName) // timeout to fire on the first watchdog check for any file > ~15s old. // This is the exact regression pattern from PR #148 (short timeout killing active sessions). Interlocked.Exchange(ref state.LastEventAtTicks, DateTime.UtcNow.Ticks); + Interlocked.Exchange(ref state.WatchdogCaseAResets, 0); state.ProcessingWatchdog = new CancellationTokenSource(); var ct = state.ProcessingWatchdog.Token; _ = RunProcessingWatchdogAsync(state, sessionName, ct); @@ -1559,7 +1572,11 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session if (elapsed >= effectiveTimeout) { - var exceededMaxTime = totalProcessingSeconds >= WatchdogMaxProcessingTimeSeconds; + // Defensive: if ProcessingStartedAt is null while IsProcessing is true, + // something is already wrong — treat as exceeded max time so Case A + // can't reset the timer indefinitely. + var exceededMaxTime = !startedAt.HasValue + || totalProcessingSeconds >= WatchdogMaxProcessingTimeSeconds; // Before killing, check what state we're actually in: // @@ -1586,13 +1603,28 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session var serverAlive = _serverManager.IsServerRunning; if (serverAlive) { - Debug($"[WATCHDOG] '{sessionName}' {elapsed:F0}s inactivity but tool is running and server is alive — resetting timer " + - $"(timeout={effectiveTimeout}s, totalProcessing={totalProcessingSeconds:F0}s)"); - Interlocked.Exchange(ref state.LastEventAtTicks, DateTime.UtcNow.Ticks); - continue; // keep waiting — don't kill + var resets = Interlocked.Increment(ref state.WatchdogCaseAResets); + if (resets > WatchdogMaxToolAliveResets) + { + // Too many consecutive resets with no real SDK events — the + // session's JSON-RPC connection is likely dead even though the + // shared persistent server is still alive. Fall through to kill. + Debug($"[WATCHDOG] '{sessionName}' Case A reset cap exceeded ({resets}/{WatchdogMaxToolAliveResets}) " + + $"— killing despite server alive (elapsed={elapsed:F0}s, totalProcessing={totalProcessingSeconds:F0}s)"); + } + else + { + Debug($"[WATCHDOG] '{sessionName}' {elapsed:F0}s inactivity but tool is running and server is alive — resetting timer " + + $"(reset #{resets}/{WatchdogMaxToolAliveResets}, timeout={effectiveTimeout}s, totalProcessing={totalProcessingSeconds:F0}s)"); + Interlocked.Exchange(ref state.LastEventAtTicks, DateTime.UtcNow.Ticks); + continue; // keep waiting — don't kill + } + } + else + { + Debug($"[WATCHDOG] '{sessionName}' tool running but server is not responding — killing stuck session " + + $"(elapsed={elapsed:F0}s, totalProcessing={totalProcessingSeconds:F0}s)"); } - Debug($"[WATCHDOG] '{sessionName}' tool running but server is not responding — killing stuck session " + - $"(elapsed={elapsed:F0}s, totalProcessing={totalProcessingSeconds:F0}s)"); } else if (!hasActiveTool && (hasUsedTools || (isMultiAgentSession && !IsDemoMode && !IsRemoteMode && _serverManager.IsServerRunning))) { diff --git a/PolyPilot/Services/CopilotService.cs b/PolyPilot/Services/CopilotService.cs index a3ca55bfd8..1ed3b24b80 100644 --- a/PolyPilot/Services/CopilotService.cs +++ b/PolyPilot/Services/CopilotService.cs @@ -408,6 +408,11 @@ private class SessionState /// Cleared on CompleteResponse and SendPromptAsync. /// public ConcurrentDictionary PendingReasoningMessages { get; } = new(); + /// Number of consecutive times the watchdog's Case A (tool active + server alive) + /// has reset the inactivity timer without any real SDK events arriving. Capped by + /// WatchdogMaxToolAliveResets to prevent infinite resets when the session's JSON-RPC + /// connection is dead but the shared persistent server is still alive. + public int WatchdogCaseAResets; } private void Debug(string message) @@ -2600,8 +2605,14 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis // orphaned old state can't pass generation checks on the new state. Interlocked.Exchange(ref newState.ProcessingGeneration, Interlocked.Read(ref state.ProcessingGeneration)); - newState.HasUsedToolsThisTurn = state.HasUsedToolsThisTurn; - Interlocked.Exchange(ref newState.SuccessfulToolCountThisTurn, Volatile.Read(ref state.SuccessfulToolCountThisTurn)); + // Reset tool tracking for the NEW connection. The old connection's + // tool state is stale — no tools have run on this connection yet. + // Without this, HasUsedToolsThisTurn=true from the dead connection + // inflates the watchdog timeout from 120s to 600s, making stuck + // sessions wait 5x longer than necessary to recover. + newState.HasUsedToolsThisTurn = false; + Interlocked.Exchange(ref newState.ActiveToolCallCount, 0); + Interlocked.Exchange(ref newState.SuccessfulToolCountThisTurn, 0); newState.IsMultiAgentSession = state.IsMultiAgentSession; newSession.On(evt => HandleSessionEvent(newState, evt)); _sessions[sessionName] = newState; @@ -2613,6 +2624,11 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis // replayed IDLE from clearing IsProcessing before the retry completes. Interlocked.Increment(ref state.ProcessingGeneration); state.Info.IsProcessing = true; + // Reset ProcessingStartedAt so the watchdog's max-time safety net + // measures from the reconnect, not the original send. Without this, + // the 60-min absolute max is measured from the first attempt and + // the reconnected session inherits a stale deadline. + state.Info.ProcessingStartedAt = DateTime.UtcNow; state.CurrentResponse.Clear(); state.FlushedResponse.Clear(); state.PendingReasoningMessages.Clear();