From 0d4590b5518c269d92e581ee2cf6f33f771e4675 Mon Sep 17 00:00:00 2001 From: Shane Neuville Date: Tue, 10 Mar 2026 11:25:26 -0500 Subject: [PATCH 1/4] Fix stuck sessions: cap watchdog Case A resets for dead connections When a session's JSON-RPC connection dies mid-tool-execution (ConnectionLostException), the SDK never emits SessionErrorEvent so IsProcessing stays true. The watchdog's Case A logic (tool active + server alive) kept resetting LastEventAtTicks indefinitely because the persistent server is shared and stays alive serving other sessions. Combined with ProcessingStartedAt resetting to DateTime.UtcNow on every app restart, the 60-minute absolute max safety net never fires. Fix: - Add WatchdogCaseAResets counter to SessionState, capped at 3 resets (WatchdogMaxToolAliveResets). After 3 consecutive resets (~30 min) with no real SDK events, assume the session connection is dead and fall through to the timeout handler. - Reset the counter when real SDK events arrive (HandleSessionEvent), proving the connection is alive. - Add defensive ProcessingStartedAt null guard: if null while IsProcessing=true, treat as exceededMaxTime so Case A cannot loop forever. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PolyPilot.Tests/ProcessingWatchdogTests.cs | 108 ++++++++++++++++++++ PolyPilot/Services/CopilotService.Events.cs | 39 ++++++- PolyPilot/Services/CopilotService.cs | 5 + 3 files changed, 147 insertions(+), 5 deletions(-) diff --git a/PolyPilot.Tests/ProcessingWatchdogTests.cs b/PolyPilot.Tests/ProcessingWatchdogTests.cs index 33db436a76..f53c6851ab 100644 --- a/PolyPilot.Tests/ProcessingWatchdogTests.cs +++ b/PolyPilot.Tests/ProcessingWatchdogTests.cs @@ -2411,4 +2411,112 @@ public void AssistantTurnStartEvent_LoggedInEvtDiagnostics_InSource() "AssistantTurnStartEvent must be included in [EVT] logging so TurnStart " + "is visible in diagnostics (prevents invisible fallback cancellations)"); } + + // =========================================================================== + // Regression tests for: stuck session due to watchdog Case A infinite reset + // Bug: When a tool is active (ActiveToolCallCount > 0) and the persistent + // server is alive but the specific session's JSON-RPC connection is dead, + // Case A resets LastEventAtTicks indefinitely. ProcessingStartedAt resets + // on each app restart, so the 60-minute max time safety net never fires. + // Fix: Cap consecutive Case A resets via WatchdogMaxToolAliveResets. + // =========================================================================== + + [Fact] + public void WatchdogMaxToolAliveResets_IsReasonable() + { + // Must allow at least 1 reset (legitimate long tool execution), + // but not infinite. Cap at a reasonable number so stuck sessions + // are killed in 30-60 minutes even with server alive. + Assert.InRange(CopilotService.WatchdogMaxToolAliveResets, 1, 10); + } + + [Fact] + public void WatchdogMaxToolAliveResets_BoundsMaxStuckTime() + { + // Total stuck time from Case A resets = resets × tool timeout. + // This must be less than WatchdogMaxProcessingTimeSeconds (3600s) + // so the reset cap fires before the absolute max (which may be + // defeated by ProcessingStartedAt resetting on app restart). + var maxCaseAStuckSeconds = CopilotService.WatchdogMaxToolAliveResets + * CopilotService.WatchdogToolExecutionTimeoutSeconds; + Assert.True(maxCaseAStuckSeconds < CopilotService.WatchdogMaxProcessingTimeSeconds, + $"Case A max stuck time ({maxCaseAStuckSeconds}s) must be less than " + + $"absolute max ({CopilotService.WatchdogMaxProcessingTimeSeconds}s). " + + "The reset cap is the PRIMARY safety net since the absolute max resets on app restart."); + } + + [Fact] + public void CaseA_ExceedingMaxResets_FallsThroughToKill_InSource() + { + // Verify the watchdog's Case A checks WatchdogMaxToolAliveResets and falls + // through when exceeded. This is the core fix for the stuck-session bug where + // a dead session's tool appears active but no events ever arrive. + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync"); + Assert.True(methodIdx >= 0, "RunProcessingWatchdogAsync must exist"); + var endIdx = source.IndexOf(" private readonly ConcurrentDictionary", methodIdx); + var watchdogBody = source.Substring(methodIdx, endIdx - methodIdx); + + // Case A must reference WatchdogMaxToolAliveResets + Assert.True(watchdogBody.Contains("WatchdogMaxToolAliveResets"), + "Case A must check WatchdogMaxToolAliveResets to cap consecutive resets"); + // Case A must increment WatchdogCaseAResets + Assert.True(watchdogBody.Contains("WatchdogCaseAResets"), + "Case A must track reset count via state.WatchdogCaseAResets"); + } + + [Fact] + public void CaseA_ResetCounter_ClearedOnRealEvents_InSource() + { + // When real SDK events arrive (not usage/metrics), the Case A reset counter + // must be cleared. This proves the session's connection is alive, so future + // Case A resets should be fresh (not counting against the cap). + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + var handlerIdx = source.IndexOf("private void HandleSessionEvent"); + Assert.True(handlerIdx >= 0, "HandleSessionEvent must exist"); + // Find the block that resets LastEventAtTicks (only for real events) + var lastEventResetIdx = source.IndexOf("LastEventAtTicks", handlerIdx); + Assert.True(lastEventResetIdx >= 0, "HandleSessionEvent must update LastEventAtTicks"); + // The counter reset must be near the LastEventAtTicks reset (within ~300 chars) + var nearbyBlock = source.Substring(lastEventResetIdx, Math.Min(300, source.Length - lastEventResetIdx)); + Assert.True(nearbyBlock.Contains("WatchdogCaseAResets"), + "WatchdogCaseAResets must be reset near LastEventAtTicks in HandleSessionEvent " + + "to clear the counter when real SDK events prove the connection is alive"); + } + + [Fact] + public void CaseA_ResetCounter_ClearedOnWatchdogStart_InSource() + { + // StartProcessingWatchdog must reset WatchdogCaseAResets to 0 so each new + // watchdog instance starts with a clean reset counter. + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + var startIdx = source.IndexOf("private void StartProcessingWatchdog"); + Assert.True(startIdx >= 0, "StartProcessingWatchdog must exist"); + var methodEnd = source.IndexOf("_ = RunProcessingWatchdogAsync", startIdx); + var methodBody = source.Substring(startIdx, methodEnd - startIdx); + Assert.True(methodBody.Contains("WatchdogCaseAResets"), + "StartProcessingWatchdog must reset WatchdogCaseAResets to 0"); + } + + [Fact] + public void ExceededMaxTime_TrueWhenProcessingStartedAtNull_InSource() + { + // Defensive: if ProcessingStartedAt is null while IsProcessing is true, + // exceededMaxTime must be true. Without this, Case A can reset forever + // because totalProcessingSeconds=0 makes exceededMaxTime always false. + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs")); + var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync"); + Assert.True(methodIdx >= 0, "RunProcessingWatchdogAsync must exist"); + var endIdx = source.IndexOf(" private readonly ConcurrentDictionary", methodIdx); + var watchdogBody = source.Substring(methodIdx, endIdx - methodIdx); + + // The exceededMaxTime calculation must handle null ProcessingStartedAt + Assert.True(watchdogBody.Contains("!startedAt.HasValue"), + "exceededMaxTime must be true when ProcessingStartedAt is null " + + "(defensive guard against Case A infinite reset)"); + } } \ No newline at end of file diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs index ec292752b5..a120b91a2f 100644 --- a/PolyPilot/Services/CopilotService.Events.cs +++ b/PolyPilot/Services/CopilotService.Events.cs @@ -220,6 +220,9 @@ private void HandleSessionEvent(SessionState state, SessionEvent evt) if (evt is not SessionUsageInfoEvent and not AssistantUsageEvent) { Interlocked.Exchange(ref state.LastEventAtTicks, DateTime.UtcNow.Ticks); + // Real event arrived — reset the Case A reset counter. This proves the session's + // JSON-RPC connection is alive, so future Case A resets are legitimate. + Interlocked.Exchange(ref state.WatchdogCaseAResets, 0); state.Info.LastUpdatedAt = DateTime.Now; } var sessionName = state.Info.Name; @@ -1399,6 +1402,15 @@ private void HandleReflectionAdvanceResult(SessionState state, string response, /// non-progress events (like repeated SessionUsageInfoEvent) keep arriving without a terminal event. internal const int WatchdogMaxProcessingTimeSeconds = 3600; // 60 minutes + /// Maximum number of consecutive Case A resets (tool active + server alive) before + /// the watchdog assumes the session's JSON-RPC connection is dead and kills it anyway. + /// The persistent server may still be alive serving other sessions while this specific + /// session's transport-level connection is broken (ConnectionLostException). Without this cap, + /// Case A resets LastEventAtTicks indefinitely, and ProcessingStartedAt resets on each app + /// restart — so neither the inactivity nor the max-time safety net ever fires. + /// 3 resets × 600s effective timeout ≈ 30 minutes max of Case A resets. + internal const int WatchdogMaxToolAliveResets = 3; + /// /// Milliseconds to wait after AssistantTurnEndEvent before firing CompleteResponse /// as a fallback, in case SessionIdleEvent never arrives (SDK bug #299). @@ -1458,6 +1470,7 @@ private void StartProcessingWatchdog(SessionState state, string sessionName) // timeout to fire on the first watchdog check for any file > ~15s old. // This is the exact regression pattern from PR #148 (short timeout killing active sessions). Interlocked.Exchange(ref state.LastEventAtTicks, DateTime.UtcNow.Ticks); + Interlocked.Exchange(ref state.WatchdogCaseAResets, 0); state.ProcessingWatchdog = new CancellationTokenSource(); var ct = state.ProcessingWatchdog.Token; _ = RunProcessingWatchdogAsync(state, sessionName, ct); @@ -1559,7 +1572,11 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session if (elapsed >= effectiveTimeout) { - var exceededMaxTime = totalProcessingSeconds >= WatchdogMaxProcessingTimeSeconds; + // Defensive: if ProcessingStartedAt is null while IsProcessing is true, + // something is already wrong — treat as exceeded max time so Case A + // can't reset the timer indefinitely. + var exceededMaxTime = !startedAt.HasValue + || totalProcessingSeconds >= WatchdogMaxProcessingTimeSeconds; // Before killing, check what state we're actually in: // @@ -1586,10 +1603,22 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session var serverAlive = _serverManager.IsServerRunning; if (serverAlive) { - Debug($"[WATCHDOG] '{sessionName}' {elapsed:F0}s inactivity but tool is running and server is alive — resetting timer " + - $"(timeout={effectiveTimeout}s, totalProcessing={totalProcessingSeconds:F0}s)"); - Interlocked.Exchange(ref state.LastEventAtTicks, DateTime.UtcNow.Ticks); - continue; // keep waiting — don't kill + var resets = Interlocked.Increment(ref state.WatchdogCaseAResets); + if (resets > WatchdogMaxToolAliveResets) + { + // Too many consecutive resets with no real SDK events — the + // session's JSON-RPC connection is likely dead even though the + // shared persistent server is still alive. Fall through to kill. + Debug($"[WATCHDOG] '{sessionName}' Case A reset count {resets} exceeded max {WatchdogMaxToolAliveResets} " + + $"— session connection likely dead (elapsed={elapsed:F0}s, totalProcessing={totalProcessingSeconds:F0}s)"); + } + else + { + Debug($"[WATCHDOG] '{sessionName}' {elapsed:F0}s inactivity but tool is running and server is alive — resetting timer " + + $"(reset #{resets}/{WatchdogMaxToolAliveResets}, timeout={effectiveTimeout}s, totalProcessing={totalProcessingSeconds:F0}s)"); + Interlocked.Exchange(ref state.LastEventAtTicks, DateTime.UtcNow.Ticks); + continue; // keep waiting — don't kill + } } Debug($"[WATCHDOG] '{sessionName}' tool running but server is not responding — killing stuck session " + $"(elapsed={elapsed:F0}s, totalProcessing={totalProcessingSeconds:F0}s)"); diff --git a/PolyPilot/Services/CopilotService.cs b/PolyPilot/Services/CopilotService.cs index a3ca55bfd8..b943028a9f 100644 --- a/PolyPilot/Services/CopilotService.cs +++ b/PolyPilot/Services/CopilotService.cs @@ -408,6 +408,11 @@ private class SessionState /// Cleared on CompleteResponse and SendPromptAsync. /// public ConcurrentDictionary PendingReasoningMessages { get; } = new(); + /// Number of consecutive times the watchdog's Case A (tool active + server alive) + /// has reset the inactivity timer without any real SDK events arriving. Capped by + /// WatchdogMaxToolAliveResets to prevent infinite resets when the session's JSON-RPC + /// connection is dead but the shared persistent server is still alive. + public int WatchdogCaseAResets; } private void Debug(string message) From a9748748f2f74782c9740f53212d40389f1fbd17 Mon Sep 17 00:00:00 2001 From: Shane Neuville Date: Tue, 10 Mar 2026 11:50:07 -0500 Subject: [PATCH 2/4] Fix off-by-one in WatchdogMaxToolAliveResets_BoundsMaxStuckTime test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reset counter is incremented before the comparison (Increment returns new value), so the cap fires on the (N+1)th trigger. Actual max stuck time is (WatchdogMaxToolAliveResets + 1) * WatchdogToolExecutionTimeoutSeconds. Without this fix, bumping WatchdogMaxToolAliveResets to 5 would pass the test (5*600=3000 < 3600) while actual max is 6*600=3600 — equal to the absolute max, providing zero protection. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PolyPilot.Tests/ProcessingWatchdogTests.cs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/PolyPilot.Tests/ProcessingWatchdogTests.cs b/PolyPilot.Tests/ProcessingWatchdogTests.cs index f53c6851ab..e3d3337926 100644 --- a/PolyPilot.Tests/ProcessingWatchdogTests.cs +++ b/PolyPilot.Tests/ProcessingWatchdogTests.cs @@ -2433,11 +2433,16 @@ public void WatchdogMaxToolAliveResets_IsReasonable() [Fact] public void WatchdogMaxToolAliveResets_BoundsMaxStuckTime() { - // Total stuck time from Case A resets = resets × tool timeout. + // The reset counter is incremented BEFORE the comparison: + // var resets = Interlocked.Increment(ref state.WatchdogCaseAResets); + // if (resets > WatchdogMaxToolAliveResets) { /* fall through */ } + // So the cap fires on the (N+1)th trigger. Actual max stuck time + // = (WatchdogMaxToolAliveResets + 1) × tool timeout (one initial + // timeout to enter the block, then N resets before exceeding the cap). // This must be less than WatchdogMaxProcessingTimeSeconds (3600s) // so the reset cap fires before the absolute max (which may be // defeated by ProcessingStartedAt resetting on app restart). - var maxCaseAStuckSeconds = CopilotService.WatchdogMaxToolAliveResets + var maxCaseAStuckSeconds = (CopilotService.WatchdogMaxToolAliveResets + 1) * CopilotService.WatchdogToolExecutionTimeoutSeconds; Assert.True(maxCaseAStuckSeconds < CopilotService.WatchdogMaxProcessingTimeSeconds, $"Case A max stuck time ({maxCaseAStuckSeconds}s) must be less than " + From fbcee030f56614c513fb2098f0490afb32fc7485 Mon Sep 17 00:00:00 2001 From: Shane Neuville Date: Tue, 10 Mar 2026 12:17:01 -0500 Subject: [PATCH 3/4] Fix reconnect path inheriting stale tool state from dead connection After a reconnect in SendPromptAsync, the new SessionState was inheriting HasUsedToolsThisTurn=true from the dead connection, inflating the watchdog timeout from 120s (inactivity) to 600s (tool execution). Additionally, ProcessingStartedAt was not reset, so the 60-min max-time safety net measured from the original send time rather than reconnect time. Changes: - Reset HasUsedToolsThisTurn=false in reconnect block (connection-specific) - Reset ActiveToolCallCount=0 and SuccessfulToolCountThisTurn=0 - Reset ProcessingStartedAt=DateTime.UtcNow for fresh max-time deadline - Add 3 source-verification tests for reconnect state resets - Update MultiAgentRegressionTests to match new reconnect behavior Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PolyPilot.Tests/MultiAgentRegressionTests.cs | 16 +++-- PolyPilot.Tests/ProcessingWatchdogTests.cs | 63 ++++++++++++++++++++ PolyPilot/Services/CopilotService.cs | 15 ++++- 3 files changed, 87 insertions(+), 7 deletions(-) diff --git a/PolyPilot.Tests/MultiAgentRegressionTests.cs b/PolyPilot.Tests/MultiAgentRegressionTests.cs index c25995f5c8..fe799684fb 100644 --- a/PolyPilot.Tests/MultiAgentRegressionTests.cs +++ b/PolyPilot.Tests/MultiAgentRegressionTests.cs @@ -1459,11 +1459,17 @@ public void ReconnectState_ShouldCarryIsMultiAgentSession() // 120s inactivity timeout instead of 600s, killing long-running worker tasks. var source = File.ReadAllText(Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.cs")); - // Find the reconnect block where HasUsedToolsThisTurn is carried forward - var reconnectBlock = source.Substring(source.IndexOf("newState.HasUsedToolsThisTurn = state.HasUsedToolsThisTurn")); - // IsMultiAgentSession must be carried forward in the same block - // Use 400 chars to accommodate intermediate carry-forward lines (e.g. SuccessfulToolCountThisTurn) - Assert.Contains("newState.IsMultiAgentSession = state.IsMultiAgentSession", reconnectBlock.Substring(0, 400)); + // Find the reconnect block where state is replaced + var marker = "[RECONNECT] '{sessionName}' replacing state"; + var reconnectIdx = source.IndexOf(marker); + Assert.True(reconnectIdx >= 0, "Reconnect block must exist"); + var watchdogIdx = source.IndexOf("StartProcessingWatchdog(state, sessionName)", reconnectIdx); + Assert.True(watchdogIdx >= 0, "StartProcessingWatchdog must follow reconnect block"); + var reconnectBlock = source.Substring(reconnectIdx, watchdogIdx - reconnectIdx); + // IsMultiAgentSession must be carried forward (it's a property of the conversation, not the connection) + Assert.Contains("newState.IsMultiAgentSession = state.IsMultiAgentSession", reconnectBlock); + // But HasUsedToolsThisTurn must be RESET to false (it's connection-specific, not conversation-specific) + Assert.Contains("HasUsedToolsThisTurn = false", reconnectBlock); } [Fact] diff --git a/PolyPilot.Tests/ProcessingWatchdogTests.cs b/PolyPilot.Tests/ProcessingWatchdogTests.cs index e3d3337926..ec16c7ba7a 100644 --- a/PolyPilot.Tests/ProcessingWatchdogTests.cs +++ b/PolyPilot.Tests/ProcessingWatchdogTests.cs @@ -2524,4 +2524,67 @@ public void ExceededMaxTime_TrueWhenProcessingStartedAtNull_InSource() "exceededMaxTime must be true when ProcessingStartedAt is null " + "(defensive guard against Case A infinite reset)"); } + + // =========================================================================== + // Regression tests for: reconnect path inherits stale tool state + // Bug: After reconnect, HasUsedToolsThisTurn=true from the dead connection + // inflates the watchdog timeout from 120s to 600s. ProcessingStartedAt is + // not reset, so the max-time safety net measures from the original send. + // Fix: Reset tool tracking and ProcessingStartedAt in the reconnect block. + // =========================================================================== + + [Fact] + public void ReconnectPath_ResetsHasUsedToolsThisTurn_InSource() + { + // After reconnect, HasUsedToolsThisTurn must be false so the new connection + // uses the 120s inactivity timeout, not the 600s tool timeout inherited from + // the dead connection. Without this, reconnected sessions wait 5x longer. + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.cs")); + var reconnectIdx = source.IndexOf("[RECONNECT] '{sessionName}' replacing state"); + Assert.True(reconnectIdx >= 0, "Reconnect block must exist"); + // Find StartProcessingWatchdog in the reconnect block (marks the end of state setup) + var watchdogIdx = source.IndexOf("StartProcessingWatchdog(state, sessionName)", reconnectIdx); + Assert.True(watchdogIdx >= 0, "StartProcessingWatchdog must be called in reconnect block"); + var reconnectBlock = source.Substring(reconnectIdx, watchdogIdx - reconnectIdx); + + // HasUsedToolsThisTurn must be set to false (not carried from old state) + Assert.True(reconnectBlock.Contains("HasUsedToolsThisTurn = false"), + "Reconnect block must reset HasUsedToolsThisTurn to false for the new connection. " + + "Carrying over true from the dead connection inflates the timeout from 120s to 600s."); + } + + [Fact] + public void ReconnectPath_ResetsProcessingStartedAt_InSource() + { + // After reconnect, ProcessingStartedAt must be reset to DateTime.UtcNow + // so the watchdog's max-time safety net measures from the reconnect time, + // not the original send time. + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.cs")); + var reconnectIdx = source.IndexOf("[RECONNECT] '{sessionName}' replacing state"); + Assert.True(reconnectIdx >= 0, "Reconnect block must exist"); + var watchdogIdx = source.IndexOf("StartProcessingWatchdog(state, sessionName)", reconnectIdx); + var reconnectBlock = source.Substring(reconnectIdx, watchdogIdx - reconnectIdx); + + Assert.True(reconnectBlock.Contains("ProcessingStartedAt = DateTime.UtcNow"), + "Reconnect block must reset ProcessingStartedAt to DateTime.UtcNow. " + + "Without this, the 60-min max-time safety net measures from the original send."); + } + + [Fact] + public void ReconnectPath_ResetsActiveToolCallCount_InSource() + { + // After reconnect, ActiveToolCallCount must be 0. No tools have started + // on the new connection. A stale count > 0 would trigger Case A resets. + var source = File.ReadAllText( + Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.cs")); + var reconnectIdx = source.IndexOf("[RECONNECT] '{sessionName}' replacing state"); + Assert.True(reconnectIdx >= 0, "Reconnect block must exist"); + var watchdogIdx = source.IndexOf("StartProcessingWatchdog(state, sessionName)", reconnectIdx); + var reconnectBlock = source.Substring(reconnectIdx, watchdogIdx - reconnectIdx); + + Assert.True(reconnectBlock.Contains("ActiveToolCallCount") && reconnectBlock.Contains("0"), + "Reconnect block must reset ActiveToolCallCount to 0 for the new connection."); + } } \ No newline at end of file diff --git a/PolyPilot/Services/CopilotService.cs b/PolyPilot/Services/CopilotService.cs index b943028a9f..1ed3b24b80 100644 --- a/PolyPilot/Services/CopilotService.cs +++ b/PolyPilot/Services/CopilotService.cs @@ -2605,8 +2605,14 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis // orphaned old state can't pass generation checks on the new state. Interlocked.Exchange(ref newState.ProcessingGeneration, Interlocked.Read(ref state.ProcessingGeneration)); - newState.HasUsedToolsThisTurn = state.HasUsedToolsThisTurn; - Interlocked.Exchange(ref newState.SuccessfulToolCountThisTurn, Volatile.Read(ref state.SuccessfulToolCountThisTurn)); + // Reset tool tracking for the NEW connection. The old connection's + // tool state is stale — no tools have run on this connection yet. + // Without this, HasUsedToolsThisTurn=true from the dead connection + // inflates the watchdog timeout from 120s to 600s, making stuck + // sessions wait 5x longer than necessary to recover. + newState.HasUsedToolsThisTurn = false; + Interlocked.Exchange(ref newState.ActiveToolCallCount, 0); + Interlocked.Exchange(ref newState.SuccessfulToolCountThisTurn, 0); newState.IsMultiAgentSession = state.IsMultiAgentSession; newSession.On(evt => HandleSessionEvent(newState, evt)); _sessions[sessionName] = newState; @@ -2618,6 +2624,11 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis // replayed IDLE from clearing IsProcessing before the retry completes. Interlocked.Increment(ref state.ProcessingGeneration); state.Info.IsProcessing = true; + // Reset ProcessingStartedAt so the watchdog's max-time safety net + // measures from the reconnect, not the original send. Without this, + // the 60-min absolute max is measured from the first attempt and + // the reconnected session inherits a stale deadline. + state.Info.ProcessingStartedAt = DateTime.UtcNow; state.CurrentResponse.Clear(); state.FlushedResponse.Clear(); state.PendingReasoningMessages.Clear(); From 9bd79887d860c9ec82727b27d599fbc6f670d23b Mon Sep 17 00:00:00 2001 From: Shane Neuville Date: Tue, 10 Mar 2026 14:46:58 -0500 Subject: [PATCH 4/4] Fix misleading watchdog log and doc comment When Case A reset cap is exceeded, the log now accurately says reset cap exceeded instead of falling through to server not responding. Also fix doc comment from 30 to 40 minutes. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PolyPilot/Services/CopilotService.Events.cs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs index a120b91a2f..f6ef17aa56 100644 --- a/PolyPilot/Services/CopilotService.Events.cs +++ b/PolyPilot/Services/CopilotService.Events.cs @@ -1408,7 +1408,7 @@ private void HandleReflectionAdvanceResult(SessionState state, string response, /// session's transport-level connection is broken (ConnectionLostException). Without this cap, /// Case A resets LastEventAtTicks indefinitely, and ProcessingStartedAt resets on each app /// restart — so neither the inactivity nor the max-time safety net ever fires. - /// 3 resets × 600s effective timeout ≈ 30 minutes max of Case A resets. + /// (3+1) resets × 600s effective timeout ≈ 40 minutes max of Case A resets. internal const int WatchdogMaxToolAliveResets = 3; /// @@ -1609,8 +1609,8 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session // Too many consecutive resets with no real SDK events — the // session's JSON-RPC connection is likely dead even though the // shared persistent server is still alive. Fall through to kill. - Debug($"[WATCHDOG] '{sessionName}' Case A reset count {resets} exceeded max {WatchdogMaxToolAliveResets} " + - $"— session connection likely dead (elapsed={elapsed:F0}s, totalProcessing={totalProcessingSeconds:F0}s)"); + Debug($"[WATCHDOG] '{sessionName}' Case A reset cap exceeded ({resets}/{WatchdogMaxToolAliveResets}) " + + $"— killing despite server alive (elapsed={elapsed:F0}s, totalProcessing={totalProcessingSeconds:F0}s)"); } else { @@ -1620,8 +1620,11 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session continue; // keep waiting — don't kill } } - Debug($"[WATCHDOG] '{sessionName}' tool running but server is not responding — killing stuck session " + - $"(elapsed={elapsed:F0}s, totalProcessing={totalProcessingSeconds:F0}s)"); + else + { + Debug($"[WATCHDOG] '{sessionName}' tool running but server is not responding — killing stuck session " + + $"(elapsed={elapsed:F0}s, totalProcessing={totalProcessingSeconds:F0}s)"); + } } else if (!hasActiveTool && (hasUsedTools || (isMultiAgentSession && !IsDemoMode && !IsRemoteMode && _serverManager.IsServerRunning))) {