From a1b592afda411959ba5494be6af27842a38d5680 Mon Sep 17 00:00:00 2001
From: Shane
Date: Tue, 10 Mar 2026 15:11:20 -0500
Subject: [PATCH] Fix watchdog stuck in 'tool running + server alive' state
 indefinitely

When the JSON-RPC connection dies mid-tool-execution but the server process
stays alive, ActiveToolCallCount stays > 0 (zombie) because
ToolExecutionCompleteEvent never arrives. The watchdog's TCP port check
passes (server alive), so it keeps resetting the inactivity timer forever.

Fix: Add a StaleToolLivenessChecks counter that increments each time the
watchdog resets the timer in the 'tool running + server alive' state. If
this counter exceeds WatchdogMaxStaleToolLivenessChecks (4 checks = ~60s)
without any real events arriving to reset it, assume the JSON-RPC connection
is dead and proceed to recovery instead of waiting indefinitely.

The counter is reset to 0:
- When any real SDK event arrives (proves the connection is alive)
- On new prompt send (fresh turn)
- On abort/complete response (turn ended)
- On error handling paths

Includes regression tests verifying:
- The constant exists and is reasonable (2-10 range)
- The field exists in SessionState
- The counter resets when real events arrive
- The counter increments in the server-alive case
- Recovery is triggered when the limit is exceeded
- The counter is reset in all state cleanup paths

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 PolyPilot.Tests/ProcessingWatchdogTests.cs  | 175 ++++++++++++++++++++
 PolyPilot/Services/CopilotService.Events.cs |  40 ++++-
 PolyPilot/Services/CopilotService.cs        |  11 ++
 3 files changed, 222 insertions(+), 4 deletions(-)
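Reviewer note (this sits below the diffstat and above the first "diff --git",
so git am ignores it): a minimal standalone sketch of the decision logic this
patch adds, for reasoning about it outside the full watchdog loop. The names
StaleToolLivenessChecks and WatchdogMaxStaleToolLivenessChecks mirror the
patch; the SessionState shape, the ShouldKeepWaiting/OnRealEvent helpers, and
the 15s-interval framing are simplified stand-ins, not the real service code.

    using System.Threading;

    static class StaleToolWatchdogSketch
    {
        // Mirrors the patch: ~60 seconds of grace at 15s watchdog intervals.
        internal const int WatchdogMaxStaleToolLivenessChecks = 4;

        internal sealed class SessionState
        {
            public int ActiveToolCallCount;      // > 0 while a tool "runs"
            public int StaleToolLivenessChecks;  // consecutive stale resets
        }

        // Hypothetical helper: called on a watchdog tick after the inactivity
        // timeout has elapsed. Returns true to reset the timer and keep
        // waiting, false to proceed to recovery.
        internal static bool ShouldKeepWaiting(SessionState state, bool serverAlive)
        {
            if (Volatile.Read(ref state.ActiveToolCallCount) == 0 || !serverAlive)
                return false; // no running tool, or server dead: recover now

            // A tool appears to be running and the server port answers. Count
            // how often we land here without a real event resetting the counter.
            var staleChecks = Interlocked.Increment(ref state.StaleToolLivenessChecks);
            if (staleChecks > WatchdogMaxStaleToolLivenessChecks)
                return false; // likely a zombie ActiveToolCallCount: recover

            return true; // below the limit: trust the tool, reset the timer
        }

        // Hypothetical helper: called when any real SDK event arrives; a live
        // event proves the JSON-RPC connection works, so the grace window
        // starts over.
        internal static void OnRealEvent(SessionState state) =>
            Interlocked.Exchange(ref state.StaleToolLivenessChecks, 0);
    }

A counter (rather than a second timestamp) keeps the grace window coupled to
actual watchdog passes: each increment corresponds to exactly one timer
reset, so the limit directly bounds how many resets a silent connection can
earn before recovery.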
diff --git a/PolyPilot.Tests/ProcessingWatchdogTests.cs b/PolyPilot.Tests/ProcessingWatchdogTests.cs
index 33db436a76..975d8edf28 100644
--- a/PolyPilot.Tests/ProcessingWatchdogTests.cs
+++ b/PolyPilot.Tests/ProcessingWatchdogTests.cs
@@ -2411,4 +2411,179 @@ public void AssistantTurnStartEvent_LoggedInEvtDiagnostics_InSource()
             "AssistantTurnStartEvent must be included in [EVT] logging so TurnStart " +
             "is visible in diagnostics (prevents invisible fallback cancellations)");
     }
+
+    // ===== Stale tool liveness check (fix for zombie ActiveToolCallCount) =====
+
+    [Fact]
+    public void WatchdogMaxStaleToolLivenessChecks_Constant_Exists()
+    {
+        // The constant must exist and be reasonable — at least 2 checks (30s) to avoid
+        // false positives, at most 10 checks (150s) to ensure recovery in reasonable time.
+        Assert.InRange(CopilotService.WatchdogMaxStaleToolLivenessChecks, 2, 10);
+    }
+
+    [Fact]
+    public void StaleToolLivenessChecks_ExistsInSessionState()
+    {
+        // StaleToolLivenessChecks must exist in SessionState to track consecutive
+        // "tool running + server alive" resets without any real events.
+        var source = File.ReadAllText(
+            Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.cs"));
+        Assert.True(source.Contains("StaleToolLivenessChecks"),
+            "StaleToolLivenessChecks field must exist in SessionState to track zombie tool detection");
+    }
+
+    [Fact]
+    public void StaleToolLivenessChecks_ResetWhenRealEventArrives()
+    {
+        // When a real SDK event arrives (not SessionUsageInfoEvent or AssistantUsageEvent),
+        // StaleToolLivenessChecks must be reset to 0 to prove the connection is alive.
+        var source = File.ReadAllText(
+            Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs"));
+        // Find the event handler where LastEventAtTicks is updated
+        var lastEventTicksIdx = source.IndexOf("LastEventAtTicks, DateTime.UtcNow.Ticks");
+        Assert.True(lastEventTicksIdx >= 0, "LastEventAtTicks update must exist in event handler");
+        // StaleToolLivenessChecks must be reset near the same location (within 500 chars to account for the comment)
+        var contextStart = Math.Max(0, lastEventTicksIdx - 100);
+        var contextEnd = Math.Min(source.Length, lastEventTicksIdx + 500);
+        var context = source.Substring(contextStart, contextEnd - contextStart);
+        Assert.True(context.Contains("StaleToolLivenessChecks, 0"),
+            "StaleToolLivenessChecks must be reset to 0 when a real event arrives (proves connection alive)");
+    }
+
+    [Fact]
+    public void WatchdogDecision_ActiveTool_ServerAlive_IncrementsStaleCounter()
+    {
+        // When the watchdog sees "tool running + server alive", it must increment
+        // StaleToolLivenessChecks before deciding whether to reset the timer.
+        var source = File.ReadAllText(
+            Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs"));
+        var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync");
+        var elapsedIdx = source.IndexOf("elapsed >= effectiveTimeout", methodIdx);
+        var methodEndIdx = source.IndexOf(" private readonly ConcurrentDictionary", methodIdx);
+        if (methodEndIdx < 0) methodEndIdx = source.Length;
+        var block = source.Substring(elapsedIdx, methodEndIdx - elapsedIdx);
+
+        // Find the server alive check
+        var serverAliveIdx = block.IndexOf("serverAlive");
+        Assert.True(serverAliveIdx >= 0, "serverAlive check must exist in timeout block");
+
+        // After the server-alive check, the stale counter must be incremented before any continue
+        var afterServerAlive = block.Substring(serverAliveIdx);
+        var incrementIdx = afterServerAlive.IndexOf("Increment(ref state.StaleToolLivenessChecks)");
+        Assert.True(incrementIdx >= 0,
+            "Must increment StaleToolLivenessChecks when server is alive to detect zombie state");
+    }
+
+    [Fact]
+    public void WatchdogDecision_StaleCounterExceedsLimit_StopsResettingTimer()
+    {
+        // When StaleToolLivenessChecks exceeds WatchdogMaxStaleToolLivenessChecks,
+        // the watchdog must stop resetting the timer and proceed to recovery.
+        var source = File.ReadAllText(
+            Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs"));
+        var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync");
+        var elapsedIdx = source.IndexOf("elapsed >= effectiveTimeout", methodIdx);
+        var methodEndIdx = source.IndexOf(" private readonly ConcurrentDictionary", methodIdx);
+        if (methodEndIdx < 0) methodEndIdx = source.Length;
+        var block = source.Substring(elapsedIdx, methodEndIdx - elapsedIdx);
+
+        // Must compare stale counter to limit
+        Assert.True(block.Contains("WatchdogMaxStaleToolLivenessChecks"),
+            "Watchdog must compare stale counter to WatchdogMaxStaleToolLivenessChecks limit");
+
+        // When limit exceeded, must NOT continue — should fall through to recovery
+        var staleCheckIdx = block.IndexOf("staleChecks > WatchdogMaxStaleToolLivenessChecks");
+        Assert.True(staleCheckIdx >= 0,
+            "Watchdog must check if staleChecks exceeds limit to detect zombie connection");
+
+        // The 'continue' (timer reset) must be inside the else block, not after the stale check
+        var afterStaleCheck = block.Substring(staleCheckIdx);
+        var elseIdx = afterStaleCheck.IndexOf("else");
+        var continueIdx = afterStaleCheck.IndexOf("continue;");
+        Assert.True(elseIdx > 0 && continueIdx > elseIdx,
+            "The 'continue' must be in the else branch (only reset timer if below stale limit)");
+    }
+
+    [Fact]
+    public void StaleToolLivenessChecks_ResetOnSendPrompt()
+    {
+        // SendPromptAsync must reset StaleToolLivenessChecks along with other tool state.
+        var source = File.ReadAllText(
+            Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.cs"));
+        // Find SendPromptAsync by its signature (unique enough), then search its body for the reset
+        var sendPromptIdx = source.IndexOf("SendPromptAsync(string sessionName, string prompt");
+        Assert.True(sendPromptIdx >= 0, "SendPromptAsync must exist");
+        // Search within 8000 chars (method body) for the reset
+        var searchEnd = Math.Min(source.Length, sendPromptIdx + 8000);
+        var sendPromptBody = source.Substring(sendPromptIdx, searchEnd - sendPromptIdx);
+
+        // Must reset stale tool liveness checks
+        Assert.True(sendPromptBody.Contains("StaleToolLivenessChecks, 0"),
+            "SendPromptAsync must reset StaleToolLivenessChecks to 0 on new turn");
+    }
+
+    [Fact]
+    public void StaleToolLivenessChecks_ResetOnAbort()
+    {
+        // AbortSessionAsync must reset StaleToolLivenessChecks along with other tool state.
+        var source = File.ReadAllText(
+            Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.cs"));
+        var abortIdx = source.IndexOf("public async Task AbortSessionAsync");
+        Assert.True(abortIdx >= 0, "AbortSessionAsync must exist");
+        var abortEnd = source.IndexOf("public async Task SteerSessionAsync", abortIdx);
+        if (abortEnd < 0) abortEnd = source.Length;
+        var abortBody = source.Substring(abortIdx, abortEnd - abortIdx);
+
+        Assert.True(abortBody.Contains("StaleToolLivenessChecks, 0"),
+            "AbortSessionAsync must reset StaleToolLivenessChecks to 0");
+    }
+
+    [Fact]
+    public void StaleToolLivenessChecks_ResetOnCompleteResponse()
+    {
+        // CompleteResponse must reset StaleToolLivenessChecks along with other tool state.
+        var source = File.ReadAllText(
+            Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs"));
+        var completeIdx = source.IndexOf("private void CompleteResponse(SessionState state");
+        Assert.True(completeIdx >= 0, "CompleteResponse must exist");
+        // Search within 3000 chars for the body
+        var searchEnd = Math.Min(source.Length, completeIdx + 3000);
+        var completeBody = source.Substring(completeIdx, searchEnd - completeIdx);
+
+        Assert.True(completeBody.Contains("StaleToolLivenessChecks, 0"),
+            "CompleteResponse must reset StaleToolLivenessChecks to 0");
+    }
+
+    [Fact]
+    public void BugFix_ZombieActiveToolCount_RecoveredAfterStaleLimit()
+    {
+        // REGRESSION TEST for the bug where:
+        // 1. Tool starts → ActiveToolCallCount incremented
+        // 2. JSON-RPC connection dies → ToolExecutionCompleteEvent never arrives
+        // 3. ActiveToolCallCount stays > 0 (zombie)
+        // 4. Server TCP port check passes (server process alive)
+        // 5. Watchdog keeps resetting timer indefinitely
+        //
+        // Fix: After WatchdogMaxStaleToolLivenessChecks consecutive resets without
+        // any real events, stop trusting ActiveToolCallCount and recover.
+        var source = File.ReadAllText(
+            Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs"));
+
+        // The fix introduces StaleToolLivenessChecks
+        Assert.True(source.Contains("StaleToolLivenessChecks"),
+            "StaleToolLivenessChecks must exist for zombie ActiveToolCallCount detection");
+
+        // The fix must increment counter on each "server alive" reset
+        Assert.True(source.Contains("Increment(ref state.StaleToolLivenessChecks)"),
+            "Must increment counter when resetting timer in server-alive case");
+
+        // The fix must check against limit before resetting timer
+        Assert.True(source.Contains("staleChecks > WatchdogMaxStaleToolLivenessChecks"),
+            "Must check if counter exceeds limit to detect zombie state");
+
+        // The fix must reset counter when real events arrive (check whole file)
+        Assert.True(source.Contains("Exchange(ref state.StaleToolLivenessChecks, 0)"),
+            "Must reset counter when real events arrive (proves connection alive)");
+    }
 }
\ No newline at end of file
diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs
index 6c94fdb02e..ed8b894c82 100644
--- a/PolyPilot/Services/CopilotService.Events.cs
+++ b/PolyPilot/Services/CopilotService.Events.cs
@@ -221,6 +221,10 @@ private void HandleSessionEvent(SessionState state, SessionEvent evt)
         {
             Interlocked.Exchange(ref state.LastEventAtTicks, DateTime.UtcNow.Ticks);
             state.Info.LastUpdatedAt = DateTime.Now;
+            // Real event arrived — reset the stale tool liveness counter. This proves the
+            // JSON-RPC connection is alive, so previous "tool running + server alive" resets
+            // were legitimate, not zombie state from a dead connection.
+            Interlocked.Exchange(ref state.StaleToolLivenessChecks, 0);
         }
         var sessionName = state.Info.Name;
         var isCurrentState = _sessions.TryGetValue(sessionName, out var current) && ReferenceEquals(current, state);
@@ -445,6 +449,8 @@ void Invoke(Action action)
             var phaseAdvancedToThinking = state.Info.ProcessingPhase < 2;
             if (phaseAdvancedToThinking) state.Info.ProcessingPhase = 2; // Thinking
             Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
+            // Note: StaleToolLivenessChecks is reset by the real event handler (above) that
+            // updates LastEventAtTicks, so we don't need to reset it here explicitly.
             Invoke(() =>
             {
                 OnTurnStart?.Invoke(sessionName);
@@ -651,6 +657,7 @@ await notifService.SendNotificationAsync(
             Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
             state.HasUsedToolsThisTurn = false;
             Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
+            Interlocked.Exchange(ref state.StaleToolLivenessChecks, 0);
             InvokeOnUI(() =>
             {
                 OnError?.Invoke(sessionName, errMsg);
@@ -862,6 +869,7 @@ private void CompleteResponse(SessionState state, long? expectedGeneration = nul
         Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
         state.HasUsedToolsThisTurn = false;
         Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
+        Interlocked.Exchange(ref state.StaleToolLivenessChecks, 0);
         state.Info.IsResumed = false; // Clear after first successful turn
         var response = state.CurrentResponse.ToString();
         if (!string.IsNullOrWhiteSpace(response))
@@ -1398,6 +1406,12 @@ private void HandleReflectionAdvanceResult(SessionState state, string response,
     /// no single turn should run longer than this. This is a safety net for scenarios where
     /// non-progress events (like repeated SessionUsageInfoEvent) keep arriving without a terminal event.
     internal const int WatchdogMaxProcessingTimeSeconds = 3600; // 60 minutes
+    /// Maximum number of consecutive watchdog checks that can reset the timer
+    /// in the "tool running + server alive" state without any real events arriving.
+    /// After this limit, the JSON-RPC connection is assumed dead (even if the server TCP port is alive)
+    /// and the session should be recovered. This prevents indefinite stuck sessions when the
+    /// connection dies mid-tool-execution but the server process remains running.
+    internal const int WatchdogMaxStaleToolLivenessChecks = 4; // ~60 seconds at 15s intervals
 
     /// <summary>
     /// Milliseconds to wait after AssistantTurnEndEvent before firing CompleteResponse
@@ -1586,10 +1600,25 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session
                 var serverAlive = _serverManager.IsServerRunning;
                 if (serverAlive)
                 {
-                    Debug($"[WATCHDOG] '{sessionName}' {elapsed:F0}s inactivity but tool is running and server is alive — resetting timer " +
-                        $"(timeout={effectiveTimeout}s, totalProcessing={totalProcessingSeconds:F0}s)");
-                    Interlocked.Exchange(ref state.LastEventAtTicks, DateTime.UtcNow.Ticks);
-                    continue; // keep waiting — don't kill
+                    // Increment the stale tool liveness counter. If no real events have arrived
+                    // to reset this counter, we may have a zombie ActiveToolCallCount from a dead
+                    // JSON-RPC connection (the server TCP port is alive, but our session's
+                    // connection died and ToolExecutionCompleteEvent will never arrive).
+                    var staleChecks = Interlocked.Increment(ref state.StaleToolLivenessChecks);
+                    if (staleChecks > WatchdogMaxStaleToolLivenessChecks)
+                    {
+                        // Too many consecutive "server alive" checks without any real events.
+                        // The JSON-RPC connection is likely dead. Proceed to recovery.
+ Debug($"[WATCHDOG] '{sessionName}' {staleChecks} consecutive stale liveness checks without events — " + + $"assuming dead JSON-RPC connection (server TCP port is alive but session connection likely died)"); + } + else + { + Debug($"[WATCHDOG] '{sessionName}' {elapsed:F0}s inactivity but tool is running and server is alive — resetting timer " + + $"(timeout={effectiveTimeout}s, totalProcessing={totalProcessingSeconds:F0}s, staleChecks={staleChecks}/{WatchdogMaxStaleToolLivenessChecks})"); + Interlocked.Exchange(ref state.LastEventAtTicks, DateTime.UtcNow.Ticks); + continue; // keep waiting — don't kill + } } Debug($"[WATCHDOG] '{sessionName}' tool running but server is not responding — killing stuck session " + $"(elapsed={elapsed:F0}s, totalProcessing={totalProcessingSeconds:F0}s)"); @@ -1644,6 +1673,7 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session Interlocked.Exchange(ref state.ActiveToolCallCount, 0); state.HasUsedToolsThisTurn = false; Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); + Interlocked.Exchange(ref state.StaleToolLivenessChecks, 0); // Cancel any pending TurnEnd→Idle fallback CancelTurnEndFallback(state); state.Info.IsResumed = false; @@ -1710,6 +1740,7 @@ private void ClearProcessingStateForRecoveryFailure(SessionState state, string s state.HasUsedToolsThisTurn = false; Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); Interlocked.Exchange(ref state.ActiveToolCallCount, 0); + Interlocked.Exchange(ref state.StaleToolLivenessChecks, 0); state.Info.ProcessingStartedAt = null; state.Info.ToolCallCount = 0; state.Info.ProcessingPhase = 0; @@ -1839,6 +1870,7 @@ private async Task TryRecoverPermissionAsync(SessionState state, string sessionN state.HasUsedToolsThisTurn = false; Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0); Interlocked.Exchange(ref state.ActiveToolCallCount, 0); + Interlocked.Exchange(ref state.StaleToolLivenessChecks, 0); state.Info.ProcessingStartedAt = null; state.Info.ToolCallCount = 0; state.Info.ProcessingPhase = 0; diff --git a/PolyPilot/Services/CopilotService.cs b/PolyPilot/Services/CopilotService.cs index 63856b2d68..ba9b7294cd 100644 --- a/PolyPilot/Services/CopilotService.cs +++ b/PolyPilot/Services/CopilotService.cs @@ -440,6 +440,12 @@ private class SessionState /// Cleared on CompleteResponse and SendPromptAsync. /// public ConcurrentDictionary PendingReasoningMessages { get; } = new(); + /// + /// Counter for consecutive watchdog checks that reset the timer in the "tool running + server alive" + /// state without receiving any real events. Reset to 0 whenever a real event arrives. + /// Used to detect dead JSON-RPC connections that left ActiveToolCallCount > 0 as a zombie. 
+        /// </summary>
+        public int StaleToolLivenessChecks;
     }
 
     private void Debug(string message)
@@ -2410,6 +2416,7 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis
         Interlocked.Exchange(ref state.ActiveToolCallCount, 0); // Reset stale tool count from previous turn
         state.HasUsedToolsThisTurn = false; // Reset stale tool flag from previous turn
         Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
+        Interlocked.Exchange(ref state.StaleToolLivenessChecks, 0); // Reset stale watchdog liveness counter
         // Cancel any pending TurnEnd→Idle fallback from the previous turn
         CancelTurnEndFallback(state);
         state.IsMultiAgentSession = IsSessionInMultiAgentGroup(sessionName); // Cache for watchdog (UI thread safe)
@@ -2678,6 +2685,7 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis
             Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
             state.HasUsedToolsThisTurn = false;
             Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
+            Interlocked.Exchange(ref state.StaleToolLivenessChecks, 0);
             state.Info.IsResumed = false;
             state.Info.IsProcessing = false;
             if (state.Info.ProcessingStartedAt is { } rcStarted)
@@ -2699,6 +2707,7 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis
             Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
             state.HasUsedToolsThisTurn = false;
             Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
+            Interlocked.Exchange(ref state.StaleToolLivenessChecks, 0);
             state.Info.IsResumed = false;
             state.Info.IsProcessing = false;
             if (state.Info.ProcessingStartedAt is { } saStarted)
@@ -2847,6 +2856,7 @@ public async Task AbortSessionAsync(string sessionName, bool markAsInterrupted =
         Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
         state.HasUsedToolsThisTurn = false;
         Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
+        Interlocked.Exchange(ref state.StaleToolLivenessChecks, 0);
        // Release send lock — allows a subsequent SteerSessionAsync to acquire it immediately
         Interlocked.Exchange(ref state.SendingFlag, 0);
         // Clear queued messages so they don't auto-send after abort
@@ -2951,6 +2961,7 @@ await InvokeOnUIAsync(() =>
             Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
             state.HasUsedToolsThisTurn = false;
             Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
+            Interlocked.Exchange(ref state.StaleToolLivenessChecks, 0);
             state.Info.IsResumed = false;
             Interlocked.Exchange(ref state.SendingFlag, 0);
             // Clear IsProcessing BEFORE completing TCS (INV-O3)