Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
175 changes: 175 additions & 0 deletions PolyPilot.Tests/ProcessingWatchdogTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2411,4 +2411,179 @@ public void AssistantTurnStartEvent_LoggedInEvtDiagnostics_InSource()
"AssistantTurnStartEvent must be included in [EVT] logging so TurnStart " +
"is visible in diagnostics (prevents invisible fallback cancellations)");
}

// ===== Stale tool liveness check (fix for zombie ActiveToolCallCount) =====

[Fact]
public void WatchdogMaxStaleToolLivenessChecks_Constant_Exists()
{
// The constant must exist and be reasonable — at least 2 checks (30s) to avoid
// false positives, at most 10 checks (150s) to ensure recovery in reasonable time.
Assert.InRange(CopilotService.WatchdogMaxStaleToolLivenessChecks, 2, 10);
}

[Fact]
public void StaleToolLivenessChecks_ExistsInSessionState()
{
// StaleToolLivenessChecks must exist in SessionState to track consecutive
// "tool running + server alive" resets without any real events.
var source = File.ReadAllText(
Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.cs"));
Assert.True(source.Contains("StaleToolLivenessChecks"),
"StaleToolLivenessChecks field must exist in SessionState to track zombie tool detection");
}

[Fact]
public void StaleToolLivenessChecks_ResetWhenRealEventArrives()
{
// When a real SDK event arrives (not SessionUsageInfoEvent or AssistantUsageEvent),
// StaleToolLivenessChecks must be reset to 0 to prove the connection is alive.
var source = File.ReadAllText(
Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs"));
// Find the event handler where LastEventAtTicks is updated
var lastEventTicksIdx = source.IndexOf("LastEventAtTicks, DateTime.UtcNow.Ticks");
Assert.True(lastEventTicksIdx >= 0, "LastEventAtTicks update must exist in event handler");
// StaleToolLivenessChecks must be reset near the same location (within 500 chars to account for comment)
var contextStart = Math.Max(0, lastEventTicksIdx - 100);
var contextEnd = Math.Min(source.Length, lastEventTicksIdx + 500);
var context = source.Substring(contextStart, contextEnd - contextStart);
Assert.True(context.Contains("StaleToolLivenessChecks, 0"),
"StaleToolLivenessChecks must be reset to 0 when a real event arrives (proves connection alive)");
}

[Fact]
public void WatchdogDecision_ActiveTool_ServerAlive_IncrementsStaleCounter()
{
// When the watchdog sees "tool running + server alive", it must increment
// StaleToolLivenessChecks before deciding whether to reset the timer.
var source = File.ReadAllText(
Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs"));
var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync");
var elapsedIdx = source.IndexOf("elapsed >= effectiveTimeout", methodIdx);
var methodEndIdx = source.IndexOf(" private readonly ConcurrentDictionary", methodIdx);
if (methodEndIdx < 0) methodEndIdx = source.Length;
var block = source.Substring(elapsedIdx, methodEndIdx - elapsedIdx);

// Find the server alive check
var serverAliveIdx = block.IndexOf("serverAlive");
Assert.True(serverAliveIdx >= 0, "serverAlive check must exist in timeout block");

// After server alive check, must increment stale counter before continue
var afterServerAlive = block.Substring(serverAliveIdx);
var incrementIdx = afterServerAlive.IndexOf("Increment(ref state.StaleToolLivenessChecks)");
Assert.True(incrementIdx >= 0,
"Must increment StaleToolLivenessChecks when server is alive to detect zombie state");
}

[Fact]
public void WatchdogDecision_StaleCounterExceedsLimit_StopsResettingTimer()
{
// When StaleToolLivenessChecks exceeds WatchdogMaxStaleToolLivenessChecks,
// the watchdog must stop resetting the timer and proceed to recovery.
var source = File.ReadAllText(
Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs"));
var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync");
var elapsedIdx = source.IndexOf("elapsed >= effectiveTimeout", methodIdx);
var methodEndIdx = source.IndexOf(" private readonly ConcurrentDictionary", methodIdx);
if (methodEndIdx < 0) methodEndIdx = source.Length;
var block = source.Substring(elapsedIdx, methodEndIdx - elapsedIdx);

// Must compare stale counter to limit
Assert.True(block.Contains("WatchdogMaxStaleToolLivenessChecks"),
"Watchdog must compare stale counter to WatchdogMaxStaleToolLivenessChecks limit");

// When limit exceeded, must NOT continue — should fall through to recovery
var staleCheckIdx = block.IndexOf("staleChecks > WatchdogMaxStaleToolLivenessChecks");
Assert.True(staleCheckIdx >= 0,
"Watchdog must check if staleChecks exceeds limit to detect zombie connection");

// The 'continue' (timer reset) must be inside the else block, not after stale check
var afterStaleCheck = block.Substring(staleCheckIdx);
var elseIdx = afterStaleCheck.IndexOf("else");
var continueIdx = afterStaleCheck.IndexOf("continue;");
Assert.True(elseIdx > 0 && continueIdx > elseIdx,
"The 'continue' must be in the else branch (only reset timer if below stale limit)");
}

[Fact]
public void StaleToolLivenessChecks_ResetOnSendPrompt()
{
// SendPromptAsync must reset StaleToolLivenessChecks along with other tool state.
var source = File.ReadAllText(
Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.cs"));
// Find where ActiveToolCallCount is reset in SendPromptAsync (unique enough signature)
var sendPromptIdx = source.IndexOf("SendPromptAsync(string sessionName, string prompt");
Assert.True(sendPromptIdx >= 0, "SendPromptAsync must exist");
// Search within 8000 chars (method body) for the reset
var searchEnd = Math.Min(source.Length, sendPromptIdx + 8000);
var sendPromptBody = source.Substring(sendPromptIdx, searchEnd - sendPromptIdx);

// Must reset stale tool liveness checks
Assert.True(sendPromptBody.Contains("StaleToolLivenessChecks, 0"),
"SendPromptAsync must reset StaleToolLivenessChecks to 0 on new turn");
}

[Fact]
public void StaleToolLivenessChecks_ResetOnAbort()
{
// AbortSessionAsync must reset StaleToolLivenessChecks along with other tool state.
var source = File.ReadAllText(
Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.cs"));
var abortIdx = source.IndexOf("public async Task AbortSessionAsync");
Assert.True(abortIdx >= 0, "AbortSessionAsync must exist");
var abortEnd = source.IndexOf("public async Task<bool> SteerSessionAsync", abortIdx);
if (abortEnd < 0) abortEnd = source.Length;
var abortBody = source.Substring(abortIdx, abortEnd - abortIdx);

Assert.True(abortBody.Contains("StaleToolLivenessChecks, 0"),
"AbortSessionAsync must reset StaleToolLivenessChecks to 0");
}

[Fact]
public void StaleToolLivenessChecks_ResetOnCompleteResponse()
{
// CompleteResponse must reset StaleToolLivenessChecks along with other tool state.
var source = File.ReadAllText(
Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs"));
var completeIdx = source.IndexOf("private void CompleteResponse(SessionState state");
Assert.True(completeIdx >= 0, "CompleteResponse must exist");
// Search within 3000 chars for the body
var searchEnd = Math.Min(source.Length, completeIdx + 3000);
var completeBody = source.Substring(completeIdx, searchEnd - completeIdx);

Assert.True(completeBody.Contains("StaleToolLivenessChecks, 0"),
"CompleteResponse must reset StaleToolLivenessChecks to 0");
}

[Fact]
public void BugFix_ZombieActiveToolCount_RecoveredAfterStaleLimit()
{
// REGRESSION TEST for the bug where:
// 1. Tool starts → ActiveToolCallCount incremented
// 2. JSON-RPC connection dies → ToolExecutionCompleteEvent never arrives
// 3. ActiveToolCallCount stays > 0 (zombie)
// 4. Server TCP port check passes (server process alive)
// 5. Watchdog keeps resetting timer indefinitely
//
// Fix: After WatchdogMaxStaleToolLivenessChecks consecutive resets without
// any real events, stop trusting ActiveToolCallCount and recover.
var source = File.ReadAllText(
Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs"));

// The fix introduces StaleToolLivenessChecks
Assert.True(source.Contains("StaleToolLivenessChecks"),
"StaleToolLivenessChecks must exist for zombie ActiveToolCallCount detection");

// The fix must increment counter on each "server alive" reset
Assert.True(source.Contains("Increment(ref state.StaleToolLivenessChecks)"),
"Must increment counter when resetting timer in server-alive case");

// The fix must check against limit before resetting timer
Assert.True(source.Contains("staleChecks > WatchdogMaxStaleToolLivenessChecks"),
"Must check if counter exceeds limit to detect zombie state");

// The fix must reset counter when real events arrive (check whole file)
Assert.True(source.Contains("Exchange(ref state.StaleToolLivenessChecks, 0)"),
"Must reset counter when real events arrive (proves connection alive)");
}
}
40 changes: 36 additions & 4 deletions PolyPilot/Services/CopilotService.Events.cs
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,10 @@ private void HandleSessionEvent(SessionState state, SessionEvent evt)
{
Interlocked.Exchange(ref state.LastEventAtTicks, DateTime.UtcNow.Ticks);
state.Info.LastUpdatedAt = DateTime.Now;
// Real event arrived — reset the stale tool liveness counter. This proves the
// JSON-RPC connection is alive, so previous "tool running + server alive" resets
// were legitimate, not zombie state from a dead connection.
Interlocked.Exchange(ref state.StaleToolLivenessChecks, 0);
}
var sessionName = state.Info.Name;
var isCurrentState = _sessions.TryGetValue(sessionName, out var current) && ReferenceEquals(current, state);
Expand Down Expand Up @@ -445,6 +449,8 @@ void Invoke(Action action)
var phaseAdvancedToThinking = state.Info.ProcessingPhase < 2;
if (phaseAdvancedToThinking) state.Info.ProcessingPhase = 2; // Thinking
Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
// Note: StaleToolLivenessChecks is reset by the real event handler (above) that
// updates LastEventAtTicks, so we don't need to reset it here explicitly.
Invoke(() =>
{
OnTurnStart?.Invoke(sessionName);
Expand Down Expand Up @@ -651,6 +657,7 @@ await notifService.SendNotificationAsync(
Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
state.HasUsedToolsThisTurn = false;
Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
Interlocked.Exchange(ref state.StaleToolLivenessChecks, 0);
InvokeOnUI(() =>
{
OnError?.Invoke(sessionName, errMsg);
Expand Down Expand Up @@ -862,6 +869,7 @@ private void CompleteResponse(SessionState state, long? expectedGeneration = nul
Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
state.HasUsedToolsThisTurn = false;
Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
Interlocked.Exchange(ref state.StaleToolLivenessChecks, 0);
state.Info.IsResumed = false; // Clear after first successful turn
var response = state.CurrentResponse.ToString();
if (!string.IsNullOrWhiteSpace(response))
Expand Down Expand Up @@ -1398,6 +1406,12 @@ private void HandleReflectionAdvanceResult(SessionState state, string response,
/// no single turn should run longer than this. This is a safety net for scenarios where
/// non-progress events (like repeated SessionUsageInfoEvent) keep arriving without a terminal event.</summary>
internal const int WatchdogMaxProcessingTimeSeconds = 3600; // 60 minutes
/// <summary>Maximum number of consecutive watchdog checks that can reset the timer
/// in the "tool running + server alive" state without any real events arriving.
/// After this limit, the JSON-RPC connection is assumed dead (even if the server TCP port is alive)
/// and the session should be recovered. This prevents indefinite stuck sessions when the
/// connection dies mid-tool-execution but the server process remains running.</summary>
internal const int WatchdogMaxStaleToolLivenessChecks = 4; // ~60 seconds at 15s intervals

/// <summary>
/// Milliseconds to wait after AssistantTurnEndEvent before firing CompleteResponse
Expand Down Expand Up @@ -1586,10 +1600,25 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session
var serverAlive = _serverManager.IsServerRunning;
if (serverAlive)
{
Debug($"[WATCHDOG] '{sessionName}' {elapsed:F0}s inactivity but tool is running and server is alive — resetting timer " +
$"(timeout={effectiveTimeout}s, totalProcessing={totalProcessingSeconds:F0}s)");
Interlocked.Exchange(ref state.LastEventAtTicks, DateTime.UtcNow.Ticks);
continue; // keep waiting — don't kill
// Increment the stale tool liveness counter. If no real events have arrived
// to reset this counter, we may have a zombie ActiveToolCallCount from a dead
// JSON-RPC connection (the server TCP port is alive, but our session's
// connection died and ToolExecutionCompleteEvent will never arrive).
var staleChecks = Interlocked.Increment(ref state.StaleToolLivenessChecks);
if (staleChecks > WatchdogMaxStaleToolLivenessChecks)
{
// Too many consecutive "server alive" checks without any real events.
// The JSON-RPC connection is likely dead. Proceed to recovery.
Debug($"[WATCHDOG] '{sessionName}' {staleChecks} consecutive stale liveness checks without events — " +
$"assuming dead JSON-RPC connection (server TCP port is alive but session connection likely died)");
}
else
{
Debug($"[WATCHDOG] '{sessionName}' {elapsed:F0}s inactivity but tool is running and server is alive — resetting timer " +
$"(timeout={effectiveTimeout}s, totalProcessing={totalProcessingSeconds:F0}s, staleChecks={staleChecks}/{WatchdogMaxStaleToolLivenessChecks})");
Interlocked.Exchange(ref state.LastEventAtTicks, DateTime.UtcNow.Ticks);
continue; // keep waiting — don't kill
}
}
Debug($"[WATCHDOG] '{sessionName}' tool running but server is not responding — killing stuck session " +
$"(elapsed={elapsed:F0}s, totalProcessing={totalProcessingSeconds:F0}s)");
Expand Down Expand Up @@ -1644,6 +1673,7 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session
Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
state.HasUsedToolsThisTurn = false;
Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
Interlocked.Exchange(ref state.StaleToolLivenessChecks, 0);
// Cancel any pending TurnEnd→Idle fallback
CancelTurnEndFallback(state);
state.Info.IsResumed = false;
Expand Down Expand Up @@ -1710,6 +1740,7 @@ private void ClearProcessingStateForRecoveryFailure(SessionState state, string s
state.HasUsedToolsThisTurn = false;
Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
Interlocked.Exchange(ref state.StaleToolLivenessChecks, 0);
state.Info.ProcessingStartedAt = null;
state.Info.ToolCallCount = 0;
state.Info.ProcessingPhase = 0;
Expand Down Expand Up @@ -1839,6 +1870,7 @@ private async Task TryRecoverPermissionAsync(SessionState state, string sessionN
state.HasUsedToolsThisTurn = false;
Interlocked.Exchange(ref state.SuccessfulToolCountThisTurn, 0);
Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
Interlocked.Exchange(ref state.StaleToolLivenessChecks, 0);
state.Info.ProcessingStartedAt = null;
state.Info.ToolCallCount = 0;
state.Info.ProcessingPhase = 0;
Expand Down
Loading