Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions PolyPilot.Tests/MultiAgentRegressionTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1459,11 +1459,17 @@ public void ReconnectState_ShouldCarryIsMultiAgentSession()
// 120s inactivity timeout instead of 600s, killing long-running worker tasks.
var source = File.ReadAllText(Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.cs"));

// Find the reconnect block where HasUsedToolsThisTurn is carried forward
var reconnectBlock = source.Substring(source.IndexOf("newState.HasUsedToolsThisTurn = state.HasUsedToolsThisTurn"));
// IsMultiAgentSession must be carried forward in the same block
// Use 400 chars to accommodate intermediate carry-forward lines (e.g. SuccessfulToolCountThisTurn)
Assert.Contains("newState.IsMultiAgentSession = state.IsMultiAgentSession", reconnectBlock.Substring(0, 400));
// Find the reconnect block where state is replaced
var marker = "[RECONNECT] '{sessionName}' replacing state";
var reconnectIdx = source.IndexOf(marker);
Assert.True(reconnectIdx >= 0, "Reconnect block must exist");
var watchdogIdx = source.IndexOf("StartProcessingWatchdog(state, sessionName)", reconnectIdx);
Assert.True(watchdogIdx >= 0, "StartProcessingWatchdog must follow reconnect block");
var reconnectBlock = source.Substring(reconnectIdx, watchdogIdx - reconnectIdx);
// IsMultiAgentSession must be carried forward (it's a property of the conversation, not the connection)
Assert.Contains("newState.IsMultiAgentSession = state.IsMultiAgentSession", reconnectBlock);
// But HasUsedToolsThisTurn must be RESET to false (it's connection-specific, not conversation-specific)
Assert.Contains("HasUsedToolsThisTurn = false", reconnectBlock);
}

[Fact]
Expand Down
176 changes: 176 additions & 0 deletions PolyPilot.Tests/ProcessingWatchdogTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2411,4 +2411,180 @@ public void AssistantTurnStartEvent_LoggedInEvtDiagnostics_InSource()
"AssistantTurnStartEvent must be included in [EVT] logging so TurnStart " +
"is visible in diagnostics (prevents invisible fallback cancellations)");
}

// ===========================================================================
// Regression tests for: stuck session due to watchdog Case A infinite reset
// Bug: When a tool is active (ActiveToolCallCount > 0) and the persistent
// server is alive but the specific session's JSON-RPC connection is dead,
// Case A resets LastEventAtTicks indefinitely. ProcessingStartedAt resets
// on each app restart, so the 60-minute max time safety net never fires.
// Fix: Cap consecutive Case A resets via WatchdogMaxToolAliveResets.
// ===========================================================================

[Fact]
public void WatchdogMaxToolAliveResets_IsReasonable()
{
// Must allow at least 1 reset (legitimate long tool execution),
// but not infinite. Cap at a reasonable number so stuck sessions
// are killed in 30-60 minutes even with server alive.
Assert.InRange(CopilotService.WatchdogMaxToolAliveResets, 1, 10);
}

[Fact]
public void WatchdogMaxToolAliveResets_BoundsMaxStuckTime()
{
// The reset counter is incremented BEFORE the comparison:
// var resets = Interlocked.Increment(ref state.WatchdogCaseAResets);
// if (resets > WatchdogMaxToolAliveResets) { /* fall through */ }
// So the cap fires on the (N+1)th trigger. Actual max stuck time
// = (WatchdogMaxToolAliveResets + 1) × tool timeout (one initial
// timeout to enter the block, then N resets before exceeding the cap).
// This must be less than WatchdogMaxProcessingTimeSeconds (3600s)
// so the reset cap fires before the absolute max (which may be
// defeated by ProcessingStartedAt resetting on app restart).
var maxCaseAStuckSeconds = (CopilotService.WatchdogMaxToolAliveResets + 1)
* CopilotService.WatchdogToolExecutionTimeoutSeconds;
Assert.True(maxCaseAStuckSeconds < CopilotService.WatchdogMaxProcessingTimeSeconds,
$"Case A max stuck time ({maxCaseAStuckSeconds}s) must be less than " +
$"absolute max ({CopilotService.WatchdogMaxProcessingTimeSeconds}s). " +
"The reset cap is the PRIMARY safety net since the absolute max resets on app restart.");
}

[Fact]
public void CaseA_ExceedingMaxResets_FallsThroughToKill_InSource()
{
// Verify the watchdog's Case A checks WatchdogMaxToolAliveResets and falls
// through when exceeded. This is the core fix for the stuck-session bug where
// a dead session's tool appears active but no events ever arrive.
var source = File.ReadAllText(
Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs"));
var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync");
Assert.True(methodIdx >= 0, "RunProcessingWatchdogAsync must exist");
var endIdx = source.IndexOf(" private readonly ConcurrentDictionary", methodIdx);
var watchdogBody = source.Substring(methodIdx, endIdx - methodIdx);

// Case A must reference WatchdogMaxToolAliveResets
Assert.True(watchdogBody.Contains("WatchdogMaxToolAliveResets"),
"Case A must check WatchdogMaxToolAliveResets to cap consecutive resets");
// Case A must increment WatchdogCaseAResets
Assert.True(watchdogBody.Contains("WatchdogCaseAResets"),
"Case A must track reset count via state.WatchdogCaseAResets");
}

[Fact]
public void CaseA_ResetCounter_ClearedOnRealEvents_InSource()
{
// When real SDK events arrive (not usage/metrics), the Case A reset counter
// must be cleared. This proves the session's connection is alive, so future
// Case A resets should be fresh (not counting against the cap).
var source = File.ReadAllText(
Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs"));
var handlerIdx = source.IndexOf("private void HandleSessionEvent");
Assert.True(handlerIdx >= 0, "HandleSessionEvent must exist");
// Find the block that resets LastEventAtTicks (only for real events)
var lastEventResetIdx = source.IndexOf("LastEventAtTicks", handlerIdx);
Assert.True(lastEventResetIdx >= 0, "HandleSessionEvent must update LastEventAtTicks");
// The counter reset must be near the LastEventAtTicks reset (within ~300 chars)
var nearbyBlock = source.Substring(lastEventResetIdx, Math.Min(300, source.Length - lastEventResetIdx));
Assert.True(nearbyBlock.Contains("WatchdogCaseAResets"),
"WatchdogCaseAResets must be reset near LastEventAtTicks in HandleSessionEvent " +
"to clear the counter when real SDK events prove the connection is alive");
}

[Fact]
public void CaseA_ResetCounter_ClearedOnWatchdogStart_InSource()
{
// StartProcessingWatchdog must reset WatchdogCaseAResets to 0 so each new
// watchdog instance starts with a clean reset counter.
var source = File.ReadAllText(
Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs"));
var startIdx = source.IndexOf("private void StartProcessingWatchdog");
Assert.True(startIdx >= 0, "StartProcessingWatchdog must exist");
var methodEnd = source.IndexOf("_ = RunProcessingWatchdogAsync", startIdx);
var methodBody = source.Substring(startIdx, methodEnd - startIdx);
Assert.True(methodBody.Contains("WatchdogCaseAResets"),
"StartProcessingWatchdog must reset WatchdogCaseAResets to 0");
}

[Fact]
public void ExceededMaxTime_TrueWhenProcessingStartedAtNull_InSource()
{
// Defensive: if ProcessingStartedAt is null while IsProcessing is true,
// exceededMaxTime must be true. Without this, Case A can reset forever
// because totalProcessingSeconds=0 makes exceededMaxTime always false.
var source = File.ReadAllText(
Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs"));
var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync");
Assert.True(methodIdx >= 0, "RunProcessingWatchdogAsync must exist");
var endIdx = source.IndexOf(" private readonly ConcurrentDictionary", methodIdx);
var watchdogBody = source.Substring(methodIdx, endIdx - methodIdx);

// The exceededMaxTime calculation must handle null ProcessingStartedAt
Assert.True(watchdogBody.Contains("!startedAt.HasValue"),
"exceededMaxTime must be true when ProcessingStartedAt is null " +
"(defensive guard against Case A infinite reset)");
}

// ===========================================================================
// Regression tests for: reconnect path inherits stale tool state
// Bug: After reconnect, HasUsedToolsThisTurn=true from the dead connection
// inflates the watchdog timeout from 120s to 600s. ProcessingStartedAt is
// not reset, so the max-time safety net measures from the original send.
// Fix: Reset tool tracking and ProcessingStartedAt in the reconnect block.
// ===========================================================================

[Fact]
public void ReconnectPath_ResetsHasUsedToolsThisTurn_InSource()
{
// After reconnect, HasUsedToolsThisTurn must be false so the new connection
// uses the 120s inactivity timeout, not the 600s tool timeout inherited from
// the dead connection. Without this, reconnected sessions wait 5x longer.
var source = File.ReadAllText(
Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.cs"));
var reconnectIdx = source.IndexOf("[RECONNECT] '{sessionName}' replacing state");
Assert.True(reconnectIdx >= 0, "Reconnect block must exist");
// Find StartProcessingWatchdog in the reconnect block (marks the end of state setup)
var watchdogIdx = source.IndexOf("StartProcessingWatchdog(state, sessionName)", reconnectIdx);
Assert.True(watchdogIdx >= 0, "StartProcessingWatchdog must be called in reconnect block");
var reconnectBlock = source.Substring(reconnectIdx, watchdogIdx - reconnectIdx);

// HasUsedToolsThisTurn must be set to false (not carried from old state)
Assert.True(reconnectBlock.Contains("HasUsedToolsThisTurn = false"),
"Reconnect block must reset HasUsedToolsThisTurn to false for the new connection. " +
"Carrying over true from the dead connection inflates the timeout from 120s to 600s.");
}

[Fact]
public void ReconnectPath_ResetsProcessingStartedAt_InSource()
{
// After reconnect, ProcessingStartedAt must be reset to DateTime.UtcNow
// so the watchdog's max-time safety net measures from the reconnect time,
// not the original send time.
var source = File.ReadAllText(
Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.cs"));
var reconnectIdx = source.IndexOf("[RECONNECT] '{sessionName}' replacing state");
Assert.True(reconnectIdx >= 0, "Reconnect block must exist");
var watchdogIdx = source.IndexOf("StartProcessingWatchdog(state, sessionName)", reconnectIdx);
var reconnectBlock = source.Substring(reconnectIdx, watchdogIdx - reconnectIdx);

Assert.True(reconnectBlock.Contains("ProcessingStartedAt = DateTime.UtcNow"),
"Reconnect block must reset ProcessingStartedAt to DateTime.UtcNow. " +
"Without this, the 60-min max-time safety net measures from the original send.");
}

[Fact]
public void ReconnectPath_ResetsActiveToolCallCount_InSource()
{
// After reconnect, ActiveToolCallCount must be 0. No tools have started
// on the new connection. A stale count > 0 would trigger Case A resets.
var source = File.ReadAllText(
Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.cs"));
var reconnectIdx = source.IndexOf("[RECONNECT] '{sessionName}' replacing state");
Assert.True(reconnectIdx >= 0, "Reconnect block must exist");
var watchdogIdx = source.IndexOf("StartProcessingWatchdog(state, sessionName)", reconnectIdx);
var reconnectBlock = source.Substring(reconnectIdx, watchdogIdx - reconnectIdx);

Assert.True(reconnectBlock.Contains("ActiveToolCallCount") && reconnectBlock.Contains("0"),
"Reconnect block must reset ActiveToolCallCount to 0 for the new connection.");
}
}
46 changes: 39 additions & 7 deletions PolyPilot/Services/CopilotService.Events.cs
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,9 @@ private void HandleSessionEvent(SessionState state, SessionEvent evt)
if (evt is not SessionUsageInfoEvent and not AssistantUsageEvent)
{
Interlocked.Exchange(ref state.LastEventAtTicks, DateTime.UtcNow.Ticks);
// Real event arrived — reset the Case A reset counter. This proves the session's
// JSON-RPC connection is alive, so future Case A resets are legitimate.
Interlocked.Exchange(ref state.WatchdogCaseAResets, 0);
state.Info.LastUpdatedAt = DateTime.Now;
}
var sessionName = state.Info.Name;
Expand Down Expand Up @@ -1399,6 +1402,15 @@ private void HandleReflectionAdvanceResult(SessionState state, string response,
/// non-progress events (like repeated SessionUsageInfoEvent) keep arriving without a terminal event.</summary>
internal const int WatchdogMaxProcessingTimeSeconds = 3600; // 60 minutes

/// <summary>Maximum number of consecutive Case A resets (tool active + server alive) before
/// the watchdog assumes the session's JSON-RPC connection is dead and kills it anyway.
/// The persistent server may still be alive serving other sessions while this specific
/// session's transport-level connection is broken (ConnectionLostException). Without this cap,
/// Case A resets LastEventAtTicks indefinitely, and ProcessingStartedAt resets on each app
/// restart — so neither the inactivity nor the max-time safety net ever fires.
/// (3+1) resets × 600s effective timeout ≈ 40 minutes max of Case A resets.</summary>
internal const int WatchdogMaxToolAliveResets = 3;

/// <summary>
/// Milliseconds to wait after AssistantTurnEndEvent before firing CompleteResponse
/// as a fallback, in case SessionIdleEvent never arrives (SDK bug #299).
Expand Down Expand Up @@ -1458,6 +1470,7 @@ private void StartProcessingWatchdog(SessionState state, string sessionName)
// timeout to fire on the first watchdog check for any file > ~15s old.
// This is the exact regression pattern from PR #148 (short timeout killing active sessions).
Interlocked.Exchange(ref state.LastEventAtTicks, DateTime.UtcNow.Ticks);
Interlocked.Exchange(ref state.WatchdogCaseAResets, 0);
state.ProcessingWatchdog = new CancellationTokenSource();
var ct = state.ProcessingWatchdog.Token;
_ = RunProcessingWatchdogAsync(state, sessionName, ct);
Expand Down Expand Up @@ -1559,7 +1572,11 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session

if (elapsed >= effectiveTimeout)
{
var exceededMaxTime = totalProcessingSeconds >= WatchdogMaxProcessingTimeSeconds;
// Defensive: if ProcessingStartedAt is null while IsProcessing is true,
// something is already wrong — treat as exceeded max time so Case A
// can't reset the timer indefinitely.
var exceededMaxTime = !startedAt.HasValue
|| totalProcessingSeconds >= WatchdogMaxProcessingTimeSeconds;

// Before killing, check what state we're actually in:
//
Expand All @@ -1586,13 +1603,28 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session
var serverAlive = _serverManager.IsServerRunning;
if (serverAlive)
{
Debug($"[WATCHDOG] '{sessionName}' {elapsed:F0}s inactivity but tool is running and server is alive — resetting timer " +
$"(timeout={effectiveTimeout}s, totalProcessing={totalProcessingSeconds:F0}s)");
Interlocked.Exchange(ref state.LastEventAtTicks, DateTime.UtcNow.Ticks);
continue; // keep waiting — don't kill
var resets = Interlocked.Increment(ref state.WatchdogCaseAResets);
if (resets > WatchdogMaxToolAliveResets)
{
// Too many consecutive resets with no real SDK events — the
// session's JSON-RPC connection is likely dead even though the
// shared persistent server is still alive. Fall through to kill.
Debug($"[WATCHDOG] '{sessionName}' Case A reset cap exceeded ({resets}/{WatchdogMaxToolAliveResets}) " +
$"— killing despite server alive (elapsed={elapsed:F0}s, totalProcessing={totalProcessingSeconds:F0}s)");
}
else
{
Debug($"[WATCHDOG] '{sessionName}' {elapsed:F0}s inactivity but tool is running and server is alive — resetting timer " +
$"(reset #{resets}/{WatchdogMaxToolAliveResets}, timeout={effectiveTimeout}s, totalProcessing={totalProcessingSeconds:F0}s)");
Interlocked.Exchange(ref state.LastEventAtTicks, DateTime.UtcNow.Ticks);
continue; // keep waiting — don't kill
}
}
else
{
Debug($"[WATCHDOG] '{sessionName}' tool running but server is not responding — killing stuck session " +
$"(elapsed={elapsed:F0}s, totalProcessing={totalProcessingSeconds:F0}s)");
}
Debug($"[WATCHDOG] '{sessionName}' tool running but server is not responding — killing stuck session " +
$"(elapsed={elapsed:F0}s, totalProcessing={totalProcessingSeconds:F0}s)");
}
else if (!hasActiveTool && (hasUsedTools || (isMultiAgentSession && !IsDemoMode && !IsRemoteMode && _serverManager.IsServerRunning)))
{
Expand Down
Loading