diff --git a/PolyPilot.Tests/MultiAgentRegressionTests.cs b/PolyPilot.Tests/MultiAgentRegressionTests.cs
index c25995f5c8..fe799684fb 100644
--- a/PolyPilot.Tests/MultiAgentRegressionTests.cs
+++ b/PolyPilot.Tests/MultiAgentRegressionTests.cs
@@ -1459,11 +1459,17 @@ public void ReconnectState_ShouldCarryIsMultiAgentSession()
// 120s inactivity timeout instead of 600s, killing long-running worker tasks.
var source = File.ReadAllText(Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.cs"));
- // Find the reconnect block where HasUsedToolsThisTurn is carried forward
- var reconnectBlock = source.Substring(source.IndexOf("newState.HasUsedToolsThisTurn = state.HasUsedToolsThisTurn"));
- // IsMultiAgentSession must be carried forward in the same block
- // Use 400 chars to accommodate intermediate carry-forward lines (e.g. SuccessfulToolCountThisTurn)
- Assert.Contains("newState.IsMultiAgentSession = state.IsMultiAgentSession", reconnectBlock.Substring(0, 400));
+ // Find the reconnect block where state is replaced
+ var marker = "[RECONNECT] '{sessionName}' replacing state";
+ var reconnectIdx = source.IndexOf(marker);
+ Assert.True(reconnectIdx >= 0, "Reconnect block must exist");
+ var watchdogIdx = source.IndexOf("StartProcessingWatchdog(state, sessionName)", reconnectIdx);
+ Assert.True(watchdogIdx >= 0, "StartProcessingWatchdog must follow reconnect block");
+ var reconnectBlock = source.Substring(reconnectIdx, watchdogIdx - reconnectIdx);
+ // IsMultiAgentSession must be carried forward (it's a property of the conversation, not the connection)
+ Assert.Contains("newState.IsMultiAgentSession = state.IsMultiAgentSession", reconnectBlock);
+ // But HasUsedToolsThisTurn must be RESET to false (it's connection-specific, not conversation-specific)
+ Assert.Contains("HasUsedToolsThisTurn = false", reconnectBlock);
}
[Fact]
diff --git a/PolyPilot.Tests/ProcessingWatchdogTests.cs b/PolyPilot.Tests/ProcessingWatchdogTests.cs
index 33db436a76..ec16c7ba7a 100644
--- a/PolyPilot.Tests/ProcessingWatchdogTests.cs
+++ b/PolyPilot.Tests/ProcessingWatchdogTests.cs
@@ -2411,4 +2411,180 @@ public void AssistantTurnStartEvent_LoggedInEvtDiagnostics_InSource()
"AssistantTurnStartEvent must be included in [EVT] logging so TurnStart " +
"is visible in diagnostics (prevents invisible fallback cancellations)");
}
+
+ // ===========================================================================
+ // Regression tests for: stuck session due to watchdog Case A infinite reset
+ // Bug: When a tool is active (ActiveToolCallCount > 0) and the persistent
+ // server is alive but the specific session's JSON-RPC connection is dead,
+ // Case A resets LastEventAtTicks indefinitely. ProcessingStartedAt resets
+ // on each app restart, so the 60-minute max time safety net never fires.
+ // Fix: Cap consecutive Case A resets via WatchdogMaxToolAliveResets.
+ // ===========================================================================
+
+ [Fact]
+ public void WatchdogMaxToolAliveResets_IsReasonable()
+ {
+ // Must allow at least 1 reset (legitimate long tool execution),
+ // but not infinite. Cap at a reasonable number so stuck sessions
+ // are killed in 30-60 minutes even with server alive.
+ Assert.InRange(CopilotService.WatchdogMaxToolAliveResets, 1, 10);
+ }
+
+ [Fact]
+ public void WatchdogMaxToolAliveResets_BoundsMaxStuckTime()
+ {
+ // The reset counter is incremented BEFORE the comparison:
+ // var resets = Interlocked.Increment(ref state.WatchdogCaseAResets);
+ // if (resets > WatchdogMaxToolAliveResets) { /* fall through */ }
+ // So the cap fires on the (N+1)th trigger. Actual max stuck time
+ // = (WatchdogMaxToolAliveResets + 1) × tool timeout (one initial
+ // timeout to enter the block, then N resets before exceeding the cap).
+ // This must be less than WatchdogMaxProcessingTimeSeconds (3600s)
+ // so the reset cap fires before the absolute max (which may be
+ // defeated by ProcessingStartedAt resetting on app restart).
+ var maxCaseAStuckSeconds = (CopilotService.WatchdogMaxToolAliveResets + 1)
+ * CopilotService.WatchdogToolExecutionTimeoutSeconds;
+ Assert.True(maxCaseAStuckSeconds < CopilotService.WatchdogMaxProcessingTimeSeconds,
+ $"Case A max stuck time ({maxCaseAStuckSeconds}s) must be less than " +
+ $"absolute max ({CopilotService.WatchdogMaxProcessingTimeSeconds}s). " +
+ "The reset cap is the PRIMARY safety net since the absolute max resets on app restart.");
+ }
+
+ [Fact]
+ public void CaseA_ExceedingMaxResets_FallsThroughToKill_InSource()
+ {
+ // Verify the watchdog's Case A checks WatchdogMaxToolAliveResets and falls
+ // through when exceeded. This is the core fix for the stuck-session bug where
+ // a dead session's tool appears active but no events ever arrive.
+ var source = File.ReadAllText(
+ Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs"));
+ var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync");
+ Assert.True(methodIdx >= 0, "RunProcessingWatchdogAsync must exist");
+ var endIdx = source.IndexOf(" private readonly ConcurrentDictionary", methodIdx);
+ var watchdogBody = source.Substring(methodIdx, endIdx - methodIdx);
+
+ // Case A must reference WatchdogMaxToolAliveResets
+ Assert.True(watchdogBody.Contains("WatchdogMaxToolAliveResets"),
+ "Case A must check WatchdogMaxToolAliveResets to cap consecutive resets");
+ // Case A must increment WatchdogCaseAResets
+ Assert.True(watchdogBody.Contains("WatchdogCaseAResets"),
+ "Case A must track reset count via state.WatchdogCaseAResets");
+ }
+
+ [Fact]
+ public void CaseA_ResetCounter_ClearedOnRealEvents_InSource()
+ {
+ // When real SDK events arrive (not usage/metrics), the Case A reset counter
+ // must be cleared. This proves the session's connection is alive, so future
+ // Case A resets should be fresh (not counting against the cap).
+ var source = File.ReadAllText(
+ Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs"));
+ var handlerIdx = source.IndexOf("private void HandleSessionEvent");
+ Assert.True(handlerIdx >= 0, "HandleSessionEvent must exist");
+ // Find the block that resets LastEventAtTicks (only for real events)
+ var lastEventResetIdx = source.IndexOf("LastEventAtTicks", handlerIdx);
+ Assert.True(lastEventResetIdx >= 0, "HandleSessionEvent must update LastEventAtTicks");
+ // The counter reset must be near the LastEventAtTicks reset (within ~300 chars)
+ var nearbyBlock = source.Substring(lastEventResetIdx, Math.Min(300, source.Length - lastEventResetIdx));
+ Assert.True(nearbyBlock.Contains("WatchdogCaseAResets"),
+ "WatchdogCaseAResets must be reset near LastEventAtTicks in HandleSessionEvent " +
+ "to clear the counter when real SDK events prove the connection is alive");
+ }
+
+ [Fact]
+ public void CaseA_ResetCounter_ClearedOnWatchdogStart_InSource()
+ {
+ // StartProcessingWatchdog must reset WatchdogCaseAResets to 0 so each new
+ // watchdog instance starts with a clean reset counter.
+ var source = File.ReadAllText(
+ Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs"));
+ var startIdx = source.IndexOf("private void StartProcessingWatchdog");
+ Assert.True(startIdx >= 0, "StartProcessingWatchdog must exist");
+ var methodEnd = source.IndexOf("_ = RunProcessingWatchdogAsync", startIdx);
+ var methodBody = source.Substring(startIdx, methodEnd - startIdx);
+ Assert.True(methodBody.Contains("WatchdogCaseAResets"),
+ "StartProcessingWatchdog must reset WatchdogCaseAResets to 0");
+ }
+
+ [Fact]
+ public void ExceededMaxTime_TrueWhenProcessingStartedAtNull_InSource()
+ {
+ // Defensive: if ProcessingStartedAt is null while IsProcessing is true,
+ // exceededMaxTime must be true. Without this, Case A can reset forever
+ // because totalProcessingSeconds=0 makes exceededMaxTime always false.
+ var source = File.ReadAllText(
+ Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.Events.cs"));
+ var methodIdx = source.IndexOf("private async Task RunProcessingWatchdogAsync");
+ Assert.True(methodIdx >= 0, "RunProcessingWatchdogAsync must exist");
+ var endIdx = source.IndexOf(" private readonly ConcurrentDictionary", methodIdx);
+ var watchdogBody = source.Substring(methodIdx, endIdx - methodIdx);
+
+ // The exceededMaxTime calculation must handle null ProcessingStartedAt
+ Assert.True(watchdogBody.Contains("!startedAt.HasValue"),
+ "exceededMaxTime must be true when ProcessingStartedAt is null " +
+ "(defensive guard against Case A infinite reset)");
+ }
+
+ // ===========================================================================
+ // Regression tests for: reconnect path inherits stale tool state
+ // Bug: After reconnect, HasUsedToolsThisTurn=true from the dead connection
+ // inflates the watchdog timeout from 120s to 600s. ProcessingStartedAt is
+ // not reset, so the max-time safety net measures from the original send.
+ // Fix: Reset tool tracking and ProcessingStartedAt in the reconnect block.
+ // ===========================================================================
+
+ [Fact]
+ public void ReconnectPath_ResetsHasUsedToolsThisTurn_InSource()
+ {
+ // After reconnect, HasUsedToolsThisTurn must be false so the new connection
+ // uses the 120s inactivity timeout, not the 600s tool timeout inherited from
+ // the dead connection. Without this, reconnected sessions wait 5x longer.
+ var source = File.ReadAllText(
+ Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.cs"));
+ var reconnectIdx = source.IndexOf("[RECONNECT] '{sessionName}' replacing state");
+ Assert.True(reconnectIdx >= 0, "Reconnect block must exist");
+ // Find StartProcessingWatchdog in the reconnect block (marks the end of state setup)
+ var watchdogIdx = source.IndexOf("StartProcessingWatchdog(state, sessionName)", reconnectIdx);
+ Assert.True(watchdogIdx >= 0, "StartProcessingWatchdog must be called in reconnect block");
+ var reconnectBlock = source.Substring(reconnectIdx, watchdogIdx - reconnectIdx);
+
+ // HasUsedToolsThisTurn must be set to false (not carried from old state)
+ Assert.True(reconnectBlock.Contains("HasUsedToolsThisTurn = false"),
+ "Reconnect block must reset HasUsedToolsThisTurn to false for the new connection. " +
+ "Carrying over true from the dead connection inflates the timeout from 120s to 600s.");
+ }
+
+ [Fact]
+ public void ReconnectPath_ResetsProcessingStartedAt_InSource()
+ {
+ // After reconnect, ProcessingStartedAt must be reset to DateTime.UtcNow
+ // so the watchdog's max-time safety net measures from the reconnect time,
+ // not the original send time.
+ var source = File.ReadAllText(
+ Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.cs"));
+ var reconnectIdx = source.IndexOf("[RECONNECT] '{sessionName}' replacing state");
+ Assert.True(reconnectIdx >= 0, "Reconnect block must exist");
+ var watchdogIdx = source.IndexOf("StartProcessingWatchdog(state, sessionName)", reconnectIdx);
+ var reconnectBlock = source.Substring(reconnectIdx, watchdogIdx - reconnectIdx);
+
+ Assert.True(reconnectBlock.Contains("ProcessingStartedAt = DateTime.UtcNow"),
+ "Reconnect block must reset ProcessingStartedAt to DateTime.UtcNow. " +
+ "Without this, the 60-min max-time safety net measures from the original send.");
+ }
+
+ [Fact]
+ public void ReconnectPath_ResetsActiveToolCallCount_InSource()
+ {
+ // After reconnect, ActiveToolCallCount must be 0. No tools have started
+ // on the new connection. A stale count > 0 would trigger Case A resets.
+ var source = File.ReadAllText(
+ Path.Combine(GetRepoRoot(), "PolyPilot", "Services", "CopilotService.cs"));
+ var reconnectIdx = source.IndexOf("[RECONNECT] '{sessionName}' replacing state");
+ Assert.True(reconnectIdx >= 0, "Reconnect block must exist");
+ var watchdogIdx = source.IndexOf("StartProcessingWatchdog(state, sessionName)", reconnectIdx);
+ var reconnectBlock = source.Substring(reconnectIdx, watchdogIdx - reconnectIdx);
+
+ Assert.True(reconnectBlock.Contains("ActiveToolCallCount") && reconnectBlock.Contains("0"),
+ "Reconnect block must reset ActiveToolCallCount to 0 for the new connection.");
+ }
}
\ No newline at end of file
diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs
index ec292752b5..f6ef17aa56 100644
--- a/PolyPilot/Services/CopilotService.Events.cs
+++ b/PolyPilot/Services/CopilotService.Events.cs
@@ -220,6 +220,9 @@ private void HandleSessionEvent(SessionState state, SessionEvent evt)
if (evt is not SessionUsageInfoEvent and not AssistantUsageEvent)
{
Interlocked.Exchange(ref state.LastEventAtTicks, DateTime.UtcNow.Ticks);
+ // Real event arrived — reset the Case A reset counter. This proves the session's
+ // JSON-RPC connection is alive, so future Case A resets are legitimate.
+ Interlocked.Exchange(ref state.WatchdogCaseAResets, 0);
state.Info.LastUpdatedAt = DateTime.Now;
}
var sessionName = state.Info.Name;
@@ -1399,6 +1402,15 @@ private void HandleReflectionAdvanceResult(SessionState state, string response,
/// non-progress events (like repeated SessionUsageInfoEvent) keep arriving without a terminal event.
internal const int WatchdogMaxProcessingTimeSeconds = 3600; // 60 minutes
+ /// Maximum number of consecutive Case A resets (tool active + server alive) before
+ /// the watchdog assumes the session's JSON-RPC connection is dead and kills it anyway.
+ /// The persistent server may still be alive serving other sessions while this specific
+ /// session's transport-level connection is broken (ConnectionLostException). Without this cap,
+ /// Case A resets LastEventAtTicks indefinitely, and ProcessingStartedAt resets on each app
+ /// restart — so neither the inactivity nor the max-time safety net ever fires.
+ /// 3 resets + 1 final (killing) trigger = 4 × 600s effective timeout ≈ 40 minutes max of Case A stuck time.
+ internal const int WatchdogMaxToolAliveResets = 3;
+
///
/// Milliseconds to wait after AssistantTurnEndEvent before firing CompleteResponse
/// as a fallback, in case SessionIdleEvent never arrives (SDK bug #299).
@@ -1458,6 +1470,7 @@ private void StartProcessingWatchdog(SessionState state, string sessionName)
// timeout to fire on the first watchdog check for any file > ~15s old.
// This is the exact regression pattern from PR #148 (short timeout killing active sessions).
Interlocked.Exchange(ref state.LastEventAtTicks, DateTime.UtcNow.Ticks);
+ Interlocked.Exchange(ref state.WatchdogCaseAResets, 0);
state.ProcessingWatchdog = new CancellationTokenSource();
var ct = state.ProcessingWatchdog.Token;
_ = RunProcessingWatchdogAsync(state, sessionName, ct);
@@ -1559,7 +1572,11 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session
if (elapsed >= effectiveTimeout)
{
- var exceededMaxTime = totalProcessingSeconds >= WatchdogMaxProcessingTimeSeconds;
+ // Defensive: if ProcessingStartedAt is null while IsProcessing is true,
+ // something is already wrong — treat as exceeded max time so Case A
+ // can't reset the timer indefinitely.
+ var exceededMaxTime = !startedAt.HasValue
+ || totalProcessingSeconds >= WatchdogMaxProcessingTimeSeconds;
// Before killing, check what state we're actually in:
//
@@ -1586,13 +1603,28 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session
var serverAlive = _serverManager.IsServerRunning;
if (serverAlive)
{
- Debug($"[WATCHDOG] '{sessionName}' {elapsed:F0}s inactivity but tool is running and server is alive — resetting timer " +
- $"(timeout={effectiveTimeout}s, totalProcessing={totalProcessingSeconds:F0}s)");
- Interlocked.Exchange(ref state.LastEventAtTicks, DateTime.UtcNow.Ticks);
- continue; // keep waiting — don't kill
+ var resets = Interlocked.Increment(ref state.WatchdogCaseAResets);
+ if (resets > WatchdogMaxToolAliveResets)
+ {
+ // Too many consecutive resets with no real SDK events — the
+ // session's JSON-RPC connection is likely dead even though the
+ // shared persistent server is still alive. Fall through to kill.
+ Debug($"[WATCHDOG] '{sessionName}' Case A reset cap exceeded ({resets}/{WatchdogMaxToolAliveResets}) " +
+ $"— killing despite server alive (elapsed={elapsed:F0}s, totalProcessing={totalProcessingSeconds:F0}s)");
+ }
+ else
+ {
+ Debug($"[WATCHDOG] '{sessionName}' {elapsed:F0}s inactivity but tool is running and server is alive — resetting timer " +
+ $"(reset #{resets}/{WatchdogMaxToolAliveResets}, timeout={effectiveTimeout}s, totalProcessing={totalProcessingSeconds:F0}s)");
+ Interlocked.Exchange(ref state.LastEventAtTicks, DateTime.UtcNow.Ticks);
+ continue; // keep waiting — don't kill
+ }
+ }
+ else
+ {
+ Debug($"[WATCHDOG] '{sessionName}' tool running but server is not responding — killing stuck session " +
+ $"(elapsed={elapsed:F0}s, totalProcessing={totalProcessingSeconds:F0}s)");
}
- Debug($"[WATCHDOG] '{sessionName}' tool running but server is not responding — killing stuck session " +
- $"(elapsed={elapsed:F0}s, totalProcessing={totalProcessingSeconds:F0}s)");
}
else if (!hasActiveTool && (hasUsedTools || (isMultiAgentSession && !IsDemoMode && !IsRemoteMode && _serverManager.IsServerRunning)))
{
diff --git a/PolyPilot/Services/CopilotService.cs b/PolyPilot/Services/CopilotService.cs
index a3ca55bfd8..1ed3b24b80 100644
--- a/PolyPilot/Services/CopilotService.cs
+++ b/PolyPilot/Services/CopilotService.cs
@@ -408,6 +408,11 @@ private class SessionState
/// Cleared on CompleteResponse and SendPromptAsync.
///
public ConcurrentDictionary PendingReasoningMessages { get; } = new();
+ /// Number of consecutive times the watchdog's Case A (tool active + server alive)
+ /// has reset the inactivity timer without any real SDK events arriving. Capped by
+ /// WatchdogMaxToolAliveResets to prevent infinite resets when the session's JSON-RPC
+ /// connection is dead but the shared persistent server is still alive.
+ public int WatchdogCaseAResets;
}
private void Debug(string message)
@@ -2600,8 +2605,14 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis
// orphaned old state can't pass generation checks on the new state.
Interlocked.Exchange(ref newState.ProcessingGeneration,
Interlocked.Read(ref state.ProcessingGeneration));
- newState.HasUsedToolsThisTurn = state.HasUsedToolsThisTurn;
- Interlocked.Exchange(ref newState.SuccessfulToolCountThisTurn, Volatile.Read(ref state.SuccessfulToolCountThisTurn));
+ // Reset tool tracking for the NEW connection. The old connection's
+ // tool state is stale — no tools have run on this connection yet.
+ // Without this, HasUsedToolsThisTurn=true from the dead connection
+ // inflates the watchdog timeout from 120s to 600s, making stuck
+ // sessions wait 5x longer than necessary to recover.
+ newState.HasUsedToolsThisTurn = false;
+ Interlocked.Exchange(ref newState.ActiveToolCallCount, 0);
+ Interlocked.Exchange(ref newState.SuccessfulToolCountThisTurn, 0);
newState.IsMultiAgentSession = state.IsMultiAgentSession;
newSession.On(evt => HandleSessionEvent(newState, evt));
_sessions[sessionName] = newState;
@@ -2613,6 +2624,11 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis
// replayed IDLE from clearing IsProcessing before the retry completes.
Interlocked.Increment(ref state.ProcessingGeneration);
state.Info.IsProcessing = true;
+ // Reset ProcessingStartedAt so the watchdog's max-time safety net
+ // measures from the reconnect, not the original send. Without this,
+ // the 60-min absolute max is measured from the first attempt and
+ // the reconnected session inherits a stale deadline.
+ state.Info.ProcessingStartedAt = DateTime.UtcNow;
state.CurrentResponse.Clear();
state.FlushedResponse.Clear();
state.PendingReasoningMessages.Clear();