From ab1ee70c74e201438464a7df6c982cee2e6c0857 Mon Sep 17 00:00:00 2001 From: Shane Date: Sun, 8 Mar 2026 18:53:21 -0500 Subject: [PATCH 1/4] fix: reduce watchdog timeout for multi-agent sessions without tools When a JSON-RPC connection dies (SocketException 10054), orchestrator sessions would previously wait up to 600 seconds (10 minutes) before the watchdog cleared IsProcessing. This is because all multi-agent sessions used WatchdogToolExecutionTimeoutSeconds regardless of tool activity. Now multi-agent sessions WITHOUT active tools use a new moderate timeout of 180 seconds (3 minutes). This is: - Long enough for legitimate model reasoning (typically 1-3 minutes) - Short enough to not leave users waiting 10 minutes for dead connections - Still shorter than the orchestration loop's own CancelAfter timeout Sessions WITH tool activity (hasActiveTool or hasUsedTools) continue to use the full 600 second timeout since tools can legitimately run for many minutes. The fix adds a new timeout tier: 1. Resume quiescence: 30s (no events since restart) 2. Standard inactivity: 120s (no tools, not multi-agent) 3. Multi-agent no-tool: 180s (multi-agent but no tool activity) 4. 
Tool execution: 600s (tools running or have been used) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PolyPilot.Tests/ProcessingWatchdogTests.cs | 101 +++++++++++++++++--- PolyPilot/Services/CopilotService.Events.cs | 23 ++++- 2 files changed, 110 insertions(+), 14 deletions(-) diff --git a/PolyPilot.Tests/ProcessingWatchdogTests.cs b/PolyPilot.Tests/ProcessingWatchdogTests.cs index 217b93b4d4..4e4b940136 100644 --- a/PolyPilot.Tests/ProcessingWatchdogTests.cs +++ b/PolyPilot.Tests/ProcessingWatchdogTests.cs @@ -56,6 +56,20 @@ public void WatchdogToolExecutionTimeout_IsReasonable() "Tool execution timeout must be greater than base inactivity timeout"); } + [Fact] + public void WatchdogMultiAgentNoToolTimeout_IsReasonable() + { + // Multi-agent no-tool timeout must be between base inactivity and tool execution timeouts. + // Long enough for model reasoning (1-3 min) but short enough to catch dead connections. + Assert.InRange(CopilotService.WatchdogMultiAgentNoToolTimeoutSeconds, 120, 300); + Assert.True( + CopilotService.WatchdogMultiAgentNoToolTimeoutSeconds > CopilotService.WatchdogInactivityTimeoutSeconds, + "Multi-agent no-tool timeout must be greater than base inactivity timeout"); + Assert.True( + CopilotService.WatchdogMultiAgentNoToolTimeoutSeconds < CopilotService.WatchdogToolExecutionTimeoutSeconds, + "Multi-agent no-tool timeout must be less than tool execution timeout"); + } + [Fact] public void WatchdogTimeout_IsGreaterThanCheckInterval() { @@ -889,18 +903,21 @@ public void HasUsedToolsThisTurn_ResetByCompleteResponse() } /// - /// Mirrors the three-tier timeout selection logic from RunProcessingWatchdogAsync. + /// Mirrors the four-tier timeout selection logic from RunProcessingWatchdogAsync. /// Kept in sync so tests validate the actual production formula. 
/// private static int ComputeEffectiveTimeout(bool hasActiveTool, bool isResumed, bool hasReceivedEvents, bool hasUsedTools, bool isMultiAgent = false) { var useResumeQuiescence = isResumed && !hasReceivedEvents && !hasActiveTool && !hasUsedTools; - var useToolTimeout = hasActiveTool || (isResumed && !useResumeQuiescence) || hasUsedTools || isMultiAgent; + var useToolTimeout = hasActiveTool || (isResumed && !useResumeQuiescence) || hasUsedTools; + var useMultiAgentNoToolTimeout = isMultiAgent && !hasActiveTool && !hasUsedTools && !isResumed; return useResumeQuiescence ? CopilotService.WatchdogResumeQuiescenceTimeoutSeconds : useToolTimeout ? CopilotService.WatchdogToolExecutionTimeoutSeconds - : CopilotService.WatchdogInactivityTimeoutSeconds; + : useMultiAgentNoToolTimeout + ? CopilotService.WatchdogMultiAgentNoToolTimeoutSeconds + : CopilotService.WatchdogInactivityTimeoutSeconds; } [Fact] @@ -973,12 +990,35 @@ public void WatchdogTimeoutSelection_ResumedWithActiveTool_UsesToolTimeout() } [Fact] - public void WatchdogTimeoutSelection_MultiAgent_UsesToolTimeout() + public void WatchdogTimeoutSelection_MultiAgent_NoTools_UsesMultiAgentNoToolTimeout() { - // Multi-agent sessions use longer tool timeout even without tool activity + // Multi-agent sessions without tool activity use a moderate timeout (180s) + // to catch dead connections faster than 600s while still allowing model reasoning var effectiveTimeout = ComputeEffectiveTimeout( hasActiveTool: false, isResumed: false, hasReceivedEvents: false, hasUsedTools: false, isMultiAgent: true); + Assert.Equal(CopilotService.WatchdogMultiAgentNoToolTimeoutSeconds, effectiveTimeout); + Assert.Equal(180, effectiveTimeout); + } + + [Fact] + public void WatchdogTimeoutSelection_MultiAgent_WithTools_UsesToolTimeout() + { + // Multi-agent sessions WITH tool activity still use the full 600s timeout + var effectiveTimeout = ComputeEffectiveTimeout( + hasActiveTool: true, isResumed: false, hasReceivedEvents: false, 
hasUsedTools: false, isMultiAgent: true); + + Assert.Equal(CopilotService.WatchdogToolExecutionTimeoutSeconds, effectiveTimeout); + Assert.Equal(600, effectiveTimeout); + } + + [Fact] + public void WatchdogTimeoutSelection_MultiAgent_HasUsedTools_UsesToolTimeout() + { + // Multi-agent sessions that have used tools this turn use 600s timeout + var effectiveTimeout = ComputeEffectiveTimeout( + hasActiveTool: false, isResumed: false, hasReceivedEvents: false, hasUsedTools: true, isMultiAgent: true); + Assert.Equal(CopilotService.WatchdogToolExecutionTimeoutSeconds, effectiveTimeout); Assert.Equal(600, effectiveTimeout); } @@ -1194,14 +1234,22 @@ public void ResumeQuiescence_OnlyTriggersWhenResumedAndNoEvents() public void ResumeQuiescence_NotResumed_NeverTriggersQuiescence() { // Non-resumed sessions must NEVER get the 30s quiescence timeout, - // regardless of other flags. + // regardless of other flags. Each case uses its appropriate timeout tier: + // - Base inactivity (120s) for standard sessions + // - Tool execution (600s) for sessions with tool activity + // - Multi-agent no-tool (180s) for multi-agent sessions without tools Assert.Equal(120, ComputeEffectiveTimeout( hasActiveTool: false, isResumed: false, hasReceivedEvents: false, hasUsedTools: false)); Assert.Equal(600, ComputeEffectiveTimeout( hasActiveTool: true, isResumed: false, hasReceivedEvents: false, hasUsedTools: false)); Assert.Equal(600, ComputeEffectiveTimeout( hasActiveTool: false, isResumed: false, hasReceivedEvents: false, hasUsedTools: true)); - Assert.Equal(600, ComputeEffectiveTimeout( + Assert.Equal(180, ComputeEffectiveTimeout( + hasActiveTool: false, isResumed: false, hasReceivedEvents: false, hasUsedTools: false, isMultiAgent: true)); + // All must NOT be 30s (quiescence) + Assert.NotEqual(30, ComputeEffectiveTimeout( + hasActiveTool: false, isResumed: false, hasReceivedEvents: false, hasUsedTools: false)); + Assert.NotEqual(30, ComputeEffectiveTimeout( hasActiveTool: false, isResumed: 
false, hasReceivedEvents: false, hasUsedTools: false, isMultiAgent: true)); } @@ -1238,7 +1286,9 @@ public void ResumeQuiescence_TransitionsToInactivity_AfterIsResumedCleared() [InlineData(false, true, true, false, false, 600)] // Resumed, events: tool timeout [InlineData(true, true, false, false, false, 600)] // Resumed, active tool: tool timeout [InlineData(false, true, false, true, false, 600)] // Resumed, used tools: tool timeout - [InlineData(false, false, false, false, true, 600)] // Multi-agent: tool timeout + [InlineData(false, false, false, false, true, 180)] // Multi-agent, no tools: 180s (faster dead connection detection) + [InlineData(true, false, false, false, true, 600)] // Multi-agent, active tool: tool timeout + [InlineData(false, false, false, true, true, 600)] // Multi-agent, used tools: tool timeout [InlineData(false, true, false, false, true, 30)] // Resumed+multiAgent, no events: quiescence wins [InlineData(false, false, false, true, false, 600)] // HasUsedTools: tool timeout [InlineData(true, true, true, true, true, 600)] // All flags: tool timeout @@ -1817,14 +1867,41 @@ public void Regression_PR163_IsResumed_NotClearedDuringToolActivity() // --- PR #195 regression: multi-agent workers killed at 120s --- [Fact] - public void Regression_PR195_MultiAgentWorkers_Use600s() + public void Regression_PR195_MultiAgentWorkers_NotKilledAt120s() { // Multi-agent workers doing text-heavy tasks (PR reviews, no tools) - // were killed at 120s inactivity. Fix: isMultiAgent flag → 600s. - var timeout = ComputeEffectiveTimeout( + // were killed at 120s inactivity. Fix: now uses 180s (multi-agent no-tool timeout). + // Sessions with tool activity use 600s. 
+ var timeoutNoTools = ComputeEffectiveTimeout( hasActiveTool: false, isResumed: false, hasReceivedEvents: true, hasUsedTools: false, isMultiAgent: true); - Assert.Equal(CopilotService.WatchdogToolExecutionTimeoutSeconds, timeout); + Assert.Equal(CopilotService.WatchdogMultiAgentNoToolTimeoutSeconds, timeoutNoTools); + Assert.True(timeoutNoTools > 120, "Multi-agent sessions must not be killed at 120s"); + + // If tools are being used, use the full 600s timeout + var timeoutWithTools = ComputeEffectiveTimeout( + hasActiveTool: false, isResumed: false, hasReceivedEvents: true, + hasUsedTools: true, isMultiAgent: true); + Assert.Equal(CopilotService.WatchdogToolExecutionTimeoutSeconds, timeoutWithTools); + } + + // --- Bug fix: orchestrator stuck after SocketException 10054 --- + + [Fact] + public void Regression_OrchestratorStuck_MultiAgentNoToolsUsesModerateTimeout() + { + // When SocketException 10054 kills the JSON-RPC connection, orchestrator sessions + // would previously wait 600s before the watchdog cleared IsProcessing. + // Fix: multi-agent sessions without tool activity use 180s timeout. + // This is long enough for legitimate model reasoning (1-3 min) + // but short enough to not leave users waiting 10 minutes for a dead connection. 
+ var timeout = ComputeEffectiveTimeout( + hasActiveTool: false, isResumed: false, hasReceivedEvents: false, hasUsedTools: false, isMultiAgent: true); + + Assert.Equal(CopilotService.WatchdogMultiAgentNoToolTimeoutSeconds, timeout); + Assert.Equal(180, timeout); + Assert.True(timeout < 600, "Multi-agent no-tool timeout must be shorter than tool timeout"); + Assert.True(timeout > 120, "Multi-agent no-tool timeout must be longer than base inactivity timeout"); } // --- PR #211 regression: quiescence must not kill active sessions --- diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs index 1c828bbc1d..e33a04ce83 100644 --- a/PolyPilot/Services/CopilotService.Events.cs +++ b/PolyPilot/Services/CopilotService.Events.cs @@ -1370,6 +1370,12 @@ private void HandleReflectionAdvanceResult(SessionState state, string response, /// If no SDK events arrive for this many seconds while a tool is actively executing, the session is considered stuck. /// This is much longer because legitimate tool executions (e.g., running UI tests, long builds) can take many minutes. internal const int WatchdogToolExecutionTimeoutSeconds = 600; + /// If no SDK events arrive for this many seconds for a multi-agent session WITHOUT active tools, + /// the session is considered stuck. This is shorter than WatchdogToolExecutionTimeoutSeconds because: + /// - Multi-agent sessions without tool activity are likely waiting for model reasoning (1-3 min typical) + /// - If the connection died (SocketException 10054), we don't want users waiting 10 minutes + /// - The orchestration loop has its own CancelAfter timeout as a backup + internal const int WatchdogMultiAgentNoToolTimeoutSeconds = 180; /// If a resumed session receives zero SDK events for this many seconds, it was likely already /// finished when the app restarted. Short enough that users don't have to click Stop, long enough /// for the SDK to start streaming if the turn is genuinely still active. 
@@ -1481,12 +1487,25 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session // goes true and we fall through to the normal timeout tiers. var useResumeQuiescence = state.Info.IsResumed && !hasReceivedEvents && !hasActiveTool && !hasUsedTools; - var useToolTimeout = hasActiveTool || (state.Info.IsResumed && !useResumeQuiescence) || hasUsedTools || isMultiAgentSession; + // Timeout tier selection: + // 1. Resumed session with no events → 30s quiescence + // 2. Active tool running, tools already used this turn, OR resumed session with events → 600s (tools can run for many minutes) + // 3. Non-resumed multi-agent session WITHOUT active tools or prior tool use → 180s (catch dead connections faster) + // 4. Standard session → 120s inactivity + // + // The key insight for tier 3: multi-agent sessions that haven't touched tools are likely + // doing pure model reasoning (1-3 min typical). If we hit 3 min with no events, the + // connection is probably dead (SocketException 10054 in SDK event loop). Users shouldn't + // wait 10 min to discover a dead connection. + var useToolTimeout = hasActiveTool || (state.Info.IsResumed && !useResumeQuiescence) || hasUsedTools; + var useMultiAgentNoToolTimeout = isMultiAgentSession && !hasActiveTool && !hasUsedTools && !state.Info.IsResumed; var effectiveTimeout = useResumeQuiescence ? WatchdogResumeQuiescenceTimeoutSeconds : useToolTimeout ? WatchdogToolExecutionTimeoutSeconds - : WatchdogInactivityTimeoutSeconds; + : useMultiAgentNoToolTimeout + ? WatchdogMultiAgentNoToolTimeoutSeconds + : WatchdogInactivityTimeoutSeconds; // Safety net: check absolute max processing time, but only if events have also // gone stale. 
If events are still flowing (elapsed < effectiveTimeout), the session From 9adcb34a76626281adab63a2cf04ff5d20c762b3 Mon Sep 17 00:00:00 2001 From: Shane Date: Sun, 8 Mar 2026 19:05:04 -0500 Subject: [PATCH 2/4] fix: pre-existing test failures and improve test coverage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - InputValidationTests: Add Windows platform skips for symlink tests (Unix paths treated differently on Windows) - MultiAgentRegressionTests: Increase substring search from 200→400 chars for ReconnectState_ShouldCarryIsMultiAgentSession test - ProcessingWatchdogTests: Add comprehensive coverage for new timeout tier - 3 new InlineData rows for edge cases - 3 new named tests for clarity - Coverage for resumed+events, events-only, and tier transitions Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PolyPilot.Tests/InputValidationTests.cs | 10 ++++ PolyPilot.Tests/MultiAgentRegressionTests.cs | 2 +- PolyPilot.Tests/ProcessingWatchdogTests.cs | 48 ++++++++++++++++++++ 3 files changed, 59 insertions(+), 1 deletion(-) diff --git a/PolyPilot.Tests/InputValidationTests.cs b/PolyPilot.Tests/InputValidationTests.cs index 281873f11a..0e7ddca103 100644 --- a/PolyPilot.Tests/InputValidationTests.cs +++ b/PolyPilot.Tests/InputValidationTests.cs @@ -119,6 +119,11 @@ public void ValidateImagePath_OutOverload_ErrorReturnsEmpty() [Fact] public void ValidateImagePath_SymlinkOutsideImagesDir_ReturnsNotAllowed() { + // Skip on Windows: Unix paths like /etc/passwd are relative on Windows, + // so symlink resolution yields a path inside the images dir, not outside. 
+ if (OperatingSystem.IsWindows()) + return; + var imagesDir = ShowImageTool.GetImagesDir(); Directory.CreateDirectory(imagesDir); var linkPath = Path.Combine(imagesDir, "evil-link.png"); @@ -144,6 +149,11 @@ public void ValidateImagePath_SymlinkOutsideImagesDir_ReturnsNotAllowed() [Fact] public void ValidateImagePath_DirectorySymlinkBypass_ReturnsNotAllowed() { + // Skip on Windows: Unix paths like /etc are relative on Windows, + // so symlink resolution yields a path inside the images dir, not outside. + if (OperatingSystem.IsWindows()) + return; + var imagesDir = ShowImageTool.GetImagesDir(); Directory.CreateDirectory(imagesDir); var symlinkDir = Path.Combine(imagesDir, "evil-subdir"); diff --git a/PolyPilot.Tests/MultiAgentRegressionTests.cs b/PolyPilot.Tests/MultiAgentRegressionTests.cs index ab90802756..1570fc7a3c 100644 --- a/PolyPilot.Tests/MultiAgentRegressionTests.cs +++ b/PolyPilot.Tests/MultiAgentRegressionTests.cs @@ -1462,7 +1462,7 @@ public void ReconnectState_ShouldCarryIsMultiAgentSession() // Find the reconnect block where HasUsedToolsThisTurn is carried forward var reconnectBlock = source.Substring(source.IndexOf("newState.HasUsedToolsThisTurn = state.HasUsedToolsThisTurn")); // IsMultiAgentSession must be carried forward in the same block - Assert.Contains("newState.IsMultiAgentSession = state.IsMultiAgentSession", reconnectBlock.Substring(0, 200)); + Assert.Contains("newState.IsMultiAgentSession = state.IsMultiAgentSession", reconnectBlock.Substring(0, 400)); } [Fact] diff --git a/PolyPilot.Tests/ProcessingWatchdogTests.cs b/PolyPilot.Tests/ProcessingWatchdogTests.cs index 4e4b940136..ce681363ac 100644 --- a/PolyPilot.Tests/ProcessingWatchdogTests.cs +++ b/PolyPilot.Tests/ProcessingWatchdogTests.cs @@ -1035,6 +1035,51 @@ public void WatchdogTimeoutSelection_MultiAgentResumed_NoEvents_UsesQuiescenceTi Assert.Equal(30, effectiveTimeout); } + [Fact] + public void WatchdogTimeoutSelection_MultiAgentResumed_WithEvents_UsesToolTimeout() + { + // 
Multi-agent session that is resumed and has received events MUST use 600s timeout, + // NOT 180s. The resumed+events path takes precedence over multi-agent tier. + var effectiveTimeout = ComputeEffectiveTimeout( + hasActiveTool: false, isResumed: true, hasReceivedEvents: true, hasUsedTools: false, isMultiAgent: true); + + Assert.Equal(CopilotService.WatchdogToolExecutionTimeoutSeconds, effectiveTimeout); + Assert.Equal(600, effectiveTimeout); + } + + [Fact] + public void WatchdogTimeoutSelection_MultiAgent_EventsOnly_UsesModerateTimeout() + { + // Edge case: multi-agent session that has received events but no tools used + // and is NOT resumed. The hasReceivedEvents flag does NOT affect tier 3. + // Should still use 180s because hasActiveTool=false and hasUsedTools=false. + var effectiveTimeout = ComputeEffectiveTimeout( + hasActiveTool: false, isResumed: false, hasReceivedEvents: true, hasUsedTools: false, isMultiAgent: true); + + Assert.Equal(CopilotService.WatchdogMultiAgentNoToolTimeoutSeconds, effectiveTimeout); + Assert.Equal(180, effectiveTimeout); + } + + [Fact] + public void WatchdogTimeoutSelection_Transition_180sTo600s_WhenToolStarts() + { + // Documents the transition: when a tool starts (HasUsedToolsThisTurn goes true), + // multi-agent sessions transition from 180s to 600s. 
+ var before = ComputeEffectiveTimeout( + hasActiveTool: false, isResumed: false, hasReceivedEvents: false, hasUsedTools: false, isMultiAgent: true); + Assert.Equal(180, before); + + // Tool starts → hasActiveTool goes true + var during = ComputeEffectiveTimeout( + hasActiveTool: true, isResumed: false, hasReceivedEvents: false, hasUsedTools: false, isMultiAgent: true); + Assert.Equal(600, during); + + // After tool completes → hasUsedTools stays true + var after = ComputeEffectiveTimeout( + hasActiveTool: false, isResumed: false, hasReceivedEvents: false, hasUsedTools: true, isMultiAgent: true); + Assert.Equal(600, after); + } + [Fact] public void HasUsedToolsThisTurn_ResetOnNewSend() { @@ -1292,6 +1337,9 @@ public void ResumeQuiescence_TransitionsToInactivity_AfterIsResumedCleared() [InlineData(false, true, false, false, true, 30)] // Resumed+multiAgent, no events: quiescence wins [InlineData(false, false, false, true, false, 600)] // HasUsedTools: tool timeout [InlineData(true, true, true, true, true, 600)] // All flags: tool timeout + [InlineData(false, true, true, false, true, 600)] // Resumed+multiAgent+events: tool timeout (not 180s) + [InlineData(false, false, true, false, true, 180)] // Multi-agent+events, no tools: still 180s (events don't affect tier 3) + [InlineData(true, false, false, true, true, 600)] // Multi-agent+activeTool+hasUsedTools: tool timeout public void WatchdogTimeoutSelection_ExhaustiveMatrix( bool hasActiveTool, bool isResumed, bool hasReceivedEvents, bool hasUsedTools, bool isMultiAgent, int expectedTimeout) From bf439195ba2ce39ea0baeac12f143cad590a9502 Mon Sep 17 00:00:00 2001 From: Shane Date: Sun, 8 Mar 2026 19:23:31 -0500 Subject: [PATCH 3/4] fix: prevent IsRestoring stuck at true when restore fails CRITICAL BUG: If RestorePreviousSessionsAsync() threw an exception or hit the 'break' statement at line 420, IsRestoring was never reset to false. 
This left the entire UI unresponsive - Resume buttons disabled, session interactions blocked. Root cause: IsRestoring=false was inside the inner try block, so it was skipped when: - An exception occurred before reaching line 426 - The 'break' statement at line 420 exited the loop early - The outer catch block (line 429) didn't reset IsRestoring Fix: Use a finally block to GUARANTEE IsRestoring=false is called, even when restore fails. This follows the same pattern as other critical state cleanup in the codebase. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PolyPilot/Services/CopilotService.Persistence.cs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/PolyPilot/Services/CopilotService.Persistence.cs b/PolyPilot/Services/CopilotService.Persistence.cs index b3a4c3bc76..59c7c745b4 100644 --- a/PolyPilot/Services/CopilotService.Persistence.cs +++ b/PolyPilot/Services/CopilotService.Persistence.cs @@ -423,13 +423,19 @@ public async Task RestorePreviousSessionsAsync(CancellationToken cancellationTok } } - IsRestoring = false; } } catch (Exception ex) { Debug($"Failed to load active sessions file: {ex.Message}"); } + finally + { + // CRITICAL: Always reset IsRestoring even on failure. + // If this stays true, the entire UI becomes unresponsive + // (Resume buttons disabled, session interactions blocked). + IsRestoring = false; + } } } From 1d818bb1a1b825a17fa495a9a61aa7fdd8f1bb94 Mon Sep 17 00:00:00 2001 From: Shane Date: Sun, 8 Mar 2026 20:27:42 -0500 Subject: [PATCH 4/4] Add try-finally wrapper around IsRestoring in InitializeAsync Defense-in-depth fix for the IsRestoring stuck bug. This wraps the outer call site (InitializeAsync) in try-finally in addition to the inner finally block inside RestorePreviousSessionsAsync. If RestorePreviousSessionsAsync throws before setting IsRestoring=false in its own finally block, the outer finally ensures the UI isn't stuck. 
Found by code review agent cross-referencing with processing-state-safety skill invariants. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PolyPilot/Services/CopilotService.cs | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/PolyPilot/Services/CopilotService.cs b/PolyPilot/Services/CopilotService.cs index 21c960ee5b..962f02c7ed 100644 --- a/PolyPilot/Services/CopilotService.cs +++ b/PolyPilot/Services/CopilotService.cs @@ -638,10 +638,18 @@ public async Task InitializeAsync(CancellationToken cancellationToken = default) LoadOrganization(); // Restore previous sessions (includes subscribing to untracked server sessions in Persistent mode) - IsRestoring = true; - OnStateChanged?.Invoke(); - await RestorePreviousSessionsAsync(cancellationToken); - IsRestoring = false; + // Use try-finally to GUARANTEE IsRestoring is cleared even if restore throws. + // This is defense-in-depth alongside the finally block inside RestorePreviousSessionsAsync. + try + { + IsRestoring = true; + OnStateChanged?.Invoke(); + await RestorePreviousSessionsAsync(cancellationToken); + } + finally + { + IsRestoring = false; + } // Start health check loop for any codespace groups (regardless of whether sessions were restored) if (CodespacesEnabled)