From 0e198296925b3e3bca45e00e33f8f96e17ed0c91 Mon Sep 17 00:00:00 2001 From: Shane Neuville Date: Wed, 18 Feb 2026 12:34:44 -0600 Subject: [PATCH 1/4] Add diagnostic logging for stuck-session root cause analysis MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Log critical lifecycle events to ~/.polypilot/event-diagnostics.log: - [EVT] SessionIdleEvent/TurnEnd/Error with IsProcessing state and thread ID - [EVT-WARN] Events delivered to orphaned (replaced) state objects - [EVT-ERR] Exceptions in SyncContext.Post callbacks and CompleteReasoningMessages - [IDLE] CompleteResponse dispatch confirmation on UI thread - [COMPLETE] CompleteResponse entry/skip with IsProcessing guard result - [SEND] IsProcessing=true set point with thread ID - [RECONNECT] Old/new state swap with session IDs - [UI-ERR] InvokeOnUI callback failures Also wraps SyncContext.Post callbacks in try-catch to prevent silent exception swallowing — the most likely root cause of stuck sessions. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PolyPilot/Services/CopilotService.Events.cs | 50 +++++++++++++++++++-- PolyPilot/Services/CopilotService.cs | 29 +++++++++++- 2 files changed, 74 insertions(+), 5 deletions(-) diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs index c90d217a46..57695a8c7f 100644 --- a/PolyPilot/Services/CopilotService.Events.cs +++ b/PolyPilot/Services/CopilotService.Events.cs @@ -199,10 +199,36 @@ private void HandleSessionEvent(SessionState state, SessionEvent evt) state.HasReceivedEventsSinceResume = true; Interlocked.Exchange(ref state.LastEventAtTicks, DateTime.UtcNow.Ticks); var sessionName = state.Info.Name; + var isCurrentState = _sessions.TryGetValue(sessionName, out var current) && ReferenceEquals(current, state); + + // Log critical lifecycle events and detect orphaned handlers + if (evt is SessionIdleEvent or AssistantTurnEndEvent or SessionErrorEvent) + { + Debug($"[EVT] '{sessionName}' received {evt.GetType().Name} " + + $"(IsProcessing={state.Info.IsProcessing}, isCurrentState={isCurrentState}, " + + $"thread={Environment.CurrentManagedThreadId})"); + } + + // Warn if receiving events on an orphaned (replaced) state object + if (!isCurrentState) + { + Debug($"[EVT-WARN] '{sessionName}' event {evt.GetType().Name} delivered to ORPHANED state " + + $"(not in _sessions). This handler should have been detached."); + } + void Invoke(Action action) { if (_syncContext != null) - _syncContext.Post(_ => action(), null); + { + _syncContext.Post(_ => + { + try { action(); } + catch (Exception ex) + { + Debug($"[EVT-ERR] '{sessionName}' SyncContext.Post callback threw: {ex.GetType().Name}: {ex.Message}"); + } + }, null); + } else action(); } @@ -333,8 +359,17 @@ void Invoke(Action action) break; case SessionIdleEvent: - CompleteReasoningMessages(state, sessionName); - Invoke(() => CompleteResponse(state)); + try { CompleteReasoningMessages(state, sessionName); } + catch (Exception ex) + { + Debug($"[EVT-ERR] '{sessionName}' CompleteReasoningMessages threw before CompleteResponse: {ex.Message}"); + } + Invoke(() => + { + Debug($"[IDLE] '{sessionName}' CompleteResponse dispatched on UI thread " + + $"(IsProcessing={state.Info.IsProcessing}, thread={Environment.CurrentManagedThreadId})"); + CompleteResponse(state); + }); // Refresh git branch — agent may have switched branches state.Info.GitBranch = GetGitBranch(state.Info.WorkingDirectory); // Send notification when agent finishes @@ -556,7 +591,14 @@ private void FlushCurrentResponse(SessionState state) private void CompleteResponse(SessionState state) { - if (!state.Info.IsProcessing) return; // Already completed (e.g. timeout) + if (!state.Info.IsProcessing) + { + Debug($"[COMPLETE] '{state.Info.Name}' CompleteResponse skipped — IsProcessing already false"); + return; // Already completed (e.g. timeout) + } + + Debug($"[COMPLETE] '{state.Info.Name}' CompleteResponse executing " + + $"(responseLen={state.CurrentResponse.Length}, thread={Environment.CurrentManagedThreadId})"); CancelProcessingWatchdog(state); var response = state.CurrentResponse.ToString(); diff --git a/PolyPilot/Services/CopilotService.cs b/PolyPilot/Services/CopilotService.cs index 5fcf68250d..bddfc71223 100644 --- a/PolyPilot/Services/CopilotService.cs +++ b/PolyPilot/Services/CopilotService.cs @@ -210,12 +210,36 @@ private void Debug(string message) LastDebugMessage = message; Console.WriteLine($"[DEBUG] {message}"); OnDebug?.Invoke(message); + + // Persist lifecycle diagnostics to file for post-mortem analysis (DEBUG builds only) +#if DEBUG + if (message.StartsWith("[EVT") || message.StartsWith("[IDLE") || + message.StartsWith("[COMPLETE") || message.StartsWith("[SEND") || + message.StartsWith("[RECONNECT") || message.StartsWith("[UI-ERR") || + message.Contains("watchdog")) + { + try + { + var logPath = Path.Combine(PolyPilotBaseDir, "event-diagnostics.log"); + File.AppendAllText(logPath, + $"{DateTime.UtcNow:yyyy-MM-dd HH:mm:ss.fff} {message}{Environment.NewLine}"); + } + catch { /* Don't let logging failures cascade */ } + } +#endif } private void InvokeOnUI(Action action) { if (_syncContext != null) - _syncContext.Post(_ => action(), null); + _syncContext.Post(_ => + { + try { action(); } + catch (Exception ex) + { + Debug($"[UI-ERR] InvokeOnUI callback threw: {ex.GetType().Name}: {ex.Message}"); + } + }, null); else action(); } @@ -1398,6 +1422,7 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis throw new InvalidOperationException("Session is already processing a request."); state.Info.IsProcessing = true; + Debug($"[SEND] '{sessionName}' IsProcessing=true (thread={Environment.CurrentManagedThreadId})"); state.ResponseCompletion = new TaskCompletionSource(); state.CurrentResponse.Clear(); StartProcessingWatchdog(state, sessionName); @@ -1464,6 +1489,8 @@ public async Task SendPromptAsync(string sessionName, string prompt, Lis var newSession = await _client.ResumeSessionAsync(state.Info.SessionId, reconnectConfig, cancellationToken); // Cancel old watchdog BEFORE creating new state — they share Info/TCS CancelProcessingWatchdog(state); + Debug($"[RECONNECT] '{sessionName}' replacing state (old handler will be orphaned, " + + $"old session disposed, new session={newSession.SessionId})"); var newState = new SessionState { Session = newSession, From 9c470c870e7f7c208d3526bf916b288bbacf636e Mon Sep 17 00:00:00 2001 From: Shane Neuville Date: Wed, 18 Feb 2026 13:24:03 -0600 Subject: [PATCH 2/4] Address 5-model review findings: thread-safe logging, full stack traces, symmetric try-catch Fixes from GPT-5.3, Sonnet 4.6, Opus 4.6, Gemini, GPT-5 code reviews: - Add try-catch to 'else' branches of Invoke() and InvokeOnUI() (asymmetric bug) - Use ex.ToString() instead of ex.Message for full stack traces in diagnostics - Add lock(_diagnosticLogLock) around File.AppendAllText for thread safety - Add 10MB log rotation to prevent unbounded event-diagnostics.log growth - Wrap CompleteReasoningMessages in try-catch for AssistantTurnEndEvent (was only protected in SessionIdleEvent) - Fix 'UI thread' misnomer in IDLE log message - Add comment explaining why orphaned handlers are not early-returned Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PolyPilot/Services/CopilotService.Events.cs | 28 +++++++++++++++------ PolyPilot/Services/CopilotService.cs | 22 +++++++++++++--- 2 files changed, 39 insertions(+), 11 deletions(-) diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs index 57695a8c7f..fe722674d4 100644 --- a/PolyPilot/Services/CopilotService.Events.cs +++ b/PolyPilot/Services/CopilotService.Events.cs @@ -209,7 +209,10 @@ private void HandleSessionEvent(SessionState state, SessionEvent evt) $"thread={Environment.CurrentManagedThreadId})"); } - // Warn if receiving events on an orphaned (replaced) state object + // Warn if receiving events on an orphaned (replaced) state object. + // We don't early-return here: both old and new SessionState share the same Info object + // (reconnect copies Info to newState), so CompleteResponse on the orphaned state still + // correctly clears IsProcessing on the live session's shared Info. if (!isCurrentState) { Debug($"[EVT-WARN] '{sessionName}' event {evt.GetType().Name} delivered to ORPHANED state " + @@ -225,12 +228,18 @@ void Invoke(Action action) try { action(); } catch (Exception ex) { - Debug($"[EVT-ERR] '{sessionName}' SyncContext.Post callback threw: {ex.GetType().Name}: {ex.Message}"); + Debug($"[EVT-ERR] '{sessionName}' SyncContext.Post callback threw: {ex}"); } }, null); } else - action(); + { + try { action(); } + catch (Exception ex) + { + Debug($"[EVT-ERR] '{sessionName}' inline callback threw: {ex}"); + } + } } switch (evt) @@ -350,7 +359,11 @@ void Invoke(Action action) break; case AssistantTurnEndEvent: - CompleteReasoningMessages(state, sessionName); + try { CompleteReasoningMessages(state, sessionName); } + catch (Exception ex) + { + Debug($"[EVT-ERR] '{sessionName}' CompleteReasoningMessages threw in TurnEnd: {ex}"); + } Invoke(() => { OnTurnEnd?.Invoke(sessionName); @@ -362,12 +375,13 @@ void Invoke(Action action) try { CompleteReasoningMessages(state, sessionName); } catch (Exception ex) { - Debug($"[EVT-ERR] '{sessionName}' CompleteReasoningMessages threw before CompleteResponse: {ex.Message}"); + Debug($"[EVT-ERR] '{sessionName}' CompleteReasoningMessages threw before CompleteResponse: {ex}"); } Invoke(() => { - Debug($"[IDLE] '{sessionName}' CompleteResponse dispatched on UI thread " + - $"(IsProcessing={state.Info.IsProcessing}, thread={Environment.CurrentManagedThreadId})"); + Debug($"[IDLE] '{sessionName}' CompleteResponse dispatched " + + $"(syncCtx={(_syncContext != null ? "UI" : "inline")}, " + + $"IsProcessing={state.Info.IsProcessing}, thread={Environment.CurrentManagedThreadId})"); CompleteResponse(state); }); // Refresh git branch — agent may have switched branches diff --git a/PolyPilot/Services/CopilotService.cs b/PolyPilot/Services/CopilotService.cs index bddfc71223..bfaf09558e 100644 --- a/PolyPilot/Services/CopilotService.cs +++ b/PolyPilot/Services/CopilotService.cs @@ -19,6 +19,7 @@ public partial class CopilotService : IAsyncDisposable private readonly ConcurrentDictionary _closedSessionIds = new(); // Image paths queued alongside messages when session is busy (keyed by session name, list per queued message) private readonly ConcurrentDictionary>> _queuedImagePaths = new(); + private static readonly object _diagnosticLogLock = new(); private readonly IChatDatabase _chatDb; private readonly IServerManager _serverManager; private readonly IWsBridgeClient _bridgeClient; @@ -221,8 +222,15 @@ private void Debug(string message) try { var logPath = Path.Combine(PolyPilotBaseDir, "event-diagnostics.log"); - File.AppendAllText(logPath, - $"{DateTime.UtcNow:yyyy-MM-dd HH:mm:ss.fff} {message}{Environment.NewLine}"); + // Rotate at 10 MB to prevent unbounded growth + var fi = new FileInfo(logPath); + if (fi.Exists && fi.Length > 10 * 1024 * 1024) + try { File.Delete(logPath); } catch { } + lock (_diagnosticLogLock) + { + File.AppendAllText(logPath, + $"{DateTime.UtcNow:yyyy-MM-dd HH:mm:ss.fff} {message}{Environment.NewLine}"); + } } catch { /* Don't let logging failures cascade */ } } @@ -237,11 +245,17 @@ private void InvokeOnUI(Action action) try { action(); } catch (Exception ex) { - Debug($"[UI-ERR] InvokeOnUI callback threw: {ex.GetType().Name}: {ex.Message}"); + Debug($"[UI-ERR] InvokeOnUI callback threw: {ex}"); } }, null); else - action(); + { + try { action(); } + catch (Exception ex) + { + Debug($"[UI-ERR] InvokeOnUI inline callback threw: {ex}"); + } + } } public async Task InitializeAsync(CancellationToken cancellationToken = default) From 3c3035b96239b2c8e247565c06577586d908e71d Mon Sep 17 00:00:00 2001 From: Shane Neuville Date: Wed, 18 Feb 2026 13:32:27 -0600 Subject: [PATCH 3/4] Fix TOCTOU race in diagnostic log rotation Move FileInfo size check inside the lock to prevent race where multiple threads check size, all pass, and delete concurrently. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PolyPilot/Services/CopilotService.cs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/PolyPilot/Services/CopilotService.cs b/PolyPilot/Services/CopilotService.cs index bfaf09558e..bd8a7e4d8c 100644 --- a/PolyPilot/Services/CopilotService.cs +++ b/PolyPilot/Services/CopilotService.cs @@ -221,13 +221,13 @@ private void Debug(string message) { try { - var logPath = Path.Combine(PolyPilotBaseDir, "event-diagnostics.log"); - // Rotate at 10 MB to prevent unbounded growth - var fi = new FileInfo(logPath); - if (fi.Exists && fi.Length > 10 * 1024 * 1024) - try { File.Delete(logPath); } catch { } lock (_diagnosticLogLock) { + var logPath = Path.Combine(PolyPilotBaseDir, "event-diagnostics.log"); + // Rotate at 10 MB to prevent unbounded growth + var fi = new FileInfo(logPath); + if (fi.Exists && fi.Length > 10 * 1024 * 1024) + try { File.Delete(logPath); } catch { } File.AppendAllText(logPath, $"{DateTime.UtcNow:yyyy-MM-dd HH:mm:ss.fff} {message}{Environment.NewLine}"); } From 3afc71f19d1ee60eef33d9ab45f1057a18d20f1f Mon Sep 17 00:00:00 2001 From: Shane Neuville Date: Wed, 18 Feb 2026 15:10:56 -0600 Subject: [PATCH 4/4] Fix UI not updating after session completes (throttle race) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CompleteResponse fires OnStateChanged followed by OnSessionComplete, the RefreshState handler may be throttled (500ms window) causing the sessions list to not be refreshed. HandleComplete then calls ScheduleRender but renders stale data — the session still appears as 'Thinking'. Fix: refresh sessions list in HandleComplete so the scheduled render always has up-to-date IsProcessing state. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- PolyPilot/Components/Pages/Dashboard.razor | 3 +++ 1 file changed, 3 insertions(+) diff --git a/PolyPilot/Components/Pages/Dashboard.razor b/PolyPilot/Components/Pages/Dashboard.razor index 27392189ea..7a4da087a1 100644 --- a/PolyPilot/Components/Pages/Dashboard.razor +++ b/PolyPilot/Components/Pages/Dashboard.razor @@ -820,6 +820,9 @@ completedSessions.Add(sessionName); streamingBySession.Remove(sessionName); activityBySession.Remove(sessionName); + // Refresh sessions list immediately — RefreshState may have been throttled + // when OnStateChanged fired just before OnSessionComplete + sessions = CopilotService.GetAllSessions().ToList(); ScheduleRender(); _ = Task.Delay(10000).ContinueWith(_ => {