diff --git a/.claude/skills/processing-state-safety/SKILL.md b/.claude/skills/processing-state-safety/SKILL.md new file mode 100644 index 0000000000..7d8ad99456 --- /dev/null +++ b/.claude/skills/processing-state-safety/SKILL.md @@ -0,0 +1,75 @@ +--- +name: processing-state-safety +description: > + Safety guide for modifying IsProcessing, watchdog, session resume, or abort code paths in CopilotService. + Use when: (1) Modifying any code that sets IsProcessing to true or false, (2) Changing watchdog timeout + logic or adding new timeout paths, (3) Touching session resume/restore logic, (4) Modifying + AbortSessionAsync or CompleteResponse, (5) Adding new processing-related fields to AgentSessionInfo + or SessionState, (6) Debugging sessions stuck in "Thinking" state, (7) Reviewing PRs that touch + CopilotService.Events.cs, CopilotService.cs, or CopilotService.Utilities.cs processing paths. +--- + +# Processing State Safety Guide + +Modifying processing state code involves these steps: + +1. Identify which of the 7 cleanup paths you're touching +2. Apply the cleanup checklist to your change +3. Verify all 7 paths still satisfy the checklist +4. Ensure thread safety rules are followed + +If debugging a stuck session, see [references/regression-history.md](references/regression-history.md) +for the 7 common mistakes and full regression timeline across 7 PRs. + +## The Cleanup Checklist + +Every code path that sets `IsProcessing = false` MUST perform ALL of these: + +```csharp +FlushCurrentResponse(state); // BEFORE clearing — saves accumulated response +CancelProcessingWatchdog(state); +Interlocked.Exchange(ref state.ActiveToolCallCount, 0); +state.HasUsedToolsThisTurn = false; +state.Info.IsResumed = false; +state.Info.IsProcessing = false; +state.Info.ProcessingStartedAt = null; +state.Info.ToolCallCount = 0; +state.Info.ProcessingPhase = 0; +``` + +Skip any field not applicable to the path (e.g., remote mode has no `ActiveToolCallCount`). + +## The 7 Cleanup Paths + +| # | Path | Location | Notes | +|---|------|----------|-------| +| 1 | CompleteResponse | Events.cs ~L699 | Normal completion via SessionIdleEvent (saves response inline, not via FlushCurrentResponse) | +| 2 | SessionErrorEvent | Events.cs ~L517 | SDK error — wrapped in InvokeOnUI | +| 3 | Watchdog timeout | Events.cs ~L1192 | InvokeOnUI + generation guard | +| 4 | AbortSessionAsync (local) | CopilotService.cs ~L1681 | User clicks Stop | +| 5 | AbortSessionAsync (remote) | CopilotService.cs ~L1638 | Remote mode optimistic clear | +| 6 | SendAsync reconnect failure | CopilotService.cs ~L1600 | Reconnect+retry failed | +| 7 | SendAsync initial failure | CopilotService.cs ~L1613 | First send attempt failed | +| 8 | Bridge OnTurnEnd | Bridge.cs ~L127 | Remote mode normal turn completion — InvokeOnUI | + +**When adding a new field to AgentSessionInfo or SessionState**, add its reset to ALL 8 paths. +**When adding a new cleanup path**, copy the full checklist from an existing path (path 3 is the most complete). + +## Key Watchdog Rules + +- **Two timeout tiers**: 120s inactivity, 600s tool execution +- **600s triggers when**: `ActiveToolCallCount > 0` OR `IsResumed` OR `HasUsedToolsThisTurn` +- **Never add timeouts shorter than 120s** for resume — tool calls gap 30-60s between events +- **`ActiveToolCallCount` returns to 0 between tool rounds** — `AssistantTurnStartEvent` resets it to 0 (line ~365). Between rounds the model reasons about the next tool call, so `hasActiveTool` is 0 even though the session is actively working. Always check `HasUsedToolsThisTurn` too +- **IsResumed clearing** must guard on `!hasActiveTool && !HasUsedToolsThisTurn` +- **Staleness check**: `IsSessionStillProcessing` uses `File.GetLastWriteTimeUtc` >600s = idle + +## Thread Safety Rules + +- All `state.Info.*` mutations from background threads → `InvokeOnUI()` +- `HasUsedToolsThisTurn`, `HasReceivedEventsSinceResume` → `Volatile.Write`/`Read` +- `ActiveToolCallCount` → `Interlocked` operations only +- Capture `ProcessingGeneration` before `SyncContext.Post`, verify inside callback + +For detailed thread safety patterns and the full regression history, see +[references/regression-history.md](references/regression-history.md). diff --git a/.claude/skills/processing-state-safety/references/regression-history.md b/.claude/skills/processing-state-safety/references/regression-history.md new file mode 100644 index 0000000000..2eeb593862 --- /dev/null +++ b/.claude/skills/processing-state-safety/references/regression-history.md @@ -0,0 +1,67 @@ +# Regression History & Common Mistakes + +## 7 Mistakes That Keep Recurring + +### 1. Forgetting companion fields on error paths +**What happens**: Clear `IsProcessing` and `ProcessingPhase` but forget `IsResumed` or +`HasUsedToolsThisTurn`. Next turn inherits stale 600s timeout or stale tool state. +**PRs where this happened**: #148, #158, #164 + +### 2. Missing FlushCurrentResponse before clearing +**What happens**: Accumulated `CurrentResponse` (StringBuilder) content is silently lost. +User sees "Thinking..." then nothing — the partial response vanishes. +**PRs where this happened**: #158 + +### 3. Using ActiveToolCallCount alone as tool signal +**What happens**: `AssistantTurnStartEvent` resets `ActiveToolCallCount` to 0 between tool rounds +(line ~365 in Events.cs). Between rounds, the model reasons about the next tool call, so +`hasActiveTool` is 0 even though the session is actively working. +`HasUsedToolsThisTurn` persists across tool rounds and is the reliable signal. +**PRs where this happened**: #148, #163 + +### 4. Adding hardcoded short timeouts for resume +**What happens**: A 10s timeout kills sessions that are legitimately processing (tool calls +take 30-60s between events). The watchdog's tiered 120s/600s approach is the correct mechanism. +**PRs where this happened**: #148 + +### 5. Mutating state on background threads +**What happens**: SDK events arrive on worker threads. `IsProcessing` write on a background +thread races with Blazor rendering on the UI thread. Use `InvokeOnUI()` for all `state.Info.*` +mutations from background code. +**PRs where this happened**: #147, #148, #163 + +### 6. Clearing IsResumed without checking tool activity +**What happens**: After resume, the dedup path leaves `ActiveToolCallCount` at 0, so +`hasActiveTool` is false. If you clear `IsResumed` based only on `HasReceivedEventsSinceResume`, +the 600s timeout drops to 120s and kills resumed mid-tool sessions. +**Guard condition**: `!hasActiveTool && !HasUsedToolsThisTurn` +**PRs where this happened**: #163 + +### 7. InvokeAsync in HandleComplete (Dashboard.razor) +**What happens**: `HandleComplete` is called from `CompleteResponse` via +`Invoke(SyncContext.Post)` — already on UI thread. Wrapping in `InvokeAsync` defers +`StateHasChanged()` to next render cycle, causing stale "Thinking" indicators. +**PRs where this happened**: #153 + +## Full Regression Timeline + +| PR | What broke | Root cause | +|----|-----------|------------| +| #141 | 120s timeout killed tool executions | Single timeout tier too aggressive | +| #147 | Stale IDLE killed new turns | Missing ProcessingGeneration guard | +| #148 | 10s resume timeout killed active sessions | Hardcoded short timeout | +| #148 | 120s during tool loops | ActiveToolCallCount reset between rounds | +| #148 | IsResumed leaked → permanent 600s | Not cleared on abort/error/watchdog | +| #153 | Stale "Thinking" renders | InvokeAsync deferred StateHasChanged | +| #158 | Response content silently lost | No FlushCurrentResponse before clearing | +| #163 | Resumed mid-tool killed at 120s | IsResumed cleared without tool guard | +| #164 | Processing fields not reset on error | New fields added to only some paths | + +## Thread Safety Details + +1. **All `state.Info.*` mutations from background threads** → `InvokeOnUI()` +2. **`HasUsedToolsThisTurn`, `HasReceivedEventsSinceResume`** → `Volatile.Write` on set, `Volatile.Read` on check (ARM memory model) +3. **`ActiveToolCallCount`** → `Interlocked.Increment`/`Decrement`/`Exchange` (concurrent tool starts/completions) +4. **`LastEventAtTicks`** → `Interlocked.Exchange`/`Read` (long requires atomic ops) +5. **`ProcessingGeneration`** → `Interlocked.Increment` on send, `Interlocked.Read` on check +6. **`ProcessingGeneration` guard**: `SyncContext.Post` is async — a new `SendPromptAsync` can execute between the Post() and the callback. Capture generation before posting, verify inside callback. diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index fd7d75dafb..b27b5cf3bb 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -157,14 +157,10 @@ All three are reset in `SendPromptAsync` (new turn) and cleared in `CompleteResp The UI shows: "Sending…" → "Server connected…" → "Thinking…" → "Working · Xm Xs · N tool calls…". ### Abort Behavior -`AbortSessionAsync` must clear ALL processing state: -- `IsProcessing = false`, `IsResumed = false` -- `ProcessingStartedAt = null`, `ToolCallCount = 0`, `ProcessingPhase = 0` -- `MessageQueue.Clear()` — prevents queued messages from auto-sending after abort -- `_queuedImagePaths.TryRemove()` — clears associated image attachments -- `CancelProcessingWatchdog()` and `ResponseCompletion.TrySetCanceled()` +`AbortSessionAsync` must clear ALL processing state — see `.claude/skills/processing-state-safety/SKILL.md` for the full cleanup checklist and the 7 paths that clear `IsProcessing`. -In remote mode, the mobile client optimistically clears all fields and delegates to the bridge server. +### ⚠️ IsProcessing Cleanup Invariant +**CRITICAL**: Every code path that sets `IsProcessing = false` must clear 9 companion fields and call `FlushCurrentResponse`. This is the most recurring bug category (7 PRs, 16 fix/regression cycles). **Read `.claude/skills/processing-state-safety/SKILL.md` before modifying ANY processing path.** There are 8 such paths across CopilotService.cs, Events.cs, and Bridge.cs. ### Processing Watchdog The processing watchdog (`RunProcessingWatchdogAsync` in `CopilotService.Events.cs`) detects stuck sessions by checking how long since the last SDK event. It checks every 15 seconds and has two timeout tiers: @@ -174,9 +170,7 @@ The processing watchdog (`RunProcessingWatchdogAsync` in `CopilotService.Events. - The session was resumed mid-turn after app restart (`IsResumed`) - Tools have been used this turn (`HasUsedToolsThisTurn`) — even between tool rounds when the model is thinking -The 10-second resume timeout was removed — the watchdog handles all stuck-session detection. - -When the watchdog fires, it marshals state mutations to the UI thread via `InvokeOnUI()` and adds a system warning message. All code paths that set `IsProcessing = false` must go through the UI thread. +When the watchdog fires, it marshals state mutations to the UI thread via `InvokeOnUI()` and adds a system warning message. ### Diagnostic Log Tags The event diagnostics log (`~/.polypilot/event-diagnostics.log`) uses these tags: @@ -189,6 +183,7 @@ The event diagnostics log (`~/.polypilot/event-diagnostics.log`) uses these tags - `[ABORT]` — user-initiated abort cleared IsProcessing - `[BRIDGE-COMPLETE]` — bridge OnTurnEnd cleared IsProcessing - `[INTERRUPTED]` — app restart detected interrupted turn (watchdog timeout after resume) +- `[WATCHDOG]` — watchdog clearing IsResumed or timing out a stuck session Every code path that sets `IsProcessing = false` MUST have a diagnostic log entry. This is critical for debugging stuck-session issues. diff --git a/PolyPilot.Tests/Scenarios/mode-switch-scenarios.json b/PolyPilot.Tests/Scenarios/mode-switch-scenarios.json index 717c5d139b..8f1d171154 100644 --- a/PolyPilot.Tests/Scenarios/mode-switch-scenarios.json +++ b/PolyPilot.Tests/Scenarios/mode-switch-scenarios.json @@ -602,6 +602,34 @@ "note": "Processing status text (elapsed time / tool rounds / waiting) should be visible" } ] + }, + { + "id": "stale-session-not-marked-processing-on-restore", + "name": "Stale sessions are NOT marked as processing on restore", + "description": "When a session's events.jsonl hasn't been modified in over 10 minutes, the session should not be restored with IsProcessing=true, even if the last event was an 'active' type. This prevents hours-old sessions from showing a permanent 'Thinking' indicator.", + "unitTestCoverage": [ + "StuckSessionRecoveryTests.IsSessionStillProcessing_StaleFile_ReturnsFalse", + "StuckSessionRecoveryTests.IsSessionStillProcessing_RecentFile_ActiveEvent_ReturnsTrue", + "StuckSessionRecoveryTests.IsSessionStillProcessing_RecentFile_IdleEvent_ReturnsFalse", + "StuckSessionRecoveryTests.StalenessThreshold_UsesWatchdogToolExecutionTimeout" + ], + "steps": [ + { + "action": "evaluate", + "script": "document.querySelector('.status')?.className", + "expect": { "contains": "connected" }, + "note": "App should be connected" + }, + { + "action": "evaluate", + "script": "Array.from(document.querySelectorAll('.session-card.processing, .session-item .processing-dot')).length", + "note": "Count sessions showing processing state — stale sessions should not be here" + }, + { + "action": "note", + "text": "Manual verification: check debug info for any sessions with IsProcessing=true where LastUpdatedAt is over 10 minutes old. Such sessions should have been detected as stale during restore." + } + ] } ] } diff --git a/PolyPilot.Tests/StuckSessionRecoveryTests.cs b/PolyPilot.Tests/StuckSessionRecoveryTests.cs new file mode 100644 index 0000000000..c054fa5227 --- /dev/null +++ b/PolyPilot.Tests/StuckSessionRecoveryTests.cs @@ -0,0 +1,353 @@ +using Microsoft.Extensions.DependencyInjection; +using PolyPilot.Models; +using PolyPilot.Services; + +namespace PolyPilot.Tests; + +/// +/// Tests for the stuck-session recovery logic: +/// - IsSessionStillProcessing staleness check (events.jsonl file age) +/// - Watchdog IsResumed clearing after events arrive +/// Regression tests for: sessions permanently stuck in IsProcessing=true after app restart. +/// +public class StuckSessionRecoveryTests +{ + private readonly StubChatDatabase _chatDb = new(); + private readonly StubServerManager _serverManager = new(); + private readonly StubWsBridgeClient _bridgeClient = new(); + private readonly StubDemoService _demoService = new(); + private readonly RepoManager _repoManager = new(); + private readonly IServiceProvider _serviceProvider; + + public StuckSessionRecoveryTests() + { + var services = new ServiceCollection(); + _serviceProvider = services.BuildServiceProvider(); + } + + private CopilotService CreateService() => + new CopilotService(_chatDb, _serverManager, _bridgeClient, _repoManager, _serviceProvider, _demoService); + + // --- Staleness check: IsSessionStillProcessing --- + + [Fact] + public void IsSessionStillProcessing_StaleFile_ReturnsFalse() + { + // Arrange: create a temp session dir with an events.jsonl that was modified long ago + var svc = CreateService(); + var tmpDir = Path.Combine(Path.GetTempPath(), "polypilot-test-" + Guid.NewGuid().ToString("N")); + var sessionId = Guid.NewGuid().ToString(); + var sessionDir = Path.Combine(tmpDir, sessionId); + Directory.CreateDirectory(sessionDir); + var eventsFile = Path.Combine(sessionDir, "events.jsonl"); + + try + { + // Write an "active" event as the last line + File.WriteAllText(eventsFile, """{"type":"assistant.message_delta","data":{"deltaContent":"hello"}}"""); + // Backdate the file to be older than the watchdog timeout + var staleTime = DateTime.UtcNow.AddSeconds(-(CopilotService.WatchdogToolExecutionTimeoutSeconds + 60)); + File.SetLastWriteTimeUtc(eventsFile, staleTime); + + // Act: call IsSessionStillProcessing via the service + var result = svc.IsSessionStillProcessing(sessionId, tmpDir); + + // Assert: should be false because the file is stale + Assert.False(result, "Stale events.jsonl should not report session as still processing"); + } + finally + { + Directory.Delete(tmpDir, true); + } + } + + [Fact] + public void IsSessionStillProcessing_RecentFile_ActiveEvent_ReturnsTrue() + { + var svc = CreateService(); + var tmpDir = Path.Combine(Path.GetTempPath(), "polypilot-test-" + Guid.NewGuid().ToString("N")); + var sessionId = Guid.NewGuid().ToString(); + var sessionDir = Path.Combine(tmpDir, sessionId); + Directory.CreateDirectory(sessionDir); + var eventsFile = Path.Combine(sessionDir, "events.jsonl"); + + try + { + // Write recent events with an active last event + File.WriteAllText(eventsFile, + """{"type":"session.start","data":{}}""" + "\n" + + """{"type":"assistant.turn_start","data":{}}""" + "\n" + + """{"type":"assistant.message_delta","data":{"deltaContent":"thinking..."}}"""); + // File was just written so it's recent — no need to adjust LastWriteTime + + var result = svc.IsSessionStillProcessing(sessionId, tmpDir); + + Assert.True(result, "Recent events.jsonl with active last event should report still processing"); + } + finally + { + Directory.Delete(tmpDir, true); + } + } + + [Fact] + public void IsSessionStillProcessing_RecentFile_IdleEvent_ReturnsFalse() + { + var svc = CreateService(); + var tmpDir = Path.Combine(Path.GetTempPath(), "polypilot-test-" + Guid.NewGuid().ToString("N")); + var sessionId = Guid.NewGuid().ToString(); + var sessionDir = Path.Combine(tmpDir, sessionId); + Directory.CreateDirectory(sessionDir); + var eventsFile = Path.Combine(sessionDir, "events.jsonl"); + + try + { + File.WriteAllText(eventsFile, + """{"type":"session.start","data":{}}""" + "\n" + + """{"type":"assistant.message_delta","data":{"deltaContent":"done"}}""" + "\n" + + """{"type":"session.idle","data":{}}"""); + + var result = svc.IsSessionStillProcessing(sessionId, tmpDir); + + Assert.False(result, "events.jsonl ending with session.idle should not report still processing"); + } + finally + { + Directory.Delete(tmpDir, true); + } + } + + [Fact] + public void IsSessionStillProcessing_MissingFile_ReturnsFalse() + { + var svc = CreateService(); + var tmpDir = Path.Combine(Path.GetTempPath(), "polypilot-test-" + Guid.NewGuid().ToString("N")); + Directory.CreateDirectory(tmpDir); + + try + { + var result = svc.IsSessionStillProcessing(Guid.NewGuid().ToString(), tmpDir); + Assert.False(result, "Missing events.jsonl should not report still processing"); + } + finally + { + Directory.Delete(tmpDir, true); + } + } + + [Fact] + public void IsSessionStillProcessing_EmptyFile_ReturnsFalse() + { + var svc = CreateService(); + var tmpDir = Path.Combine(Path.GetTempPath(), "polypilot-test-" + Guid.NewGuid().ToString("N")); + var sessionId = Guid.NewGuid().ToString(); + var sessionDir = Path.Combine(tmpDir, sessionId); + Directory.CreateDirectory(sessionDir); + var eventsFile = Path.Combine(sessionDir, "events.jsonl"); + + try + { + File.WriteAllText(eventsFile, ""); + var result = svc.IsSessionStillProcessing(sessionId, tmpDir); + Assert.False(result, "Empty events.jsonl should not report still processing"); + } + finally + { + Directory.Delete(tmpDir, true); + } + } + + [Fact] + public void IsSessionStillProcessing_CorruptJsonl_ReturnsFalse() + { + var svc = CreateService(); + var tmpDir = Path.Combine(Path.GetTempPath(), "polypilot-test-" + Guid.NewGuid().ToString("N")); + var sessionId = Guid.NewGuid().ToString(); + var sessionDir = Path.Combine(tmpDir, sessionId); + Directory.CreateDirectory(sessionDir); + var eventsFile = Path.Combine(sessionDir, "events.jsonl"); + + try + { + File.WriteAllText(eventsFile, "this is not json at all\n{broken json}"); + var result = svc.IsSessionStillProcessing(sessionId, tmpDir); + Assert.False(result, "Corrupt events.jsonl should not report still processing (should not crash)"); + } + finally + { + Directory.Delete(tmpDir, true); + } + } + + // --- Watchdog IsResumed clearing --- + + [Fact] + public void IsResumed_NotCleared_When_ToolsActiveOnResume() + { + // Simulates the watchdog condition: even after events arrive, + // IsResumed should NOT be cleared if tools are active or have been used. + // This mirrors the guard in RunProcessingWatchdogAsync: + // if (IsResumed && HasReceivedEventsSinceResume && !hasActiveTool && !HasUsedToolsThisTurn) + var info = new AgentSessionInfo + { + Name = "test", + Model = "test-model", + IsResumed = true, + IsProcessing = true + }; + + bool hasReceivedEvents = true; + bool hasActiveTool = true; // Tool still running + bool hasUsedToolsThisTurn = true; + + // Watchdog guard: should NOT clear IsResumed when tools are active + if (info.IsResumed && hasReceivedEvents && !hasActiveTool && !hasUsedToolsThisTurn) + { + info.IsResumed = false; + } + + Assert.True(info.IsResumed, "IsResumed must stay true when tools are active (prevents premature timeout downgrade)"); + } + + [Fact] + public void IsResumed_Cleared_When_EventsArrive_NoToolActivity() + { + // When events have arrived and there's no tool activity, + // the watchdog should clear IsResumed to transition from 600s to 120s timeout. + var info = new AgentSessionInfo + { + Name = "test", + Model = "test-model", + IsResumed = true, + IsProcessing = true + }; + + bool hasReceivedEvents = true; + bool hasActiveTool = false; // No active tools + bool hasUsedToolsThisTurn = false; + + // Watchdog guard: should clear IsResumed + if (info.IsResumed && hasReceivedEvents && !hasActiveTool && !hasUsedToolsThisTurn) + { + info.IsResumed = false; + } + + Assert.False(info.IsResumed, "IsResumed should be cleared when events arrive with no tool activity"); + } + + [Fact] + public void IsResumed_NotCleared_When_NoEventsYet() + { + // Before any events arrive, IsResumed should stay true + // even with no tool activity — the session just resumed. + var info = new AgentSessionInfo + { + Name = "test", + Model = "test-model", + IsResumed = true, + IsProcessing = true + }; + + bool hasReceivedEvents = false; // No events yet + bool hasActiveTool = false; + bool hasUsedToolsThisTurn = false; + + if (info.IsResumed && hasReceivedEvents && !hasActiveTool && !hasUsedToolsThisTurn) + { + info.IsResumed = false; + } + + Assert.True(info.IsResumed, "IsResumed should stay true when no events have arrived yet"); + } + + [Fact] + public void IsResumed_NotCleared_When_HasUsedToolsThisTurn() + { + // Even after events arrive, if tools have been used this turn + // (between tool rounds), keep the longer 600s timeout. + var info = new AgentSessionInfo + { + Name = "test", + Model = "test-model", + IsResumed = true, + IsProcessing = true + }; + + bool hasReceivedEvents = true; + bool hasActiveTool = false; // No tool actively running right now + bool hasUsedToolsThisTurn = true; // But tools were used earlier in this turn + + if (info.IsResumed && hasReceivedEvents && !hasActiveTool && !hasUsedToolsThisTurn) + { + info.IsResumed = false; + } + + Assert.True(info.IsResumed, "IsResumed must stay true when tools were used this turn (even between rounds)"); + } + + // --- Staleness threshold validation --- + + [Fact] + public void StalenessThreshold_UsesWatchdogToolExecutionTimeout() + { + // The staleness threshold should match the tool execution timeout + // to ensure we don't prematurely declare sessions as idle during long tool runs. + Assert.Equal(600, CopilotService.WatchdogToolExecutionTimeoutSeconds); + } + + [Theory] + [InlineData("assistant.turn_start")] + [InlineData("tool.execution_start")] + [InlineData("tool.execution_progress")] + [InlineData("assistant.message_delta")] + [InlineData("assistant.reasoning")] + [InlineData("assistant.reasoning_delta")] + [InlineData("assistant.intent")] + public void IsSessionStillProcessing_AllActiveEventTypes_ReturnTrue(string eventType) + { + var svc = CreateService(); + var tmpDir = Path.Combine(Path.GetTempPath(), "polypilot-test-" + Guid.NewGuid().ToString("N")); + var sessionId = Guid.NewGuid().ToString(); + var sessionDir = Path.Combine(tmpDir, sessionId); + Directory.CreateDirectory(sessionDir); + var eventsFile = Path.Combine(sessionDir, "events.jsonl"); + + try + { + File.WriteAllText(eventsFile, $$$"""{"type":"{{{eventType}}}","data":{}}"""); + var result = svc.IsSessionStillProcessing(sessionId, tmpDir); + Assert.True(result, $"Active event type '{eventType}' should report still processing"); + } + finally + { + Directory.Delete(tmpDir, true); + } + } + + [Theory] + [InlineData("session.idle")] + [InlineData("assistant.message")] + [InlineData("session.start")] + [InlineData("assistant.turn_end")] + [InlineData("tool.execution_end")] + public void IsSessionStillProcessing_InactiveEventTypes_ReturnFalse(string eventType) + { + var svc = CreateService(); + var tmpDir = Path.Combine(Path.GetTempPath(), "polypilot-test-" + Guid.NewGuid().ToString("N")); + var sessionId = Guid.NewGuid().ToString(); + var sessionDir = Path.Combine(tmpDir, sessionId); + Directory.CreateDirectory(sessionDir); + var eventsFile = Path.Combine(sessionDir, "events.jsonl"); + + try + { + File.WriteAllText(eventsFile, $$$"""{"type":"{{{eventType}}}","data":{}}"""); + var result = svc.IsSessionStillProcessing(sessionId, tmpDir); + Assert.False(result, $"Inactive event type '{eventType}' should not report still processing"); + } + finally + { + Directory.Delete(tmpDir, true); + } + } +} diff --git a/PolyPilot/Services/CopilotService.Bridge.cs b/PolyPilot/Services/CopilotService.Bridge.cs index 80837b55d2..5d40c7eb0d 100644 --- a/PolyPilot/Services/CopilotService.Bridge.cs +++ b/PolyPilot/Services/CopilotService.Bridge.cs @@ -127,16 +127,23 @@ private async Task InitializeRemoteAsync(ConnectionSettings settings, Cancellati _bridgeClient.OnTurnEnd += (s) => { _remoteStreamingSessions.TryRemove(s, out _); - var session = GetRemoteSession(s); - if (session != null) + InvokeOnUI(() => { - Debug($"[BRIDGE-COMPLETE] '{session.Name}' OnTurnEnd cleared IsProcessing"); - session.IsProcessing = false; - // Mark last assistant message as complete - var lastAssistant = session.History.LastOrDefault(m => m.IsAssistant && !m.IsComplete); - if (lastAssistant != null) { lastAssistant.IsComplete = true; lastAssistant.Model = session.Model; } - } - InvokeOnUI(() => OnTurnEnd?.Invoke(s)); + var session = GetRemoteSession(s); + if (session != null) + { + Debug($"[BRIDGE-COMPLETE] '{session.Name}' OnTurnEnd cleared IsProcessing"); + session.IsProcessing = false; + session.IsResumed = false; + session.ProcessingStartedAt = null; + session.ToolCallCount = 0; + session.ProcessingPhase = 0; + // Mark last assistant message as complete + var lastAssistant = session.History.LastOrDefault(m => m.IsAssistant && !m.IsComplete); + if (lastAssistant != null) { lastAssistant.IsComplete = true; lastAssistant.Model = session.Model; } + } + OnTurnEnd?.Invoke(s); + }); }; _bridgeClient.OnSessionComplete += (s, sum) => InvokeOnUI(() => OnSessionComplete?.Invoke(s, sum)); _bridgeClient.OnError += (s, e) => InvokeOnUI(() => OnError?.Invoke(s, e)); diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs index c5e352be2a..b224aaaa89 100644 --- a/PolyPilot/Services/CopilotService.Events.cs +++ b/PolyPilot/Services/CopilotService.Events.cs @@ -678,6 +678,7 @@ private void CompleteResponse(SessionState state, long? expectedGeneration = nul $"(responseLen={state.CurrentResponse.Length}, thread={Environment.CurrentManagedThreadId})"); CancelProcessingWatchdog(state); + Interlocked.Exchange(ref state.ActiveToolCallCount, 0); state.HasUsedToolsThisTurn = false; state.Info.IsResumed = false; // Clear after first successful turn var response = state.CurrentResponse.ToString(); @@ -1136,6 +1137,19 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session var lastEventTicks = Interlocked.Read(ref state.LastEventAtTicks); var elapsed = (DateTime.UtcNow - new DateTime(lastEventTicks)).TotalSeconds; var hasActiveTool = Interlocked.CompareExchange(ref state.ActiveToolCallCount, 0, 0) > 0; + + // After events have started flowing on a resumed session, clear IsResumed + // so the watchdog transitions from the long 600s timeout to the shorter 120s. + // Guard: don't clear if tools are active or have been used this turn — between + // tool rounds, ActiveToolCallCount returns to 0 when AssistantTurnStartEvent + // resets it, but the model may still be reasoning about the next tool call. + // HasUsedToolsThisTurn persists across rounds and prevents premature downgrade. + if (state.Info.IsResumed && Volatile.Read(ref state.HasReceivedEventsSinceResume) + && !hasActiveTool && !Volatile.Read(ref state.HasUsedToolsThisTurn)) + { + Debug($"[WATCHDOG] '{sessionName}' clearing IsResumed — events have arrived since resume with no tool activity"); + InvokeOnUI(() => state.Info.IsResumed = false); + } // Use the longer tool-execution timeout if: // 1. A tool call is actively running (hasActiveTool), OR // 2. This is a resumed session that was mid-turn (agent sessions routinely diff --git a/PolyPilot/Services/CopilotService.Utilities.cs b/PolyPilot/Services/CopilotService.Utilities.cs index f8bdc9f863..7cf09dcbc6 100644 --- a/PolyPilot/Services/CopilotService.Utilities.cs +++ b/PolyPilot/Services/CopilotService.Utilities.cs @@ -85,15 +85,36 @@ public partial class CopilotService } /// - /// Check if a session was still processing when the app last closed + /// Check if a session was still processing when the app last closed. + /// Returns false if the events file is stale (not modified recently), + /// preventing sessions from being incorrectly marked as processing + /// after long app restarts. /// - private bool IsSessionStillProcessing(string sessionId) + internal bool IsSessionStillProcessing(string sessionId) => + IsSessionStillProcessing(sessionId, SessionStatePath); + + /// + /// Testable overload that accepts a custom base path. + /// + internal bool IsSessionStillProcessing(string sessionId, string basePath) { - var eventsFile = Path.Combine(SessionStatePath, sessionId, "events.jsonl"); + var eventsFile = Path.Combine(basePath, sessionId, "events.jsonl"); if (!File.Exists(eventsFile)) return false; try { + // Staleness check: if the file hasn't been modified recently, + // the CLI finished processing long ago — don't mark as still active. + var lastWrite = File.GetLastWriteTimeUtc(eventsFile); + var staleness = (DateTime.UtcNow - lastWrite).TotalSeconds; + if (staleness > WatchdogToolExecutionTimeoutSeconds) + { + Debug($"[RESTORE] events.jsonl for '{sessionId}' is stale " + + $"({staleness:F0}s old > {WatchdogToolExecutionTimeoutSeconds}s threshold), " + + $"treating session as idle"); + return false; + } + string? lastLine = null; foreach (var line in File.ReadLines(eventsFile)) { diff --git a/PolyPilot/Services/CopilotService.cs b/PolyPilot/Services/CopilotService.cs index 39045525d5..65db4808c4 100644 --- a/PolyPilot/Services/CopilotService.cs +++ b/PolyPilot/Services/CopilotService.cs @@ -1596,7 +1596,11 @@ await state.Session.SendAsync(new MessageOptions Console.WriteLine($"[DEBUG] Reconnect+retry failed: {retryEx.Message}"); OnError?.Invoke(sessionName, $"Session disconnected and reconnect failed: {Models.ErrorMessageHelper.Humanize(retryEx)}"); CancelProcessingWatchdog(state); + FlushCurrentResponse(state); Debug($"[ERROR] '{sessionName}' reconnect+retry failed, clearing IsProcessing"); + Interlocked.Exchange(ref state.ActiveToolCallCount, 0); + state.HasUsedToolsThisTurn = false; + state.Info.IsResumed = false; state.Info.IsProcessing = false; state.Info.ProcessingStartedAt = null; state.Info.ToolCallCount = 0; @@ -1609,7 +1613,11 @@ await state.Session.SendAsync(new MessageOptions { OnError?.Invoke(sessionName, $"SendAsync failed: {Models.ErrorMessageHelper.Humanize(ex)}"); CancelProcessingWatchdog(state); + FlushCurrentResponse(state); Debug($"[ERROR] '{sessionName}' SendAsync failed, clearing IsProcessing (error={ex.Message})"); + Interlocked.Exchange(ref state.ActiveToolCallCount, 0); + state.HasUsedToolsThisTurn = false; + state.Info.IsResumed = false; state.Info.IsProcessing = false; state.Info.ProcessingStartedAt = null; state.Info.ToolCallCount = 0; @@ -1636,6 +1644,7 @@ public async Task AbortSessionAsync(string sessionName) if (_sessions.TryGetValue(sessionName, out var remoteState)) { remoteState.Info.IsProcessing = false; + remoteState.Info.IsResumed = false; remoteState.Info.ProcessingStartedAt = null; remoteState.Info.ToolCallCount = 0; remoteState.Info.ProcessingPhase = 0;