diff --git a/.claude/skills/processing-state-safety/SKILL.md b/.claude/skills/processing-state-safety/SKILL.md
new file mode 100644
index 0000000000..7d8ad99456
--- /dev/null
+++ b/.claude/skills/processing-state-safety/SKILL.md
@@ -0,0 +1,75 @@
+---
+name: processing-state-safety
+description: >
+ Safety guide for modifying IsProcessing, watchdog, session resume, or abort code paths in CopilotService.
+ Use when: (1) Modifying any code that sets IsProcessing to true or false, (2) Changing watchdog timeout
+ logic or adding new timeout paths, (3) Touching session resume/restore logic, (4) Modifying
+ AbortSessionAsync or CompleteResponse, (5) Adding new processing-related fields to AgentSessionInfo
+ or SessionState, (6) Debugging sessions stuck in "Thinking" state, (7) Reviewing PRs that touch
+ CopilotService.Events.cs, CopilotService.cs, or CopilotService.Utilities.cs processing paths.
+---
+
+# Processing State Safety Guide
+
+Modifying processing state code involves these steps:
+
+1. Identify which of the 7 cleanup paths you're touching
+2. Apply the cleanup checklist to your change
+3. Verify all 7 paths still satisfy the checklist
+4. Ensure thread safety rules are followed
+
+If debugging a stuck session, see [references/regression-history.md](references/regression-history.md)
+for the 7 common mistakes and full regression timeline across 7 PRs.
+
+## The Cleanup Checklist
+
+Every code path that sets `IsProcessing = false` MUST perform ALL of these:
+
+```csharp
+FlushCurrentResponse(state); // BEFORE clearing — saves accumulated response
+CancelProcessingWatchdog(state);
+Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
+state.HasUsedToolsThisTurn = false;
+state.Info.IsResumed = false;
+state.Info.IsProcessing = false;
+state.Info.ProcessingStartedAt = null;
+state.Info.ToolCallCount = 0;
+state.Info.ProcessingPhase = 0;
+```
+
+Skip any field not applicable to the path (e.g., remote mode has no `ActiveToolCallCount`).
+
+## The 7 Cleanup Paths
+
+| # | Path | Location | Notes |
+|---|------|----------|-------|
+| 1 | CompleteResponse | Events.cs ~L699 | Normal completion via SessionIdleEvent (saves response inline, not via FlushCurrentResponse) |
+| 2 | SessionErrorEvent | Events.cs ~L517 | SDK error — wrapped in InvokeOnUI |
+| 3 | Watchdog timeout | Events.cs ~L1192 | InvokeOnUI + generation guard |
+| 4 | AbortSessionAsync (local) | CopilotService.cs ~L1681 | User clicks Stop |
+| 5 | AbortSessionAsync (remote) | CopilotService.cs ~L1638 | Remote mode optimistic clear |
+| 6 | SendAsync reconnect failure | CopilotService.cs ~L1600 | Reconnect+retry failed |
+| 7 | SendAsync initial failure | CopilotService.cs ~L1613 | First send attempt failed |
+| 8 | Bridge OnTurnEnd | Bridge.cs ~L127 | Remote mode normal turn completion — InvokeOnUI |
+
+**When adding a new field to AgentSessionInfo or SessionState**, add its reset to ALL 8 paths.
+**When adding a new cleanup path**, copy the full checklist from an existing path (path 3 is the most complete).
+
+## Key Watchdog Rules
+
+- **Two timeout tiers**: 120s inactivity, 600s tool execution
+- **600s triggers when**: `ActiveToolCallCount > 0` OR `IsResumed` OR `HasUsedToolsThisTurn`
+- **Never add timeouts shorter than 120s** for resume — tool calls gap 30-60s between events
+- **`ActiveToolCallCount` returns to 0 between tool rounds** — `AssistantTurnStartEvent` resets it to 0 (line ~365). Between rounds the model reasons about the next tool call, so `hasActiveTool` is 0 even though the session is actively working. Always check `HasUsedToolsThisTurn` too
+- **IsResumed clearing** must guard on `!hasActiveTool && !HasUsedToolsThisTurn`
+- **Staleness check**: `IsSessionStillProcessing` uses `File.GetLastWriteTimeUtc` >600s = idle
+
+## Thread Safety Rules
+
+- All `state.Info.*` mutations from background threads → `InvokeOnUI()`
+- `HasUsedToolsThisTurn`, `HasReceivedEventsSinceResume` → `Volatile.Write`/`Read`
+- `ActiveToolCallCount` → `Interlocked` operations only
+- Capture `ProcessingGeneration` before `SyncContext.Post`, verify inside callback
+
+For detailed thread safety patterns and the full regression history, see
+[references/regression-history.md](references/regression-history.md).
diff --git a/.claude/skills/processing-state-safety/references/regression-history.md b/.claude/skills/processing-state-safety/references/regression-history.md
new file mode 100644
index 0000000000..2eeb593862
--- /dev/null
+++ b/.claude/skills/processing-state-safety/references/regression-history.md
@@ -0,0 +1,67 @@
+# Regression History & Common Mistakes
+
+## 7 Mistakes That Keep Recurring
+
+### 1. Forgetting companion fields on error paths
+**What happens**: Clear `IsProcessing` and `ProcessingPhase` but forget `IsResumed` or
+`HasUsedToolsThisTurn`. Next turn inherits stale 600s timeout or stale tool state.
+**PRs where this happened**: #148, #158, #164
+
+### 2. Missing FlushCurrentResponse before clearing
+**What happens**: Accumulated `CurrentResponse` (StringBuilder) content is silently lost.
+User sees "Thinking..." then nothing — the partial response vanishes.
+**PRs where this happened**: #158
+
+### 3. Using ActiveToolCallCount alone as tool signal
+**What happens**: `AssistantTurnStartEvent` resets `ActiveToolCallCount` to 0 between tool rounds
+(line ~365 in Events.cs). Between rounds, the model reasons about the next tool call, so
+`hasActiveTool` is 0 even though the session is actively working.
+`HasUsedToolsThisTurn` persists across tool rounds and is the reliable signal.
+**PRs where this happened**: #148, #163
+
+### 4. Adding hardcoded short timeouts for resume
+**What happens**: A 10s timeout kills sessions that are legitimately processing (tool calls
+take 30-60s between events). The watchdog's tiered 120s/600s approach is the correct mechanism.
+**PRs where this happened**: #148
+
+### 5. Mutating state on background threads
+**What happens**: SDK events arrive on worker threads. `IsProcessing` write on a background
+thread races with Blazor rendering on the UI thread. Use `InvokeOnUI()` for all `state.Info.*`
+mutations from background code.
+**PRs where this happened**: #147, #148, #163
+
+### 6. Clearing IsResumed without checking tool activity
+**What happens**: After resume, the dedup path leaves `ActiveToolCallCount` at 0, so
+`hasActiveTool` is false. If you clear `IsResumed` based only on `HasReceivedEventsSinceResume`,
+the 600s timeout drops to 120s and kills resumed mid-tool sessions.
+**Guard condition**: `!hasActiveTool && !HasUsedToolsThisTurn`
+**PRs where this happened**: #163
+
+### 7. InvokeAsync in HandleComplete (Dashboard.razor)
+**What happens**: `HandleComplete` is called from `CompleteResponse` via
+`Invoke(SyncContext.Post)` — already on UI thread. Wrapping in `InvokeAsync` defers
+`StateHasChanged()` to next render cycle, causing stale "Thinking" indicators.
+**PRs where this happened**: #153
+
+## Full Regression Timeline
+
+| PR | What broke | Root cause |
+|----|-----------|------------|
+| #141 | 120s timeout killed tool executions | Single timeout tier too aggressive |
+| #147 | Stale IDLE killed new turns | Missing ProcessingGeneration guard |
+| #148 | 10s resume timeout killed active sessions | Hardcoded short timeout |
+| #148 | 120s during tool loops | ActiveToolCallCount reset between rounds |
+| #148 | IsResumed leaked → permanent 600s | Not cleared on abort/error/watchdog |
+| #153 | Stale "Thinking" renders | InvokeAsync deferred StateHasChanged |
+| #158 | Response content silently lost | No FlushCurrentResponse before clearing |
+| #163 | Resumed mid-tool killed at 120s | IsResumed cleared without tool guard |
+| #164 | Processing fields not reset on error | New fields added to only some paths |
+
+## Thread Safety Details
+
+1. **All `state.Info.*` mutations from background threads** → `InvokeOnUI()`
+2. **`HasUsedToolsThisTurn`, `HasReceivedEventsSinceResume`** → `Volatile.Write` on set, `Volatile.Read` on check (ARM memory model)
+3. **`ActiveToolCallCount`** → `Interlocked.Increment`/`Decrement`/`Exchange` (concurrent tool starts/completions)
+4. **`LastEventAtTicks`** → `Interlocked.Exchange`/`Read` (long requires atomic ops)
+5. **`ProcessingGeneration`** → `Interlocked.Increment` on send, `Interlocked.Read` on check
+6. **`ProcessingGeneration` guard**: `SyncContext.Post` is async — a new `SendPromptAsync` can execute between the Post() and the callback. Capture generation before posting, verify inside callback.
diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
index fd7d75dafb..b27b5cf3bb 100644
--- a/.github/copilot-instructions.md
+++ b/.github/copilot-instructions.md
@@ -157,14 +157,10 @@ All three are reset in `SendPromptAsync` (new turn) and cleared in `CompleteResp
The UI shows: "Sending…" → "Server connected…" → "Thinking…" → "Working · Xm Xs · N tool calls…".
### Abort Behavior
-`AbortSessionAsync` must clear ALL processing state:
-- `IsProcessing = false`, `IsResumed = false`
-- `ProcessingStartedAt = null`, `ToolCallCount = 0`, `ProcessingPhase = 0`
-- `MessageQueue.Clear()` — prevents queued messages from auto-sending after abort
-- `_queuedImagePaths.TryRemove()` — clears associated image attachments
-- `CancelProcessingWatchdog()` and `ResponseCompletion.TrySetCanceled()`
+`AbortSessionAsync` must clear ALL processing state — see `.claude/skills/processing-state-safety/SKILL.md` for the full cleanup checklist and the 7 paths that clear `IsProcessing`.
-In remote mode, the mobile client optimistically clears all fields and delegates to the bridge server.
+### ⚠️ IsProcessing Cleanup Invariant
+**CRITICAL**: Every code path that sets `IsProcessing = false` must clear 9 companion fields and call `FlushCurrentResponse`. This is the most recurring bug category (7 PRs, 16 fix/regression cycles). **Read `.claude/skills/processing-state-safety/SKILL.md` before modifying ANY processing path.** There are 8 such paths across CopilotService.cs, Events.cs, and Bridge.cs.
### Processing Watchdog
The processing watchdog (`RunProcessingWatchdogAsync` in `CopilotService.Events.cs`) detects stuck sessions by checking how long since the last SDK event. It checks every 15 seconds and has two timeout tiers:
@@ -174,9 +170,7 @@ The processing watchdog (`RunProcessingWatchdogAsync` in `CopilotService.Events.
- The session was resumed mid-turn after app restart (`IsResumed`)
- Tools have been used this turn (`HasUsedToolsThisTurn`) — even between tool rounds when the model is thinking
-The 10-second resume timeout was removed — the watchdog handles all stuck-session detection.
-
-When the watchdog fires, it marshals state mutations to the UI thread via `InvokeOnUI()` and adds a system warning message. All code paths that set `IsProcessing = false` must go through the UI thread.
+When the watchdog fires, it marshals state mutations to the UI thread via `InvokeOnUI()` and adds a system warning message.
### Diagnostic Log Tags
The event diagnostics log (`~/.polypilot/event-diagnostics.log`) uses these tags:
@@ -189,6 +183,7 @@ The event diagnostics log (`~/.polypilot/event-diagnostics.log`) uses these tags
- `[ABORT]` — user-initiated abort cleared IsProcessing
- `[BRIDGE-COMPLETE]` — bridge OnTurnEnd cleared IsProcessing
- `[INTERRUPTED]` — app restart detected interrupted turn (watchdog timeout after resume)
+- `[WATCHDOG]` — watchdog clearing IsResumed or timing out a stuck session
Every code path that sets `IsProcessing = false` MUST have a diagnostic log entry. This is critical for debugging stuck-session issues.
diff --git a/PolyPilot.Tests/Scenarios/mode-switch-scenarios.json b/PolyPilot.Tests/Scenarios/mode-switch-scenarios.json
index 717c5d139b..8f1d171154 100644
--- a/PolyPilot.Tests/Scenarios/mode-switch-scenarios.json
+++ b/PolyPilot.Tests/Scenarios/mode-switch-scenarios.json
@@ -602,6 +602,34 @@
"note": "Processing status text (elapsed time / tool rounds / waiting) should be visible"
}
]
+ },
+ {
+ "id": "stale-session-not-marked-processing-on-restore",
+ "name": "Stale sessions are NOT marked as processing on restore",
+ "description": "When a session's events.jsonl hasn't been modified in over 10 minutes, the session should not be restored with IsProcessing=true, even if the last event was an 'active' type. This prevents hours-old sessions from showing a permanent 'Thinking' indicator.",
+ "unitTestCoverage": [
+ "StuckSessionRecoveryTests.IsSessionStillProcessing_StaleFile_ReturnsFalse",
+ "StuckSessionRecoveryTests.IsSessionStillProcessing_RecentFile_ActiveEvent_ReturnsTrue",
+ "StuckSessionRecoveryTests.IsSessionStillProcessing_RecentFile_IdleEvent_ReturnsFalse",
+ "StuckSessionRecoveryTests.StalenessThreshold_UsesWatchdogToolExecutionTimeout"
+ ],
+ "steps": [
+ {
+ "action": "evaluate",
+ "script": "document.querySelector('.status')?.className",
+ "expect": { "contains": "connected" },
+ "note": "App should be connected"
+ },
+ {
+ "action": "evaluate",
+ "script": "Array.from(document.querySelectorAll('.session-card.processing, .session-item .processing-dot')).length",
+ "note": "Count sessions showing processing state — stale sessions should not be here"
+ },
+ {
+ "action": "note",
+ "text": "Manual verification: check debug info for any sessions with IsProcessing=true where LastUpdatedAt is over 10 minutes old. Such sessions should have been detected as stale during restore."
+ }
+ ]
}
]
}
diff --git a/PolyPilot.Tests/StuckSessionRecoveryTests.cs b/PolyPilot.Tests/StuckSessionRecoveryTests.cs
new file mode 100644
index 0000000000..c054fa5227
--- /dev/null
+++ b/PolyPilot.Tests/StuckSessionRecoveryTests.cs
@@ -0,0 +1,353 @@
+using Microsoft.Extensions.DependencyInjection;
+using PolyPilot.Models;
+using PolyPilot.Services;
+
+namespace PolyPilot.Tests;
+
+///
+/// Tests for the stuck-session recovery logic:
+/// - IsSessionStillProcessing staleness check (events.jsonl file age)
+/// - Watchdog IsResumed clearing after events arrive
+/// Regression tests for: sessions permanently stuck in IsProcessing=true after app restart.
+///
+public class StuckSessionRecoveryTests
+{
+ private readonly StubChatDatabase _chatDb = new();
+ private readonly StubServerManager _serverManager = new();
+ private readonly StubWsBridgeClient _bridgeClient = new();
+ private readonly StubDemoService _demoService = new();
+ private readonly RepoManager _repoManager = new();
+ private readonly IServiceProvider _serviceProvider;
+
+ public StuckSessionRecoveryTests()
+ {
+ var services = new ServiceCollection();
+ _serviceProvider = services.BuildServiceProvider();
+ }
+
+ private CopilotService CreateService() =>
+ new CopilotService(_chatDb, _serverManager, _bridgeClient, _repoManager, _serviceProvider, _demoService);
+
+ // --- Staleness check: IsSessionStillProcessing ---
+
+ [Fact]
+ public void IsSessionStillProcessing_StaleFile_ReturnsFalse()
+ {
+ // Arrange: create a temp session dir with an events.jsonl that was modified long ago
+ var svc = CreateService();
+ var tmpDir = Path.Combine(Path.GetTempPath(), "polypilot-test-" + Guid.NewGuid().ToString("N"));
+ var sessionId = Guid.NewGuid().ToString();
+ var sessionDir = Path.Combine(tmpDir, sessionId);
+ Directory.CreateDirectory(sessionDir);
+ var eventsFile = Path.Combine(sessionDir, "events.jsonl");
+
+ try
+ {
+ // Write an "active" event as the last line
+ File.WriteAllText(eventsFile, """{"type":"assistant.message_delta","data":{"deltaContent":"hello"}}""");
+ // Backdate the file to be older than the watchdog timeout
+ var staleTime = DateTime.UtcNow.AddSeconds(-(CopilotService.WatchdogToolExecutionTimeoutSeconds + 60));
+ File.SetLastWriteTimeUtc(eventsFile, staleTime);
+
+ // Act: call IsSessionStillProcessing via the service
+ var result = svc.IsSessionStillProcessing(sessionId, tmpDir);
+
+ // Assert: should be false because the file is stale
+ Assert.False(result, "Stale events.jsonl should not report session as still processing");
+ }
+ finally
+ {
+ Directory.Delete(tmpDir, true);
+ }
+ }
+
+ [Fact]
+ public void IsSessionStillProcessing_RecentFile_ActiveEvent_ReturnsTrue()
+ {
+ var svc = CreateService();
+ var tmpDir = Path.Combine(Path.GetTempPath(), "polypilot-test-" + Guid.NewGuid().ToString("N"));
+ var sessionId = Guid.NewGuid().ToString();
+ var sessionDir = Path.Combine(tmpDir, sessionId);
+ Directory.CreateDirectory(sessionDir);
+ var eventsFile = Path.Combine(sessionDir, "events.jsonl");
+
+ try
+ {
+ // Write recent events with an active last event
+ File.WriteAllText(eventsFile,
+ """{"type":"session.start","data":{}}""" + "\n" +
+ """{"type":"assistant.turn_start","data":{}}""" + "\n" +
+ """{"type":"assistant.message_delta","data":{"deltaContent":"thinking..."}}""");
+ // File was just written so it's recent — no need to adjust LastWriteTime
+
+ var result = svc.IsSessionStillProcessing(sessionId, tmpDir);
+
+ Assert.True(result, "Recent events.jsonl with active last event should report still processing");
+ }
+ finally
+ {
+ Directory.Delete(tmpDir, true);
+ }
+ }
+
+ [Fact]
+ public void IsSessionStillProcessing_RecentFile_IdleEvent_ReturnsFalse()
+ {
+ var svc = CreateService();
+ var tmpDir = Path.Combine(Path.GetTempPath(), "polypilot-test-" + Guid.NewGuid().ToString("N"));
+ var sessionId = Guid.NewGuid().ToString();
+ var sessionDir = Path.Combine(tmpDir, sessionId);
+ Directory.CreateDirectory(sessionDir);
+ var eventsFile = Path.Combine(sessionDir, "events.jsonl");
+
+ try
+ {
+ File.WriteAllText(eventsFile,
+ """{"type":"session.start","data":{}}""" + "\n" +
+ """{"type":"assistant.message_delta","data":{"deltaContent":"done"}}""" + "\n" +
+ """{"type":"session.idle","data":{}}""");
+
+ var result = svc.IsSessionStillProcessing(sessionId, tmpDir);
+
+ Assert.False(result, "events.jsonl ending with session.idle should not report still processing");
+ }
+ finally
+ {
+ Directory.Delete(tmpDir, true);
+ }
+ }
+
+ [Fact]
+ public void IsSessionStillProcessing_MissingFile_ReturnsFalse()
+ {
+ var svc = CreateService();
+ var tmpDir = Path.Combine(Path.GetTempPath(), "polypilot-test-" + Guid.NewGuid().ToString("N"));
+ Directory.CreateDirectory(tmpDir);
+
+ try
+ {
+ var result = svc.IsSessionStillProcessing(Guid.NewGuid().ToString(), tmpDir);
+ Assert.False(result, "Missing events.jsonl should not report still processing");
+ }
+ finally
+ {
+ Directory.Delete(tmpDir, true);
+ }
+ }
+
+ [Fact]
+ public void IsSessionStillProcessing_EmptyFile_ReturnsFalse()
+ {
+ var svc = CreateService();
+ var tmpDir = Path.Combine(Path.GetTempPath(), "polypilot-test-" + Guid.NewGuid().ToString("N"));
+ var sessionId = Guid.NewGuid().ToString();
+ var sessionDir = Path.Combine(tmpDir, sessionId);
+ Directory.CreateDirectory(sessionDir);
+ var eventsFile = Path.Combine(sessionDir, "events.jsonl");
+
+ try
+ {
+ File.WriteAllText(eventsFile, "");
+ var result = svc.IsSessionStillProcessing(sessionId, tmpDir);
+ Assert.False(result, "Empty events.jsonl should not report still processing");
+ }
+ finally
+ {
+ Directory.Delete(tmpDir, true);
+ }
+ }
+
+ [Fact]
+ public void IsSessionStillProcessing_CorruptJsonl_ReturnsFalse()
+ {
+ var svc = CreateService();
+ var tmpDir = Path.Combine(Path.GetTempPath(), "polypilot-test-" + Guid.NewGuid().ToString("N"));
+ var sessionId = Guid.NewGuid().ToString();
+ var sessionDir = Path.Combine(tmpDir, sessionId);
+ Directory.CreateDirectory(sessionDir);
+ var eventsFile = Path.Combine(sessionDir, "events.jsonl");
+
+ try
+ {
+ File.WriteAllText(eventsFile, "this is not json at all\n{broken json}");
+ var result = svc.IsSessionStillProcessing(sessionId, tmpDir);
+ Assert.False(result, "Corrupt events.jsonl should not report still processing (should not crash)");
+ }
+ finally
+ {
+ Directory.Delete(tmpDir, true);
+ }
+ }
+
+ // --- Watchdog IsResumed clearing ---
+
+ [Fact]
+ public void IsResumed_NotCleared_When_ToolsActiveOnResume()
+ {
+ // Simulates the watchdog condition: even after events arrive,
+ // IsResumed should NOT be cleared if tools are active or have been used.
+ // This mirrors the guard in RunProcessingWatchdogAsync:
+ // if (IsResumed && HasReceivedEventsSinceResume && !hasActiveTool && !HasUsedToolsThisTurn)
+ var info = new AgentSessionInfo
+ {
+ Name = "test",
+ Model = "test-model",
+ IsResumed = true,
+ IsProcessing = true
+ };
+
+ bool hasReceivedEvents = true;
+ bool hasActiveTool = true; // Tool still running
+ bool hasUsedToolsThisTurn = true;
+
+ // Watchdog guard: should NOT clear IsResumed when tools are active
+ if (info.IsResumed && hasReceivedEvents && !hasActiveTool && !hasUsedToolsThisTurn)
+ {
+ info.IsResumed = false;
+ }
+
+ Assert.True(info.IsResumed, "IsResumed must stay true when tools are active (prevents premature timeout downgrade)");
+ }
+
+ [Fact]
+ public void IsResumed_Cleared_When_EventsArrive_NoToolActivity()
+ {
+ // When events have arrived and there's no tool activity,
+ // the watchdog should clear IsResumed to transition from 600s to 120s timeout.
+ var info = new AgentSessionInfo
+ {
+ Name = "test",
+ Model = "test-model",
+ IsResumed = true,
+ IsProcessing = true
+ };
+
+ bool hasReceivedEvents = true;
+ bool hasActiveTool = false; // No active tools
+ bool hasUsedToolsThisTurn = false;
+
+ // Watchdog guard: should clear IsResumed
+ if (info.IsResumed && hasReceivedEvents && !hasActiveTool && !hasUsedToolsThisTurn)
+ {
+ info.IsResumed = false;
+ }
+
+ Assert.False(info.IsResumed, "IsResumed should be cleared when events arrive with no tool activity");
+ }
+
+ [Fact]
+ public void IsResumed_NotCleared_When_NoEventsYet()
+ {
+ // Before any events arrive, IsResumed should stay true
+ // even with no tool activity — the session just resumed.
+ var info = new AgentSessionInfo
+ {
+ Name = "test",
+ Model = "test-model",
+ IsResumed = true,
+ IsProcessing = true
+ };
+
+ bool hasReceivedEvents = false; // No events yet
+ bool hasActiveTool = false;
+ bool hasUsedToolsThisTurn = false;
+
+ if (info.IsResumed && hasReceivedEvents && !hasActiveTool && !hasUsedToolsThisTurn)
+ {
+ info.IsResumed = false;
+ }
+
+ Assert.True(info.IsResumed, "IsResumed should stay true when no events have arrived yet");
+ }
+
+ [Fact]
+ public void IsResumed_NotCleared_When_HasUsedToolsThisTurn()
+ {
+ // Even after events arrive, if tools have been used this turn
+ // (between tool rounds), keep the longer 600s timeout.
+ var info = new AgentSessionInfo
+ {
+ Name = "test",
+ Model = "test-model",
+ IsResumed = true,
+ IsProcessing = true
+ };
+
+ bool hasReceivedEvents = true;
+ bool hasActiveTool = false; // No tool actively running right now
+ bool hasUsedToolsThisTurn = true; // But tools were used earlier in this turn
+
+ if (info.IsResumed && hasReceivedEvents && !hasActiveTool && !hasUsedToolsThisTurn)
+ {
+ info.IsResumed = false;
+ }
+
+ Assert.True(info.IsResumed, "IsResumed must stay true when tools were used this turn (even between rounds)");
+ }
+
+ // --- Staleness threshold validation ---
+
+ [Fact]
+ public void StalenessThreshold_UsesWatchdogToolExecutionTimeout()
+ {
+ // The staleness threshold should match the tool execution timeout
+ // to ensure we don't prematurely declare sessions as idle during long tool runs.
+ Assert.Equal(600, CopilotService.WatchdogToolExecutionTimeoutSeconds);
+ }
+
+ [Theory]
+ [InlineData("assistant.turn_start")]
+ [InlineData("tool.execution_start")]
+ [InlineData("tool.execution_progress")]
+ [InlineData("assistant.message_delta")]
+ [InlineData("assistant.reasoning")]
+ [InlineData("assistant.reasoning_delta")]
+ [InlineData("assistant.intent")]
+ public void IsSessionStillProcessing_AllActiveEventTypes_ReturnTrue(string eventType)
+ {
+ var svc = CreateService();
+ var tmpDir = Path.Combine(Path.GetTempPath(), "polypilot-test-" + Guid.NewGuid().ToString("N"));
+ var sessionId = Guid.NewGuid().ToString();
+ var sessionDir = Path.Combine(tmpDir, sessionId);
+ Directory.CreateDirectory(sessionDir);
+ var eventsFile = Path.Combine(sessionDir, "events.jsonl");
+
+ try
+ {
+ File.WriteAllText(eventsFile, $$$"""{"type":"{{{eventType}}}","data":{}}""");
+ var result = svc.IsSessionStillProcessing(sessionId, tmpDir);
+ Assert.True(result, $"Active event type '{eventType}' should report still processing");
+ }
+ finally
+ {
+ Directory.Delete(tmpDir, true);
+ }
+ }
+
+ [Theory]
+ [InlineData("session.idle")]
+ [InlineData("assistant.message")]
+ [InlineData("session.start")]
+ [InlineData("assistant.turn_end")]
+ [InlineData("tool.execution_end")]
+ public void IsSessionStillProcessing_InactiveEventTypes_ReturnFalse(string eventType)
+ {
+ var svc = CreateService();
+ var tmpDir = Path.Combine(Path.GetTempPath(), "polypilot-test-" + Guid.NewGuid().ToString("N"));
+ var sessionId = Guid.NewGuid().ToString();
+ var sessionDir = Path.Combine(tmpDir, sessionId);
+ Directory.CreateDirectory(sessionDir);
+ var eventsFile = Path.Combine(sessionDir, "events.jsonl");
+
+ try
+ {
+ File.WriteAllText(eventsFile, $$$"""{"type":"{{{eventType}}}","data":{}}""");
+ var result = svc.IsSessionStillProcessing(sessionId, tmpDir);
+ Assert.False(result, $"Inactive event type '{eventType}' should not report still processing");
+ }
+ finally
+ {
+ Directory.Delete(tmpDir, true);
+ }
+ }
+}
diff --git a/PolyPilot/Services/CopilotService.Bridge.cs b/PolyPilot/Services/CopilotService.Bridge.cs
index 80837b55d2..5d40c7eb0d 100644
--- a/PolyPilot/Services/CopilotService.Bridge.cs
+++ b/PolyPilot/Services/CopilotService.Bridge.cs
@@ -127,16 +127,23 @@ private async Task InitializeRemoteAsync(ConnectionSettings settings, Cancellati
_bridgeClient.OnTurnEnd += (s) =>
{
_remoteStreamingSessions.TryRemove(s, out _);
- var session = GetRemoteSession(s);
- if (session != null)
+ InvokeOnUI(() =>
{
- Debug($"[BRIDGE-COMPLETE] '{session.Name}' OnTurnEnd cleared IsProcessing");
- session.IsProcessing = false;
- // Mark last assistant message as complete
- var lastAssistant = session.History.LastOrDefault(m => m.IsAssistant && !m.IsComplete);
- if (lastAssistant != null) { lastAssistant.IsComplete = true; lastAssistant.Model = session.Model; }
- }
- InvokeOnUI(() => OnTurnEnd?.Invoke(s));
+ var session = GetRemoteSession(s);
+ if (session != null)
+ {
+ Debug($"[BRIDGE-COMPLETE] '{session.Name}' OnTurnEnd cleared IsProcessing");
+ session.IsProcessing = false;
+ session.IsResumed = false;
+ session.ProcessingStartedAt = null;
+ session.ToolCallCount = 0;
+ session.ProcessingPhase = 0;
+ // Mark last assistant message as complete
+ var lastAssistant = session.History.LastOrDefault(m => m.IsAssistant && !m.IsComplete);
+ if (lastAssistant != null) { lastAssistant.IsComplete = true; lastAssistant.Model = session.Model; }
+ }
+ OnTurnEnd?.Invoke(s);
+ });
};
_bridgeClient.OnSessionComplete += (s, sum) => InvokeOnUI(() => OnSessionComplete?.Invoke(s, sum));
_bridgeClient.OnError += (s, e) => InvokeOnUI(() => OnError?.Invoke(s, e));
diff --git a/PolyPilot/Services/CopilotService.Events.cs b/PolyPilot/Services/CopilotService.Events.cs
index c5e352be2a..b224aaaa89 100644
--- a/PolyPilot/Services/CopilotService.Events.cs
+++ b/PolyPilot/Services/CopilotService.Events.cs
@@ -678,6 +678,7 @@ private void CompleteResponse(SessionState state, long? expectedGeneration = nul
$"(responseLen={state.CurrentResponse.Length}, thread={Environment.CurrentManagedThreadId})");
CancelProcessingWatchdog(state);
+ Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
state.HasUsedToolsThisTurn = false;
state.Info.IsResumed = false; // Clear after first successful turn
var response = state.CurrentResponse.ToString();
@@ -1136,6 +1137,19 @@ private async Task RunProcessingWatchdogAsync(SessionState state, string session
var lastEventTicks = Interlocked.Read(ref state.LastEventAtTicks);
var elapsed = (DateTime.UtcNow - new DateTime(lastEventTicks)).TotalSeconds;
var hasActiveTool = Interlocked.CompareExchange(ref state.ActiveToolCallCount, 0, 0) > 0;
+
+ // After events have started flowing on a resumed session, clear IsResumed
+ // so the watchdog transitions from the long 600s timeout to the shorter 120s.
+ // Guard: don't clear if tools are active or have been used this turn — between
+ // tool rounds, ActiveToolCallCount returns to 0 when AssistantTurnStartEvent
+ // resets it, but the model may still be reasoning about the next tool call.
+ // HasUsedToolsThisTurn persists across rounds and prevents premature downgrade.
+ if (state.Info.IsResumed && Volatile.Read(ref state.HasReceivedEventsSinceResume)
+ && !hasActiveTool && !Volatile.Read(ref state.HasUsedToolsThisTurn))
+ {
+ Debug($"[WATCHDOG] '{sessionName}' clearing IsResumed — events have arrived since resume with no tool activity");
+ InvokeOnUI(() => state.Info.IsResumed = false);
+ }
// Use the longer tool-execution timeout if:
// 1. A tool call is actively running (hasActiveTool), OR
// 2. This is a resumed session that was mid-turn (agent sessions routinely
diff --git a/PolyPilot/Services/CopilotService.Utilities.cs b/PolyPilot/Services/CopilotService.Utilities.cs
index f8bdc9f863..7cf09dcbc6 100644
--- a/PolyPilot/Services/CopilotService.Utilities.cs
+++ b/PolyPilot/Services/CopilotService.Utilities.cs
@@ -85,15 +85,36 @@ public partial class CopilotService
}
///
- /// Check if a session was still processing when the app last closed
+ /// Check if a session was still processing when the app last closed.
+ /// Returns false if the events file is stale (not modified recently),
+ /// preventing sessions from being incorrectly marked as processing
+ /// after long app restarts.
///
- private bool IsSessionStillProcessing(string sessionId)
+ internal bool IsSessionStillProcessing(string sessionId) =>
+ IsSessionStillProcessing(sessionId, SessionStatePath);
+
+ ///
+ /// Testable overload that accepts a custom base path.
+ ///
+ internal bool IsSessionStillProcessing(string sessionId, string basePath)
{
- var eventsFile = Path.Combine(SessionStatePath, sessionId, "events.jsonl");
+ var eventsFile = Path.Combine(basePath, sessionId, "events.jsonl");
if (!File.Exists(eventsFile)) return false;
try
{
+ // Staleness check: if the file hasn't been modified recently,
+ // the CLI finished processing long ago — don't mark as still active.
+ var lastWrite = File.GetLastWriteTimeUtc(eventsFile);
+ var staleness = (DateTime.UtcNow - lastWrite).TotalSeconds;
+ if (staleness > WatchdogToolExecutionTimeoutSeconds)
+ {
+ Debug($"[RESTORE] events.jsonl for '{sessionId}' is stale " +
+ $"({staleness:F0}s old > {WatchdogToolExecutionTimeoutSeconds}s threshold), " +
+ $"treating session as idle");
+ return false;
+ }
+
string? lastLine = null;
foreach (var line in File.ReadLines(eventsFile))
{
diff --git a/PolyPilot/Services/CopilotService.cs b/PolyPilot/Services/CopilotService.cs
index 39045525d5..65db4808c4 100644
--- a/PolyPilot/Services/CopilotService.cs
+++ b/PolyPilot/Services/CopilotService.cs
@@ -1596,7 +1596,11 @@ await state.Session.SendAsync(new MessageOptions
Console.WriteLine($"[DEBUG] Reconnect+retry failed: {retryEx.Message}");
OnError?.Invoke(sessionName, $"Session disconnected and reconnect failed: {Models.ErrorMessageHelper.Humanize(retryEx)}");
CancelProcessingWatchdog(state);
+ FlushCurrentResponse(state);
Debug($"[ERROR] '{sessionName}' reconnect+retry failed, clearing IsProcessing");
+ Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
+ state.HasUsedToolsThisTurn = false;
+ state.Info.IsResumed = false;
state.Info.IsProcessing = false;
state.Info.ProcessingStartedAt = null;
state.Info.ToolCallCount = 0;
@@ -1609,7 +1613,11 @@ await state.Session.SendAsync(new MessageOptions
{
OnError?.Invoke(sessionName, $"SendAsync failed: {Models.ErrorMessageHelper.Humanize(ex)}");
CancelProcessingWatchdog(state);
+ FlushCurrentResponse(state);
Debug($"[ERROR] '{sessionName}' SendAsync failed, clearing IsProcessing (error={ex.Message})");
+ Interlocked.Exchange(ref state.ActiveToolCallCount, 0);
+ state.HasUsedToolsThisTurn = false;
+ state.Info.IsResumed = false;
state.Info.IsProcessing = false;
state.Info.ProcessingStartedAt = null;
state.Info.ToolCallCount = 0;
@@ -1636,6 +1644,7 @@ public async Task AbortSessionAsync(string sessionName)
if (_sessions.TryGetValue(sessionName, out var remoteState))
{
remoteState.Info.IsProcessing = false;
+ remoteState.Info.IsResumed = false;
remoteState.Info.ProcessingStartedAt = null;
remoteState.Info.ToolCallCount = 0;
remoteState.Info.ProcessingPhase = 0;